R/modelFitFunction.R

Defines functions bycatchFit

Documented in bycatchFit

#---------------------------------
#Model setup
#----------------------------------

#Roxygen header
#'Bycatch model fitting routine
#'
#'Produces model-based estimates of bycatch and annual abundance index, as specified in \code{bycatchSetup}
#'
#'
#' @param setupObj  An object produced by \code{bycatchSetup}.
#' @param selectCriteria Character. Model selection criteria. Options are AICc, AIC and BIC
#' @param DoCrossValidation Specify whether to run a 10 fold cross-validation (TRUE or FALSE). This may not work with a small or unbalanced dataset
#' @param DredgeCrossValidation DredgeCrossValidation specifies whether to use information criteria to find the best model in cross validation, using the dredge function, or just keep the same model formula. Do not use dredge for very large datasets, as the run will be slow.
#' @param ResidualTest Logical. Specify whether to exclude models that fail the DHARMa residuals test.
#' @param CIval Specify confidence interval for total bycatch estimates. Should be the alpha level, e.g. 0.05 for 95%
#' @param VarCalc Character. Options are: "Simulate","DeltaMethod", or "None". Variance calculation method. Simulate will not work with a large number of sample units in the logbook data. The delta method for variance calculation is not implemented for the delta-lognormal or delta-gamma methods.
#' @param useParallel Logical. Whether to conduct the analysis using parallel processing. Only initialized when more that two cores are available.
#' @param nSims Number of simulations used to calculate confidence intervals. Ignored if \code{VarCalc} set to "None"
#' @param baseDir Character. A directory to save output. Defaults to current working directory.
#' @param plotValidation Logical. Validation. If you have true values of the total bycatch (for example in a simulation study). Make PlotValidation true and fill out the rest of the specification.
#' @param trueVals The data set that contains the true simulated total catches by year.
#' @param trueCols The column of the true simulated catches that contains true bycatch by year
#' @param doReport Logical. Create a markdown report of the analysis
#' @import MuMIn
#' @export
#' @keywords Fitting functions
#' @examples
#' \dontrun{
#' library(BycatchEstimator)
#' #-------------------------------------------------
#' #Step 1. Run the setup file and review data inputs
#'setupObj<-bycatchSetup(
#'  modelTry = c("Lognormal","Delta-Lognormal","Delta-Gamma","TMBnbinom1","TMBnbinom2","TMBtweedie"),
#'  obsdat = LLSIM_BUM_Example_observer,
#'  logdat = LLSIM_BUM_Example_logbook,
#'  yearVar = "Year",
#'  obsEffort = "hooks",
#'  logEffort = "hooks",
#'  logUnsampledEffort = "unsampledEffort",
#'  includeObsCatch  = TRUE,
#'  matchColumn = "trip",
#'  factorNames = c("Year","fleet","area","season"),
#'  EstimateIndex = TRUE,
#'  EstimateBycatch = TRUE,
#'  logNum = NA,
#'  sampleUnit = "trips",
#'  complexModel = formula(y~Year+fleet+hbf+area+season+Year:area),
#'  simpleModel = formula(y~Year+fleet+area),
#'  indexModel = formula(y~Year+area),
#'  baseDir = getwd(),
#'  runName = "LLSIMBUMtrip2022Aprilobs05mc",
#'  runDescription = "LLSIm BUM by trip, with 5% observer coverage including observed catch in totals April 17 2022",
#'  common = c("Swordfish","Blue marlin")[2],
#'  sp = c("Xiphias gladius","Makaira nigricans")[2],
#'  obsCatch = c("SWO","BUM")[2],
#'  catchUnit = "number",
#'  catchType = "catch"
#')
#'
#'-------------
#' #Step 2. Model Fitting
#'bycatchFit(
#'  setupObj = setupObj,
#'  selectCriteria = "BIC",
#'  DoCrossValidation = TRUE,
#'  DredgeCrossValidation = FALSE,
#'  ResidualTest = FALSE,
#'  CIval = 0.05,
#'  VarCalc = "Simulate",
#'  useParallel = TRUE,
#'  nSims = 1000,
#'  baseDir = getwd(),
#'  plotValidation = FALSE,
#'  trueVals = NULL,
#'  trueCols = NULL
#')}


bycatchFit<-function(

  setupObj,
  selectCriteria = "BIC",
  DoCrossValidation = FALSE,
  DredgeCrossValidation = FALSE,
  ResidualTest = TRUE,
  CIval = 0.05,
  VarCalc = "Simulate",
  useParallel = TRUE,
  nSims = 10,
  baseDir = getwd(),
  plotValidation = FALSE,
  trueVals = NULL,
  trueCols = NULL,
  doReport = TRUE

  ){

  #Unpack setupObj
  modelTry<-obsdat<-logdat<-yearVar<-obsEffort<-logEffort<-logUnsampledEffort<-
    includeObsCatch<-matchColumn<-factorNames<-randomEffects<-randomEffects2<-
  EstimateIndex<-EstimateBycatch<-logNum<-sampleUnit<-complexModel<-simpleModel<-indexModel<-
    designMethods<-designVars<-designPooling<-poolTypes<-pooledVar<-adjacentNum<-
minStrataUnit<-
    baseDir<-runName<-runDescription<-
  common<-sp<-obsCatch<-catchUnit<-catchType<-NULL

  numSp<-modelTable<-modelSelectTable<-modFits<-modPredVals<-modIndexVals<-residualTab<-bestmod<-predbestmod<-indexbestmod<-allmods<-allindex<-
  modelFail<-rmsetab<-metab<-dat<-yearSum<-requiredVarNames<-allVarNames<-indexDat<-strataSum<-NumCores<-NULL

  for(r in 1:NROW(setupObj$bycatchInputs)) assign(names(setupObj$bycatchInputs)[r], setupObj$bycatchInputs[[r]])
  for(r in 1:NROW(setupObj$bycatchOutputs)) assign(names(setupObj$bycatchOutputs)[r],setupObj$bycatchOutputs[[r]])

  #Setup directory naming
  dirname<-list()
  outDir<-paste0(baseDir, paste("/Output", runName))
  if(!dir.exists(outDir)) dir.create(outDir)
  for(run in 1:numSp) {
    dirname[[run]]<-paste0(outDir,"/",common[run]," ",catchType[run],"/")
    if(!dir.exists(dirname[[run]])) dir.create(dirname[[run]])
  }

  #Make sure there are multiple cores to use Parallel processing
  if(NumCores<=1) useParallel=FALSE

  if(!VarCalc %in% c("None","Simulate","DeltaMethod")) VarCalc="None"
  ############## This is the main analysis loop ##########################
  for(run in 1:numSp) {
    datval<-dat[[run]]
    outVal<-dirname[[run]]
    varExclude<-NULL
    #Fit all models except delta
    for(mod in which(!grepl("delta",modelTry,ignore.case=TRUE))){
      modFit<-suppressWarnings(BycatchEstimator:::findBestModelFunc(
        obsdatval = datval,
        modType = modelTry[mod],
        printOutput=TRUE,
        requiredVarNames = requiredVarNames,
        allVarNames = allVarNames,
        complexModel = complexModel,
        common = common,
        randomEffects = randomEffects,
        useParallel = useParallel,
        selectCriteria = selectCriteria,
        catchType = catchType,
        varExclude = varExclude,
        dirname = dirname,
        run = run
      ))
      modelSelectTable[[run]][[modelTry[mod]]]<-modFit[[2]]
      modFits[[run]][[modelTry[mod]]]<-modFit[[1]]
    }

    #Fit delta models
    if(any(grepl("delta",modelTry,ignore.case=TRUE))) {  #Delta models if requested
      posdat<-filter(dat[[run]], .data$pres==1)
      y<-unlist(lapply(posdat[,factorNames],function(x) length(setdiff(levels(x),x)))) #See if all levels are included
      varExclude<-names(y)[y>0]
      if(length(varExclude>0)) print(paste(common[run], "excluding variable",varExclude,"from delta models for positive catch"))
      if((min(summary(posdat$Year))>0 |  is.numeric(datval$Year)) &
         (!is.null(modFits[[run]][["Binomial"]]) | !is.null(modFits[[run]][["TMBbinomial"]]))) { #If all years have at least one positive observation and binomial converged, carry on with delta models
        for(mod in which(grepl("delta",modelTry,ignore.case=TRUE)))  {
          modFit<-suppressWarnings(BycatchEstimator:::findBestModelFunc(
            obsdatval = posdat,
            modType = modelTry[mod],
            requiredVarNames = requiredVarNames,
            allVarNames = allVarNames,
            complexModel = complexModel,
            randomEffects = randomEffects2,
            useParallel = useParallel,
            selectCriteria = selectCriteria,
            varExclude = varExclude,
            printOutput=TRUE,
            catchType = catchType,
            common = common,
            dirname = dirname,
            run = run
          ))
          modelSelectTable[[run]][[modelTry[mod]]]<-modFit[[2]]
          modFits[[run]][[modelTry[mod]]]<-modFit[[1]]
        }
      } else {
        print("Not all years have positive observations, skipping delta models")
        modelFail[run,c("Delta-Lognormal","Delta-Gamma")]<-"data"
        modPredVals[[run]][[modelTry[mod]]]<-NULL
        modIndexVals[[run]][[modelTry[mod]]]<-NULL
      }
    }

    # If estimating bycatch, see if data set is large
    if(EstimateBycatch) {
      BigData<-ifelse(sum(logdat$SampleUnits)>10000, TRUE, FALSE)
      #Add stratum designation and check sample size in strata
     if(length(requiredVarNames)>1) {
      logdat$strata<-apply( logdat[ , requiredVarNames ] , 1 , paste , collapse = "-" )
     }
     if(length(requiredVarNames)==1)   {
       logdat$strata <- pull(logdat,var=requiredVarNames)
     }
     if(length(requiredVarNames)==0)   {
       logdat$strata <- rep(1,nrow(logdat))
     }
     if(max(tapply(logdat$SampleUnits,logdat$strata,sum))>100000) {
       print("Cannot calculate variance for large number of logbook sample units")
       VarCalc<-"None"
     }
    }
    for(mod in 1:length(modelTry)) {
      if(!is.null(modFits[[run]][[modelTry[mod]]])) {
        if(grepl("delta",modelTry[mod],ignore.case = TRUE )) {
          if(grepl("TMB",modelTry[mod])) modFit1=modFits[[run]][["TMBbinomial"]] else
            modFit1=modFits[[run]][["Binomial"]]
          modFit2<-modFits[[run]][[modelTry[mod]]]
        }
        if(!grepl("delta",modelTry[mod],ignore.case = TRUE )) {
          modFit1<-modFits[[run]][[modelTry[mod]]]
          modFit2<-NULL
        }
        if(EstimateBycatch) {
          if(VarCalc=="Simulate" |(VarCalc=="DeltaMethod" & modelTry[mod] %in% c("Delta-Lognormal","Delta-Gamma","Tweedie", "TMBdelta-lognormal","TMBdelta-gamma")))
              modPredVals[[run]][[modelTry[mod]]]<-
                makePredictionsSimVarBig(
                  modfit1=modFit1,
                  modfit2=modFit2,
                  modtype=modelTry[mod],
                  newdat=logdat,
                  obsdatval=datval,
                  includeObsCatch = includeObsCatch,
                  nsim = nSims,
                  requiredVarNames = requiredVarNames,
                  CIval = CIval,
                  common = common,
                  catchType = catchType,
                  dirname = dirname,
                  run = run,
                  randomEffects=randomEffects,
                  randomEffects2=randomEffects2)
          if(VarCalc=="DeltaMethod" & !modelTry[mod] %in% c("Delta-Lognormal","Delta-Gamma","Tweedie","TMBdelta-Lognormal","TMBdelta-Gamma"))
            modPredVals[[run]][[modelTry[mod]]]<-
              makePredictionsDeltaVar(
                modfit1=modFit1,
                modtype=modelTry[mod],
                newdat=logdat,
                obsdatval=datval,
                includeObsCatch = includeObsCatch,
                requiredVarNames = requiredVarNames,
                CIval = CIval,
                common = common,
                catchType = catchType,
                dirname = dirname,
                run = run
              )
          if(VarCalc=="None") {
            modPredVals[[run]][[modelTry[mod]]]<-
              makePredictionsNoVar(
                modfit1=modFit1,
                modfit2=modFit2,
                modtype=modelTry[mod],
                newdat=logdat,
                obsdatval=datval,
                includeObsCatch = includeObsCatch,
                nsims = nSims,
                requiredVarNames = requiredVarNames,
                common = common,
                catchType = catchType,
                dirname = dirname,
                run = run
              )
           }
        }
        if(EstimateIndex) {
          modIndexVals[[run]][[modelTry[mod]]]<-
            makeIndexVar(
              modfit1=modFit1,
              modfit2=modFit2,
              modType=modelTry[mod],
              newdat = indexDat,
              printOutput=TRUE,
              nsims = nSims,
              common = common,
              catchType = catchType,
              dirname = dirname,
              run = run
            )
        }
        modelTable[[run]]$formula[mod]<-paste(formula(modFits[[run]][[modelTry[mod]]]))[[3]]
        temp<-ResidualsFunc(modFits[[run]][[modelTry[mod]]],modelTry[mod],paste0(outVal,"Residuals",modelTry[mod],".pdf"))
        if(!is.null(temp)) {
          residualTab[[run]][,modelTry[mod]]<-temp
          if(residualTab[[run]]["KS.p",modelTry[mod]]<0.01 & ResidualTest) modelFail[run,modelTry[mod]]<-"resid"
        }
        if(is.null(modPredVals[[run]][[modelTry[mod]]]) & EstimateBycatch) modelFail[run,modelTry[mod]]<-"cv"
      } else {
        if(modelFail[run,modelTry[mod]]=="-") modelFail[run,modelTry[mod]]<-"fit"
      }
    }

    #Combine all predictions, except Binomial
    if(EstimateBycatch) {
      if(is.factor(modPredVals[[run]][[1]]$Year))
        yearSumGraph[[run]]$Year<-factor(yearSumGraph[[run]]$Year)
       allmods[[run]]<-bind_rows(modPredVals[[run]],.id="Source") %>%
        filter(!.data$Source=="Binomial",!.data$Source=="TMBbinomial")
      allmods[[run]]<-bind_rows(allmods[[run]],yearSumGraph[[run]])
      allmods[[run]]$Valid<-ifelse(modelFail[run,match(allmods[[run]]$Source,dimnames(modelFail)[[2]])]=="-" | allmods[[run]]$Source %in% c("Unstratified ratio","Ratio","Design Delta"),1,0)
    }
    if(EstimateIndex) {
      allindex[[run]]<-bind_rows(modIndexVals[[run]],.id="Source") %>%
        filter(!.data$Source=="Binomial",!.data$Source=="TMBbinonomial")
      allindex[[run]]$Valid<-ifelse(modelFail[run,match(allindex[[run]]$Source,dimnames(modelFail)[[2]])]=="-" ,1,0)
    }

    #Print the diagnostic tables
    write.csv(residualTab[[run]],paste0(outVal,"residualDiagnostics.csv"))
    write.csv(modelFail,paste0(outDir,"/modelFail.csv"))

    ######## Cross validation 10 fold  ####################################
    rmsetab[[run]]<-matrix(NA,10,length(modelTry),dimnames=list(1:10,modelTry))
    rmsetab[[run]]<-rmsetab[[run]][,colnames(rmsetab[[run]])!="Binomial"]
    metab[[run]]<-rmsetab[[run]]
    if(DoCrossValidation & length(which(modelFail[run,!colnames(modelFail)%in%c("Binomial","TMBbinomial")]=="-"))>1) {  #Don't do unless at least one model worked
      datval$cvsample<-sample(rep(1:10,length=dim(datval)[1]),replace=FALSE)
      for(i in 1:10 ) {
        datin<-datval[datval$cvsample!=i,]
        datout<-datval[datval$cvsample==i,]
        datout$SampleUnits<-rep(1,dim(datout)[1])
        for(mod in which(!grepl("delta",modelTry,ignore.case = TRUE))) {
          if(modelFail[run,modelTry[mod]]=="-") {
            if(DredgeCrossValidation) {
              modFit1<-suppressWarnings(findBestModelFunc(
                datin,
                modelTry[mod],
                requiredVarNames = requiredVarNames,
                allVarNames = allVarNames,
                complexModel = complexModel,
                useParallel = useParallel,
                selectCriteria = selectCriteria,
                catchType = catchType,
                varExclude = varExclude,
                randomEffects=randomEffects
              ))[[1]]
            } else {
              modFit1<-suppressWarnings(FitModelFuncCV(formula(paste0("y~",modelTable[[run]]$formula[mod])),
                                      modType=modelTry[mod],obsdatval=datin))
            }
            if(!modelTry[mod] %in% c("Binomial","TMBbinomial")) {
              predcpue<-makePredictions(
                modFit1,
                modType=modelTry[mod],
                newdat = datout
              )
              rmsetab[[run]][i,modelTry[mod]]<-getRMSE(predcpue$est.cpue,datout$cpue)
              metab[[run]][i,modelTry[mod]]<-getME(predcpue$est.cpue,datout$cpue)
            } else {
               if(modelTry[mod]=="Binomial")
                 binomial1<-modFit1
               if(modelTry[mod]=="TMBbinomial")
                 tmbbin1<-modFit1
            }
          }
        }
        if(any(grepl("delta",modelTry,ignore.case = TRUE))) {
          posdat<-filter(datin, .data$pres==1)
          y<-unlist(lapply(posdat[,factorNames],function(x) length(setdiff(levels(x),x)))) #See if all levels are included
          varExcludecv<-names(y)[y>0]
          for(mod in which(grepl("delta",modelTry,ignore.case = TRUE))) {
            if(modelFail[run,modelTry[mod]]=="-" & !(!is.numeric(posdat$Year) & min(table(posdat$Year))==0)) {
              if(DredgeCrossValidation) {
                modFit1<-suppressWarnings(findBestModelFunc(
                  posdat,
                  modelTry[mod],
                  requiredVarNames = requiredVarNames,
                  allVarNames = allVarNames,
                  complexModel = complexModel,
                  useParallel = useParallel,
                  selectCriteria = selectCriteria,
                  catchType = catchType,
                  varExclude = varExcludecv,
                  randomEffects=randomEffects2
                ))[[1]]
              } else {
                if(length(varExcludecv)==0)
                  modFit1<-suppressWarnings(FitModelFuncCV(formula(paste0("y~",modelTable[[run]]$formula[mod])),modType=modelTry[mod],obsdatval=posdat)) else {
                    temp<-strsplit(gsub(" ","",modelTable[[run]]$formula[mod]),"[+]")[[1]]
                    temp<-paste(temp[!temp %in%  varExcludecv],collapse="+")
                    modFit1<-suppressWarnings(FitModelFuncCV(formula(paste0("y~",temp)),modType=modelTry[mod],obsdatval=posdat))
                  }
              }
              if(grepl("TMB",modelTry[mod]))
                 bin1<-tmbbin1 else
                 bin1<-binomial1
              predcpue<-makePredictions(bin1,modFit1,modelTry[mod],datout)
              rmsetab[[run]][i,modelTry[mod]]<-getRMSE(predcpue$est.cpue,datout$cpue)
              metab[[run]][i,modelTry[mod]]<-getME(predcpue$est.cpue,datout$cpue)
            }
          }
        }
      }
      # Calculate RMSE and ME
      modelTable[[run]]$RMSE[modelTable[[run]]$model!="Binomial"]<-apply(rmsetab[[run]],2,mean,na.rm=TRUE)
      modelTable[[run]]$ME[modelTable[[run]]$model!="Binomial"]<-apply(metab[[run]],2,mean,na.rm=TRUE)
      write.csv(residualTab[[run]],paste0(outVal,"modelSummary.csv"))
      write.csv(rmsetab[[run]],paste0(outVal,"rmse.csv"))
      write.csv(metab[[run]],paste0(outVal,"me.csv"))
      write.csv(modelTable[[run]],paste0(outVal,"crossvalSummary.csv"))
      #Select best model based on cross validation
      best<-which(!is.na( modelTable[[run]]$RMSE) &
                    modelTable[[run]]$RMSE==min(modelTable[[run]]$RMSE,na.rm=TRUE))
      if(length(best) >0 ) {
        bestmod[run]<-modelTry[best]
        predbestmod[[run]]<-modPredVals[[run]][[modelTry[best]]]
        indexbestmod[[run]]<-modIndexVals[[run]][[modelTry[best]]]
      } else {
        bestmod[run]<-"None"
      }
    }

    if(doReport){
      save(list=c("numSp","yearSum","runName","runDescription","allVarNames",
                  "common", "sp","bestmod","CIval","includeObsCatch",
                  "predbestmod","indexbestmod","allmods","allindex","modelTable",
                  "modelSelectTable","modFits","modPredVals","VarCalc"
                  ,"modIndexVals","modelFail","rmsetab","metab",
                  "residualTab" ,"run","modelTry","EstimateIndex","EstimateBycatch",
                  "DoCrossValidation","indexVarNames","selectCriteria","sampleUnit",
                  "modelTry","catchType","catchUnit","residualTab",
                  "plotValidation","trueVals","trueCols","startYear"),
           file=paste0(outVal,"/","resultsR"))

      mkd<-tryCatch({
        system.file("Markdown", "PrintResults.Rmd", package = "BycatchEstimator", mustWork = TRUE)
      },
      error = function(c) NULL
      )
      if(!is.null( mkd)){
        suppressWarnings(rmarkdown::render(mkd,
                          params=list(outVal=outVal),
                          output_file = paste0(common[run], catchType[run],"results.pdf"),
                          output_dir=outVal,
                          quiet = TRUE))
      }
    }
    print(paste(run, common[run],"complete, ",Sys.time()))

  }
}
natureanalytics-ca/BycatchEstimator documentation built on Oct. 15, 2024, 10:28 p.m.