\name{ESPRESSO}

\alias{ESPRESSO}

\docType{package}

\title{Package for power analysis and sample size calculation}

\description{
	Package to estimate sample size and power by exploring simulated study outcomes. It supports simulation-based power calculation for stand-alone case-control studies and for case-control analyses nested in cohort studies, taking account of realistic assessment error.
}

\details{
 ESPRESSO (Estimating Sample-size and Power in R by Exploring Simulated Study Outcomes) allows for the calculation of the sample size required to achieve a desired statistical power in a case-control study. It also allows one to calculate the power achieved with a specified sample size. The simulated dataset consists of a binary or a continuous outcome and two genetic and two environmental determinants. Functions \code{sim.CC.data} and \code{sim.QTL.data} simulate the outcome (phenotype) and the initial effects data considered as the true measures of the determinants. Function \code{get.observed.data} adds some error to the effect data generated by \code{sim.CC.data} or \code{sim.QTL.data} to obtain the observed measures of the determinants. Function \code{regr.analysis} carries out a regression analysis of the outcome on the covariates (genetic variants, environmental exposures and interaction term). Function \code{samplsize.calc} calculates the sample sizes required to achieve the desired power under the specified effect model (main effect or interaction). Functions \code{empirical.power.calc} and \code{model.power.calc} calculate, respectively, the empirical power and the theoretical power achieved under the specified sample size.
}

\author{
Amadou Gaye and Paul Burton

Maintainer: Amadou Gaye <ag239@le.ac.uk> 
}

\references{
  Burton, P.R., Hansell, A.L., Fortier, I., Manolio, T.A., Khoury, M.J., Little, J. & Elliott, P. (2009). Size matters: just how big is BIG?: Quantifying realistic sample size requirements for human genome epidemiology. \emph{International Journal of Epidemiology}, vol. \bold{38}, no. 1, pp. 263-273.
}

\keyword{package}

\examples{

## Worked example showing how the main functions of the package fit together.

# load the three input control tables and merge them into a single
# table of parameters with one row per scenario
data(general.params)
data(gen.params)
data(env.params)
s.temp <- merge(general.params, gen.params)
s.parameters <- merge(s.temp, env.params)

# subjects are generated in blocks of 20k, up to a ceiling of 20m, until the
# required number of cases and controls is reached. in general the required
# number of cases is only hard to reach when the disease prevalence is very low
allowed.sample.size <- 20000000
block.size <- 20000

# tracer to monitor iterations (disabled)
#trace.interval<-10

# total number of scenarios = number of rows in the parameter table
numscenarios <- nrow(s.parameters)

# index of the first scenario to run
start.at.scenario <- 1

# index of the last scenario to run (here: run them all)
stop.at.scenario <- numscenarios

for(j in start.at.scenario : stop.at.scenario)
{
   # each scenario is run with the seed stored in its own row of the table
   set.seed(s.parameters$seed.val[j])

   # general parameters, all read from row j of the parameter table
   scenario.id <- s.parameters$scenario.id[j]
   seed.val <- s.parameters$seed.val[j]
   numsims <- s.parameters$numsims[j]
   numcases <- s.parameters$numcases[j]
   numcontrols <- s.parameters$numcontrols[j]
   num.subjects <- s.parameters$num.subjects[j]
   is.interaction <- s.parameters$interaction[j]
   pheno.model <- s.parameters$pheno.model[j]
   disease.prev <- s.parameters$disease.prev[j]
   or.int <- s.parameters$or.int[j]
   int.efkt <- s.parameters$int.efkt[j]
   sigma.subject <- s.parameters$RR.5.95[j]
   pval <- s.parameters$p.val[j]
   power <- s.parameters$power[j]
   # phenotype error is passed as 1-sensitivity and 1-specificity
   pheno.error <- c(1-s.parameters$sensitivity.pheno[j],1-s.parameters$specificity.pheno[j])
   reliability.pheno <- s.parameters$reliability.pheno[j]

   # genetic determinants parameters (one element per genetic variant)
   is.add <- c(s.parameters$model.geno1[j], s.parameters$model.geno2[j])
   MAF <-  c(s.parameters$MAF.geno1[j], s.parameters$MAF.geno2[j])
   or.geno <- c(s.parameters$or.geno1[j],s.parameters$or.geno2[j])
   geno.efkt <- c(s.parameters$geno1.efkt[j], s.parameters$geno2.efkt[j])
   LD <- s.parameters$LD[j]
   R.target <- s.parameters$R.target[j]
   display <- s.parameters$display[j]
   geno.error <- c(1-s.parameters$sensitivity.geno[j],1-s.parameters$specificity.geno[j])

   # environmental determinants parameters (one element per exposure).
   # every value is taken from row j so that each vector has exactly two
   # elements, whatever the number of scenarios in the table
   env.expo <- c(s.parameters$model.env1[j],s.parameters$model.env2[j])
   reliability.env <- c(s.parameters$reliability.env1[j],s.parameters$reliability.env2[j])
   env.prev <- c(s.parameters$env1.prev[j],s.parameters$env2.prev[j])
   env.mean.lowlm <- c(s.parameters$env1.mean.lowlm[j],s.parameters$env2.mean.lowlm[j])
   env.stdev.uplm <- c(s.parameters$env1.stdev.uplm[j],s.parameters$env2.stdev.uplm[j])
   or.env <- c(s.parameters$or.env1[j],s.parameters$or.env2[j])
   env.efkt <- c(s.parameters$env1.efkt[j],s.parameters$env2.efkt[j])
   env.error <- c(1-s.parameters$sensitivity.env[j],1-s.parameters$specificity.env[j])
   skewness <- c(s.parameters$skewness1[j],s.parameters$skewness2[j])

   # the covariance matrix required to generate 2 variants with the desired ld
   cor.mat <- matrix(c(1,R.target,R.target,1),2,2) # correlation matrix
   cov.mat.req <- make.cov.mat(cor.mat, c(1-MAF[1], 1-MAF[2]))

   # if the required covariance matrix is not positive-definite get
   # the nearest positive-definite matrix (tolerance = 1e-06)
   if(!is.posdef(cov.mat.req, 0.000001)){
     cov.mat.req <- make.posdef(cov.mat.req, 0.000001)
   }

   # empty vectors for results of the analyses of each simulation in the scenario

   # genotype
   beta.geno1.results<-rep(NA,numsims)
   se.geno1.results<-rep(NA,numsims)
   z.geno1.results<-rep(NA,numsims)
   beta.geno2.results<-rep(NA,numsims)
   se.geno2.results<-rep(NA,numsims)
   z.geno2.results<-rep(NA,numsims)
   # environment
   beta.env1.results<-rep(NA,numsims)
   se.env1.results<-rep(NA,numsims)
   z.env1.results<-rep(NA,numsims)
   beta.env2.results<-rep(NA,numsims)
   se.env2.results<-rep(NA,numsims)
   z.env2.results<-rep(NA,numsims)
   # interaction
   beta.int.results<-rep(NA,numsims)
   se.int.results<-rep(NA,numsims)
   z.int.results<-rep(NA,numsims)

   # tracer to detect exceeding max allowable sample size
   sample.size.excess <- 0

   # generate and analyse datasets one at a time.
   # seq_len gives an empty loop when numsims is 0, where 1:numsims would not
   for(s in seq_len(numsims))
   {
      if(pheno.model == 0){ # under binary outcome
        # generate cases and controls until the required number of cases,
        # controls and sample size is achieved
        sim.matrix <- sim.CC.data(block.size, numcases, numcontrols,
        allowed.sample.size, is.interaction, disease.prev, MAF, is.add, R.target,
        LD, cov.mat.req, display, or.geno, env.expo, env.mean.lowlm, env.stdev.uplm,
        env.prev, or.env, skewness, or.int, sigma.subject, pheno.error)

      }else{ # under quantitative outcome model
        # generate the specified number of subjects
        sim.matrix <- sim.QTL.data(num.subjects, is.interaction, MAF, is.add,
        R.target, LD, cov.mat.req, display, geno.efkt, env.expo, env.mean.lowlm,
        env.stdev.uplm,env.prev, env.efkt, skewness, int.efkt, reliability.pheno)
      }

      # add appropriate errors to produce the observed data
      observed.data <- get.observed.data(is.interaction, sim.matrix, geno.error,
      is.add, MAF, env.expo, env.prev, env.error, reliability.env)
      sim.df <- observed.data$sim.df

      # data analysis; the estimates are read by position below, as
      # beta, se and z for geno1, geno2, env1, env2 and the interaction
      glm.estimates <- regr.analysis(is.interaction, pheno.model, sim.df)

      # genetic variants estimates
      beta.geno1.results[s] <- glm.estimates[1]
      se.geno1.results[s] <- glm.estimates[2]
      z.geno1.results[s] <- glm.estimates[3]
      beta.geno2.results[s] <- glm.estimates[4]
      se.geno2.results[s] <- glm.estimates[5]
      z.geno2.results[s] <- glm.estimates[6]

      # environment estimates
      beta.env1.results[s] <- glm.estimates[7]
      se.env1.results[s] <- glm.estimates[8]
      z.env1.results[s] <- glm.estimates[9]
      beta.env2.results[s] <- glm.estimates[10]
      se.env2.results[s] <- glm.estimates[11]
      z.env2.results[s] <- glm.estimates[12]

      # interaction estimates
      beta.int.results[s] <- glm.estimates[13]
      se.int.results[s] <- glm.estimates[14]
      z.int.results[s] <- glm.estimates[15]

      # print tracer after every nth dataset created
      # if(s \%\% trace.interval ==0)cat("\n",s,"of",numsims,"completed in scenario",
      # scenario.id)

   }
   cat("\n\n")

   # summary of primary parameter estimates: mean beta, root mean square
   # of the standard errors, and the ratio of the two as the model z
   # genetic variants
   mean.beta.geno1 <- round(mean(beta.geno1.results),3)
   mean.se.geno1 <- round(sqrt(mean(se.geno1.results^2)),3)
   mean.model.z.geno1 <- mean.beta.geno1/mean.se.geno1
   mean.beta.geno2 <- round(mean(beta.geno2.results),3)
   mean.se.geno2 <- round(sqrt(mean(se.geno2.results^2)),3)
   mean.model.z.geno2 <- mean.beta.geno2/mean.se.geno2
   mean.model.z.geno <- c(mean.model.z.geno1, mean.model.z.geno2)
   # environments
   mean.beta.env1 <- round(mean(beta.env1.results),3)
   mean.se.env1 <- round(sqrt(mean(se.env1.results^2)),3)
   mean.model.z.env1 <- mean.beta.env1/mean.se.env1
   mean.beta.env2 <- round(mean(beta.env2.results),3)
   mean.se.env2 <- round(sqrt(mean(se.env2.results^2)),3)
   mean.model.z.env2 <- mean.beta.env2/mean.se.env2
   mean.model.z.env <- c(mean.model.z.env1, mean.model.z.env2)
   # interaction (only summarised when an interaction term was modelled)
   if(is.interaction == 0){
      mean.beta.int <- NA
      mean.se.int <- NA
      mean.model.z.int <- NA
   }else{
      mean.beta.int<-round(mean(beta.int.results),3)
      mean.se.int<-round(sqrt(mean(se.int.results^2)),3)
      mean.model.z.int <- mean.beta.int/mean.se.int
   }
   mean.betas <- c(mean.beta.geno1, mean.beta.geno2, mean.beta.env1, mean.beta.env2, mean.beta.int)


   # calculate the sample size required under each model
   sample.sizes.required <- samplsize.calc(numcases,numcontrols, num.subjects,
   pheno.model, is.interaction, pval, power, mean.model.z.geno,mean.model.z.env,
   mean.model.z.int)

   # calculate empirical power ie simply the proportion of simulations in which
   # the z statistic for the parameter of interest exceeds the z statistic
   # for the desired level of statistical significance
   empirical.power <- empirical.power.calc(is.interaction,pval,z.geno1.results,
   z.geno2.results,z.env1.results,z.env2.results, z.int.results)

   # calculate the power reached under the initial sample size
   model.power <- model.power.calc(is.interaction, pval, mean.model.z.geno,
   mean.model.z.env, mean.model.z.int)

   # return critical results and print a summary
   res <- get.critical.results(j, is.interaction, pheno.model, is.add, env.expo,
   sample.sizes.required, empirical.power, model.power, mean.betas)
}
}
