% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/prepare.training.validation.datasets.R
\name{prepare.training.validation.datasets}
\alias{prepare.training.validation.datasets}
\title{Prepare training and validation datasets}
\usage{
prepare.training.validation.datasets(data.directory = ".",
  output.directory = ".", data.types = c("mRNA"),
  data.types.ordinal = c("cna"), min.ordinal.threshold = c(cna = 3),
  centre.data = "median", p.threshold = 0.5,
  feature.selection.datasets = NULL, datasets = NULL,
  truncate.survival = 100, networks.database = "default",
  write.normed.datasets = TRUE, subset = NULL)
}
\arguments{
\item{data.directory}{Path to the directory containing datasets as specified
by \code{datasets}}

\item{output.directory}{Path to the output folder where intermediate and
results files will be saved}

\item{data.types}{A vector of molecular datatypes to load. Defaults to
c('mRNA')}

\item{data.types.ordinal}{A vector of molecular datatypes to be treated as
ordinal. Defaults to c('cna')}

\item{min.ordinal.threshold}{A named vector specifying the minimum percent
threshold for each ordinal data type to be used prior to estimating
coefficients. Coefficients for features not satisfying the minimum threshold
will not be estimated, and are set to 0. Defaults to a cna threshold of 3 percent}

\item{centre.data}{A character string specifying the centre value to be used for
scaling data. Valid values are: 'median', 'mean', or a user-defined numeric threshold,
e.g. '0.3' when modelling methylation beta values. This value is used both for scaling
and for dichotomising data when estimating univariate betas from the Cox model.
Defaults to 'median'}

\item{p.threshold}{Cox P value threshold to be applied for selecting features 
(e.g. genes) which will contribute to patient risk score estimation. Defaults to 0.5}

\item{feature.selection.datasets}{A vector containing names of datasets used
for feature selection in function \code{derive.network.features()}}

\item{datasets}{A vector containing names of all the datasets to be later
used for training and validation purposes}

\item{truncate.survival}{A numeric value specifying survival truncation in
years. Defaults to 100 years which effectively means no truncation}

\item{networks.database}{Name of the pathway networks database. Defaults to
NCI PID/Reactome/Biocarta, i.e. "default"}

\item{write.normed.datasets}{A toggle to control whether processed mRNA and
survival data should be written to file}

\item{subset}{A list with a Field and Entry component specifying a subset of
patients to be selected whose annotation Field matches Entry}
}
\value{
The output files are stored under \code{output.directory}/output/
}
\description{
Computes per-patient pathway-derived network impact scores across all input
datasets, independently
}
\examples{

# get data directory 
data.directory <- get.program.defaults()[["test.data.dir"]];

# initialise params
output.directory <- ".";
data.types <- c("mRNA");
feature.selection.datasets <- c("Breastdata1");
training.datasets <- c("Breastdata1");
validation.datasets <- c("Breastdata1", "Breastdata2");

# preparing training and validation datasets.
# Normalisation & patientwise subnet feature scores
prepare.training.validation.datasets(
  data.directory = data.directory,
  output.directory = output.directory,
  data.types =  data.types,
  feature.selection.datasets = feature.selection.datasets,
  datasets = unique(c(training.datasets, validation.datasets)),
  networks.database = "test"
  );

}
\author{
Syed Haider
}
\keyword{IO}
