% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_cleansing.R
\name{data_cleansing}
\alias{data_cleansing}
\title{Data Cleaning}
\usage{
data_cleansing(dat, target = NULL, obs_id = NULL, occur_time = NULL,
  pos_flag = NULL, x_list = NULL, ex_cols = NULL,
  miss_values = NULL, remove_dup = TRUE, outlier_proc = TRUE,
  missing_proc = "median", low_var = 0.999, missing_rate = 0.98,
  merge_cat = 30, note = TRUE, parallel = FALSE, save_data = FALSE,
  file_name = NULL, dir_path = tempdir())
}
\arguments{
\item{dat}{A data frame with x and target.}

\item{target}{The name of target variable.}

\item{obs_id}{The name of ID of observations. Default is NULL.}

\item{occur_time}{The name of occur time of observations. Default is NULL.}

\item{pos_flag}{The value of positive class of target variable, default: "1".}

\item{x_list}{A list of x variables.}

\item{ex_cols}{A list of excluded variables. Default is NULL.}

\item{miss_values}{Other extreme values that might be used to represent missing values, e.g. -9999, -9998. These miss_values will be encoded to -1 or "missing".}

\item{remove_dup}{Logical, if TRUE, remove the duplicated observations.}

\item{outlier_proc}{Logical, process outliers or not. Default is TRUE.}

\item{missing_proc}{If logical, process missing values or not. If "median", NAs are imputed with the median of the k neighbors. If "avg_dist", the distance weighted average method is applied to impute NAs from the k neighbors. If "default", missing values are assigned to -1 or "missing"; otherwise, missing values are processed according to the results of missing analysis.}

\item{low_var}{The maximum percent of unique values (including NAs) for filtering low variance variables.}

\item{missing_rate}{The maximum percent of missing values for recoding values to missing and non_missing.}

\item{merge_cat}{The minimum number of categories for merging categories of character variables.}

\item{note}{Logical. Outputs info. Default is TRUE.}

\item{parallel}{Logical, parallel computing or not. Default is FALSE.}

\item{save_data}{Logical, save the result or not. Default is FALSE.}

\item{file_name}{The name for periodically saved data file. Default is NULL.}

\item{dir_path}{The path for periodically saved data file. Default is tempdir().}
}
\value{
A preprocessed data.frame
}
\description{
The \code{data_cleansing} function is a simple wrapper around data cleaning functions. It can:
delete variables whose values are all NAs;
check the format of dat and target;
delete low variance variables;
replace null or NULL or blank with NA;
encode variables whose NA and miss value rate is more than 95\% as 1,0;
encode variables whose unique value rate is more than 95\% as 1,0;
merge categories of character variables that have more than 10 categories;
convert time variables to date format;
remove duplicated observations;
process outliers;
process NAs.
}
\examples{
#data cleaning
dat_cl <- data_cleansing(dat = UCICreditCard[1:2000,],
                       target = "default.payment.next.month",
                       x_list = NULL,
                       obs_id = "ID",
                       occur_time = "apply_date",
                       ex_cols = c("PAY_6|BILL_"),
                       outlier_proc = TRUE,
                       missing_proc = TRUE,
                       low_var = TRUE,
                       save_data = FALSE)

}
\seealso{
\code{\link{remove_duplicated}},
\code{\link{null_blank_na}},
\code{\link{entry_rate_na}},
\code{\link{low_variance_filter}},
\code{\link{process_nas}},
\code{\link{process_outliers}}
}
