% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mallet.R
\name{mallet.import}
\alias{mallet.import}
\title{Import text documents into Mallet format}
\usage{
mallet.import(
  id.array = NULL,
  text.array,
  stoplist = "",
  preserve.case = FALSE,
  token.regexp = "[\\\\p{L}]+"
)
}
\arguments{
\item{id.array}{An array of document IDs. Defaults to the indices of \code{text.array}.}

\item{text.array}{A character vector with each element containing a document.}

\item{stoplist}{The name of a file containing stopwords (words to ignore), one per line, or a character vector containing stopwords.
If the file is not in the current working directory, you may need to include a full path.
Default is no stoplist.}

\item{preserve.case}{Logical. If \code{TRUE}, the case of the input text is preserved. By default (\code{FALSE}), the input text is converted to all lowercase.}

\item{token.regexp}{A quoted string representing a regular expression that defines a token. The default
is one or more Unicode letters: "[\\\\p\{L\}]+". Note that special characters must
have double backslashes.}
}
\value{
A \code{cc/mallet/types/InstanceList} object.
}
\description{
This function takes an array of document IDs and an array of documents (as character strings)
and converts them into a Mallet instance list.
}
\examples{
\dontrun{
# Read in sotu example data
data(sotu)
sotu.instances <-
   mallet.import(id.array = row.names(sotu),
                 text.array = sotu[["text"]],
                 stoplist = mallet_stoplist_file_path("en"),
                 token.regexp = "\\\\p{L}[\\\\p{L}\\\\p{P}]+\\\\p{L}")

}

}
\seealso{
\code{\link{mallet.word.freqs}} returns term and document frequencies, which may be useful in selecting stopwords.
}
