% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/create_tcorpus.r
\name{create_tcorpus}
\alias{create_tcorpus}
\alias{create_tcorpus.character}
\alias{create_tcorpus.data.frame}
\alias{create_tcorpus.factor}
\alias{create_tcorpus.corpus}
\title{Create a tCorpus}
\usage{
create_tcorpus(x, ...)

\method{create_tcorpus}{character}(
  x,
  doc_id = 1:length(x),
  meta = NULL,
  udpipe_model = NULL,
  split_sentences = F,
  max_sentences = NULL,
  max_tokens = NULL,
  udpipe_model_path = getwd(),
  udpipe_cache = 3,
  udpipe_cores = NULL,
  udpipe_batchsize = 50,
  use_parser = F,
  remember_spaces = TRUE,
  verbose = T,
  ...
)

\method{create_tcorpus}{data.frame}(
  x,
  text_columns = "text",
  doc_column = "doc_id",
  udpipe_model = NULL,
  split_sentences = F,
  max_sentences = NULL,
  max_tokens = NULL,
  udpipe_model_path = getwd(),
  udpipe_cache = 3,
  udpipe_cores = NULL,
  udpipe_batchsize = 50,
  use_parser = F,
  remember_spaces = FALSE,
  verbose = T,
  ...
)

\method{create_tcorpus}{factor}(x, ...)

\method{create_tcorpus}{corpus}(x, ...)
}
\arguments{
\item{x}{Main input. Can be a character (or factor) vector where each value is a full text, a data.frame that has a column that contains full texts, or a quanteda corpus.}

\item{...}{Arguments passed to create_tcorpus.character}

\item{doc_id}{if x is a character/factor vector, doc_id can be used to specify document ids. This has to be a vector of the same length as x}

\item{meta}{A data.frame with document meta information (e.g., date, source). The rows of the data.frame need to match the values of x}

\item{udpipe_model}{Optionally, the name of a Universal Dependencies language model (e.g., "english-ewt", "dutch-alpino"), to use the udpipe package
(\code{\link[udpipe]{udpipe_annotate}}) for natural language processing. You can use \code{\link{show_udpipe_models}} to get
an overview of the available models. For more information about udpipe and performance benchmarks of the UD models, see the
GitHub page of the \href{https://github.com/bnosac/udpipe}{udpipe package}.}

\item{split_sentences}{Logical. If TRUE, the sentence number of tokens is also computed. (only if udpipe_model is not used)}

\item{max_sentences}{An integer. Limits the number of sentences per document to the specified number. If set when split_sentences == FALSE, split_sentences will be set to TRUE.}

\item{max_tokens}{An integer. Limits the number of tokens per document to the specified number}

\item{udpipe_model_path}{If udpipe_model is used, this path will be used to look for the model, and if the model doesn't yet exist it will be downloaded to this location. Defaults to the working directory.}

\item{udpipe_cache}{The number of persistent caches to keep for inputs of udpipe. The caches store tokens in batches.
This way, if a lot of data has to be parsed, or if R crashes, udpipe can continue from the latest batch instead of starting over.
The caches are stored in the corpustools_data folder (in udpipe_model_path). Only the most recent [udpipe_cache] caches will be stored.}

\item{udpipe_cores}{If udpipe_model is used, this sets the number of parallel cores. If not specified, will use the same number of cores as used by data.table (or limited to OMP_THREAD_LIMIT).}

\item{udpipe_batchsize}{In order to report progress and cache results, texts are parsed with udpipe in batches (of 50 texts by default).
The price is that there will be some overhead for each batch, so for very large jobs it can be faster to increase the batchsize.
If the number of texts divided by the number of parallel cores is lower than the batchsize, the texts are evenly distributed over cores.}

\item{use_parser}{If TRUE, use dependency parser (only if udpipe_model is used)}

\item{remember_spaces}{If TRUE, a column with spaces after each token is included. Enables correct reconstruction of original text and keeps annotations at the level of character positions (e.g., brat) intact.}

\item{verbose}{If TRUE, report progress. Only if x is large enough to require multiple sequential batches}

\item{text_columns}{if x is a data.frame, this specifies the column(s) that contains text. The texts are pasted together in the order specified here.}

\item{doc_column}{If x is a data.frame, this specifies the column with the document ids.}
}
\description{
Create a \link{tCorpus} from raw text input. Input can be a character (or factor) vector, data.frame or quanteda corpus.
If a data.frame is given, all columns other than the document id and text columns are included as meta data. If a quanteda
corpus is given, the ids and texts are already specified, and the docvars will be included in the tCorpus as meta data.
}
\details{
By default, texts will only be tokenized, and basic preprocessing techniques (lowercasing, stemming) can be applied with the
\code{\link{preprocess}} method. Alternatively, the udpipe package can be used to apply more advanced NLP preprocessing, by
using the udpipe_model argument.
}
\examples{
## ...
tc = create_tcorpus(c('Text one first sentence. Text one second sentence', 'Text two'))
tc$tokens

tc = create_tcorpus(c('Text one first sentence. Text one second sentence', 'Text two'),
                    split_sentences = TRUE)
tc$tokens

## with meta (easier to use the S3 method for data.frame)
meta = data.frame(doc_id = c(1,2), source = c('a','b'))
tc = create_tcorpus(c('Text one first sentence. Text one second sentence', 'Text two'),
                    split_sentences = TRUE,
                    doc_id = c(1,2),
                    meta = meta)
tc
d = data.frame(text = c('Text one first sentence. Text one second sentence.',
               'Text two', 'Text three'),
               date = c('2010-01-01','2010-01-01','2012-01-01'),
               source = c('A','B','B'))

tc = create_tcorpus(d, split_sentences = TRUE)
tc
tc$tokens

## use multiple text columns
d$headline = c('Head one', 'Head two', 'Head three')
## use custom doc_id
d$doc_id = c('#1', '#2', '#3')

tc = create_tcorpus(d, text_columns = c('headline','text'), doc_column = 'doc_id',
                    split_sentences = TRUE)
tc
tc$tokens
## It makes little sense to have full texts as factors, but it tends to happen.
## The create_tcorpus S3 method for factors is essentially identical to the
##  method for a character vector.
text = factor(c('Text one first sentence', 'Text one second sentence'))
tc = create_tcorpus(text)
tc$tokens

library(quanteda)
create_tcorpus(data_corpus_inaugural)
}
