% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\docType{class}
\name{token_stats}
\alias{token_stats}
\title{token statistics}
\usage{
# utl <- token_stats$new(x_vec = NULL, path_2folder = NULL, path_2file = NULL,
#                               file_delimiter = "\\n", n_gram_delimiter = "_")
}
\description{
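Token statistics (frequency distributions, character counts, collocations, string dissimilarity matrices and n-gram look-up tables) for a character string vector, a file or a folder of files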
}
\details{
the \emph{path_2vector} function reads the words of a \emph{folder} or \emph{file} into a vector ( the \emph{file_delimiter} is used to split the input data ). Typical usage: reading a vocabulary from a text file

the \emph{freq_distribution} function returns the frequency distribution as a named, unsorted vector for EITHER a \emph{folder}, a \emph{file} OR a character string \emph{vector}. A specific subset of the result can be retrieved using the \emph{print_frequency} function

the \emph{count_character} function returns the number of characters for each word of the corpus for EITHER a \emph{folder}, a \emph{file} OR a character string \emph{vector}. The words with a specific number of characters can be retrieved using the \emph{print_count_character} function

the \emph{collocation_words} function returns a co-occurrence frequency table for n-grams for EITHER a \emph{folder}, a \emph{file} OR a character string \emph{vector}. A collocation is defined as a sequence of two or more consecutive words, that has characteristics of a syntactic and semantic unit, and whose exact and unambiguous meaning or connotation cannot be derived directly from the meaning or connotation of its components ( \url{http://nlp.stanford.edu/fsnlp/promo/colloc.pdf}, page 172 ). The input to the function should be text n-grams separated by a delimiter ( for instance 3- or 4-grams ). A specific frequency table can be retrieved using the \emph{print_collocations} function

the \emph{string_dissimilarity_matrix} function returns a string-dissimilarity-matrix using either the \emph{dice}, \emph{levenshtein} or \emph{cosine} distance. The input can be a character string \emph{vector} only. If the method is \emph{dice}, the dice-coefficient (similarity) is calculated between two strings for a specific number of character n-grams ( \emph{dice_n_gram} ).

the \emph{look_up_table} function returns a look-up-list where the list-names are the n-grams and the list-vectors are the words associated with those n-grams. The words for each n-gram can be retrieved using the \emph{print_words_lookup_tbl} function. The input can be a character string \emph{vector} only.
}
\section{Methods}{


\describe{
 \item{\code{token_stats$new(x_vec = NULL, path_2folder = NULL, path_2file = NULL, file_delimiter = "\\n", n_gram_delimiter = "_")}}{}

 \item{\code{--------------}}{}

 \item{\code{path_2vector()}}{}

 \item{\code{--------------}}{}

 \item{\code{freq_distribution()}}{}

 \item{\code{--------------}}{}

 \item{\code{print_frequency(subset = NULL)}}{}

 \item{\code{--------------}}{}

 \item{\code{count_character()}}{}

 \item{\code{--------------}}{}

 \item{\code{print_count_character(number = NULL)}}{}

 \item{\code{--------------}}{}

 \item{\code{collocation_words()}}{}

 \item{\code{--------------}}{}

 \item{\code{print_collocations(word = NULL)}}{}

 \item{\code{--------------}}{}

 \item{\code{string_dissimilarity_matrix(dice_n_gram = 2, method = "dice", split_separator = " ", dice_thresh = 1.0, upper = TRUE, diagonal = TRUE, threads = 1)}}{}

 \item{\code{--------------}}{}

 \item{\code{look_up_table(n_grams = NULL)}}{}

 \item{\code{--------------}}{}

 \item{\code{print_words_lookup_tbl(n_gram = NULL)}}{}
 }
}

\examples{


library(textTinyR)

expl = c('one_word_token', 'two_words_token', 'three_words_token', 'four_words_token')

tk <- token_stats$new(x_vec = expl, path_2folder = NULL, path_2file = NULL)

#-------------------------
# frequency distribution:
#-------------------------

tk$freq_distribution()

# tk$print_frequency()


#------------------
# count characters:
#------------------

cnt <- tk$count_character()

# tk$print_count_character(number = 4)


#----------------------
# collocation of words:
#----------------------

col <- tk$collocation_words()

# tk$print_collocations(word = 'five')


#-----------------------------
# string dissimilarity matrix:
#-----------------------------

dism <- tk$string_dissimilarity_matrix(method = 'levenshtein')


#---------------------
# build a look-up-table:
#---------------------

lut <- tk$look_up_table(n_grams = 3)

# tk$print_words_lookup_tbl(n_gram = 'e_w')
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
\item \href{#method-token_stats-new}{\code{token_stats$new()}}
\item \href{#method-token_stats-path_2vector}{\code{token_stats$path_2vector()}}
\item \href{#method-token_stats-freq_distribution}{\code{token_stats$freq_distribution()}}
\item \href{#method-token_stats-print_frequency}{\code{token_stats$print_frequency()}}
\item \href{#method-token_stats-count_character}{\code{token_stats$count_character()}}
\item \href{#method-token_stats-print_count_character}{\code{token_stats$print_count_character()}}
\item \href{#method-token_stats-collocation_words}{\code{token_stats$collocation_words()}}
\item \href{#method-token_stats-print_collocations}{\code{token_stats$print_collocations()}}
\item \href{#method-token_stats-string_dissimilarity_matrix}{\code{token_stats$string_dissimilarity_matrix()}}
\item \href{#method-token_stats-look_up_table}{\code{token_stats$look_up_table()}}
\item \href{#method-token_stats-print_words_lookup_tbl}{\code{token_stats$print_words_lookup_tbl()}}
\item \href{#method-token_stats-clone}{\code{token_stats$clone()}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-new"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-new}{}}}
\subsection{Method \code{new()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$new(
  x_vec = NULL,
  path_2folder = NULL,
  path_2file = NULL,
  file_delimiter = "\\n",
  n_gram_delimiter = "_"
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{x_vec}}{either NULL or a string character vector}

\item{\code{path_2folder}}{either NULL or a valid path to a folder (each file in the folder should include words separated by a delimiter)}

\item{\code{path_2file}}{either NULL or a valid path to a file}

\item{\code{file_delimiter}}{either NULL or a character string specifying the file delimiter}

\item{\code{n_gram_delimiter}}{either NULL or a character string specifying the n-gram delimiter. It is used in the \emph{collocation_words} function}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-path_2vector"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-path_2vector}{}}}
\subsection{Method \code{path_2vector()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$path_2vector()}\if{html}{\out{</div>}}
}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-freq_distribution"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-freq_distribution}{}}}
\subsection{Method \code{freq_distribution()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$freq_distribution()}\if{html}{\out{</div>}}
}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-print_frequency"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-print_frequency}{}}}
\subsection{Method \code{print_frequency()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$print_frequency(subset = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{subset}}{either NULL or a vector specifying the subset of data to keep (number of rows of the \emph{print_frequency} function)}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-count_character"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-count_character}{}}}
\subsection{Method \code{count_character()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$count_character()}\if{html}{\out{</div>}}
}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-print_count_character"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-print_count_character}{}}}
\subsection{Method \code{print_count_character()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$print_count_character(number = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{number}}{a numeric value for the \emph{print_count_character} function. All words with number of characters equal to the \emph{number} parameter will be returned.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-collocation_words"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-collocation_words}{}}}
\subsection{Method \code{collocation_words()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$collocation_words()}\if{html}{\out{</div>}}
}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-print_collocations"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-print_collocations}{}}}
\subsection{Method \code{print_collocations()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$print_collocations(word = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{word}}{a character string for the \emph{print_collocations} function}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-string_dissimilarity_matrix"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-string_dissimilarity_matrix}{}}}
\subsection{Method \code{string_dissimilarity_matrix()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$string_dissimilarity_matrix(
  dice_n_gram = 2,
  method = "dice",
  split_separator = " ",
  dice_thresh = 1,
  upper = TRUE,
  diagonal = TRUE,
  threads = 1
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{dice_n_gram}}{a numeric value specifying the n-gram for the dice method of the \emph{string_dissimilarity_matrix} function}

\item{\code{method}}{a character string specifying the method to use in the \emph{string_dissimilarity_matrix} function. One of \emph{dice}, \emph{levenshtein} or \emph{cosine}.}

\item{\code{split_separator}}{a character string specifying the string split separator if the method equals \emph{cosine} in the \emph{string_dissimilarity_matrix} function. The \emph{cosine} method uses sentences, so for a sentence: "this_is_a_word_sentence" the \emph{split_separator} should be "_"}

\item{\code{dice_thresh}}{a float number to use to threshold the data if method is \emph{dice} in the \emph{string_dissimilarity_matrix} function. It takes values between 0.0 and 1.0. The closer the thresh is to 0.0 the more values of the dissimilarity matrix will take the value of 1.0.}

\item{\code{upper}}{either TRUE or FALSE. If TRUE then both lower and upper parts of the dissimilarity matrix of the \emph{string_dissimilarity_matrix} function will be shown. Otherwise the upper part will be filled with NA's}

\item{\code{diagonal}}{either TRUE or FALSE. If TRUE then the diagonal of the dissimilarity matrix of the \emph{string_dissimilarity_matrix} function will be shown. Otherwise the diagonal will be filled with NA's}

\item{\code{threads}}{a numeric value specifying the number of cores to use in parallel in the \emph{string_dissimilarity_matrix} function}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-look_up_table"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-look_up_table}{}}}
\subsection{Method \code{look_up_table()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$look_up_table(n_grams = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{n_grams}}{a numeric value specifying the n-grams in the \emph{look_up_table} function}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-print_words_lookup_tbl"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-print_words_lookup_tbl}{}}}
\subsection{Method \code{print_words_lookup_tbl()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$print_words_lookup_tbl(n_gram = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{n_gram}}{a character string specifying the n-gram to use in the \emph{print_words_lookup_tbl} function}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-token_stats-clone"></a>}}
\if{latex}{\out{\hypertarget{method-token_stats-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{token_stats$clone(deep = FALSE)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{deep}}{Whether to make a deep clone.}
}
\if{html}{\out{</div>}}
}
}
}
