% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_lookup.R
\name{dfm_lookup}
\alias{dfm_lookup}
\title{Apply a dictionary to a dfm}
\usage{
dfm_lookup(x, dictionary, levels = 1:5, exclusive = TRUE,
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  capkeys = !exclusive, verbose = TRUE)
}
\arguments{
\item{x}{the dfm to which the dictionary will be applied}

\item{dictionary}{a \link{dictionary} class object}

\item{levels}{levels of entries in a hierarchical dictionary that will be
applied}

\item{exclusive}{if \code{TRUE}, remove all features not in dictionary;
otherwise, replace features that match dictionary values with their keys
while leaving other features unaffected}

\item{valuetype}{how to interpret keyword expressions: \code{"glob"} for 
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}

\item{case_insensitive}{ignore the case of dictionary values if \code{TRUE}}

\item{capkeys}{if \code{TRUE}, convert dictionary keys to
uppercase to distinguish them from other features}

\item{verbose}{print status messages if \code{TRUE}}
}
\description{
Apply a dictionary to a dfm by looking up all dfm features for matches in
a set of \link{dictionary} values, and replace those features with a
count of the dictionary's keys.  If \code{exclusive = FALSE} then the
behaviour is to apply a "thesaurus" where each value match is replaced by 
the dictionary key, converted to capitals if \code{capkeys = TRUE} (so that 
the replacements are easily distinguished from features that were terms
found originally in the document).
}
\note{
\code{dfm_lookup} should not be used with dictionaries containing
multi-word values, because dfm features will already have been fixed using
a specific ngram value which may not match the multi-word structure of the
dictionary.
}
\examples{
# Build a dictionary with five keys: two plain word lists, one glob pattern,
# one regular expression, and one list containing an underscore-joined term.
# The value "notincorpus" does not occur in the example texts below.
myDict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
                          opposition = c("Opposition", "reject", "notincorpus"),
                          taxglob = "tax*",
                          taxregex = "tax.+$",
                          country = c("United_States", "Sweden")))
# Create a dfm from two short texts, removing English stopwords, and print it
myDfm <- dfm(c("My Christmas was ruined by your opposition tax plan.", 
               "Does the United_States or Sweden have more progressive taxation?"),
             remove = stopwords("english"), verbose = FALSE)
myDfm

# glob format: dictionary values are read as wildcard patterns; the second
# call repeats the lookup with case-sensitive matching
dfm_lookup(myDfm, myDict, valuetype = "glob")
dfm_lookup(myDfm, myDict, valuetype = "glob", case_insensitive = FALSE)

# regex v. glob format: note that "united_states" is a regex match for "tax*"
# (as a regular expression, "tax*" means "ta" followed by zero or more "x",
# and "ta" occurs inside "united_states")
dfm_lookup(myDfm, myDict, valuetype = "glob")
dfm_lookup(myDfm, myDict, valuetype = "regex", case_insensitive = TRUE)

# fixed format: no pattern matching, values are compared literally; the first
# call ignores case (the default) and the second is case-sensitive
dfm_lookup(myDfm, myDict, valuetype = "fixed")
dfm_lookup(myDfm, myDict, valuetype = "fixed", case_insensitive = FALSE)
}
\keyword{dfm}
