% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/pos.R
\name{pos}
\alias{pos}
\alias{pos_by}
\alias{pos_tags}
\title{Parts of Speech Tagging}
\usage{
pos(text.var, parallel = FALSE, cores = detectCores()/2,
  progress.bar = TRUE, na.omit = FALSE, digits = 1, percent = TRUE,
  zero.replace = 0, gc.rate = 10)

pos_by(text.var, grouping.var = NULL, digits = 1, percent = TRUE,
  zero.replace = 0, ...)

pos_tags(type = "pretty")
}
\arguments{
\item{text.var}{The text variable.}

\item{parallel}{logical.  If \code{TRUE} attempts to run the function on
multiple cores.  Note that this may not mean a speed boost if you have one
core or if the data set is smaller as the cluster takes time to create.}

\item{cores}{The number of cores to use if \code{parallel = TRUE}.  Default
is half the number of available cores.}

\item{progress.bar}{logical.  If \code{TRUE} attempts to provide an
OS-appropriate progress bar.  If parallel is \code{TRUE} this argument is
ignored. Note that setting this argument to \code{TRUE} may slow down the
function.}

\item{na.omit}{logical.  If \code{TRUE} missing values (\code{NA}) will be
omitted.}

\item{digits}{Integer; number of decimal places to round when printing.}

\item{percent}{logical.  If \code{TRUE} output given as percent.  If
\code{FALSE} the output is proportion.}

\item{zero.replace}{Value to replace 0 values with.}

\item{gc.rate}{An integer value.  This is a necessary argument because of a
problem with the garbage collection in the openNLP function that
\code{\link[qdap]{pos}} wraps.  Consider adjusting this argument upward if
the error \code{java.lang.OutOfMemoryError} occurs.}

\item{grouping.var}{The grouping variables.  Default \code{NULL} generates
one word list for all text.  Also takes a single grouping variable or a list
of 1 or more grouping variables.}

\item{type}{An optional character string giving the output format of the pos
tags.  This must be one of the strings \code{"pretty"} (a left justified
version of the output optimized for viewing but not good for export),
\code{"matrix"} (a matrix version of the output), \code{"dataframe"} or
\code{"df"} (a dataframe version of the output), or \code{"all"} (a list of
all three of the previous output types).}

\item{\ldots}{Other arguments supplied to \code{pos}.}
}
\value{
\code{pos} -  returns a list of 7:
\item{text}{The original text}
\item{POStagged}{The original words replaced with parts of speech in context.}
\item{POSprop}{Dataframe of the proportion of parts of speech by row.}
\item{POSfreq}{Dataframe of the frequency of parts of speech by row.}
\item{POSrnp}{Dataframe of the frequency and proportions of parts of speech
by row.}
\item{percent}{The value of percent used for plotting purposes.}
\item{zero.replace}{The value of zero.replace used for plotting purposes.}

\code{pos_by} -  returns a list of 10:
\item{text}{The original text}
\item{POStagged}{The original words replaced with parts of speech in context.}
\item{POSprop}{Dataframe of the proportion of parts of speech by row.}
\item{POSfreq}{Dataframe of the frequency of parts of speech by row.}
\item{POSrnp}{Dataframe of the frequency and proportions of parts of speech
by row.}
\item{pos.by.prop}{Dataframe of the proportion of parts of speech by grouping
variable.}
\item{pos.by.freq}{Dataframe of the frequency of parts of speech by grouping
variable.}
\item{pos.by.rnp}{Dataframe of the frequency and proportions of parts of
speech by grouping variable.}
\item{percent}{The value of percent used for plotting purposes.}
\item{zero.replace}{The value of zero.replace used for plotting purposes.}
}
\description{
\code{pos} - Apply part of speech tagger to transcript(s).

\code{pos_by} - Apply part of speech tagger to transcript(s) by zero or more
grouping variable(s).

\code{pos_tags} - Useful for interpreting the parts of speech tags created by
\code{pos} and \code{pos_by}.
}
\note{
Note that contractions are treated as two words; for example the word
count on \bold{"what's"} is 2 for \bold{"what + is"}.  This is not consistent
with the \code{\link[qdap]{word_count}} treatment of contractions but makes
sense in a part of speech framework where a phrase such as "She's cool" is
treated as a pronoun, verb and adjective respectively for "She + is + cool".
}
\examples{
\dontrun{
## Basic usage: apply the part of speech tagger to each row of DATA$state
posdat <- pos(DATA$state)
ltruncdf(posdat, 7, 4)
## str(posdat)
names(posdat)
posdat$text           #original text

## Methods
preprocessed(posdat)  #words replaced with parts of speech
counts(posdat)        #frequency of parts of speech by row
proportions(posdat)   #proportion of parts of speech by row

## Methods Plotting
plot(preprocessed(posdat))
plot(counts(posdat))
plot(proportions(posdat))
plot(posdat)

## Same tagging on multiple cores; creating the cluster takes time, so this
## may not be faster on a small data set or a single core machine
out1 <- pos(DATA$state, parallel = TRUE) # not always useful
ltruncdf(out1, 7, 4)

#use pos_tags to interpret part of speech tags used by pos & pos_by
#the argument selects the output type: pretty (default), matrix,
#dataframe/df, or all
pos_tags()[1:10, ]
pos_tags("matrix")[1:10, ]
pos_tags("dataframe")[1:10, ]
pos_tags("df")[1:10, ]
ltruncdf(pos_tags("all"), 3)

## pos_by: tag DATA$state and summarize by a single grouping variable (sex)
posbydat <- with(DATA, pos_by(state, sex))
names(posbydat)

## Methods
scores(posbydat)
preprocessed(posbydat)
counts(posbydat)
proportions(posbydat)

## Methods Plotting
plot(preprocessed(posbydat))
plot(counts(posbydat))
plot(proportions(posbydat))
plot(posbydat)

ltruncdf(posbydat, 7, 4)
truncdf(posbydat$pos.by.prop, 4)

## Several grouping variables are supplied as a list
POSby <- with(DATA, pos_by(state, list(adult, sex)))
plot(POSby, values = TRUE, digits = 2)
#or more quickly - pass the earlier pos_by output (posbydat) in place of the
#raw text so the tagging is reused
out2 <- with(DATA, pos_by(posbydat, list(adult, sex)))

## Definite/Indefinite Noun
## 2 approaches compared...
## The latter is more efficient but less accurate

## -----------------------##
## Part of speech tagging ##
## -----------------------##
## pos_after: for every row of tagged text, return the first token carrying
## one of the tags in pos that comes after each occurrence of a word in words.
##   text.var - list-like object whose POStagged element has a POStagged
##              column of word/TAG tokens separated by white space
##   words    - character vector of trigger words, for example articles
##   pos      - character vector of tags to look for after a trigger word
## Returns a list with one character vector per row; NA_character_ when the
## row has no trigger word or no matching tag after any trigger word.
pos_after <- function(text.var, words, pos){

    rows <- as.character(text.var[["POStagged"]][["POStagged"]])
    tokens <- strsplit(rows, "[[:space:]]+")

    ## Split each token on its LAST slash so that a word which itself
    ## contains a slash, such as 1/2/CD, keeps its tag aligned
    tagged <- lapply(tokens, function(tok) {
        setNames(sub("/[^/]*$", "", tok), sub("^.*/", "", tok))
    })

    lapply(tagged, function(x){
        ## positions of the trigger words in this row
        locs <- which(is.element(x, words))
        if (length(locs) == 0L) return(NA_character_)

        ## positions of the tokens that carry one of the requested tags
        hits <- which(is.element(names(x), pos))

        ## first tagged position strictly after each trigger word; a trigger
        ## with nothing after it gives NA instead of min() over an empty set
        nxt <- vapply(locs, function(i) {
            later <- hits[hits > i]
            if (length(later) == 0L) NA_integer_ else later[1L]
        }, integer(1))
        nxt <- unique(nxt[!is.na(nxt)])

        if (length(nxt) == 0L) return(NA_character_)
        unname(x[nxt])
    })
}

## ggplot2 functions are called unqualified at the end of this section,
## so the package has to be attached before that point
library(ggplot2)

## For each article group (a/an and the) collect the nouns that pos_after
## finds after the article in rajPOS (presumably stored pos output for the
## raj data - confirm), tabulate them, and keep nouns seen more than 3 times
out2 <- setNames(lapply(list(a=c("a", "an"), the="the"), function(x) {
    o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS"))
    m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word")
    m[m$freq> 3, ]
}), c("a", "the"))


## Join the two frequency tables on word; all = TRUE keeps words that
## occur with only one of the articles
dat2 <- setNames(Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)}, out2), c("Word", "A", "THE"))

## Long format: one row per Word/Article pair
dat2 <- reshape2::melt(dat2, id="Word", variable.name="Article", value.name="freq")

dat2 <- dat2[order(dat2$freq, dat2$Word), ]

## Total frequency per word, used to order the factor levels for plotting
ord2 <- aggregate(freq ~ Word, dat2, sum)

dat2$Word <- factor(dat2$Word, levels=ord2[order(ord2[[2]]), 1])
rownames(dat2) <- NULL
ggplot(dat2, aes(x=freq, y=Word)) +
    geom_point()+ facet_grid(~Article) +
    ggtitle("Part Of Speech Parsing Approach")

## open a new graphics device so the next plot does not replace this one
dev.new()

## --------------------##
## Regular Expressions ##
## --------------------##

library(qdapRegex)
library(ggplot2)
library(reshape2)

## For each pattern name (@after_a and @after_the, presumably named
## patterns supplied by qdapRegex - confirm) extract the matches from the
## lower cased raj dialogue, tabulate them, and keep words seen more than
## 3 times
out <- setNames(lapply(c("@after_a", "@after_the"), function(x) {
    ## stri_trans_tolower is an exported stringi function, so the public
    ## :: operator is used rather than :::
    o <- rm_default(stringi::stri_trans_tolower(raj$dialogue),
        pattern = x, extract=TRUE)
    m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word")
    m[m$freq> 3, ]
}), c("a", "the"))


## Join the two frequency tables on word; all = TRUE keeps words that
## occur with only one of the articles
dat <- setNames(Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)}, out), c("Word", "A", "THE"))

## Long format: one row per Word/Article pair
dat <- reshape2::melt(dat, id="Word", variable.name="Article", value.name="freq")

dat <- dat[order(dat$freq, dat$Word), ]

## Total frequency per word, used to order the factor levels for plotting
ord <- aggregate(freq ~ Word, dat, sum)

dat$Word <- factor(dat$Word, levels=ord[order(ord[[2]]), 1])
rownames(dat) <- NULL
ggplot(dat, aes(x=freq, y=Word)) +
    geom_point()+ facet_grid(~Article) +
    ggtitle("Regex Approach")
}
}
\references{
\href{http://opennlp.apache.org}{openNLP}
}
\seealso{
\code{\link[openNLP]{Maxent_POS_Tag_Annotator}},
\code{\link[qdap]{colcomb2class}}
}
\keyword{parts-of-speech}

