% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/format_text.r
\name{format_text}
\alias{format_text}
\title{Format PDF input text}
\usage{
format_text(
  pdf_text,
  split_pdf = FALSE,
  blank_lines = TRUE,
  remove_hyphen = TRUE,
  convert_sentence = TRUE,
  remove_equations = FALSE,
  split_pattern = "\\\\p{WHITE_SPACE}{3,}",
  ...
)
}
\arguments{
\item{pdf_text}{A list of text from PDF import, most likely from 
\code{pdftools::pdf_text()}. Each element of the list is a unique page of 
text from the PDF.}

\item{split_pdf}{TRUE/FALSE indicating whether to split the PDF using white 
space. This would be most useful with multicolumn PDF files. 
The split_pdf function attempts to recreate the column layout of the text 
into a single column starting with the left column and proceeding to the 
right. Default is FALSE.}

\item{blank_lines}{TRUE/FALSE indicating whether blank text lines should
be removed. Default is TRUE.}

\item{remove_hyphen}{TRUE/FALSE indicating whether hyphenated words should
be adjusted to combine onto a single line. Default is TRUE.}

\item{convert_sentence}{TRUE/FALSE indicating if individual lines of the PDF
file should be collapsed into a single large paragraph to perform keyword 
searching. Default is TRUE.}

\item{remove_equations}{TRUE/FALSE indicating if equations should be removed.
Default is FALSE. Equations are detected by searching for a literal
parenthesis, followed by at least one number, followed by another parenthesis
at the end of the text line. This will not detect other patterns or
detect the entire equation if it is a multi-row equation.}

\item{split_pattern}{Regular expression pattern used to split multicolumn 
PDF files using \code{stringi::stri_split_regex}. 
The default pattern splits on three or more consecutive white space
characters.}

\item{...}{Additional arguments, currently not used.}
}
\description{
Performs some formatting of PDF text upon import.
}
