% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_pdf_text.R
\name{get_pdf_text}
\alias{get_pdf_text}
\title{Obtain Text from a PDF Document}
\usage{
get_pdf_text(input, output = tempfile(fileext = ".txt"), dpi = 600, psm = 1)
}
\arguments{
\item{input}{'character' string.
File path to PDF document.}

\item{output}{'character' string.
Location to write the text file.}

\item{dpi}{'integer' number between 100 and 1200.
Resolution of the image in dots per inch (DPI), that is, the number of pixels per inch.
For optimal optical character recognition (OCR) accuracy, 600 DPI (the default) is recommended.}

\item{psm}{'integer' number between 0 and 13.
Page Segmentation Mode (PSM).
Describes the layout of the text you are trying to extract.
PSM 1 (the default) automatically segments the page into different text areas
and also detects the orientation and script of the text,
which makes it suitable for processing two columns of text.}
}
\value{
Returns the path to the text file.
Each page from the PDF is transcribed as a separate line in the file.
}
\description{
Obtain text from any PDF document.
Requires that the \pkg{pdftools} and \pkg{tesseract} packages are available.
}
\examples{
\dontrun{
  input <- system.file("extdata", "test.pdf", package = "inlpubs")
  path <- get_pdf_text(input)

  unlink(path)
}
}
\seealso{
\code{\link{add_content}} function to add texts to the \pkg{inlpubs}-package corpus.
}
\author{
J.C. Fisher, U.S. Geological Survey, Idaho Water Science Center
}
