% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read_pdf.R
\name{read_pdf}
\alias{read_pdf}
\title{Read a Portable Document Format (PDF) File into R}
\usage{
read_pdf(file, skip = 0, remove.empty = TRUE, trim = TRUE, ocr = TRUE, ...)
}
\arguments{
\item{file}{A path to a PDF file.}

\item{skip}{Integer; the number of lines of the data file to skip before
beginning to read data.}

\item{remove.empty}{logical.  If \code{TRUE} empty elements in the vector are
removed.}

\item{trim}{logical.  If \code{TRUE} the leading/trailing white space is
removed.}

\item{ocr}{logical.  If \code{TRUE}, documents for which
\link[pdftools:pdftools]{pdftools::pdf_text()} returns no text will be re-read using OCR via the
\code{\link[tesseract:ocr]{tesseract::ocr()}} function.  This will create temporary .png
files and will require much more compute time.}

\item{...}{Other arguments passed to \link[pdftools:pdftools]{pdftools::pdf_text()}.}
}
\value{
Returns a \code{\link[base:data.frame]{base::data.frame()}} with the page number
(\code{page_id}), line number (\code{element_id}), and the \code{text}.
}
\description{
A wrapper for \link[pdftools:pdftools]{pdftools::pdf_text()} to read PDFs into \pkg{R}.
}
\note{
A word of caution from \href{http://stackoverflow.com/a/9187015/1000343}{Carl Witthoft}:
"Just a warning to others who may be hoping to extract data: PDF is a
container, not a format. If the original document does not contain actual
text, as opposed to bitmapped images of text or possibly even uglier things
than I can imagine, nothing other than OCR can help you."  If the reader has
OCR needs, the \pkg{tesseract} package, available on CRAN
(\url{https://CRAN.R-project.org/package=tesseract}), is an "OCR engine with
Unicode (UTF-8) support" and may be of use.
}
\examples{
pdf_dat <- read_pdf(
    system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr")
)

pdf_dat_b <- read_pdf(
    system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr"),
    skip = 1
)

\dontrun{
library(textshape)
system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr") \%>\%
    read_pdf(1) \%>\%
    `[[`('text') \%>\%
    head(-1) \%>\%
    textshape::combine() \%>\%
    gsub("([A-Z])( )([A-Z])", "\\\\1_\\\\3", .) \%>\%
    strsplit("(-| )(?=[A-Z_]+:)", perl=TRUE) \%>\%
    `[[`(1) \%>\%
    textshape::split_transcript()
}

\dontrun{
## An image-based .pdf file returns no text when read without OCR.  Using the
## tesseract package as a backend for OCR overcomes this problem.

## Non-ocr
read_pdf(
    system.file("docs/McCune2002Choi2010.pdf", package = "textreadr"),
    ocr = FALSE
)

read_pdf(
    system.file("docs/McCune2002Choi2010.pdf", package = "textreadr"),
    ocr = TRUE
)
}
}
\keyword{pdf}
