Parsing PDFs
Download a PDF, extract the text of one page with PyPDF2, and tokenize it with spaCy.
import sys
!{sys.executable} -m pip install PyPDF2
import PyPDF2
import sys
import io
import requests # $ pip install requests
# Download the PDF and extract the text of one of its pages.
url = 'https://cdn.wealthfront.com/public.email.images/2020_Career-Launching_List_vF.pdf'
# `.content` below reads the whole response body, so streaming the response
# buys nothing here; the timeout keeps a stalled connection from hanging
# the session indefinitely.
pdf = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to the
# PDF parser, which would only produce a confusing parse failure.
pdf.raise_for_status()
pdf_obj = PyPDF2.PdfFileReader(stream=io.BytesIO(pdf.content))
# PyPDF2 page indices are zero-based, so index 1 selects the second page.
page = pdf_obj.getPage(1)
# NOTE: my_extract_text is defined further down in this file; that
# definition has to be executed before this line runs.
page_text = my_extract_text(page)
import spacy
# Run the extracted page text through the "en_core_web_sm" spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp(page_text)
# Materialize the document's tokens as a plain list.
page_words = list(doc)
# Bare expression: shown as the cell output when run interactively.
page_words
def my_extract_text(pdf):
    """Return the text shown by a page's content stream.

    Walks the operations of the page's ``/Contents`` stream and collects the
    string operands of the text-showing operators ``Tj``, ``'``, ``"`` and
    ``TJ``. A newline is emitted for ``T*``, before the text of ``'`` and
    ``"``, and after every ``TJ`` array.

    Args:
        pdf: A page object from PyPDF2's legacy (1.x) API. It must support
            ``pdf["/Contents"]`` and expose the owning reader as ``pdf.pdf``.

    Returns:
        The concatenated text as a ``str``. A page without a ``/Contents``
        entry yields an empty string.
    """
    # Imported locally: the surrounding file only does `import PyPDF2`,
    # which does not bring `b_` or `TextStringObject` into scope.
    from PyPDF2.generic import TextStringObject
    from PyPDF2.pdf import ContentStream
    from PyPDF2.utils import b_

    try:
        content = pdf["/Contents"].getObject()
    except KeyError:
        # No content stream on this page, hence nothing to extract.
        return ""
    if not isinstance(content, ContentStream):
        content = ContentStream(content, pdf.pdf)

    # Build the operator tokens once instead of on every loop iteration.
    op_show = b_("Tj")
    op_next_line = b_("T*")
    op_next_line_show = b_("'")
    op_spacing_next_line_show = b_('"')
    op_show_array = b_("TJ")

    # Note: we check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    parts = []
    for operands, operator in content.operations:
        if operator == op_show:
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                parts.append(shown)
        elif operator == op_next_line:
            parts.append("\n")
        elif operator == op_next_line_show:
            # The line break is emitted even when the operand is skipped.
            parts.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                parts.append(shown)
        elif operator == op_spacing_next_line_show:
            # This operator's string is its third operand.
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                parts.append("\n")
                parts.append(shown)
        elif operator == op_show_array:
            # The array mixes strings with other operands; keep only the
            # text strings.
            parts.extend(
                item for item in operands[0]
                if isinstance(item, TextStringObject)
            )
            parts.append("\n")
    return "".join(parts)