Parsing PDFs
Download a PDF, extract its text page by page with PyPDF2, and tokenize the result with spaCy.
import sys
!{sys.executable} -m pip install PyPDF2
import PyPDF2
import sys
import io
import requests # $ pip install requests
# Location of the PDF that the rest of the script parses.
url = 'https://cdn.wealthfront.com/public.email.images/2020_Career-Launching_List_vF.pdf'

# Download the whole document in one go. `.content` already yields the fully
# decoded body, so streaming the response and toggling `raw.decode_content`
# is unnecessary. The timeout keeps a stalled connection from hanging forever.
pdf = requests.get(url, timeout=30)

# Fail loudly on 4xx/5xx instead of handing an HTML error page to the PDF
# parser, which would otherwise surface as a confusing parse error.
pdf.raise_for_status()

# PdfFileReader needs a seekable stream, so wrap the downloaded bytes.
pdf_obj = PyPDF2.PdfFileReader(stream=io.BytesIO(pdf.content))
def my_extract_text(pdf):
    """Collect the text shown by a page's content stream.

    Walks the operations of the page's ``/Contents`` stream and gathers the
    operands of the text-showing operators (``Tj``, ``'``, ``"`` and ``TJ``).
    A newline is emitted for ``T*``, before the operand of ``'``, before a
    text operand of ``"``, and after every ``TJ`` array.

    Args:
        pdf: A page-like object that supports ``pdf["/Contents"]`` and has a
            ``pdf`` attribute (the script passes the result of
            ``pdf_obj.getPage(...)``).

    Returns:
        The collected text as one string. An empty string is returned when
        the page has no ``/Contents`` entry.
    """
    # The file only imports the top-level ``PyPDF2`` package, so the string
    # type used for the isinstance checks is imported here to keep the
    # function self-contained.
    from PyPDF2.generic import TextStringObject

    # A page without a content stream has nothing to extract.
    if "/Contents" not in pdf:
        return ""

    content = pdf["/Contents"].getObject()
    if not isinstance(content, PyPDF2.pdf.ContentStream):
        content = PyPDF2.pdf.ContentStream(content, pdf.pdf)

    # Note: we check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    parts = []
    for operands, operator in content.operations:
        # Operators are compared against bytes literals, so the PyPDF2
        # ``b_`` helper (never imported in this file) is not needed.
        if operator == b"Tj":
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                parts.append(_text)
        elif operator == b"T*":
            parts.append("\n")
        elif operator == b"'":
            # The line break is emitted even when the operand is skipped.
            parts.append("\n")
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                parts.append(_text)
        elif operator == b'"':
            # Here the line break is only emitted together with the text.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                parts.append("\n")
                parts.append(_text)
        elif operator == b"TJ":
            # The operand is an array; only its text entries are kept.
            parts.extend(
                item for item in operands[0]
                if isinstance(item, TextStringObject)
            )
            parts.append("\n")
    return "".join(parts)
import spacy
# Tokens from every page of the document, in page order.
words = []

# Loading the spaCy pipeline does not depend on the page, so do it once
# up front instead of reloading the model on every iteration.
nlp = spacy.load("en_core_web_sm")

for page_number in range(pdf_obj.numPages):
    # Keep the index and the page object in separate names so the loop
    # variable is not overwritten mid-iteration.
    page = pdf_obj.getPage(page_number)
    page_text = my_extract_text(page)
    doc = nlp(page_text)
    # Iterating a spaCy Doc yields its tokens.
    page_words = list(doc)
    words.extend(page_words)
Apartment List
apartmentlist.com
App Annie
appannie.com
AppZen
appzen.com
Arctic Wolf Networks
arcticwolf.com
Asana
asana.com
Avetta
avetta.com
AxiomSL
axiomsl.com
B Platform that connects renters with highly personalized rental listings
San Francisco, CA
Business intelligence solutions
San Francisco, CA
AI platform for finance teams
San Jose, CA
Managed firebreak detection & response security service
Sunnyvale, CA
Work-management software for teams
San Francisco, CA
Cloud-based supply chain risk management platform
Irvine, CA
Provider of risk data management and regulatory reporting solutions
New York, NY
# Token strings collected for later filtering.
# NOTE(review): neither list is referenced anywhere in the visible code;
# presumably they are meant to be applied to `words` -- confirm.
ignore = [
    "Wealthfront",
    "™",
    "s",
    "Career",
    "",
    "-Launching",
    "Companies",
    "List2020",
]
delete = ["EDITION"]

# Notebook-style inspection expressions: the last 100 tokens, then the full
# token list. As plain script statements they are evaluated and discarded.
words[-100:]
words