import sys
!{sys.executable} -m pip install PyPDF2
Collecting PyPDF2
  Downloading PyPDF2-1.26.0.tar.gz (77 kB)
     |████████████████████████████████| 77 kB 376 kB/s eta 0:00:01
Building wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py) ... done
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-py3-none-any.whl size=61085 sha256=82639c2463cecb7b90989e3a0b08869ffaa787f48ba7fbfa146d7884b60c6c3d
  Stored in directory: /Users/home0/Library/Caches/pip/wheels/80/1a/24/648467ade3a77ed20f35cfd2badd32134e96dd25ca811e64b3
Successfully built PyPDF2
Installing collected packages: PyPDF2
Successfully installed PyPDF2-1.26.0
import PyPDF2
import sys
import io
import requests # $ pip install requests

# Download the PDF fully into memory and open it with PyPDF2.
url = 'https://cdn.wealthfront.com/public.email.images/2020_Career-Launching_List_vF.pdf'
# `.content` below reads the entire body, so streaming the response (and
# tweaking `raw.decode_content`) had no effect and is omitted.
pdf = requests.get(url)
# Fail here with a clear HTTP error instead of letting PdfFileReader choke on
# an HTML error page.
pdf.raise_for_status()
pdf_obj = PyPDF2.PdfFileReader(stream=io.BytesIO(pdf.content))
def my_extract_text(pdf):
    """Return the text shown by one PDF page's content stream.

    Walks the page's content-stream operations and collects the string
    operands of the text-showing operators ``Tj``, ``'``, ``"`` and ``TJ``.
    A newline is emitted for ``T*``, before the text of ``'`` and ``"``, and
    after every ``TJ`` array.

    Args:
        pdf: The page to read (despite the name, this is a page, not a
            reader). It must support ``pdf["/Contents"]`` and expose its
            owning reader as ``pdf.pdf``.

    Returns:
        The collected text as a ``str``; empty when the page has no
        ``/Contents`` entry.
    """
    # The file only does `import PyPDF2`, which does not bring these two
    # names into scope; import them here so the function is self-contained.
    from PyPDF2.generic import TextStringObject
    from PyPDF2.utils import b_

    # A page without a content stream draws nothing.
    if "/Contents" not in pdf:
        return ""

    content = pdf["/Contents"].getObject()
    if not isinstance(content, PyPDF2.pdf.ContentStream):
        content = PyPDF2.pdf.ContentStream(content, pdf.pdf)

    # Operator tokens, converted once instead of on every operation.
    op_show = b_("Tj")
    op_next_line = b_("T*")
    op_next_line_show = b_("'")
    op_spaced_next_line_show = b_('"')
    op_show_array = b_("TJ")

    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    pieces = []
    for operands, operator in content.operations:
        if operator == op_show:
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == op_next_line:
            pieces.append("\n")
        elif operator == op_next_line_show:
            # The line break is emitted even when the string is skipped.
            pieces.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == op_spaced_next_line_show:
            # The string is the third operand of this operator.
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                pieces.append("\n")
                pieces.append(shown)
        elif operator == op_show_array:
            # The array mixes strings with non-string entries; keep only
            # the text strings.
            pieces.extend(
                item for item in operands[0]
                if isinstance(item, TextStringObject)
            )
            pieces.append("\n")
    return "".join(pieces)
import spacy
# Tokenise the text of every page and collect all spaCy tokens in `words`.
words = []
# Loading the model is expensive and does not depend on the page, so do it
# once rather than once per page.
nlp = spacy.load("en_core_web_sm")
for page_index in range(pdf_obj.numPages):
    page = pdf_obj.getPage(page_index)
    page_text = my_extract_text(page)
    doc = nlp(page_text)
    page_words = list(doc)
    words.extend(page_words)
Apartment List
apartmentlist.com
App Annie
appannie.com
AppZen
appzen.com
Arctic Wolf Networks
arcticwolf.com
Asana
asana.com
Avetta
avetta.com
AxiomSL
axiomsl.com
B	Platform that connects renters with highly personalized rental listings
San Francisco, CA
Business intelligence solutions
San Francisco, CA
AI platform for finance teams
San Jose, CA
Managed firebreak detection & response security service
Sunnyvale, CA
Work-management software for teams
San Francisco, CA
Cloud-based supply chain risk management platform
Irvine, CA
Provider of risk data management and regulatory reporting solutions
New York, NY
# Neither list is referenced in the visible cells; the entries match header
# fragments seen in the extracted tokens, so these are presumably meant for
# filtering them out -- TODO confirm against the cell that consumes them.
ignore = [
    "Wealthfront",
    "™",
    "s",
    "Career",
    "",
    "-Launching",
    "Companies",
    "List2020",
]
delete = [
    "EDITION",
]
words[-100:]
[and,
 investment,
 technology,
 New,
 York,
 ,,
 NYCorporate,
 travel,
 management,
 Palo,
 Alto,
 ,,
 CACar,
 sharing,
 rental,
 marketplace,
 San,
 Francisco,
 ,,
 CA,
 https://www.zest.ai/https://www.xant.ai/Wealthfront™s,
 Career,
 -,
 Launching,
 Companies,
 List2020,
 EDITIONXxant.aixebialabs.comSales,
 acceleration,
 platform,
 Provo,
 ,,
 UTContinuous,
 delivery,
 and,
 DevOps,
 software,
 Boston,
 ,,
 MAYyapstone.comElectronic,
 payments,
 for,
 online,
 marketplaces,
 Walnut,
 Creek,
 ,,
 CAZzest.aizola.comzumper.comMachine,
 learning,
 and,
 AI,
 tools,
 for,
 financial,
 institutions,
 Burbank,
 ,,
 CAWedding,
 registryNew,
 York,
 ,,
 NYApartment,
 rental,
 marketplace,
 San,
 Francisco,
 ,,
 CAwatermarkinsights.comwealthfront.comwebflow.comwordpress.orgEducational,
 intelligence,
 software,
 for,
 colleges,
 and,
 universities˚,
 New,
 York,
 ,,
 NYNextgen,
 banking,
 service,
 Palo,
 Alto,
 ,,
 CADesign,
 and,
 hosting,
 platform,
 San,
 Francisco,
 ,,
 CAOpen,
 -,
 source,
 CMS,
 &,
 blog,
 software,
 San,
 Francisco,
 ,,
 CA]
words
[Companies,
 List2020,
 EDITIONCcapsulecares.comcarbon3d.comcarta.comchainalysis.comchartboost.comcheckr.comchimebank.comcloudbees.comcloverhealth.comcoalitioninc.comcode42.comcohesity.comcollectivehealth.comcollibra.comOnline,
 pharmacyNew,
 York,
 ,,
 NY3D,
 manufacturing,
 Redwood,
 City,
 ,,
 CACap,
 table,
 management,
 &,
 compliance˚,
 Palo,
 Alto,
 ,,
 CASystem,
 for,
 blockchain,
 analysis,
 New,
 York,
 ,,
 NYIn,
 -,
 app,
 monetization,
 and,
 programmatic,
 advertising,
 platform,
 San,
 Francisco,
 ,,
 CAProfessional,
 background,
 checks,
 San,
 Francisco,
 ,,
 CAOnline,
 fee,
 -,
 free,
 full,
 -,
 service,
 bank,
 account,
 San,
 Francisco,
 ,,
 CAContinuous,
 deployment,
 software,
 San,
 Jose,
 ,,
 CAHealth,
 insurance,
 Jersey,
 City,
 ,,
 NJCybersecurity,
 tools,
 San,
 Francisco,
 ,,
 CAData,
 loss,
 protection,
 ,,
 visibility,
 ,,
 and,
 recovery,
 solutions,
 Minneapolis,
 ,,
 MNHyperconverged,
 secondary,
 storage,
 San,
 Jose,
 ,,
 CAAlternative,
 to,
 traditional,
 health,
 insurance,
 San,
 Francisco,
 ,,
 CADataNew,
 York,
 ,,
 NY]