import sys
!{sys.executable} -m pip install PyPDF2
Collecting PyPDF2
  Downloading PyPDF2-1.26.0.tar.gz (77 kB)
     |████████████████████████████████| 77 kB 376 kB/s eta 0:00:01
Building wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py) ... done
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-py3-none-any.whl size=61085 sha256=82639c2463cecb7b90989e3a0b08869ffaa787f48ba7fbfa146d7884b60c6c3d
  Stored in directory: /Users/home0/Library/Caches/pip/wheels/80/1a/24/648467ade3a77ed20f35cfd2badd32134e96dd25ca811e64b3
Successfully built PyPDF2
Installing collected packages: PyPDF2
Successfully installed PyPDF2-1.26.0
import PyPDF2
import sys
import io
import requests # $ pip install requests

# Source document: Wealthfront's 2020 Career-Launching Companies list (PDF).
url = 'https://cdn.wealthfront.com/public.email.images/2020_Career-Launching_List_vF.pdf'

# Download the whole document in one request.  `.content` reads and decodes
# the full body regardless, so streaming mode and the raw `decode_content`
# flag are unnecessary.  A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being handed to the PDF parser.
pdf = requests.get(url, timeout=30)
pdf.raise_for_status()

# Parse the PDF from memory; getPage() is zero-indexed, so 1 is the second page.
pdf_obj = PyPDF2.PdfFileReader(stream=io.BytesIO(pdf.content))
page = pdf_obj.getPage(1)
# NOTE: my_extract_text is defined further down in this file; its cell must
# have been executed before this one.
page_text = my_extract_text(page)

# Tokenize the extracted text with spaCy's small English pipeline.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(page_text)
page_words = list(doc)
page_words
[Wealthfront,
 ™,
 s,
 Career,
 -,
 Launching,
 Companies,
 List2020,
 EDITIONBapartmentlist.comappannie.comappzen.comarcticwolf.comasana.comavetta.comaxiomsl.comPlatform,
 that,
 connects,
 renters,
 with,
 highly,
 personalized,
 rental,
 listingsSan,
 Francisco,
 ,,
 CABusiness,
 intelligence,
 solutionsSan,
 Francisco,
 ,,
 CAAI,
 platform,
 for,
 finance,
 teamsSan,
 Jose,
 ,,
 CAManaged,
 firebreak,
 detection,
 &,
 response,
 security,
 serviceSunnyvale,
 ,,
 CAWork,
 -,
 management,
 software,
 for,
 teamsSan,
 Francisco,
 ,,
 CACloud,
 -,
 based,
 supply,
 chain,
 risk,
 management,
 platformIrvine,
 ,,
 CAProvider,
 of,
 risk,
 data,
 management,
 and,
 regulatory,
 reporting,
 solutionsNew,
 York,
 ,,
 NYbenchling.combettercloud.comblend.combraze.combrex.comgetbuilt.comSaaS,
 for,
 life,
 sciences,
 R&D,
 San,
 Francisco,
 ,,
 CASaaS,
 operations,
 management,
 platformNew,
 York,
 ,,
 NYWhite,
 -,
 label,
 consumer,
 lending,
 ecosystem,
 San,
 Francisco,
 ,,
 CACustomer,
 engagement,
 platform,
 New,
 York,
 ,,
 NYCredit,
 cards,
 &,
 cash,
 management,
 accounts,
 for,
 startups,
 San,
 Francisco,
 ,,
 CAConstruction,
 vertical,
 SaaSNashville,
 ,,
 TN]
def my_extract_text(pdf):
    """Return the text shown by a PDF page's content stream.

    Walks the page's content-stream operations and concatenates the string
    operands of the text-showing operators (``Tj``, ``'``, ``"``, ``TJ``).
    A newline is emitted for ``T*``, before the text of ``'`` and ``"``,
    and after each ``TJ`` array.  No separator is inserted between
    consecutive ``Tj`` strings.

    Args:
        pdf: A PyPDF2 page object (for example the result of
            ``PdfFileReader.getPage``).  It must support
            ``pdf["/Contents"]`` and expose the owning reader as
            ``pdf.pdf``.

    Returns:
        str: The concatenated text, or ``""`` if the stream shows no text.
    """
    # The file only does ``import PyPDF2``; these helpers were previously
    # referenced as bare names and raised NameError, so import them here.
    from PyPDF2.generic import TextStringObject
    from PyPDF2.utils import b_

    content = pdf["/Contents"].getObject()
    if not isinstance(content, PyPDF2.pdf.ContentStream):
        content = PyPDF2.pdf.ContentStream(content, pdf.pdf)

    # Operator tokens, built once instead of on every loop iteration.
    op_show = b_("Tj")              # show a string
    op_next_line = b_("T*")         # move to the start of the next line
    op_line_show = b_("'")          # next line, then show a string
    op_spaced_line_show = b_('"')   # set spacing, next line, show a string
    op_show_array = b_("TJ")        # show strings from an array

    # Note: only TextStringObjects are kept.  ByteStringObjects are strings
    # where the byte->string encoding was unknown, so adding them to the
    # text here would be gibberish.
    parts = []
    for operands, operator in content.operations:
        if operator == op_show:
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                parts.append(shown)
        elif operator == op_next_line:
            parts.append("\n")
        elif operator == op_line_show:
            # The line break is emitted even when the string is skipped.
            parts.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                parts.append(shown)
        elif operator == op_spaced_line_show:
            # Operands are (word spacing, char spacing, string); here the
            # line break is emitted only when the string is kept.
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                parts.append("\n")
                parts.append(shown)
        elif operator == op_show_array:
            # The array mixes strings with numeric position adjustments;
            # keep only the strings.
            parts.extend(
                item for item in operands[0]
                if isinstance(item, TextStringObject)
            )
            parts.append("\n")
    return "".join(parts)