HTTP Error 403: Forbidden with Tabula/Requests

710 Views Asked by At

I am getting the error "urllib.error.HTTPError: HTTP Error 403: Forbidden" when reading a PDF with Tabula. Is there a way to fix this? The script below has worked correctly for most of this year:

import tabula
from bs4 import BeautifulSoup
import requests

# Download the WHO situation-reports index page and parse its HTML.
url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'
index_page = requests.get(url)
document = BeautifulSoup(index_page.content, 'lxml')

# Walk the anchors in document order and keep the href of the first one
# whose text mentions a situation report.
for anchor in document.find_all('a'):
    if 'Situation report' not in anchor.text:
        continue
    file_path = anchor['href']
    break

# Build the PDF address from the scraped href.
latest_report = f'https://who.int/{file_path}'
file = latest_report

# tabula is handed the URL itself here; this is the call reported to fail
# with "HTTP Error 403: Forbidden".
tables = tabula.read_pdf(
    file,
    stream=True,
    pages="all",
    multiple_tables=True,
)

The problem seems to be in the last line (the `tabula.read_pdf` call), so I'm not sure whether the error comes from requests or from tabula.

1

There is 1 best solution below

0
On BEST ANSWER

The request needs a `headers` parameter that sets the User-Agent. I'm not sure how to pass that parameter through tabula, but you can download the PDF yourself, write it to a file, and then read that file in:

import tabula
from bs4 import BeautifulSoup
import requests

# Standard-library helpers used by this script: a portable temp location,
# URL resolution, and an HTTP client that lets us set request headers.
import os
import tempfile
from urllib.parse import urljoin
from urllib.request import Request, urlopen

url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'

# Send a browser-like User-Agent on every request; without it the PDF
# download is answered with "HTTP Error 403: Forbidden".
request_headers = {"User-Agent": "Mozilla/5.0"}

# Fetch the index page; fail loudly on an HTTP error instead of parsing an
# error page, and never hang forever on a stalled connection.
r = requests.get(url, headers=request_headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, 'lxml')
# href=True skips anchors without an href attribute, so the lookup below
# cannot raise KeyError.
hyperlink_tags = soup.find_all('a', href=True)

# Take the first link whose text mentions a situation report.
file_path = None
for hyperlink_tag in hyperlink_tags:
    if 'Situation report' in hyperlink_tag.text:
        file_path = hyperlink_tag['href']
        break

if not file_path:
    # Previously this case surfaced later as a confusing NameError.
    raise RuntimeError(f"No 'Situation report' link found on {url}")

# urljoin resolves relative, root-relative and absolute hrefs against the
# page URL, avoiding the double slash produced by plain string formatting.
latest_report = urljoin(url, file_path)
file = latest_report

################################################
## Download the PDF ############################

# Write to the system temp directory rather than a hard-coded Windows path
# that may not exist on the current machine.
pdf_path = os.path.join(tempfile.gettempdir(), 'who_situation_report.pdf')

url_request = Request(file, headers=request_headers)
# The context managers close both the HTTP response and the output file,
# even if the download or the write raises.
with urlopen(url_request, timeout=30) as response, open(pdf_path, 'wb') as f:
    f.write(response.read())

#################################################

# Parse the local copy, so tabula never has to fetch the URL itself.
tables = tabula.read_pdf(pdf_path, stream=False, pages="all", multiple_tables=True)