I'm trying to automate scraping all of the data from every table on a web page and write each table to its own tab in an Excel workbook.
I've been working from the code in tutorials and questions such as https://www.thepythoncode.com/article/convert-html-tables-into-csv-files-in-python, https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059, and Python - Web Scraping HTML table and printing to CSV.
With the SEC filing URL below, I'm struggling to pull both the underlying data and the table headers. The HTML is very dense, which makes it hard for me to extract the tables with the correct structure.
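To illustrate the header problem: as far as I can tell, the filing's tables mark up every cell as <td> (there are no <th> tags at all), so the get_table_headers function in my code below comes back empty. A fallback along these lines is what I've been experimenting with (a sketch only; treating the first row as a header is my own assumption, and get_table_headers_fallback is my own name, not something from the tutorials):

def get_table_headers_fallback(table):
    """Return header text from the first row of a bs4 table Tag,
    falling back to <td> cells when the row has no <th> tags."""
    first_row = table.find("tr")
    if first_row is None:
        # table has no rows at all
        return []
    ths = first_row.find_all("th")
    if ths:
        return [th.text.strip() for th in ths]
    # assumption: in filings like this one, header text lives in
    # plain <td> cells on the first row
    return [td.text.strip() for td in first_row.find_all("td")]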
My current code:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# US English
LANGUAGE = "en-US,en;q=0.5"


def get_soup(url):
    """Constructs and returns a soup using the HTML content of `url` passed"""
    # initialize a session
    session = requests.Session()
    # set the User-Agent as a regular browser
    session.headers['User-Agent'] = USER_AGENT
    # request English content (optional)
    session.headers['Accept-Language'] = LANGUAGE
    session.headers['Content-Language'] = LANGUAGE
    # make the request and parse the response body
    html = session.get(url)
    return bs(html.content, "html.parser")


def get_all_tables(soup):
    """Extracts and returns all tables in a soup object"""
    return soup.find_all("table")


def get_table_headers(table):
    """Given a table soup, returns all the headers"""
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers


def get_table_rows(table):
    """Given a table, returns all its rows"""
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, search for th tags
            # (found especially in Wikipedia tables below the table)
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows


def save_as_csv(table_name, headers, rows):
    """Writes one table to disk as a CSV file"""
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")


def main(url):
    # get the soup
    soup = get_soup(url)
    # extract all the tables from the web page
    tables = get_all_tables(soup)
    print(f"[+] Found a total of {len(tables)} tables.")
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = get_table_headers(table)
        # get all the rows of the table
        rows = get_table_rows(table)
        # save table as csv file
        table_name = f"table-{i}"
        print(f"[+] Saving {table_name}")
        save_as_csv(table_name, headers, rows)


main("https://www.sec.gov/Archives/edgar/data/1701605/000170160519000089/bkr-2019093010xq.htm")
For example, I need the code to identify a table, such as the one in the attached image, and place all of its information into an Excel sheet.
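Since the end goal is one workbook with a tab per table rather than loose CSV files, this is the direction I'm imagining for the save step, reusing get_table_headers and get_table_rows from above (a sketch only; it assumes openpyxl is installed as the xlsx engine, and it will still raise if the header and row lengths disagree, which is exactly the problem I'm hitting):

import pandas as pd

def save_tables_as_excel(tables, filename="tables.xlsx"):
    """Write each table to its own sheet in a single workbook."""
    with pd.ExcelWriter(filename) as writer:  # needs openpyxl installed
        for i, table in enumerate(tables, start=1):
            headers = get_table_headers(table)
            rows = get_table_rows(table)
            # fall back to default integer columns when no headers were found
            df = pd.DataFrame(rows, columns=headers if headers else None)
            # Excel caps sheet names at 31 characters
            df.to_excel(writer, sheet_name=f"table-{i}"[:31], index=False)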
Code from questions such as Extract HTML Tables With Similar Data from Different Sources with Different Formatting - Python and Extract HTML Table Based on Specific Column Headers - Python can search through the page, but the matching criteria are too specific: I need every table in the document, not just the ones with particular headers.
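By contrast, the catch-all behaviour I'm after looks closer to pandas.read_html, which returns every <table> on the page as its own DataFrame with no header filtering (a sketch, assuming lxml or html5lib is installed for parsing, and sending a User-Agent header because www.sec.gov tends to reject requests without one):

import requests
import pandas as pd

URL = "https://www.sec.gov/Archives/edgar/data/1701605/000170160519000089/bkr-2019093010xq.htm"
USER_AGENT = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36")

# fetch the raw HTML with a browser-like User-Agent
html = requests.get(URL, headers={"User-Agent": USER_AGENT}).text

# read_html parses every <table> element into its own DataFrame
tables = pd.read_html(html)
print(f"[+] Found {len(tables)} tables")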
Any help would be appreciated! I'm sure there's an elegant solution that I'm not seeing.
For reference, the same filing in SEC's inline XBRL viewer: https://www.sec.gov/ix?doc=/Archives/edgar/data/1701605/000170160519000089/bkr-2019093010xq.htm