I am a new Python user. I have been writing a script that uses Selenium and Beautiful Soup to visit a website, grab an HTML table, and turn it into a pandas DataFrame.
I am using Selenium to loop through a number of different pages and Beautiful Soup to collect the table from each one.
The issue I am running into is that I can't get all those tables appended to each other: if I print the DataFrame, it only shows the last table that was scraped. How do I append each scraped table to the bottom of a single DataFrame?
Any help would be greatly appreciated; I've been stuck on this one little part for a couple of days.
states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia",
"Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
"Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
"New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
"Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
"Washington", "West Virginia", "Wisconsin", "Wyoming"]
period = "2020"
num_states = len(states)
state_list = []
for state in states:
    driver = webdriver.Chrome(executable_path='C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    # Pick the state and fiscal year from the search form
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs = pd.read_html(addrss)[2]
    # Get the html and parse the third table on the page
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.find_all('table')[2]
    headers = []
    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)
    # df is re-created on every pass, so each state's table overwrites the last
    df = pd.DataFrame(columns=headers)
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data
    # rename() is an instance method, so it has to be called on df itself
    df = df.rename(columns={'Total Acres': 'Total_acres'})
    # s, County, Payment and Total_acres are never defined anywhere,
    # so this loop raises a NameError as written:
    # for i in range(s, num_states):
    #     state_list.append([County[i].text, Payment[i].text, Total_acres[i].text])
print(df)
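From what I've pieced together, the fix should be to keep one DataFrame per state in a list and concatenate once after the loop, instead of rebuilding df on every pass. Here is a minimal sketch of the pattern I think I need (the frames list, the State column, and the pd.read_html call are my guesses, not code I have tested against the site):

frames = []
for state in states:
    # ... drive the form with Selenium exactly as above, ending with ...
    url = driver.current_url
    # pd.read_html returns a DataFrame for every <table> on the page;
    # index 2 is the same table the BeautifulSoup code targets
    state_df = pd.read_html(url)[2]
    state_df['State'] = state  # remember which state each row came from
    frames.append(state_df)

# One concat after the loop stacks all the per-state tables vertically
df = pd.concat(frames, ignore_index=True)
print(df)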
******************** EDIT ***********************

period = "2020"
num_states = len(states)
state_list = []
df = pd.DataFrame()
for state in states:
    driver = webdriver.Chrome(executable_path='C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs = pd.read_html(addrss)[2]
    # Get the html
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.find_all('table')[2]
    headers = []
    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data
dfs = pd.concat([df for state in states])
print(df)
Running this results in: ValueError: cannot set a frame with no defined columns
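My understanding of the error is that df = pd.DataFrame() is created with no columns, so the assignment df.loc[length] = row_data has nothing to line the values up against; the concat comprehension at the end would also just repeat the same df once per state. Here is a sketch of what I think the fixed version looks like (table_to_frame is a helper name I made up, and it assumes every data row has the same number of cells as the header row; imports as above):

def table_to_frame(html, table_index=2):
    # Parse one results page and return the target table as a DataFrame
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find_all('table')[table_index]
    headers = [th.text.strip() for th in table.find_all('th')]
    rows = [[td.text.strip() for td in tr.find_all('td')]
            for tr in table.find_all('tr')]
    # Skip the header row and anything that doesn't match the column count
    rows = [r for r in rows if len(r) == len(headers)]
    # Columns are defined up front from the scraped headers, so there is
    # no "cannot set a frame with no defined columns" problem
    return pd.DataFrame(rows, columns=headers)

frames = []
for state in states:
    # ... Selenium form-driving as above, ending with page = requests.get(url) ...
    frames.append(table_to_frame(page.text))

df = pd.concat(frames, ignore_index=True)
print(df)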