I am trying to read the data table from the Indian Central Pollution Controal Board using selenium/python. Here is an example of output. I am essentially following the approach presented here: https://github.com/RachitKamdar/Python-Scraper.
Thanks to @Prophet, I was able to read data from the first page (Select element using XPATH with Python?) but I cannot get selenium to wait for the data table to reload when switching to page 2. I tried to add a webdriverwait instruction but this does seem to work. Any help would be greatly appreciated. Thanks
Here is what I tried to do
browser.find_element_by_tag_name("select").send_keys("100")
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
i = 1
while i < maxpage + 1:
browser.find_element(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a[contains(text(),'{}')]".format(i)).click()
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID,"DataTables_Table_0_wrapper")))
#this works ok for page 1
#this does not wait after the click for the data table to update. As a result res is wrong for page 2 [empty].
res = browser.page_source
soup = BeautifulSoup(res, 'html.parser')
soup = soup.find(id = 'DataTables_Table_0')
...
i = i + 1
Update 1: Following Prophet's suggestion, I made the following modification:
browser.find_element_by_tag_name("select").send_keys("100")
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID,"DataTables_Table_0_wrapper")))
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
print(maxpage)
i = 1
while i < maxpage + 1:
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID,"DataTables_Table_0_wrapper")))
res = browser.page_source
soup = BeautifulSoup(res, 'html.parser')
soup = soup.find(id = 'DataTables_Table_0')
if i == 1:
data = getValsHtml(soup)
else:
data = data.append(getValsHtml(soup))
print(i)
print(data)
i = i + 1
browser.find_element(By.XPATH,'//a[@class="paginate_button next"]').click()
This still crashes on page 2 (data is empty). In addition, data should contain 100 items from page 1 but only contains 10. The maxpage number is correct (15).
Update 2:
here is the whole script after incorporating Prophet's recommendations [original script follows https://github.com/RachitKamdar/Python-Scraper]. This only retrieves 10 points from the first page and fails to switch to the next page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
def getValsHtml(table):
data = []
heads = table.find_all('th')
data.append([ele.text.strip() for ele in heads])
rows = table.find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols]) # Get rid of empty values
data.pop(1)
data = pd.DataFrame(data[1:],columns = data[0])
return data
def parameters(br,param):
br.find_element_by_class_name("list-filter").find_element_by_tag_name("input").send_keys(param)
br.find_elements_by_class_name("pure-checkbox")[1].click()
br.find_element_by_class_name("list-filter").find_element_by_tag_name("input").clear()
timeout = 60
url = 'https://app.cpcbccr.com/ccr/#/caaqm-dashboard-all/caaqm-landing/data'
chdriverpath="/net/f1p/my_soft/chromedriver"
option = webdriver.ChromeOptions()
browser = webdriver.Chrome(executable_path="{}".format(chdriverpath), chrome_options=option)
browser.get(url)
station="Secretariat, Amaravati - APPCB"
state="Andhra Pradesh"
city="Amaravati"
sd=['01', 'Jan', '2018']
ed=['31', 'Dec', '2021']
duration="24 Hours"
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME,"toggle")))
browser.find_elements_by_class_name("toggle")[0].click()
browser.find_element_by_tag_name("input").send_keys(state)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[1].click()
browser.find_element_by_tag_name("input").send_keys(city)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[2].click()
browser.find_element_by_tag_name("input").send_keys(station)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[4].click()
browser.find_element_by_class_name("filter").find_element_by_tag_name("input").send_keys(duration)
browser.find_element_by_class_name("options").click()
browser.find_element_by_class_name("c-btn").click()
for p in ['NH3']:
print(p)
try:
parameters(browser,p)
except:
print("miss")
browser.find_element_by_class_name("list-filter").find_element_by_tag_name("input").clear()
pass
browser.find_element_by_class_name("wc-date-container").click()
browser.find_element_by_class_name("month-year").click()
browser.find_element_by_id("{}".format(sd[1].upper())).click()
browser.find_element_by_class_name("year-dropdown").click()
browser.find_element_by_id("{}".format(int(sd[2]))).click()
browser.find_element_by_xpath('//span[text()="{}"]'.format(int(sd[0]))).click()
browser.find_elements_by_class_name("wc-date-container")[1].click()
browser.find_elements_by_class_name("month-year")[1].click()
browser.find_elements_by_id("{}".format(ed[1].upper()))[1].click()
browser.find_elements_by_class_name("year-dropdown")[1].click()
browser.find_element_by_id("{}".format(int(ed[2]))).click()
browser.find_elements_by_xpath('//span[text()="{}"]'.format(int(ed[0])))[1].click()
browser.find_elements_by_tag_name("button")[-1].click()
next_page_btn_xpath = '//a[@class="paginate_button next"]'
actions = ActionChains(browser)
#This is how you should treat the Select drop down
select = Select(browser.find_element_by_tag_name("select"))
select.select_by_value('100')
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH,'//div[@class="dataTables_wrapper no-footer"]')))
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
i = 1
while i < maxpage + 1:
res = browser.page_source
soup = BeautifulSoup(res, 'html.parser')
soup = soup.find(id = 'DataTables_Table_0')
if i == 1:
data = getValsHtml(soup)
else:
data = data.append(getValsHtml(soup))
print(i)
print(data)
i = i + 1
#scroll to the next page btn and then click it
next_page_btn = browser.find_element_by_xpath(next_page_btn_xpath)
actions.move_to_element(next_page_btn).perform()
browser.find_element(By.XPATH,next_page_btn).click()
browser.quit()