Scraping tables from the web gets complicated when there are 2 or more values in a cell. In order to preserve the table structure, I have devised a way to track the row-number index in the xpath, building a nested list when the row number stays the same.
def get_structured_elements(name):
    """For target data that is nested and structured,
    such as a table with multiple values in a single cell.

    NOTE(review): excerpted from a method — relies on ``self`` and
    ``number_of_items_found`` being supplied by the enclosing class.
    Returns a list with one entry per row; a cell holding several
    links becomes a nested list of their texts.
    """
    driver = self.driver
    i = 2  # keep track of 'i' to retain the document structure (rows start at tr[2]).
    number_of_items = number_of_items_found()
    elements = [None] * number_of_items
    while i - 2 < number_of_items:
        # BUG FIX: the lookup must be rebuilt on every pass — and ``i``
        # must go through str() — otherwise the original raised a
        # TypeError (int + str) and never advanced past the first row.
        target_data = driver.find_elements(
            "//table/tbody/tr[" + str(i) + "]/td[2]/a")
        for item in target_data:
            if elements[i - 2] is None:
                elements[i - 2] = item.text  # first value for this row.
            elif isinstance(elements[i - 2], list):
                # BUG FIX: without this branch a third value re-wrapped
                # the existing list into another nested list.
                elements[i - 2].append(item.text)
            else:
                # Second value: promote the slot to a nested list.
                elements[i - 2] = [elements[i - 2], item.text]
        i += 1
    return elements
This simple logic was working fine, until I sought to manage all locator variables in one place to make the code more reusable: How do I store this expression "//table/tbody/tr[" + i + "]/td[2]/a" in a list or dictionary so that it still works when plugged in?
The solution (i.e. hack) I came up with is a function that takes the front and back halves of the iterating xpath as arguments, returning front_half + str(i) + back_half whenever i is among the parent (iterator) function's local variables.
def split_xpath_at_i(front_half, back_half):
    """Join the two xpath halves around the row counter.

    Falls back to the literal marker "SPLIT_i" when no local variable
    named ``i`` exists — which, in this version, is always the case,
    because a function cannot see its caller's locals.
    """
    if 'i' in locals():
        return front_half + str(i) + back_half
    return front_half + "SPLIT_i" + back_half
# Locator list: index 0 is the templated xpath (note: evaluated ONCE,
# right here, not lazily on each use); index 1 is a fixed xpath string.
xpath = [split_xpath_at_i("//table/tbody/tr[","]/td[2]/a"),
"//table/tbody/tr/td[3]/a[1]"
]
def xpath_index_iterator():
    """Demo driver: print the templated xpath for ten row indices."""
    count = 0
    while count < 10:
        print(split_xpath_at_i("//table/tbody/tr[", "]/td[2]/a"))
        count += 1
# Demonstration run: every line prints the placeholder, because
# split_xpath_at_i never finds a variable named ``i`` in its own scope.
xpath_index_iterator()
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
# //table/tbody/tr[SPLIT_i]/td[2]/a
Problem is, split_xpath_at_i is blind to variables in its caller's environment. What I eventually came up with is to use an attribute on the iterator function to hold the counter i, so that the variable can be made available to split_xpath_at_i like so:
def split_xpath_at_i(front_half, back_half):
    """Join the two xpath halves around the current row counter.

    Reads the counter published as ``xpath_index_iterator.i``.  When no
    such counter exists — the function is called outside an indexed
    environment — the placeholder "SPLIT_i" is inserted instead, to
    avoid raising an error.
    """
    try:
        i = xpath_index_iterator.i
    except (NameError, AttributeError):
        # BUG FIX: the original used a bare ``except:`` (which hides any
        # error, not just a missing counter) plus a ``locals()`` probe.
        # Catching the two expected failures makes intent explicit.
        return front_half + "SPLIT_i" + back_half
    return front_half + str(i) + back_half
# Locator list: index 0 is the templated xpath (evaluated ONCE, at list
# definition time); index 1 is a fixed xpath string.
xpath = [split_xpath_at_i("//table/tbody/tr[","]/td[2]/a"),
"//table/tbody/tr/td[3]/a[1]"
]
def xpath_index_iterator():
    """Print ten templated xpaths, publishing the live counter as the
    function attribute ``xpath_index_iterator.i`` so that
    split_xpath_at_i can read it on every iteration."""
    # (Removed: a dead local ``lst = []`` that was never used, and a
    # seed assignment ``xpath_index_iterator.i = 0`` that the for-loop
    # target overwrote immediately.)
    for xpath_index_iterator.i in range(10):
        print(split_xpath_at_i("//table/tbody/tr[","]/td[2]/a"))
# Demonstration run: the counter is now visible through the function
# attribute, so the real row index is substituted on each iteration.
xpath_index_iterator()
# //table/tbody/tr[0]/td[2]/a
# //table/tbody/tr[1]/td[2]/a
# //table/tbody/tr[2]/td[2]/a
# //table/tbody/tr[3]/td[2]/a
# //table/tbody/tr[4]/td[2]/a
# //table/tbody/tr[5]/td[2]/a
# //table/tbody/tr[6]/td[2]/a
# //table/tbody/tr[7]/td[2]/a
# //table/tbody/tr[8]/td[2]/a
# //table/tbody/tr[9]/td[2]/a
The problem gets more complicated when I try to invoke split_xpath_at_i via a locator list:
def split_xpath_at_i(front_half, back_half):
    """Join the two xpath halves around the current row counter.

    Reads the counter published as ``xpath_index_iterator.i``.  When no
    such counter exists — the function is called outside an indexed
    environment — the placeholder "SPLIT_i" is inserted instead, to
    avoid raising an error.
    """
    try:
        i = xpath_index_iterator.i
    except (NameError, AttributeError):
        # BUG FIX: the original's bare ``except:`` hid *any* error, and
        # the ``'i' in locals()`` probe was an indirect way of testing
        # whether the try succeeded.  Early return makes both explicit.
        return front_half + "SPLIT_i" + back_half
    return front_half + str(i) + back_half
# Locator list: index 0 is the templated xpath.  NOTE(review): this call
# runs ONCE, at list-definition time — the result is a plain, frozen
# string, which is the root cause of the repeated output shown below.
xpath = [split_xpath_at_i("//table/tbody/tr[","]/td[2]/a"),
"//table/tbody/tr/td[3]/a[1]"
]
def xpath_index_iterator():
    """Collect ten locator strings from the locator list.

    NOTE(review): ``xpath[0]`` was evaluated once, when the ``xpath``
    list literal was built — appending it here appends the same fixed
    string ten times; the counter attribute is never re-read.  This is
    the behavior the surrounding question is demonstrating.
    """
    xpath_index_iterator.i = 0
    lst = []
    for xpath_index_iterator.i in range(10):
        # print(split_xpath_at_i("//table/tbody/tr[","]/td[2]/a"))
        lst.append(xpath[0])
    return lst
# Demonstration run.  NOTE(review): every entry reads tr[9] — presumably
# the leftover value of xpath_index_iterator.i from an earlier run was
# frozen into the list literal when ``xpath`` was defined; confirm
# against the session history.
xpath_index_iterator()
# ['//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a',
# '//table/tbody/tr[9]/td[2]/a']
What would a professional approach to this problem look like?
The Entire Code:
The code below was modified from the Selenium manual.
I've asked a related question over here that concerns the general approach to Page Objects design.
test.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from query import Input
import page

# Launch a browser, open the CNKI overseas portal, and run a search.
cnki = Input()
driver = cnki.webpage('http://big5.oversea.cnki.net/kns55/')
current_page = page.MainPage(driver)
current_page.submit_search('禮學')
current_page.switch_to_frame()

# Scrape the result table from the search-results frame.
result = page.SearchResults(driver)
structured = result.get_structured_elements('titles') # I couldn't get this to work.
simple = result.simple_get_structured_elements() # but this works fine.
query.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from selenium import webdriver
class Input:
    """This class provides a wrapper around actual working code."""

    # CONSTANTS
    URL = None

    def __init__(self):
        # Hold the driver *class* (not an instance); a fresh browser is
        # created by each webpage() call.
        self.driver = webdriver.Chrome

    def webpage(self, url):
        """Launch a browser, navigate to *url*, and return the driver."""
        browser = self.driver()
        browser.get(url)
        return browser
page.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from element import BasePageElement
from locators import InputLocators, OutputLocators
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
class SearchTextElement(BasePageElement):
    """This class gets the search text from the specified locator"""
    # The locator for the search box where the search string is entered.
    # NOTE(review): assigned at runtime (MainPage.submit_search sets the
    # *class* attribute), so it is shared by every instance.
    locator = None
class BasePage:
    """Base class to initialize the base page that will be called from all
    pages"""

    def __init__(self, driver):
        # driver: the live selenium WebDriver shared by all page objects.
        self.driver = driver
class MainPage(BasePage):
    """Home page action methods come here. I.e. Python.org"""

    # Descriptor: assigning to it types & submits text; reading it
    # returns the field's current value.
    search_keyword = SearchTextElement()

    def submit_search(self, keyword):
        """Submits keyword and triggers the search."""
        # NOTE(review): mutating the descriptor *class* attribute makes
        # this locator global to every SearchTextElement — confirm that
        # is intended before adding more text elements.
        SearchTextElement.locator = InputLocators.SEARCH_FIELD
        self.search_keyword = keyword

    def select_dropdown_item(self, item):
        """Clicks the <option> whose text equals *item* in the search
        attribute dropdown."""
        driver = self.driver
        by, val = InputLocators.SEARCH_ATTR
        driver.find_element(by, val + "/option[text()='" + item + "']").click()

    def click_search_button(self):
        """Clicks the search submit button."""
        driver = self.driver
        element = driver.find_element(*InputLocators.SEARCH_BUTTON)
        element.click()

    def switch_to_frame(self):
        """Use this function to get access to hidden elements. """
        driver = self.driver
        driver.switch_to.default_content()
        driver.switch_to.frame('iframeResult')

    def max_content(self):
        """Maximize the number of items on display in the search results."""
        driver = self.driver
        # NOTE(review): find_element_by_* was removed in Selenium 4 —
        # migrate to find_element(By.CSS_SELECTOR, ...) once By is imported.
        # (Local renamed from ``max_content``, which shadowed this method.)
        display_link = driver.find_element_by_css_selector('#id_grid_display_num > a:nth-child(3)')
        display_link.click()

    def stop_loading_page_when_element_is_present(self, locator):
        """Waits (up to 30 s) for *locator* to appear, then stops the
        page load so slow resources do not block scraping."""
        driver = self.driver
        ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
        wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
        wait.until(
            EC.presence_of_element_located(locator))
        driver.execute_script("window.stop();")

    def next_page(self):
        """Clicks the 'next page' link; prints a notice on the last page."""
        driver = self.driver
        # The helper below already executes window.stop(); the original's
        # immediate second execute_script call was redundant and removed.
        self.stop_loading_page_when_element_is_present(InputLocators.NEXT_PAGE)
        try:
            driver.find_element(*InputLocators.NEXT_PAGE).click()
            print("Navigating to Next Page")
        except (TimeoutException, WebDriverException):
            print("Last page reached")
class SearchResults(BasePage):
    """Search results page action methods come here"""

    def __init__(self, driver):
        self.driver = driver
        # (Removed: a dead local ``i = None`` that never created an
        # instance attribute and was never read.)

    def wait_for_page_to_load(self):
        """Blocks (up to 100 s) until the results table is present."""
        driver = self.driver
        wait = WebDriverWait(driver, 100)
        # BUG FIX: presence_of_element_located takes the locator *tuple*;
        # the original unpacked it with ``*`` and raised a TypeError.
        wait.until(
            EC.presence_of_element_located(InputLocators.MAIN_BODY))

    def get_single_element(self, name):
        """Returns a single value as target data."""
        driver = self.driver
        target_data = driver.find_element(*OutputLocators.CNKI[str(name.upper())])
        return target_data

    def number_of_items_found(self):
        """Return the number of items found on a single page."""
        driver = self.driver
        target_data = driver.find_elements(*OutputLocators.CNKI['INDEX'])
        return len(target_data)

    def get_elements(self, name):
        """Returns simple list of values in specific data field in a table."""
        driver = self.driver
        target_data = driver.find_elements(*OutputLocators.CNKI[str(name.upper())])
        return [item.text for item in target_data]

    @staticmethod
    def _store_cell_text(elements, idx, text):
        """Merge *text* into elements[idx]; a slot that already holds a
        value becomes a nested list (multiple links in one table cell)."""
        if elements[idx] is None:
            elements[idx] = text
        elif isinstance(elements[idx], list):
            elements[idx].append(text)
        else:
            elements[idx] = [elements[idx], text]

    def get_structured_elements(self, name):
        """For target data that is nested and structured,
        such as a table with multiple values in a single cell."""
        driver = self.driver
        i = 2  # keep track of 'i' to retain the document structure (rows start at tr[2]).
        number_of_items = self.number_of_items_found()
        elements = [None] * number_of_items
        while i - 2 < number_of_items:
            target_data = driver.find_elements(*OutputLocators.CNKI[str(name.upper())])
            for item in target_data:
                print(item.text, i - 1)
                self._store_cell_text(elements, i - 2, item.text)
            i += 1
        return elements

    def simple_get_structured_elements(self):
        """Simple structured elements code with fixed xpath."""
        driver = self.driver
        i = 2  # keep track of 'i' to retain the document structure (rows start at tr[2]).
        number_of_items = self.number_of_items_found()
        elements = [None] * number_of_items
        while i - 2 < number_of_items:
            # NOTE(review): find_elements_by_xpath was removed in
            # Selenium 4 — migrate to find_elements(By.XPATH, ...).
            target_data = driver.find_elements_by_xpath\
                ('//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr['\
                + str(i) + ']/td[2]/a')
            for item in target_data:
                print(item.text, i-1)
                self._store_cell_text(elements, i - 2, item.text)
            i += 1
        return elements
element.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from selenium.webdriver.support.ui import WebDriverWait
class BasePageElement():
    """Base page class that is initialized on every page object class.

    A descriptor: assigning to the attribute types text into the element
    found via ``self.locator`` and submits it; reading the attribute
    returns the element's current "value" attribute.  Subclasses (or
    callers) must supply ``locator`` before first use.
    """

    def __set__(self, obj, value):
        """Sets the text to the value supplied"""
        driver = obj.driver
        # Wait (up to 100 s) for the field to be locatable.
        text_field = WebDriverWait(driver, 100).until(
            lambda driver: driver.find_element(*self.locator))
        text_field.clear()
        text_field.send_keys(value)
        # Submits the enclosing form — this is what triggers the search.
        text_field.submit()

    def __get__(self, obj, owner):
        """Gets the text of the specified object"""
        driver = obj.driver
        # Wait for the element to exist, then re-locate and read it.
        WebDriverWait(driver, 100).until(
            lambda driver: driver.find_element(*self.locator))
        element = driver.find_element(*self.locator)
        return element.get_attribute("value")
locators.py
This is where split_xpath_at_i sits.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from selenium.webdriver.common.by import By
# import page
class InputLocators():
    """A class for main page locators. All main page locators should come here"""

    def dropdown_list_xpath(attribute, value):
        """Build an xpath selecting a <select> element by attribute value."""
        return f"//select[@{attribute}='{value}']"

    MAIN_BODY = (By.XPATH, '//GridTableContent/tbody')
    SEARCH_FIELD = (By.NAME, 'txt_1_value1') # (By.ID, 'search-content-box')
    SEARCH_ATTR = (By.XPATH, dropdown_list_xpath('name', 'txt_1_sel'))
    SEARCH_BUTTON = (By.ID, 'btnSearch')
    NEXT_PAGE = (By.LINK_TEXT, "下頁")
class OutputLocators():
    """A class for search results locators. All search results locators should
    come here"""

    def split_xpath_at_i(front_half, back_half):
        """Return the xpath template with the "SPLIT_i" placeholder where
        the row index belongs; callers substitute the real index."""
        # NOTE(review): the original probed ``'i' in locals()``, which can
        # never be true here (a function cannot see its caller's locals),
        # so the placeholder branch was the only reachable path.  The
        # commented-out page.SearchResults lookup was dead code.
        return front_half + "SPLIT_i" + back_half

    # BUG FIX: every value is now a (By, xpath) tuple.  Callers unpack with
    # ``driver.find_elements(*CNKI[name])``, so the bare strings that
    # JOURNALS / YEAR_ISSUE / DOWNLOAD_PATHS used to hold would have been
    # unpacked character by character and crashed.
    CNKI = {
        "TITLES": (By.XPATH, split_xpath_at_i('//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr[', ']/td[2]/a')),
        "AUTHORS": (By.XPATH, split_xpath_at_i('//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr[', ']/td[3]/a')),
        "JOURNALS": (By.XPATH, '//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr/td[4]/a'),
        "YEAR_ISSUE": (By.XPATH, '//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr/td[5]/a'),
        "DOWNLOAD_PATHS": (By.XPATH, '//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr/td[1]/table/tbody/tr/td/a[1]'),
        "INDEX": (By.XPATH, '//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr/td[1]/table/tbody/tr/td/a[2]')
    }
# # Interim Data
# CAPTIONS =
# LINKS =
# Target Data
# TITLES =
# AUTHORS =
# JOURNALS =
# VOL =
# ISSUE =
# DATES =
# DOWNLOAD_PATHS =
.py files. Would it be too much to post them here?