the idea of the code is simple. It will go to the search's page, "see" all the products there, click one by one, scrape all the data, go back to the search's page and click on the next product's link. So I'm doing something like this:
def extract_prodirectsports_data():
url_line = "https://www.prodirectsport.com/soccer/l/adults/departments-boots/activity-football/brand-adidas/silo-predator/"
sheet_data = []
pdf_data = []
sizes = []
cotacao_libra() # Assuming this function is already defined
i = 0
driver.get(url_line)
#Cookies
WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH,"//button[@title='Accept all cookies']"))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH,"//div[@id='zonos']")))
close_button = driver.find_element(By.XPATH, "//a[@class = 'z-close']")
driver.execute_script("arguments[0].click();", close_button)
WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH,"//div[@class='global-popup']")))
button = driver.find_element(By.XPATH,"//button[@aria-label='Close']")
driver.execute_script("arguments[0].click();", button)
#driver.implicitly_wait(500000)
df_sheet = pd.DataFrame(columns=SHEET_COLUMNS)
pdf = FPDF()
pdf.add_page()
pdf.set_font('Arial', 'B', 12)
driver.maximize_window()
product_list = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located((By.XPATH, "//div[@class = 'product-listing__grid']//div[@class = '_root_129ai_6 product-listing__grid-item']/a"))
)
while True:
for index, value in enumerate(product_list):
product_list = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((By.XPATH, "//div[@class = 'product-listing__grid']//div[@class = '_root_129ai_6 product-listing__grid-item']/a"))
)
driver.execute_script("arguments[0].scrollIntoView();", product_list[index])
driver.save_screenshot("window.png")
image_element = WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.TAG_NAME, "img")))
image = image_element.get_attribute("src")
i += 1
actions.move_to_element(product_list[index]).perform()
driver.execute_script("arguments[0].click();", product_list[index])
driver.implicitly_wait(50)
#driver.save_screenshot("screenproduct.png")
product_name = driver.find_element(By.CLASS_NAME, "ml-meta__title").text
print(product_name)
price = driver.find_element(By.CLASS_NAME, "ml-prices__price").text
price = re.sub("£", "", price)
product_price = float(price)
print(product_price)
product_code = driver.current_url.split('-')[-1]
print(product_code)
size_list = driver.find_element(By.XPATH, "//div[@class = 'ml-size__sizes']")
if size_list:
driver.execute_script("arguments[0].scrollIntoView();", size_list)
for size in size_list.find_elements(By.XPATH, "//button[@class='ml-size__size qa-size-item']"):
product_size = size.text
sizes.append(product_size)
if "-" in size.text:
product_size = size.text.split('-')[0]
sizes.append(product_size)
elif "(" in size.text:
product_size = size.text.split('(')[0]
sizes.append(product_size)
print(sizes)
price_in_reais = calculate_price_in_reais(product_price) # Function needs to be defined
sale_price = calculate_sale_price(product_price) # Function needs to be defined
sheet_data.append({
'Photo': image,
'Code': product_code,
'Description': product_name,
'Purchase': price_in_reais,
'Sale': sale_price,
'Sizes': sizes
})
pdf_data.append({
'Photo': image,
'Code': product_code,
'Description': product_name,
'Price': product_price
})
for index, row in pd.DataFrame(pdf_data).iterrows():
for data in row.values:
pdf.cell(1.6, 0.5, str(data))
pdf.ln()
driver.execute_script("window.history.go(-1)")
print("exited")
driver.implicitly_wait(5000)
botao_vermais = driver.find_element(By.XPATH, "//div[@class = 'product-listing__view-more']/button")
if botao_vermais:
botao_vermais.click()
driver.implicitly_wait(50)
else:
break
pdf.output(url_linha + 'cátalogo.pdf')
df_planilha = pd.DataFrame(data_planilha)
df_planilha.to_excel(url_linha + "catálogo.xlsx")
driver.close()
sometimes it gets around 5 products and then in the last product it still prints "exited" but then it throws a timeout exception. More usually, it gets stuck on first or second product and then throw the timeout exception, also printing "exited" before throwing timeoutexception. So I guess the error is between the go back command and the beginning of the loop. Also, can someone give me the feedback if I'm doing right on automatic scroll?
Trying to avoid timeout exceptions and do some automatic scroll.