0

the idea of the code is simple. It will go to the search's page, "see" all the products there, click one by one, scrape all the data, go back to the search's page and click on the next product's link. So I'm doing something like this:

def extract_prodirectsports_data():
    url_line = "https://www.prodirectsport.com/soccer/l/adults/departments-boots/activity-football/brand-adidas/silo-predator/"
    sheet_data = []
    pdf_data = []
    sizes = []

    cotacao_libra()  # Assuming this function is already defined
    i = 0
    driver.get(url_line)
    
    #Cookies
    WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH,"//button[@title='Accept all cookies']"))).click()
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH,"//div[@id='zonos']")))
    close_button = driver.find_element(By.XPATH, "//a[@class = 'z-close']")
    driver.execute_script("arguments[0].click();", close_button)
    WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH,"//div[@class='global-popup']")))
    button = driver.find_element(By.XPATH,"//button[@aria-label='Close']")
    driver.execute_script("arguments[0].click();", button)
    #driver.implicitly_wait(500000)
    
    df_sheet = pd.DataFrame(columns=SHEET_COLUMNS)
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 12)
    
    driver.maximize_window()

    product_list = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[@class = 'product-listing__grid']//div[@class = '_root_129ai_6 product-listing__grid-item']/a"))
        )
    while True:
        
        for index, value in enumerate(product_list):
            product_list = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[@class = 'product-listing__grid']//div[@class = '_root_129ai_6 product-listing__grid-item']/a"))
        )

            driver.execute_script("arguments[0].scrollIntoView();", product_list[index])
            
            driver.save_screenshot("window.png")
            image_element = WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.TAG_NAME, "img")))
            image = image_element.get_attribute("src")
            i += 1
            
            actions.move_to_element(product_list[index]).perform()
            
            driver.execute_script("arguments[0].click();", product_list[index])
            driver.implicitly_wait(50)
            #driver.save_screenshot("screenproduct.png")
            product_name = driver.find_element(By.CLASS_NAME, "ml-meta__title").text
            print(product_name)
            price = driver.find_element(By.CLASS_NAME, "ml-prices__price").text
            price = re.sub("£", "", price)
            product_price = float(price)
            print(product_price)

            product_code = driver.current_url.split('-')[-1]
            print(product_code)
            
            size_list = driver.find_element(By.XPATH, "//div[@class = 'ml-size__sizes']")
            
            if size_list:
                driver.execute_script("arguments[0].scrollIntoView();", size_list)
                for size in size_list.find_elements(By.XPATH, "//button[@class='ml-size__size qa-size-item']"):
                    product_size = size.text
                    sizes.append(product_size)
                    if "-" in size.text:
                        product_size = size.text.split('-')[0] 
                        sizes.append(product_size)
                    elif "(" in size.text:
                        product_size = size.text.split('(')[0]
                        sizes.append(product_size)
            print(sizes)

            price_in_reais = calculate_price_in_reais(product_price)  # Function needs to be defined
            sale_price = calculate_sale_price(product_price)  # Function needs to be defined

            sheet_data.append({
                    'Photo': image,
                    'Code': product_code,
                    'Description': product_name,
                    'Purchase': price_in_reais,
                    'Sale': sale_price,
                    'Sizes': sizes
                        })
            pdf_data.append({
                    'Photo': image,
                    'Code': product_code,
                    'Description': product_name,
                    'Price': product_price
                        })

            for index, row in pd.DataFrame(pdf_data).iterrows():
                for data in row.values:
                    pdf.cell(1.6, 0.5, str(data))
                pdf.ln()
            
            driver.execute_script("window.history.go(-1)")
            
            print("exited")
            driver.implicitly_wait(5000)
            botao_vermais = driver.find_element(By.XPATH, "//div[@class = 'product-listing__view-more']/button")        
            
        if botao_vermais:
             botao_vermais.click()
             driver.implicitly_wait(50)
            
        else:
             break
    pdf.output(url_linha + 'cátalogo.pdf')
    df_planilha = pd.DataFrame(data_planilha)
    df_planilha.to_excel(url_linha + "catálogo.xlsx")    
    driver.close()


sometimes it gets around 5 products and then in the last product it still prints "exited" but then it throws a timeout exception. More usually, it gets stuck on first or second product and then throw the timeout exception, also printing "exited" before throwing timeoutexception. So I guess the error is between the go back command and the beginning of the loop. Also, can someone give me the feedback if I'm doing right on automatic scroll?

Trying to avoid timeout exceptions and do some automatic scroll.

2
  • Please provide a Minimal Reproducible Example The code shown in your question is not runnable Commented Oct 15, 2024 at 6:24
  • ok, I've edited and made the url visible, so people can test it. Commented Oct 15, 2024 at 11:19

2 Answers 2

0

Some suggestion: use

EC.visibility_of_element_located()

insted of:

EC.presence_of_element_located()

Sign up to request clarification or add additional context in comments.

Comments

-2

Some suggestions:

  • Ensure Elements are Fully Loaded

  • Try-Catch Blocks

  • Explicit Waits

  • Refresh Product List

    while True: try: product_list = WebDriverWait(driver, 20).until( EC.presence_of_all_elements_located((By.XPATH, "//div[@class='product-listing__grid']//div[@class='_root_129ai_6 product-listing__grid-item']/a")) )

         for index, value in enumerate(product_list):
             try:
                 product_list = WebDriverWait(driver, 10).until(
                     EC.presence_of_all_elements_located((By.XPATH, "//div[@class='product-listing__grid']//div[@class='_root_129ai_6 product-listing__grid-item']/a"))
                 )
    
                 driver.execute_script("arguments.scrollIntoView();", product_list[index])
                 driver.execute_script("arguments.click();", product_list[index])
                 WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "ml-meta__title")))
    
                 product_name = driver.find_element(By.CLASS_NAME, "ml-meta__title").text
                 price = driver.find_element(By.CLASS_NAME, "ml-prices__price").text
                 price = re.sub("£", "", price)
                 product_price = float(price)
                 product_code = driver.current_url.split('-')[-1]
    
                 size_list = driver.find_elements(By.XPATH, "//button[@class='ml-size__size qa-size-item']")
                 sizes = [size.text.split('-').split('(') for size in size_list]
    
                 sheet_data.append({
                     'Photo': image,
                     'Code': product_code,
                     'Description': product_name,
                     'Purchase': calculate_price_in_reais(product_price),
                     'Sale': calculate_sale_price(product_price),
                     'Sizes': sizes
                 })
    
                 driver.execute_script("window.history.go(-1)")
                 WebDriverWait(driver, 20).until(
                     EC.presence_of_all_elements_located((By.XPATH, "//div[@class='product-listing__grid']//div[@class='_root_129ai_6 product-listing__grid-item']/a"))
                 )
                 print("exited")
    
             except TimeoutException:
                 print(f"TimeoutException on product {index}. Retrying...")
                 driver.execute_script("window.history.go(-1)")
                 WebDriverWait(driver, 20).until(
                     EC.presence_of_all_elements_located((By.XPATH, "//div[@class='product-listing__grid']//div[@class='_root_129ai_6 product-listing__grid-item']/a"))
                 )
                 continue
    
         try:
             botao_vermais = WebDriverWait(driver, 10).until(
                 EC.element_to_be_clickable((By.XPATH, "//div[@class='product-listing__view-more']/button"))
             )
             botao_vermais.click()
         except TimeoutException:
             break
    
     except TimeoutException:
         print("TimeoutException on product list. Exiting loop.")
         break
    

1 Comment

How did you test this answer?

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.