
I'm getting an IndexError somewhere when fetching data with BeautifulSoup. I can pull a lot of the data, but the script breaks at some point. How can I fix it?

import requests
from bs4 import BeautifulSoup
totalCar = 0
for pageNumber in range(3, 7):
    r = requests.get("https://www.autoscout24.com/lst/bmw?sort=standard&desc=0&offer=U&ustate=N%2CU&size=20&page="+
        str(pageNumber)+"&cy=D&mmm=47%7C%7C&mmm=9%7C%7C&atype=C&")
    r.status_code
    r.content
    soup = BeautifulSoup(r.content,"lxml")
    #soup.prettify
    car_details = soup.find_all("div",attrs={"class":"cl-list-element cl-list-element-gap"})
    for detail in car_details:
        car_link = "https://www.autoscout24.com"+detail.a.get("href")
        #print(car_link)
        car_r = requests.get(car_link)
        car_soup = BeautifulSoup(car_r.content,"lxml")
        car_make = car_soup.find("div",attrs={"class":"cldt-categorized-data cldt-data-section sc-pull-right"}).select("dl > dd:nth-of-type(1)")[0].text
        #car_model = car_soup.find("div",attrs={"class":"cldt-categorized-data cldt-data-section sc-pull-right"}).select("dl > dd:nth-of-type(2)")[0].text
        car_model = car_soup.find("div",attrs={"class":"cldt-categorized-data cldt-data-section sc-pull-right"}).select("dl > dd > a")[0].text
        car_year = car_soup.find("div",attrs={"class":"cldt-categorized-data cldt-data-section sc-pull-right"}).select("dl > dd > a")[1].text
        car_color = car_soup.find("div",attrs={"class":"cldt-categorized-data cldt-data-section sc-pull-right"}).select("dl > dd > a")[2].text
        car_body = car_soup.find("div",attrs={"class":"cldt-categorized-data cldt-data-section sc-pull-right"}).select("dl > dd > a")[3].text

        print("Make:{} Model:{} Year:{} Color:{} Body:{}".format(car_make,car_model,car_year,car_color,car_body))
        print("-"*20)
        totalCar+=1
    print(totalCar)
  • It would be nice if we could have the traceback! (Commented Aug 29, 2020 at 23:07)
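
For reference, one way to surface that traceback, together with the listing that triggers it, is to wrap the per-listing block in try/except so the loop keeps going and reports the failing URL. This is only a sketch reusing the question's URL and selectors; the try/except wrapper is not part of the original code and the page-structure assumptions may not match the live site:

import traceback

import requests
from bs4 import BeautifulSoup

base = ("https://www.autoscout24.com/lst/bmw?sort=standard&desc=0&offer=U"
        "&ustate=N%2CU&size=20&page={}&cy=D&mmm=47%7C%7C&mmm=9%7C%7C&atype=C&")

for pageNumber in range(3, 7):
    soup = BeautifulSoup(requests.get(base.format(pageNumber)).content, "lxml")
    for detail in soup.find_all("div", attrs={"class": "cl-list-element cl-list-element-gap"}):
        car_link = "https://www.autoscout24.com" + detail.a.get("href")
        try:
            car_soup = BeautifulSoup(requests.get(car_link).content, "lxml")
            section = car_soup.find("div", attrs={"class": "cldt-categorized-data cldt-data-section sc-pull-right"})
            a = section.select("dl > dd > a")
            # indexing exactly as in the question; a[3] raises IndexError
            # when the body-type entry is missing
            print(a[0].text, a[1].text, a[2].text, a[3].text)
        except Exception:
            print("Failed on:", car_link)   # the listing that broke
            traceback.print_exc()           # full traceback, as asked for above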

1 Answer


Sometimes the car body information isn't present, so you need to check for that before indexing:

import requests
from bs4 import BeautifulSoup

totalCar = 0
for pageNumber in range(3, 7):
    r = requests.get("https://www.autoscout24.com/lst/bmw?sort=standard&desc=0&offer=U&ustate=N%2CU&size=20&page="+
        str(pageNumber)+"&cy=D&mmm=47%7C%7C&mmm=9%7C%7C&atype=C&")
    r.status_code
    r.content
    soup = BeautifulSoup(r.content,"lxml")
    #soup.prettify
    car_details = soup.find_all("div",attrs={"class":"cl-list-element cl-list-element-gap"})
    for detail in car_details:
        car_link = "https://www.autoscout24.com"+detail.a.get("href")
        #print(car_link)
        car_r = requests.get(car_link)
        print(car_link)
        car_soup = BeautifulSoup(car_r.content,"lxml")
        car_make = car_soup.find("div",attrs={"class":"cldt-categorized-data cldt-data-section sc-pull-right"}).select("dl > dd:nth-of-type(1)")[0].text

        a = car_soup.find("div",attrs={"class":"cldt-categorized-data cldt-data-section sc-pull-right"}).select("dl > dd > a")

        car_model = a[0].text
        car_year = a[1].text
        car_color = a[2].text
        car_body = a[3].text if len(a) > 3 else '-'  # <-- check if the car body information is present

        print("Make:{} Model:{} Year:{} Color:{} Body:{}".format(car_make,car_model,car_year,car_color,car_body))
        print("-"*20)
        totalCar+=1
    print(totalCar)

Prints:

...

--------------------
https://www.autoscout24.com/offers/mercedes-benz-a-180-blueefficiency-limousine-5tuerig-gasoline-grey-73cbbad4-ab1c-4163-a7cf-76037408fcb8
Make:
Mercedes-Benz
 Model:A 180 Year:2009 Color:Grey Body:Sedans
--------------------
https://www.autoscout24.com/offers/audi-a4-ambiente-1-8-ahk-xenon-sitzh-pdc-tempom-8fach-gasoline-black-f6517012-9dfb-4d93-a7dd-d0b9b9bdbbc6
Make:
Audi
 Model:A4 Year:2008 Color:Black Body:Sedans
--------------------
80
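
The same idea can be taken a bit further. The "Make:" value in the output above still carries surrounding newlines, and any of the other fields could be missing too, so a small helper that strips the text and falls back to a default for any missing index keeps the loop from ever raising an IndexError. This is a sketch only; the helper names (nth_text, extract_car) are not part of the original code, and the CSS classes and selectors are taken from the answer above:

def nth_text(tags, index, default="-"):
    # stripped text of tags[index], or a default when the entry is missing
    return tags[index].get_text(strip=True) if len(tags) > index else default

def extract_car(car_soup):
    # car_soup is the BeautifulSoup object built per listing, as in the loop above
    section = car_soup.find("div", attrs={"class": "cldt-categorized-data cldt-data-section sc-pull-right"})
    if section is None:          # layout changed or the page did not load
        return {}
    make = section.select("dl > dd:nth-of-type(1)")
    links = section.select("dl > dd > a")
    return {
        "make": nth_text(make, 0),
        "model": nth_text(links, 0),
        "year": nth_text(links, 1),
        "color": nth_text(links, 2),
        "body": nth_text(links, 3),
    }

In the loop, the individual car_make/car_model/... assignments would then be replaced with something like info = extract_car(car_soup), reading info["make"], info["body"], and so on.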