Python Beautifulsoup extract text from different span with same class

Question

I'm new to data science and am trying to scrape a real estate website in order to build a dataset of the listings. The problem I run into is that different elements (rooms, surface and number of toilets) share the same li class and span class, so I get the first element (rooms) for the other two elements as well. I tried to implement this solution, but I get this error:
"'str' object has no attribute 'find_next'"
website: https://www.immobiliare.it/vendita-case/milano/
enter image description here

Code:

import requests
from bs4 import BeautifulSoup
import pandas

# Scrape listing data from the results pages under base_url and dump it to a
# CSV file. NOTE: this is the script as posted in the question; it stops with
# an AttributeError at the "Surface" assignment inside the inner loop below.
base_url = "https://www.immobiliare.it/vendita-case/milano/"

# Fetch and parse the first results page; it is used here only to read the
# pagination links.
r = requests.get(base_url)
c = r.content
soup = BeautifulSoup(c, "html.parser")


# To extract the first and last page numbers
paging = soup.find("div",{"id":"listing-pagination"}).find("ul",{"class":"pagination pagination__number"}).find_all("a")
start_page = paging[0].text
# last_page is read here but not used anywhere else in this script.
last_page = paging[len(paging)-1].text

#Empty list to append content
web_content_list = []
# The upper bound is hard-coded to 2, so if start_page is 1 only page 1 is
# visited.
for page_number in range(int(start_page),2):
    # To form the url based on page numbers
    print(page_number)
    url = base_url + "?pag=" + str(page_number)
    r = requests.get(url)
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    #Extract info
    listing_content = soup.find_all("div",{"class":"listing-item_body--content"})
    for item in listing_content:
        #Store info to a dictionary
        web_content_dict = {}
        web_content_dict["Title"] = item.find("p",{"class":"titolo text-primary"}).find("a").get("title")
        # Stores whatever find() returns (a tag, or None when nothing
        # matches), not its text.
        # NOTE(review): the class is spelled "lif__princing" here - verify it
        # against the page markup.
        web_content_dict["Price"] = item.find("li",{"class":"lif__item lif__princing"})
        # .text yields a plain str, so "Rooms" holds a string, not a tag.
        web_content_dict["Rooms"] = item.find("span",{"class":"text-bold"}).text
        # Source of the reported error: find_next is called on the str stored
        # above, and str has no find_next attribute.
        web_content_dict["Surface"] = web_content_dict["Rooms"].find_next("span").text

        #Store dictionary into a list
        web_content_list.append(web_content_dict)

#Make a dataframe with the list
df = pandas.DataFrame(web_content_list)
#Write dataframe to a csv file
df.to_csv("Output.csv")
print("Done")

Also I would prefer not to use selenium, Thanks for the help

Please do not use the images to write comments / texts / tips / errors on top of it. — axel
– axel, Commented Dec 7, 2019 at 16:18

Andrej Kesely · Accepted Answer · 2019-12-07 16:18:14Z

As a quick solution, you can find the <span> for the rooms, then use find_next to get the surface, and find_next again to get the number of toilets:

For example (I also used get_text(strip=True) to strip leading and trailing whitespace from the text):

for item in listing_content:
    # Title comes from the "title" attribute of the link inside the heading.
    title_link = item.find("p", {"class": "titolo text-primary"}).find("a")
    title = title_link.get("title")

    price = item.find("li", {"class": "lif__item lif__pricing"}).get_text(strip=True)

    # Rooms, surface and toilets all use the same span class, so locate the
    # first such <span> and walk forward with find_next for the other two.
    rooms_span = item.find("span", {"class": "text-bold"})
    rooms = rooms_span.get_text(strip=True)

    surface_span = rooms_span.find_next("span")
    surface = surface_span.get_text(strip=True)

    toilets_span = surface_span.find_next("span")
    toilets = toilets_span.get_text(strip=True)

    # Collect the values for this listing in a dictionary
    web_content_dict = {
        "Title": title,
        "Price": price,
        "Rooms": rooms,
        "Surface": surface,
        "Toilets": toilets,
    }

When I printed the variable web_content_dict, it was like this:

{'Title': "Bilocale via Fra' Giovanni Pantaleo 3, Bovisa, Milano", 'Price': '€ 187.000', 'Rooms': '2', 'Surface': '65', 'Toilets': '1'}
{'Title': 'Trilocale via Monte Rosa 15, Amendola - Buonarroti, Milano', 'Price': '€ 730.000', 'Rooms': '3', 'Surface': '140', 'Toilets': '2'}
{'Title': 'Trilocale via San Senatore, 2, Missori, Milano', 'Price': '€ 665.000', 'Rooms': '3', 'Surface': '109', 'Toilets': '2'}
{'Title': 'Quadrilocale viale Duilio 6, Sempione, Milano', 'Price': '€ 1.150.000', 'Rooms': '4', 'Surface': '165', 'Toilets': '2'}
{'Title': "Appartamento piazza Sant'agostino, 6, Corso Genova, Milano", 'Price': '€ 1.650.000', 'Rooms': '5', 'Surface': '275', 'Toilets': '3+'}
{'Title': 'Trilocale via Val Gardena 25, Precotto, Milano', 'Price': '€ 170.000', 'Rooms': '3', 'Surface': '91', 'Toilets': '1'}
{'Title': 'Appartamento corso Di Porta Nuova, Turati, Milano', 'Price': '€ 1.130.000', 'Rooms': '5+', 'Surface': '210', 'Toilets': '3'}
{'Title': 'Trilocale via Francesco Albani 58, Monte Rosa - Lotto, Milano', 'Price': '€ 380.000', 'Rooms': '3', 'Surface': '90', 'Toilets': '1'}
{'Title': 'Bilocale via Antonio Cesari 47, Niguarda, Milano', 'Price': '€ 115.000', 'Rooms': '2', 'Surface': '46', 'Toilets': '1'}
{'Title': 'Trilocale via mazzucotelli 15, Quartiere Forlanini, Milano', 'Price': '€ 215.000', 'Rooms': '3', 'Surface': '91', 'Toilets': '2'}
{'Title': 'Bilocale via Livorno, Palestro, Milano', 'Price': '€ 520.000', 'Rooms': '2', 'Surface': '57', 'Toilets': '1'}
{'Title': 'Bilocale via Maspero 28, Molise - Cuoco, Milano', 'Price': '€ 290.000', 'Rooms': '2', 'Surface': '70', 'Toilets': '1'}
{'Title': 'Trilocale largo Gemito, 3, Casoretto, Milano', 'Price': '€ 308.000', 'Rooms': '3', 'Surface': '93', 'Toilets': '1'}
{'Title': 'Quadrilocale via Pietro Paleocapa, Cadorna - Castello, Milano', 'Price': '€ 1.300.000', 'Rooms': '4', 'Surface': '180', 'Toilets': '3'}
{'Title': 'Bilocale via Renato Fucini, Città Studi, Milano', 'Price': '€ 511.000', 'Rooms': '2', 'Surface': '85', 'Toilets': '1'}
{'Title': 'Quadrilocale via Lucca, Bisceglie, Milano', 'Price': '€ 275.000', 'Rooms': '4', 'Surface': '100', 'Toilets': '1'}
{'Title': 'Trilocale via RIZZARDI 45, Trenno, Milano', 'Price': '€ 485.000', 'Rooms': '3', 'Surface': '127', 'Toilets': '1'}
{'Title': 'Bilocale via bacchiglione, Corvetto, Milano', 'Price': '€ 220.000', 'Rooms': '2', 'Surface': '50', 'Toilets': '1'}
{'Title': 'Quadrilocale via Cadore, Cadore, Milano', 'Price': '€ 1.060.000', 'Rooms': '4', 'Surface': '210', 'Toilets': '2'}
{'Title': 'Bilocale via  bacchiglione, Corvetto, Milano', 'Price': '€ 195.000', 'Rooms': '2', 'Surface': '42', 'Toilets': '1'}
{'Title': 'Bilocale buono stato, primo piano, Brera, Milano', 'Price': '€ 800.000', 'Rooms': '2', 'Surface': '87', 'Toilets': '2'}
{'Title': 'Trilocale via  bacchiglione, Corvetto, Milano', 'Price': '€ 540.000', 'Rooms': '3', 'Surface': '120', 'Toilets': '2'}
{'Title': 'Bilocale via bacchiglione, Corvetto, Milano', 'Price': '€ 350.000', 'Rooms': '2', 'Surface': '81', 'Toilets': '1'}
{'Title': 'Bilocale via  bacchiglione, Corvetto, Milano', 'Price': '€ 265.000', 'Rooms': '2', 'Surface': '50', 'Toilets': '1'}
{'Title': 'Appartamento via Antonio Pianella, 4, San Siro, Milano', 'Price': '€ 649.000', 'Rooms': '5+', 'Surface': '150', 'Toilets': '3'}

Antoine Boucher · Accepted Answer · 2019-12-07 16:38:25Z

I tried to improve your script by using find_all and indexing into the resulting tags (the indices were found by trial and error), but you could also use the .next_siblings property in bs4.

import requests
from bs4 import BeautifulSoup
import pandas
# Scrape title, price, rooms, surface, bathrooms and floor for each listing on
# the results pages under base_url, then write them to Output.csv.
base_url = "https://www.immobiliare.it/vendita-case/milano/"

# Fetch and parse the first results page; it is used here only to read the
# pagination links.
r = requests.get(base_url)
c = r.content
soup = BeautifulSoup(c, "html.parser")


# To extract the first and last page numbers
paging = soup.find("div", {"id": "listing-pagination"}).find("ul", {"class": "pagination pagination__number"}).find_all("a")
start_page = paging[0].text
last_page = paging[-1].text

# Empty list to append content
web_content_list = []
# The upper bound is hard-coded to 2, so only the first page is scraped when
# start_page is 1; use int(last_page) + 1 instead to walk every page.
for page_number in range(int(start_page), 2):
    # To form the url based on page numbers
    print(page_number)
    url = base_url + "?pag=" + str(page_number)
    r = requests.get(url)
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    # Extract info
    listing_content = soup.find_all("div", {"class": "listing-item_body--content"})
    for item in listing_content:
        # Look the feature <li> items up once per listing instead of
        # repeating the same search for every field. The positions used
        # below (1 = rooms, 2 = surface, 3 = bath, 4 = floor) are the ones
        # the original script relied on.
        features = item.find_all("li", {"class": "lif__item"})

        # Store info to a dictionary
        web_content_dict = {}
        web_content_dict["Title"] = item.find("p", {"class": "titolo text-primary"}).find("a").get("title")
        # Store the price text, not the tag(s); the class is "lif__pricing".
        web_content_dict["Price"] = item.find("li", {"class": "lif__item lif__pricing"}).get_text(strip=True)
        web_content_dict["Rooms"] = features[1].find("span", {"class": "text-bold"}).get_text(strip=True)
        web_content_dict["Surface"] = features[2].find("span", {"class": "text-bold"}).get_text(strip=True)
        web_content_dict["Bath"] = features[3].find("span", {"class": "text-bold"}).get_text(strip=True)
        try:
            web_content_dict["Floor"] = features[4].find("abbr", {"class": "text-bold"}).get_text(strip=True)
        except IndexError:
            # No fifth feature item on this listing: fall back to 1.
            # NOTE(review): 1 is an assumed default, not scraped data.
            web_content_dict["Floor"] = 1

        # Store dictionary into a list
        web_content_list.append(web_content_dict)

# Make a dataframe with the list
df = pandas.DataFrame(web_content_list)
print(df)
# Write dataframe to a csv file
df.to_csv("Output.csv")
print("Done")

Collectives™ on Stack Overflow

Python Beautifulsoup extract text from different span with same class

2 Answers 2

Comments

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related