I am scraping an Amazone page using Python and saving the result into a csv file. This code is running well, but the problem is that I get some product names without the first word. So for example here I get only: "Schuko Steckdose, EU-Standard 1 Fach Unterputz Mit 2,5D Curved Glas Platte, Wandsteckdose Weiß 86 * 86mm", but it is supposed to be "TAWOIA Schuko Steckdose, EU-Standard 1 Fach Unterputz Mit 2,5D Curved Glas Platte, Wandsteckdose Weiß 86 * 86mm" [enter image description here][1]
here is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
'Accept-Language': 'de-DE, de;q=0.5'
}
search_query = 'steckdose'.replace(' ', '+')
base_url = 'https://www.amazon.de/s?k={0}'.format(search_query)
items = []
for i in range(1, 2):
print('Processing {0}...'.format(base_url + '&page={0}'.format(i)))
response= requests.get(base_url + '&page={0}'.format(i), headers = headers)
if response.status_code !=200:
print(f"Error: {response.status_code}")
continue
soup = BeautifulSoup(response.content, 'html.parser')
results = soup.find_all('div', {'data-component-type': 's-search-result'})
if not results:
print('No results found')
continue
for result in results:
try:
# Find the <a> tag first
link = result.find('a', class_='a-link-normal s-line-clamp-4 s-link-style a-text-normal')
if link:
# Extract product name from the <span> tag inside <h2>
product_name = link.find('h2').find('span').text.strip() # Get text from <span>
product_url = 'https://www.amazon.de' + link['href']
items.append([product_name, product_url])
except AttributeError:
continue
sleep(1.5)
df = pd.DataFrame(items, columns=['product', 'product url'])
df.to_csv('{0}.csv'.format(search_query), index = False)