0

I have the following code, which uses Selenium to scrape this page (the list of albums, and the list of songs that appears when you click on an album). The script runs, but I would like to create a pandas DataFrame with one column containing the albums (one per row) and another column containing the list of songs.

I need it to use the data in Excel.

Thanks for all,

Léa

from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

import sys

# Scrape the Genius "Discographie Rap 2021" page: print every album label and,
# after clicking it, the list of track titles shown in the annotation panel.

PATH = '/Users/prati/Desktop/WDD/Projet_Rapgenius/chromedriver'
# Choose the browser.
driver = webdriver.Chrome(PATH)

try:
    # Open the target page and give it time to load.
    driver.get('https://genius.com/Genius-france-discographie-rap-2021-annotated')
    sleep(2)

    # Click "accept" on the cookie banner.
    accept_button = driver.find_element_by_id('onetrust-accept-btn-handler')
    accept_button.send_keys(Keys.ENTER)
    sleep(2)

    # Find the album title / date / artist fragments.
    links = driver.find_elements_by_class_name('ReferentFragmentVariantdesktop__Highlight-sc-1837hky-1.jShaMP')
    for link in links:
        try:
            # Print the album label, then click it to open its annotation.
            album = link.text
            print(album)
            link.click()
            sleep(1)
            # Print every track title listed in the annotation.
            div = driver.find_element_by_class_name('RichText__Container-oz284w-0.gVsQub')
            morceaux = div.find_elements_by_tag_name('li')
            for morceau in morceaux:
                print(morceau.text)
        except Exception as exc:
            # Best effort: skip this fragment but report why. Catching
            # Exception (not a bare except) lets Ctrl-C still stop the script.
            print(f"Skipped one album: {exc!r}", file=sys.stderr)
finally:
    # Always shut the browser session down, even if a step above failed.
    driver.quit()
2
  • one row for all songs? Commented Dec 7, 2021 at 16:31
  • The best would be one song per column (like song1, song2), but there is not the same number of songs each time. Commented Dec 7, 2021 at 16:35

1 Answer 1

1
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

import sys

# Scrape every album on the Genius "Discographie Rap 2021" page together with
# its track list, then export one row per album (one song per column) to Excel.

options = webdriver.ChromeOptions()
options.add_argument("--disable-popup-blocking")
options.add_argument('--no-default-browser-check')
options.add_argument('--log-level=3')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--start-maximized')
options.add_experimental_option("detach", True)
service = Service('driver/chromedriver.exe')
driver = webdriver.Chrome(options=options, service=service)

ALBUM_XPATH = "//span[contains(@class, 'ReferentFragmentVariantdesktop__Highlight')]"
ANNOTATION_XPATH = "//div[contains(@class, 'Annotation__Container')]"

ListAlbunsDF = []     # one album label per DataFrame row
ListMusicsDF = []     # one list of song titles per album, aligned with ListAlbunsDF
ListMusicsAlbum = []  # songs of the album currently being processed

try:
    driver.get('https://genius.com/Genius-france-discographie-rap-2021-annotated')
    WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//p/b")))

    # Query the album fragments once; the total is reused for the progress output.
    album_links = driver.find_elements(By.XPATH, ALBUM_XPATH)
    for k in album_links:
        # Fresh list per album so rows never share (or leak) songs.
        ListMusicsAlbum = []
        try:
            k.click()
            WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, ANNOTATION_XPATH)))
            for i in driver.find_elements(By.XPATH, ANNOTATION_XPATH + "//li"):
                ListMusicsAlbum.append(str(i.text))
        except Exception as exc:
            # Best effort: keep the album row (with whatever songs were read)
            # but report the failure. Catching Exception instead of a bare
            # except lets Ctrl-C still interrupt the run.
            print(f"Could not read all songs for one album: {exc!r}", file=sys.stderr)
        ListAlbunsDF.append(str(k.text))
        ListMusicsDF.append(ListMusicsAlbum)

        # to track the progress:
        print("{:.0%}".format(len(ListAlbunsDF) / len(album_links)))
finally:
    # Always end the browser session so no headless Chrome process is left behind.
    driver.quit()

# Rows are albums; columns are song positions, renumbered to start at 1.
df = pd.DataFrame(ListMusicsDF, index=ListAlbunsDF)
df.columns += 1
df.to_excel('au.xlsx')
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.