My thinking is that if I add something that splits the URL range into 5 parts and gives each of 5 chromedriver instances its own part of the range to handle, scraping would be much faster. That's my biggest question. But then maybe it's better for each chromedriver to have its own CSV file, or would I need to add something that pools all the scraped data into one file? I'm really at a loss here and I'm already pushing my skill level. I would be eternally grateful for any concrete help on at least how to get multithreading working. Thank you!
import csv
import time

from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Visit sequentially numbered record pages in a single Chrome session, read
# the "Name:" row of the record table on each page, and append one
# (record number, name) row per page to the CSV file at `path_to_file`.
path_to_file = "test1.csv"
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
header_added = False
try:
    time.sleep(3)
    # The context manager flushes and closes the CSV file even if a page
    # load raises partway through the run, so rows already written are kept.
    with open(path_to_file, 'a', encoding="utf-8", newline='') as csvFile:
        csvWriter = csv.writer(csvFile)
        for i in range(1, 153512):
            # Build the URL once; it is both logged and loaded.
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(
                    By.XPATH,
                    "//table[@id='recordServiceData']//tr[contains(.,'Name:')]",
                ).text.replace("Name:", "")
            except (NoSuchElementException, StaleElementReferenceException):
                # No usable "Name:" row on this page: record an empty name
                # and keep going. Catching only Selenium lookup errors (not
                # a bare `except:`) lets Ctrl+C still stop the scrape.
                Name = ''
            csvWriter.writerow([i, Name])
            print(Name)
finally:
    # Always shut the browser down so no chromedriver/Chrome processes are
    # left running when the script finishes or fails.
    driver.quit()