I want to build a web scraper in Scrapy that extracts 10,000 news links from this website: https://hamariweb.com/news/newscategory.aspx?cat=7. The page is dynamic: more links load as I scroll down. I tried it with Selenium, but it's not working.
import time

import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from scrapy import signals
from scrapy.http import HtmlResponse


class WebnewsSpider(scrapy.Spider):
    name = 'webnews'
    allowed_domains = ['hamariweb.com']
    start_urls = ['https://hamariweb.com/news/newscategory.aspx?cat=7']

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        # options.add_argument('--blink-settings=imagesEnabled=false')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        self.driver = webdriver.Chrome("C://Users//hammad//Downloads//chromedriver",
                                       chrome_options=options)

    def parse(self, response):
        self.driver.get(response.url)
        pause_time = 1
        last_height = self.driver.execute_script("return document.body.scrollHeight")

        while True:
            # scroll to the bottom so the next batch of links loads
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight + 400);")
            time.sleep(pause_time)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break  # page height did not change, so nothing new loaded
            last_height = new_height
The code above opens a browser in incognito mode and keeps scrolling down. I also want to extract 10,000 news links and stop the browser once that limit is reached.
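Setting Scrapy aside for a moment, this is roughly the scroll-and-collect logic I am after. It is only a sketch: the "/news/" filter on the hrefs is my guess at what distinguishes article links on this page, the output filename is arbitrary, and the chromedriver path is just the one from my machine.

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

MAX_LINKS = 10000
PAUSE_TIME = 1

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
driver = webdriver.Chrome("C://Users//hammad//Downloads//chromedriver", chrome_options=options)
driver.get("https://hamariweb.com/news/newscategory.aspx?cat=7")

links = set()
last_height = driver.execute_script("return document.body.scrollHeight")

while len(links) < MAX_LINKS:
    # grab every anchor currently in the DOM; the "/news/" check is an assumption
    # about how article URLs look on this site
    for a in driver.find_elements(By.TAG_NAME, "a"):
        href = a.get_attribute("href")
        if href and "/news/" in href:
            links.add(href)

    # scroll to the bottom so the next batch of links loads
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(PAUSE_TIME)

    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # page stopped growing before the limit was reached
    last_height = new_height

driver.quit()

# keep at most 10,000 links and write them out
with open("news_links.txt", "w") as f:
    f.write("\n".join(list(links)[:MAX_LINKS]))

The idea is to stop as soon as either 10,000 unique links have been collected or the page height stops growing, whichever comes first.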