0

I want to load all of the URLs in a loop and get data from each of them, but I am unable to do so. Does anybody have any ideas on how to handle multiple URLs in Scrapy?

Here is my code:

    import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
import time
from scrapy.http import Request

class TrSpider(scrapy.Spider):
    """Scrape Google Travel "things to do" cards for several destinations.

    The pages are JavaScript-rendered, so Selenium fetches and expands each
    page up front in ``__init__``; ``parse`` then extracts the attraction
    cards from the saved HTML of the matching URL.
    """

    name = 'tr'
    allowed_domains = ['trc.com']
    start_urls = ['https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0rtv&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAW',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/012gg6&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAc',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/027f5q&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAi',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/02d4_q&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAo',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b1nv7&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAu',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/04m7fh&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARA0',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b95m1&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARA6',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b95tg&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARBA',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/02p77j&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARBG']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        chrome_option = Options()
        chrome_option.add_argument('--headless')  # run without a visible window

        chrome_path = which('chromedriver')

        # BUG FIX: ``chrome_option``/``chrome_path`` were created but never
        # passed to the driver, and ``self.html`` was overwritten on every
        # loop iteration so only the LAST URL's page source survived.  Keep
        # one rendered page per URL instead.
        driver = webdriver.Chrome(executable_path=chrome_path,
                                  options=chrome_option)
        try:
            self.html_by_url = {}
            for a in self.start_urls:
                driver.get(a)
                # Expand the "See all top sights" section before snapshotting.
                driver.find_element_by_xpath('//button[@class="VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-INsAgc VfPpkd-LgbsSe-OWXEXe-dgl2Hf Rj2Mlf OLiIxf PDpWxe J3Eqid"]').click()
                time.sleep(1)  # give the expanded content a moment to render
                self.html_by_url[a] = driver.page_source
        finally:
            driver.quit()  # FIX: the browser process was never closed

    def parse(self, response):
        """Yield one item per attraction card found on the rendered page."""
        ab = response.url
        # Use the Selenium-rendered HTML for THIS url, not the last one fetched.
        resp = Selector(text=self.html_by_url[ab])

        for a in resp.xpath('//div[@class="GwjAi "]'):
            yield{
                'Name': a.xpath('./div[1]/div/text()').get(),
                'Rating': a.xpath('./div[2]/span/span/span[1]/text()').get(),
                'Number of reviews': a.xpath('./div[2]/span/span/span[2]/text()').get(),
                'Discription': a.xpath('./div[3]/text()').extract(),
                'Url': ab
            }

If I run it without the loop it works, but it does not work with multiple links. Thanks.

1
  • Are you getting any error, or where exactly are you facing the problem? Commented Jan 24, 2021 at 21:50

1 Answer 1

1

Try the following:

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from selenium.common.exceptions import TimeoutException

class TrSpider(scrapy.Spider):
    """Scrape Google Travel "things to do" cards for several destinations.

    ``start_requests`` drives each URL through Selenium (clicking the
    "See all top sights" button to expand the page), then attaches the
    rendered HTML to the request via ``meta`` so ``parse`` can use it.
    """

    name = 'tr'
    start_urls = [
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0rtv&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAW',
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/012gg6&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAc',
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/027f5q&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAi',
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/02d4_q&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAo',
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b1nv7&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAu',
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/04m7fh&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARA0',
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b95m1&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARA6',
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b95tg&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARBA',
        'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/02p77j&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARBG'
    ]

    def start_requests(self):
        """Render each start URL with Selenium and hand the HTML to parse()."""
        driver = webdriver.Chrome()
        wait = WebDriverWait(driver, 10)
        try:
            for item_link in self.start_urls:
                driver.get(item_link)
                try:
                    button = wait.until(EC.presence_of_element_located(
                        (By.XPATH, "//span[contains(.,'See all top sights')]")))
                    driver.execute_script("arguments[0].click();", button)
                    # FIX: wait for the clicked node to go stale BEFORE
                    # snapshotting; the original grabbed page_source first and
                    # so captured the pre-click DOM.  Moving this inside the
                    # try also fixes the NameError on ``button`` when the
                    # presence wait above timed out.
                    wait.until(EC.staleness_of(button))
                except TimeoutException:
                    pass  # no expandable section (or no reload); scrape as-is
                htmlelements = driver.page_source
                yield scrapy.Request(item_link, meta={"htmlelements": htmlelements})
        finally:
            driver.quit()  # FIX: the browser was never closed

    def parse(self, response):
        """Yield one item per attraction card from the pre-rendered HTML."""
        ab = response.url
        resp = Selector(text=response.meta.get("htmlelements"))
        for a in resp.xpath('//div[starts-with(@class,"GwjAi")]'):
            yield {
                'Name': a.xpath('./div[1]/div/text()').get(),
                'Rating': a.xpath('./div[2]/span/span/span[1]/text()').get(),
                'Number of reviews': a.xpath('./div[2]/span/span/span[2]/text()').get(),
                'Discription': a.xpath('./div[3]/text()').extract(),
                'Url': ab
            }
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.