Scrapy and selenium - works when using selenium alone, but not with scrapy

Question

I am practicing Scrapy and Selenium on Booking.com.

For some reason, using the following Selenium code works just fine.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which

# chrome_options = Options()
# chrome_options.add_argument("--headless")

# Locate the chromedriver binary on PATH (which() returns None if it is absent).
chrome_path = which("chromedriver")

# Start a visible Chrome session controlled through that chromedriver binary.
driver = webdriver.Chrome(executable_path=chrome_path)
try:
    # Load the Booking.com search-results page for the chosen property and dates.
    driver.get("https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0")

    # The predicate compares the full class attribute string, spaces included.
    cards = driver.find_elements_by_xpath('//div[@class="sr_item  sr_item_new sr_item_default sr_property_block  sr_flex_layout          "]')

    for c in cards:
        # The first <span> inside the card's heading link holds the property name.
        print('name', c.find_element_by_xpath('./div/div/div/div/h3/a/span[1]').text)
finally:
    # Always end the session so the browser and chromedriver processes do not
    # outlive the script, even when a lookup above raises.
    driver.quit()

The above code returns the names of the properties.

But when I try to use Selenium together with Scrapy, it fails.

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy_selenium import SeleniumRequest


class BookSpider(scrapy.Spider):
    """Spider that requests one Booking.com search-results page via SeleniumRequest."""

    name = 'book'
    allowed_domains = ['booking.com']

    def start_requests(self):
        """Yield a single SeleniumRequest for the search page, handled by ``parse``."""
        search_url = 'https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0'
        yield SeleniumRequest(url=search_url, wait_time=5, callback=self.parse)

    def parse(self, response):
        """Yield one ``{'name': ...}`` item for every node the card XPath selects."""
        # This predicate compares the entire class attribute string, spaces included.
        card_xpath = '//div[@class="sr_item  sr_item_new sr_item_default sr_property_block  sr_flex_layout          "]'
        # Relative path from a card to the text of the first <span> in its heading link.
        name_xpath = './div/div/div/div/h3/a/span[1]/text()'

        for card in response.xpath(card_xpath):
            yield {'name': card.xpath(name_xpath).get()}

I really can't figure this out, because the only difference is that I'm using SeleniumRequest. If anyone has any ideas, it would be great...

Thanks

AaronS · Accepted Answer · 2020-07-12 14:00:24Z

This worked for me. My guess is that the problem is the XPath in your cards variable: it has to match the whole class attribute exactly, including all of the spaces. I tend to use contains(@class,'ATTRIBUTE') for long attribute values I want to select.

Code

class TestSpider(scrapy.Spider):
    """Spider that fetches Booking.com search results via SeleniumRequest.

    Each result card is located with a partial (``contains``) match on its
    class attribute, and one ``{'name': ...}`` item is yielded per card that
    has a hotel-name span.
    """

    name = 'test'
    allowed_domains = ['booking.com']
    start_urls = ['https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0']

    def start_requests(self):
        """Yield a SeleniumRequest for every URL in ``start_urls``.

        Each request is created with ``wait_time=5`` and is handled by
        ``parse``.
        """
        for url in self.start_urls:
            yield SeleniumRequest(url=url, wait_time=5, callback=self.parse)

    def parse(self, response):
        """Yield ``{'name': <hotel name>}`` for each result card on the page.

        Cards whose hotel-name span cannot be found are logged at debug level
        and skipped.
        """
        # A substring match tolerates variations in the rest of the class list.
        cards = response.xpath('//div[contains(@class,"sr_item  s")]')

        for c in cards:
            title = c.xpath('.//span[contains(@class,"hotel")]/text()').get()
            if title is None:
                # .get() returns None when nothing matches; calling .strip()
                # on it would raise AttributeError and abort the callback.
                self.logger.debug('Result card without a hotel-name span on %s', response.url)
                continue
            yield {'name': title.strip()}

Collectives™ on Stack Overflow

Scrapy and selenium - works when using selenium alone, but not with scrapy

1 Answer 1

Code

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Code

Comments

Your Answer

Sign up or log in

Post as a guest

Related