0

I am practicing Scrapy and Selenium on Booking.com.

For some reason, using the following Selenium code works just fine.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which

# chrome_options = Options()
# chrome_options.add_argument("--headless")

# Look up the chromedriver binary on PATH; `which` returns None if it is not found.
chrome_path = which("chromedriver")

driver = webdriver.Chrome(executable_path=chrome_path)
try:
    driver.get("https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0")

    # Select the result cards. The predicate compares the whole class attribute
    # string, so the exact spacing inside the literal is significant.
    cards = driver.find_elements_by_xpath('//div[@class="sr_item  sr_item_new sr_item_default sr_property_block  sr_flex_layout          "]')

    for c in cards:
        # The property name is read from the first <span> inside the card's <h3><a>.
        print('name', c.find_element_by_xpath('./div/div/div/div/h3/a/span[1]').text)
finally:
    # Always shut the browser down, even if loading or a lookup raised;
    # otherwise the Chrome and chromedriver processes are left running.
    driver.quit()

The above code returns the names of the properties.

But when I try to use Selenium together with Scrapy, it fails.

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy_selenium import SeleniumRequest


class BookSpider(scrapy.Spider):
    """Request one Booking.com search-results page via SeleniumRequest and yield property names."""

    name = 'book'
    allowed_domains = ['booking.com']

    def start_requests(self):
        """Yield a single SeleniumRequest for the search-results URL, handled by `parse`."""
        search_url = 'https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0'
        yield SeleniumRequest(url=search_url, wait_time=5, callback=self.parse)

    def parse(self, response):
        """Yield one ``{'name': ...}`` dict per result card matched in `response`.

        The name is whatever ``.get()`` returns for the card's title text node.
        """
        # NOTE(review): this predicate compares the entire class attribute string,
        # spaces included, so any whitespace difference in the markup means no match.
        card_xpath = '//div[@class="sr_item  sr_item_new sr_item_default sr_property_block  sr_flex_layout          "]'
        name_xpath = './div/div/div/div/h3/a/span[1]/text()'

        for card in response.xpath(card_xpath):
            yield {'name': card.xpath(name_xpath).get()}

I really can't figure this out, because the only difference is that I'm using SeleniumRequest. If anyone has any ideas, that would be great...

Thanks

1 Answer 1

1

This worked for me. My guess is that the problem is the XPath assigned to your cards variable: it has to match the whole class string exactly, including all of the spaces. For long class attributes I tend to use contains(@class,'ATTRIBUTE') instead.

Code

class TestSpider(scrapy.Spider):
    """Request the Booking.com search-results page(s) via SeleniumRequest and yield property names."""

    name = 'test'
    allowed_domains = ['booking.com']
    start_urls = ['https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0']

    def start_requests(self):
        """Yield a SeleniumRequest for every URL in `start_urls`, handled by `parse`."""
        for url in self.start_urls:
            yield SeleniumRequest(url=url, wait_time=5, callback=self.parse)

    def parse(self, response):
        """Yield ``{'name': <stripped title>}`` for each matched card that has a title.

        Cards with no matching title text are skipped rather than raising.
        """
        # Substring match on the class attribute; the literal includes the
        # double space and trailing "s", so only class strings containing
        # exactly that sequence are selected.
        cards = response.xpath('//div[contains(@class,"sr_item  s")]')

        for c in cards:
            title = c.xpath('.//span[contains(@class,"hotel")]/text()').get()
            # .get() returns None when nothing matches; calling .strip() on it
            # would raise AttributeError and abort the rest of this callback.
            if title is None:
                continue
            yield {'name': title.strip()}
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.