Scrape information using scrapy

Question

I am trying to scrape the information shown below, but my spider produces the wrong output. What mistakes am I making? This is the page link: https://www.thegrommet.com/products/the-vintage-pearlmini-peas-in-the-pod-necklace

from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Spider that follows links from a listing page and yields one dict per followed page.

    ``parse`` schedules a request for every href it finds on the start page;
    ``parse_book`` builds the item (``title``, ``d1``, ``d2``, ``d3``) from
    each followed page.
    """
    name = 'book'
    start_urls = ['https://www.thegrommet.com/gifts/by-type/personalized-gifts']
    # Per-spider settings: one request at a time per domain, a delay value of 1
    # between downloads, and a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }



    def parse(self, response):
        """Yield a Request (handled by ``parse_book``) for every href found on the page."""
        # Selects every href attribute beneath <div> elements whose class
        # attribute is exactly 'flex-grow | p-t-s'.
        books = response.xpath("//div[@class='flex-grow | p-t-s']//@href").extract()
        for book in books:
            # Resolve the (possibly relative) href against the page URL.
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield one dict with the page title and three text values (d1..d3)."""
        # First text node found under a <div class="f-heading-xl">.
        title=response.xpath("//div[@class='f-heading-xl']//text()").get()
        # NOTE(review): if the XPath above matches nothing, .get() presumably
        # returns None and this .strip() raises AttributeError -- confirm.
        title=title.strip()
        d3=response.xpath("//div[@class='accordion-section | p-t-s p-b-m']")
        for pro in d3:
            # NOTE(review): this expression starts with '//', so it is most
            # likely evaluated from the document root rather than relative to
            # `pro`; './/div//text()' is probably what was intended -- confirm
            # against the Scrapy selector docs.
            data=[tup for tup in pro.xpath('//div//text()')]
            # Each try/except below falls back to '' when `data` has no entry
            # at that index. NOTE(review): the bare `except:` also hides any
            # other error, not only IndexError.
            try:
                trip=data[1].get()
            except:
                trip=''
            trip=trip.strip()
            try:
                tuck=data[2].get()
            except:
                tuck=''
            tuck=tuck.strip()
            try:
                tup=data[3].get()
            except:
                tup=''
            tup=tup.strip()
            
        # NOTE(review): this yield is outside the `for` loop, so a single item
        # is emitted using the values left over from the LAST iteration. If
        # `d3` is empty, trip/tuck/tup are never assigned and this raises.
        yield{ 
            'title':title,
            'd1':trip,
            'd2':tuck,
            'd3':tup,
            
            
            }

PIC2:

Md. Fazlul Hoque · Accepted Answer · 2022-05-29 21:11:50Z

You can select d1, d2 and d3 with the XPath expressions shown below. There is no need to use try/except, because Scrapy handles missing values itself: `.get()` returns None instead of raising when nothing matches. You can also use the built-in XPath function normalize-space() to remove leading and trailing whitespace and newlines.

Full working code:

from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    """Crawl the personalized-gifts listing and yield one item per linked page.

    ``parse`` follows every link found in the listing and ``parse_book``
    extracts the title, the first three detail blocks and the page URL from
    each followed page.
    """

    name = 'book'
    start_urls = ['https://www.thegrommet.com/gifts/by-type/personalized-gifts']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.62 Safari/537.36'
    }

    # XPath template for the whitespace-normalized first text node of the
    # N-th <div> child of the accordion sections; {index} is 1-based.
    # normalize-space() makes the expression string-valued, so a missing node
    # yields '' rather than an error.
    _DETAIL_XPATH = (
        'normalize-space((//*[@class="accordion-section | p-t-s p-b-m"]/div)'
        '[{index}]/text()[1])'
    )

    def parse(self, response):
        """Yield a Request (handled by ``parse_book``) for every listing link.

        Args:
            response: the downloaded listing page.

        Yields:
            One ``Request`` per href found under the
            ``div[@class='flex-grow | p-t-s']`` elements.
        """
        hrefs = response.xpath("//div[@class='flex-grow | p-t-s']//@href").getall()
        for href in hrefs:
            # hrefs may be relative; resolve them against the page URL.
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield a single item dict scraped from a followed page.

        Args:
            response: the downloaded page.

        Yields:
            A dict with keys ``title``, ``d1``, ``d2``, ``d3`` and ``url``.
            ``title`` is '' when the heading is not found.
        """
        # default='' keeps .strip() from raising AttributeError on pages
        # where the heading element is absent (plain .get() returns None).
        title = response.xpath("//div[@class='f-heading-xl']//text()").get(default='')

        item = {'title': title.strip()}
        for index in (1, 2, 3):
            xpath = self._DETAIL_XPATH.format(index=index)
            item['d{}'.format(index)] = response.xpath(xpath).get()
        item['url'] = response.url

        yield item

Collectives™ on Stack Overflow

Scrape information using scrapy

1 Answer 1

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Related