I'm working on a Scrapy bot that gets specific details for optics. I need to click a JavaScript button to reveal a virtual page so that my Scrapy bot can scrape the optic details.
This is what I need Playwright to click on, shown in the screenshot: the Details tab, highlighted in a red rectangle.
On certain pages the first item's details pane is already showing (an example URL is in the code comments below).
I probably need to create some sort of if/else statement for this. I would have worked on it, but I've been stuck on the prior issue.
import scrapy

class UpperSpider(scrapy.Spider):
    name = 'Optic'
    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url,
                                 meta={'playwright': True})
    # Issue here: I'm not sure this is working. I want to click on the Details tab.
    def virtualpage(self, response, page):
        # Virtual page button
        vpButton = response.css('div[id="wrap"]')
        for page in vpButton:
            page.click('#detailTab')
        # Also, some pages, for instance
        # https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/carbine-optic-aco--prod73112.aspx,
        # already have their virtual pages showing. I think I would need an
        # if statement to make sure it didn't close the page.
    def parse(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
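For the "already showing" pages, one option is to move the if/else into the browser itself: run a small JavaScript check through scrapy-playwright's evaluate coroutine and only click the tab when the details pane is hidden. Below is a minimal sketch under two assumptions about Brownells' markup that should be verified in devtools: that #detailWrap is the pane the tab reveals, and that a hidden pane has offsetParent === null.

import scrapy
from scrapy_playwright.page import PageCoroutine

# Click the Details tab only when the pane is not already visible.
# '#detailWrap' as the pane selector is an assumption; adjust as needed.
CLICK_DETAILS_IF_HIDDEN = """
() => {
    const pane = document.querySelector('#detailWrap');
    const visible = pane && pane.offsetParent !== null;
    if (!visible) {
        const tab = document.querySelector('#detailTab');
        if (tab) tab.click();
    }
}
"""

class OpticSpider(scrapy.Spider):
    name = 'Optic'
    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx',
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                meta={
                    'playwright': True,
                    # Runs in the page before the response is handed to the spider,
                    # so parse() sees the HTML with the details pane open.
                    'playwright_page_coroutines': [
                        PageCoroutine('evaluate', CLICK_DETAILS_IF_HIDDEN),
                    ],
                },
            )

Because the visibility check happens inside the page, the same request works for both kinds of product pages and there's no risk of toggling an already-open pane closed.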
Okay, so I tried to make the Scrapy crawler work. I'm pretty sure the problem is in my start_requests: the for url in self.start_urls loop only tells Playwright to handle the start URLs. How do I tell Playwright to also handle each crawled page, so that "clickallbtns" can run?
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine

class UpperSpider(CrawlSpider):
    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = ['https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm']

    le_item_details = LinkExtractor(restrict_css='.listing')
    rule_product_detail = Rule(le_item_details,
                               callback='parse_item',
                               follow=True,
                               )
    rules = (
        rule_product_detail,
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 meta={'playwright': True,
                                       'playwright_page_coroutines': {
                                           # "waitforload": PageCoroutine("waitforNavagation", 'url'),
                                           "clickallbtns": PageCoroutine("evaluate", 'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                                       }
                                       }
                                 )
    def parse_item(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
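The start_requests method only attaches the Playwright meta to the start URLs; the requests the Rule extracts are built by CrawlSpider itself with plain meta, so "clickallbtns" never runs on the product pages. One way to fix that is a process_request hook on the Rule, which lets you modify every followed request before it's scheduled. A minimal sketch, with the clickallbtns evaluate call carried over unchanged (the .listing and #detailTab selectors remain assumptions from the code above):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine


def use_playwright(request, response):
    # Attach the Playwright meta to every request the Rule extracts,
    # so the button-clicking coroutine also runs on crawled product pages.
    request.meta['playwright'] = True
    request.meta['playwright_page_coroutines'] = {
        'clickallbtns': PageCoroutine(
            'evaluate',
            'document.querySelectorAll("#detailTab").forEach(x => x.click())',
        ),
    }
    return request


class UpperSpider(CrawlSpider):
    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = [
        'https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm',
    ]

    rules = (
        Rule(
            LinkExtractor(restrict_css='.listing'),
            callback='parse_item',
            follow=True,
            process_request=use_playwright,  # applied to every extracted request
        ),
    )

With this in place, start_requests can be dropped entirely or also routed through use_playwright for consistency. Note that newer scrapy-playwright releases rename PageCoroutine to PageMethod and the meta key to playwright_page_methods, so match whichever names your installed version provides.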