I'm working on a Scrapy bot that gets specific details for optics. I need to click a JavaScript button to reveal a virtual page so that my Scrapy bot can scrape the optic details.
This is what I need Playwright to click on, shown in the screenshot: the Details tab, highlighted in a red rectangle.
On certain pages the first item's details pane is already showing (an example URL is in the code comments below).
I probably need to create some sort of if/else statement for this. I would have worked on it, but I've been stuck on the prior issue.
import scrapy

class UpperSpider(scrapy.Spider):
    name = 'Optic'
    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url,
                                 meta={'playwright': True})
    # Issue here: I'm not sure this is working. I want to click on the Details tab.
    def virtualpage(self, response, page):
        # Virtual page button
        vpButton = response.css('div[id="wrap"]')
        for page in vpButton:
            page.click('#detailTab')
        # Also, some pages, for instance
        # https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/carbine-optic-aco--prod73112.aspx,
        # already have their virtual pages showing. I think I would need an
        # if statement to make sure it didn't close the page.
    def parse(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
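For the "already showing" pages, one option is to move the if/else into the browser itself: run a small JavaScript check through scrapy-playwright's evaluate coroutine and only click the tab when the details pane is hidden. Below is a minimal sketch under two assumptions about Brownells' markup that should be verified in devtools: that #detailWrap is the pane the tab reveals, and that a hidden pane has offsetParent === null.

import scrapy
from scrapy_playwright.page import PageCoroutine

# Click the Details tab only when the pane is not already visible.
# '#detailWrap' as the pane selector is an assumption; adjust as needed.
CLICK_DETAILS_IF_HIDDEN = """
() => {
    const pane = document.querySelector('#detailWrap');
    const visible = pane && pane.offsetParent !== null;
    if (!visible) {
        const tab = document.querySelector('#detailTab');
        if (tab) tab.click();
    }
}
"""

class OpticSpider(scrapy.Spider):
    name = 'Optic'
    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx',
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                meta={
                    'playwright': True,
                    # Runs in the page before the response is handed to the spider,
                    # so parse() sees the HTML with the details pane open.
                    'playwright_page_coroutines': [
                        PageCoroutine('evaluate', CLICK_DETAILS_IF_HIDDEN),
                    ],
                },
            )

Because the visibility check happens inside the page, the same request works for both kinds of product pages and there's no risk of toggling an already-open pane closed.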
Okay, so I tried to make the Scrapy crawler work. I'm pretty sure the problem is in my start_requests: the for url in self.start_urls loop only tells Playwright to handle the start URLs. How do I tell Playwright to also handle each crawled page, so that "clickallbtns" can run?
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine

class UpperSpider(CrawlSpider):
    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = ['https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm']

    le_item_details = LinkExtractor(restrict_css='.listing')
    rule_product_detail = Rule(le_item_details,
                               callback='parse_item',
                               follow=True,
                               )
    rules = (
        rule_product_detail,
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 meta={'playwright': True,
                                       'playwright_page_coroutines': {
                                           # "waitforload": PageCoroutine("waitforNavagation", 'url'),
                                           "clickallbtns": PageCoroutine("evaluate", 'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                                       }
                                       }
                                 )
    def parse_item(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
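The start_requests method only attaches the Playwright meta to the start URLs; the requests the Rule extracts are built by CrawlSpider itself with plain meta, so "clickallbtns" never runs on the product pages. One way to fix that is a process_request hook on the Rule, which lets you modify every followed request before it's scheduled. A minimal sketch, with the clickallbtns evaluate call carried over unchanged (the .listing and #detailTab selectors remain assumptions from the code above):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine


def use_playwright(request, response):
    # Attach the Playwright meta to every request the Rule extracts,
    # so the button-clicking coroutine also runs on crawled product pages.
    request.meta['playwright'] = True
    request.meta['playwright_page_coroutines'] = {
        'clickallbtns': PageCoroutine(
            'evaluate',
            'document.querySelectorAll("#detailTab").forEach(x => x.click())',
        ),
    }
    return request


class UpperSpider(CrawlSpider):
    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = [
        'https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm',
    ]

    rules = (
        Rule(
            LinkExtractor(restrict_css='.listing'),
            callback='parse_item',
            follow=True,
            process_request=use_playwright,  # applied to every extracted request
        ),
    )

With this in place, start_requests can be dropped entirely or also routed through use_playwright for consistency. Note that newer scrapy-playwright releases rename PageCoroutine to PageMethod and the meta key to playwright_page_methods, so match whichever names your installed version provides.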