I’m new to Scrapy, and I’d like to load a page through a proxy. When I don’t use a proxy, the response comes back fine and I can see my test output from the print() calls. However, when I use a proxy, I encounter the following error:
2024-12-16 17:25:32 [ABC] ERROR: Request failed: URL - [Failure instance: Traceback: <class 'playwright._impl._api_types.Error'>: net::ERR_INVALID_ARGUMENT at URL =========================== logs =========================== navigating to "URL", waiting until "load"
Could someone help me understand why this happens and how to resolve it?
Below is an example of my code; I’ve had to hide the site information due to its sensitive nature. The custom_settings block is what I enable when running with the proxy (it was commented out for the proxy-free runs that work).
import scrapy
import json
import calendar
import time
import logging
import re
from scrapy_playwright.page import PageMethod
from scrapy import Request


class ABCSpider(scrapy.Spider):
    name = 'ABC'

    # Enabled when running with the proxy; without this block the spider works.
    custom_settings = {
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
            "proxy": {
                "server": "http://proxy.crawlera.com:8010",
                "username": "my_key",
                "password": "",
            },
        },
        "PLAYWRIGHT_CONTEXTS": {
            "default": {
                "ignore_https_errors": True,
            },
        },
    }

    def start_requests(self):
        for place in ['01']:
            for zptid in ['church']:
                for tt in ['city', 'countryside']:
                    url = f"https://...{tt}/{zptid}.aspx?Paging=12&Sort=0&Page=0&Search={place}"
                    yield Request(
                        url=url,
                        meta={
                            "playwright": True,
                            "playwright_include_page": True,
                            # "playwright_page_coroutines" was renamed to
                            # "playwright_page_methods" in recent scrapy-playwright releases
                            "playwright_page_methods": [
                                PageMethod("wait_for_selector", "#contentHolder_result"),
                                PageMethod("wait_for_load_state", "load"),
                                PageMethod("evaluate", "console.log('Page loaded')"),
                            ],
                        },
                        callback=self.parse_search_url,
                        errback=self.errback,
                    )

    def parse_search_url(self, response):
        print('Analyzing search page:', response.url)
        results_number = response.xpath('//span[@id="contentHolder_result"]/text()').get()
        links = response.css('a.card-img, a.card-img-en').xpath('@href').getall()
        print(f"Found {results_number} results")
        for link in links:
            print(link)

    # page.close() is a coroutine, so the errback must be async and await it
    async def errback(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()
        # Print the error for debugging
        self.logger.error(f"Request failed: {failure.request.url} - {failure}")
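To help isolate the problem, here is a minimal standalone Playwright script (outside Scrapy) that exercises the same proxy settings. It is only a sketch: the server and username are the placeholders from above, and https://example.com stands in for the real site.

# repro.py -- checks the proxy settings with plain Playwright,
# outside Scrapy/scrapy-playwright.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(
        headless=True,
        proxy={
            "server": "http://proxy.crawlera.com:8010",  # placeholder endpoint from above
            "username": "my_key",                        # placeholder key from above
            "password": "",
        },
    )
    page = browser.new_page()
    page.goto("https://example.com", wait_until="load")  # stand-in for the real site
    print(page.title())
    browser.close()

If this script fails with the same net::ERR_INVALID_ARGUMENT, the problem would seem to lie in the proxy configuration itself rather than in scrapy-playwright.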
And here is my settings.py:
BOT_NAME = "off_abc"

SPIDER_MODULES = ["off_abc.spiders"]
NEWSPIDER_MODULE = "off_abc.spiders"

TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

DOWNLOAD_HANDLERS = {
    'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
}

# Earlier attempt with the (now deprecated) Crawlera middleware:
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy_crawlera.CrawleraMiddleware': 610,
#}
#CRAWLERA_ENABLED = True
#CRAWLERA_APIKEY = 'my_key'
#CRAWLERA_PRESERVE_DELAY = False

ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
    'off_abc.pipelines.DuplicatesPipeline': 100,
}
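I also noticed that scrapy-crawlera seems to have been superseded by scrapy-zyte-smartproxy, so an alternative I considered is routing requests through that middleware instead of Playwright's launch options. This is only a sketch reusing the placeholder key from above, and I'm not sure the middleware's proxy even applies to requests handled by Playwright:

# Alternative settings.py sketch: Zyte Smart Proxy Manager middleware
# (successor to scrapy-crawlera); assumes `pip install scrapy-zyte-smartproxy`.
# 'my_key' is the same placeholder as above.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware': 610,
}
ZYTE_SMARTPROXY_ENABLED = True
ZYTE_SMARTPROXY_APIKEY = 'my_key'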
What do I need to change to load the page through the proxy?