0

I’m new to Scrapy, and I’d like to load a page using a proxy. When I don’t use a proxy, the response works fine, and I can see my test outputs with print(). However, when I use a proxy, I encounter the following error:

2024-12-16 17:25:32 [ABC] ERROR: Request failed: URL - [Failure instance: Traceback: <class 'playwright._impl._api_types.Error'>: net::ERR_INVALID_ARGUMENT at URL =========================== logs =========================== navigating to "URL", waiting until "load"

Could someone help me understand why this happens and how to resolve it?

Below is an example of my code. I’ve had to hide the site information due to its sensitive nature.

import scrapy
import json
import calendar
import time
import logging
import re
from scrapy_playwright.page import PageMethod
from scrapy import Request



class ABCSpider(scrapy.Spider):
    """Crawl search-result pages rendered through scrapy-playwright.

    NOTE(review): with ``playwright_include_page=True`` every request hands
    the live Playwright page object to its callback/errback via
    ``meta["playwright_page"]``; that page MUST be closed there with an
    awaited ``page.close()``, otherwise browser pages leak.
    """
    name = 'ABC'

    # Previously attempted proxy configuration, kept commented out for context.
    """  
    custom_settings = {
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True, 
            "proxy": {
                "server": "http://proxy.crawlera.com:8010",
                "username": "my_key", 
                "password": "",
            },
        },
        "PLAYWRIGHT_CONTEXTS": {
            "default": {
                "ignore_https_errors": True
            }
        }
    }
    """

    def start_requests(self):
        """Yield one Playwright-rendered request per (place, zptid, tt) combination."""
        for place in ['01']:
            for zptid in ['church']:
                for tt in ['city', 'countryside']:
                    url = f"https://...{tt}/{zptid}.aspx?Paging=12&Sort=0&Page=0&Search={place}"
                    yield Request(
                        url=url,
                        meta={
                            "playwright": True,
                            "playwright_include_page": True,
                            # Fixed: the meta key that pairs with PageMethod is
                            # "playwright_page_methods". The old
                            # "playwright_page_coroutines" key (used with the
                            # removed PageCoroutine class) is silently ignored
                            # by current scrapy-playwright versions, so the
                            # wait_for_selector/wait_for_load_state steps were
                            # never executed.
                            "playwright_page_methods": [
                                PageMethod("wait_for_selector", "#contentHolder_result"),
                                PageMethod("wait_for_load_state", "load"),
                                PageMethod("evaluate", "console.log('Page loaded')"),
                            ],
                        },
                        callback=self.parse_search_url,
                        errback=self.errback,
                    )

    async def parse_search_url(self, response):
        """Extract the result count and result links from one search page.

        Made async so the Playwright page requested via
        ``playwright_include_page`` can be closed here — the original callback
        never closed it, leaking one browser page per successful request.
        """
        page = response.meta.get("playwright_page")
        if page is not None:
            await page.close()

        print('Analyzing search page:', response.url)
        results_number = response.xpath('//span[@id="contentHolder_result"]/text()').get()
        links = response.css('a.card-img, a.card-img-en').xpath('@href').getall()

        print(f"Found {results_number} results")

        for link in links:
            print(link)

    async def errback(self, failure):
        """Log a failed request and release its Playwright page.

        Fixed: ``page.close()`` is a coroutine — the original called it
        without awaiting, so the page was never actually closed. Also guards
        against the page missing from meta (the request can fail before the
        page is created, which would raise KeyError inside the errback).
        """
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()
        # Error output for debugging (comment translated from French).
        self.logger.error(f"Request failed: {failure.request.url} - {failure}")

And in my settings.py

# Scrapy project settings for the off_abc crawler.

BOT_NAME = "off_abc"

SPIDER_MODULES = ["off_abc.spiders"]
NEWSPIDER_MODULE = "off_abc.spiders"

# scrapy-playwright requires the asyncio-based Twisted reactor.
# Fixed: this constant was assigned twice in the original file (identical
# value both times); a single assignment removes the redundancy.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# Route both schemes through the Playwright download handler so requests
# with meta["playwright"] = True are rendered in a real browser.
DOWNLOAD_HANDLERS = {
    'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
}

# Crawlera (Zyte Smart Proxy) middleware attempt, currently disabled.
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy_crawlera.CrawleraMiddleware': 610,
#}
#CRAWLERA_ENABLED = True
#CRAWLERA_APIKEY = 'my_key'
# NOTE(review): the original commented line used ':' instead of '=' — it
# would raise a SyntaxError-adjacent annotation no-op if uncommented as
# written; corrected to an assignment here.
#CRAWLERA_PRESERVE_DELAY = False

ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
    'off_abc.pipelines.DuplicatesPipeline': 100
}

How can I get the page to load correctly when a proxy is configured?

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.