I have written a Scrapy CrawlSpider.
class SiteCrawlerSpider(CrawlSpider):
name = 'site_crawler'
def __init__(self, start_url, **kw):
super(SiteCrawlerSpider, self).__init__(**kw)
self.rules = (
Rule(LinkExtractor(allow=()), callback='parse_start_url', follow=True),
)
self.start_urls = [start_url]
self.allowed_domains = tldextract.extract(start_url).registered_domain
def parse_start_url(self, response):
external_links = LinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response)
for link in external_links:
i = FastcrawlerItem()
i['pageurl'] = response.url
i['ext_link'] = link.url
i['ext_domain'] = tldextract.extract(link.url).registered_domain
yield i
Now I am trying to run this script from another Python script as follows:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy_fastcrawler.spiders.site_crawler import SiteCrawlerSpider
from scrapy.utils.project import get_project_settings
spider = SiteCrawlerSpider(start_url='http://www.health.com/')
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
Problem: Everything runs fine, but the major issue here is that the script processes only the 'start_url' and stops. It does not crawl and move to other links found on the start url and no processing being done. I have also setup pipelines and the items from start_url are correctly being saved to the pipeline setup.
Any help is greatly appreciated.