I'm trying to scrape a game-information website using Scrapy. The scraping process goes like this: scrape the categories -> scrape the list of games (multiple pages for each category) -> scrape each game's info. The scraped info is supposed to go into a JSON file. I'm getting the following result:
[
{"category": "cat1", "games": [...]},
{"category": "cat2", "games": [...]},
...
]
but I want to get this result:
{ "categories":
[
{"category": "cat1", "games": [...]},
{"category": "cat2", "games": [...]},
...
]
}
I tried following the steps from this post and this post, with no success, and I couldn't find any other related questions.
I would appreciate any help.
My spider:
import scrapy
from ..items import Category, Game
class GamesSpider(scrapy.Spider):
    """Crawl categories, their paginated game lists, and each game page.

    The crawl for one category is a sequential chain of requests: the
    ``Category`` item travels in ``meta`` from request to request, every
    visited game page appends a ``Game`` to ``games_list``, and the item is
    yielded once, after the last game of the last listing page.

    NOTE(review): because the chain is sequential, a request that is dropped
    (for example by the duplicate filter or because of a download error)
    ends the chain and that category is never yielded -- confirm whether
    ``dont_filter`` / an ``errback`` is needed for this site.
    """

    name = 'games'
    start_urls = ['https://www.example.com/categories']
    # Host typo fixed ('exmple' -> 'example'). Links are now resolved with
    # response.urljoin(), so this attribute is kept only for code that may
    # still read it.
    base_url = 'https://www.example.com'

    def parse(self, response):
        """Yield one request per category link found on the start page."""
        for link in response.xpath("..."):
            href = link.xpath(".//@href").get()
            if not href:
                # A link without an href cannot be followed; the original
                # string concatenation raised TypeError here.
                continue
            cat = Category()
            cat['category'] = link.xpath(".//text()").get()
            yield response.follow(url=href,
                                  callback=self.parse_category,
                                  meta={'category': cat})

    def parse_category(self, response):
        """Collect the game links of one listing page and start visiting them."""
        cat = response.meta['category']
        # Make every URL absolute now: these values are followed later from
        # game-page responses, where a relative href would resolve against
        # the wrong page.
        games_url_list = [response.urljoin(href)
                          for href in response.xpath('//.../a/@href').getall()]
        next_href = response.xpath('//a[...]/@href').get()
        next_page = response.urljoin(next_href) if next_href else None
        yield self._next_step(response, cat, games_url_list, next_page)

    def parse_game(self, response):
        """Scrape one game page, attach it to its category, and continue."""
        cat = response.meta['category']
        game = Game()
        # NOTE(review): these store the raw xpath() result exactly as in the
        # original; presumably .get()/.getall() is intended -- confirm.
        game['title_en'] = response.xpath('...')
        game['os'] = response.xpath('...')
        game['users_rating'] = response.xpath('...')
        # Create the list on first use without a bare ``except``.
        cat.setdefault('games_list', []).append(game)
        yield self._next_step(response,
                              cat,
                              response.meta['games_url_list'],
                              response.meta['next_page'])

    def _next_step(self, response, cat, games_url_list, next_page):
        """Return the next request of the category chain, or the finished item.

        Order of preference: the next pending game URL of the current listing
        page (taken from the end of the list, as before), then the next
        listing page, and finally the completed ``Category`` item.
        """
        if games_url_list:
            return response.follow(url=games_url_list.pop(),
                                   callback=self.parse_game,
                                   meta={'category': cat,
                                         'games_url_list': games_url_list,
                                         'next_page': next_page})
        if next_page:
            return response.follow(url=next_page,
                                   callback=self.parse_category,
                                   meta={'category': cat})
        # A category whose pages listed no games still gets an (empty) list
        # so every exported item has the same keys.
        cat.setdefault('games_list', [])
        return cat
My items.py file:
import scrapy
class Category(scrapy.Item):
    """Item describing one category together with the games scraped for it."""

    # Category name; the spider fills it from the category link's text.
    category = scrapy.Field()
    # List that the spider's parse_game callback appends Game items to.
    games_list = scrapy.Field()
class Game(scrapy.Item):
    """Item holding the values scraped from a single game page.

    All three fields are assigned by the spider's ``parse_game`` callback
    from XPath queries on the game page.
    """

    # Presumably the game's English title -- confirm against the page markup.
    title_en = scrapy.Field()
    # Presumably the supported operating system(s) -- confirm.
    os = scrapy.Field()
    # Presumably the rating given by users -- confirm.
    users_rating = scrapy.Field()