I am attempting to extract data from divs with Scrapy for Python 2. I now realize I cannot use a regex token like \d inside the XPath of the div I am extracting. How can I work around this? With \d{,2} I am trying to tell Python, "hey, there is supposed to be a number here with a value between 1 and 100." Thanks in advance.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
import re
class MySpider(CrawlSpider):
    """Crawl the Craigslist listing pages reachable from ``start_urls``.

    Pagination links matching ``index<digits>.html`` are followed, and every
    fetched page is handed to :meth:`parse_items`, which emits one
    ``CraigslistSampleItem`` per matched title span.
    """

    # Spider name used on the command line (note the trailing 's').
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://philadelphia.craigslist.org/cta/"]
    rules = (
        Rule(
            SgmlLinkExtractor(
                # Raw string so the regex escapes are never interpreted by
                # Python's own string-literal rules.
                allow=(r"index\d\d\d{,3}\.html",),
                restrict_xpaths=('//*[@id="toc_rows"]/div[3]/div/div/span/a',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Extract one item per title span found in ``response``.

        XPath 1.0 has no regular-expression support, so a pattern such as
        ``p[\\d{,2}]`` cannot be used to address "row 1..100". Instead each
        title span is visited in turn and its price and date are looked up
        *relative to the row that contains it*, which works for any number
        of rows.

        :param response: the page response passed in by the crawl rule.
        :returns: list of ``CraigslistSampleItem``; every field holds the
            list returned by ``extract()`` (possibly empty).
        """
        hxs = HtmlXPathSelector(response)
        # NOTE(review): the class "12" may be a typo for "l2" -- confirm
        # against the page markup. Left unchanged to keep the same matches.
        title_spans = hxs.select('//span[@class="pl"] | //span[@class="12"]')

        items = []
        for title_span in title_spans:
            item = CraigslistSampleItem()
            # An XPath starting with "//" searches the whole document even
            # when called on a sub-selector, which would attach every row's
            # price/date to every item. "ancestor::p[1]" climbs to the
            # nearest enclosing <p> row so only that row's values are read.
            item["price"] = title_span.select(
                'ancestor::p[1]//span[@class="price"]/text()'
            ).extract()
            item["date"] = title_span.select(
                'ancestor::p[1]//span[@class="date"]/text()'
            ).extract()
            item["title"] = title_span.select("a/text()").extract()
            item["link"] = title_span.select("a/@href").extract()
            items.append(item)
        return items
And the HTML snippet from the URLs is this:
item ["date"] = <span class="date">Jan 12</span>
item ["price"] = <span class="price">$1950</span>
Both exist under this ancestor node: <div id="toc_rows">.