I can't seem to figure out how to construct this XPath selector. I have even tried using the following-sibling::text() axis, but to no avail. I have also browsed Stack Overflow questions about scraping listed values but could not implement them correctly. I keep getting blank results. Any and all help would be appreciated. Thank you.
The website is https://www.unegui.mn/adv/5737502_10-r-khoroolold-1-oroo/.
Expected Results:
Woods
2015
Current Results:
blank
Current: XPath scrapy code:
list_li = response.xpath(".//ul[contains(@class, 'chars-column')]/li/text()").extract()
list_li = response.xpath("./ul[contains(@class,'value-chars')]//text()").extract()
floor_type = list_li[0].strip() commission_year = list_li[1].strip()
HTML Snippet:
<div class="announcement-characteristics clearfix">
<ul class="chars-column">
<li class="">
<span class="key-chars">Flooring:</span>
<span class="value-chars">Wood</span></li>
<li class="">
<span class="key-chars">Commission year:</span>
<a href="https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/ashon_min---2011/" class="value-chars">2015</a>
</li>
</ul>
</div>
FURTHER CLARIFICATION: I previously used two selectors (one for the span list, one for the href list), but the problem was that some pages on the website don't follow the same span-list/a-list order (i.e. on one page a table value would be inside a span, but on another page the same value would be inside an href). That is why I have been trying to use only one selector to get all the values.
This results in values as shown below in the image: instead of the number of windows (an integer) being scraped, the address is scraped, because on some pages the table value is under the href list, not under the span list.

Previous 2 selectors:
list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
list_a = response.xpath(".//a[contains(@class,'value-chars')]//text()").extract()
Whole Code (if someone needs it to test it):
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from datetime import datetime
from scrapy.crawler import CrawlerProcess
from selenium import webdriver
# Date-stamped base name (YYYYMMDD prefix) for today's CSV output file.
dt_today = datetime.now().strftime('%Y%m%d')
filename = f'{dt_today} UB HPI Buying Data'
# create Spider class
class UneguiApartmentsSpider(scrapy.Spider):
    """Scrape apartment sale listings from unegui.mn.

    Walks the Ulaanbaatar apartments-for-sale index pages, collects the
    summary fields from each announcement card, then follows every card
    to its detail page (``parse_item``) to pick up the characteristics
    table.
    """

    name = "unegui_apts"
    allowed_domains = ["www.unegui.mn"]
    custom_settings = {
        "FEEDS": {
            # Use the date-stamped module-level filename. The previous
            # f-string contained no placeholder, so the variable was
            # never interpolated and every run wrote to one fixed path.
            f'{filename}.csv': {
                'format': 'csv',
                'overwrite': True,
            },
        },
    }

    def __init__(self, *args, **kwargs):
        # Forward Scrapy's spider arguments so `-a name=value` options
        # keep working, then start the Selenium driver used by
        # parse_item2 (phone-number reveal).
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    # function used for start url
    def start_requests(self):
        """Seed the crawl with the Ulaanbaatar apartments-for-sale index."""
        urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
        for url in urls:
            yield Request(url, self.parse)

    def parse(self, response, **kwargs):
        """Parse one listing-index page.

        Yields a follow-up request per announcement card (detail fields
        are added in ``parse_item``) and follows the pagination link
        when one is present.
        """
        cards = response.xpath("//li[contains(@class,'announcement-container')]")
        # parse details
        for card in cards:
            # extract_first(default='') keeps a missing field from
            # raising AttributeError on the chained .strip()
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first(default='').strip()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first(default='').strip()
            rooms = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__breadcrumbs')]/span[2]/text())").extract_first(default='').strip()
            link = card.xpath(".//a[@itemprop='url']/@href").extract_first(default='').strip()
            date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first(default='').split(',')
            date = date_block[0].strip()
            # guard: not every card carries a ", city" suffix
            city = date_block[1].strip() if len(date_block) > 1 else ''
            item = {
                'name': name,
                'date': date,
                'rooms': rooms,
                'price': price,
                'city': city,
            }
            # follow absolute link to scrape deeper level
            yield response.follow(link, callback=self.parse_item, meta={'item': item})
        # handling pagination
        next_page = response.xpath("//a[contains(@class,'number-list-next js-page-filter number-list-line')]/@href").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
            self.logger.info('Scraped %s', next_page)

    def parse_item(self, response):
        """Parse a single announcement's characteristics table."""
        # retrieve previously scraped item between callbacks
        item = response.meta['item']
        # '//text()' (descendant text) instead of '/text()' so the value
        # is captured whether the page renders it inside a <span> or an
        # <a> element carrying the 'value-chars' class — some pages use
        # one, some the other, which is what produced blank results
        list_li = response.xpath(".//*[contains(@class, 'value-chars')]//text()").extract()

        def value_at(index):
            # positional lookup that degrades to '' instead of raising
            # IndexError on pages whose table has fewer rows
            return list_li[index].strip() if index < len(list_li) else ''

        # update item with newly parsed data; the indices follow the
        # order the characteristics table renders its rows in
        item.update({
            'district': value_at(10),
            'address': value_at(12),
            'area_sqm': value_at(7),
            'floor': value_at(8),
            'commission_year': value_at(2),
            'num_floors': value_at(5),
            'num_windows': value_at(11),
            'num_balcony': value_at(1),
            'floor_type': value_at(0),
            'window_type': value_at(4),
            'door_type': value_at(6),
            'garage': value_at(3),
            'leasing': value_at(9),
        })
        yield item

    def parse_item2(self, response):
        """Click through the phone-number reveal with Selenium.

        NOTE(review): not registered as a callback anywhere in this
        file — presumably work in progress; confirm before relying on it.
        """
        self.driver.get(response.url)
        while True:
            try:
                # Selenium locates elements only, never text() nodes, so
                # select the <span> itself (the old '//text()' selector
                # raised InvalidSelectorException); locating inside the
                # try also ends the loop once the element disappears
                reveal = self.driver.find_element_by_xpath(
                    "//span[contains(@class,'phone-author__title')]")
                reveal.click()
                # get the data and write it to scrapy items
            except Exception:
                # element gone or no longer clickable: stop clicking
                break
        self.driver.close()
# main driver
def _main():
    """Run the spider in-process via Scrapy's CrawlerProcess."""
    crawler = CrawlerProcess()
    crawler.crawl(UneguiApartmentsSpider)
    crawler.start()


if __name__ == "__main__":
    _main()