Skip to content
This repository was archived by the owner on Dec 22, 2023. It is now read-only.

Commit 82afd9f

Browse files
committed
Add Economic Times Scraper:
1 parent 63506e9 commit 82afd9f

File tree

1 file changed

+71
-0
lines changed

1 file changed

+71
-0
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
from bs4 import BeautifulSoup
2+
from lxml import etree
3+
import requests
4+
import json
5+
import datetime
6+
import sys
7+
8+
## Util
9+
def datestr_to_date(datestr):
    """Parse a 'YYYY-MM-DD' (or unpadded 'YYYY-M-D') string into a datetime.date."""
    year, month, day = (int(part) for part in datestr.split('-'))
    return datetime.date(year=year, month=month, day=day)
16+
17+
## Reference dates: the ET archive encodes each day as a running integer id,
## anchored at 2001-01-01 == id 36892.
reference_date = datetime.date(2001, 1, 1) ## 2001 Jan 1
reference_date_id = 36892

# Both START_DATE and END_DATE are required on the command line.
if len(sys.argv) < 3:
    print('economictimes_scraper.py START_DATE END_DATE\nDate fmt: YYYY-MM-DD')
    sys.exit(1)

start_date = datestr_to_date(sys.argv[1])
end_date = datestr_to_date(sys.argv[2])
# Map each date onto the archive's integer day-id scheme.
start_dateid = reference_date_id + (start_date - reference_date).days
end_dateid = reference_date_id + (end_date - reference_date).days

# The archive only reaches back to the reference date itself, so the
# reference date is a valid start (the check rejects strictly-earlier dates).
if (start_date - reference_date).days < 0:
    print('Error: Start date should be >= 2001-01-01')
    sys.exit(1)
# Single-day ranges (start == end) are allowed; only reversed ranges fail.
if (end_date - start_date).days < 0:
    print('Error: End date should be >= Start date')
    sys.exit(1)
36+
37+
38+
## Gets News article metadata from article url
def fetchNewsArticle(url):
    """Fetch *url* and return its JSON-LD metadata string.

    Returns None when the page carries fewer than two
    <script type="application/ld+json"> blocks (e.g. a 404 page).
    """
    # timeout keeps the scraper from hanging forever on a stalled connection
    html = requests.get(url, timeout=30).content
    root = etree.HTML(html)
    scripts = root.xpath("/html/body//script[@type='application/ld+json']")
    metadata = None  ## When Article does not exist (404)
    # NOTE(review): assumes the article payload is always the SECOND ld+json
    # block on a live article page — confirm against a sample page.
    if len(scripts) >= 2:
        metadata = scripts[1].text
    return metadata
47+
48+
et_host = 'https://economictimes.indiatimes.com'
et_date_url = 'https://economictimes.indiatimes.com/archivelist/starttime-'
et_date_extension = '.cms'

# {ISO date string: [ {metadata, title, url}, ... ]}
fetched_data = {}

# Walk every day-id in the requested range; each id resolves to one archive
# page listing that day's article links.
for dateid in range(start_dateid, end_dateid + 1):
    date = str(reference_date + datetime.timedelta(days=dateid - reference_date_id))
    # timeout keeps the scraper from hanging forever on a stalled connection
    html = requests.get('{}{}{}'.format(et_date_url, dateid, et_date_extension), timeout=30).content
    soup = BeautifulSoup(html, 'html.parser')
    fetched_data[date] = []
    for x in soup.select('#pageContent table li a'):
        print(x.text)
        article_metadata = fetchNewsArticle(et_host + x['href'])
        fetched_data[date].append({
            "metadata": article_metadata,
            "title": x.text,
            "url": et_host + x['href']
        })

out_filename = 'ET_NewsData_{}_{}.json'.format(start_date, end_date)
# 'with' guarantees the file is closed even if serialization raises.
with open(out_filename, 'w') as output_file:
    output_file.write(json.dumps(fetched_data, indent=2))

0 commit comments

Comments
 (0)