0

I am trying to crawl this link by sending JSON requests. My first request would be:

# Query parameters for the MarketWatch headline JSON endpoint.
# NOTE(review): the docId value ends with a trailing space — confirm
# whether the server tolerates it or it should be stripped.
parameters1 = {'ticker':'XOM', 'countryCode':'US',
       'dateTime':'', 'docId':'1222737422 ',
       'docType':'806','sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',
       'messageNumber':'','count':'10',
      'channelName':'/news/latest/company/us/xom', 'topic':'',
       '_':'' }
firstUrl = "http://www.marketwatch.com/news/headline/getheadlines"
# `header` is assumed to be defined earlier in the script — TODO confirm.
html1 = requests.get(firstUrl, params = parameters1, headers = header)
# The endpoint returns JSON text; parse it into Python objects.
html_json1=(json.loads(html1.text))

For sending the next requests, I have to extract docId from the corresponding HTML and add it to the new parameters. I don't know how to do that. Do you have any idea how to get the new HTML file after sending JSON requests?

1
  • what does html_json1 have? Commented Nov 24, 2016 at 7:46

2 Answers 2

1
import requests
import json

from bs4 import BeautifulSoup 


def main():
    """Crawl MarketWatch headlines for XOM by paging a JSON endpoint.

    First scrapes the stock page to find the last headline's
    ``data-uniqueid``, then repeatedly queries the ``getheadlines``
    JSON endpoint, using the last item's ``UniqueId`` as the cursor
    for the next page. Runs interactively until interrupted.

    Raises:
        requests.HTTPError: if any HTTP request returns a 4xx/5xx status.
    """
    html_url = 'http://www.marketwatch.com/investing/stock/xom'

    resp = requests.get(html_url)
    # Raise requests.HTTPError on 4xx/5xx instead of a generic Exception.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    # Get value of `data-uniqueid` from the last news node of
    # 'MarketWatch News on XOM'.
    li_node = soup.select("#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")[-1]
    unique_id = li_node['data-uniqueid']
    print('got unique_id=%r, from %r' % (unique_id, li_node.text.replace('\n', ' ').strip()))

    base_url = 'http://www.marketwatch.com/news/headline/getheadlines'
    parameters = {
        'ticker': 'XOM',
        'countryCode': 'US',
        'docType': '806',
        'docId': '',  # (optional) initial value extracted from the HTML page
        'sequence': 'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',  # initial value from the HTML page
        'messageNumber': '8589',  # initial value from the HTML page
        'count': '10',
        'channelName': '/news/latest/company/us/xom',
    }
    parameters.update(extract_page_params(unique_id))

    while True:
        resp = requests.get(base_url, params=parameters)
        resp.raise_for_status()
        # List of up to `count` headline dicts.
        data = resp.json()
        first = data[0]
        last = data[-1]
        print("\ngot %d data, url: %s" % (len(data), resp.url))
        print("\tfirst: %-42s, %s" % (first['UniqueId'], first['SeoHeadlineFragment']))
        print("\t last: %-42s, %s" % (last['UniqueId'], last['SeoHeadlineFragment']))
        print("")

        # The last item's UniqueId is the paging cursor for the next request.
        parameters.update(extract_page_params(last['UniqueId']))

        input("press <enter> to get next")


def extract_page_params(uid):
    """Split a MarketWatch news UniqueId into request parameters.

    A UniqueId is either ``'sequence:messageNumber'`` (e.g.
    ``'e5a00f51-...:8499'``) or a bare docId.

    Args:
        uid: the UniqueId string taken from a headline item.

    Returns:
        dict with keys ``'sequence'``, ``'messageNumber'`` and
        ``'docId'``; the unused keys are '' so they overwrite stale
        values when merged into the request parameters via dict.update().
    """
    sequence = ''
    messageNumber = ''
    docId = ''

    if ':' in uid:
        # uid looks like `e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499`.
        # Split on the first ':' only, so a stray extra colon in the
        # message-number part cannot raise ValueError on unpacking.
        sequence, messageNumber = uid.split(':', 1)
    else:
        docId = uid

    return {
        'sequence': sequence,
        'messageNumber': messageNumber,
        'docId': docId,
    }


# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()

This is my code to solve your problem.
Since you are new to programming, I have added some comments.
You can directly copy it and run it with Python 3 (Python 2 should work too).

Sign up to request clarification or add additional context in comments.

1 Comment

OK, my gmail is panyanyany ## gmail dot com. Feel free to contact me. ;-) @farshidbalan
0

You can use Beautiful Soup to extract data from HTML. It is a Python library for extracting data from HTML.

1 Comment

Thank you, I didn't know how to do it with BeautifulSoup.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.