automatic crawling using selenium

Question

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

OUTPUT_FILE_NAME = 'output0.txt'
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)

def get_text():
    driver.get("http://law.go.kr/precSc.do?tabMenuId=tab67")
    elem = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#viewHeightDiv > 
table > tbody > "
                                                                 "tr:nth-child(1) > 
td.s_tit > a")))

    title = elem.text.strip().split(" ")[0]
    elem.click()

    wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#viewwrapCenter h2"), 
title))
    content = driver.find_element_by_css_selector("#viewwrapCenter").text
    return content

def main():
    open_output_file = open(OUTPUT_FILE_NAME, 'w')
    result_text = get_text()
    open_output_file.write(result_text)
    open_output_file.close()

main()

based on this code i want to crawl this website. like from the original url selenium goes into 1st link and save text to txt file and it goes back to original url and goes into 2nd link and keeps going but the problem is css_selector values for 1st link is #viewHeightDiv > table > tbody > tr:nth-child(1) > td.s_tit > a and 2nd link is #viewHeightDiv > table > tbody > tr:nth-child(3) > td.s_tit > a only difference between them is number after a child and it seems like has no rule it goes like 1,3,5,9,... so im stuck here...

Sers · Accepted Answer · 2019-02-27 21:39:46Z

1

To scrape all posts you don't need Selenium. You can do all using Requests and BeautifulSoup libraries:

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':

    # Using request get 50 items from first page. pg=1 is page number, outmax=50 items per page
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=50&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")

    # Parse html using BeautifulSoup
    page = BeautifulSoup(response.text, "html.parser")

    # Find "go to last page" element and get "onclick" attribute, inside "onlick" attribute parse last page number
    # for "outmax=50" (used before)
    onclick = str(page.select(".paging > a:last-child")[0].attrs["onclick"])
    last_page_number = int(''.join([n for n in onclick if n.isdigit()]))

    # To test uncomment code below to get items only from first page
    # last_page_number = 1

    # Go through all pages and collect posts numbers in items
    items = []
    for i in range(1, last_page_number + 1):
        if i>1:
            # Go to next page
            response = requests.post(
                "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=100&pg=%d&fsort=21,10,30&precSeq=0&dtlYn=N" % i)

        # Get all links
        links = page.select("#viewHeightDiv .s_tit a")
        # Loop all links and collect post numbers
        for link in links:
            # Parse post number from "onclick" attribute
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))

    # Open all posts and collect in posts dictionary with keys: number, url and text
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        t = BeautifulSoup(response.text, "html.parser").find('div', attrs={'id': 'contentBody'}).text
        posts.append({'number': item, 'url': url, 'text': t})

To save to file change last part of the code to below, where /yourfullpath/ replace with your path like "C://files/" or "/Users/myuser/files/":

# Open all posts and collect in posts dictionary with keys: number, url and text
posts = []
for item in items:
    url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, "html.parser")
    text = parsed.find('div', attrs={'id': 'contentBody'}).text
    title = parsed.select_one("h2").text
    posts.append({'number': item, 'url': url, 'text': text, 'title': title})

    with open('/yourfullpath/' + title + '.text', 'w') as f:
        f.write(text)

edited Feb 27, 2019 at 21:39

answered Feb 20, 2019 at 13:46

Sers

12.3k2 gold badges14 silver badges33 bronze badges

Sign up to request clarification or add additional context in comments.

12 Comments

Kwanhee Hwang Over a year ago

im still confused tho... and how to get all the texts into one txt file... use 'a' mode? on text file? dont really understood this code and im lost..

Kwanhee Hwang Over a year ago

so is this code scrape all the body text of all the links? when i cant not see the code working or not.. need more study or explanation.

Sers Over a year ago

@KwanheeHwang yes code will scrape all from all and store posts map. You can use it later to write to file or whatever. Code just get data by posting requests with url. Read a little about requests library and how web works and you’ll understand.

Kwanhee Hwang Over a year ago

how did u get the response url tho? plus im trying to create txt file for each itme's t but it only create one file...

Sers Over a year ago

@KwanheeHwang comment back and be happy

|

Koustubh Madkaikar · Accepted Answer · 2019-02-20 11:03:23Z

0

You can use locator like:

td.s_tit > a

answered Feb 20, 2019 at 11:03

Koustubh Madkaikar

1256 bronze badges

Collectives™ on Stack Overflow

automatic crawling using selenium

2 Answers 2

12 Comments

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

12 Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related