2
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# File the scraped text is written to.
OUTPUT_FILE_NAME = 'output0.txt'
# Chrome browser session used for every page load in this script.
driver = webdriver.Chrome()
# Explicit wait bound to that session; the second argument is the timeout in seconds.
wait = WebDriverWait(driver, 10)

def get_text():
    """Open the precedent list, follow its first result and return the detail text.

    Loads the search page, waits for the link in the first table row to
    become visible, clicks it, waits until the detail heading contains the
    first word of the clicked link's text, and then reads the detail pane.

    Returns:
        str: Text of the ``#viewwrapCenter`` element.

    Raises:
        selenium.common.exceptions.TimeoutException: If either wait does not
            succeed within the timeout configured on ``wait``.
    """
    driver.get("http://law.go.kr/precSc.do?tabMenuId=tab67")

    # Keep the selector on one logical line: a string literal broken across
    # physical lines is a syntax error.
    first_link_locator = (
        By.CSS_SELECTOR,
        "#viewHeightDiv > table > tbody > tr:nth-child(1) > td.s_tit > a",
    )
    elem = wait.until(EC.visibility_of_element_located(first_link_locator))

    # The first word of the link text is used below to detect that the detail
    # pane has switched to the clicked entry.
    title = elem.text.strip().split(" ")[0]
    elem.click()

    heading_locator = (By.CSS_SELECTOR, "#viewwrapCenter h2")
    wait.until(EC.text_to_be_present_in_element(heading_locator, title))

    # find_element(By, value) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.
    content = driver.find_element(By.CSS_SELECTOR, "#viewwrapCenter").text
    return content

def main():
    """Scrape the text via ``get_text()`` and write it to ``OUTPUT_FILE_NAME``."""
    # Scrape first: if the scrape fails, an existing output file is not
    # truncated and no empty file is left behind.
    result_text = get_text()

    # The context manager closes the file even if the write fails. UTF-8 is
    # set explicitly because the scraped page is Korean and the platform's
    # default encoding may not be able to represent it.
    with open(OUTPUT_FILE_NAME, 'w', encoding='utf-8') as output_file:
        output_file.write(result_text)


if __name__ == '__main__':
    try:
        main()
    finally:
        # Always end the browser session, even when scraping raised.
        driver.quit()

Based on this code, I want to crawl this website: starting from the original URL, Selenium opens the 1st link, saves its text to a txt file, goes back to the original URL, opens the 2nd link, and keeps going like that. The problem is that the CSS selector for the 1st link is #viewHeightDiv > table > tbody > tr:nth-child(1) > td.s_tit > a and for the 2nd link it is #viewHeightDiv > table > tbody > tr:nth-child(3) > td.s_tit > a. The only difference between them is the number in nth-child(...), and it seems to follow no rule (it goes 1, 3, 5, 9, ...), so I'm stuck here.

2 Answers 2

1

To scrape all posts you don't need Selenium. You can do it all with the Requests and BeautifulSoup libraries:

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Scrape all posts in two phases: collect the post numbers from every
    # result-list page, then download the body text of each post.

    # A single page size is used for every list request. The last-page number
    # read from the pager is only valid for the page size it was requested with.
    items_per_page = 50
    # outmax = items per page, pg = page number.
    list_url = ("http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=%d&pg=%d"
                "&fsort=21,10,30&precSeq=0&dtlYn=N")

    # Request the first page and parse its HTML.
    response = requests.post(list_url % (items_per_page, 1))
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # The "go to last page" element carries the last page number inside its
    # "onclick" attribute; keep only the digits to extract it.
    onclick = str(page.select(".paging > a:last-child")[0].attrs["onclick"])
    last_page_number = int(''.join([n for n in onclick if n.isdigit()]))

    # To test, uncomment the line below to get items only from the first page
    # last_page_number = 1

    # Go through all pages and collect post numbers in items
    items = []
    for i in range(1, last_page_number + 1):
        if i > 1:
            # Fetch AND re-parse the next page; without re-parsing, the links
            # of page 1 would be collected again on every iteration.
            response = requests.post(list_url % (items_per_page, i))
            response.raise_for_status()
            page = BeautifulSoup(response.text, "html.parser")

        # Parse the post number from each link's "onclick" attribute
        for link in page.select("#viewHeightDiv .s_tit a"):
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))

    # Open all posts and collect them in the posts list; each entry is a dict
    # with the keys: number, url and text
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        body = BeautifulSoup(response.text, "html.parser").find('div', attrs={'id': 'contentBody'})
        if body is None:
            # find() returns None when the div is missing; report and skip
            # this post instead of failing the whole run with AttributeError.
            print("No contentBody found, skipping: %s" % url)
            continue
        posts.append({'number': item, 'url': url, 'text': body.text})

To save each post to a file, change the last part of the code to the version below, replacing /yourfullpath/ with your own path, such as "C://files/" or "/Users/myuser/files/":

# Open every post, collect it in the posts list (dicts with the keys: number,
# url, text and title) and save its text to a file of its own
posts = []
for item in items:
    url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, "html.parser")
    text = parsed.find('div', attrs={'id': 'contentBody'}).text
    # select_one() returns None when the page has no <h2>; use an empty title
    # in that case instead of failing on ".text".
    heading = parsed.select_one("h2")
    title = heading.text if heading is not None else ''
    posts.append({'number': item, 'url': url, 'text': text, 'title': title})

    # Make the title usable as a file name: collapse whitespace and newlines,
    # and replace characters that are not allowed in file names.
    safe_title = ' '.join(title.split())
    safe_title = ''.join('_' if c in '\\/:*?"<>|' else c for c in safe_title)

    # Prefix the post number: titles are not guaranteed to be unique, so a
    # title-only name lets later posts overwrite earlier files. UTF-8 is set
    # explicitly so the text does not depend on the platform's default encoding.
    with open('/yourfullpath/' + item + '_' + safe_title + '.text', 'w', encoding='utf-8') as f:
        f.write(text)
Sign up to request clarification or add additional context in comments.

12 Comments

I'm still confused, though... How do I get all the texts into one txt file? Should I open the text file in 'a' mode? I don't really understand this code and I'm lost.
So does this code scrape the body text of all the links? I can't tell whether the code is working or not; I need more study or explanation.
@KwanheeHwang Yes, the code scrapes everything from all the pages and stores it in posts. You can use that later to write to a file or whatever you need. The code just gets the data by sending requests to the URLs. Read a little about the requests library and how the web works and you'll understand.
How did you get the response URL, though? Also, I'm trying to create a txt file for each item's t, but it only creates one file...
@KwanheeHwang comment back and be happy
|
0

You can use locator like:

td.s_tit > a

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.