1

Ok, so I'm doing a project which applies Word2Vec to a Bengali-language web corpus to find contextually similar words, and as a prerequisite I am trying to crawl certain news and blog sites and then scrape the links to build a data corpus. I'm using Google Colab in my Chrome browser, as of now.

Here's my Python code for crawling... (I did take help from the internet for code snippets, I have only recently learnt all of this)


import requests
import urllib.parse
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
from urllib.request import urlopen
from urllib.request import Request

# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

# initialize the sets of links (unique links)
internal_urls = set()  # Set of all internal links found so far
external_urls = set()  # Set of all external links found so far
# BUG FIX: the original line was the bare expression `old_internals`, which
# raises NameError at import time. Initialise it to an empty set so
# get_all_website_links() can safely fall back to it in its except branch.
old_internals = set()  # Snapshot of internal_urls taken before each crawl step

def is_valid(url):
    """Return True when `url` parses with both a scheme and a network location."""
    parts = urlparse(url)
    if not parts.scheme:
        return False
    return bool(parts.netloc)


def get_all_website_links(url):
    """
    Return the set of URLs found on `url` that belong to the same website.

    Side effects: adds discovered links to the global `internal_urls` and
    `external_urls` sets, and snapshots `internal_urls` into `old_internals`
    before mutating it, so the snapshot can be returned on failure.
    """
    global old_internals
    try:
        urls = set()
        # domain name of the URL without the protocol
        domain_name = urlparse(url).netloc
        # Some sites refuse the default urllib user agent, so impersonate a browser.
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        req = Request(url, headers={'User-Agent': user_agent})
        article = urlopen(req).read()
        soup = BeautifulSoup(article, "lxml")

        old_internals = internal_urls.copy()  # snapshot before adding new links

        # Both <a> and <link> tags carry an href attribute, so one loop handles
        # both (the original duplicated this entire body once per tag name).
        for tag in soup.findAll(["a", "link"]):
            href = tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            # join the URL if it's relative (not absolute link)
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            if domain_name not in href:
                # external link
                if href not in external_urls:
                    print(f"{GRAY}[!] External link: {href}{RESET} \n")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
            urls.add(href)
            internal_urls.add(href)
        return urls
    except Exception as e:
        # If fetching or parsing failed part-way through, return the snapshot
        # of internal links taken before this call started mutating the set,
        # so the crawler can keep going instead of dying mid-crawl when
        # max_urls is large.
        print("\n")
        print(e)
        print("\nNone returned\n")
        return old_internals

# number of urls visited so far will be stored here
total_urls_visited = 0

def crawl(url, max_urls=30):
    """
    Crawl a web page and extract all links, recursively following the internal
    links it finds until `max_urls` pages have been visited.
    Results accumulate in the `external_urls` and `internal_urls` global sets.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET} \n")
    discovered = get_all_website_links(url)
    # Iterate over a copy: the fallback return value may alias a global set
    # whose size changes while the recursion below runs.
    for child in list(discovered):
        if total_urls_visited > max_urls:
            break
        crawl(child, max_urls)

def extract_name(link_url):  # Derive the output filename from a URL
  """Build a flat .txt filename from `link_url`, skipping the scheme prefix."""
  # Everything after "://" — index of ":" plus the three chars ":", "/", "/".
  link_name = link_url[link_url.index(":") + 3:]
  # Single-pass character translation replaces the original's four chained
  # .replace() calls; the unused `name` local has been dropped.
  link_name = link_name.translate(str.maketrans("/. -", "____"))
  return link_name + ".txt"

def fileWrite(fname, lst):
    """Write each non-empty string in `lst` to `fname`, one per line, as UTF-8 bytes."""
    # Context manager guarantees the file is closed even if .encode() raises,
    # unlike the original manual open()/close() pair.
    with open(fname, "wb") as a_file:
        for element in lst:
            if len(element) == 0:
                continue  # skip empty strings
            a_file.write(element.encode() + b"\n")

#Runtime
if __name__ == "__main__":
    # BUG FIX: the original line read `max_urls = ` with no value — a
    # SyntaxError. 30 matches crawl()'s documented default.
    max_urls = 30
    #Arbitrary list of links of Bengali sites
    web_links = ["https://www.anandabazar.com/",
                 "https://www.prothomalo.com/",
                 "https://www.littlemag.org/2019/05/blog-post_60.html"]

    #Index of weblink in list
    index = 1

    crawl(web_links[index], max_urls)
    fname = extract_name(web_links[index])
    fileWrite(fname, internal_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
    print("[+] Total crawled URLs:", max_urls)

Now my code works without any issues for the first two sites [indices 0, 1] which I presume is because I can even copy the text manually when I go to that site on my chrome browser.

But for the site with index=2, i.e. https://www.littlemag.org/2019/05/blog-post_60.html, it doesn't work at all. And I can't copy or select anything on that page in the browser either. How do I work around this problem and crawl links on this site's domain?

The same issue is showing up on my web scraping code...

import bs4 as bs
import urllib.request
from urllib.request import Request, urlopen

web_links = ["https://www.anandabazar.com/",
             "https://www.prothomalo.com/",
             "https://www.littlemag.org/2019/05/blog-post_60.html"]

# Some sites block the default urllib user agent, so impersonate a browser.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
req = Request(web_links[2], headers={'User-Agent': user_agent})
article = urlopen(req).read()

parsed_article = bs.BeautifulSoup(article, 'lxml')

# Many blogs and news sites keep the main article text in <p> tags, but this
# is site-specific: some sites (e.g. littlemag.org) put the content in other
# containers such as a ".post-body" div, in which case find_all('p') is empty.
paragraphs = parsed_article.find_all('p')

# Join once instead of repeated string += (which is quadratic); each paragraph
# is still prefixed with a single space, matching the original output exactly.
article_text = "".join(" " + p.text for p in paragraphs)

print(article_text)

I can't fetch or scrape any data from the Bengali article on this site, https://www.littlemag.org/2019/05/blog-post_60.html, and print it on the Colab console. What should I change in the two scripts to work around this and include data from these non-copyable, non-selectable sites?


Update:

Thank you Andrej Kesely. My problem with scraping of the site is solved, but I would like to know if there is a way to scrape the headings within that page, using your code?

# Select every heading (h1–h6, matched by a regex on the tag name) plus all
# <p> tags, in document order, and print each one's text.
# NOTE(review): relies on `soup` and `re` already being defined in scope.
for content in soup.find_all([re.compile('^h[1-6]$'), 'p']):     
    print(content.text)

This won't work for me, in this case.

Also the piece of code,

import requests
from bs4 import BeautifulSoup

url = "https://www.littlemag.org/2019/05/blog-post_60.html"
# Fetch the page and parse it with the stdlib html.parser backend.
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# select_one() returns None when no ".post-body" element exists on the page
# (e.g. on the site's homepage), and calling .get_text() on None raises
# AttributeError: 'NoneType' object has no attribute 'get_text'.
print(soup.select_one(".post-body").get_text(strip=True, separator="\n"))

It is not working for, https://www.littlemag.org/, which is the homepage of the site we are dealing with.

Gives the error AttributeError: 'NoneType' object has no attribute 'get_text'.

What could be the reason and how can I fetch the content along with heading from the homepage as well, https://www.littlemag.org/ ?

4
  • What kind of responses do you get back from this pages? req.status_code Commented Jun 13, 2021 at 20:21
  • In the scraping code, it just gives me all the text of the article on the webpage (whatever under the <p> tag) ideally, if works. But for the specific (non-selectable) site, I am not getting any output at all. Commented Jun 13, 2021 at 20:25
  • @patrickgerard The problem with scraping that link is solved, but my crawling code is also not able to crawl URLs with Bengali characters in them — I added that to the question... what can I do to solve that? Commented Jun 15, 2021 at 14:03
  • Open another question. Commented Jun 15, 2021 at 14:49

1 Answer 1

1

To get post text from this site you can use next example:

import requests
from bs4 import BeautifulSoup

url = "https://www.littlemag.org/2019/05/blog-post_60.html"
# Parse the fetched HTML; this particular site keeps the article body inside
# an element with class "post-body" rather than in <p> tags.
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# Extract the post text: one line per text fragment, whitespace stripped.
print(soup.select_one(".post-body").get_text(strip=True, separator="\n"))

Prints:

● ছবিতে - বাঙালির পাতের চির নবীন শুক্তো।
■ পদ্মপুরাণে বেহুলার বিয়ের নিরামিষ খাবারের মধ্যে শুক্তোর উল্লেখ পাওয়া যায়। ভারতচন্দ্রের অন্নদামঙ্গলেও বাইশ রকমের নিরামিষ পদের মধ্যে শুক্তুনিকে পাওয়া যায়।
মঙ্গলকাব্য ও বৈষ্ণবসাহিত্যে এই রান্নাটির বহুবার উল্লেখ পাওয়া যায়। কিন্তু বর্তমানে 'শুক্তো' বলতে যেমন উচ্ছে, করলা, পল্‌তা, নিম, সিম, বেগুন প্রভৃতি সবজির তিক্ত ব্যঞ্জনকে বোঝায়, প্রাচীনকালে তা ছিল না। একালের শুক্তোকে সেকালে 'তিতো' বলা হত।
সেকালে 'শুক্তা' রান্না করা হত- বেগুন, কাঁচা কুমড়ো, কাঁচকলা, মোচা এই সবজিগুলি গুঁড়ো বা বাটা মসলা অথবা বেসনের সঙ্গে বেশ ভালো করে মেখে বা নেড়ে নিয়ে ঘন 'পিঠালি' মিশিয়ে রান্না করা হত। পরে হিং, জিরা ও মেথি দিয়ে ঘিয়ে সাঁতলিয়ে নামাতে হত।
কিন্তু 'চৈতন্যচরিতামৃতে' সুকুতা, শুকুতা বা সুক্তা বলতে একধরণের শুকনো পাতাকে বলা হয়েছে। এটি ছিল আম-নাশক। সম্ভবত এটি ছিল শুকনো তিতো পাটপাতা। রাঘব পণ্ডিত মহাপ্রভুর জন্য নীলাচলে যেসব জিনিস নিয়ে গিয়েছিলেন তার মধ্যে এই দ্রব্যটিও ছিল।
আবার 'সুকুতা' বলতে সেই সময় শুকনো শাকের ব্যঞ্জনকেও বোঝাত।
বাঙালির চিরকালের পরিচয় ‘ভেতো বাঙালি’। অর্থাৎ যাদের প্রধান খাদ্য হলো ভাত। প্রাচীনকালে গরিব বাঙালির মুখে শোনা যেত দুঃখের কাঁদুনী, ‘হাড়িত ভাত নাহি নিতি আবেশী’ (চর্যাপদ)। মানে ‘ঘরে ভাত নেই তবু অতিথির আসা যাওয়ার কমতি নেই’। তবে ধনী-নির্ধন সব বাঙালির প্রিয় খাদ্য গরম ভাতে গাওয়া ঘি। যারা দিন আনে দিন খায়, তাঁদের চরম প্রাপ্তি হলো — পান্তা ভাতে বাইগন পোড়া। পণ্ডিতরা বলেন, প্রকৃত বাঙালির মনমতো খাবার ছিল কলাপাতায় ‘ওগ্গারা ভত্তা গাইক ঘিত্তা’, অর্থাৎ গাওয়া ঘি আর ফেনা ভাত। দুধ আর সরু চাল মিশিয়ে পায়েস বড়মানুষের প্রিয় খাদ্য।

...
Sign up to request clarification or add additional context in comments.

4 Comments

That helps. Thank you. Why doesn't this code work for the other given sites in the list? (I tried) Also what do I need to change in my crawling code, to be able to crawl the domain link of this site, littlemag.org ?
@MilindChakraborty Every site is different. Basically, you cannot have one universal code to get text from all sites. So yes, your code doesn't work because this site doesn't have any article in <p> tags.
Um, another question, what if I also want to scrape the headings too along with the content, in this case? I can't really do... for content in soup.find_all([re.compile('^h[1-6]$'), 'p']): print(content.text) What should I do for this specific site?
Also, for the homepage of the same website, [link]littlemag.org, the given code is returning AttributeError: 'NoneType' object has no attribute 'get_text' as an error. How can we get the subtexts under the numerous headings, along with the headings, on the homepage through scraping?

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.