Ok, so I'm doing this project which implements Word2Vec on a Bengali-language web corpus to find contextually similar words, and as a prerequisite I am trying to crawl certain news and blog sites and then scrape the links to build a data corpus. I'm using Google Colab in my Chrome browser, as of now.
Here's my Python code for crawling... (I did take help from the internet for code snippets, I have only recently learnt all of this)
import requests
import urllib.parse
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
from urllib.request import urlopen
from urllib.request import Request

# init the colorama module (makes ANSI colors work in the console)
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

# Global crawler state (unique links found so far).
internal_urls = set()  # set of all internal links
external_urls = set()  # set of all external links
# BUG FIX: the original line was the bare name `old_internals`, which raises
# NameError the moment the module runs; it must be initialised.
old_internals = set()  # snapshot of internal links before processing a page
def is_valid(url):
    """Return True when `url` parses with both a scheme and a host."""
    parts = urlparse(url)
    return all((parts.netloc, parts.scheme))
"""
Returns all URLs that is found on `url` in which it belongs to the same website
"""
# all URLs of `url
def get_all_website_links(url):
global old_internals
try:
urls = set()
# domain name of the URL without the protocol
domain_name = urlparse(url).netloc
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
req = Request(url, headers={'User-Agent': user_agent})
article = urlopen(req).read()
soup = BeautifulSoup(article, "lxml")
old_internals = internal_urls.copy() #Copies old set of internal links
for a_tag in soup.findAll("a"): #Links under <a> tag
href = a_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f"{GRAY}[!] External link: {href}{RESET} \n")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
urls.add(href)
internal_urls.add(href)
#I could definitely have done this as a function
#instead of writing the whole code again, but well...
#(I will change it)
for link_tag in soup.findAll("link"): #Links under <link> tag
href = link_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f"{GRAY}[!] External link: {href}{RESET} \n")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
urls.add(href)
internal_urls.add(href)
return urls
except Exception as e:
#If the link to be added were problematic, just return the list of
#old internal links. The function was returning an error and stopped
#crawling because of certain internal links midway when max count was
#large, so...
print("\n")
print(e)
print("\nNone returned\n")
#print(internal_urls, "\n\n")
return old_internals
# number of urls visited so far will be stored here
total_urls_visited = 0


def crawl(url, max_urls=30):
    """
    Crawl a web page and recursively follow the links it contains.

    All discovered links accumulate in the global `external_urls` and
    `internal_urls` sets.

    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET} \n")
    discovered = get_all_website_links(url)
    # Iterate over a frozen copy: the returned set may be the shared
    # old-internals fallback, which deeper recursive calls can mutate.
    for next_url in list(discovered):
        if total_urls_visited > max_urls:
            break
        crawl(next_url, max_urls)
def extract_name(link_url):
    """
    Derive a flat .txt filename from a URL.

    Skips the scheme ("https://" etc.) and turns every '/', '.', ' ' and
    '-' into '_', e.g. "https://a.b/c" -> "a_b_c.txt".
    """
    # skip past "://": index of ':' plus the three chars ':' '/' '/'
    link_name = link_url[link_url.index(":") + 3:]
    # one translate() pass replaces four chained .replace() calls
    # (the original also had an unused local `name`, now removed)
    return link_name.translate(str.maketrans("/. -", "____")) + ".txt"
def fileWrite(fname, lst):
    """
    Write every non-empty string in `lst` to `fname`, one per line.

    The file is opened in binary mode and each element is UTF-8 encoded.
    Uses a context manager so the handle is closed even if a write raises
    (the original leaked the handle on error).
    """
    with open(fname, "wb") as a_file:
        for element in lst:
            if len(element) == 0:
                continue  # skip empty strings
            a_file.write(element.encode() + b"\n")
# Runtime
if __name__ == "__main__":
    # BUG FIX: the original line read `max_urls =` with no value, which is
    # a SyntaxError; use the crawler's documented default.
    max_urls = 30
    # Arbitrary list of links of Bengali sites
    web_links = ["https://www.anandabazar.com/",
                 "https://www.prothomalo.com/",
                 "https://www.littlemag.org/2019/05/blog-post_60.html"]
    # Index of weblink in list
    index = 1
    crawl(web_links[index], max_urls)
    fname = extract_name(web_links[index])
    fileWrite(fname, internal_urls)
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
    print("[+] Total crawled URLs:", max_urls)
Now my code works without any issues for the first two sites [indices 0, 1] which I presume is because I can even copy the text manually when I go to that site on my chrome browser.
But the site with index=2, i.e. https://www.littlemag.org/2019/05/blog-post_60.html, doesn't work at all. And I can't copy or select anything in the browser either. How do I work around this problem and crawl links on this site's domain?
The same issue is showing up on my web scraping code...
import bs4 as bs
import urllib.request
from urllib.request import Request, urlopen

# Arbitrary list of Bengali sites to scrape
web_links = ["https://www.anandabazar.com/",
             "https://www.prothomalo.com/",
             "https://www.littlemag.org/2019/05/blog-post_60.html"]

# Present a desktop-browser User-Agent so the server serves the full page.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
req = Request(web_links[2], headers={'User-Agent': user_agent})
article = urlopen(req).read()
parsed_article = bs.BeautifulSoup(article, 'lxml')

# The main article text on blogs/news sites typically lives in <p> tags.
paragraphs = parsed_article.find_all('p')
article_text = "".join(" " + p.text for p in paragraphs)
print(article_text)
I can't fetch or scrape any data from the Bengali article on this site, https://www.littlemag.org/2019/05/blog-post_60.html, and print it on the Colab console. What should I change in the two pieces of code to work around this and include data from these non-copyable, non-selectable sites?
Update:
Thank you Andrej Kesely. My problem with scraping of the site is solved, but I would like to know if there is a way to scrape the headings within that page, using your code?
for content in soup.find_all([re.compile('^h[1-6]$'), 'p']):
print(content.text)
This won't work for me, in this case.
Also the piece of code,
import requests
from bs4 import BeautifulSoup

url = "https://www.littlemag.org/2019/05/blog-post_60.html"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
# BUG FIX: on pages with no ".post-body" element (e.g. the homepage)
# select_one() returns None and .get_text() raises AttributeError.
# select() returns a (possibly empty) list, so every matching post body
# is printed and pages without one are handled gracefully.
post_bodies = soup.select(".post-body")
if post_bodies:
    for body in post_bodies:
        print(body.get_text(strip=True, separator="\n"))
else:
    print("[!] No .post-body element found on", url)
It is not working for https://www.littlemag.org/, which is the homepage of the site we are dealing with.
Gives the error AttributeError: 'NoneType' object has no attribute 'get_text'.
What could be the reason and how can I fetch the content along with heading from the homepage as well, https://www.littlemag.org/ ?
req.status_code