I made a web crawler that will crawl certain Bengali news portals and receive links and later the content can be scraped to make a web-corpus.
The code for my crawler is given here:
A question I asked recently about web scraping: How do I crawl and scrape this specific website and save the data in a text file using Python?
Here: (Edited)
import requests
import urllib.parse
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
from urllib.request import urlopen
from urllib.request import Request
# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW
# initialize the list of links (unique links)
internal_urls = set() #Set of All internal links
external_urls = set() #Set of All external links
old_internals = set() #Keeps track of internal links before including another
def is_valid(url):
"""
Checks whether `url` is a valid URL.
"""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
"""
Returns all URLs that are found on `url` in which it belongs to the same website
"""
# all URLs of `url
def get_all_website_links(url):
global old_internals
try:
urls = set()
# domain name of the URL without the protocol
domain_name = urlparse(url).netloc
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
req = Request(url, headers={'User-Agent': user_agent})
article = urlopen(req).read()
soup = BeautifulSoup(article, "lxml")
old_internals = internal_urls.copy() #Copies old set of internal links
for a_tag in soup.findAll("a"): #Links under <a> tag
href = a_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f"{GRAY}[!] External link: {href}{RESET} \n")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
urls.add(href)
internal_urls.add(href)
#I could definitely have done this as a function
#instead of writing the whole code again, but well...
#(I will change it)
for link_tag in soup.findAll("link"): #Links under <link> tag
href = link_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f"{GRAY}[!] External link: {href}{RESET} \n")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
urls.add(href)
internal_urls.add(href)
return urls
except Exception as e:
#If the link to be added were problematic, just return the list of
#old internal links. The function was returning an error and stopped
#crawling because of certain internal links midway when max count was
#large, so...
print("\n")
print(e)
print("\nNone returned\n")
#print(internal_urls, "\n\n")
return old_internals
# number of URLs visited so far will be stored here
total_urls_visited = 0
def crawl(url, max_urls=30):
"""
Crawls a web page and extracts all links.
You'll find all links in `external_urls` and `internal_urls` global set variables.
params:
max_urls (int): number of max URLs to crawl, default is 30.
"""
global total_urls_visited
total_urls_visited += 1
#print(url)
print(f"{YELLOW}[*] Crawling: {url}{RESET} \n")
links = get_all_website_links(url)
loop=links.copy() #Since returning old internal links may change loop size
for link in loop:
if total_urls_visited > max_urls:
break
crawl(link, max_urls)
def extract_name(link_url): #Program to name the file
name=""
link_name= link_url[link_url.index(":")+3:] #skips the "https://" part :)
link_name=link_name.replace('/', '_')
link_name=link_name.replace('.', '_')
link_name=link_name.replace(' ', '_')
link_name=link_name.replace('-', '_')
return link_name+".txt"
def fileWrite(fname, lst):
a_file = open(fname, "wb")
for element in lst:
l = len(element)
if l == 0:
continue
a_file.write(element.encode() + "\n".encode())
a_file.close()
#Runtime
if __name__ == "__main__":
max_urls =
#Arbitrary list of links of Bengali sites
web_links=["https://www.anandabazar.com/",
"https://www.prothomalo.com/",
"https://www.littlemag.org/2019/05/blog-post_60.html"]
#Index of weblink in list
index=1
crawl(web_links[index], max_urls)
fname=extract_name(web_links[index])
fileWrite(fname, internal_urls)
print("[+] Total Internal links:", len(internal_urls))
print("[+] Total External links:", len(external_urls))
print("[+] Total URLs:", len(external_urls) + len(internal_urls))
print("[+] Total crawled URLs:", max_urls)
(Edit) Stack overflow was not letting me paste it here again. Something about Spam.
My code is fetching and crawling links just fine as long as the url of the link only contains ASCII characters. But if the url has any Bengali characters, it is probably not able to request it. With web_links[index=1], i.e. https://www.prothomalo.com/, my code gives the following output...
[*] Crawling: https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প
'ASCII codec can't encode characters in position 21-25: ordinal not in range(128)
None returned
[*] Crawling: https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প
'ASCII codec can't encode characters in position 21-25: ordinal not in range(128)
None returned
[*] Crawling: https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প
'ASCII codec can't encode characters in position 21-25: ordinal not in range(128)
None returned
[+] Total Internal links: 34
[+] Total External links: 11
[+] Total URLs: 45
[+] Total crawled URLs: 50
I keep getting this as an error for the unaccepted links...
'ASCII codec can't encode characters in position 16-21: ordinal not in range(128)
None returned
What do I change in my code to deal with this error and make urls with Bengali characters acceptable and crawl the links?
Update:
Ok. So I followed the example in @QHarr's answer... And changed a line of my code which actually made the Bengali urls accessible.
domain_name = urlparse(url).netloc
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
req = Request(urllib.parse.quote(url, safe = ':/', encoding= 'utf-8'), headers={'User-Agent': user_agent})
article = urlopen(req).read()
soup = BeautifulSoup(article, "html.parser")
The rest of the code is the same... But this came up with another problem...
While the other two links are still working fine while crawling https://www.prothomalo.com/. No matter how large the maximum value, that is max_urls, I input, the number of crawled sites is staying around the same as a lot of sites are returning an error.
The majority of the output looks like this:
[*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [+] Total Internal links: 260 [+] Total External links: 24 [+] Total URLs: 284 [+] Total crawled URLs: 100
So I thought of checking some of the problematic links separately, and print all the internal links they contain...
import requests
from bs4 import BeautifulSoup
import urllib
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = 'https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF'
domain_name=urlparse(url).netloc
print("\n\n"+domain_name+"\n\n")
r = requests.get(url, headers={'User-Agent': user_agent})
soup = BeautifulSoup(r.content, 'html.parser')
links= set()
for link in soup.find_all(['a', 'link'], href=True):
if(link['href'].find(domain_name) != -1):
links.add(link['href'])
print(links)
And they are working fine:
www.prothomalo.com {'https://www.prothomalo.com/business', 'https://www.prothomalo.com/feature/holiday/বিদেশে-উচ্চশিক্ষা-দেশে-ফিরে-দুগ্ধখামারি', 'https://www.prothomalo.com/world', 'https://www.prothomalo.com/world/india/ভারতে-এবার-গ্রিন-ফাঙ্গাসের-সংক্রমণ', 'https://www.prothomalo.com/sports', 'https://www.prothomalo.com/business/economics/সুদানের-৬৫-কোটি-টাকাঋণের-দায়-নিল-বাংলাদেশ', 'https://www.prothomalo.com/bangladesh/অল-কমিউনিটি-ক্লাবে-ভাঙচুরের-অভিযোগ-পরীমনির-বিরুদ্ধে', 'https://www.prothomalo.com/bangladesh', 'https://www.prothomalo.com/video', 'https://www.prothomalo.com/entertainment', 'https://www.prothomalo.com/', 'https://www.prothomalo.com/opinion', 'https://www.prothomalo.com/life', 'https://www.prothomalo.com/world/europe/কোভিড-১৯-জীবন-বাঁচানোর-আরও-এক-চিকিৎসা', 'https://www.prothomalo.com/bangladesh/district/আলীগের-দুই-নেতাকে-কারণ-দর্শাও-নোটিশ'}
What's wrong with my original crawling code?
What's up with this HTTP Error 404: Not Found error here?
r = requests.get('https://www.prothomalo.com/sports/football/' + urllib.parse.quote('ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প'))