The following program is giving me output that includes URLs with and without the forward slash (e.g. ask.census.gov and ask.census.gov/). I need to eliminate one or the other. Thank you in advance for your help!
from bs4 import BeautifulSoup as mySoup
from urllib.parse import urljoin as myJoin
from urllib.request import urlopen as myRequest
my_url = "https://www.census.gov/programs-surveys/popest.html"
# call on packages
html_page = myRequest(my_url)
raw_html = html_page.read()
html_page.close()
page_soup = mySoup(raw_html, "html.parser")
f = open("censusTest.csv", "w")
hyperlinks = page_soup.findAll('a')
set_urls = set()
for checked in hyperlinks:
found_link = checked.get("href")
result_set = myJoin(my_url, found_link)
if result_set and result_set not in set_urls:
set_urls.add(result_set)
f.write(str(result_set) + "\n")
f.close()