This script uses beautiful soup to parse all pdf documents on a particular page of a web site. The script successfully downloads one file but will not download all the files that are returned. I need help making this download all pdf documents that I have parsed.
I have done research but have found no answers
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml
#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx? portalId=895956&pageId=1606144')
RFP_Import = ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')
def get_pdf_links():
r = requests.get(RFP_Import)
soup= BeautifulSoup(r.content, 'html5lib')
links = soup.find_all('a')
pdf_links = [place_hoder + link['href'] for link in links if link['href'].endswith('pdf')]
return pdf_links
def download_pdf_links (pdf_links):
for link in pdf_links:
file_name = link.split("/")[-1]
print ("Downloading file:%s"%file_name)
r = requests.get(link, stream = True)
with open(file_name, 'wb') as f:
for chunk in r.iter_content(chunk_size = 1024*1024):
if chunk:
f.write(chunk)
print ('%s downloaded!\n'%file_name)
print ('all RFPs downloaded!')
return
if __name__ == "__main__":
pdf_links = get_pdf_links()
download_pdf_links(pdf_links)
Successfully downloads first pdf document and then stops.
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml
#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx? portalId=895956&pageId=1606144')
RFP_Import = ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')
def get_pdf_links():
r = requests.get(RFP_Import)
soup= BeautifulSoup(r.content, 'html5lib')
links = soup.find_all('a')
pdf_links = [place_hoder + link['href'] for link in links if link['href'].endswith('pdf')]
return pdf_links
def download_pdf_links (pdf_links):
for link in pdf_links:
file_name = link.split("/")[-1]
print ("Downloading file:%s"%file_name)
r = requests.get(link, stream = True)
with open(file_name, 'wb') as f:
for chunk in r.iter_content(chunk_size = 1024*1024):
if chunk:
f.write(chunk)
print ('%s downloaded!\n'%file_name)
print ('all RFPs downloaded!')
return
if __name__ == "__main__":
pdf_links = get_pdf_links()
download_pdf_links(pdf_links)