Unable to download multiple files with this python script

Question

This script uses Beautiful Soup to find all the PDF documents linked from a particular page of a website. The script successfully downloads one file but does not download the rest of the files that are returned. I need help making it download every PDF document that I have parsed.

I have done research but have found no answers.

import requests
from bs4 import BeautifulSoup 
import html5lib
import lxml

# Alternative page URL the author left commented out:
#   http://www.staffordmsd.org/cms/One.aspx?portalId=895956&pageId=1606144
# Page that is fetched and scanned for anchors pointing at PDF files.
RFP_Import = 'http://www.staffordmsd.org/departments/business_operations/bids_and_proposals'
# Site root that is put in front of every href found on the page.
place_hoder = 'http://www.staffordmsd.org'

def get_pdf_links():
    """Return the URLs of every PDF linked from the ``RFP_Import`` page.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html5lib`` parser.  Each ``<a>`` element whose ``href`` ends with
    ``pdf`` is kept, and its ``href`` is resolved against ``place_hoder``.

    Returns:
        list: one URL string per matching anchor.
    """
    # Imported locally so this function does not rely on a file-level import.
    from urllib.parse import urljoin

    r = requests.get(RFP_Import)
    soup = BeautifulSoup(r.content, 'html5lib')
    # href=True skips anchors that carry no href attribute; indexing those
    # with link['href'] would raise KeyError and abort the whole scan.
    links = soup.find_all('a', href=True)
    # urljoin gives the same result as plain concatenation for root-relative
    # hrefs ("/files/x.pdf") but leaves already-absolute hrefs untouched
    # instead of gluing the site root in front of them.
    pdf_links = [
        urljoin(place_hoder, link['href'])
        for link in links
        if link['href'].endswith('pdf')
    ]
    return pdf_links



def download_pdf_links (pdf_links):
    """Download each URL in ``pdf_links`` to a file in the current directory.

    The local file name is the text after the last "/" of the URL.

    NOTE(review): as written, only the first link is processed -- see the
    comment above the final ``print``/``return`` inside the loop body.
    """
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        # stream = True lets the body be read piece by piece below.
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            # Copy the response body in chunks of up to 1 MiB.
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
        # BUG: the next two statements are indented inside the for loop, so
        # the completion message is printed and the function returns after
        # the first link; every remaining link is skipped.
        print ('all RFPs downloaded!')
        return 

if __name__ == "__main__":
    # Script entry point: collect the PDF URLs first, then fetch them.
    pdf_links = get_pdf_links()
    download_pdf_links(pdf_links)

The script successfully downloads the first PDF document and then stops.

import requests
from bs4 import BeautifulSoup 
import html5lib
import lxml

# Alternative page URL the author left commented out:
#   http://www.staffordmsd.org/cms/One.aspx?portalId=895956&pageId=1606144
# Page that is fetched and scanned for anchors pointing at PDF files.
RFP_Import = 'http://www.staffordmsd.org/departments/business_operations/bids_and_proposals'
# Site root that is put in front of every href found on the page.
place_hoder = 'http://www.staffordmsd.org'

def get_pdf_links():
    """Return the URLs of every PDF linked from the ``RFP_Import`` page.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html5lib`` parser.  Each ``<a>`` element whose ``href`` ends with
    ``pdf`` is kept, and its ``href`` is resolved against ``place_hoder``.

    Returns:
        list: one URL string per matching anchor.
    """
    # Imported locally so this function does not rely on a file-level import.
    from urllib.parse import urljoin

    r = requests.get(RFP_Import)
    soup = BeautifulSoup(r.content, 'html5lib')
    # href=True skips anchors that carry no href attribute; indexing those
    # with link['href'] would raise KeyError and abort the whole scan.
    links = soup.find_all('a', href=True)
    # urljoin gives the same result as plain concatenation for root-relative
    # hrefs ("/files/x.pdf") but leaves already-absolute hrefs untouched
    # instead of gluing the site root in front of them.
    pdf_links = [
        urljoin(place_hoder, link['href'])
        for link in links
        if link['href'].endswith('pdf')
    ]
    # The return must be indented into the function body; at column 0 it is
    # a "'return' outside function" SyntaxError.
    return pdf_links



def download_pdf_links (pdf_links):
    """Download each URL in ``pdf_links`` to a file in the current directory.

    The local file name is the text after the last "/" of the URL.

    NOTE(review): as written, only the first link is processed -- see the
    comment above the final ``print``/``return`` inside the loop body.
    """
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        # stream = True lets the body be read piece by piece below.
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            # Copy the response body in chunks of up to 1 MiB.
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
        # BUG: the next two statements are indented inside the for loop, so
        # the completion message is printed and the function returns after
        # the first link; every remaining link is skipped.
        print ('all RFPs downloaded!')
        return 

if __name__ == "__main__":
    # Script entry point: collect the PDF URLs first, then fetch them.
    pdf_links = get_pdf_links()
    download_pdf_links(pdf_links)

S.L · Accepted Answer · 2019-05-21 15:50:56Z

3

Inside download_pdf_links(), return is misaligned. It should be aligned with for. Otherwise, it is part of the for loop and the function terminates after the first iteration.

This is probably also true for print ('all RFPs downloaded!'). I guess you want that printed out at the end of the for cycle, after you have been through all links.

answered May 21, 2019 at 15:50

S.L

1031 silver badge6 bronze badges

Sign up to request clarification or add additional context in comments.

1 Comment

Column01 Over a year ago

Oof. Beat me to it.

Column01 · Accepted Answer · 2019-05-21 15:51:16Z

In download_pdf_links you are using return inside your loop, which makes the function return after the first iteration and stop downloading files. You need to return after the loop finishes, by putting the return at the same indentation as the start of the loop, like this:

def download_pdf_links (pdf_links):
    """Download every URL in ``pdf_links`` to a file in the current directory.

    Each file is named after the text following the last "/" of its URL.
    A progress line is printed before and after each download, and a final
    message is printed once all links have been handled.
    """
    one_mib = 1024*1024
    for url in pdf_links:
        file_name = url.rsplit("/", 1)[-1]
        print ("Downloading file:%s"%file_name)
        response = requests.get(url, stream = True)
        with open(file_name, 'wb') as out:
            # filter(None, ...) drops empty chunks before they are written.
            for block in filter(None, response.iter_content(chunk_size = one_mib)):
                out.write(block)
        print ('%s downloaded!\n'%file_name)
    # Deliberately at loop level, not inside it: the summary and the return
    # run once, after every link has been downloaded.
    print ('all RFPs downloaded!')
    return

Collectives™ on Stack Overflow

Unable to download multiple files with this python script

2 Answers 2

1 Comment

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

1 Comment

Comments

Your Answer

Sign up or log in

Post as a guest

Related