So. After some hours of searching and trying, I finally got something working. Here is my script.
So my need was:
- Download a ZIP file.
- Find in that ZIP file a specific text file with "anystring" in its name.
- Extract from that text file the 1st URL containing the string "csv"
#!/bin/env python
from io import BytesIO
from zipfile import ZipFile
import requests
import re
import sys
# URL of the ZIP archive to download.
url = "https://whateverurlyouneed"
# Substring that must appear in the name of the archive member to read.
filestr = "anystring"
# Substring that selects the line of interest. It is tested against the whole
# decoded line, not against each URL found on it.
urlstr = "anystring"
# Seconds to wait for the server before giving up (connect and read).
timeout = 60
# Regex used to extract URLs from a line of text. Group 1 is the whole URL.
# The parentheses and brackets that are meant literally must stay escaped;
# without the escapes the final character class closes early and trailing
# punctuation (quotes, commas, full stops) ends up inside the match.
# NOTE: the nested quantifiers can backtrack exponentially on a URL prefix
# followed by a long run of punctuation only; acceptable for trusted input.
regularex = (
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)"
    r"(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+"
    r"(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
)
# Compiled once so the per-line loop does not repeat the pattern lookup.
url_pattern = re.compile(regularex)


def find_member(archive, needle):
    """Return the first member name of *archive* containing *needle*.

    Names are checked in ``archive.namelist()`` order. Returns ``None`` when
    no member name contains *needle*. Nothing is opened here, so no file
    handle is created for members that are not going to be read.
    """
    return next((name for name in archive.namelist() if needle in name), None)


def extract_urls(text):
    """Return every URL found in *text*, in order of appearance."""
    # findall() yields one tuple per match because the pattern has several
    # groups; group 1 (index 0) is the complete URL.
    return [groups[0] for groups in url_pattern.findall(text)]


def first_line_urls(lines, needle, encoding="latin-1"):
    """Return the URLs on the first line that contains *needle*.

    *lines* is any iterable of ``bytes`` lines. Each line is decoded once
    with *encoding*. Returns a (possibly empty) list for the first line
    containing *needle*, or ``None`` when no line contains it.
    """
    for raw_line in lines:
        text = raw_line.decode(encoding)
        if needle in text:
            return extract_urls(text)
    return None


def main():
    """Download the archive, scan the selected member and print the URLs.

    Prints the list of URLs found on the first line containing ``urlstr``.
    Prints nothing when no line contains it. Exits with an error message
    when no archive member name contains ``filestr``. Returns the process
    exit status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of later with a confusing
    # BadZipFile raised from the body of an error page.
    response.raise_for_status()

    with ZipFile(BytesIO(response.content)) as archive:
        member = find_member(archive, filestr)
        if member is None:
            sys.exit("no file containing {!r} found in the archive".format(filestr))
        # Iterating the handle streams the member line by line.
        with archive.open(member) as data:
            urls = first_line_urls(data, urlstr)

    if urls is not None:
        print(urls)
    return 0


if __name__ == "__main__":
    sys.exit(main())