I'm building a python script that gathers data from Instagram, based on a user list provided in my database. However, I'm running into some issues trying to handle unexpected JSON response.
To give some context, the program is fetching a username from my database table (24/7, looping over hundreds of accounts - hence the while True: loop), requesting a URL with that username, and expecting a certain JSON response (specifically, it's looking for ['entry_data']['ProfilePage'][0] in the response).
However when usernames aren't found on Instagram, the JSON is different, and the expected part (['entry_data']['ProfilePage'][0]) is not in there. So my script crashes.
With the current code:
def get_username_from_db():
try:
with connection.cursor() as cursor:
cursor.execute("SELECT * FROM ig_users_raw WHERE `username` IS NOT NULL ORDER BY `ig_users_raw`.`last_checked` ASC LIMIT 1")
row = cursor.fetchall()
username = row[0]['username']
except pymysql.IntegrityError:
print('ERROR: ID already exists in PRIMARY KEY column')
return username
def request_url(url):
try:
response = requests.get(url)
except requests.HTTPError:
raise requests.HTTPError(f'Received non 200 status code from {url}')
except requests.RequestException:
raise requests.RequestException
else:
return response.text
def extract_json_data(url):
try:
r = requests.get(url, headers=headers)
except requests.HTTPError:
raise requests.HTTPError('Received non-200 status code.')
except requests.RequestException:
raise requests.RequestException
else:
print(url)
soup = BeautifulSoup(r.content, "html.parser")
scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
return j
if __name__ == '__main__':
while True:
sleep(randint(5,15))
username = get_username_from_db()
url = f'https://www.instagram.com/{username}/'
j = extract_json_data(url)
json_string = json.dumps(j)
user_id = j['graphql']['user']['id']
username = j['graphql']['user']['username']
#print(user_id)
try:
with connection.cursor() as cursor:
db_data = (json_string, datetime.datetime.now(),user_id)
sql = "UPDATE `ig_users_raw` SET json=%s, last_checked=%s WHERE `user_id`= %s "
cursor.execute(sql, db_data)
connection.commit()
print(f'{datetime.datetime.now()} - data inserted for user: {user_id} - {username}')
except pymysql.Error:
print('ERROR: ', pymysql.Error)
I'm getting the following error/traceback:
https://www.instagram.com/geloria.itunes/
Traceback (most recent call last):
File "D:\Python\Ministry\ig_raw.py", line 63, in <module>
j = extract_json_data(url)
File "D:\Python\Ministry\ig_raw.py", line 55, in extract_json_data
j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\__init__.py", line 354, in loads
return _default_decoder.decode(s)
File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\decoder.py", line 357, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
Ideally, I want this to just skip past the account (in this case geloria.itunes), and move to the next one in the database. I might want to remove the account, or at least remove the username from the row.
In an effort to solve this myself, I experimented with if / else loops, but in the case where it would continue, I'd just be looping over the same account.
Do you have any suggestions on how I can tackle this specific issue?
Thanks!