I am trying to learn scraping,
I use exceptions lower down in the code to pass through errors because they dont affect the writing of data to csv
I keep getting a "socket.gaierror" but in the handling of that there is a "urllib.error.URLError" in the handling of that I get "NameError: name 'socket' is not defined" which seems circuitous
I kind of understand that using these exceptions may not be the best way to run the code but I cant seem to get past these errors and I dont know a way around or how to fix the errors.
If you have any suggestions outside of fixing the error exceptions that would be greatly appreciated as well.
import csv
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
base_url = 'http://www.fangraphs.com/' # used in line 27 for concatenation
years = ['2017','2016','2015'] # for enough data to run tests
#Getting Links for letters
player_urls = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
player_urls.append(base_url + link['href'])
#Getting Alphabet Links
test_for_playerlinks = 'players.aspx?letter='
player_alpha_links = []
for i in player_urls:
if test_for_playerlinks in i:
player_alpha_links.append(i)
# Getting Player Links
ind_player_urls = []
for l in player_alpha_links:
data = urlopen(l)
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
ind_player_urls.append(link['href'])
#Player Links
jan = 'statss.aspx?playerid'
players = []
for j in ind_player_urls:
if jan in j:
players.append(j)
# Building Pitcher List
pitcher = 'position=P'
pitchers = []
pos_players = []
for i in players:
if pitcher in i:
pitchers.append(i)
else:
pos_players.append(i)
# Individual Links to Different Tables Sorted by Base URL differences
splits = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs = 'http://www.fangraphs.com/statsd.aspx?'
split_pp = []
gamel = []
years = ['2017','2016','2015']
for i in pos_players:
for year in years:
split_pp.append(splits + i[12:]+'&season='+ year)
gamel.append(game_logs+ i[12:] + '&type=&gds=&gde=&season=' + year)
split_pitcher = []
gl_pitcher = []
for i in pitchers:
for year in years:
split_pitcher.append(splits + i[12:]+'&season=' + year)
gl_pitcher.append(game_logs + i[12:] + '&type=&gds=&gde=&season=' + year)
# Splits for Pitcher Data
row_sp = []
rows_sp = []
try:
for i in split_pitcher:
sauce = urlopen(i)
soup = BeautifulSoup(sauce, "html.parser")
table1 = soup.find_all('strong', {"style":"font-size:15pt;"})
row_sp = []
for name in table1:
nam = name.get_text()
row_sp.append(nam)
table = soup.find_all('table', {"class":"rgMasterTable"})
for h in table:
he = h.find_all('tr')
for i in he:
td = i.find_all('td')
for j in td:
row_sp.append(j.get_text())
rows_sp.append(row_sp)
except(RuntimeError, TypeError, NameError, URLError, socket.gaierror):
pass
try:
with open('SplitsPitchingData2.csv', 'w') as fp:
writer = csv.writer(fp)
writer.writerows(rows_sp)
except(RuntimeError, TypeError, NameError):
pass