Revision: 63342
at April 29, 2013 07:53 by asimmittal
Updated Code
import urllib2, time, traceback, json

# Removes non-ascii characters from a string 's'
def removeNonAscii(s): return "".join(i for i in s if ord(i) < 128)

if __name__ == '__main__':

    baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
    letter = "(%l)"
    offset = "(%o)"
    startTime = time.asctime()

    # ASCII values for the letters A to Z
    ordAllLetters = range(ord('A'), ord('Z') + 1)
    resultsPerPage = 50
    countPlayers = 0
    countPagesScraped = 0
    dictPlayerPageUrls = {}

    # Keep scraping while there are letters still left in the list of all letters
    while len(ordAllLetters) > 0:

        # Grab the first letter from the list and try scraping the pages for that letter.
        # For every letter there are multiple pages, and each page contains 50 names of
        # athletes.
        each = ordAllLetters[0]
        curLetter = chr(each)
        curUrl = baseUrl.replace(letter, curLetter).replace(offset, '')

        # For this letter, find out how many pages exist. We can do this by scraping
        # the "rnav" div at the bottom of the page. The number of <a> tags in that div
        # tells us how many pages there are for this letter. Each page has 50 results.
        try:
            html = urllib2.urlopen(curUrl, timeout=5).read()

            resultsCountDivTagStart = '<div class="rnav">'
            resultsCountDivTagStop = "</div>"
            indexResultCountStart = html.find(resultsCountDivTagStart)
            indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
            resultsArea = html[indexResultCountStart:indexResultCountStop]
            linkSearchTag = "</a>"
            lstResultsLink = resultsArea.split(linkSearchTag)
            del lstResultsLink[0]

            # The number of items in lstResultsLink is the number of offset pages this
            # letter has, at 50 results per page. The offset values run 0, 50, 100 ... and
            # so on, so build a list of the exact offset values to substitute into the
            # baseUrl to get each page for this letter.
            listPageOffsets = [i * resultsPerPage for i in range(0, len(lstResultsLink))]
            if listPageOffsets == []: listPageOffsets = [0]

            print '\nLetter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

            # Now listPageOffsets contains [0, 50, 100, 150 ...]. For each letter we have
            # all the offsets, so build the url for every offset, download that page and
            # grab the links to every player's individual page.
            for eachOffsetPage in listPageOffsets:

                # We don't want to overload the NBA.com webserver with multiple calls in a
                # short span of time, because if we do they will lock us out (calls time
                # out). So slow things down by making a call only every half second.
                time.sleep(0.5)

                # For this page offset, construct the url and GET it
                strOffset = str(eachOffsetPage)
                curUrl = baseUrl.replace(letter, curLetter).replace(offset, strOffset)
                countPagesScraped += 1

                try:
                    # Fetch the page and grab the results table, which contains
                    # the list of players.
                    htmlPage = urllib2.urlopen(curUrl).read()
                    indexStart = htmlPage.find('<div id="tableContainer">')
                    indexStop = htmlPage.find("<script>", indexStart)
                    resultsBody = htmlPage[indexStart:indexStop]
                    lstResults = resultsBody.split('<tr class="resultsTable" >')
                    del lstResults[0]

                    print '--------> offset: ', eachOffsetPage

                    # Now each element in lstResults is the html surrounding one player.
                    # Process each element in lstResults to extract the player data.
                    for eachPlayerHtml in lstResults:
                        indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
                        indexEnd = eachPlayerHtml.find("</a>")
                        lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
                        link = "http://nba.com/" + lstPlayerInfo[0]; name = lstPlayerInfo[1]
                        key = link[link.find('=') + 1:]
                        active = True if ('ACTIVE' in eachPlayerHtml) else False

                        # Now we have a player's details - name, page URL and whether the
                        # player is still active. Save these in a dictionary for later use.
                        dictPlayerPageUrls[key] = {
                            'index': countPlayers,
                            'name': removeNonAscii(name),
                            'link': link,
                            'active': active
                        }

                        countPlayers += 1
                        print '------------------------->', countPlayers, ":", name

                except:
                    traceback.print_exc()
                    print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset

            # All the pages for this letter have been scraped, so remove it from the list
            # and move on to the next letter.
            del ordAllLetters[0]

        except:
            # Something went wrong while trying to scrape this letter's pages, so the
            # letter has not been removed from 'ordAllLetters' and we'll keep retrying
            # its pages until it succeeds.
            print '*********> Exception on letter: ', curLetter

    print 'Started at: ', startTime
    print 'Ended at: ', time.asctime()
    print 'Total number of Players: ', countPlayers
    print 'Total pages scraped:', countPagesScraped

    # When control reaches here, the URLs for all players in the nba.com historical
    # stats have been scraped and stored in the dictionary. Save it to a JSON file
    # for later.
    try:
        strJSON = json.dumps(dictPlayerPageUrls)
        f = open('players.json', 'w')
        f.write(strJSON)
        f.close()
    except UnicodeDecodeError:
        print '\n\n\nError in JSON string with some non utf-8 characters'
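
Once players.json has been written, a later pass (for example, one that visits each player's profile page) can read the dictionary back. A minimal sketch of that loading step, assuming the file and the fields ('name', 'link', 'active', 'index') saved by the script above:

import json

# Load the player dictionary written out by the scraper above
f = open('players.json', 'r')
dictPlayerPageUrls = json.loads(f.read())
f.close()

# Example: print the profile links of players flagged as still active
for key, info in dictPlayerPageUrls.items():
    if info['active']:
        print info['name'], '->', info['link']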
Revision: 63341
at April 28, 2013 16:33 by asimmittal
Initial Code
import urllib2, time, traceback, json

if __name__ == '__main__':

    baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
    letter = "(%l)"
    offset = "(%o)"
    startTime = time.asctime()

    # ASCII values for the letters A to Z
    ordAllLetters = range(ord('A'), ord('Z') + 1)
    resultsPerPage = 50
    dictPlayerPageUrls = {}

    for each in ordAllLetters:

        # Pull the first page for this letter from the NBA historical player stats site.
        # For every letter there are multiple pages, and each page contains 50 names of
        # athletes.
        curLetter = chr(each)
        curUrl = baseUrl.replace(letter, curLetter).replace(offset, '')

        # For this letter, find out how many pages exist. We can do this by scraping
        # the "rnav" div at the bottom of the page. The number of <a> tags in that div
        # tells us how many pages there are for this letter. Each page has 50 results.
        try:
            html = urllib2.urlopen(curUrl, timeout=5).read()

            resultsCountDivTagStart = '<div class="rnav">'
            resultsCountDivTagStop = "</div>"
            indexResultCountStart = html.find(resultsCountDivTagStart)
            indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
            resultsArea = html[indexResultCountStart:indexResultCountStop]
            linkSearchTag = "</a>"
            lstResultsLink = resultsArea.split(linkSearchTag)
            del lstResultsLink[0]

            # The number of items in lstResultsLink is the number of offset pages this
            # letter has, at 50 results per page. The offset values run 0, 50, 100 ... and
            # so on, so build a list of the exact offset values to substitute into the
            # baseUrl to get each page for this letter.
            listPageOffsets = [i * resultsPerPage for i in range(0, len(lstResultsLink))]
            if listPageOffsets == []: listPageOffsets = [0]

            print '-----------------'
            print 'Letter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

            # Now listPageOffsets contains [0, 50, 100, 150 ...]. For each letter we have
            # all the offsets, so build the url for every offset, download that page and
            # grab the links to every player's individual page.
            for eachOffsetPage in listPageOffsets:
                strOffset = str(eachOffsetPage)
                curUrl = baseUrl.replace(letter, curLetter).replace(offset, strOffset)

                try:
                    # Fetch the page and grab the results table, which contains
                    # the list of players.
                    htmlPage = urllib2.urlopen(curUrl).read()
                    indexStart = htmlPage.find('<div id="tableContainer">')
                    indexStop = htmlPage.find("<script>", indexStart)
                    resultsBody = htmlPage[indexStart:indexStop]
                    lstResults = resultsBody.split('<tr class="resultsTable" >')
                    del lstResults[0]

                    # Now each element in lstResults is the html surrounding one player.
                    # Process each element in lstResults to extract the player data.
                    for eachPlayerHtml in lstResults:
                        indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
                        indexEnd = eachPlayerHtml.find("</a>")
                        lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
                        link = "http://nba.com/" + lstPlayerInfo[0]; name = lstPlayerInfo[1]
                        key = link[link.find('=') + 1:]
                        active = True if ('ACTIVE' in eachPlayerHtml) else False

                        # Now we have a player's details - name, page URL and whether the
                        # player is still active. Save these in a dictionary for later use.
                        dictPlayerPageUrls[key] = {
                            'name': name,
                            'link': link,
                            'active': active
                        }

                        print 'Grabbed:', name

                except:
                    print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset

            # When control reaches here, the URLs for all players scraped so far are stored
            # in the dictionary. Save it to a JSON file for later.
            strJSON = json.dumps(dictPlayerPageUrls)
            f = open('players.json', 'w')
            f.write(strJSON)
            f.close()

        except:
            traceback.print_exc()
            print '------> Exception on letter: ', curLetter

    print 'Started at: ', startTime
    print 'Ended at: ', time.asctime()
Initial URL
Initial Description
This is a Python script that scrapes historical player names and profile links from NBA.com's historical player data.
Initial Title
Scrape list of all NBA players
Initial Tags
python
Initial Language
Python