Published in: Python
This is a Python script that scrapes historical player names and profile links from NBA.com's historical player data.
import urllib2, time, traceback, json

# Removes non-ASCII characters from a string 's'
def removeNonAscii(s):
    return "".join(i for i in s if ord(i) < 128)

if __name__ == '__main__':

    baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
    letter = "(%l)"
    offset = "(%o)"

    startTime = time.asctime()

    # ASCII values for letters from A to Z
    ordAllLetters = range(ord('A'), ord('Z') + 1)

    resultsPerPage = 50
    countPlayers = 0
    countPagesScraped = 0
    dictPlayerPageUrls = {}

    # Keep scraping while there are letters still left in the list of all letters
    while len(ordAllLetters) > 0:

        # Grab the first letter from the list and try scraping the pages for that letter.
        # For every letter there are multiple pages, and each page contains 50 names of
        # athletes.
        each = ordAllLetters[0]
        curLetter = chr(each)
        curUrl = baseUrl.replace(letter, curLetter).replace(offset, '')

        # For this letter, find out how many pages exist. We can do this by scraping
        # the "rnav" div at the bottom of the page. The number of <a> tags in that div
        # tells us how many pages there are for this letter. Each page has 50 results.
        try:
            html = urllib2.urlopen(curUrl, timeout=5).read()

            resultsCountDivTagStart = '<div class="rnav">'
            resultsCountDivTagStop = "</div>"
            indexResultCountStart = html.find(resultsCountDivTagStart)
            indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
            resultsArea = html[indexResultCountStart:indexResultCountStop]

            linkSearchTag = "</a>"
            lstResultsLink = resultsArea.split(linkSearchTag)
            del lstResultsLink[0]

            # The number of items in lstResultsLink is the number of offset pages this
            # letter has, at 50 results per page. The offset values range over
            # 0, 50, 100... and so on. So let's create a list that contains the exact
            # offset values we need to substitute into the baseUrl to get the
            # appropriate page for this letter.
            listPageOffsets = [i * resultsPerPage for i in range(0, len(lstResultsLink))]
            if listPageOffsets == []:
                listPageOffsets = [0]

            print '\nLetter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

            # Now listPageOffsets contains [0, 50, 100, 150...]. For each letter we have
            # all the offsets, so let's build the URL for every offset, download that page
            # and grab the links to every player's individual page.
            for eachOffsetPage in listPageOffsets:

                # We don't want to overload the NBA.com webserver with multiple calls in a
                # short span of time, because if you do you'll notice they will lock you
                # out (calls time out). So we'll slow things down by making a call only
                # every half a second.
                time.sleep(0.5)

                # For this page offset, construct the URL and GET it
                strOffset = str(eachOffsetPage)
                curUrl = baseUrl.replace(letter, curLetter).replace(offset, strOffset)
                countPagesScraped += 1

                try:
                    # Fetch the page and grab the results table. That contains
                    # the list of players.
                    htmlPage = urllib2.urlopen(curUrl).read()
                    indexStart = htmlPage.find('<div id="tableContainer">')
                    indexStop = htmlPage.find("<script>", indexStart)
                    resultsBody = htmlPage[indexStart:indexStop]
                    lstResults = resultsBody.split('<tr class="resultsTable" >')
                    del lstResults[0]

                    print '--------> offset: ', eachOffsetPage

                    # Now each element in lstResults represents the HTML surrounding one
                    # player. Let's process each element in lstResults to extract the
                    # player data.
                    for eachPlayerHtml in lstResults:
                        indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
                        indexEnd = eachPlayerHtml.find("</a>")
                        lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")

                        link = "http://nba.com/" + lstPlayerInfo[0]
                        name = lstPlayerInfo[1]
                        key = link[link.find('=') + 1:]
                        active = True if ('ACTIVE' in eachPlayerHtml) else False

                        # Now we have a player's details - name, page URL and whether they
                        # are still active. Let's save this in a dictionary so that we can
                        # use it later.
                        dictPlayerPageUrls[key] = {
                            'index': countPlayers,
                            'name': removeNonAscii(name),
                            'link': link,
                            'active': active
                        }
                        countPlayers += 1
                        print '------------------------->', countPlayers, ":", name

                except:
                    traceback.print_exc()
                    print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset

            # All the pages for this letter have been scraped. Let's get rid of it
            # and go on to the next letter.
            del ordAllLetters[0]

        except:
            # Something went wrong while trying to scrape this letter's pages,
            # so the letter has not been removed from the list 'ordAllLetters'
            # and we'll keep retrying its pages until it's done.
            print '*********> Exception on letter: ', curLetter

    print 'Started at: ', startTime
    print 'Ended at: ', time.asctime()
    print 'Total number of Players: ', countPlayers
    print 'Total pages scraped:', countPagesScraped

    # When control reaches here, all the URLs for the players in the nba.com historical
    # stats have been scraped and stored in the dictionary. Let's save that in a JSON
    # file for later.
    try:
        strJSON = json.dumps(dictPlayerPageUrls)
        f = open('players.json', 'w')
        f.write(strJSON)
        f.close()
    except UnicodeDecodeError:
        print '\n\n\nError in JSON string with some non utf-8 characters'
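
Once the run completes, players.json holds one entry per player, keyed by the id pulled from the profile URL, with the index, name, link and active fields built above. Here is a minimal sketch of how you might load and inspect that file afterwards; it assumes the scrape finished and players.json sits in the current working directory.

import json

# Load the dictionary the scraper wrote out. Assumes the script above
# ran to completion and left players.json in the working directory.
f = open('players.json')
players = json.load(f)
f.close()

# Every value carries the fields saved by the scraper: index, name, link, active.
activeNames = [p['name'] for p in players.values() if p['active']]

print 'Total players :', len(players)
print 'Active players:', len(activeNames)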