Scrape list of all NBA players


Published in: Python

This is a Python script that scrapes historical player names and the links to their individual player pages from NBA.com's historical search results, and saves them to a JSON file.
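The script works by filling two placeholders in a search URL template: a letter and a results offset (50 players per page). A minimal illustration of how the URLs are built, using the baseUrl template from the script below (the letter 'B' and offset 50 are just example values):

baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"

# e.g. the second page of players whose names start with 'B' (offset 50)
url = baseUrl.replace("(%l)", "B").replace("(%o)", "50")
# -> http://www.nba.com/historical/search/index.jsp?fl=B&pager.offset=50#results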


The full script:
import urllib2, time, traceback, json

# Removes non-ASCII characters from a string 's'
def removeNonAscii(s): return "".join(i for i in s if ord(i) < 128)


if __name__ == '__main__':

    baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
    letter = "(%l)"
    offset = "(%o)"

    startTime = time.asctime()

    # ASCII values for the letters A to Z
    ordAllLetters = range(ord('A'), ord('Z') + 1)
    resultsPerPage = 50
    countPlayers = 0
    countPagesScraped = 0
    dictPlayerPageUrls = {}

    # Keep scraping while there are letters left in the list
    while len(ordAllLetters) > 0:

        # Grab the first letter from the list and try scraping its pages.
        # Every letter has one or more result pages, each listing up to 50 players.
        each = ordAllLetters[0]
        curLetter = chr(each)
        curUrl = baseUrl.replace(letter, curLetter).replace(offset, str(''))

        # Find out how many pages exist for this letter by scraping the "rnav" div
        # at the bottom of the page. The number of <a> tags in that div is the number
        # of pages for this letter, at 50 results per page.
        try:

            html = urllib2.urlopen(curUrl, timeout=5).read()
            resultsCountDivTagStart = '<div class="rnav">'
            resultsCountDivTagStop = "</div>"

            indexResultCountStart = html.find(resultsCountDivTagStart)
            indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
            resultsArea = html[indexResultCountStart:indexResultCountStop]
            linkSearchTag = "</a>"
            lstResultsLink = resultsArea.split(linkSearchTag)
            del lstResultsLink[0]

            # The number of items in lstResultsLink is the number of offset pages this
            # letter has. The offset values run 0, 50, 100, ... so build the list of
            # exact offsets to substitute into baseUrl to fetch each page for this letter.
            listPageOffsets = [i * resultsPerPage for i in range(0, len(lstResultsLink))]
            if listPageOffsets == []: listPageOffsets = [0]

            print '\nLetter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

            # For each offset, build the url, download that page and grab the links
            # to every player's individual page.
            for eachOffsetPage in listPageOffsets:

                # Don't overload the NBA.com webserver with multiple calls in a short
                # span of time, or it will lock you out (calls start timing out).
                # Slow things down by making a call only every half second.
                time.sleep(0.5)

                # For this page offset, construct the url and GET it
                strOffset = str(eachOffsetPage)
                curUrl = baseUrl.replace(letter, curLetter).replace(offset, strOffset)
                countPagesScraped += 1

                try:

                    # Fetch the page and grab the results table, which contains
                    # the list of players
                    htmlPage = urllib2.urlopen(curUrl).read()
                    indexStart = htmlPage.find('<div id="tableContainer">')
                    indexStop = htmlPage.find("<script>", indexStart)
                    resultsBody = htmlPage[indexStart:indexStop]
                    lstResults = resultsBody.split('<tr class="resultsTable" >')

                    del lstResults[0]

                    print '--------> offset: ', eachOffsetPage

                    # Each element in lstResults is the html surrounding one player.
                    # Process each element to extract that player's data.
                    for eachPlayerHtml in lstResults:

                        indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
                        indexEnd = eachPlayerHtml.find("</a>")
                        lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
                        link = "http://nba.com/" + lstPlayerInfo[0]
                        name = lstPlayerInfo[1]
                        key = link[link.find('=') + 1:]
                        active = True if ('ACTIVE' in eachPlayerHtml) else False

                        # Now we have a player's details - name, page URL and whether
                        # they are still active. Save them in a dictionary for later use.
                        dictPlayerPageUrls[key] = {
                            'index': countPlayers,
                            'name': removeNonAscii(name),
                            'link': link,
                            'active': active
                        }

                        countPlayers += 1
                        print '------------------------->', countPlayers, ":", name

                except:
                    traceback.print_exc()
                    print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset

            # All the pages for this letter have been scraped, so remove it
            # and move on to the next letter
            del ordAllLetters[0]

        except:

            # Something went wrong while scraping this letter's pages, so the letter
            # has not been removed from 'ordAllLetters' and will be retried until it
            # succeeds.
            print '*********> Exception on letter: ', curLetter

    print 'Started at: ', startTime
    print 'Ended at: ', time.asctime()
    print 'Total number of Players: ', countPlayers
    print 'Total pages scraped:', countPagesScraped

    # At this point the URLs for all the players in the nba.com historical stats have
    # been scraped into the dictionary. Save it to a JSON file for later use.
    try:
        strJSON = json.dumps(dictPlayerPageUrls)
        f = open('players.json', 'w')
        f.write(strJSON)
        f.close()
    except UnicodeDecodeError:
        print '\n\n\nError in JSON string with some non utf-8 characters'
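
Once the run completes, players.json holds one entry per player, keyed by the id taken from the player's page URL, with the index, name, link and active fields built above. A minimal sketch of reading it back in a later script (same Python 2 style as the scraper):

import json

with open('players.json') as f:
    players = json.load(f)

# Each value carries the fields written by the scraper: index, name, link, active
for key, info in players.items():
    if info['active']:
        print info['name'], '->', info['link']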
