Revision: 63342
at April 29, 2013 07:53 by asimmittal


Updated Code
import urllib2, time, traceback, json

# Removes non-ASCII characters from a string 's'
def removeNonAscii(s): return "".join(i for i in s if ord(i)<128)
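# A quick usage example (Python 2 byte strings; any byte with ord >= 128 is dropped):
#   removeNonAscii("Dra\xc5\xbeen Petrovi\xc4\x87")  ->  "Draen Petrovi"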


if __name__ == '__main__':

	baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
	letter = "(%l)"
	offset = "(%o)"
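	# For illustration, substituting 'A' for (%l) and '50' for (%o) yields:
	#   http://www.nba.com/historical/search/index.jsp?fl=A&pager.offset=50#results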

	startTime = time.asctime()

	# ASCII values for letters from A to Z
	ordAllLetters = range(ord('A'), ord('Z') + 1) 
	resultsPerPage = 50
	countPlayers = 0
	countPagesScraped = 0
	dictPlayerPageUrls = {}
	
	# Keep scraping while there are letters still left in the list of all letters

	while len(ordAllLetters) > 0:

		# Grab the first letter from the list and try scraping the pages for that letter
		# For every letter, there are multiple pages. Each page contains 50 names of 
		# athletes

		each = ordAllLetters[0]
		curLetter = chr(each)
		curUrl = baseUrl.replace(letter, curLetter).replace(offset, '')
		
		# For this letter, find out how many pages exist. We can do this by scraping
		# the "rnav" div at the bottom of the page. The number of <a> tags in that div
		# tells you how many pages there are for this letter. Each page has
		# 50 results.
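		# The "rnav" block is assumed to look roughly like this (structure inferred from
		# the parsing below; the real markup may differ):
		#   <div class="rnav"> <a href="...offset=0...">1</a> <a href="...offset=50...">2</a> </div>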

		try:
			
			html = urllib2.urlopen(curUrl, timeout=5).read()
			resultsCountDivTagStart = '<div class="rnav">'
			resultsCountDivTagStop = "</div>"

			indexResultCountStart = html.find(resultsCountDivTagStart)
			indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
			resultsArea = html[indexResultCountStart:indexResultCountStop]
			linkSearchTag = "</a>"
			lstResultsLink = resultsArea.split(linkSearchTag)
			del lstResultsLink[0]

			# The number of items in lstResultsLink represents the number of offset pages
			# this letter has, with 50 results per page. The offset values
			# run 0, 50, 100... and so on. So let's create a list that contains
			# the exact offset values we need to substitute into the baseUrl to get the
			# appropriate page for this letter.

			listPageOffsets = [i * resultsPerPage for i in range(len(lstResultsLink))]
			if not listPageOffsets: listPageOffsets = [0]
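			# For example, 3 <a> links in "rnav" give listPageOffsets == [0, 50, 100];
			# no links at all falls back to the single offset [0]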
			
			print '\nLetter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

			# Now listPageOffsets contains [0,50,100,150...]. For each letter, we have all 
			# the offsets. So let's create the url for every offset, download that page
			# and grab the links for every player's individual pages.
			
			for eachOffsetPage in listPageOffsets:

				# We don't want to overload the NBA.com webserver with multiple calls in a short
				# span of time, because if you do they will lock you out (calls start timing out).
				# So we'll slow things down by making a call only every half a second.

				time.sleep(0.5)

				# So for this page offset, construct the url and GET it
				
				strOffset = str(eachOffsetPage)
				curUrl = baseUrl.replace(letter,curLetter).replace(offset,strOffset)
				countPagesScraped += 1

				try:

					# fetch the page and grab the results table. That contains
					# the list of players.

					htmlPage = urllib2.urlopen(curUrl).read()
					indexStart = htmlPage.find('<div id="tableContainer">')
					indexStop = htmlPage.find("<script>",indexStart)
					resultsBody = htmlPage[indexStart:indexStop]
					lstResults = resultsBody.split('<tr class="resultsTable" >')

					del lstResults[0]

					print '--------> offset: ', eachOffsetPage

					# Okay now each element in lstResults represents the html
					# surrounding a player. Let's process each element in lstResults 
					# to extract player data
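					# Each chunk is assumed to contain markup roughly like this (hypothetical
					# href and name; only the pieces the parsing below relies on are shown):
					#   <a href='playerfile/john_doe/index.html?id=77'>John Doe</a> ... ACTIVE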

					for eachPlayerHtml in lstResults:

						indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
						indexEnd = eachPlayerHtml.find("</a>")
						lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
						link = "http://nba.com/" + lstPlayerInfo[0]
						name = lstPlayerInfo[1]
						key = link[link.find('=') + 1:]
						active = 'ACTIVE' in eachPlayerHtml

						# Now we have a player's details - Name, PageURL & if still active
						# Let's save this in a dictionary so that we can use this later
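						# For example (hypothetical values pulled apart by the code above):
						#   link   = 'http://nba.com/playerfile/john_doe/index.html?id=77'
						#   key    = '77', name = 'John Doe', active = True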

						dictPlayerPageUrls[key] = {
							'index': countPlayers,
							'name' : removeNonAscii(name),
							'link' : link,
							'active' : active
						}

						countPlayers+=1
						print '------------------------->',countPlayers,":",name

				except:					
					traceback.print_exc()
					print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset
			
			# All the pages for this letter have been scraped. Let's get rid
			# of it and go to the next letter

			del ordAllLetters[0]

		except:

			# Something went wrong while trying to scrape this letter's pages,
			# so the letter has not been removed from the list 'ordAllLetters'
			# and we'll keep retrying its pages until it's done

			print '*********> Exception on letter: ', curLetter

	print 'Started at: ', startTime
	print 'Ended at: ', time.asctime()
	print 'Total number of Players: ', countPlayers
	print 'Total pages scraped:', countPagesScraped

	# When the control reaches here, all the URLs for the players in the nba.com historical 
	# stats are scraped and stored in the dictionary. Let's save that in a JSON file for later

	try:
		strJSON = json.dumps(dictPlayerPageUrls)
		f = open('players.json','w')
		f.write(strJSON)
		f.close()
	except UnicodeDecodeError:

		print '\n\n\nError building JSON string: found non-UTF-8 characters'
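	# A sample players.json entry might look like this (hypothetical key and values):
	#   "77": {"index": 0, "name": "John Doe",
	#          "link": "http://nba.com/playerfile/john_doe/index.html?id=77", "active": false}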

Revision: 63341
at April 28, 2013 16:33 by asimmittal


Initial Code
import urllib2, time, traceback, json

if __name__ == '__main__':

	baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
	letter = "(%l)"
	offset = "(%o)"

	startTime = time.asctime();

	# ASCII values for letters from A to Z
	ordAllLetters = range(ord('A'), ord('Z') + 1) 
	resultsPerPage = 50

	dictPlayerPageUrls = {}
	
	for each in ordAllLetters:

		# Pull the first page for this letter from NBA Historical player stats website
		# For every letter, there are multiple pages. Each page contains 50 names of 
		# athletes

		curLetter = chr(each)
		curUrl = baseUrl.replace(letter,curLetter).replace(offset,str(''))
		
		# For this letter, find out how many pages exist. We can do this by scraping
		# the "rnav" div at the bottom of the page. The number of <a> in that div 
		# will tell you how many pages there are against this letter. Each page has
		# 50 results.

		try:
			
			html = urllib2.urlopen(curUrl, timeout=5).read();
			resultsCountDivTagStart = '<div class="rnav">'
			resultsCountDivTagStop = "</div>"

			indexResultCountStart = html.find(resultsCountDivTagStart)
			indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
			resultsArea = html[indexResultCountStart:indexResultCountStop]
			linkSearchTag = "</a>"
			lstResultsLink = resultsArea.split(linkSearchTag)
			del lstResultsLink[0]

			# number of items in lstResultsLink represents the number of offset pages
			# this letter has. Each page has 50 results per page. The offset values
			# range from 0, 50, 100... and so on. So let's create a list that contains
			# the exact offset values that we need to add to the baseUrl to get the appropriate
			# page for this letter.

			listPageOffsets = [i * resultsPerPage for i in range(0,len(lstResultsLink))]
			if listPageOffsets == [] : listPageOffsets = [0]
			print '-----------------'
			print 'Letter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

			# Now listPageOffsets contains [0,50,100,150...]. For each letter, we have all 
			# the offsets. So let's create the url for every offset, download that page
			# and grab the links for every player's individual pages.
			
			for eachOffsetPage in listPageOffsets:
				strOffset = str(eachOffsetPage)
				curUrl = baseUrl.replace(letter,curLetter).replace(offset,strOffset)

				try:

					# fetch the page and grab the results table. That contains
					# the list of players.

					htmlPage = urllib2.urlopen(curUrl).read();
					indexStart = htmlPage.find('<div id="tableContainer">')
					indexStop = htmlPage.find("<script>",indexStart)
					resultsBody = htmlPage[indexStart:indexStop]
					lstResults = resultsBody.split('<tr class="resultsTable" >')

					del lstResults[0];

					# Okay now each element in lstResults represents the html
					# surrounding a player. Let's process each element in lstResults 
					# to extract player data

					for eachPlayerHtml in lstResults:
						indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
						indexEnd = eachPlayerHtml.find("</a>")
						lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
						link = "http://nba.com/"+lstPlayerInfo[0]; name = lstPlayerInfo[1]
						key = link[link.find('=') + 1:]
						active = True if ('ACTIVE' in eachPlayerHtml) else False

						# Now we have a player's details - Name, PageURL & if still active
						# Let's save this in a dictionary so that we can use this later

						dictPlayerPageUrls[key] = {
							'name' : name,
							'link' : link,
							'active' : active
						};

						print 'Grabbed:',name
				except:					
					print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset

			# When the control reaches here, all the URLs for the players in the nba.com historical 
			# stats are scraped and stored in the dictionary. Let's save that in a JSON file for later

			strJSON = json.dumps(dictPlayerPageUrls)
			f = open('players.json','w')
			f.write(strJSON)
			f.close()

		except :
			traceback.print_exc()
			print '------> Exception on letter: ', curLetter


		print 'Started at: ', startTime
		print 'Ended at: ', time.asctime()


Initial Description
This is a Python script that scrapes the names and profile links of historical players from NBA.com's historical player data

Initial Title
Scrape list of all NBA players

Initial Tags
python

Initial Language
Python