Revision: 63342
at April 29, 2013 07:53 by asimmittal


Updated Code
import urllib2, time, traceback, json

# Removes non-ASCII characters from a string 's'
def removeNonAscii(s): return "".join(i for i in s if ord(i) < 128)


if __name__ == '__main__':

	baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
	letter = "(%l)"
	offset = "(%o)"
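	# For illustration: with letter 'A' and offset 50, the two placeholders above resolve to
	# http://www.nba.com/historical/search/index.jsp?fl=A&pager.offset=50#results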

	startTime = time.asctime();

	# ASCII values for letters from A to Z
	ordAllLetters = range(ord('A'), ord('Z') + 1) 
	resultsPerPage = 50
	countPlayers = 0
	countPagesScraped = 0
	dictPlayerPageUrls = {}
	
	# Keep scraping while there are letters still left in the list of all letters

	while len(ordAllLetters) > 0:

		# Grab the first letter from the list and try scraping the pages for that letter
		# For every letter, there are multiple pages. Each page contains 50 names of 
		# athletes

		each = ordAllLetters[0]
		curLetter = chr(each)
		curUrl = baseUrl.replace(letter, curLetter).replace(offset, '')
		
		# For this letter, find out how many pages exist. We can do this by scraping
		# the "rnav" div at the bottom of the page. The number of <a> tags in that div
		# tells us how many pages there are for this letter. Each page has 50 results.

		try:
			
			html = urllib2.urlopen(curUrl, timeout=5).read();
			resultsCountDivTagStart = '<div class="rnav">'
			resultsCountDivTagStop = "</div>"

			indexResultCountStart = html.find(resultsCountDivTagStart)
			indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
			resultsArea = html[indexResultCountStart:indexResultCountStop]
			linkSearchTag = "</a>"
			lstResultsLink = resultsArea.split(linkSearchTag)
			del lstResultsLink[0]

			# The number of items in lstResultsLink represents the number of offset pages
			# this letter has, at 50 results per page. The offset values run 0, 50, 100...
			# and so on, so let's create a list of the exact offset values we need to
			# substitute into the baseUrl to get the appropriate page for this letter.

			listPageOffsets = [i * resultsPerPage for i in range(len(lstResultsLink))]
			if not listPageOffsets: listPageOffsets = [0]
			
			print '\nLetter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

			# Now listPageOffsets contains [0,50,100,150...]. For each letter, we have all 
			# the offsets. So let's create the url for every offset, download that page
			# and grab the links for every player's individual pages.
			
			for eachOffsetPage in listPageOffsets:

				# We don't want to overload the NBA.com webserver with multiple calls in a short
				# span of time, because if we do they will lock us out (calls start timing out).
				# So we'll slow things down by making a call only every half a second.

				time.sleep(0.5)

				# So for this page offset, construct the url and GET it
				
				strOffset = str(eachOffsetPage)
				curUrl = baseUrl.replace(letter,curLetter).replace(offset,strOffset)
				countPagesScraped += 1

				try:

					# fetch the page and grab the results table. That contains
					# the list of players.

					htmlPage = urllib2.urlopen(curUrl).read();
					indexStart = htmlPage.find('<div id="tableContainer">')
					indexStop = htmlPage.find("<script>",indexStart)
					resultsBody = htmlPage[indexStart:indexStop]
					lstResults = resultsBody.split('<tr class="resultsTable" >')

					del lstResults[0];

					print '--------> offset: ', eachOffsetPage

					# Okay now each element in lstResults represents the html
					# surrounding a player. Let's process each element in lstResults 
					# to extract player data

					for eachPlayerHtml in lstResults:

						indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
						indexEnd = eachPlayerHtml.find("</a>")
						lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
						link = "http://nba.com/"+lstPlayerInfo[0]; name = lstPlayerInfo[1]
						key = link[link.find('=') + 1:]
						active = True if ('ACTIVE' in eachPlayerHtml) else False

						# Now we have a player's details - Name, PageURL & if still active
						# Let's save this in a dictionary so that we can use this later

						dictPlayerPageUrls[key] = {
							'index': countPlayers,
							'name' : removeNonAscii(name),
							'link' : link,
							'active' : active
						};

						countPlayers+=1
						print '------------------------->',countPlayers,":",name

				except:					
					traceback.print_exc()
					print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset
			
			# All the pages for this letter have been scraped. Let's get rid
			# of it and go to the next letter

			del ordAllLetters[0]

		except:

			# Something went wrong while trying to scrape this letter's pages,
			# so the letter has not been removed from the list 'ordAllLetters'.
			# We'll keep retrying its pages until the letter succeeds.

			print '*********> Exception on letter: ', curLetter

	print 'Started at: ', startTime
	print 'Ended at: ', time.asctime()
	print 'Total number of Players: ', countPlayers
	print 'Total pages scraped:', countPagesScraped

	# When control reaches this point, the URLs for all players in the nba.com historical
	# stats have been scraped and stored in the dictionary. Let's save that to a JSON file for later.

	try:
		strJSON = json.dumps(dictPlayerPageUrls)
		f = open('players.json','w')
		f.write(strJSON)
		f.close()
	except UnicodeDecodeError:

		print '\n\n\nError in JSON string: it contains some non-UTF-8 characters'
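
Once the script finishes, players.json holds one entry per player, keyed by the id pulled from each profile link. Below is a minimal sketch of how the saved file might be read back and checked (it assumes the script above ran to completion and wrote players.json to the working directory; the field names match what the script stores):

import json

# Load the dictionary of player page URLs produced by the scraper above.
# Each value holds 'index', 'name', 'link' and 'active' as written by the script.
with open('players.json') as f:
	dictPlayerPageUrls = json.load(f)

countActive = sum(1 for p in dictPlayerPageUrls.values() if p['active'])

print('Total players loaded: %d' % len(dictPlayerPageUrls))
print('Still listed as active: %d' % countActive)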

Revision: 63341
at April 28, 2013 16:33 by asimmittal


Initial Code
import urllib2, time, traceback, json

if __name__ == '__main__':

	baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
	letter = "(%l)"
	offset = "(%o)"

	startTime = time.asctime();

	# ASCII values for letters from A to Z
	ordAllLetters = range(ord('A'), ord('Z') + 1) 
	resultsPerPage = 50

	dictPlayerPageUrls = {}
	
	for each in ordAllLetters:

		# Pull the first page for this letter from NBA Historical player stats website
		# For every letter, there are multiple pages. Each page contains 50 names of 
		# athletes

		curLetter = chr(each)
		curUrl = baseUrl.replace(letter,curLetter).replace(offset,str(''))
		
		# For this letter, find out how many pages exist. We can do this by scraping
		# the "rnav" div at the bottom of the page. The number of <a> tags in that div
		# tells us how many pages there are for this letter. Each page has 50 results.

		try:
			
			html = urllib2.urlopen(curUrl, timeout=5).read();
			resultsCountDivTagStart = '<div class="rnav">'
			resultsCountDivTagStop = "</div>"

			indexResultCountStart = html.find(resultsCountDivTagStart)
			indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
			resultsArea = html[indexResultCountStart:indexResultCountStop]
			linkSearchTag = "</a>"
			lstResultsLink = resultsArea.split(linkSearchTag)
			del lstResultsLink[0]

			# The number of items in lstResultsLink represents the number of offset pages
			# this letter has, at 50 results per page. The offset values run 0, 50, 100...
			# and so on, so let's create a list of the exact offset values we need to
			# substitute into the baseUrl to get the appropriate page for this letter.

			listPageOffsets = [i * resultsPerPage for i in range(0,len(lstResultsLink))]
			if listPageOffsets == [] : listPageOffsets = [0]
			print '-----------------'
			print 'Letter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

			# Now listPageOffsets contains [0,50,100,150...]. For each letter, we have all 
			# the offsets. So let's create the url for every offset, download that page
			# and grab the links for every player's individual pages.
			
			for eachOffsetPage in listPageOffsets:
				strOffset = str(eachOffsetPage)
				curUrl = baseUrl.replace(letter,curLetter).replace(offset,strOffset)

				try:

					# fetch the page and grab the results table. That contains
					# the list of players.

					htmlPage = urllib2.urlopen(curUrl).read();
					indexStart = htmlPage.find('<div id="tableContainer">')
					indexStop = htmlPage.find("<script>",indexStart)
					resultsBody = htmlPage[indexStart:indexStop]
					lstResults = resultsBody.split('<tr class="resultsTable" >')

					del lstResults[0];

					# Okay now each element in lstResults represents the html
					# surrounding a player. Let's process each element in lstResults 
					# to extract player data

					for eachPlayerHtml in lstResults:
						indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
						indexEnd = eachPlayerHtml.find("</a>")
						lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
						link = "http://nba.com/"+lstPlayerInfo[0]; name = lstPlayerInfo[1]
						key = link[link.find('=') + 1:]
						active = True if ('ACTIVE' in eachPlayerHtml) else False

						# Now we have a player's details - Name, PageURL & if still active
						# Let's save this in a dictionary so that we can use this later

						dictPlayerPageUrls[key] = {
							'name' : name,
							'link' : link,
							'active' : active
						};

						print 'Grabbed:',name
				except:					
					print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset

			# When the control reaches here, all the URLs for the players in the nba.com historical 
			# stats are scraped and stored in the dictionary. Let's save that in a JSON file for later

			strJSON = json.dumps(dictPlayerPageUrls)
			f = open('players.json','w')
			f.write(strJSON)
			f.close()

		except :
			traceback.print_exc()
			print '------> Exception on letter: ', curLetter


		print 'Started at: ', startTime
		print 'Ended at: ', time.asctime()

Initial URL

                                

Initial Description
This is a python script that allows you to scrape historical player names and links from NBA.com historical data

Initial Title
Scrape list of all NBA players

Initial Tags
python

Initial Language
Python