Revision: 63705
Updated Code
at May 31, 2013 09:21 by richyeung
Updated Code
# This is just a fun little script that acts like a Pandora for food. Its
# implementation is simplistic. You choose a set of restaurants on Yelp that
# you like, and the script finds all reviewers that gave these restaurants
# 5 stars. You trust these reviewers because they share your awesome taste in
# food. The script then spits out all restaurants that these "trusted
# reviewers" also reviewed, and their rating for each review.
#
# You would need a few additional lines of code to turn the scrapy output into
# a sorted list of restaurants. For example, the code below will sort
# restaurants by number of 5 star reviews from "trusted reviewers":
#
#   import pandas
#   reviews = pandas.read_csv('scrapy_output.csv')
#   fiveStarReviews = reviews[reviews['rating']==5]
#   fiveStarReviews.restaurant.value_counts()
#
# There are countless ways you can improve on this. One obvious one is you
# would want to normalize by total restaurant reviews. You would probably also
# want to pull in restaurant category information.
#
# Happy food hunting!

# note: this targets the old Scrapy API (BaseSpider, HtmlXPathSelector) under
# Python 2; the integer division in the pagination helpers relies on that
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
import re
from pandoraFood.items import Review

# url string components for reviewer pages
URL_BASE = 'http://www.yelp.com/user_details_reviews_self?userid='
FILTER_SETTINGS = '&review_filter=category&category_filter=restaurants'

# yelp unique url endings for each restaurant
RESTAURANTS = ['z-and-y-restaurant-san-francisco',
               'koi-palace-daly-city',
               'ino-sushi-san-francisco',
               'blackwood-san-francisco-3']

# build requests for pages 2..N of a restaurant's review listing
# (yelp shows 40 reviews per page; page n+1 starts at ?start=40*n)
def createRestaurantPageLinks(self, response):
    reviewsPerPage = 40
    hxs = HtmlXPathSelector(response)
    totalReviews = int(hxs.select('//h2[@id="total_reviews"]/text()').extract()[0].strip().split(' ')[0])
    pages = [Request(url=response.url + '?start=' + str(reviewsPerPage * (n + 1)),
                     callback=self.parse)
             for n in range(totalReviews / reviewsPerPage)]
    return pages

# build requests for pages 2..N of a reviewer's profile
# (yelp shows 10 reviews per profile page, paged via &rec_pagestart=)
def createReviewerPageLinks(self, response):
    reviewsPerPage = 10
    hxs = HtmlXPathSelector(response)
    totalReviews = int(hxs.select('//div[@id="review_lister_header"]/em/text()').extract()[0].split(' ')[0])
    pages = [Request(url=response.url + '&rec_pagestart=' + str(reviewsPerPage * (n + 1)),
                     callback=self.parseReviewer)
             for n in range(totalReviews / reviewsPerPage)]
    return pages

class RestaurantSpider(BaseSpider):
    name = 'crawlRestaurants'
    allowed_domains = ['yelp.com']
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    # default parse used for the landing page for each start_url
    def parse(self, response):
        requests = []

        # extract all reviews from the page and return a list of requests
        # for the 5 star reviewers' profiles
        hxs = HtmlXPathSelector(response)
        userIDs = [userUrl.split('?userid=')[1] for userUrl in
                   hxs.select('//li[@class="user-name"]/a/@href').extract()]
        ratings = hxs.select('//div[@id="reviews-other"]//meta[@itemprop="ratingValue"]/@content').extract()
        for i in range(len(ratings)):
            if float(ratings[i]) == 5:
                requests.append(Request(url=URL_BASE + userIDs[i] + FILTER_SETTINGS,
                                        callback=self.parseReviewer))

        # request additional pages if we are on page 1 of the restaurant
        if response.url.find('?start=') == -1:
            requests += createRestaurantPageLinks(self, response)

        return requests

    # parse a given reviewer's profile page into Review items
    def parseReviewer(self, response):
        hxs = HtmlXPathSelector(response)
        restaurantUrls = hxs.select('//div[@class="review clearfix"]/'
                                    'div[@class="biz_info"]/h4/a/@href').extract()
        restaurants = [re.search(r'(?<=/biz/)[^#]*', rest).group()
                       for rest in restaurantUrls]
        reviewerName = hxs.select('//title/text()').extract()[0].split('|')[0].replace('\'s Profile', '').strip()
        reviewerUserID = re.search(r'(?<=userid=)[^&]*', response.url).group()
        ratingText = hxs.select('//div[@class="rating"]/i/@title').extract()
        ratings = [s.replace(' star rating', '') for s in ratingText]

        reviews = []
        for i in range(len(restaurants)):
            review = Review()
            review['restaurant'] = restaurants[i]
            review['reviewerName'] = reviewerName
            review['reviewerUserID'] = reviewerUserID
            review['rating'] = float(ratings[i])
            reviews.append(review)

        # request additional pages if we are on page 1 of the reviewer
        additionalPages = []
        if response.url.find('&rec_pagestart=') == -1:
            additionalPages = createReviewerPageLinks(self, response)

        return reviews + additionalPages
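The spider imports Review from pandoraFood.items, which this snippet does not include. A minimal items.py consistent with the four fields the spider populates might look like the sketch below; the field names are taken from the spider, but the module itself is an assumption, not part of the original project.

# pandoraFood/items.py -- assumed definition, inferred from the fields the
# spider fills in above; not part of the original snippet
from scrapy.item import Item, Field

class Review(Item):
    restaurant = Field()      # yelp url slug for the reviewed restaurant
    reviewerName = Field()    # display name of the trusted reviewer
    reviewerUserID = Field()  # yelp userid parsed from the profile url
    rating = Field()          # star rating the reviewer gave, as a float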
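The scrapy_output.csv referenced in the header comment would come from running the spider with CSV feed export, e.g. something like `scrapy crawl crawlRestaurants -o scrapy_output.csv -t csv` for Scrapy of this vintage (exact flags depend on your version). As for the suggested normalization: the crawl only records reviews written by trusted reviewers, so the fraction of those reviews that are 5 stars is a rough stand-in for normalizing by total review volume; true totals would require scraping each restaurant page. A minimal pandas sketch of that idea, assuming a reasonably recent pandas and the restaurant/rating columns written by the spider:

import pandas

reviews = pandas.read_csv('scrapy_output.csv')

# 5 star reviews from trusted reviewers, counted per restaurant
fiveStar = reviews[reviews['rating'] == 5].restaurant.value_counts()

# normalize by how many trusted-reviewer reviews each restaurant received;
# restaurants with no 5 star reviews come out as 0 rather than NaN
total = reviews.restaurant.value_counts()
score = (fiveStar / total).fillna(0)

print(score.sort_values(ascending=False).head(20))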
Revision: 63701
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at May 31, 2013 08:36 by richyeung
Initial URL
Initial Description
Web crawling Yelp for personalized food recommendations
Initial Title
Pandora for Food – Crawl Yelp for personalized recommendations
Initial Tags
python
Initial Language
Python