Return to Snippet

Revision: 6153
at May 2, 2008 12:21 by mandric


Updated Code
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
import os,re,sys
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen, HTTPError, URLError
import datetime
import optparse


CACHE_FILE = '/tmp/twitterlocal_cache'
CACHE_TIME = 10 #minutes

def fetch_html(url, cache_minutes=60*2):
    """
    Return HTML string from a url, caching for two hours by default.

    The page body is cached in CACHE_FILE; the network is hit only when
    the cache file is missing or older than `cache_minutes` minutes.
    """
    if not url.startswith('http://'):
        url = 'http://' + url

    try:
        last_mod = datetime.datetime.fromtimestamp(
                    os.stat(CACHE_FILE).st_mtime)
    except OSError:
        # Cache file missing/unreadable: use an ancient timestamp so the
        # staleness check below forces exactly one fetch.  (The old code
        # also downloaded here, causing a duplicate download+write.)
        last_mod = datetime.datetime(1900, 1, 1)

    delta = datetime.timedelta(minutes=cache_minutes)
    if last_mod < (datetime.datetime.now() - delta):
        # Cache is stale: grab url and save to cache.
        html = urlopen(url).read()
        f = open(CACHE_FILE, 'w')
        try:
            f.write(html)
        finally:
            f.close()
    else:
        # Cache is fresh: serve from the cache file.
        f = open(CACHE_FILE)
        try:
            html = f.read()
        finally:
            f.close()

    return html


if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('--settings')
    options, args = parser.parse_args()
    # call this script like 
    # twitterlocal/bin/scrape.py --settings  myproject.settings
    if options.settings:
        os.environ["DJANGO_SETTINGS_MODULE"] = options.settings

    sys.path.append(os.getcwd())
    try:
        from myproject.twitterlocal.models import TweetsStat
    except ImportError:
        print 'define your settings path, like --settings \'myproject.settings\''
        exit()

    html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
    soup = BeautifulSoup(html)

    # Parse dates
    dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]

    # List comprehension fun!
    m,d,y,mi,s = [int(x) for x in dates[3].split('/') + dates[4].split(':')]
    end_time = datetime.datetime(y+2000,m,d,mi,s)

    m,d,y,mi,s = [int(x) for x in dates[0].split('/') + dates[1].split(':')]
    start_time = datetime.datetime(y+2000,m,d,mi,s)

    for row in soup('tbody')[0].findAll('tr') :
        d = row.findAll('td')
        location = d[0].contents[0]
        tweets = d[1].contents[0]
        try:
            t = TweetsStat.objects.get( 
                    location=location, 
                    start_time=start_time,
                    end_time=end_time,)
        except TweetsStat.DoesNotExist:
            t = TweetsStat( 
                    location=location, 
                    tweets=tweets,
                    start_time=start_time,
                    end_time=end_time,)
        
            t.save()

Revision: 6152
at May 1, 2008 17:22 by mandric


Updated Code
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
import os,re,sys
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen, HTTPError, URLError
import datetime
import optparse


CACHE_FILE = '/tmp/twitterlocal_cache'
CACHE_TIME = 10 #minutes

def fetch_html(url, cache_minutes=60*2):
    """
    Return HTML string from a url, caching for two hours by default.

    The page body is cached in CACHE_FILE; the network is hit only when
    the cache file is missing or older than `cache_minutes` minutes.
    """
    if not url.startswith('http://'):
        url = 'http://' + url

    try:
        last_mod = datetime.datetime.fromtimestamp(
                    os.stat(CACHE_FILE).st_mtime)
    except OSError:
        # Cache file missing/unreadable: use an ancient timestamp so the
        # staleness check below forces exactly one fetch.  (The old code
        # also downloaded here, causing a duplicate download+write.)
        last_mod = datetime.datetime(1900, 1, 1)

    delta = datetime.timedelta(minutes=cache_minutes)
    if last_mod < (datetime.datetime.now() - delta):
        # Cache is stale: grab url and save to cache.
        html = urlopen(url).read()
        f = open(CACHE_FILE, 'w')
        try:
            f.write(html)
        finally:
            f.close()
    else:
        # Cache is fresh: serve from the cache file.
        f = open(CACHE_FILE)
        try:
            html = f.read()
        finally:
            f.close()

    return html


if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('--settings')
    options, args = parser.parse_args()
    # call this script like 
    # twitterlocal/bin/scrape.py --settings  myproject.settings
    if options.settings:
        os.environ["DJANGO_SETTINGS_MODULE"] = options.settings

    sys.path.append(os.getcwd())
    try:
        from myproject.twitterlocal.models import TweetsStat
    except ImportError:
        print 'define your settings path, like --settings \'myproject.settings\''
        exit()

    html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
    soup = BeautifulSoup(html)

    # Parse dates
    dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]

    # List comprehension fun!
    m,d,y,mi,s = [int(x) for x in dates[3].split('/') + dates[4].split(':')]
    end_time = datetime.datetime(y+2000,m,d,mi,s)

    m,d,y,mi,s = [int(x) for x in dates[0].split('/') + dates[1].split(':')]
    start_time = datetime.datetime(y,m,d,mi,s)

    for row in soup('tbody')[0].findAll('tr') :
        d = row.findAll('td')
        location = d[0].contents[0]
        tweets = d[1].contents[0]
        try:
            t = TweetsStat.objects.get( 
                    location=location, 
                    start_time=start_time,
                    end_time=end_time,)
        except TweetsStat.DoesNotExist:
            t = TweetsStat( 
                    location=location, 
                    tweets=tweets,
                    start_time=start_time,
                    end_time=end_time,)
        
            t.save()

Revision: 6151
at May 1, 2008 17:20 by mandric


Initial Code
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
import os,re,sys
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen, HTTPError, URLError
import datetime
import optparse


CACHE_FILE = '/tmp/twitterlocal_cache'
CACHE_TIME = 30 #minutes

def fetch_html(url, cache_minutes=60*2):
    """
    Return HTML string from a url, caching for two hours by default.

    The page body is cached in CACHE_FILE; the network is hit only when
    the cache file is missing or older than `cache_minutes` minutes.
    """
    if not url.startswith('http://'):
        url = 'http://' + url

    try:
        last_mod = datetime.datetime.fromtimestamp(
                    os.stat(CACHE_FILE).st_mtime)
    except OSError:
        # Cache file missing/unreadable: use an ancient timestamp so the
        # staleness check below forces exactly one fetch.  (The old code
        # also downloaded here, causing a duplicate download+write.)
        last_mod = datetime.datetime(1900, 1, 1)

    delta = datetime.timedelta(minutes=cache_minutes)
    if last_mod < (datetime.datetime.now() - delta):
        # Cache is stale: grab url and save to cache.
        html = urlopen(url).read()
        f = open(CACHE_FILE, 'w')
        try:
            f.write(html)
        finally:
            f.close()
    else:
        # Cache is fresh: serve from the cache file.
        f = open(CACHE_FILE)
        try:
            html = f.read()
        finally:
            f.close()

    return html


if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('--settings')
    options, args = parser.parse_args()
    # call this script like 
    # twitterlocal/bin/scrape.py --settings  myproject.settings
    if options.settings:
        os.environ["DJANGO_SETTINGS_MODULE"] = options.settings

    sys.path.append(os.getcwd())
    try:
        from myproject.twitterlocal.models import TweetsStat
    except ImportError:
        print 'define your settings path, like --settings \'myproject.settings\''
        exit()

    html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
    soup = BeautifulSoup(html)

    # Parse dates
    dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]

    # List comprehension fun!
    m,d,y,mi,s = [int(x) for x in dates[3].split('/') + dates[4].split(':')]
    end_time = datetime.datetime(y+2000,m,d,mi,s)

    m,d,y,mi,s = [int(x) for x in dates[0].split('/') + dates[1].split(':')]
    start_time = datetime.datetime(y,m,d,mi,s)

    for row in soup('tbody')[0].findAll('tr') :
        d = row.findAll('td')
        location = d[0].contents[0]
        tweets = d[1].contents[0]
        try:
            t = TweetsStat.objects.get( 
                    location=location, 
                    start_time=start_time,
                    end_time=end_time,)
        except TweetsStat.DoesNotExist:
            t = TweetsStat( 
                    location=location, 
                    tweets=tweets,
                    start_time=start_time,
                    end_time=end_time,)
        
            t.save()

Initial URL

                                

Initial Description
Going to use twitterlocal stats to experiment a bit with pygooglechart.

Initial Title
Scrape twitterlocal and save to Django model

Initial Tags
data, python, twitter

Initial Language
Python