Revision: 17728
Updated Code
at May 8, 2010 11:12 by manatlan
Updated Code
import urllib,urllib2
from gzip import GzipFile
from StringIO import StringIO
def getContent(url, data=None, max_bytes=1000000):  # data is a dict of posted vars
    """Fetch *url* with browser-like headers and return the raw body.

    url       -- address to request.
    data      -- optional dict of form variables; when non-empty it is
                 url-encoded and sent as the request body (urllib2 then
                 issues a POST), otherwise a plain GET is made.
    max_bytes -- upper bound on the number of bytes read from the network
                 and, for gzipped bodies, on the number of bytes returned
                 after decompression.

    Returns the body as a byte string, transparently gunzipped when it
    starts with the gzip magic number and decompresses cleanly.  On
    urllib2.HTTPError / urllib2.URLError a message is printed and None is
    returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        'Cookie': '',
        # The standard header name is "Referer"; "http-referer" is not a
        # header servers look at.
        "Referer": "http://www.google.com/",
    }

    # Only a non-empty mapping becomes a request body.  An empty dict is
    # normalised to None so it is never handed to urllib2 un-encoded.
    body = urllib.urlencode(data) if data else None
    request = urllib2.Request(url, body, headers)

    try:
        response = urllib2.urlopen(request)
        try:
            html = response.read(max_bytes)
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except urllib2.HTTPError as exc:
        print("HTTP error %d : %s" % (exc.code, exc.msg))
        return None
    except urllib2.URLError as exc:
        print("URL Error : %s" % exc.reason)
        return None

    # Gzip streams start with the two magic bytes 0x1f 0x8b; anything else
    # is returned untouched without paying for a failed decode attempt.
    if html[:2] == b"\x1f\x8b":
        try:
            html = GzipFile(fileobj=StringIO(html), mode='rb').read(max_bytes)
        except Exception:
            # Deliberate best effort: a truncated or corrupt stream falls
            # back to the raw bytes.  Unlike a bare "except:", this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
    return html
Revision: 17727
Updated Code
at September 12, 2009 18:17 by manatlan
Updated Code
import urllib,urllib2
def getContent(url, data=None, max_bytes=500000):  # data is a dict of posted vars
    """Fetch *url* with browser-like headers and return the raw body.

    url       -- address to request.
    data      -- optional dict of form variables; when non-empty it is
                 url-encoded and sent as the request body (urllib2 then
                 issues a POST), otherwise a plain GET is made.
    max_bytes -- upper bound on the number of bytes read from the response.

    Returns the body as a byte string.  On urllib2.HTTPError /
    urllib2.URLError a message is printed and None is returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        'Cookie': '',
        # The standard header name is "Referer"; "http-referer" is not a
        # header servers look at.
        "Referer": "http://www.google.com/",
    }

    # Only a non-empty mapping becomes a request body.  An empty dict is
    # normalised to None so it is never handed to urllib2 un-encoded.
    body = urllib.urlencode(data) if data else None
    request = urllib2.Request(url, body, headers)

    try:
        response = urllib2.urlopen(request)
        try:
            return response.read(max_bytes)
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except urllib2.HTTPError as exc:
        print("HTTP error %d : %s" % (exc.code, exc.msg))
    except urllib2.URLError as exc:
        print("URL Error : %s" % exc.reason)
    return None
Revision: 17726
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at September 12, 2009 17:19 by manatlan
Initial Code
import urllib,urllib2
def getContent(url, data=None):  # data is a dict of posted vars
    """Fetch *url* with browser-like headers and return the body as text.

    url  -- address to request.
    data -- optional dict of form variables; when non-empty it is
            url-encoded and sent as the request body (urllib2 then issues
            a POST), otherwise a plain GET is made.

    Returns the whole body decoded as UTF-8.  Nothing is caught here:
    errors raised by urllib2.urlopen (e.g. urllib2.HTTPError,
    urllib2.URLError) and UnicodeDecodeError for bodies that are not
    valid UTF-8 propagate to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        'Cookie': '',
        # The standard header name is "Referer"; "http-referer" is not a
        # header servers look at.
        "Referer": "http://www.google.com/",
    }

    # Only a non-empty mapping becomes a request body.  An empty dict is
    # normalised to None so it is never handed to urllib2 un-encoded.
    body = urllib.urlencode(data) if data else None
    request = urllib2.Request(url, body, headers)

    response = urllib2.urlopen(request)
    try:
        raw = response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()
    return raw.decode("utf_8")
Initial URL
Initial Description
Initial Title
Get content of a page (with headers, GET, POST), and ensure ungzipped
Initial Tags
python, web
Initial Language
Python