Get the content of a page (with custom headers, via GET or POST) and ensure it is un-gzipped


/ Published in: Python
Save to your folder(s)



Copy this code and paste it in your HTML
  1. import urllib,urllib2
  2. from gzip import GzipFile
  3. from StringIO import StringIO
  4.  
  def _gunzip(raw, limit):
      """Return *raw* gunzipped (at most *limit* bytes), or *raw* unchanged.

      *raw* is returned as-is when it does not start with the gzip magic
      number or when zlib rejects the stream. Only the first gzip member is
      decoded. A stream that was cut short yields whatever could be
      decompressed from the bytes available.
      """
      import zlib

      # Every gzip stream starts with the two magic bytes 0x1f 0x8b; anything
      # else is treated as an uncompressed body.
      if raw[:2] != b"\x1f\x8b":
          return raw
      try:
          # wbits = 16 + MAX_WBITS makes zlib expect a gzip header/trailer.
          # The second argument caps the size of the decompressed output.
          return zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(raw, limit)
      except zlib.error:
          # Looked like gzip but was not decodable: best effort, hand back
          # the bytes exactly as received.
          return raw


  def getContent(url, data=None, max_bytes=1000000):
      """Fetch *url* with browser-like headers and return the un-gzipped body.

      Args:
          url: Address to open.
          data: Optional dict of variables to POST. When it is empty or
              ``None`` no request body is sent.
          max_bytes: Positive upper bound on the number of bytes read from
              the response, and on the size of the decompressed result.

      Returns:
          The body as a byte string, or ``None`` when the request failed with
          an ``HTTPError`` or ``URLError`` (a message is printed in that case).
      """
      # Imported here so the function works on both Python 3 and Python 2
      # without depending on the module-level imports.
      try:
          from urllib.error import HTTPError, URLError
          from urllib.parse import urlencode
          from urllib.request import Request, urlopen
      except ImportError:
          from urllib import urlencode
          from urllib2 import HTTPError, Request, URLError, urlopen

      headers = {
          "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
          "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
          "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
          "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
          "Keep-Alive": "300",
          "Proxy-Connection": "keep-alive",
          'Cookie': '',
          # NOTE(review): the standard header name is "Referer"; kept as-is so
          # the outgoing request is unchanged -- confirm whether this is intended.
          "http-referer":"http://www.google.com/"
      }

      # An empty dict must not be handed to Request un-encoded, so anything
      # falsy becomes "no body". urlencode output is plain ASCII; encoding it
      # yields the byte string Python 3 requires and is a no-op on Python 2.
      body = urlencode(data).encode("ascii") if data else None
      request = Request(url, body, headers)

      try:
          response = urlopen(request)
          try:
              raw = response.read(max_bytes)
          finally:
              # Release the connection even if reading fails.
              response.close()
      except HTTPError as exc:
          # HTTPError is a subclass of URLError, so it has to be caught first.
          print("HTTP error %d : %s" % (exc.code, exc.msg))
          return None
      except URLError as exc:
          print("URL Error : %s" % exc.reason)
          return None

      return _gunzip(raw, max_bytes)

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to log in to post a comment.