/ Published in: Python
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
# This is an example of file uploading using Scrapy against a server that runs
# uberuploader. The spider logs in to the page before uploading; some web pages
# don't require a login in order to upload.
# IMPORTANT: you should increase DOWNLOAD_TIMEOUT in settings.py; at the time
# this snippet was written that setting wasn't working correctly, so the whole
# of Scrapy was recompiled to change the 3-minute default.
# Observations:
# - This could (or should) be implemented in a downloader middleware or pipeline.
# - Upload progress is not shown.
# - The MIME-message building could live somewhere else.
class fileUploadSpider(CrawlSpider):
    """Log in to uploadhost.com and upload one .zip file per Django DataObject.

    Flow: parse (submit login form) -> after_login (one Request per object to
    fetch an upload ticket) -> get_id_upload (build and POST the multipart
    body) -> lastcall (scrape the resulting upload id).
    """

    name = "spidertrigger.upload"
    allowed_domains = ["uploadhost.com"]
    start_urls = [
        "http://www.uploadhost.com/url_to_login_page",
    ]

    def parse(self, response):
        # Submit the login form found on the start page.
        return [FormRequest.from_response(
            response,
            formdata={'user': 'username', 'password': 'secret'},
            callback=self.after_login,
        )]

    def after_login(self, response):
        # The login page is served again when authentication fails.
        if "Log in to your account" in response.body:
            self.log("Login Failed", level=log.ERROR)
            return
        # Django ORM supplies the list of things to upload.
        # BUG FIX: original assigned to 'dataObjetcs' (typo) but iterated
        # 'dataObjects', which raised NameError.
        data_objects = DataObject.objects.all()
        for data in data_objects:
            # NOTE: the URL must point to ubr_link_upload.php; it returns the
            # random ticket needed for the upload. rnd_id is hardcoded here
            # but could be generated in code.
            yield Request(
                url='http://upload.uploadhost.com/upload/ubr_link_upload.php?rnd_id=1280793046605',
                # BUG FIX: original referenced self.obtener_id_upload, which
                # does not exist; the method is named get_id_upload.
                callback=self.get_id_upload,
                meta={'data': data},
            )

    def get_id_upload(self, response):
        # Extract the upload id handed back by ubr_link_upload.php, then POST
        # the file as multipart/form-data.
        hxs = HtmlXPathSelector(response)
        data = response.request.meta['data']
        # Assumes the file exists; a try/except around the open would be
        # more robust (see note in get_mime).
        file_name = settings.IMAGES_STORE + '/' + data.path + '.zip'
        # The ticket is a quoted word somewhere in the response body,
        # e.g. "abc123" — grab it and strip the quotes.
        upload_id = re.search('\\\"\w+\\\"', hxs.select('/html/body').extract()[0]).group(0).replace('\"', '')
        # Form fields expected by the upload endpoint (values are site-specific).
        fields = {
            'title': data.nombre,
            'adpaid': '0',
            'private': 'no',
            'category[]': '1',
            'fontcolor': 'black',
            'helpbox': 'Font size: [size=50%]small text[/size]',
            'textarea': '',
            'fontsize': '',
            'compare': '14936',
        }
        files = {'upfile_0': file_name}
        headers, body = self.get_mime(fields, files)
        print('Iniciando Request POST')
        # NOTE: the URL must point to cgi-bin/ubr_upload.pl with the proper upload_id.
        yield FormRequest(
            url='http://upload.uploadhost.com/cgi-bin/ubr_upload.pl?upload_id=' + upload_id,
            method='POST',
            body=body,
            meta={'data': data},
            headers=headers,
            callback=self.lastcall,
        )

    def lastcall(self, response):
        # Post-process the upload result: an artificial example that scrapes
        # the id of the freshly uploaded object out of the confirmation page.
        hxs = HtmlXPathSelector(response)
        link_uploaded = hxs.select('//div[@id=\'col2contentright\']/p/strong/a/@href').extract()[0]
        # BUG FIX: re.search returns a match object (or None), and the original
        # printed the undefined name 'ipUploaded'. Extract the matched text.
        match = re.search('\d+', link_uploaded)
        id_uploaded = match.group(0) if match else ''
        print("Success Uploaded " + id_uploaded)

    def get_mime(self, fields, files):
        """Build a multipart/form-data body for *fields* and *files*.

        Returns (headers_dict, body_string). *files* maps form field name to
        a local file path; file contents are embedded raw, so this could still
        have problems with some binary data.
        """
        BOUNDARY = '----------BOUNDARY_$'
        # BUG FIX: the original had the CRLF definition commented out and
        # joined every multipart line with a single space, which produces an
        # invalid MIME body. RFC 2046 requires CRLF line endings.
        CRLF = '\r\n'
        buf = StringIO()
        for key in fields.keys():
            value = fields[key]
            buf.write('--' + BOUNDARY + CRLF)
            buf.write('Content-Disposition: form-data; name="%s"' % key + CRLF)
            buf.write(CRLF)
            buf.write(value.encode('utf-8') + CRLF)
        for key in files.keys():
            filename = files[key]
            buf.write('--' + BOUNDARY + CRLF)
            # NOTE: the reported filename is hardcoded to 'full.zip', as in
            # the original; the real path is only used to read the bytes.
            buf.write('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, 'full.zip') + CRLF)
            buf.write('Content-Type: %s' % self.get_content_type(filename) + CRLF)
            buf.write(CRLF)
            # BUG FIX: close the file handle instead of leaking it.
            fh = open(filename, 'rb')
            try:
                buf.write(fh.read())
            finally:
                fh.close()
            buf.write(CRLF)
        buf.write('--' + BOUNDARY + '--' + CRLF)
        buf.write(CRLF)
        body = buf.getvalue()
        content_type = {'Content-Type': 'multipart/form-data; boundary=%s' % BOUNDARY}
        return content_type, body

    def get_content_type(self, filename):
        # Guess a MIME type from the filename; unknown types fall back to the
        # generic binary type.
        return mimetypes.guess_type(filename)[0] or 'application/octet-stream'

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: llazzaro
# date : Aug 15, 2010