Revision: 70910
Updated Code
at September 10, 2017 03:02 by martinson
Updated Code
---
config:
  agent: Firefox
iterator:
  - type: fieldset
    fields:
      - user: someusername
# Diggernaut configuration that opens a public Instagram profile page,
# converts the embedded window._sharedData JSON to XML, stores profile fields
# and media nodes in a `user` object, then pages through further media via the
# GraphQL endpoint while the `repeat` variable is 'yes'.
# NOTE(review): the published source was collapsed onto a few long lines, so
# the nesting below is reconstructed from command order - verify before running.
do:
  - walk:
      to: https://www.instagram.com/<%user%>/
      do:
        # Reset paging state for this user.
        - variable_set:
            field: repeat
            value: 'no'
        - variable_clear: queryid
        # First pass over the page: capture the CSRF token and the `mid` cookie.
        - find:
            path: body
            do:
              - parse:
                  filter: window\._sharedData\s+\=\s+([^;]+);
              - normalize:
                  routine: json2xml
              - to_block
              - find:
                  path: config>csrf_token
                  do:
                    - parse
                    - variable_set: token
              - cookie_get: mid
              - variable_set: mid
        # Locate the Commons.js bundle and extract the numeric queryId from it.
        - find:
            path: script[type="text/javascript"]
            do:
              - parse:
                  attr: src
              - if:
                  match: Commons\.js
                  do:
                    - normalize:
                        routine: url
                    - walk:
                        to: value
                        headers:
                          # Fixed: header name was misspelled "Cookue".
                          Cookie: csrftoken=<%token%>; mid=<%mid%>;
                        do:
                          - find:
                              path: script
                              do:
                                - parse
                                - normalize:
                                    routine: unescape_html
                                - filter:
                                    args: return\s*e\.profilePosts\.byUserId\.get\(t\)\.pagination\}\,queryId\:\"(\d+)\"
                                - if:
                                    match: \d+
                                    do:
                                      - variable_set: queryid
        # Second pass over the page: build the `user` object.
        - find:
            path: body
            do:
              - cookie_get: mid
              - variable_set: mid
              - parse:
                  filter: window\._sharedData\s+\=\s+([^;]+);
              - normalize:
                  routine: json2xml
              - to_block
              - find:
                  path: config>csrf_token
                  do:
                    - parse
                    - variable_set: token
              - find:
                  path: entry_data>profilepage
                  do:
                    - register_set: https://www.instagram.com/p
                    - variable_set: baseurl
                    - object_new: user
                    - find:
                        path: user>id
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: id
                          - variable_set: userid
                    - find:
                        path: user>username
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: username
                    - find:
                        path: user>full_name
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: full_name
                    - find:
                        path: user>biography
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: biography
                    - find:
                        path: user>profile_pic_url
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: profile_pic_url
                    - find:
                        path: user>profile_pic_url_hd
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: profile_pic_url_hd
                    - find:
                        path: user>external_url
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: external_url
                    - find:
                        path: user>external_url_linkshimmed
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: external_url_linkshimmed
                    - find:
                        path: user>country_block
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: country_block
                    - find:
                        path: user>follows>count
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: follows
                    - find:
                        path: user>followed_by>count
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: followed_by
                    # Media nodes embedded in the profile page itself.
                    - find:
                        path: user>media>nodes
                        do:
                          - object_new: nodes
                          - find:
                              path: id
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: id
                          - find:
                              path: is_video
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: is_video
                          - find:
                              path: video_views
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: video_views
                          - find:
                              path: date
                              do:
                                - parse
                                - normalize:
                                    routine: date_format
                                    args:
                                      format_in: '%s'
                                      format_out: '%Y-%m-%d %H:%M:%S'
                                - object_field_set:
                                    object: nodes
                                    field: date
                          - find:
                              path: dimensions>width
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: width
                          - find:
                              path: dimensions>height
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: height
                          - find:
                              path: likes>count
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: likes_count
                          - find:
                              path: comments>count
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: comments_count
                          - find:
                              path: comments_disabled
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: comments_disabled
                          - find:
                              path: caption_safe
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: caption
                          - find:
                              path: thumbnail_src
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: thumbnail
                          - find:
                              path: display_src
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: media
                          - object_save:
                              name: nodes
                              to: user
                    # Paging info from the first page: sets `repeat` and `cursor`.
                    - find:
                        path: user>media>page_info
                        do:
                          - find:
                              path: has_next_page
                              do:
                                - parse
                                - if:
                                    match: true
                                    do:
                                      - variable_set:
                                          field: repeat
                                          value: 'yes'
                          - find:
                              path: end_cursor
                              do:
                                - parse
                                - eval:
                                    routine: js
                                    body: '(function () {return encodeURIComponent("<%register%>")})();'
                                - variable_set: cursor
                    # GraphQL paging; each response resets and re-evaluates `repeat`.
                    - walk:
                        to: https://www.instagram.com/graphql/query/?query_id=<%queryid%>&variables=%7B%22id%22%3A%22<%userid%>%22%2C%22first%22%3A12%2C%22after%22%3A%22<%cursor%>%22%7D
                        repeat: <%repeat%>
                        headers:
                          accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                          accept-language: ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4
                          cache-control: max-age=0
                          upgrade-insecure-requests: 1
                          Cookie: mid=<%mid%>;
                        do:
                          - sleep: 5
                          - variable_set:
                              field: repeat
                              value: 'no'
                          - find:
                              path: edge_owner_to_timeline_media>page_info
                              do:
                                - find:
                                    path: has_next_page
                                    do:
                                      - parse
                                      - if:
                                          match: true
                                          do:
                                            - variable_set:
                                                field: repeat
                                                value: 'yes'
                                - find:
                                    path: end_cursor
                                    do:
                                      - parse
                                      - eval:
                                          routine: js
                                          body: '(function () {return encodeURIComponent("<%register%>")})();'
                                      - variable_set: cursor
                          - find:
                              path: edge_owner_to_timeline_media>count
                              do:
                                - parse
                                - object_field_set:
                                    object: user
                                    field: media_count
                          - find:
                              path: edge_owner_to_timeline_media>edges>node
                              do:
                                - object_new: nodes
                                - find:
                                    path: id
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: id
                                - find:
                                    path: is_video
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: is_video
                                - find:
                                    path: video_views
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: video_views
                                - find:
                                    path: taken_at_timestamp
                                    do:
                                      - parse
                                      - normalize:
                                          routine: date_format
                                          args:
                                            format_in: '%s'
                                            format_out: '%Y-%m-%d %H:%M:%S'
                                      - object_field_set:
                                          object: nodes
                                          field: date
                                - find:
                                    path: dimensions>width
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: width
                                - find:
                                    path: dimensions>height
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: height
                                - find:
                                    path: edge_media_preview_like>count
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: likes_count
                                - find:
                                    path: edge_media_to_comment>count
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: comments_count
                                - find:
                                    path: comments_disabled
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: comments_disabled
                                - find:
                                    path: edge_media_to_caption
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: caption
                                - find:
                                    path: thumbnail_src
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: thumbnail
                                - find:
                                    path: display_url
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: media
                                - object_save:
                                    name: nodes
                                    to: user
                    - object_save:
                        name: user
Revision: 70909
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at September 13, 2016 22:46 by martinson
Initial Code
---
config:
  agent: Firefox
iterator:
  - type: fieldset
    fields:
      - user: somusername
        itemstoget: 30
        mode: simple
# Diggernaut configuration that opens a public Instagram profile page,
# converts the embedded window._sharedData JSON to XML, stores profile fields in
# a `user` object, then POSTs to /query/ for media nodes. When the `mode`
# argument matches "extended", each post page is also visited to collect
# comments and likes.
# NOTE(review): the published source was collapsed onto a few long lines, so
# the nesting below (in particular `headers`/`data` under the POST `to:`) is
# reconstructed from command order - verify before running.
do:
  - walk:
      to: https://www.instagram.com/<%user%>/
      do:
        - find:
            path: body
            do:
              - parse:
                  filter: window\._sharedData\s+\=\s+([^;]+);
              - normalize:
                  routine: json2xml
              - to_block
              - find:
                  path: config>csrf_token
                  do:
                    - parse
                    - variable_set: token
              - find:
                  path: entry_data>profilepage
                  do:
                    - register_set: https://www.instagram.com/p
                    - variable_set: baseurl
                    - object_new: user
                    - find:
                        path: user>id
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: id
                          - variable_set: userid
                    - find:
                        path: user>username
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: username
                    - find:
                        path: user>full_name
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: full_name
                    - find:
                        path: user>biography
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: biography
                    - find:
                        path: user>profile_pic_url
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: profile_pic_url
                    - find:
                        path: user>profile_pic_url_hd
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: profile_pic_url_hd
                    - find:
                        path: user>external_url
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: external_url
                    - find:
                        path: user>external_url_linkshimmed
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: external_url_linkshimmed
                    - find:
                        path: user>country_block
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: country_block
                    - find:
                        path: user>follows>count
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: follows
                    - find:
                        path: user>followed_by>count
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: followed_by
                    - find:
                        path: user>media>page_info>start_cursor
                        do:
                          - parse
                          - variable_set: cursor
                    # `itemstoget: all` uses the profile's media count; any other
                    # value is stored in items_get as given.
                    - argument_get: itemstoget
                    - if:
                        match: all
                        do:
                          - find:
                              path: user>media>count
                              do:
                                - parse
                                - variable_set: items_get
                        else:
                          - variable_set: items_get
                    - walk:
                        to:
                          post: https://www.instagram.com/query/
                          headers:
                            x-csrftoken: <%token%>
                            x-instagram-ajax: 1
                            x-requested-with: XMLHttpRequest
                          data:
                            q: 'ig_user(<%userid%>) { media.after(<%cursor%>, <%items_get%>) { count, nodes { caption, code, comments { count }, comments_disabled, date, dimensions { height, width }, display_src, id, is_video, likes { count }, owner { id }, thumbnail_src, video_views }, page_info } }'
                            ref: 'users::show'
                        do:
                          - find:
                              path: media>count
                              do:
                                - parse
                                - object_field_set:
                                    object: user
                                    field: media_count
                          - find:
                              path: media>nodes
                              do:
                                - object_new: nodes
                                - find:
                                    path: id
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: id
                                - find:
                                    path: is_video
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: is_video
                                - find:
                                    path: video_views
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          # Fixed: field name was misspelled "vide_views".
                                          field: video_views
                                - find:
                                    path: date
                                    do:
                                      - parse
                                      - normalize:
                                          routine: date_format
                                          args:
                                            format_in: '%s'
                                            format_out: '%Y-%m-%d %H:%M:%S'
                                      - object_field_set:
                                          object: nodes
                                          field: date
                                - find:
                                    path: dimensions>width
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: width
                                - find:
                                    path: dimensions>height
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: height
                                - find:
                                    path: likes>count
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: likes_count
                                - find:
                                    path: comments>count
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: comments_count
                                - find:
                                    path: comments_disabled
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: comments_disabled
                                - find:
                                    path: caption_safe
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: caption
                                - find:
                                    path: thumbnail_src
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: thumbnail
                                - find:
                                    path: display_src
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: media
                                # Post URL = baseurl + "/" + code; kept in node_url
                                # for the optional extended-mode walk below.
                                - find:
                                    path: code
                                    do:
                                      - parse
                                      - variable_prepend:
                                          field: baseurl
                                          joinby: "/"
                                      - object_field_set:
                                          object: nodes
                                          field: url
                                      - variable_set: node_url
                                - argument_get: mode
                                - if:
                                    match: extended
                                    do:
                                      - variable_get: node_url
                                      - walk:
                                          to: value
                                          do:
                                            - find:
                                                path: body
                                                do:
                                                  - parse:
                                                      filter: window\._sharedData\s+\=\s+([^;]+);
                                                  - normalize:
                                                      routine: json2xml
                                                  - to_block
                                                  - find:
                                                      path: entry_data>postpage>media>comments>nodes
                                                      do:
                                                        - object_new: comments
                                                        - find:
                                                            path: '>id'
                                                            do:
                                                              - parse
                                                              - object_field_set:
                                                                  object: comments
                                                                  field: id
                                                        - find:
                                                            path: user>id
                                                            do:
                                                              - parse
                                                              - object_field_set:
                                                                  object: comments
                                                                  field: user_id
                                                        - find:
                                                            path: user>username
                                                            do:
                                                              - parse
                                                              - object_field_set:
                                                                  object: comments
                                                                  field: username
                                                        - find:
                                                            path: user>profile_pic_url
                                                            do:
                                                              - parse
                                                              - object_field_set:
                                                                  object: comments
                                                                  field: user_profile_pic
                                                        - find:
                                                            path: text
                                                            do:
                                                              - parse
                                                              - object_field_set:
                                                                  object: comments
                                                                  field: text
                                                        - find:
                                                            path: created_at
                                                            do:
                                                              - parse:
                                                                  filter: (\d+)\.\d+
                                                              - normalize:
                                                                  routine: date_format
                                                                  args:
                                                                    format_in: '%s'
                                                                    format_out: '%Y-%m-%d %H:%M:%S'
                                                              - object_field_set:
                                                                  object: comments
                                                                  field: date
                                                        - object_save:
                                                            name: comments
                                                            to: nodes
                                                  - find:
                                                      path: entry_data>postpage>media>likes>nodes
                                                      do:
                                                        - object_new: likes
                                                        - find:
                                                            path: user>id
                                                            do:
                                                              - parse
                                                              - object_field_set:
                                                                  object: likes
                                                                  field: user_id
                                                        - find:
                                                            path: user>username
                                                            do:
                                                              - parse
                                                              - object_field_set:
                                                                  object: likes
                                                                  field: username
                                                        - find:
                                                            path: user>profile_pic_url
                                                            do:
                                                              - parse
                                                              - object_field_set:
                                                                  object: likes
                                                                  field: user_profile_pic
                                                        - object_save:
                                                            name: likes
                                                            to: nodes
                                - object_save:
                                    name: nodes
                                    to: user
                    - object_save:
                        name: user
Initial URL
https://www.diggernaut.com
Initial Description
Hey guys, I'm sharing my Diggernaut scripts for web scraping; I hope they will be useful to you. This script scrapes users' accounts without logging in to Instagram, so there is no risk. What you can get with it: all information about the user (full name, username, id, avatar, number of follows and followers, number of posts) and information about their posts (url, image, number of likes, information about the people who liked it, comments, caption). You will probably be interested in getting not all posts but just, let's say, the 10 (or 30) most recent; you can adjust this with the settings. You can also set the script's mode to simple or extended. In simple mode it will not retrieve the comments or the list of people who liked a post, so it works faster and uses less bandwidth. If you look into the script (lines 5-9): - type: fieldset fields: - user: somusername you can see the settings you may adjust; instead of "somusername" you need to set the username of the Instagram account you want to scrape. You can set multiple users to scrape; in this case you need to add additional settings chunks, like below: - type: fieldset fields: - user: somusername1 - user: somusername2
Initial Title
Public Instagram User Scraper
Initial Tags
data, web
Initial Language
Other