Revision: 70910
Updated Code
at September 10, 2017 03:02 by martinson
Updated Code
---
# Diggernaut scraper: public Instagram profile and its posts (no login).
# NOTE(review): this revision's indentation was lost where it was recovered
# from; the nesting below is reconstructed from the command order - confirm it
# against a known-good copy before running.
config:
  agent: Firefox
# NOTE(review): `iterator` is assumed to be a top-level section - confirm.
iterator:
  - type: fieldset
    fields:
      # Account to scrape; substituted into the profile URL as <%user%>.
      - user: someusername
do:
  - walk:
      to: https://www.instagram.com/<%user%>/
      do:
        # Pagination flag consumed by the `repeat` option of the GraphQL walk.
        - variable_set:
            field: repeat
            value: 'no'
        - variable_clear: queryid
        # First pass over the page: only the CSRF token is taken from the
        # embedded window._sharedData JSON.
        - find:
            path: body
            do:
              - parse:
                  filter: window\._sharedData\s+\=\s+([^;]+);
              - normalize:
                  routine: json2xml
              - to_block
              - find:
                  path: config>csrf_token
                  do:
                    - parse
                    - variable_set: token
        # NOTE(review): the nesting level of this pair is ambiguous in the
        # recovered text; it only copies the `mid` cookie into a variable.
        - cookie_get: mid
        - variable_set: mid
        # Find the Commons.js bundle and extract the numeric queryId for the
        # profile-posts pagination from its source.
        - find:
            path: script[type="text/javascript"]
            do:
              - parse:
                  attr: src
              - if:
                  match: Commons\.js
                  do:
                    - normalize:
                        routine: url
                    - walk:
                        to: value
                        headers:
                          # Fixed: was misspelled "Cookue", so the cookies
                          # were sent under an unknown header name.
                          Cookie: csrftoken=<%token%>; mid=<%mid%>;
                        do:
                          - find:
                              path: script
                              do:
                                - parse
                                - normalize:
                                    routine: unescape_html
                                - filter:
                                    args:
                                      return\s*e\.profilePosts\.byUserId\.get\(t\)\.pagination\}\,queryId\:\"(\d+)\"
                                # Store the value only when a number was
                                # actually extracted.
                                - if:
                                    match: \d+
                                    do:
                                      - variable_set: queryid
        # Second pass over the page: profile fields and the first page of
        # posts from window._sharedData.
        - find:
            path: body
            do:
              - cookie_get: mid
              - variable_set: mid
              - parse:
                  filter: window\._sharedData\s+\=\s+([^;]+);
              - normalize:
                  routine: json2xml
              - to_block
              - find:
                  path: config>csrf_token
                  do:
                    - parse
                    - variable_set: token
              - find:
                  path: entry_data>profilepage
                  do:
                    # `baseurl` is set here but not read anywhere in this
                    # revision.
                    - register_set: https://www.instagram.com/p
                    - variable_set: baseurl
                    - object_new: user
                    - find:
                        path: user>id
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: id
                          # Also kept as a variable for the GraphQL URL.
                          - variable_set: userid
                    - find:
                        path: user>username
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: username
                    - find:
                        path: user>full_name
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: full_name
                    - find:
                        path: user>biography
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: biography
                    - find:
                        path: user>profile_pic_url
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: profile_pic_url
                    - find:
                        path: user>profile_pic_url_hd
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: profile_pic_url_hd
                    - find:
                        path: user>external_url
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: external_url
                    - find:
                        path: user>external_url_linkshimmed
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: external_url_linkshimmed
                    - find:
                        path: user>country_block
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: country_block
                    - find:
                        path: user>follows>count
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: follows
                    - find:
                        path: user>followed_by>count
                        do:
                          - parse
                          - object_field_set:
                              object: user
                              field: followed_by
                    # Posts embedded in the profile page itself.
                    - find:
                        path: user>media>nodes
                        do:
                          - object_new: nodes
                          - find:
                              path: id
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: id
                          - find:
                              path: is_video
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: is_video
                          - find:
                              path: video_views
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: video_views
                          - find:
                              path: date
                              do:
                                - parse
                                - normalize:
                                    routine: date_format
                                    args:
                                      format_in: '%s'
                                      format_out: '%Y-%m-%d %H:%M:%S'
                                - object_field_set:
                                    object: nodes
                                    field: date
                          - find:
                              path: dimensions>width
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: width
                          - find:
                              path: dimensions>height
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: height
                          - find:
                              path: likes>count
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: likes_count
                          - find:
                              path: comments>count
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: comments_count
                          - find:
                              path: comments_disabled
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: comments_disabled
                          - find:
                              path: caption_safe
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: caption
                          - find:
                              path: thumbnail_src
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: thumbnail
                          - find:
                              path: display_src
                              do:
                                - parse
                                - object_field_set:
                                    object: nodes
                                    field: media
                          - object_save:
                              name: nodes
                              to: user
                    # Decide whether a further page exists and remember the
                    # URL-encoded cursor for it.
                    - find:
                        path: user>media>page_info
                        do:
                          - find:
                              path: has_next_page
                              do:
                                - parse
                                - if:
                                    # Quoted so the pattern stays a string
                                    # rather than a YAML boolean.
                                    match: 'true'
                                    do:
                                      - variable_set:
                                          field: repeat
                                          value: 'yes'
                          - find:
                              path: end_cursor
                              do:
                                - parse
                                - eval:
                                    routine: js
                                    body: '(function () {return encodeURIComponent("<%register%>")})();'
                                - variable_set: cursor
                    # Paginated GraphQL request, 12 posts per call. `repeat`
                    # is reset to 'no' on entry and switched back to 'yes'
                    # only while has_next_page matches true.
                    - walk:
                        to: https://www.instagram.com/graphql/query/?query_id=<%queryid%>&variables=%7B%22id%22%3A%22<%userid%>%22%2C%22first%22%3A12%2C%22after%22%3A%22<%cursor%>%22%7D
                        repeat: <%repeat%>
                        headers:
                          accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                          accept-language: ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4
                          cache-control: max-age=0
                          # Quoted: header values are strings.
                          upgrade-insecure-requests: '1'
                          Cookie: mid=<%mid%>;
                        do:
                          - sleep: 5
                          - variable_set:
                              field: repeat
                              value: 'no'
                          - find:
                              path: edge_owner_to_timeline_media>page_info
                              do:
                                - find:
                                    path: has_next_page
                                    do:
                                      - parse
                                      - if:
                                          match: 'true'
                                          do:
                                            - variable_set:
                                                field: repeat
                                                value: 'yes'
                                - find:
                                    path: end_cursor
                                    do:
                                      - parse
                                      - eval:
                                          routine: js
                                          body: '(function () {return encodeURIComponent("<%register%>")})();'
                                      - variable_set: cursor
                          - find:
                              path: edge_owner_to_timeline_media>count
                              do:
                                - parse
                                - object_field_set:
                                    object: user
                                    field: media_count
                          # Same output fields as the embedded posts above,
                          # read from the GraphQL element names.
                          - find:
                              path: edge_owner_to_timeline_media>edges>node
                              do:
                                - object_new: nodes
                                - find:
                                    path: id
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: id
                                - find:
                                    path: is_video
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: is_video
                                - find:
                                    path: video_views
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: video_views
                                - find:
                                    path: taken_at_timestamp
                                    do:
                                      - parse
                                      - normalize:
                                          routine: date_format
                                          args:
                                            format_in: '%s'
                                            format_out: '%Y-%m-%d %H:%M:%S'
                                      - object_field_set:
                                          object: nodes
                                          field: date
                                - find:
                                    path: dimensions>width
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: width
                                - find:
                                    path: dimensions>height
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: height
                                - find:
                                    path: edge_media_preview_like>count
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: likes_count
                                - find:
                                    path: edge_media_to_comment>count
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: comments_count
                                - find:
                                    path: comments_disabled
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: comments_disabled
                                - find:
                                    path: edge_media_to_caption
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: caption
                                - find:
                                    path: thumbnail_src
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: thumbnail
                                - find:
                                    path: display_url
                                    do:
                                      - parse
                                      - object_field_set:
                                          object: nodes
                                          field: media
                                - object_save:
                                    name: nodes
                                    to: user
                    - object_save:
                        name: user
Revision: 70909
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at September 13, 2016 22:46 by martinson
Initial Code
---
config:
agent: Firefox
iterator:
- type: fieldset
fields:
- user: somusername
itemstoget: 30
mode: simple
do:
- walk:
to: https://www.instagram.com/<%user%>/
do:
- find:
path: body
do:
- parse:
filter: window\._sharedData\s+\=\s+([^;]+);
- normalize:
routine: json2xml
- to_block
- find:
path: config>csrf_token
do:
- parse
- variable_set: token
- find:
path: entry_data>profilepage
do:
- register_set: https://www.instagram.com/p
- variable_set: baseurl
- object_new: user
- find:
path: user>id
do:
- parse
- object_field_set:
object: user
field: id
- variable_set: userid
- find:
path: user>username
do:
- parse
- object_field_set:
object: user
field: username
- find:
path: user>full_name
do:
- parse
- object_field_set:
object: user
field: full_name
- find:
path: user>biography
do:
- parse
- object_field_set:
object: user
field: biography
- find:
path: user>profile_pic_url
do:
- parse
- object_field_set:
object: user
field: profile_pic_url
- find:
path: user>profile_pic_url_hd
do:
- parse
- object_field_set:
object: user
field: profile_pic_url_hd
- find:
path: user>external_url
do:
- parse
- object_field_set:
object: user
field: external_url
- find:
path: user>external_url_linkshimmed
do:
- parse
- object_field_set:
object: user
field: external_url_linkshimmed
- find:
path: user>country_block
do:
- parse
- object_field_set:
object: user
field: country_block
- find:
path: user>follows>count
do:
- parse
- object_field_set:
object: user
field: follows
- find:
path: user>followed_by>count
do:
- parse
- object_field_set:
object: user
field: followed_by
- find:
path: user>media>page_info>start_cursor
do:
- parse
- variable_set: cursor
- argument_get: itemstoget
- if:
match: all
do:
- find:
path: user>media>count
do:
- parse
- variable_set: items_get
else:
- variable_set: items_get
- walk:
to:
post: https://www.instagram.com/query/
headers:
x-csrftoken: <%token%>
x-instagram-ajax: 1
x-requested-with: XMLHttpRequest
data:
q: 'ig_user(<%userid%>) { media.after(<%cursor%>, <%items_get%>) {
count,
nodes {
caption,
code,
comments {
count
},
comments_disabled,
date,
dimensions {
height,
width
},
display_src,
id,
is_video,
likes {
count
},
owner {
id
},
thumbnail_src,
video_views
},
page_info
}
}'
ref: 'users::show'
do:
- find:
path: media>count
do:
- parse
- object_field_set:
object: user
field: media_count
- find:
path: media>nodes
do:
- object_new: nodes
- find:
path: id
do:
- parse
- object_field_set:
object: nodes
field: id
- find:
path: is_video
do:
- parse
- object_field_set:
object: nodes
field: is_video
- find:
path: video_views
do:
- parse
- object_field_set:
object: nodes
field: vide_views
- find:
path: date
do:
- parse
- normalize:
routine: date_format
args:
format_in: '%s'
format_out: '%Y-%m-%d %H:%M:%S'
- object_field_set:
object: nodes
field: date
- find:
path: dimensions>width
do:
- parse
- object_field_set:
object: nodes
field: width
- find:
path: dimensions>height
do:
- parse
- object_field_set:
object: nodes
field: height
- find:
path: likes>count
do:
- parse
- object_field_set:
object: nodes
field: likes_count
- find:
path: comments>count
do:
- parse
- object_field_set:
object: nodes
field: comments_count
- find:
path: comments_disabled
do:
- parse
- object_field_set:
object: nodes
field: comments_disabled
- find:
path: caption_safe
do:
- parse
- object_field_set:
object: nodes
field: caption
- find:
path: thumbnail_src
do:
- parse
- object_field_set:
object: nodes
field: thumbnail
- find:
path: display_src
do:
- parse
- object_field_set:
object: nodes
field: media
- find:
path: code
do:
- parse
- variable_prepend:
field: baseurl
joinby: "/"
- object_field_set:
object: nodes
field: url
- variable_set: node_url
- argument_get: mode
- if:
match: extended
do:
- variable_get: node_url
- walk:
to: value
do:
- find:
path: body
do:
- parse:
filter: window\._sharedData\s+\=\s+([^;]+);
- normalize:
routine: json2xml
- to_block
- find:
path: entry_data>postpage>media>comments>nodes
do:
- object_new: comments
- find:
path: '>id'
do:
- parse
- object_field_set:
object: comments
field: id
- find:
path: user>id
do:
- parse
- object_field_set:
object: comments
field: user_id
- find:
path: user>username
do:
- parse
- object_field_set:
object: comments
field: username
- find:
path: user>profile_pic_url
do:
- parse
- object_field_set:
object: comments
field: user_profile_pic
- find:
path: text
do:
- parse
- object_field_set:
object: comments
field: text
- find:
path: created_at
do:
- parse:
filter: (\d+)\.\d+
- normalize:
routine: date_format
args:
format_in: '%s'
format_out: '%Y-%m-%d %H:%M:%S'
- object_field_set:
object: comments
field: date
- object_save:
name: comments
to: nodes
- find:
path: entry_data>postpage>media>likes>nodes
do:
- object_new: likes
- find:
path: user>id
do:
- parse
- object_field_set:
object: likes
field: user_id
- find:
path: user>username
do:
- parse
- object_field_set:
object: likes
field: username
- find:
path: user>profile_pic_url
do:
- parse
- object_field_set:
object: likes
field: user_profile_pic
- object_save:
name: likes
to: nodes
- object_save:
name: nodes
to: user
- object_save:
name: user
Initial URL
https://www.diggernaut.com
Initial Description
Hey guys,
I am sharing my Diggernaut scripts for web scraping; I hope they will be useful to you.
This is a script for scraping users' accounts without logging in to Instagram, so there is no risk. What you can get with it: all information about the user (full name, username, id, avatar, number of follows and followers, number of posts) and information about the user's posts (URL, image, number of likes, information about the people who liked the post, comments, caption). You will probably be interested in getting not all posts but only, let's say, the 10 (or 30) most recent ones. You can adjust this in the settings. You can also set the script's mode to simple or extended. In simple mode it does not retrieve the comments or the list of people who liked each post, so it works faster and uses less bandwidth.
If you look at the script (lines 5-9):
- type: fieldset
fields:
- user: someusername
you can see the settings you may adjust: instead of "someusername", set the username of the Instagram account you want to scrape. You can set multiple users to scrape; in this case you need to add additional settings chunks, like below:
- type: fieldset
fields:
- user: someusername1
- user: someusername2
Initial Title
Public Instagram User Scraper
Initial Tags
data, web
Initial Language
Other