Return to Snippet

Revision: 72840
at February 15, 2018 10:38 by martinson


Initial Code
---
config:
    agent: Firefox  # presumably selects the browser user-agent profile; confirm
    debug: 2  # presumably the log verbosity level; confirm
do:
- variable_set:
    field: username  # substituted as <%username%> in the login POST
    value: YOU_ACCOUNT_USERNAME_HERE
- variable_set:
    field: password  # substituted as <%password%> in the login POST
    value: YOU_ACCOUNT_PASSWORD_HERE
- variable_set:
    field: accounts  # split on ',' after login; one profile lookup per entry
    value: LIST OF USERNAMES YOU WANT TO EXTRACT, COMMA SEPARATED
# Overall flow, driven by the three variables set above:
#   1. Load the Instagram home page and read the CSRF token from the
#      window._sharedData JSON embedded in the page body.
#   2. POST the credentials to the AJAX login endpoint and stop on failure.
#   3. Copy the session cookies into variables for the later API request.
#   4. For every name in `accounts`, look up the user id, request that user's
#      info from i.instagram.com and save one `item` object per profile.
# If a value you put into the variables above contains YAML special characters
# (for example ': ' or ' #'), wrap it in single quotes.
- walk:
    to: https://www.instagram.com/
    do:
    - find:
        path: body
        do:
        # Capture the object assigned to window._sharedData (everything up to
        # the first ';'), convert it from JSON to XML and make it the current
        # block so it can be queried with paths.
        - parse:
            filter: window\._sharedData\s+\=\s+([^;]+);
        - normalize:
            routine: json2xml
        - to_block
        - find:
            path: config>csrf_token
            do:
            - parse
            - variable_set: token
        # Log in. Header values are quoted so they stay literal strings
        # instead of being resolved as YAML numbers.
        - walk:
            to:
                post: https://www.instagram.com/accounts/login/ajax/
                headers:
                    x-csrftoken: <%token%>
                    x-instagram-ajax: '1'
                    x-requested-with: XMLHttpRequest
                data:
                    username: <%username%>
                    password: <%password%>
            do:
            # A "fail" status stops the run.
            # NOTE(review): the bare name below is presumably emitted as a
            # diagnostic message; confirm how the engine treats it.
            - find:
                path: status
                do:
                - parse
                - if:
                    match: "fail"
                    do:
                    - cannot_login_probably_checkpoint_is_required
                    - exit
            - find:
                path: authenticated
                do:
                - parse
                # Stop when the value does not match "true".
                - if:
                    match: "true"
                    else:
                    - wrong_login_or_password
                    - exit
                # Keep the session cookies; they are replayed in the Cookie
                # header of the i.instagram.com request further down.
                - cookie_get: mid
                - variable_set: mid
                - cookie_get: rur
                - variable_set: rur
                - cookie_get: ds_user_id
                - variable_set: dsuserid
                - cookie_get: sessionid
                - variable_set: sessionid
                # Split the comma-separated account list and iterate over the
                # resulting div.splitted elements, one per account name.
                - variable_get: accounts
                - to_block
                - split:
                    context: text
                    delimiter: ','
                - find:
                    path: div.splitted
                    do:
                    - parse
                    - space_dedupe
                    - trim
                    - variable_set: account
                    # Fetch the profile's ?__a=1 response to obtain the user id.
                    - walk:
                        to: https://www.instagram.com/<%account%>/?__a=1
                        do:
                        - find:
                            path: graphql > user > id
                            do:
                            - parse
                            - variable_set: id
                            # Request the user's info with the session cookies
                            # captured after login. Numeric-looking header
                            # values are quoted: unquoted, YAML would read them
                            # as int/float (e.g. -1.000 as a float), so the
                            # text sent would depend on how it is re-serialised.
                            - walk:
                                to: https://i.instagram.com/api/v1/users/<%id%>/info/
                                headers:
                                    X-IG-App-ID: '567067343352427'
                                    X-IG-Capabilities: 3brDAw==
                                    X-IG-Connection-Type: WIFI
                                    X-IG-Connection-Speed: '3400'
                                    X-IG-Bandwidth-Speed-KBPS: '-1.000'
                                    X-IG-Bandwidth-TotalBytes-B: '0'
                                    X-IG-Bandwidth-TotalTime-MS: '0'
                                    Cookie: mid=<%mid%>; csrftoken=<%token%>; rur=<%rur%>; ds_user_id=<%dsuserid%>; sessionid=<%sessionid%>; ig_or=;
                                    X-FB-HTTP-Engine: Liger
                                    Accept: '*/*'
                                    Accept-Language: en-US
                                do:
                                - find:
                                    path: body_safe > user
                                    do:
                                    # One output object per profile. Each find
                                    # below copies a single field of the user
                                    # node into it after the space_dedupe and
                                    # trim clean-up steps.
                                    - object_new: item
                                    - find:
                                        path: address_street
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: address_street
                                    - find:
                                        path: category
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: category
                                    - find:
                                        path: city_name
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: city_name
                                    - find:
                                        path: contact_phone_number
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: contact_phone_number
                                    - find:
                                        path: external_url
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: external_url
                                    - find:
                                        path: full_name
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: full_name
                                    - find:
                                        path: is_business
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: is_business
                                    - find:
                                        path: latitude
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: latitude
                                    - find:
                                        path: longitude
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: longitude
                                    # The source field `pk` is stored under
                                    # the output field name `id`.
                                    - find:
                                        path: pk
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: id
                                    - find:
                                        path: public_email
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: public_email
                                    - find:
                                        path: public_phone_country_code
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: public_phone_country_code
                                    - find:
                                        path: public_phone_number
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: public_phone_number
                                    - find:
                                        path: username
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: username
                                    - find:
                                        path: zip
                                        do:
                                        - parse
                                        - space_dedupe
                                        - trim
                                        - object_field_set:
                                            object: item
                                            field: zip
                                    - object_save:
                                        name: item
                    # Pause after each account before the next one is
                    # processed (presumably 5 seconds; confirm the unit).
                    - sleep: 5

Initial URL
https://www.diggernaut.com

Initial Description
To use this free scraper for Instagram business profiles, you need an account with the [Diggernaut web scraping service](https://www.diggernaut.com/). The scraper lets you extract contact details from business profiles and also indicates whether or not a profile is a business profile.

The scraper uses the mobile API, so you will need to provide an Instagram login and password. **MAKE SURE YOU DON'T USE YOUR MAIN ACCOUNT**. This API usage is unofficial, and you use it at your own risk.

To use it, log in to your Diggernaut account, create a project, then create a digger, click the "Add configuration" button, and copy and paste the scraper code there.

You need to set your Instagram username on line 8, your Instagram password on line 11, and the list of usernames you want to retrieve data for (as a comma-separated list) on line 14.

Then save your configuration and run the digger. After some time, you should be able to download the data.

Initial Title
Instagram Business Profile Scraper

Initial Tags
free, web, Business

Initial Language
Other