1

The snippet below is part of an HTML page. I need to scrape the data but am not sure what the most reliable way would be. Parsing it as JSON would be ideal, but I'm not sure whether the following can be converted to JSON. Is a regular expression my only choice?

<script type="text/javascript"> 

    // Start with an empty global list; it is replaced by the array built below.
    window.arMailRuMessages = [];

    // Immediately-invoked function whose return value (an array of message
    // objects) is assigned to arMailRuMessages. Note that this is a JavaScript
    // literal, not JSON: keys are unquoted and several values are produced by
    // calling u(...) / m(...) at run time.
    arMailRuMessages = (function() {
        // k is not referenced anywhere in this snippet.
        var k = 1024,
            // Alias for ajs.Html.unescape; `ajs` is defined outside this snippet
            // (presumably an HTML-entity unescaper -- confirm against the page).
            u = ajs.Html.unescape,
            // Decodes data.text with decodeURIComponent, passes it through u,
            // and falls back to '' if either step throws.
            m = function(data) {
                try {
                    return u(decodeURIComponent(data.text));
                } catch (e) {}
                return '';
            };

        return [

            {
                id: "14412430340000000392",
                prev: "",
                // Same value as the id of the second entry below.
                next: "14412428590000000596",
                subject: u("hi"),
                date: "1441243034",
                // `| 0` coerces the string "3" to the integer 3.
                size: "3" | 0,
                folder: "0",
                correspondents: {
                    from: [{
                        name: u("firstname lastname"),
                        email: u("[email protected]"),
                        avatars: {
                            "default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&amp;trust=true&amp;user=firstname%40mail.ru&amp;sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
                        }
                    }],
                    to: [{
                        name: u(""),
                        email: u("[email protected]"),
                        avatars: {
                            "default": u("")
                        }
                    }],
                    cc: []
                },
                flags: {
                    spf: true,
                    unread: true,
                    flagged: false,
                    reply: false,
                    forward: false,
                    attach: false
                },
                // Only the "text" property is read by m; "ntype" is ignored by it.
                snippet: m({
                    "ntype": "letter",
                    "text": "thisisaford"
                }),
                priority: 3
            }, {
                id: "14412428590000000596",
                // Same value as the id of the first entry above.
                prev: "14412430340000000392",
                next: "",
                subject: u("hi"),
                date: "1441242859",
                // `| 0` coerces the string "3" to the integer 3.
                size: "3" | 0,
                folder: "0",
                correspondents: {
                    from: [{
                        name: u("firstname lastname"),
                        email: u("[email protected]"),
                        avatars: {
                            "default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&amp;trust=true&amp;user=firstname%40mail.ru&amp;sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
                        }
                    }],
                    to: [{
                        name: u(""),
                        email: u("[email protected]"),
                        avatars: {
                            "default": u("")
                        }
                    }],
                    cc: []
                },
                flags: {
                    spf: true,
                    unread: true,
                    flagged: false,
                    reply: false,
                    forward: false,
                    attach: false
                },
                // Only the "text" property is read by m; "ntype" is ignored by it.
                snippet: m({
                    "ntype": "letter",
                    "text": "thisisatest"
                }),
                priority: 3
            }
        ];
    })();
    // Sets a flag on __log, which is defined outside this snippet.
    __log.letters_data_js = 1;
</script>
4
  • Scrape what data, exactly? The data that is in the return statement? If the script isn't running in strict mode, it is going to assign all the data you want to window.arMailRuMessages. Commented Sep 3, 2015 at 10:36
  • Java != Javascript. The only similarity between the two is that the first four letters in the name happen to be the same. Please don't confuse them. Commented Sep 3, 2015 at 10:43
  • @NickJ I know Java is not JavaScript. The reason I put the Java tag is because I will be using HtmlUnit with Java to scrape pages. Commented Sep 3, 2015 at 11:05
  • @SpringLearner because that's the language I will use to scrape Commented Sep 3, 2015 at 11:05

1 Answer 1

1

With HtmlUnit, you can use htmlPage.executeJavaScript and call getJavaScriptResult() on what it returns, which gives you an Object to manipulate.

Below is a complete example:

    // Load the page with HtmlUnit, evaluate the global `arMailRuMessages`
    // in the page's JavaScript context, and print each message's id followed
    // by the name of its first "from" correspondent.
    try (final WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        String url = "http://localhost/test.html";
        HtmlPage htmlPage = webClient.getPage(url);

        // Evaluating the bare identifier yields the array built by the page's script.
        Object scriptValue = htmlPage.executeJavaScript("arMailRuMessages").getJavaScriptResult();
        NativeArray messages = (NativeArray) scriptValue;

        final long messageCount = messages.getLength();
        for (int index = 0; index < messageCount; index++) {
            NativeObject message = (NativeObject) messages.get(index);

            // The id is a plain string literal in the page's script.
            String id = (String) message.get("id");
            System.out.println(id);

            // correspondents.from is an array of objects; take the first sender.
            NativeObject correspondents = (NativeObject) message.get("correspondents");
            NativeArray senders = (NativeArray) correspondents.get("from");
            NativeObject firstSender = (NativeObject) senders.get(0);
            Object senderName = firstSender.get("name");
            System.out.println(senderName);
        }
    }
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.