从 html 页面中的 Javascript 抓取数据
scraping data from Javascript in html page
下面的代码片段是 HTML 页面的一部分。我需要抓取数据但不确定什么是最可靠的方法。最好的方法是JSON,但我不确定下面是否可以转换为JSON。正则表达式是我唯一的选择吗?
<script type="text/javascript">
window.arMailRuMessages = [];
arMailRuMessages = (function() {
var k = 1024,
u = ajs.Html.unescape,
m = function(data) {
try {
return u(decodeURIComponent(data.text));
} catch (e) {}
return '';
};
return [
{
id: "14412430340000000392",
prev: "",
next: "14412428590000000596",
subject: u("hi"),
date: "1441243034",
size: "3" | 0,
folder: "0",
correspondents: {
from: [{
name: u("firstname lastname"),
email: u("firstname@gmail.com"),
avatars: {
"default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&trust=true&user=firstname%40mail.ru&sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
}
}],
to: [{
name: u(""),
email: u("firstname6000@mail.ru"),
avatars: {
"default": u("")
}
}],
cc: []
},
flags: {
spf: true,
unread: true,
flagged: false,
reply: false,
forward: false,
attach: false
},
snippet: m({
"ntype": "letter",
"text": "thisisaford"
}),
priority: 3
}, {
id: "14412428590000000596",
prev: "14412430340000000392",
next: "",
subject: u("hi"),
date: "1441242859",
size: "3" | 0,
folder: "0",
correspondents: {
from: [{
name: u("firstname lastname"),
email: u("firstname@gmail.com"),
avatars: {
"default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&trust=true&user=firstname%40mail.ru&sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
}
}],
to: [{
name: u(""),
email: u("firstname@mail.ru"),
avatars: {
"default": u("")
}
}],
cc: []
},
flags: {
spf: true,
unread: true,
flagged: false,
reply: false,
forward: false,
attach: false
},
snippet: m({
"ntype": "letter",
"text": "thisisatest"
}),
priority: 3
}
];
})();
__log.letters_data_js = 1;
</script>
有了 HtmlUnit,您可以使用 htmlPage。executeJavaScript,它将 return 一个要操作的对象。
下面是一个完整的例子:
try (final WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
String url = "http://localhost/test.html";
HtmlPage htmlPage = webClient.getPage(url);
NativeArray array = (NativeArray) htmlPage.executeJavaScript("arMailRuMessages").getJavaScriptResult();
for (int i = 0; i < array.getLength(); i++) {
NativeObject object = (NativeObject) array.get(i);
String id = (String) object.get("id");
System.out.println(id);
NativeObject correspondents = (NativeObject) object.get("correspondents");
NativeArray from = (NativeArray) correspondents.get("from");
System.out.println(((NativeObject) from.get(0)).get("name"));
}
}
下面的代码片段是 HTML 页面的一部分。我需要抓取数据但不确定什么是最可靠的方法。最好的方法是JSON,但我不确定下面是否可以转换为JSON。正则表达式是我唯一的选择吗?
<script type="text/javascript">
window.arMailRuMessages = [];
arMailRuMessages = (function() {
var k = 1024,
u = ajs.Html.unescape,
m = function(data) {
try {
return u(decodeURIComponent(data.text));
} catch (e) {}
return '';
};
return [
{
id: "14412430340000000392",
prev: "",
next: "14412428590000000596",
subject: u("hi"),
date: "1441243034",
size: "3" | 0,
folder: "0",
correspondents: {
from: [{
name: u("firstname lastname"),
email: u("firstname@gmail.com"),
avatars: {
"default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&trust=true&user=firstname%40mail.ru&sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
}
}],
to: [{
name: u(""),
email: u("firstname6000@mail.ru"),
avatars: {
"default": u("")
}
}],
cc: []
},
flags: {
spf: true,
unread: true,
flagged: false,
reply: false,
forward: false,
attach: false
},
snippet: m({
"ntype": "letter",
"text": "thisisaford"
}),
priority: 3
}, {
id: "14412428590000000596",
prev: "14412430340000000392",
next: "",
subject: u("hi"),
date: "1441242859",
size: "3" | 0,
folder: "0",
correspondents: {
from: [{
name: u("firstname lastname"),
email: u("firstname@gmail.com"),
avatars: {
"default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&trust=true&user=firstname%40mail.ru&sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
}
}],
to: [{
name: u(""),
email: u("firstname@mail.ru"),
avatars: {
"default": u("")
}
}],
cc: []
},
flags: {
spf: true,
unread: true,
flagged: false,
reply: false,
forward: false,
attach: false
},
snippet: m({
"ntype": "letter",
"text": "thisisatest"
}),
priority: 3
}
];
})();
__log.letters_data_js = 1;
</script>
有了 HtmlUnit,您可以使用 htmlPage。executeJavaScript,它将 return 一个要操作的对象。
下面是一个完整的例子:
try (final WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
String url = "http://localhost/test.html";
HtmlPage htmlPage = webClient.getPage(url);
NativeArray array = (NativeArray) htmlPage.executeJavaScript("arMailRuMessages").getJavaScriptResult();
for (int i = 0; i < array.getLength(); i++) {
NativeObject object = (NativeObject) array.get(i);
String id = (String) object.get("id");
System.out.println(id);
NativeObject correspondents = (NativeObject) object.get("correspondents");
NativeArray from = (NativeArray) correspondents.get("from");
System.out.println(((NativeObject) from.get(0)).get("name"));
}
}