从 Playstation 官方网站抓取奖杯数据
Scraping Trophy Data from The Official Playstation Website
我正在尝试使用 PhantomJS 从 http://my.playstation.com/logged-in/trophies/public-trophies/
中抓取奖杯数据
该页面要求输入有效的用户名,然后单击 'go',页面才会加载数据。我已经让它部分工作了,但奖杯数据始终没有加载到 div 中。我猜是我遗漏了某些与 ajax 相关的东西,才导致了这种情况?
// The question's attempt, kept exactly as asked: fill in the user id, click the
// button, then spin inside the page context until #trophyTrophyList has content.
// NOTE(review): the loop below cannot finish; see the notes on the individual lines.
var fullpagehtml = page.evaluate(function()
{
document.getElementById("trophiesId").value = "<<valid user id>>";
// checkPTrophies(); the button click calls this function
// NOTE(review): .delay() only delays queued jQuery effects; it does not pause this script.
$('#btn_publictrophy').click().delay( 6000 );
console.log("\nWaiting for trophy list to load...");
// NOTE(review): the DOM property is innerHTML; innerHtml is undefined, and undefined == null is true.
var trophylist = document.getElementById("trophyTrophyList").innerHtml; // all the data I want ends up inside this div
var counter = 0; // delay and setTimeout won't work here, so this is the best I could think of
while (trophylist == null)
{
// Presumably the ajax query should kick in on the page and populate this div, but it doesn't.
// NOTE(review): this busy-wait blocks the page's single JavaScript thread, so the ajax callback cannot run.
trophylist = document.getElementById("trophyTrophyList").innerHtml;
counter ++;
if(counter == 1000000)
{
// Every million iterations, print what jQuery sees in the div and restart the count.
console.log($('#trophyTrophyList').html());
counter = 0;
}
}
return document.all[0].outerHTML;
});
.delay( 6000 )
在这里完全不起作用,正如 documentation 所说:
The .delay()
method is best for delaying between queued jQuery effects. Because it is limited—it doesn't, for example, offer a way to cancel the delay—.delay()
is not a replacement for JavaScript's native setTimeout
function, which may be more appropriate for certain use cases.
要等待,您必须在页面上下文之外执行此操作(忙等待在 JavaScript 中不起作用,因为它是单线程的):
// Step 1: inside the page context, fill in the user id and trigger the lookup.
page.evaluate(function() {
    var userIdField = document.getElementById("trophiesId");
    userIdField.value = "<<valid user id>>";
    // The button's click handler calls checkPTrophies().
    $('#btn_publictrophy').click();
});

console.log("\nWaiting for trophy list to load...");

// Step 2: wait outside the page context, then read the trophy list markup.
setTimeout(function() {
    var fullpagehtml = page.evaluate(function() {
        return document.getElementById("trophyTrophyList").innerHTML;
    });
}, 20000);
您可能还想使用 waitFor
等待 #trophyTrophyList
被填充,而不是使用 setTimeout
:
// Have waitFor hold off until #trophyTrophyList exists and has non-empty markup.
waitFor(function() {
    // The DOM is only reachable from the page context, hence page.evaluate.
    return page.evaluate(function() {
        var listEl = document.getElementById("trophyTrophyList");
        if (!listEl) {
            return listEl;
        }
        return listEl.innerHTML;
    });
}, function() {
    // TODO: get trophies
});
但这还不够,因为 #trophyTrophyList
被加载并不意味着其后代元素已经在 DOM 中。您必须找到某个能表明页面已充分加载的选择器,例如等待页面中出现 .trophy-image
。将 waitFor
函数的超时设为 20 秒对我有用。
// Wait for the first .trophy-image inside #trophyTrophyList as the signal that
// the list has been rendered, then print the list's markup.
waitFor(function() {
    return page.evaluate(function() {
        // Return a plain boolean: page.evaluate can only hand back
        // JSON-serializable values, not DOM nodes.
        return document.querySelector("#trophyTrophyList .trophy-image") !== null;
    });
}, function() {
    setTimeout(function() {
        var trophiesDiv = page.evaluate(function() {
            return document.getElementById("trophyTrophyList").innerHTML;
        });
        console.log(trophiesDiv);
    }, 1000); // wait a little longer
}, 20000); // the 20 second waitFor timeout mentioned in the text
不要忘记您需要 page.evaluate
才能真正访问 DOM。顺便说一句,它是 innerHTML
而不是 innerHtml
。
我正在尝试使用 PhantomJS 从 http://my.playstation.com/logged-in/trophies/public-trophies/
中抓取奖杯数据。该页面要求输入有效的用户名,然后单击 'go',页面才会加载数据。我已经让它部分工作了,但奖杯数据始终没有加载到 div 中。我猜是我遗漏了某些与 ajax 相关的东西,才导致了这种情况?
// The question's attempt, kept exactly as asked: fill in the user id, click the
// button, then spin inside the page context until #trophyTrophyList has content.
// NOTE(review): the loop below cannot finish; see the notes on the individual lines.
var fullpagehtml = page.evaluate(function()
{
document.getElementById("trophiesId").value = "<<valid user id>>";
// checkPTrophies(); the button click calls this function
// NOTE(review): .delay() only delays queued jQuery effects; it does not pause this script.
$('#btn_publictrophy').click().delay( 6000 );
console.log("\nWaiting for trophy list to load...");
// NOTE(review): the DOM property is innerHTML; innerHtml is undefined, and undefined == null is true.
var trophylist = document.getElementById("trophyTrophyList").innerHtml; // all the data I want ends up inside this div
var counter = 0; // delay and setTimeout won't work here, so this is the best I could think of
while (trophylist == null)
{
// Presumably the ajax query should kick in on the page and populate this div, but it doesn't.
// NOTE(review): this busy-wait blocks the page's single JavaScript thread, so the ajax callback cannot run.
trophylist = document.getElementById("trophyTrophyList").innerHtml;
counter ++;
if(counter == 1000000)
{
// Every million iterations, print what jQuery sees in the div and restart the count.
console.log($('#trophyTrophyList').html());
counter = 0;
}
}
return document.all[0].outerHTML;
});
.delay( 6000 )
在这里完全不起作用,正如 documentation 所说:
The
.delay()
method is best for delaying between queued jQuery effects. Because it is limited—it doesn't, for example, offer a way to cancel the delay—.delay()
is not a replacement for JavaScript's native setTimeout
function, which may be more appropriate for certain use cases.
要等待,您必须在页面上下文之外执行此操作(忙等待在 JavaScript 中不起作用,因为它是单线程的):
// Step 1: inside the page context, fill in the user id and trigger the lookup.
page.evaluate(function() {
    var userIdField = document.getElementById("trophiesId");
    userIdField.value = "<<valid user id>>";
    // The button's click handler calls checkPTrophies().
    $('#btn_publictrophy').click();
});

console.log("\nWaiting for trophy list to load...");

// Step 2: wait outside the page context, then read the trophy list markup.
setTimeout(function() {
    var fullpagehtml = page.evaluate(function() {
        return document.getElementById("trophyTrophyList").innerHTML;
    });
}, 20000);
您可能还想使用 waitFor
等待 #trophyTrophyList
被填充,而不是使用 setTimeout
:
// Have waitFor hold off until #trophyTrophyList exists and has non-empty markup.
waitFor(function() {
    // The DOM is only reachable from the page context, hence page.evaluate.
    return page.evaluate(function() {
        var listEl = document.getElementById("trophyTrophyList");
        if (!listEl) {
            return listEl;
        }
        return listEl.innerHTML;
    });
}, function() {
    // TODO: get trophies
});
但这还不够,因为 #trophyTrophyList
被加载并不意味着其后代元素已经在 DOM 中。您必须找到某个能表明页面已充分加载的选择器,例如等待页面中出现 .trophy-image
。将 waitFor
函数的超时设为 20 秒对我有用。
// Wait for the first .trophy-image inside #trophyTrophyList as the signal that
// the list has been rendered, then print the list's markup.
waitFor(function() {
    return page.evaluate(function() {
        // Return a plain boolean: page.evaluate can only hand back
        // JSON-serializable values, not DOM nodes.
        return document.querySelector("#trophyTrophyList .trophy-image") !== null;
    });
}, function() {
    setTimeout(function() {
        var trophiesDiv = page.evaluate(function() {
            return document.getElementById("trophyTrophyList").innerHTML;
        });
        console.log(trophiesDiv);
    }, 1000); // wait a little longer
}, 20000); // the 20 second waitFor timeout mentioned in the text
不要忘记您需要 page.evaluate
才能真正访问 DOM。顺便说一句,它是 innerHTML
而不是 innerHtml
。