如何按顺序发出 page.open() 请求?
How can I make my page.open() requests sequentially?
我对此进行了编码,但它仍然是非顺序的。我希望它会等到实际请求完成的功能,直到调用一个新的请求....但这是行不通的。
问题 1: page.open() 调用不是连续的,如您在此处所见:
6 protocol: https: type: Content
7 protocol: https: type: Content
8 protocol: https: type: Content
9 protocol: https: type: Content
LINE: https://www.roller.de/einrichten/
10 protocol: https: type: Content
11 protocol: https: type: Content
12 protocol: https: type: Content
LINE: https://www.roller.de/einrichten/anrichte/
LINE: https://www.roller.de/einrichten/arbeitsstuhl/
LINE: https://www.roller.de/einrichten/arbeitstisch/
LINE: https://www.roller.de/einrichten/armlehnstuehle/
LINE: https://www.roller.de/einrichten/badezimmermoebel
LINE: https://www.roller.de/einrichten/bistrostuehle/
LINE: https://www.roller.de/einrichten/buecherregal/
13 protocol: https: type: Content
14 protocol: https: type: Content
15 protocol: https: type: Content
16 protocol: https: type: Content
LINE: https://www.roller.de/einrichten/buerocontainer/
LINE: https://www.roller.de/einrichten/bueroregale/
17 protocol: https: type: Content
18 protocol: https: type: Content
LINE: 每个请求只应打印一次,但它出现了几次但没有 page.open 结果,导致早期 stream.atEnd() = true。如果是连续的,那应该是不可能的。
问题2:最后一行没有被占用,当我有一个包含100个链接(每行1个)的.txt文件时,打印了99个,一个没有
问题 3: 当我给它一个包含 1000 个 url 的列表时它崩溃了
问题 4:10 个链接 = 10 次打印,100 个链接 = 98 次打印,stream.atEnd() 确实出现了几次,500 个链接 = 497-498 次打印 + stream.atEnd() 问题,1000 个链接 = 崩溃
console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('100sitemap.txt', 'r');
var webPage = require('webpage');
var i = 1;
function nextPage() {
if (stream.atEnd()) {
//stream.close();
console.log("STREAM END: " + stream.atEnd());
console.log("FILE ENDS HERE");
//phantom.exit();
}
if (!stream.atEnd()) {
var line = stream.readLine();
console.log("LINE: " + line);
getRequest(line);
}
}
function getRequest(line2) {
//console.log(line);
var page = webPage.create();
page.settings.loadImages = false;
page.open(line2, function() {});
//console.log("page.open() " + line2);
//console.log("opened " + line2);
page.onResourceRequested = function(requestData, request) {
//console.log("BEFORE: " + requestData.url);
var match = requestData.url.match(/example.com\/ca/g)
//console.log("Match: " + match);
//console.log(request.url);
if (match != null) {
hasFound = true;
var targetString = decodeURI(JSON.stringify(requestData.url));
var klammerauf = targetString.indexOf("{");
var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
targetJSON = (decodeURIComponent(jsonobjekt));
var t = JSON.parse(targetJSON);
console.log(i + " " + t['groups'] + " " + t['campID']);
i++;
//console.log(targetJSON);
request.abort;
}
};
page.onLoadFinished = function(status) {
if (!hasFound) {
console.log(i + " :NOT FOUND: " + line2);
i++;
}
//request.abort();
page.close();
nextPage();
}
}
nextPage();
现在它可以使用这段代码,iFrame 似乎触发了 onLoadFinished() 两次,所以我检查了 hasOnLoadFinished 以防止函数中的多个条目(一次使用多个 page.open() 确实是PhantomJS 中的坏主意)。
请注意,由于未知原因(大多数时候没有错误消息,很少有 "QThread::start: Failed to create thread ()".
为防止出现这种情况,请使用 1.9.8 版本而不是 2.0,这似乎是一个错误,在 Github 上用转储填充了崩溃报告。
/edit 在 3836 个链接与 1.9.8 链接后崩溃,没有错误消息...... PHANTOM。
console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('linklist.de.txt', 'r');
var webPage = require('webpage');
var i = 1;
var hasFound = Array();
var hasonLoadFinished = Array();
function handle_page(link) {
var page = webPage.create();
page.settings.loadImages = false;
page.open(link, function() {});
page.onResourceRequested = function(requestData, request) {
var match = requestData.url.match(/example.com\/searchmeI'maString/g)
if (match != null) {
hasFound[link] = true;
var targetString = decodeURI(JSON.stringify(requestData.url));
var klammerauf = targetString.indexOf("{");
var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
targetJSON = (decodeURIComponent(jsonobjekt));
var t = JSON.parse(targetJSON);
console.log(i + " " + t + " " + t['id']);
//console.log(targetJSON);
//console.log("");
request.abort;
} else {
request.abort;
return;
}
};
page.onLoadFinished = function(status) {
if (!hasonLoadFinished[link]) {
hasonLoadFinished[link] = true;
//console.log(" " + status + " " + link);
//console.log("onLoadFinished()")
//setTimeout(function(){/* Look mah! No name! */},1000);
if (!hasFound[link]) {
console.log(i + " :NOT FOUND: " + link);
console.log("");
}
i++;
page.close();
nextPage();
}
}
};
function nextPage() {
var link = stream.readLine();
if (!link) {
end = Date.now();
console.log("");
console.log(((end - start) / 1000) + " Sekunden");
console.log("FILE ENDS HERE!!!");
phantom.exit(0);
}
hasFound[link] = false;
hasonLoadFinished[link] = false;
handle_page(link);
}
start = Date.now();
nextPage();
我对此进行了编码,但它仍然是非顺序的。我希望它会等到实际请求完成的功能,直到调用一个新的请求....但这是行不通的。
问题 1: page.open() 调用不是连续的,如您在此处所见:
6 protocol: https: type: Content 7 protocol: https: type: Content 8 protocol: https: type: Content 9 protocol: https: type: Content LINE: https://www.roller.de/einrichten/ 10 protocol: https: type: Content 11 protocol: https: type: Content 12 protocol: https: type: Content LINE: https://www.roller.de/einrichten/anrichte/ LINE: https://www.roller.de/einrichten/arbeitsstuhl/ LINE: https://www.roller.de/einrichten/arbeitstisch/ LINE: https://www.roller.de/einrichten/armlehnstuehle/ LINE: https://www.roller.de/einrichten/badezimmermoebel LINE: https://www.roller.de/einrichten/bistrostuehle/ LINE: https://www.roller.de/einrichten/buecherregal/ 13 protocol: https: type: Content 14 protocol: https: type: Content 15 protocol: https: type: Content 16 protocol: https: type: Content LINE: https://www.roller.de/einrichten/buerocontainer/ LINE: https://www.roller.de/einrichten/bueroregale/ 17 protocol: https: type: Content 18 protocol: https: type: Content
LINE: 每个请求只应打印一次,但它出现了几次但没有 page.open 结果,导致早期 stream.atEnd() = true。如果是连续的,那应该是不可能的。
问题2:最后一行没有被占用,当我有一个包含100个链接(每行1个)的.txt文件时,打印了99个,一个没有
问题 3: 当我给它一个包含 1000 个 url 的列表时它崩溃了
问题 4:10 个链接 = 10 次打印,100 个链接 = 98 次打印,stream.atEnd() 确实出现了几次,500 个链接 = 497-498 次打印 + stream.atEnd() 问题,1000 个链接 = 崩溃
console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('100sitemap.txt', 'r');
var webPage = require('webpage');
var i = 1;
function nextPage() {
if (stream.atEnd()) {
//stream.close();
console.log("STREAM END: " + stream.atEnd());
console.log("FILE ENDS HERE");
//phantom.exit();
}
if (!stream.atEnd()) {
var line = stream.readLine();
console.log("LINE: " + line);
getRequest(line);
}
}
function getRequest(line2) {
//console.log(line);
var page = webPage.create();
page.settings.loadImages = false;
page.open(line2, function() {});
//console.log("page.open() " + line2);
//console.log("opened " + line2);
page.onResourceRequested = function(requestData, request) {
//console.log("BEFORE: " + requestData.url);
var match = requestData.url.match(/example.com\/ca/g)
//console.log("Match: " + match);
//console.log(request.url);
if (match != null) {
hasFound = true;
var targetString = decodeURI(JSON.stringify(requestData.url));
var klammerauf = targetString.indexOf("{");
var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
targetJSON = (decodeURIComponent(jsonobjekt));
var t = JSON.parse(targetJSON);
console.log(i + " " + t['groups'] + " " + t['campID']);
i++;
//console.log(targetJSON);
request.abort;
}
};
page.onLoadFinished = function(status) {
if (!hasFound) {
console.log(i + " :NOT FOUND: " + line2);
i++;
}
//request.abort();
page.close();
nextPage();
}
}
nextPage();
现在它可以使用这段代码,iFrame 似乎触发了 onLoadFinished() 两次,所以我检查了 hasOnLoadFinished 以防止函数中的多个条目(一次使用多个 page.open() 确实是PhantomJS 中的坏主意)。
请注意,由于未知原因(大多数时候没有错误消息,很少有 "QThread::start: Failed to create thread ()".
为防止出现这种情况,请使用 1.9.8 版本而不是 2.0,这似乎是一个错误,在 Github 上用转储填充了崩溃报告。
/edit 在 3836 个链接与 1.9.8 链接后崩溃,没有错误消息...... PHANTOM。
console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('linklist.de.txt', 'r');
var webPage = require('webpage');
var i = 1;
var hasFound = Array();
var hasonLoadFinished = Array();
function handle_page(link) {
var page = webPage.create();
page.settings.loadImages = false;
page.open(link, function() {});
page.onResourceRequested = function(requestData, request) {
var match = requestData.url.match(/example.com\/searchmeI'maString/g)
if (match != null) {
hasFound[link] = true;
var targetString = decodeURI(JSON.stringify(requestData.url));
var klammerauf = targetString.indexOf("{");
var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
targetJSON = (decodeURIComponent(jsonobjekt));
var t = JSON.parse(targetJSON);
console.log(i + " " + t + " " + t['id']);
//console.log(targetJSON);
//console.log("");
request.abort;
} else {
request.abort;
return;
}
};
page.onLoadFinished = function(status) {
if (!hasonLoadFinished[link]) {
hasonLoadFinished[link] = true;
//console.log(" " + status + " " + link);
//console.log("onLoadFinished()")
//setTimeout(function(){/* Look mah! No name! */},1000);
if (!hasFound[link]) {
console.log(i + " :NOT FOUND: " + link);
console.log("");
}
i++;
page.close();
nextPage();
}
}
};
function nextPage() {
var link = stream.readLine();
if (!link) {
end = Date.now();
console.log("");
console.log(((end - start) / 1000) + " Sekunden");
console.log("FILE ENDS HERE!!!");
phantom.exit(0);
}
hasFound[link] = false;
hasonLoadFinished[link] = false;
handle_page(link);
}
start = Date.now();
nextPage();