如何按顺序发出 page.open() 请求?

How can I make my page.open() requests sequentially?

我为此写了下面的代码,但请求仍然不是按顺序执行的。我原本希望它会等到当前请求真正完成之后,再发起下一个请求……但实际上并不是这样。

问题 1:page.open() 调用没有按顺序执行,如下面的输出所示:

6       protocol: https:     type: Content
7       protocol: https:     type: Content
8       protocol: https:     type: Content
9       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/
10       protocol: https:     type: Content
11       protocol: https:     type: Content
12       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/anrichte/
LINE: https://www.roller.de/einrichten/arbeitsstuhl/
LINE: https://www.roller.de/einrichten/arbeitstisch/
LINE: https://www.roller.de/einrichten/armlehnstuehle/
LINE: https://www.roller.de/einrichten/badezimmermoebel
LINE: https://www.roller.de/einrichten/bistrostuehle/
LINE: https://www.roller.de/einrichten/buecherregal/
13       protocol: https:     type: Content
14       protocol: https:     type: Content
15       protocol: https:     type: Content
16       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/buerocontainer/
LINE: https://www.roller.de/einrichten/bueroregale/
17       protocol: https:     type: Content
18       protocol: https:     type: Content

“LINE:” 对每个请求只应打印一次,但它连续出现了多次,中间却没有对应的 page.open 结果,导致 stream.atEnd() 过早地变为 true。如果请求是按顺序执行的,这种情况本不可能发生。

问题 2:最后一行没有被处理。当我使用一个包含 100 个链接(每行 1 个)的 .txt 文件时,只打印了 99 个,有一个没有打印。

问题 3: 当我给它一个包含 1000 个 url 的列表时它崩溃了

问题 4:10 个链接 = 10 次打印;100 个链接 = 98 次打印,并且 stream.atEnd() 出现了好几次;500 个链接 = 497–498 次打印,同样存在 stream.atEnd() 问题;1000 个链接 = 崩溃。

// Startup banner.
console.log('Hello, world!');
// PhantomJS file-system module, used to read the list of URLs.
var fs = require('fs');
// Read-only stream over the URL list (one link per line, per the description above).
var stream = fs.open('100sitemap.txt', 'r');
// PhantomJS module whose create() is used below to build page objects.
var webPage = require('webpage');
// Running counter printed in front of every result line.
var i = 1;

/**
 * Reads the next URL from the list and hands it to getRequest().
 * When the list is exhausted it only reports that fact; the stream is not
 * closed and PhantomJS is not asked to exit.
 */
function nextPage() {
    // Report the end of the list.
    if (stream.atEnd()) {
        console.log("STREAM END: " + stream.atEnd());
        console.log("FILE ENDS HERE");
    }

    // Nothing left to read: stop here.
    if (stream.atEnd()) {
        return;
    }

    var url = stream.readLine();
    console.log("LINE: " + url);
    getRequest(url);
}

/**
 * Opens `line2` in a fresh page, logs fields of the JSON payload found in any
 * request whose URL matches the pattern below, and calls nextPage() when the
 * page reports that loading has finished.
 *
 * @param {string} line2 - URL to open (one line read from the list).
 */
function getRequest(line2) {
    //console.log(line);
    var page = webPage.create();
    page.settings.loadImages = false;
    // Note: the handlers below are assigned after page.open() has been called.
    page.open(line2, function() {});
    //console.log("page.open() " + line2);
    //console.log("opened " + line2);
    // Runs for each resource request the page issues.
    page.onResourceRequested = function(requestData, request) {
        //console.log("BEFORE: " + requestData.url);
        var match = requestData.url.match(/example.com\/ca/g)
        //console.log("Match: " + match);
        //console.log(request.url);
        if (match != null) {
            // NOTE(review): `hasFound` is assigned without `var` and is neither
            // declared nor reset anywhere in the visible code, so once it is
            // set it stays true for every page opened afterwards.
            hasFound = true;
            // Cut the span from the first "{" to the first "}" out of the
            // decoded URL and parse it as JSON.
            var targetString = decodeURI(JSON.stringify(requestData.url));
            var klammerauf = targetString.indexOf("{");
            var jsonobjekt = targetString.substr(klammerauf,      (targetString.indexOf("}") - klammerauf) + 1);
            // NOTE(review): `targetJSON` is also assigned without `var`.
            targetJSON = (decodeURIComponent(jsonobjekt));
            var t = JSON.parse(targetJSON);
            console.log(i + "       " + t['groups'] + "     " +    t['campID']);
            i++;
            //console.log(targetJSON);
            // NOTE(review): this only references the method; without
            // parentheses nothing is aborted.
            request.abort;
        }
    };
    // NOTE(review): nothing guards against this handler running more than once
    // for the same page; every run closes the page and calls nextPage() again.
    page.onLoadFinished = function(status) {
        if (!hasFound) {
            console.log(i + " :NOT FOUND: " + line2);
            i++;
        }
        //request.abort();
        page.close();
        nextPage();
    }
}

// Start the chain with the first URL of the list.
nextPage();

现在使用下面这段代码可以正常工作了。iFrame 似乎会让 onLoadFinished() 被触发两次,所以我用 hasonLoadFinished 标志进行检查,防止同一个页面多次进入该函数(同时使用多个 page.open() 在 PhantomJS 中确实是个坏主意)。

请注意,PhantomJS 2.0 会由于未知原因崩溃(大多数时候没有任何错误消息,偶尔会出现 "QThread::start: Failed to create thread ()")。

为避免这种情况,请使用 1.9.8 版本而不是 2.0。这似乎是一个 bug,我已经在 GitHub 上提交了附带转储文件的崩溃报告。

/edit:使用 1.9.8 版本时,在处理了 3836 个链接之后也崩溃了,没有任何错误消息……PHANTOM。

// Startup banner.
console.log('Hello, world!');
// PhantomJS file-system module, used to read the list of links.
var fs = require('fs');
// Read-only stream over the link list; nextPage() reads one line per call.
var stream = fs.open('linklist.de.txt', 'r');
// PhantomJS module whose create() builds the page objects.
var webPage = require('webpage');
// Running counter printed in front of every result line.
var i = 1;
// Per-link flags keyed by the link string. Plain objects are used because the
// keys are URLs, not numeric indices.
var hasFound = {};
var hasonLoadFinished = {};

/**
 * Pulls the span from the first "{" to the first "}" out of a request URL
 * and parses it as JSON.
 *
 * @param {string} url - URL of the intercepted request.
 * @returns {Object|null} The parsed payload, or null when the URL carries no
 *     usable "{...}" span (missing braces, bad escapes, invalid JSON).
 */
function extractJsonPayload(url) {
    try {
        var targetString = decodeURI(JSON.stringify(url));
        var klammerauf = targetString.indexOf("{");
        var klammerzu = targetString.indexOf("}");
        if (klammerauf === -1 || klammerzu < klammerauf) {
            return null;
        }
        var jsonobjekt = targetString.substr(klammerauf, (klammerzu - klammerauf) + 1);
        return JSON.parse(decodeURIComponent(jsonobjekt));
    } catch (e) {
        // decodeURI/decodeURIComponent raise URIError and JSON.parse raises
        // SyntaxError on malformed input; the caller reports the failure.
        return null;
    }
}

/**
 * Opens `link` in a fresh page, logs the payload of the first kind of request
 * matching the pattern below, and moves on to the next link exactly once when
 * the page reports that loading has finished.
 *
 * Reads and writes the file-level `i`, `hasFound` and `hasonLoadFinished`.
 *
 * @param {string} link - URL to open.
 */
function handle_page(link) {
    var page = webPage.create();
    page.settings.loadImages = false;

    // Runs for each resource request the page issues.
    page.onResourceRequested = function(requestData, request) {
        var match = requestData.url.match(/example.com\/searchmeI'maString/g);
        if (match == null) {
            // Not the request we are looking for: let it through untouched.
            // This branch sees every other request the page makes, so nothing
            // is aborted here.
            return;
        }

        hasFound[link] = true;
        var t = extractJsonPayload(requestData.url);
        if (t === null) {
            console.log(i + " :UNPARSABLE: " + requestData.url);
        } else {
            console.log(i + "   " + t + "       " + t['id']);
        }
        // The payload has been captured, so the request itself is not needed.
        // abort must be *called*; a bare `request.abort;` does nothing.
        request.abort();
    };

    page.onLoadFinished = function(status) {
        // The handler can run more than once per page (the text above
        // attributes this to iframes); only the first run may advance.
        if (hasonLoadFinished[link]) {
            return;
        }
        hasonLoadFinished[link] = true;

        if (!hasFound[link]) {
            console.log(i + " :NOT FOUND: " + link);
            console.log("");
        }
        i++;
        page.close();
        nextPage();
    };

    // Start loading only after both handlers are in place.
    page.open(link, function() {});
}

/**
 * Reads the next non-empty link from the list and hands it to handle_page().
 * When the list is exhausted it prints the elapsed time, closes the stream and
 * asks PhantomJS to exit.
 *
 * Reads the file-level `stream` and `start`; resets the per-link entries in
 * `hasFound` and `hasonLoadFinished` before the link is processed.
 */
function nextPage() {
    var link = "";
    // Skip blank lines so an empty line in the middle of the list is not
    // mistaken for the end of the file.
    while (!link && !stream.atEnd()) {
        link = stream.readLine();
    }

    if (!link) {
        var end = Date.now();
        console.log("");
        console.log(((end - start) / 1000) + " Sekunden");
        console.log("FILE ENDS HERE!!!");
        stream.close();
        phantom.exit(0);
        // Return explicitly so the code below never runs with an empty link,
        // regardless of how quickly phantom.exit() takes effect.
        return;
    }

    hasFound[link] = false;
    hasonLoadFinished[link] = false;
    handle_page(link);
}

// Wall-clock start time; nextPage() reads it to report the total run time.
// Declared with `var` instead of being created as an implicit global.
var start = Date.now();
// Kick off processing with the first link of the list.
nextPage();