在短时间内向另一个启用了跨源资源共享（CORS）的站点发起大量连接请求会导致数据损坏

Question

我正在制作一个 node.js 应用程序，其中一部分代码会向 193 个不同的 url 发起请求，以便从每个 url 下载 json 数据。这是其中一个 url：https://www.gemeentegeschiedenis.nl/gemeentenaam/json/Apeldoorn 。其中一部分下载到的 json 数据是正常且完整的，但越到后面，就有一些文件出现损坏：部分数据变为无效，还有一些数据返回的是数据库错误。我认为这与在短时间内向如此多的 url 请求数据有关（这也是我尝试使用 “setTimeout” 函数的原因，但实际上并不起作用）。

/**
 * Streams the JSON document at `url` into `jsonFiles/<name>.json`, where
 * `<name>` is whatever follows "json/" in the URL.
 *
 * Fire-and-forget: nothing is returned, and neither completion nor errors
 * are reported to the caller.
 *
 * @param {string} url - Address of the JSON document to download.
 */
function writeToFile(url) {
    // the text after "json/" gives every output file its own name
    const [, municipality] = url.split("json/");
    const target = fs.createWriteStream(`jsonFiles/${municipality}.json`);

    // issue the GET request and stream the response body straight into the file
    request
        .get({
            url: `${url}`,
            method: 'GET',
            headers: {
                'Accept': 'application/json',
                'Accept-Charset': 'utf-8',
                json: true
            }
        })
        .pipe(target);
}

// Looks up the JSON urls of all Gelderland municipalities and starts a
// writeToFile() download for each of them.
// `req` and `res` are accepted but not used anywhere in this body.
function getMunicipalityGeoJsonData(req, res) {
    //Get all the urls pointing to the JSON data for the province, Gelderland
    getGelderlandJsonUrls((err, jsonUrls) => {
        // NOTE(review): `err` is never inspected before jsonUrls is iterated.
        //for all those urls, write the data to files.
        // NOTE(review): `url` has no let/const/var declaration, so the loop
        // assigns to an undeclared (implicitly global) variable.
        // Every writeToFile() call is started back-to-back: the loop does not
        // wait for one download to finish before starting the next.
        for (url of jsonUrls) {
            console.log(url);
            writeToFile(url);
        }
    })
}

// Requests the municipality JSON urls for the province "Gelderland" and hands
// them to callback(null, urls).
// NOTE(review): the `err` reported by getMunicipalityJsonUrls is ignored, so
// the callback always receives null as its error argument.
function getGelderlandJsonUrls(callback) {
    getMunicipalityJsonUrls("Gelderland", (err, data) => {
        // NOTE(review): `jsonUrls` is not declared in this function, so this
        // assigns to a variable outside of it (an implicit global if none exists).
        jsonUrls = data;
        callback(null, jsonUrls);
    });
}

/**
 * Downloads the municipality list of a province and turns each entry's
 * `uri.naam` address into the matching https ".../gemeentenaam/json/..." URL.
 *
 * The request error and the response object are not inspected; the body is
 * parsed as JSON as-is.
 *
 * @param {string} provinceName - Province name placed at the end of the request URL.
 * @param {Function} callback - Called as callback(null, jsonUrls) with the derived URLs.
 */
function getMunicipalityJsonUrls(provinceName, callback) {
    const provinceUri = `https://www.gemeentegeschiedenis.nl/provincie/json/${provinceName}`;

    request({ uri: provinceUri }, (error, response, body) => {
        const municipalities = JSON.parse(body);

        // build one json URL per municipality listed for the province
        const jsonUrls = Array.from({ length: municipalities.length }, (_, position) => {
            const [head, tail] = municipalities[position].uri.naam.split("gemeentenaam");
            return `${head}gemeentenaam/json${tail}`.replace("http://", "https://");
        });

        callback(null, jsonUrls);
    });
}

The last JSON data was downloaded into the file as an HTML page with a database error, from the url https://www.gemeentegeschiedenis.nl/gemeentenaam/json/Zutphen (which actually took just under 6 seconds to load, looking at the network tab in Chrome). In another file, the 1812 entry has null for its properties when it should have a bunch of coordinates: https://www.gemeentegeschiedenis.nl/gemeentenaam/json/Winssen (took just over a second to load in Chrome).

我是 node 的新手，请帮助我解决这个问题，也许可以通过某种方式检查数据是否损坏，或者采用其他办法。提前感谢您的帮助 :)

编辑：我尝试在 for 循环中一次最多处理 200 个 url。

Answer 1

首先，向 getMunicipalityJsonUrls() 和 getGelderlandJsonUrls() 添加适当的错误处理。这意味着：

在所有存在 err 参数的地方检查它，并将错误传播回调用者。
捕获 JSON.parse() 可能抛出的异常。
检查 http statusCode。

这是修复后的代码：

/**
 * Downloads the municipality list of a province and derives, for every
 * municipality, the https URL of its JSON document
 * (".../gemeentenaam/<name>" becomes ".../gemeentenaam/json/<name>").
 *
 * @param {string} provinceName - Province name placed at the end of the request URL.
 * @param {Function} callback - Node-style callback. Called exactly once, either as
 *     callback(error) when the request fails, the HTTP status is not 200 or the
 *     body cannot be parsed/converted, or as callback(null, jsonUrls) on success.
 */
function getMunicipalityJsonUrls(provinceName, callback) {
    request({ uri: `https://www.gemeentegeschiedenis.nl/provincie/json/${provinceName}` }, (error, response, body) => {
        // test the handler's own `error` parameter
        if (error) {
            callback(error);
            return;
        }
        if (response.statusCode !== 200) {
            callback(new Error(`http status code ${response.statusCode}`));
            return;
        }

        let jsonUrls;
        try {
            jsonUrls = JSON.parse(body).map((entry) => {
                // each entry carries its address in `uri.naam`; a bare string entry is accepted too
                const naam = typeof entry === "string" ? entry : entry.uri.naam;
                const urlSplit = naam.split("gemeentenaam");
                const jsonUrl = urlSplit[0] + "gemeentenaam/json" + urlSplit[1];
                return jsonUrl.replace("http://", "https://");
            });
        } catch (e) {
            callback(e);
            return;
        }

        // invoked outside the try block so that an exception thrown by the
        // callback itself cannot trigger a second callback(e) call
        callback(null, jsonUrls);
    });
}

/**
 * Retrieves the municipality JSON URLs for the province "Gelderland".
 *
 * @param {Function} callback - Node-style callback: callback(err) on failure,
 *     callback(null, data) with the URL list on success.
 */
function getGelderlandJsonUrls(callback) {
    const relay = (err, data) => {
        if (!err) {
            callback(null, data);
            return;
        }
        // pass the failure on untouched
        callback(err);
    };

    getMunicipalityJsonUrls("Gelderland", relay);
}

然后，在 writeToFile() 中添加错误处理和完成状态的监控。我选择将其包装在 promise 中，而不是使用普通的回调，因为我想把它与一些基于 promise 的工具函数一起使用。

/**
 * Downloads the JSON document at `url` into `jsonFiles/<name>.json`, where
 * `<name>` is the part of the URL after "json/".
 *
 * @param {string} url - Address of the JSON document to download.
 * @returns {Promise<string>} Resolves with `url` once the file has been fully
 *     written. Rejects when the request fails, when the server answers with a
 *     status other than 200, or when writing the file fails.
 */
function writeToFile(url) {
    return new Promise((resolve, reject) => {
        // get name to make each new file unique
        const name = url.split("json/")[1];
        const options = {
            url: `${url}`,
            method: 'GET',
            // `json: true` is a request option, not an HTTP header, and only
            // affects callback-style body parsing, so it is not sent here
            headers: {
                'Accept': 'application/json',
                'Accept-Charset': 'utf-8'
            }
        };

        let fileStream = null;
        const fail = (e) => {
            // release the file handle; a partially written file may remain on disk
            if (fileStream) {
                fileStream.destroy();
            }
            reject(e);
        };

        const download = request.get(options);

        // pipe() returns the destination stream, so errors of the request
        // itself need their own listener on the request object
        download.on('error', fail);

        download.on('response', (response) => {
            if (response.statusCode !== 200) {
                // do not store an error page under a .json name
                download.abort();
                fail(new Error(`http status code ${response.statusCode} for ${url}`));
                return;
            }
            // the file is created only after the server has answered successfully
            fileStream = fs.createWriteStream(`jsonFiles/${name}.json`);
            fileStream.on('error', fail);
            fileStream.on('finish', () => {
                resolve(url);
            });
            download.pipe(fileStream);
        });
    });
}

现在，我们需要决定如何遍历所有 URL。如果任何 url 有可能写入同一个文件（即使这种可能性很小），那么您必须串行处理这些 url，以防止多个异步操作同时尝试写入同一个文件，因为那只会把该文件弄乱。如果是这种情况，您可以像这样串行地写入文件：

// option 1 - serialize writing to files
// option 1 - serialize writing to files
/**
 * Downloads the JSON data of every Gelderland municipality, one URL at a
 * time, and reports the outcome on `res`.
 *
 * Sends "All done" when every download has finished; sends status 500 when
 * the URL list cannot be obtained or any download fails (remaining URLs are
 * then skipped).
 *
 * @param {object} req - Incoming request (not used).
 * @param {object} res - Response object; must provide send() and sendStatus().
 * @returns {Promise<void>} Settles as soon as the URL lookup has been started,
 *     not when the downloads finish; completion is signalled through `res`.
 */
async function getMunicipalityGeoJsonData(req, res) {
    //Get all the urls pointing to the JSON data for the province, Gelderland
    // the callback itself must be async, otherwise `await` inside it is a syntax error
    getGelderlandJsonUrls(async (err, jsonUrls) => {
        if (err) {
            console.log(err);
            res.sendStatus(500);
            return;
        }
        try {
            //for all those urls, write the data to files.
            for (const url of jsonUrls) {
                console.log(url);
                // wait for this file before starting the next download
                await writeToFile(url);
            }
            res.send("All done");
        } catch (e) {
            console.log(e);
            res.sendStatus(500);
        }
    });
}

如果您完全确定这些 URL 中没有任何一个会写入同一个文件，那么您可以一次并行运行 N 个 URL，并通过实验找出能带来不错性能的最小 N 值。N 值越大，消耗的峰值资源（内存和文件句柄）越多；N 值越小，并行运行的操作越少。如果目标主机名都指向同一台服务器，那么通常不希望 N 超过 5。如果您检索数据的目标主机各不相同，可以尝试将 N 的值最多设置为 20。

// option 2 - run N at a time in parallel
/**
 * Downloads the JSON data of every Gelderland municipality with at most five
 * downloads running at the same time, then reports the outcome on `res`:
 * "All done" on success, status 500 on any error.
 *
 * @param {object} req - Incoming request (not used).
 * @param {object} res - Response object; must provide send() and sendStatus().
 */
function getMunicipalityGeoJsonData(req, res) {
    const numConcurrent = 5;

    // shared failure path: log the error and answer with 500
    const reportFailure = (err) => {
        console.log(err);
        res.sendStatus(500);
    };

    //Get all the urls pointing to the JSON data for the province, Gelderland
    getGelderlandJsonUrls((err, jsonUrls) => {
        if (err) {
            reportFailure(err);
            return;
        }
        //for all those urls, write the data to files.
        mapConcurrent(jsonUrls, numConcurrent, writeToFile)
            .then(() => {
                res.send("All done");
            })
            .catch(reportFailure);
    });
}

mapConcurrent() 函数来自这个答案，代码如下。它需要您传入一个要迭代的项目数组、允许同时进行中的最大操作数，以及一个函数；该函数会接收一个数组项目，并返回一个 promise，该 promise 在操作完成时兑现（resolve），或在出错时拒绝（reject）：

/**
 * Runs `fn` over every element of `items` with at most `maxConcurrent` calls
 * in flight at any time.
 *
 * @param {Array} items - Elements to process.
 * @param {number} maxConcurrent - Upper bound on simultaneously running calls (>= 1).
 * @param {Function} fn - Called as fn(item, index); may return a promise or a plain value.
 * @returns {Promise<Array>} Resolves with the results in the same order as
 *     `items`. Rejects with the first error (a rejection or a synchronous
 *     throw from `fn`); after that no further calls are launched. Rejects with
 *     a RangeError when `items` is non-empty and `maxConcurrent` is not >= 1,
 *     because nothing could ever be started.
 */
function mapConcurrent(items, maxConcurrent, fn) {
    let index = 0;
    let inFlightCntr = 0;
    let doneCntr = 0;
    const results = new Array(items.length);
    let stop = false;

    return new Promise(function(resolve, reject) {

        function fail(err) {
            // set flag so we don't launch any more requests
            stop = true;
            reject(err);
        }

        function runNext() {
            const i = index++;
            ++inFlightCntr;
            let pending;
            try {
                // Promise.resolve() lets fn return a plain value as well
                pending = Promise.resolve(fn(items[i], i));
            } catch (err) {
                // a synchronous throw must reject the result instead of escaping from a then-handler
                fail(err);
                return;
            }
            pending.then(function(val) {
                ++doneCntr;
                --inFlightCntr;
                results[i] = val;
                run();
            }, fail);
        }

        function run() {
            // launch as many as we're allowed to
            while (!stop && inFlightCntr < maxConcurrent && index < items.length) {
                runNext();
            }
            // if all are done, then resolve parent promise with results
            if (doneCntr === items.length) {
                resolve(results);
            }
        }

        if (items.length > 0 && !(maxConcurrent >= 1)) {
            reject(new RangeError(`maxConcurrent must be at least 1, got ${maxConcurrent}`));
            return;
        }

        run();
    });
}

Bluebird 的 Promise.map() 和 Async 库中有类似的函数。

因此，使用此代码，您现在可以控制同时进行的 requests/writeToFile() 操作的数量，并且您可以捕获并记录所有可能的错误。这样做，您可以调整同时运行的数量以获得最佳性能和最低的资源使用，如果有任何错误，您应该记录这些错误以便进行调试。

此代码当前设置为在出现错误时停止处理后续的 URL。如果您希望在出现错误后继续处理其他 URL，可以通过调整 mapConcurrent() 来更改这一行为。但是，我仍然建议您确保记录所有错误，以便知道何时出现了错误，并可以调查出错的原因。

另外还有一点说明。如果这是我的代码，我会把所有内容都转换为 promise（不使用普通回调），并且我会使用 got() 库来代替现已弃用的 request() 库。我不会再用 request() 库编写任何新代码。

在短时间内向另一个启用了跨源资源共享（CORS）的站点发起大量连接请求会导致数据损坏

Requesting many connections to another site with Cross Origin Resource Sharing activated close in time makes the data corrupted

download

node.js