使用可变 URL 循环发送 API GET 请求
Loop through an api get request with variable URL
我正在尝试调用 CompaniesHouse API,获取在 11 月到 2 月之间注册的公司。我采用的方法是选择一个起始索引(一家在 11 月注册的公司)和一个终止索引(一家在 2 月注册的公司),然后循环获取在起始索引和终止索引之间注册的公司。像这样:
// Requests every company number in [startIdx, stopIdx) from the Companies House API.
// NOTE: `key` is not defined in this snippet — presumably the API key used as the
// basic-auth username; it must be defined before this code runs.
const needle = require("needle");
const startIdx = 11059000;
const stopIdx = 11211109;

// `idx` is declared with `let` so the loop does not create an implicit global
// (which is a ReferenceError in strict mode / ES modules).
// The loop does not wait for a request to finish before starting the next one,
// so every request in the range is started back-to-back.
for (let idx = startIdx; idx < stopIdx; idx++) {
    needle('get', "https://api.companieshouse.gov.uk/company/" + idx, {
        username: key,
        password: ""
    })
        .then(function(data) {
            // placeholder: the response is currently ignored
        })
        .catch(function(err) {
            console.log('Call the locksmith!' + err);
        });
}
但这行不通,因为会出现超时或套接字挂起(socket hang up)错误。
API 目前处于测试阶段,一些功能尚未实现。
// Starts one request per company number in [startIdx, stopIdx), collects the
// returned promises, and logs either all results or the first error.
// NOTE: `key` is not defined in this snippet — presumably the API key used as the
// basic-auth username; it must be defined before this code runs.
const needle = require("needle");
const startIdx = 11059000;
const stopIdx = 11211109;
const promises = [];

// `idx` is declared with `let` so the loop does not create an implicit global
// (which is a ReferenceError in strict mode / ES modules).
// Every request is started here, before any of them has completed.
for (let idx = startIdx; idx < stopIdx; idx++) {
    promises.push(
        needle('get', "https://api.companieshouse.gov.uk/company/" + idx, {
            username: key,
            password: ""
        })
    );
}

// Promise.all rejects as soon as any single request rejects.
Promise.all(promises).then(results => {console.log(results);}).catch(err => console.log(err));
一个简单的 Promise.all 实现可能会有所帮助。
因为 for 循环是同步运行的,而您对 needle() 的调用是异步的、不会阻塞,所以您最终会尝试同时发起超过 100,000 个网络请求。这会使您的本地计算机或目标服务器不堪重负,于是您开始收到套接字错误。
对于这么多的请求,您需要一次只运行 X 个,使同时处于进行中的请求不超过 X 个。为了最大限度地提高性能,您必须自己找出合适的 X 值,因为它取决于目标服务器及其处理大量并发请求的方式。通常从 5 开始是安全的,然后逐步增大以测试更高的值。
如果您处理的是一个数组,那么有多种现成的方案可以一次运行 X 个请求。最简单的是使用现成的并发管理操作,例如 Bluebird。或者您也可以自己编写。您可以在此处查看两者的示例:
但是,由于您处理的不是数组,而只是为每个后续请求递增一个数字,我找不到能做到这一点的现成方案。所以,我写了一个通用函数,您可以传入一个负责递增索引的函数:
/**
 * Calls `fn` `cnt` times while keeping at most `limit` of the returned
 * promises in flight at once.
 *
 * @param {Function} fn - Called once per iteration with no arguments. Should
 *   return a promise; a plain return value is treated as an already-resolved
 *   promise and a synchronous throw is treated as a rejection of that iteration.
 * @param {number} limit - Max number of requests to be in flight at once.
 *   Must be >= 1 whenever `cnt` > 0.
 * @param {number} cnt - Number of times to call `fn`.
 * @param {Object} [options] - Optional; can be `{continueOnError: true}`.
 * @returns {Promise<Array>} Resolves with the results array, indexed by call
 *   order. If `continueOnError` is set, the array also contains the rejection
 *   values (presumed to be `instanceof Error` so the caller can discern them
 *   from regular values). Without `continueOnError`, rejects with the first
 *   error and stops launching further calls.
 */
function runN(fn, limit, cnt, options = {}) {
    return new Promise((resolve, reject) => {
        // A limit below 1 could never start a call, so the promise would never settle.
        if (cnt > 0 && !(limit >= 1)) {
            reject(new RangeError("runN: limit must be >= 1"));
            return;
        }

        const results = [];
        let inFlightCntr = 0;
        let cntr = 0;
        let doneCnt = 0;
        let failed = false;   // set once we have rejected; no more work is scheduled

        // Record the outcome of one call and schedule any remaining work.
        function settle(resultIndex, value) {
            --inFlightCntr;
            ++doneCnt;
            results[resultIndex] = value;
            run();
        }

        // Start one call of fn and attach its completion handlers.
        function launch(resultIndex) {
            let pending;
            try {
                pending = Promise.resolve(fn());
            } catch (err) {
                // a synchronous throw is attributed to this iteration only
                pending = Promise.reject(err);
            }
            // Two-argument then(): the error handler sees only rejections of
            // this call, never exceptions raised while scheduling other calls.
            pending.then(result => {
                if (!failed) {
                    settle(resultIndex, result);
                }
            }, err => {
                if (failed) {
                    return;
                }
                if (options.continueOnError) {
                    // assumes error is instanceof Error so caller can tell the
                    // difference between a genuine result and an error
                    settle(resultIndex, err);
                } else {
                    failed = true;
                    reject(err);
                }
            });
        }

        function run() {
            while (!failed && inFlightCntr < limit && cntr < cnt) {
                const resultIndex = cntr++;
                ++inFlightCntr;
                launch(resultIndex);
            }
            if (!failed && doneCnt >= cnt) {
                resolve(results);
            }
        }

        run();
    });
}
那么,你可以这样使用:
// Requests every company number from startIdx through stopIdx (inclusive)
// through runN, with at most numConcurrent requests in flight at once.
// NOTE: `key` is not defined in this snippet — presumably the API key used as the
// basic-auth username; it must be defined before this code runs.
const needle = require("needle");

const startIdx = 11059000;
const stopIdx = 11211109;
const numConcurrent = 5;

let idxCntr = startIdx;

// Each invocation claims the next company number and starts its request.
const requestNextCompany = () => {
    const idx = idxCntr;
    idxCntr += 1;
    const url = "https://api.companieshouse.gov.uk/company/" + idx;
    return needle('get', url, {username: key, password: ""});
};

const logResults = (results) => {
    console.log(results);
};

const logFailure = (err) => {
    console.log(err);
};

const totalRequests = stopIdx - startIdx + 1;
runN(requestNextCompany, numConcurrent, totalRequests, {continueOnError: true})
    .then(logResults)
    .catch(logFailure);
为了最大限度地减少内存使用,您可以在对 needle() 的调用上添加 .then() 处理程序,把响应裁剪为最终数组中真正需要的内容:
// Same batch run as a plain runN call over startIdx..stopIdx (inclusive), but
// each response is reduced to a single property before it is stored, so the
// final results array stays small.
// NOTE: `key` is not defined in this snippet — presumably the API key used as the
// basic-auth username; it must be defined before this code runs.
const needle = require("needle");

const startIdx = 11059000;
const stopIdx = 11211109;
const numConcurrent = 5;

let idxCntr = startIdx;

// construct the smallest possible response here and then return it
// to minimize memory use for your 100,000+ requests
const trimResponse = (response) => response.someProperty;

// Each invocation claims the next company number, starts its request and
// resolves with the trimmed response.
const requestNextCompany = () => {
    const idx = idxCntr;
    idxCntr += 1;
    const url = "https://api.companieshouse.gov.uk/company/" + idx;
    return needle('get', url, {username: key, password: ""}).then(trimResponse);
};

const logResults = (results) => {
    console.log(results);
};

const logFailure = (err) => {
    console.log(err);
};

const totalRequests = stopIdx - startIdx + 1;
runN(requestNextCompany, numConcurrent, totalRequests, {continueOnError: true})
    .then(logResults)
    .catch(logFailure);
我正在尝试调用 CompaniesHouse API,获取在 11 月到 2 月之间注册的公司。我采用的方法是选择一个起始索引(一家在 11 月注册的公司)和一个终止索引(一家在 2 月注册的公司),然后循环获取在起始索引和终止索引之间注册的公司。像这样:
// Requests every company number in [startIdx, stopIdx) from the Companies House API.
// NOTE: `key` is not defined in this snippet — presumably the API key used as the
// basic-auth username; it must be defined before this code runs.
const needle = require("needle");
const startIdx = 11059000;
const stopIdx = 11211109;

// `idx` is declared with `let` so the loop does not create an implicit global
// (which is a ReferenceError in strict mode / ES modules).
// The loop does not wait for a request to finish before starting the next one,
// so every request in the range is started back-to-back.
for (let idx = startIdx; idx < stopIdx; idx++) {
    needle('get', "https://api.companieshouse.gov.uk/company/" + idx, {
        username: key,
        password: ""
    })
        .then(function(data) {
            // placeholder: the response is currently ignored
        })
        .catch(function(err) {
            console.log('Call the locksmith!' + err);
        });
}
但这行不通,因为会出现超时或套接字挂起(socket hang up)错误。
API 目前处于测试阶段,一些功能尚未实现。
// Starts one request per company number in [startIdx, stopIdx), collects the
// returned promises, and logs either all results or the first error.
// NOTE: `key` is not defined in this snippet — presumably the API key used as the
// basic-auth username; it must be defined before this code runs.
const needle = require("needle");
const startIdx = 11059000;
const stopIdx = 11211109;
const promises = [];

// `idx` is declared with `let` so the loop does not create an implicit global
// (which is a ReferenceError in strict mode / ES modules).
// Every request is started here, before any of them has completed.
for (let idx = startIdx; idx < stopIdx; idx++) {
    promises.push(
        needle('get', "https://api.companieshouse.gov.uk/company/" + idx, {
            username: key,
            password: ""
        })
    );
}

// Promise.all rejects as soon as any single request rejects.
Promise.all(promises).then(results => {console.log(results);}).catch(err => console.log(err));
一个简单的 Promise.all 实现可能会有所帮助。
因为 for 循环是同步运行的,而您对 needle() 的调用是异步的、不会阻塞,所以您最终会尝试同时发起超过 100,000 个网络请求。这会使您的本地计算机或目标服务器不堪重负,于是您开始收到套接字错误。
对于这么多的请求,您需要一次只运行 X 个,使同时处于进行中的请求不超过 X 个。为了最大限度地提高性能,您必须自己找出合适的 X 值,因为它取决于目标服务器及其处理大量并发请求的方式。通常从 5 开始是安全的,然后逐步增大以测试更高的值。
如果您处理的是一个数组,那么有多种现成的方案可以一次运行 X 个请求。最简单的是使用现成的并发管理操作,例如 Bluebird。或者您也可以自己编写。您可以在此处查看两者的示例:
但是,由于您处理的不是数组,而只是为每个后续请求递增一个数字,我找不到能做到这一点的现成方案。所以,我写了一个通用函数,您可以传入一个负责递增索引的函数:
/**
 * Calls `fn` `cnt` times while keeping at most `limit` of the returned
 * promises in flight at once.
 *
 * @param {Function} fn - Called once per iteration with no arguments. Should
 *   return a promise; a plain return value is treated as an already-resolved
 *   promise and a synchronous throw is treated as a rejection of that iteration.
 * @param {number} limit - Max number of requests to be in flight at once.
 *   Must be >= 1 whenever `cnt` > 0.
 * @param {number} cnt - Number of times to call `fn`.
 * @param {Object} [options] - Optional; can be `{continueOnError: true}`.
 * @returns {Promise<Array>} Resolves with the results array, indexed by call
 *   order. If `continueOnError` is set, the array also contains the rejection
 *   values (presumed to be `instanceof Error` so the caller can discern them
 *   from regular values). Without `continueOnError`, rejects with the first
 *   error and stops launching further calls.
 */
function runN(fn, limit, cnt, options = {}) {
    return new Promise((resolve, reject) => {
        // A limit below 1 could never start a call, so the promise would never settle.
        if (cnt > 0 && !(limit >= 1)) {
            reject(new RangeError("runN: limit must be >= 1"));
            return;
        }

        const results = [];
        let inFlightCntr = 0;
        let cntr = 0;
        let doneCnt = 0;
        let failed = false;   // set once we have rejected; no more work is scheduled

        // Record the outcome of one call and schedule any remaining work.
        function settle(resultIndex, value) {
            --inFlightCntr;
            ++doneCnt;
            results[resultIndex] = value;
            run();
        }

        // Start one call of fn and attach its completion handlers.
        function launch(resultIndex) {
            let pending;
            try {
                pending = Promise.resolve(fn());
            } catch (err) {
                // a synchronous throw is attributed to this iteration only
                pending = Promise.reject(err);
            }
            // Two-argument then(): the error handler sees only rejections of
            // this call, never exceptions raised while scheduling other calls.
            pending.then(result => {
                if (!failed) {
                    settle(resultIndex, result);
                }
            }, err => {
                if (failed) {
                    return;
                }
                if (options.continueOnError) {
                    // assumes error is instanceof Error so caller can tell the
                    // difference between a genuine result and an error
                    settle(resultIndex, err);
                } else {
                    failed = true;
                    reject(err);
                }
            });
        }

        function run() {
            while (!failed && inFlightCntr < limit && cntr < cnt) {
                const resultIndex = cntr++;
                ++inFlightCntr;
                launch(resultIndex);
            }
            if (!failed && doneCnt >= cnt) {
                resolve(results);
            }
        }

        run();
    });
}
那么,你可以这样使用:
// Requests every company number from startIdx through stopIdx (inclusive)
// through runN, with at most numConcurrent requests in flight at once.
// NOTE: `key` is not defined in this snippet — presumably the API key used as the
// basic-auth username; it must be defined before this code runs.
const needle = require("needle");

const startIdx = 11059000;
const stopIdx = 11211109;
const numConcurrent = 5;

let idxCntr = startIdx;

// Each invocation claims the next company number and starts its request.
const requestNextCompany = () => {
    const idx = idxCntr;
    idxCntr += 1;
    const url = "https://api.companieshouse.gov.uk/company/" + idx;
    return needle('get', url, {username: key, password: ""});
};

const logResults = (results) => {
    console.log(results);
};

const logFailure = (err) => {
    console.log(err);
};

const totalRequests = stopIdx - startIdx + 1;
runN(requestNextCompany, numConcurrent, totalRequests, {continueOnError: true})
    .then(logResults)
    .catch(logFailure);
为了最大限度地减少内存使用,您可以在对 needle() 的调用上添加 .then() 处理程序,把响应裁剪为最终数组中真正需要的内容:
// Same batch run as a plain runN call over startIdx..stopIdx (inclusive), but
// each response is reduced to a single property before it is stored, so the
// final results array stays small.
// NOTE: `key` is not defined in this snippet — presumably the API key used as the
// basic-auth username; it must be defined before this code runs.
const needle = require("needle");

const startIdx = 11059000;
const stopIdx = 11211109;
const numConcurrent = 5;

let idxCntr = startIdx;

// construct the smallest possible response here and then return it
// to minimize memory use for your 100,000+ requests
const trimResponse = (response) => response.someProperty;

// Each invocation claims the next company number, starts its request and
// resolves with the trimmed response.
const requestNextCompany = () => {
    const idx = idxCntr;
    idxCntr += 1;
    const url = "https://api.companieshouse.gov.uk/company/" + idx;
    return needle('get', url, {username: key, password: ""}).then(trimResponse);
};

const logResults = (results) => {
    console.log(results);
};

const logFailure = (err) => {
    console.log(err);
};

const totalRequests = stopIdx - startIdx + 1;
runN(requestNextCompany, numConcurrent, totalRequests, {continueOnError: true})
    .then(logResults)
    .catch(logFailure);