如何在 while 循环中执行 nightmarejs
how to execute nightmarejs in a while loop
问题
我正在尝试抓取图片并自动翻页。我利用页面上一个 span 中的文字来获取当前条目范围和条目总数,例如:1 - 20 of 83,829 results。我希望 Nightmare 在这个 while 循环中逐页运行,但程序会挂起,并报出 JavaScript heap out of memory 错误。有没有办法让它在每次迭代时真正执行,而不是不断把操作压入队列?我感觉它现在就是在这么做。
要修复的代码
// Queues, for every remaining page, a "click next, wait, read the page markup,
// scrape it" chain on the shared nightmare instance, then calls nightmare.end().
// NOTE(review): the loop condition can only change inside the .then() callback
// (through paginate.update()), and promise callbacks cannot run while this
// synchronous loop is still executing. If paginate.next starts out truthy the
// loop therefore never exits and keeps queueing chains; this looks like the
// cause of the reported "JavaScript heap out of memory" abort.
function scrapeEach(paginate) {
// keep going for as long as the pagination object reports a next page
while (paginate.next) {
nightmare
.wait(4000)
// move on to the following page of results
.realClick('#pageNext .results-next')
.wait(4000)
.evaluate(() => {
// the whole document body markup is what gets scraped
return document.body.innerHTML
})
.then(result => {
scrapeImages(result);
// advance the page counter and print the pagination state
paginate.update();
paginate.state();
})
.catch(error => {
// failures are logged only; nothing stops or restarts the loop from here
console.error('Search failed:', error);
});
}
return nightmare.end()
}
下面是与 scrapeEach() 配合使用的其余代码。
我创建了一个分页对象来跟踪页面状态:
// Pagination state parsed from a results summary such as
// "1 - 20 of 83,829 results".
//   url - address of the results page the summary was read from
//   pgd - summary text; it is split on single spaces and tokens 0, 2 and 4 are
//         read as the starting number, the per-page count and the total
// Fields: url, array (the raw tokens), currentPage, totalItems, itemsPerPage,
// totalPages and next (false once update() has reached the last page).
function Paginate(url, pgd) {
  this.url = url;
  this.array = pgd.split(" ");
  this.currentPage = Number(this.array[0]);
  // Strip thousands separators ("83,829" -> "83829") before converting.
  this.totalItems = Number((this.array[4] || "").replace(/,/g, ""));
  this.itemsPerPage = Number(this.array[2]);
  // Round up so that a final, partially filled page is still counted.
  this.totalPages = Math.ceil(this.totalItems / this.itemsPerPage);
  this.next = true;

  // Advances to the following page and clears `next` once the last page is reached.
  this.update = () => {
    this.currentPage += 1;
    if (this.currentPage >= this.totalPages) {
      this.next = false;
    }
  };

  // Prints the current pagination state for debugging.
  this.state = () => {
    console.log("-------- Pagination ----------");
    console.log("current page: " + this.currentPage);
    console.log("total pages: " + this.totalPages);
    console.log("total items: " + this.totalItems);
    console.log("items per page: " + this.itemsPerPage);
    console.log("has next page: " + this.next);
    console.log("------------------------------\n");
  };
}
这个函数抓取单个页面上的所有图片数据
// Pulls every artwork entry out of one page of results markup and hands the
// extracted rows to Artwork.addArt. An extraction error is rethrown from
// inside the extraction callback.
function scrapeImages(html) {
  // One selector per field, evaluated for each '#returns > li' item.
  const artworkFields = {
    img: 'dl.return-art > dd > a > img@src',
    title: 'dl.return-art > dt > a@html',
    created: 'dl.return-art > .created',
    medium: 'dl.return-art > .medium',
    dimensions: 'dl.return-art > .dimensions',
    credit: 'dl.return-art > .credit',
    accession: 'dl.return-art > .accession'
  };
  const runExtraction = xr(html, '#returns > li', [artworkFields]);
  const storeRows = (failure, rows) => {
    if (failure) {
      throw failure;
    }
    Artwork.addArt(rows);
  };
  runExtraction(storeRows);
}
这个函数启动了整个过程
// Entry point for the "on view" listing: opens the configured URL, reads the
// page address and the results summary text, builds a Paginate from them and
// hands it to scrapeEach.
// Returns the promise chain so callers can wait for it. A failure (for example
// a missing 'span.results-span' element) is logged instead of being left as an
// unhandled rejection.
function onView() {
  return nightmare
    .goto(config.NGA.online)
    .wait(3000)
    .evaluate(() => {
      return [document.location.href, document.querySelector('span.results-span').innerHTML]
    })
    .then(([url, pgd]) => scrapeEach(new Paginate(url, pgd)))
    .catch(error => {
      console.error('Search failed:', error);
    });
}
错误信息
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
1: node::Abort() [/usr/local/bin/node]
2: node::FatalException(v8::Isolate*, v8::Local<v8::Value>, v8::Local<v8::Message>) [/usr/local/bin/node]
3: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [/usr/local/bin/node]
4: v8::internal::Factory::NewRawOneByteString(int, v8::internal::PretenureFlag) [/usr/local/bin/node]
5: v8::internal::Factory::NumberToString(v8::internal::Handle<v8::internal::Object>, bool) [/usr/local/bin/node]
6: v8::internal::Runtime_NumberToStringSkipCache(int, v8::internal::Object**, v8::internal::Isolate*) [/usr/local/bin/node]
7: 0x18caa0a079a7
8: 0x18caa0f37cbc
Abort trap: 6
限制并行运行的任务数量
var limit = 10; // retained for compatibility; pages are scraped strictly one after another
var running = 0; // number of page scrapes currently in flight (0 or 1)

// Walks the result pages in sequence: click "next", wait, read the page markup,
// scrape it, and only then move on to the following page. The nightmare session
// is ended after the last page or after the first failure.
// A synchronous while loop cannot do this: it would queue up to `limit` chains
// before any of them had a chance to run and then call nightmare.end() at once.
// Chaining each page from the completion of the previous one avoids both.
// Returns a promise that settles once nightmare.end() has been processed.
function scrapeEach(paginate) {
  const scrapeNextPage = () => {
    if (!paginate.next) {
      return nightmare.end();
    }
    running++;
    return nightmare
      .wait(4000)
      .realClick('#pageNext .results-next')
      .wait(4000)
      .evaluate(() => {
        return document.body.innerHTML
      })
      .then(result => new Promise(resolve => {
        // scrapeImages reports completion through its callback.
        scrapeImages(result, function () {
          paginate.update();
          paginate.state();
          resolve();
        });
      }))
      .then(
        () => {
          running--;
          return scrapeNextPage();
        },
        error => {
          console.error('Search failed:', error);
          running--;
          // Stop rather than retry: pagination did not advance, so looping
          // again could repeat the same failure forever.
          return nightmare.end();
        }
      );
  };

  return Promise.resolve(scrapeNextPage());
}
// Pulls every artwork entry out of one page of results markup, hands the
// extracted rows to Artwork.addArt and then reports completion.
//   html - markup of a single results page
//   cb   - optional; called with no arguments after the rows were handed over
// An extraction error is rethrown from inside the extraction callback, in which
// case cb is not called.
function scrapeImages(html, cb) {
  xr(html, '#returns > li', [
    {
      img: 'dl.return-art > dd > a > img@src',
      title: 'dl.return-art > dt > a@html',
      created: 'dl.return-art > .created',
      medium: 'dl.return-art > .medium',
      dimensions: 'dl.return-art > .dimensions',
      credit: 'dl.return-art > .credit',
      accession: 'dl.return-art > .accession'
    }
  ])((err, res) => {
    if (err)
      throw err;
    Artwork.addArt(res);
    // Callers that use the one-argument form pass no callback.
    if (typeof cb === 'function') {
      cb();
    }
  });
}
我想明白了。这是一个异步问题:while 循环会一直同步运行到结束,在此期间 Nightmare 实例根本没有机会执行。我修改了方案:不再自动点击“下一页”,而是通过 url 中的 &pageNumber= 参数计算下一页的地址,并用一个针对该站点定制的分页对象来传递页码、url 以及每页条目数等信息。我还添加了一些调试输出。
// Pagination state parsed from a results summary such as
// "1 - 20 of 83,829 results", including the address of the current page.
//   url - address of the results page the summary was read from
//   pgd - summary text; it is split on single spaces and tokens 0, 2 and 4 are
//         read as the starting number, the per-page count and the total
// Fields: url, array (the raw tokens), currentPage, totalItems, itemsPerPage,
// totalPages and next (false once update() has reached the last page).
function Paginate(url, pgd) {
  this.url = url;
  this.array = pgd.split(" ");
  this.currentPage = Number(this.array[0]);
  // Strip thousands separators ("83,829" -> "83829") before converting.
  this.totalItems = Number((this.array[4] || "").replace(/,/g, ""));
  this.itemsPerPage = Number(this.array[2]);
  // Round up so that a final, partially filled page is still counted.
  this.totalPages = Math.ceil(this.totalItems / this.itemsPerPage);
  this.next = true;

  // Advances to the following page, clears `next` once the last page is
  // reached and rebuilds `url` from the original address with a fresh
  // pageNumber parameter.
  this.update = () => {
    const queryStart = url.indexOf("?");
    const path = queryStart === -1 ? url : url.slice(0, queryStart);
    const query = queryStart === -1 ? "" : url.slice(queryStart + 1);
    // Drop any previous "...Number=" parameter but keep every other one,
    // together with the "&" separators between them.
    const params = query
      .split("&")
      .filter(segment => segment !== "" && !segment.includes("Number="));

    this.currentPage += 1;
    if (this.currentPage >= this.totalPages) {
      this.next = false;
    }
    params.push("pageNumber=" + this.currentPage);
    this.url = path + "?" + params.join("&");
  };

  // Prints the current pagination state for debugging.
  this.state = () => {
    console.log("-------- Pagination ----------");
    console.log("current page: " + this.currentPage);
    console.log("total pages: " + this.totalPages);
    console.log("total items: " + this.totalItems);
    console.log("items per page: " + this.itemsPerPage);
    console.log("has next page: " + this.next);
    console.log("current url: " + this.url);
    console.log("------------------------------\n");
  };
}
我使用 async 库的 whilst 来代替 while 循环,使各次迭代依次执行,并在每次迭代中运行 Nightmare。
// Visits the result pages one at a time with async.whilst: load paginate.url,
// wait, read the page markup, scrape it, advance the pagination, then signal
// the next iteration.
// A failed page is logged and passed to next(error) so the loop terminates
// instead of waiting forever for a callback that never comes.
// Returns the nightmare instance; the session is not ended here.
// NOTE(review): the loop stops as soon as paginate.next is false, i.e. before
// the page that update() just advanced to is loaded. Confirm whether the last
// page is meant to be fetched.
function scrapeEach(paginate) {
  // The "< 10" cap limits a run to the first pages.
  const hasNext = () => paginate.next && paginate.currentPage < 10;

  async.whilst(hasNext, next => {
    nightmare
      .goto(paginate.url)
      .wait(4000)
      .evaluate(() => {
        return document.body.innerHTML
      })
      .then(result => {
        scrapeImages(result);
        paginate.update();
        paginate.state();
      })
      .then(
        () => next(),
        error => {
          console.error('Search failed:', error);
          next(error);
        }
      )
      // Anything thrown while continuing the loop is logged, never left unhandled.
      .catch(error => {
        console.error('Search failed:', error);
      });
  }, err => {
    if (err) {
      // Already logged where it happened; throwing here would only surface
      // as an unhandled rejection.
      return;
    }
    console.log("finished!");
  });

  return nightmare;
}
问题
我正在尝试抓取图片并自动翻页。我利用页面上一个 span 中的文字来获取当前条目范围和条目总数,例如:1 - 20 of 83,829 results。我希望 Nightmare 在这个 while 循环中逐页运行,但程序会挂起,并报出 JavaScript heap out of memory 错误。有没有办法让它在每次迭代时真正执行,而不是不断把操作压入队列?我感觉它现在就是在这么做。
要修复的代码
// Queues, for every remaining page, a "click next, wait, read the page markup,
// scrape it" chain on the shared nightmare instance, then calls nightmare.end().
// NOTE(review): the loop condition can only change inside the .then() callback
// (through paginate.update()), and promise callbacks cannot run while this
// synchronous loop is still executing. If paginate.next starts out truthy the
// loop therefore never exits and keeps queueing chains; this looks like the
// cause of the reported "JavaScript heap out of memory" abort.
function scrapeEach(paginate) {
// keep going for as long as the pagination object reports a next page
while (paginate.next) {
nightmare
.wait(4000)
// move on to the following page of results
.realClick('#pageNext .results-next')
.wait(4000)
.evaluate(() => {
// the whole document body markup is what gets scraped
return document.body.innerHTML
})
.then(result => {
scrapeImages(result);
// advance the page counter and print the pagination state
paginate.update();
paginate.state();
})
.catch(error => {
// failures are logged only; nothing stops or restarts the loop from here
console.error('Search failed:', error);
});
}
return nightmare.end()
}
下面是与 scrapeEach() 配合使用的其余代码。
我创建了一个分页对象来跟踪页面状态:
// Pagination state parsed from a results summary such as
// "1 - 20 of 83,829 results".
//   url - address of the results page the summary was read from
//   pgd - summary text; it is split on single spaces and tokens 0, 2 and 4 are
//         read as the starting number, the per-page count and the total
// Fields: url, array (the raw tokens), currentPage, totalItems, itemsPerPage,
// totalPages and next (false once update() has reached the last page).
function Paginate(url, pgd) {
  this.url = url;
  this.array = pgd.split(" ");
  this.currentPage = Number(this.array[0]);
  // Strip thousands separators ("83,829" -> "83829") before converting.
  this.totalItems = Number((this.array[4] || "").replace(/,/g, ""));
  this.itemsPerPage = Number(this.array[2]);
  // Round up so that a final, partially filled page is still counted.
  this.totalPages = Math.ceil(this.totalItems / this.itemsPerPage);
  this.next = true;

  // Advances to the following page and clears `next` once the last page is reached.
  this.update = () => {
    this.currentPage += 1;
    if (this.currentPage >= this.totalPages) {
      this.next = false;
    }
  };

  // Prints the current pagination state for debugging.
  this.state = () => {
    console.log("-------- Pagination ----------");
    console.log("current page: " + this.currentPage);
    console.log("total pages: " + this.totalPages);
    console.log("total items: " + this.totalItems);
    console.log("items per page: " + this.itemsPerPage);
    console.log("has next page: " + this.next);
    console.log("------------------------------\n");
  };
}
这个函数抓取单个页面上的所有图片数据
// Pulls every artwork entry out of one page of results markup and hands the
// extracted rows to Artwork.addArt. An extraction error is rethrown from
// inside the extraction callback.
function scrapeImages(html) {
  // One selector per field, evaluated for each '#returns > li' item.
  const artworkFields = {
    img: 'dl.return-art > dd > a > img@src',
    title: 'dl.return-art > dt > a@html',
    created: 'dl.return-art > .created',
    medium: 'dl.return-art > .medium',
    dimensions: 'dl.return-art > .dimensions',
    credit: 'dl.return-art > .credit',
    accession: 'dl.return-art > .accession'
  };
  const runExtraction = xr(html, '#returns > li', [artworkFields]);
  const storeRows = (failure, rows) => {
    if (failure) {
      throw failure;
    }
    Artwork.addArt(rows);
  };
  runExtraction(storeRows);
}
这个函数启动了整个过程
// Entry point for the "on view" listing: opens the configured URL, reads the
// page address and the results summary text, builds a Paginate from them and
// hands it to scrapeEach.
// Returns the promise chain so callers can wait for it. A failure (for example
// a missing 'span.results-span' element) is logged instead of being left as an
// unhandled rejection.
function onView() {
  return nightmare
    .goto(config.NGA.online)
    .wait(3000)
    .evaluate(() => {
      return [document.location.href, document.querySelector('span.results-span').innerHTML]
    })
    .then(([url, pgd]) => scrapeEach(new Paginate(url, pgd)))
    .catch(error => {
      console.error('Search failed:', error);
    });
}
错误信息
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
1: node::Abort() [/usr/local/bin/node]
2: node::FatalException(v8::Isolate*, v8::Local<v8::Value>, v8::Local<v8::Message>) [/usr/local/bin/node]
3: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [/usr/local/bin/node]
4: v8::internal::Factory::NewRawOneByteString(int, v8::internal::PretenureFlag) [/usr/local/bin/node]
5: v8::internal::Factory::NumberToString(v8::internal::Handle<v8::internal::Object>, bool) [/usr/local/bin/node]
6: v8::internal::Runtime_NumberToStringSkipCache(int, v8::internal::Object**, v8::internal::Isolate*) [/usr/local/bin/node]
7: 0x18caa0a079a7
8: 0x18caa0f37cbc
Abort trap: 6
限制并行运行的任务数量
var limit = 10; // retained for compatibility; pages are scraped strictly one after another
var running = 0; // number of page scrapes currently in flight (0 or 1)

// Walks the result pages in sequence: click "next", wait, read the page markup,
// scrape it, and only then move on to the following page. The nightmare session
// is ended after the last page or after the first failure.
// A synchronous while loop cannot do this: it would queue up to `limit` chains
// before any of them had a chance to run and then call nightmare.end() at once.
// Chaining each page from the completion of the previous one avoids both.
// Returns a promise that settles once nightmare.end() has been processed.
function scrapeEach(paginate) {
  const scrapeNextPage = () => {
    if (!paginate.next) {
      return nightmare.end();
    }
    running++;
    return nightmare
      .wait(4000)
      .realClick('#pageNext .results-next')
      .wait(4000)
      .evaluate(() => {
        return document.body.innerHTML
      })
      .then(result => new Promise(resolve => {
        // scrapeImages reports completion through its callback.
        scrapeImages(result, function () {
          paginate.update();
          paginate.state();
          resolve();
        });
      }))
      .then(
        () => {
          running--;
          return scrapeNextPage();
        },
        error => {
          console.error('Search failed:', error);
          running--;
          // Stop rather than retry: pagination did not advance, so looping
          // again could repeat the same failure forever.
          return nightmare.end();
        }
      );
  };

  return Promise.resolve(scrapeNextPage());
}
// Pulls every artwork entry out of one page of results markup, hands the
// extracted rows to Artwork.addArt and then reports completion.
//   html - markup of a single results page
//   cb   - optional; called with no arguments after the rows were handed over
// An extraction error is rethrown from inside the extraction callback, in which
// case cb is not called.
function scrapeImages(html, cb) {
  xr(html, '#returns > li', [
    {
      img: 'dl.return-art > dd > a > img@src',
      title: 'dl.return-art > dt > a@html',
      created: 'dl.return-art > .created',
      medium: 'dl.return-art > .medium',
      dimensions: 'dl.return-art > .dimensions',
      credit: 'dl.return-art > .credit',
      accession: 'dl.return-art > .accession'
    }
  ])((err, res) => {
    if (err)
      throw err;
    Artwork.addArt(res);
    // Callers that use the one-argument form pass no callback.
    if (typeof cb === 'function') {
      cb();
    }
  });
}
我想明白了。这是一个异步问题:while 循环会一直同步运行到结束,在此期间 Nightmare 实例根本没有机会执行。我修改了方案:不再自动点击“下一页”,而是通过 url 中的 &pageNumber= 参数计算下一页的地址,并用一个针对该站点定制的分页对象来传递页码、url 以及每页条目数等信息。我还添加了一些调试输出。
// Pagination state parsed from a results summary such as
// "1 - 20 of 83,829 results", including the address of the current page.
//   url - address of the results page the summary was read from
//   pgd - summary text; it is split on single spaces and tokens 0, 2 and 4 are
//         read as the starting number, the per-page count and the total
// Fields: url, array (the raw tokens), currentPage, totalItems, itemsPerPage,
// totalPages and next (false once update() has reached the last page).
function Paginate(url, pgd) {
  this.url = url;
  this.array = pgd.split(" ");
  this.currentPage = Number(this.array[0]);
  // Strip thousands separators ("83,829" -> "83829") before converting.
  this.totalItems = Number((this.array[4] || "").replace(/,/g, ""));
  this.itemsPerPage = Number(this.array[2]);
  // Round up so that a final, partially filled page is still counted.
  this.totalPages = Math.ceil(this.totalItems / this.itemsPerPage);
  this.next = true;

  // Advances to the following page, clears `next` once the last page is
  // reached and rebuilds `url` from the original address with a fresh
  // pageNumber parameter.
  this.update = () => {
    const queryStart = url.indexOf("?");
    const path = queryStart === -1 ? url : url.slice(0, queryStart);
    const query = queryStart === -1 ? "" : url.slice(queryStart + 1);
    // Drop any previous "...Number=" parameter but keep every other one,
    // together with the "&" separators between them.
    const params = query
      .split("&")
      .filter(segment => segment !== "" && !segment.includes("Number="));

    this.currentPage += 1;
    if (this.currentPage >= this.totalPages) {
      this.next = false;
    }
    params.push("pageNumber=" + this.currentPage);
    this.url = path + "?" + params.join("&");
  };

  // Prints the current pagination state for debugging.
  this.state = () => {
    console.log("-------- Pagination ----------");
    console.log("current page: " + this.currentPage);
    console.log("total pages: " + this.totalPages);
    console.log("total items: " + this.totalItems);
    console.log("items per page: " + this.itemsPerPage);
    console.log("has next page: " + this.next);
    console.log("current url: " + this.url);
    console.log("------------------------------\n");
  };
}
我使用 async 库的 whilst 来代替 while 循环,使各次迭代依次执行,并在每次迭代中运行 Nightmare。
// Visits the result pages one at a time with async.whilst: load paginate.url,
// wait, read the page markup, scrape it, advance the pagination, then signal
// the next iteration.
// A failed page is logged and passed to next(error) so the loop terminates
// instead of waiting forever for a callback that never comes.
// Returns the nightmare instance; the session is not ended here.
// NOTE(review): the loop stops as soon as paginate.next is false, i.e. before
// the page that update() just advanced to is loaded. Confirm whether the last
// page is meant to be fetched.
function scrapeEach(paginate) {
  // The "< 10" cap limits a run to the first pages.
  const hasNext = () => paginate.next && paginate.currentPage < 10;

  async.whilst(hasNext, next => {
    nightmare
      .goto(paginate.url)
      .wait(4000)
      .evaluate(() => {
        return document.body.innerHTML
      })
      .then(result => {
        scrapeImages(result);
        paginate.update();
        paginate.state();
      })
      .then(
        () => next(),
        error => {
          console.error('Search failed:', error);
          next(error);
        }
      )
      // Anything thrown while continuing the loop is logged, never left unhandled.
      .catch(error => {
        console.error('Search failed:', error);
      });
  }, err => {
    if (err) {
      // Already logged where it happened; throwing here would only surface
      // as an unhandled rejection.
      return;
    }
    console.log("finished!");
  });

  return nightmare;
}