Nightmare.js 网络抓取在服务器上不工作
Nightmare.js web-scraping not working on server
对于我的(开源)Node.js 项目 bundeszirkus.de,我需要实现一个能够搜索并下载 URL 的网络抓取器。目前,我是这样使用 Nightmare.js 的(链接到完整文件):
exports.scrape = function(cb) {
_callback = cb
_downloadedLinks = 0
let nightmare = new Nightmare({ show: false })
const url = 'https://www.bundestag.de/services/opendata'
// we request nightmare to browse to the bundestag.de url and extract the whole inner html
nightmare
.goto(url)
.wait('body')
.evaluate(() => document.querySelector('body').innerHTML)
.end()
.then(response => {
_downloadedLinks = 0
let validLinks = extractLinks(response)
_foundLinks = validLinks.length
logger.info("[scraper] found " + validLinks.length + " valid links.")
if(validLinks.length > 0){
validLinks.forEach(href => {
downloadFileFromHref(BT_LINK + href)
});
} else {
logger.info("[scraper] did not download any files.")
_callback()
}
}).catch(err => {
logger.info("[scraper] did not download any files.")
_callback()
});
// Extracting the links we need
let extractLinks = html => {
data = [];
const $ = cheerio.load(html);
$('.bt-link-dokument').each(function() {
data.push(this.attribs.href);
});
return data.filter(checkDocumentLink)
}
}
在我的本地计算机上运行时,这工作得很好。但是,在我的 Ubuntu 服务器 (AWS) 上运行它时似乎出现了问题。我读到这是由于我的服务器上没有可用的图形界面造成的,所以我正在尝试借助 Xvfb 来运行它。
这是我的 ecosystem.config.js 文件。
运行 `pm2 ls` 时,可以看到 Xvfb 和我的服务器都在运行:
ubuntu@ip-XXX-XX-XX-XXX:~/bundeszirkus-server/current$ pm2 ls
┌─────────────────────┬────┬─────────┬──────┬───────┬────────┬─────────┬────────┬─────┬────────────┬────────┬──────────┐
│ App name │ id │ version │ mode │ pid │ status │ restart │ uptime │ cpu │ mem │ user │ watching │
├─────────────────────┼────┼─────────┼──────┼───────┼────────┼─────────┼────────┼─────┼────────────┼────────┼──────────┤
│ Xvfb │ 1 │ N/A │ fork │ 26063 │ online │ 6 │ 14m │ 0% │ 17.5 MB │ ubuntu │ disabled │
│ bundeszirkus-server │ 0 │ 1.0.0 │ fork │ 26057 │ online │ 6 │ 14m │ 0% │ 246.4 MB │ ubuntu │ disabled │
└─────────────────────┴────┴─────────┴──────┴───────┴────────┴─────────┴────────┴─────┴────────────┴────────┴──────────┘
Use `pm2 show <id|name>` to get more details about an app
据我所知,一切似乎都已正确设置,但来自服务器的日志条目显示,没有正在下载的文件:
{"message":"Starting server!","level":"info","timestamp":"2020-01-11 11:56:38"}
{"message":"Starting initial scraping.","level":"info","timestamp":"2020-01-11 11:56:38"}
{"message":"[scraper] found 0 valid links.","level":"info","timestamp":"2020-01-11 11:56:42"}
{"message":"[scraper] did not download any files.","level":"info","timestamp":"2020-01-11 11:56:42"}
{"message":"Loading data.","level":"info","timestamp":"2020-01-11 11:56:42"}
{"message":"[loader] loading data ...","level":"info","timestamp":"2020-01-11 11:56:42"}
而在我的本地 (Ubuntu) 机器上运行时,它可以正常工作:
{"message":"Starting server!","level":"info","timestamp":"2020-01-11 12:52:47"}
{"message":"Starting initial scraping.","level":"info","timestamp":"2020-01-11 12:52:47"}
{"message":"[scraper] found 5 valid links.","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19138-data.xml from href: http://www.bundestag.de/resource/blob/674998/86249f57e79b8308e820d6581e7e2a95/19138-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19136-data.xml from href: http://www.bundestag.de/resource/blob/674328/0e9d258d50d08923fe6d6ad1381bdb3f/19136-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19137-data.xml from href: http://www.bundestag.de/resource/blob/674730/2bc751b619488227c9267e3cbe12c4c3/19137-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19135-data.xml from href: http://www.bundestag.de/resource/blob/673576/147b80c74d6d681833568cfcf36f9670/19135-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19134-data.xml from href: http://www.bundestag.de/resource/blob/673116/982f9d0ec845b85bddd289ede4a589fd/19134-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] finished downloading all 5 files.","level":"info","timestamp":"2020-01-11 12:52:51"}
{"message":"Loading data.","level":"info","timestamp":"2020-01-11 12:52:51"}
我对如何寻找丢失的部分有点迷茫。非常感谢任何帮助!
执行以下操作后现在可以使用了:
- 像下面这样将 xvfb 添加到代码中:
// Start an Xvfb instance before the scraping step and stop it afterwards.
let xvfb = new Xvfb();
try {
xvfb.startSync();
}
catch (e) {
// A failed start is only logged; execution still continues to the scraping step.
console.log(e);
}
// scraping
// NOTE(review): if the scraping step is asynchronous, stopSync() below would run
// before it finishes — confirm where this call belongs in the real code.
xvfb.stopSync();
- 将此行:
.wait('body')
改为 .wait(2000)。
对于我的(开源)Node.js 项目 bundeszirkus.de,我需要实现一个能够搜索并下载 URL 的网络抓取器。目前,我是这样使用 Nightmare.js 的(链接到完整文件):
exports.scrape = function(cb) {
_callback = cb
_downloadedLinks = 0
let nightmare = new Nightmare({ show: false })
const url = 'https://www.bundestag.de/services/opendata'
// we request nightmare to browse to the bundestag.de url and extract the whole inner html
nightmare
.goto(url)
.wait('body')
.evaluate(() => document.querySelector('body').innerHTML)
.end()
.then(response => {
_downloadedLinks = 0
let validLinks = extractLinks(response)
_foundLinks = validLinks.length
logger.info("[scraper] found " + validLinks.length + " valid links.")
if(validLinks.length > 0){
validLinks.forEach(href => {
downloadFileFromHref(BT_LINK + href)
});
} else {
logger.info("[scraper] did not download any files.")
_callback()
}
}).catch(err => {
logger.info("[scraper] did not download any files.")
_callback()
});
// Extracting the links we need
let extractLinks = html => {
data = [];
const $ = cheerio.load(html);
$('.bt-link-dokument').each(function() {
data.push(this.attribs.href);
});
return data.filter(checkDocumentLink)
}
}
在我的本地计算机上运行时,这工作得很好。但是,在我的 Ubuntu 服务器 (AWS) 上运行它时似乎出现了问题。我读到这是由于我的服务器上没有可用的图形界面造成的,所以我正在尝试借助 Xvfb 来运行它。
这是我的 ecosystem.config.js 文件。
运行 `pm2 ls` 时,可以看到 Xvfb 和我的服务器都在运行:
ubuntu@ip-XXX-XX-XX-XXX:~/bundeszirkus-server/current$ pm2 ls
┌─────────────────────┬────┬─────────┬──────┬───────┬────────┬─────────┬────────┬─────┬────────────┬────────┬──────────┐
│ App name │ id │ version │ mode │ pid │ status │ restart │ uptime │ cpu │ mem │ user │ watching │
├─────────────────────┼────┼─────────┼──────┼───────┼────────┼─────────┼────────┼─────┼────────────┼────────┼──────────┤
│ Xvfb │ 1 │ N/A │ fork │ 26063 │ online │ 6 │ 14m │ 0% │ 17.5 MB │ ubuntu │ disabled │
│ bundeszirkus-server │ 0 │ 1.0.0 │ fork │ 26057 │ online │ 6 │ 14m │ 0% │ 246.4 MB │ ubuntu │ disabled │
└─────────────────────┴────┴─────────┴──────┴───────┴────────┴─────────┴────────┴─────┴────────────┴────────┴──────────┘
Use `pm2 show <id|name>` to get more details about an app
据我所知,一切似乎都已正确设置,但来自服务器的日志条目显示,没有正在下载的文件:
{"message":"Starting server!","level":"info","timestamp":"2020-01-11 11:56:38"}
{"message":"Starting initial scraping.","level":"info","timestamp":"2020-01-11 11:56:38"}
{"message":"[scraper] found 0 valid links.","level":"info","timestamp":"2020-01-11 11:56:42"}
{"message":"[scraper] did not download any files.","level":"info","timestamp":"2020-01-11 11:56:42"}
{"message":"Loading data.","level":"info","timestamp":"2020-01-11 11:56:42"}
{"message":"[loader] loading data ...","level":"info","timestamp":"2020-01-11 11:56:42"}
而在我的本地 (Ubuntu) 机器上运行时,它可以正常工作:
{"message":"Starting server!","level":"info","timestamp":"2020-01-11 12:52:47"}
{"message":"Starting initial scraping.","level":"info","timestamp":"2020-01-11 12:52:47"}
{"message":"[scraper] found 5 valid links.","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19138-data.xml from href: http://www.bundestag.de/resource/blob/674998/86249f57e79b8308e820d6581e7e2a95/19138-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19136-data.xml from href: http://www.bundestag.de/resource/blob/674328/0e9d258d50d08923fe6d6ad1381bdb3f/19136-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19137-data.xml from href: http://www.bundestag.de/resource/blob/674730/2bc751b619488227c9267e3cbe12c4c3/19137-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19135-data.xml from href: http://www.bundestag.de/resource/blob/673576/147b80c74d6d681833568cfcf36f9670/19135-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] downloading file: 19134-data.xml from href: http://www.bundestag.de/resource/blob/673116/982f9d0ec845b85bddd289ede4a589fd/19134-data.xml","level":"info","timestamp":"2020-01-11 12:52:49"}
{"message":"[scraper] finished downloading all 5 files.","level":"info","timestamp":"2020-01-11 12:52:51"}
{"message":"Loading data.","level":"info","timestamp":"2020-01-11 12:52:51"}
我对如何寻找丢失的部分有点迷茫。非常感谢任何帮助!
执行以下操作后现在可以使用了:
- 像下面这样将 xvfb 添加到代码中:
// Start an Xvfb instance before the scraping step and stop it afterwards.
let xvfb = new Xvfb();
try {
xvfb.startSync();
}
catch (e) {
// A failed start is only logged; execution still continues to the scraping step.
console.log(e);
}
// scraping
// NOTE(review): if the scraping step is asynchronous, stopSync() below would run
// before it finishes — confirm where this call belongs in the real code.
xvfb.stopSync();
- 将此行:
.wait('body')
改为 .wait(2000)。