如何使用 chrome-remote-interface 节点 js 获取多个 DOM 元素?
How to get multiple DOM elements with chrome-remote-interface node js?
我只是想用 chrome-remote-interface 构建一个爬虫,但我不知道如何获取多个 dom 元素,例如特定目标 id,类。
例如:
price = document.getelementbyid('price')
name= document.getelementbyid('name')
代码
const CDP = require('chrome-remote-interface');
CDP((client) => {
// Extract used DevTools domains.
const {Page, Runtime} = client;
// Enable events on domains we are interested in.
Promise.all([
Page.enable()
]).then(() => {
return Page.navigate({url: 'http://example.com'})
});
// Evaluate outerHTML after page has loaded.
Page.loadEventFired(() => {
Runtime.evaluate({expression: 'document.body.outerHTML'}).then((result) => {
//How to get Multiple Dom elements
console.log(result.result.value);
client.close();
});
});
}).on('error', (err) => {
console.error('Cannot connect to browser:', err);
});
更新
const CDP = require('chrome-remote-interface');
CDP((client) => {
// Extract used DevTools domains.
const {DOM,Page, Runtime} = client;
// Enable events on domains we are interested in.
Promise.all([
Page.enable()
]).then(() => {
return Page.navigate({url: 'https://someDomain.com'});
})
Page.loadEventFired(() => {
const expression = `({
test: document.getElementsByClassName('rows')),
})`
Runtime.evaluate({expression,returnByValue: true}).then((result) => {
console.log(result.result) // Error
client.close()
})
})
}).on('error', (err) => {
console.error('Cannot connect to browser:', err);
});
错误
{ type: 'object',
subtype: 'error',
className: 'SyntaxError',
description: 'SyntaxError: Unexpected token )',
objectId: '{"injectedScriptId":14,"id":1}' }
其实我想遍历元素列表但是我不知道哪里错了
您不能将 DOM 对象从浏览器上下文移动到 Node.js 上下文,您所能做的就是传递 属性 或任何可以被视为 JSON 的内容目的。在这里,我假设您对计算的 HTML.
感兴趣
一个可能的解决方案是:
const CDP = require('chrome-remote-interface');
CDP((client) => {
// Extract used DevTools domains.
const {Page, Runtime} = client;
// Enable events on domains we are interested in.
Promise.all([
Page.enable()
]).then(() => {
return Page.navigate({url: 'http://example.com'});
});
// Evaluate outerHTML after page has loaded.
Page.loadEventFired(() => {
const expression = `({
name: document.getElementById('name').outerHTML,
price: document.getElementById('price').outerHTML
})`;
Runtime.evaluate({
expression,
returnByValue: true
}).then(({result}) => {
const {name, price} = result.value;
console.log(`name: ${name}`);
console.log(`price: ${price}`);
client.close();
});
});
}).on('error', (err) => {
console.error('Cannot connect to browser:', err);
});
关键点是使用 returnByValue: true
.
返回一个 JSON 对象
更新: 你的表达式有一个错误,...('rows')),
中的尾部 )
。但即使你修复了它,你仍然会陷入错误的境地,因为你试图传递一个 DOM 对象的数组(参见这个答案的第一段)。同样,如果你只想要外部 HTML 你可以这样做:
// Evaluate outerHTML after page has loaded.
Page.loadEventFired(() => {
const expression = `
// fetch an array-like of DOM elements
var elements = document.getElementsByTagName('p');
// create and return an array containing
// just a property (in this case `outerHTML`)
Array.prototype.map.call(elements, x => x.outerHTML);
`;
Runtime.evaluate({
expression,
returnByValue: true
}).then(({result}) => {
// this is the returned array
const elements = result.value;
elements.forEach((html) => {
console.log(`- ${html}`);
});
client.close();
});
});
我只是想用 chrome-remote-interface 构建一个爬虫,但我不知道如何获取多个 dom 元素,例如特定目标 id,类。
例如:
price = document.getelementbyid('price')
name= document.getelementbyid('name')
代码
const CDP = require('chrome-remote-interface');
CDP((client) => {
// Extract used DevTools domains.
const {Page, Runtime} = client;
// Enable events on domains we are interested in.
Promise.all([
Page.enable()
]).then(() => {
return Page.navigate({url: 'http://example.com'})
});
// Evaluate outerHTML after page has loaded.
Page.loadEventFired(() => {
Runtime.evaluate({expression: 'document.body.outerHTML'}).then((result) => {
//How to get Multiple Dom elements
console.log(result.result.value);
client.close();
});
});
}).on('error', (err) => {
console.error('Cannot connect to browser:', err);
});
更新
const CDP = require('chrome-remote-interface');
CDP((client) => {
// Extract used DevTools domains.
const {DOM,Page, Runtime} = client;
// Enable events on domains we are interested in.
Promise.all([
Page.enable()
]).then(() => {
return Page.navigate({url: 'https://someDomain.com'});
})
Page.loadEventFired(() => {
const expression = `({
test: document.getElementsByClassName('rows')),
})`
Runtime.evaluate({expression,returnByValue: true}).then((result) => {
console.log(result.result) // Error
client.close()
})
})
}).on('error', (err) => {
console.error('Cannot connect to browser:', err);
});
错误
{ type: 'object',
subtype: 'error',
className: 'SyntaxError',
description: 'SyntaxError: Unexpected token )',
objectId: '{"injectedScriptId":14,"id":1}' }
其实我想遍历元素列表但是我不知道哪里错了
您不能将 DOM 对象从浏览器上下文移动到 Node.js 上下文,您所能做的就是传递 属性 或任何可以被视为 JSON 的内容目的。在这里,我假设您对计算的 HTML.
感兴趣一个可能的解决方案是:
const CDP = require('chrome-remote-interface');
CDP((client) => {
// Extract used DevTools domains.
const {Page, Runtime} = client;
// Enable events on domains we are interested in.
Promise.all([
Page.enable()
]).then(() => {
return Page.navigate({url: 'http://example.com'});
});
// Evaluate outerHTML after page has loaded.
Page.loadEventFired(() => {
const expression = `({
name: document.getElementById('name').outerHTML,
price: document.getElementById('price').outerHTML
})`;
Runtime.evaluate({
expression,
returnByValue: true
}).then(({result}) => {
const {name, price} = result.value;
console.log(`name: ${name}`);
console.log(`price: ${price}`);
client.close();
});
});
}).on('error', (err) => {
console.error('Cannot connect to browser:', err);
});
关键点是使用 returnByValue: true
.
更新: 你的表达式有一个错误,...('rows')),
中的尾部 )
。但即使你修复了它,你仍然会陷入错误的境地,因为你试图传递一个 DOM 对象的数组(参见这个答案的第一段)。同样,如果你只想要外部 HTML 你可以这样做:
// Evaluate outerHTML after page has loaded.
Page.loadEventFired(() => {
const expression = `
// fetch an array-like of DOM elements
var elements = document.getElementsByTagName('p');
// create and return an array containing
// just a property (in this case `outerHTML`)
Array.prototype.map.call(elements, x => x.outerHTML);
`;
Runtime.evaluate({
expression,
returnByValue: true
}).then(({result}) => {
// this is the returned array
const elements = result.value;
elements.forEach((html) => {
console.log(`- ${html}`);
});
client.close();
});
});