如何使用 chrome-remote-interface 获取 iframe 内容?
how to get iframe content with chrome-remote-interface?
我正在构建一个爬虫,需要爬取 iframe 的内容,但 chrome-remote-interface 不会转储 iframe 的内容。有什么办法可以做到这一点吗?
代码
// Open a new target for `url`, then connect a protocol client to that target.
// `url` and `cb` are defined outside this snippet.
CDP.New({'url':url},(err,target) => {
// Only continue when the target was created; no error branch is visible in this snippet.
if(!err){
CDP({target},(client) => {
const {Network, Page, Runtime} = client;
Network.setUserAgentOverride({'userAgent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'});
Network.enable();
Page.enable();
Runtime.enable();
Page.navigate({url});
// When the page's load event fires, evaluate an expression in the page and pass the result to `cb`.
Page.loadEventFired(() => {
Runtime.evaluate({
//I have no idea what to do ..
expression:'document.documentElement',
// NOTE(review): the Runtime.evaluate parameter is named `returnByValue`; `returnValue` looks
// like a typo, so `result.value` below is probably undefined - confirm.
returnValue:true})
.then(({result}) => {
cb(null,{html:result.value})
// Close the target that was opened for this URL.
CDP.Close({id:target.id})
})
})
})
// NOTE(review): the snippet ends here without closing the `if` block or the CDP.New callback.
我认为这是不被允许的。你可以使用 DOM.getDocument
来获取整棵 DOM 树,不过无法将它直接转换为 HTML 字符串:
// Fetch the whole DOM tree in one call: `depth: -1` asks for the entire subtree and
// `pierce: true` also traverses into iframes and shadow roots.
// `root` is a tree of DOM nodes, not an HTML string.
const {root} = await DOM.getDocument({depth: -1, pierce: true});
使用 chrome-launcher(Chrome 启动器)
https://www.npmjs.com/package/chrome-launcher
如果您使用 chromeLauncher 并传入 '--disable-web-security'
标志,这实际上很容易做到。下面是一个设置示例。
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
/**
 * Launches Chrome through chrome-launcher with web security disabled.
 *
 * The `--disable-web-security` flag is passed so that iframes can be queried
 * from the top-level page.
 *
 * @returns {Promise<Object>} Resolves with whatever `chromeLauncher.launch()`
 *   resolves with (callers read `.port` and call `.kill()` on it).
 * @throws Rejects with the original launch error after logging it.
 */
let launchChrome = () => {
  console.log('launchChrome..');
  return chromeLauncher.launch({
    chromeFlags: [
      '--disable-web-security', // Query within iframes
    ],
    logLevel: 'error'
  }).catch(function(e) {
    console.log('Error launching chrome: ' + e);
    // Rethrow instead of resolving with `undefined`: callers dereference the
    // result immediately and would otherwise fail with an unrelated TypeError.
    throw e;
  });
}
/**
 * Launches Chrome, connects a chrome-remote-interface client to it, enables the
 * Page and Runtime domains and overrides the user agent.
 *
 * If anything fails after Chrome has been launched, the client (when one was
 * opened) is closed and the Chrome process is killed before the error is
 * rethrown, so no browser process is left behind.
 *
 * @returns {Promise<{chrome: Object, protocol: Object, Page: Object, Runtime: Object}>}
 *   The launched Chrome handle, the protocol client, and its Page and Runtime domains.
 * @throws {Error} When Chrome did not launch, or when connecting/enabling fails.
 */
let initChrome = async () => {
  console.log('initChrome..');
  const chrome = await launchChrome();
  if (!chrome) {
    // A launcher that swallowed its own error yields no handle; fail with a clear message.
    throw new Error('Chrome failed to launch');
  }
  let protocol;
  try {
    protocol = await CDP({port: chrome.port});
    const {Page, Runtime, Network} = protocol;
    const userAgent = 'Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36';
    await Promise.all([Page.enable(), Runtime.enable(), Network.setUserAgentOverride({userAgent})]);
    return {chrome, protocol, Page, Runtime};
  } catch (err) {
    // Best-effort cleanup: cleanup failures are logged so the original error is the one reported.
    try {
      if (protocol) {
        await protocol.close();
      }
    } catch (closeErr) {
      console.log(closeErr);
    }
    try {
      await chrome.kill();
    } catch (killErr) {
      console.log(killErr);
    }
    throw err;
  }
}
/**
 * Example driver: starts a Chrome session, navigates to a page, waits for its
 * load event, logs the result of a `Runtime.evaluate` call, and then always
 * shuts the session down.
 *
 * Errors raised while navigating or evaluating are logged, not rethrown.
 * Cleanup is awaited, so the returned promise settles only after the protocol
 * client has been closed and Chrome has been killed.
 *
 * @returns {Promise<void>}
 * @throws Rejects if `initChrome()` fails or if closing the client / killing Chrome fails.
 */
let run = async () => {
  const {chrome, protocol, Page, Runtime} = await initChrome();
  try {
    await Page.navigate({url: 'https://www.example.com/'});
    await Page.loadEventFired();
    //////////////////////////////////
    // YOU CAN NOW QUERY IN IFRAMES //
    console.log(await Runtime.evaluate({expression: `document.querySelector('iframe')`, returnByValue: true}));
    //////////////////////////////////
    console.log('..Finished');
  } catch (err) {
    console.log(err);
  } finally {
    // Nested finally: Chrome is killed even when closing the client fails.
    try {
      await protocol.close();
    } finally {
      await chrome.kill();
    }
  }
}
我正在构建一个爬虫,需要爬取 iframe 的内容,但 chrome-remote-interface 不会转储 iframe 的内容。有什么办法可以做到这一点吗?
代码
// Open a new target for `url`, then connect a protocol client to that target.
// `url` and `cb` are defined outside this snippet.
CDP.New({'url':url},(err,target) => {
// Only continue when the target was created; no error branch is visible in this snippet.
if(!err){
CDP({target},(client) => {
const {Network, Page, Runtime} = client;
Network.setUserAgentOverride({'userAgent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'});
Network.enable();
Page.enable();
Runtime.enable();
Page.navigate({url});
// When the page's load event fires, evaluate an expression in the page and pass the result to `cb`.
Page.loadEventFired(() => {
Runtime.evaluate({
//I have no idea what to do ..
expression:'document.documentElement',
// NOTE(review): the Runtime.evaluate parameter is named `returnByValue`; `returnValue` looks
// like a typo, so `result.value` below is probably undefined - confirm.
returnValue:true})
.then(({result}) => {
cb(null,{html:result.value})
// Close the target that was opened for this URL.
CDP.Close({id:target.id})
})
})
})
// NOTE(review): the snippet ends here without closing the `if` block or the CDP.New callback.
我认为这是不被允许的。你可以使用 DOM.getDocument
来获取整棵 DOM 树,不过无法将它直接转换为 HTML 字符串:
// Fetch the whole DOM tree in one call: `depth: -1` asks for the entire subtree and
// `pierce: true` also traverses into iframes and shadow roots.
// `root` is a tree of DOM nodes, not an HTML string.
const {root} = await DOM.getDocument({depth: -1, pierce: true});
使用 chrome-launcher(Chrome 启动器)
https://www.npmjs.com/package/chrome-launcher
如果您使用 chromeLauncher 并传入 '--disable-web-security'
标志,这实际上很容易做到。下面是一个设置示例。
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
/**
 * Launches Chrome through chrome-launcher with web security disabled.
 *
 * The `--disable-web-security` flag is passed so that iframes can be queried
 * from the top-level page.
 *
 * @returns {Promise<Object>} Resolves with whatever `chromeLauncher.launch()`
 *   resolves with (callers read `.port` and call `.kill()` on it).
 * @throws Rejects with the original launch error after logging it.
 */
let launchChrome = () => {
  console.log('launchChrome..');
  return chromeLauncher.launch({
    chromeFlags: [
      '--disable-web-security', // Query within iframes
    ],
    logLevel: 'error'
  }).catch(function(e) {
    console.log('Error launching chrome: ' + e);
    // Rethrow instead of resolving with `undefined`: callers dereference the
    // result immediately and would otherwise fail with an unrelated TypeError.
    throw e;
  });
}
/**
 * Launches Chrome, connects a chrome-remote-interface client to it, enables the
 * Page and Runtime domains and overrides the user agent.
 *
 * If anything fails after Chrome has been launched, the client (when one was
 * opened) is closed and the Chrome process is killed before the error is
 * rethrown, so no browser process is left behind.
 *
 * @returns {Promise<{chrome: Object, protocol: Object, Page: Object, Runtime: Object}>}
 *   The launched Chrome handle, the protocol client, and its Page and Runtime domains.
 * @throws {Error} When Chrome did not launch, or when connecting/enabling fails.
 */
let initChrome = async () => {
  console.log('initChrome..');
  const chrome = await launchChrome();
  if (!chrome) {
    // A launcher that swallowed its own error yields no handle; fail with a clear message.
    throw new Error('Chrome failed to launch');
  }
  let protocol;
  try {
    protocol = await CDP({port: chrome.port});
    const {Page, Runtime, Network} = protocol;
    const userAgent = 'Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36';
    await Promise.all([Page.enable(), Runtime.enable(), Network.setUserAgentOverride({userAgent})]);
    return {chrome, protocol, Page, Runtime};
  } catch (err) {
    // Best-effort cleanup: cleanup failures are logged so the original error is the one reported.
    try {
      if (protocol) {
        await protocol.close();
      }
    } catch (closeErr) {
      console.log(closeErr);
    }
    try {
      await chrome.kill();
    } catch (killErr) {
      console.log(killErr);
    }
    throw err;
  }
}
/**
 * Example driver: starts a Chrome session, navigates to a page, waits for its
 * load event, logs the result of a `Runtime.evaluate` call, and then always
 * shuts the session down.
 *
 * Errors raised while navigating or evaluating are logged, not rethrown.
 * Cleanup is awaited, so the returned promise settles only after the protocol
 * client has been closed and Chrome has been killed.
 *
 * @returns {Promise<void>}
 * @throws Rejects if `initChrome()` fails or if closing the client / killing Chrome fails.
 */
let run = async () => {
  const {chrome, protocol, Page, Runtime} = await initChrome();
  try {
    await Page.navigate({url: 'https://www.example.com/'});
    await Page.loadEventFired();
    //////////////////////////////////
    // YOU CAN NOW QUERY IN IFRAMES //
    console.log(await Runtime.evaluate({expression: `document.querySelector('iframe')`, returnByValue: true}));
    //////////////////////////////////
    console.log('..Finished');
  } catch (err) {
    console.log(err);
  } finally {
    // Nested finally: Chrome is killed even when closing the client fails.
    try {
      await protocol.close();
    } finally {
      await chrome.kill();
    }
  }
}