当我在 puppeteer 中使用代理标志时,waitForSelector 返回 TimeoutError
When I use a proxy flag in puppeteer, waitForSelector returns a TimeoutError
我正在用 Puppeteer 做网络抓取。出于某种原因,当我加入代理标志时(代理是在此站点上注册帐户后生成的:https://proxy.webshare.io/proxy/list?),waitForSelector() 会抛出超时错误。不确定是什么问题,因为如果我不使用代理,则不会抛出任何错误。
const puppeteer = require('puppeteer');
const puppeteerExtra = require('puppeteer-extra');
const pluginStealth = require('puppeteer-extra-plugin-stealth');
const proxyChain = require('proxy-chain');
/**
 * Launches a headless browser behind a proxy, opens a Google events search
 * page and hands the page to getResults().
 *
 * Any error thrown along the way is caught and logged with console.log, and
 * nothing is returned explicitly.
 *
 * NOTE(review): the browser is never closed anywhere in this function.
 */
async function scrape() {
try {
// Applies a user-agent override plus several evaluateOnNewDocument scripts
// that redefine navigator properties on the given page.
const preparePageForTests = async (page) => {
const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36';
await page.setUserAgent(userAgent);
// Make navigator.webdriver report false.
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
});
// Pass the Chrome Test.
await page.evaluateOnNewDocument(() => {
// We can mock this in as much depth as we need for the test.
window.navigator.chrome = {
app: {
isInstalled: false,
},
webstore: {
onInstallStageChanged: {},
onDownloadProgress: {},
},
runtime: {
PlatformOs: {
MAC: 'mac',
WIN: 'win',
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
OPENBSD: 'openbsd',
},
PlatformArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
PlatformNaclArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
RequestUpdateCheckStatus: {
THROTTLED: 'throttled',
NO_UPDATE: 'no_update',
UPDATE_AVAILABLE: 'update_available',
},
OnInstalledReason: {
INSTALL: 'install',
UPDATE: 'update',
CHROME_UPDATE: 'chrome_update',
SHARED_MODULE_UPDATE: 'shared_module_update',
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic',
},
}
};
});
// Answer 'notifications' permission queries with Notification.permission and
// forward every other query to the original implementation.
// NOTE(review): originalQuery is invoked without a `this` binding — confirm
// that this works in the target browser.
await page.evaluateOnNewDocument(() => {
const originalQuery = window.navigator.permissions.query;
return window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
});
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5],
});
});
await page.evaluateOnNewDocument(() => {
// Overwrite the `languages` property to use a custom getter.
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
});
});
}
// Proxy address without credentials (masked in the original post).
const oldProxyUrl = 'http://##.##.##.##:####'
// NOTE(review): newProxyUrl is computed here but never passed to the launch
// arguments below; the hard-coded proxy URL is used instead.
const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);
const browser = await puppeteerExtra.launch({
headless: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--proxy-server=http://##.##.##.##:####'// tried hard-coding the proxy here; also tried the anonymized URL, i.e. `--proxy-server=${newProxyUrl}`
]
});
const page = await browser.newPage();
// presumably the proxy account credentials — TODO confirm
await page.authenticate({username: 'usernameOnWebsite', password: 'passwordOnWebsite'});
await preparePageForTests(page);
await page.goto(`https://www.google.com/search?q=concerts+near+new+york&client=safari&rls=en&uact=5&ibp=htl;events&rciv=evn&sa=X&fpstate=tldetail#htivrt=events&htidocid=L2F1dGhvcml0eS9ob3Jpem9uL2NsdXN0ZXJlZF9ldmVudC8yMDIxLTA2LTA0fDIxMjMzMzg4NTU2Nzc1NDk%3D&fpstate=tldetail`);
// NOTE(review): `results` is assigned but not used in the visible code.
const results = await getResults(page) // the error occurs inside this function
} catch(err) {
// Errors are only logged; they are not rethrown to the caller.
console.log(err)
}
}
TimeoutError 出现在 getResults() 函数的第一行,即调用 waitForSelector() 的地方。
/**
 * Waits for a `ul` element to appear on the given page.
 *
 * Any error raised while waiting (a timeout included) is logged with
 * `console.log` and not rethrown, so the returned promise resolves to
 * `undefined` in every case.
 *
 * @param {object} page - Puppeteer page to wait on.
 * @returns {Promise<void>}
 */
async function getResults(page) {
  // Timeout value passed to waitForSelector (30000 ms).
  const waitOptions = { timeout: 30000 };
  try {
    await page.waitForSelector("ul", waitOptions);
    // The original post works with the matched list after this point; the
    // timeout is raised by the wait above.
  } catch (waitError) {
    console.log(waitError);
  }
}
如果我删除代理并使用内置的 Heroku IP,一切正常。不确定这里的问题是什么。
通过 proxy-chain 使用带密码保护的代理,正确的做法如下:
// Give proxy-chain the full upstream proxy URL, credentials included.
const oldProxyUrl = 'http://bob:password123@proxy.example.com:8000';
// anonymizeProxy() starts a local forwarding proxy and resolves to its URL.
const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);
try {
  // Prints something like "http://127.0.0.1:45678"
  console.log(newProxyUrl);
  // Point the browser at the local forwarding proxy, not at the upstream one.
  const browser = await puppeteer.launch({
    args: [`--proxy-server=${newProxyUrl}`],
  });
  try {
    const page = await browser.newPage();
    // Go to a page as usual, no need to authenticate
    await page.goto('https://www.example.com');
    // ... work with `page` here, before the cleanup below runs ...
  } finally {
    // Always release the browser, even if navigation or scraping throws.
    await browser.close();
  }
} finally {
  // Shut down the local forwarding proxy and force-close its open connections.
  await proxyChain.closeAnonymizedProxy(newProxyUrl, true);
}
您需要向 proxy-chain 提供完整的代理 URL(包括用户名和密码),然后在启动 puppeteer 时使用本地转发代理 URL,例如 http://127.0.0.1:45678。
我正在用 Puppeteer 做网络抓取。出于某种原因,当我加入代理标志时(代理是在此站点上注册帐户后生成的:https://proxy.webshare.io/proxy/list?),waitForSelector() 会抛出超时错误。不确定是什么问题,因为如果我不使用代理,则不会抛出任何错误。
const puppeteer = require('puppeteer');
const puppeteerExtra = require('puppeteer-extra');
const pluginStealth = require('puppeteer-extra-plugin-stealth');
const proxyChain = require('proxy-chain');
/**
 * Launches a headless browser behind a proxy, opens a Google events search
 * page and hands the page to getResults().
 *
 * Any error thrown along the way is caught and logged with console.log, and
 * nothing is returned explicitly.
 *
 * NOTE(review): the browser is never closed anywhere in this function.
 */
async function scrape() {
try {
// Applies a user-agent override plus several evaluateOnNewDocument scripts
// that redefine navigator properties on the given page.
const preparePageForTests = async (page) => {
const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36';
await page.setUserAgent(userAgent);
// Make navigator.webdriver report false.
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
});
// Pass the Chrome Test.
await page.evaluateOnNewDocument(() => {
// We can mock this in as much depth as we need for the test.
window.navigator.chrome = {
app: {
isInstalled: false,
},
webstore: {
onInstallStageChanged: {},
onDownloadProgress: {},
},
runtime: {
PlatformOs: {
MAC: 'mac',
WIN: 'win',
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
OPENBSD: 'openbsd',
},
PlatformArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
PlatformNaclArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
RequestUpdateCheckStatus: {
THROTTLED: 'throttled',
NO_UPDATE: 'no_update',
UPDATE_AVAILABLE: 'update_available',
},
OnInstalledReason: {
INSTALL: 'install',
UPDATE: 'update',
CHROME_UPDATE: 'chrome_update',
SHARED_MODULE_UPDATE: 'shared_module_update',
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic',
},
}
};
});
// Answer 'notifications' permission queries with Notification.permission and
// forward every other query to the original implementation.
// NOTE(review): originalQuery is invoked without a `this` binding — confirm
// that this works in the target browser.
await page.evaluateOnNewDocument(() => {
const originalQuery = window.navigator.permissions.query;
return window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
});
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5],
});
});
await page.evaluateOnNewDocument(() => {
// Overwrite the `languages` property to use a custom getter.
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
});
});
}
// Proxy address without credentials (masked in the original post).
const oldProxyUrl = 'http://##.##.##.##:####'
// NOTE(review): newProxyUrl is computed here but never passed to the launch
// arguments below; the hard-coded proxy URL is used instead.
const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);
const browser = await puppeteerExtra.launch({
headless: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--proxy-server=http://##.##.##.##:####'// tried hard-coding the proxy here; also tried the anonymized URL, i.e. `--proxy-server=${newProxyUrl}`
]
});
const page = await browser.newPage();
// presumably the proxy account credentials — TODO confirm
await page.authenticate({username: 'usernameOnWebsite', password: 'passwordOnWebsite'});
await preparePageForTests(page);
await page.goto(`https://www.google.com/search?q=concerts+near+new+york&client=safari&rls=en&uact=5&ibp=htl;events&rciv=evn&sa=X&fpstate=tldetail#htivrt=events&htidocid=L2F1dGhvcml0eS9ob3Jpem9uL2NsdXN0ZXJlZF9ldmVudC8yMDIxLTA2LTA0fDIxMjMzMzg4NTU2Nzc1NDk%3D&fpstate=tldetail`);
// NOTE(review): `results` is assigned but not used in the visible code.
const results = await getResults(page) // the error occurs inside this function
} catch(err) {
// Errors are only logged; they are not rethrown to the caller.
console.log(err)
}
}
TimeoutError 出现在 getResults() 函数的第一行,即调用 waitForSelector() 的地方。
/**
 * Waits for a `ul` element to appear on the given page.
 *
 * Any error raised while waiting (a timeout included) is logged with
 * `console.log` and not rethrown, so the returned promise resolves to
 * `undefined` in every case.
 *
 * @param {object} page - Puppeteer page to wait on.
 * @returns {Promise<void>}
 */
async function getResults(page) {
  // Timeout value passed to waitForSelector (30000 ms).
  const waitOptions = { timeout: 30000 };
  try {
    await page.waitForSelector("ul", waitOptions);
    // The original post works with the matched list after this point; the
    // timeout is raised by the wait above.
  } catch (waitError) {
    console.log(waitError);
  }
}
如果我删除代理并使用内置的 Heroku IP,一切正常。不确定这里的问题是什么。
通过 proxy-chain 使用带密码保护的代理,正确的做法如下:
// Give proxy-chain the full upstream proxy URL, credentials included.
const oldProxyUrl = 'http://bob:password123@proxy.example.com:8000';
// anonymizeProxy() starts a local forwarding proxy and resolves to its URL.
const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);
try {
  // Prints something like "http://127.0.0.1:45678"
  console.log(newProxyUrl);
  // Point the browser at the local forwarding proxy, not at the upstream one.
  const browser = await puppeteer.launch({
    args: [`--proxy-server=${newProxyUrl}`],
  });
  try {
    const page = await browser.newPage();
    // Go to a page as usual, no need to authenticate
    await page.goto('https://www.example.com');
    // ... work with `page` here, before the cleanup below runs ...
  } finally {
    // Always release the browser, even if navigation or scraping throws.
    await browser.close();
  }
} finally {
  // Shut down the local forwarding proxy and force-close its open connections.
  await proxyChain.closeAnonymizedProxy(newProxyUrl, true);
}
您需要向 proxy-chain 提供完整的代理 URL(包括用户名和密码),然后在启动 puppeteer 时使用本地转发代理 URL,例如 http://127.0.0.1:45678。