如何用 curl 抓取需要先点击同意按钮的页面?
How do I curl a page that needs first clicking a consent button?
我想用 curl 抓取类似这样的 URL。(如果您没有该站点的 cookie,在欧盟国家/地区访问该 URL 时需要先表示同意。)
我拼凑了一个执行此操作的 puppeteer 脚本,但在我看来它非常笨重且脆弱。有更好的解决方案吗?
#!/usr/bin/env node
// Prints the HTML of a page that sits behind a consent dialog.
//
// Usage:  <script> URL
// Env:    cfTimeout - seconds to wait after submitting the consent form
//                     before the page content is captured (default: 20).
//
// The script opens URL in a headless browser, clicks the first item of the
// consent list, submits the consent form, waits, and writes the resulting
// document to stdout. Exit status is non-zero on any failure.
const puppeteer = require('puppeteer');

const url = process.argv[2];

// Selectors of the two consent-dialog elements that have to be clicked.
const CONSENT_ITEM_SELECTOR =
  '.con-wizard > .wizard-body > #consent-text > .content-list > .list-item:nth-child(1)';
const CONSENT_SUBMIT_SELECTOR =
  '.con-wizard > .wizard-body > .actions > .consent-form > .primary';

const DEFAULT_WAIT_SECONDS = 20;

// Plain timer-based delay; `page.waitFor` is deprecated and has been removed
// from newer Puppeteer releases, so it is not relied on here.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Reads the post-consent delay (in seconds) from the environment, falling
// back to the default when the variable is unset, empty, or not a valid
// non-negative number.
const readWaitSeconds = () => {
  const seconds = Number(process.env.cfTimeout || DEFAULT_WAIT_SECONDS);
  return Number.isFinite(seconds) && seconds >= 0 ? seconds : DEFAULT_WAIT_SECONDS;
};

(async () => {
  if (!url) {
    console.error('usage: pass the URL to fetch as the first argument');
    process.exitCode = 2;
    return;
  }

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    await page.waitForSelector(CONSENT_ITEM_SELECTOR);
    await page.click(CONSENT_ITEM_SELECTOR);

    await page.waitForSelector(CONSENT_SUBMIT_SELECTOR);
    await page.click(CONSENT_SUBMIT_SELECTOR);

    // Give the post-consent redirect time to load the real page.
    await sleep(readWaitSeconds() * 1000);

    const html = await page.content();
    console.log(html);
  } finally {
    // Always shut the browser down, even when a step above throws;
    // otherwise the headless browser process is left running.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
我通过监视 Chrome 的网络窗格中的请求,使用普通 curl 成功地做到了这一点:
# techcrunch-curl URL
#
# Writes the body of URL to stdout. If fetching URL ends up on the Yahoo
# consent page, the consent form is submitted first (replaying the request a
# browser sends) and the page reached after that is printed instead.
#
# zsh only: relies on the `$match` array filled by `=~` and on `=()`
# process substitution.
# Requires `url-encode.py` on PATH (external helper, not shown here;
# presumably percent-encodes its stdin - verify).
function techcrunch-curl() {
  local url="${1:?}"

  # Probe where URL ends up. --location is required: without it curl does not
  # follow redirects and %{url_effective} is simply the URL that was passed in,
  # so the consent branch below could never be taken. The body is discarded.
  local con="$(curl --silent --show-error --location -o /dev/null -w '%{url_effective}' "$url")"

  if [[ "$con" =~ 'https://consent\.yahoo\.com/v2/collectConsent\?sessionId=(.*)' ]] ; then
    # Everything after "sessionId=" in the consent URL.
    local sid="${match[1]}"

    # Submit the consent form and follow the redirect back to the original
    # page. `--cookie-jar =()` points the cookie jar at a temporary file
    # supplied by zsh; naming a jar makes curl keep the cookies it receives
    # while following redirects within this invocation.
    curl -o /dev/stdout --fail --location --cookie-jar =() 'https://consent.yahoo.com/v2/collectConsent?sessionId='"$sid" \
      -H 'Connection: keep-alive' \
      -H 'Pragma: no-cache' \
      -H 'Cache-Control: no-cache' \
      -H 'Origin: https://consent.yahoo.com' \
      -H 'Upgrade-Insecure-Requests: 1' \
      -H 'DNT: 1' \
      -H 'Content-Type: application/x-www-form-urlencoded' \
      -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36' \
      -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
      -H 'Sec-Fetch-Site: same-origin' \
      -H 'Sec-Fetch-Mode: navigate' \
      -H 'Sec-Fetch-User: ?1' \
      -H 'Sec-Fetch-Dest: document' \
      -H 'Referer: https://consent.yahoo.com/v2/collectConsent?sessionId='"$sid" \
      -H 'Accept-Language: en-US,en;q=0.9' \
      --data-raw 'sessionId='"$sid"'&originalDoneUrl='"$(<<<"$url" url-encode.py)"'&namespace=techcrunch&agree=agree&agree=agree' \
      --compressed
  else
    # No consent wall in the way: fetch the page directly.
    curl "$url"
  fi
}
我想用 curl 抓取类似这样的 URL。(如果您没有该站点的 cookie,在欧盟国家/地区访问该 URL 时需要先表示同意。)
我拼凑了一个执行此操作的 puppeteer 脚本,但在我看来它非常笨重且脆弱。有更好的解决方案吗?
#!/usr/bin/env node
// Prints the HTML of a page that sits behind a consent dialog.
//
// Usage:  <script> URL
// Env:    cfTimeout - seconds to wait after submitting the consent form
//                     before the page content is captured (default: 20).
//
// The script opens URL in a headless browser, clicks the first item of the
// consent list, submits the consent form, waits, and writes the resulting
// document to stdout. Exit status is non-zero on any failure.
const puppeteer = require('puppeteer');

const url = process.argv[2];

// Selectors of the two consent-dialog elements that have to be clicked.
const CONSENT_ITEM_SELECTOR =
  '.con-wizard > .wizard-body > #consent-text > .content-list > .list-item:nth-child(1)';
const CONSENT_SUBMIT_SELECTOR =
  '.con-wizard > .wizard-body > .actions > .consent-form > .primary';

const DEFAULT_WAIT_SECONDS = 20;

// Plain timer-based delay; `page.waitFor` is deprecated and has been removed
// from newer Puppeteer releases, so it is not relied on here.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Reads the post-consent delay (in seconds) from the environment, falling
// back to the default when the variable is unset, empty, or not a valid
// non-negative number.
const readWaitSeconds = () => {
  const seconds = Number(process.env.cfTimeout || DEFAULT_WAIT_SECONDS);
  return Number.isFinite(seconds) && seconds >= 0 ? seconds : DEFAULT_WAIT_SECONDS;
};

(async () => {
  if (!url) {
    console.error('usage: pass the URL to fetch as the first argument');
    process.exitCode = 2;
    return;
  }

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    await page.waitForSelector(CONSENT_ITEM_SELECTOR);
    await page.click(CONSENT_ITEM_SELECTOR);

    await page.waitForSelector(CONSENT_SUBMIT_SELECTOR);
    await page.click(CONSENT_SUBMIT_SELECTOR);

    // Give the post-consent redirect time to load the real page.
    await sleep(readWaitSeconds() * 1000);

    const html = await page.content();
    console.log(html);
  } finally {
    // Always shut the browser down, even when a step above throws;
    // otherwise the headless browser process is left running.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
我通过监视 Chrome 的网络窗格中的请求,使用普通 curl 成功地做到了这一点:
# techcrunch-curl URL
#
# Writes the body of URL to stdout. If fetching URL ends up on the Yahoo
# consent page, the consent form is submitted first (replaying the request a
# browser sends) and the page reached after that is printed instead.
#
# zsh only: relies on the `$match` array filled by `=~` and on `=()`
# process substitution.
# Requires `url-encode.py` on PATH (external helper, not shown here;
# presumably percent-encodes its stdin - verify).
function techcrunch-curl() {
  local url="${1:?}"

  # Probe where URL ends up. --location is required: without it curl does not
  # follow redirects and %{url_effective} is simply the URL that was passed in,
  # so the consent branch below could never be taken. The body is discarded.
  local con="$(curl --silent --show-error --location -o /dev/null -w '%{url_effective}' "$url")"

  if [[ "$con" =~ 'https://consent\.yahoo\.com/v2/collectConsent\?sessionId=(.*)' ]] ; then
    # Everything after "sessionId=" in the consent URL.
    local sid="${match[1]}"

    # Submit the consent form and follow the redirect back to the original
    # page. `--cookie-jar =()` points the cookie jar at a temporary file
    # supplied by zsh; naming a jar makes curl keep the cookies it receives
    # while following redirects within this invocation.
    curl -o /dev/stdout --fail --location --cookie-jar =() 'https://consent.yahoo.com/v2/collectConsent?sessionId='"$sid" \
      -H 'Connection: keep-alive' \
      -H 'Pragma: no-cache' \
      -H 'Cache-Control: no-cache' \
      -H 'Origin: https://consent.yahoo.com' \
      -H 'Upgrade-Insecure-Requests: 1' \
      -H 'DNT: 1' \
      -H 'Content-Type: application/x-www-form-urlencoded' \
      -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36' \
      -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
      -H 'Sec-Fetch-Site: same-origin' \
      -H 'Sec-Fetch-Mode: navigate' \
      -H 'Sec-Fetch-User: ?1' \
      -H 'Sec-Fetch-Dest: document' \
      -H 'Referer: https://consent.yahoo.com/v2/collectConsent?sessionId='"$sid" \
      -H 'Accept-Language: en-US,en;q=0.9' \
      --data-raw 'sessionId='"$sid"'&originalDoneUrl='"$(<<<"$url" url-encode.py)"'&namespace=techcrunch&agree=agree&agree=agree' \
      --compressed
  else
    # No consent wall in the way: fetch the page directly.
    curl "$url"
  fi
}