我正在尝试从网站上抓取数据,但取回的只是 body 中带有 JS 函数的基本 HTML
I'm trying to scrape data from a website, but I'm getting back only basic HTML with JS functions in the body
大家好,
我正在摆弄 Node.js 和 cheerio 包,作为我学习 Node.js 的一部分。我正在尝试构建一个网络抓取器,用来从购物网站抓取数据,但是当我尝试 console.log html 变量时,它返回的是一个基本的 HTML 结构,其中包含一些试图防止抓取的 JS 函数。
我的代码:
// Fetch a product page with needle, print the raw response body, and
// re-serve that same body on port 3000 so it can be opened in a browser.
const needle = require('needle')
const http = require('http')
const cheerio = require("cheerio")

needle.get('https://ksp.co.il/web/item/130984', (error, response, html) => {
  // Report failures instead of finishing silently with no output.
  if (error) {
    console.error('Request failed:', error)
    return
  }
  if (response.statusCode !== 200) {
    console.error(`Unexpected status code: ${response.statusCode}`)
    return
  }

  // Parsed document handle for cheerio queries; not queried yet in this script.
  const $ = cheerio.load(html)
  console.log(html)

  // Serve the fetched markup unchanged to every incoming request.
  http.createServer((req, res) => {
    res.writeHead(200, {'Content-Type': 'text/html'})
    res.write(html)
    res.end()
  }).listen(3000)
})
我猜这是某种保护层,但这是我得到的结果:
<html lang="he">
<head>
<meta charset="utf-8" />
<link rel="icon" id="header-icon" href="/web/favicon.ico">
<link rel="canonical" id="header-canonical">
<meta name="viewport" content="width=device-width,initial-scale=1" />
<meta name="description" content="מעל 38,000 מוצרים: מחשבים סלולר, בשמים, למטבח, למשרד טיפוח, פארם, צעצועים, נעלים ומיזוג" />
<link rel="manifest" href="/web/manifest.json" />
<script src='https://ksp.co.il/_cache/dictionary_site_only/all.js?ts=1640275687'></script>
<script src="/web/encoding.min.js"></script>
<script>
! function() {
if ("function" == typeof window.CustomEvent) return !1;
function t(t, e) {
e = e || {
bubbles: !1,
cancelable: !1,
detail: void 0
};
var n = document.createEvent("CustomEvent");
return n.initCustomEvent(t, e.bubbles, e.cancelable, e.detail), n
}
t.prototype = window.Event.prototype, window.CustomEvent = t
}()
</script>
<script src='https://ksp.co.il/_cache/menu_dev/menu.js?ts=1640280140'></script>
<style>
.lang_he svg.revert {
transform: scaleX(-1)
}
* {
box-sizing: border-box
}
a:visited {
color: inherit
}
</style>
<title>KSP</title>
<link rel="preload" href="/web/fonts/Assistant/w300en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w400en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w600en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w700en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w800en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w300he.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w400he.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w600he.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w700he.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w800he.woff2" as="font" crossorigin="anonymous">
<link rel="stylesheet" href="/web/fonts/Assistant/index.css">
<script>
! function(e, t, a, n, g) {
e[n] = e[n] || [], e[n].push({
"gtm.start": (new Date).getTime(),
event: "gtm.js"
});
var m = t.getElementsByTagName(a)[0],
r = t.createElement(a);
r.async = !0, r.src = "https://www.googletagmanager.com/gtm.js?id=GTM-59D9ZCV", m.parentNode.insertBefore(r, m)
}(window, document, "script", "dataLayer")
</script>
<link href="/web/static/css/2.f825bd5a.chunk.css" rel="stylesheet">
<link href="/web/static/css/main.4e4460ee.chunk.css" rel="stylesheet">
</head>
<body style="overflow-x:hidden"><noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-59D9ZCV" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript><noscript>You need to enable JavaScript to run this app.</noscript>
<div>
<div id="root"></div>
</div>
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-109261-1"></script>
<script async src="https://www.googletagmanager.com/gtag/js?id=AW-1032006858"></script>
<script>
function gtag() {
dataLayer.push(arguments)
}
window.dataLayer = window.dataLayer || [], gtag("js", new Date)
</script>
<script>
! function(e, t, n, c, o, a, f) {
e.fbq || (o = e.fbq = function() {
o.callMethod ? o.callMethod.apply(o, arguments) : o.queue.push(arguments)
}, e._fbq || (e._fbq = o), o.push = o, o.loaded = !0, o.version = "2.0", o.queue = [], (a = t.createElement(n)).async = !0, a.src = "https://connect.facebook.net/en_US/fbevents.js", (f = t.getElementsByTagName(n)[0]).parentNode.insertBefore(a, f))
}(window, document, "script"), fbq("init", "1179615532183839"), fbq("track", "PageView")
</script><noscript><img height="1" width="1" style="display:none" src="https://www.facebook.com/tr?id=1179615532183839&ev=PageView&noscript=1" /></noscript>
<script>
! function(e) {
function r(r) {
for (var n, l, f = r[0], i = r[1], a = r[2], p = 0, s = []; p < f.length; p++) l = f[p], Object.prototype.hasOwnProperty.call(o, l) && o[l] && s.push(o[l][0]), o[l] = 0;
for (n in i) Object.prototype.hasOwnProperty.call(i, n) && (e[n] = i[n]);
for (c && c(r); s.length;) s.shift()();
return u.push.apply(u, a || []), t()
}
function t() {
for (var e, r = 0; r < u.length; r++) {
for (var t = u[r], n = !0, f = 1; f < t.length; f++) {
var i = t[f];
0 !== o[i] && (n = !1)
}
n && (u.splice(r--, 1), e = l(l.s = t[0]))
}
return e
}
var n = {},
o = {
1: 0
},
u = [];
function l(r) {
if (n[r]) return n[r].exports;
var t = n[r] = {
i: r,
l: !1,
exports: {}
};
return e[r].call(t.exports, t, t.exports, l), t.l = !0, t.exports
}
l.m = e, l.c = n, l.d = function(e, r, t) {
l.o(e, r) || Object.defineProperty(e, r, {
enumerable: !0,
get: t
})
}, l.r = function(e) {
"undefined" != typeof Symbol && Symbol.toStringTag && Object.defineProperty(e, Symbol.toStringTag, {
value: "Module"
}), Object.defineProperty(e, "__esModule", {
value: !0
})
}, l.t = function(e, r) {
if (1 & r && (e = l(e)), 8 & r) return e;
if (4 & r && "object" == typeof e && e && e.__esModule) return e;
var t = Object.create(null);
if (l.r(t), Object.defineProperty(t, "default", {
enumerable: !0,
value: e
}), 2 & r && "string" != typeof e)
for (var n in e) l.d(t, n, function(r) {
return e[r]
}.bind(null, n));
return t
}, l.n = function(e) {
var r = e && e.__esModule ? function() {
return e.default
} : function() {
return e
};
return l.d(r, "a", r), r
}, l.o = function(e, r) {
return Object.prototype.hasOwnProperty.call(e, r)
}, l.p = "/web/";
var f = this.webpackJsonpcode = this.webpackJsonpcode || [],
i = f.push.bind(f);
f.push = r, f = f.slice();
for (var a = 0; a < f.length; a++) r(f[a]);
var c = i;
t()
}([])
</script>
<script src="/web/static/js/2.159a3d73.chunk.js"></script>
<script src="/web/static/js/main.1ab08410.chunk.js"></script>
</body>
</html>
知道我该如何克服这个问题吗?
谢谢大家
这可能不是爬虫保护。相反,该站点可能正在使用某种 Web 框架,在 JS 运行之后才加载可见数据和 DOM 元素。解决这个问题最简单的方法是使用像 puppeteer 这样的库,它会加载站点并像真实浏览器一样处理它。下面是一个您可能需要的基本示例:
const puppeteer = require('puppeteer');

// Open the page in a browser driven by puppeteer, then read data from the
// loaded page in one of three ways.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('[the full URL you want to scrape]');
    // once the page has loaded, you can find data in a few ways:
    // 1: querying
    const elements = await page.$$("[any JS selector]")
    // 2: evaluate
    const elements1 = await page.evaluate(() => {
      // run any code on the site and have its result returned to you
    });
    // 3: text
    const wholePage = await page.evaluate(() => document.querySelector("*").outerHTML);
    // this gives you the HTML markup of the whole page (outerHTML of the
    // first element matched by "*"), which you can then put in to cheerio
    // or any parser and use how you were using before
  } finally {
    // close the browser even when a step above throws, so the browser
    // process is not left running
    await browser.close();
  }
})().catch((error) => {
  // surface failures instead of leaving the promise rejection unhandled
  console.error(error);
  process.exitCode = 1;
});
您可以阅读更多关于 puppeteer 的总体介绍,以及上述方法 1、方法 2 和方法 3 的相关文档。
大家好,
我正在摆弄 Node.js 和 cheerio 包,作为我学习 Node.js 的一部分。我正在尝试构建一个网络抓取器,用来从购物网站抓取数据,但是当我尝试 console.log html 变量时,它返回的是一个基本的 HTML 结构,其中包含一些试图防止抓取的 JS 函数。
我的代码:
// Fetch a product page with needle, print the raw response body, and
// re-serve that same body on port 3000 so it can be opened in a browser.
const needle = require('needle')
const http = require('http')
const cheerio = require("cheerio")

needle.get('https://ksp.co.il/web/item/130984', (error, response, html) => {
  // Report failures instead of finishing silently with no output.
  if (error) {
    console.error('Request failed:', error)
    return
  }
  if (response.statusCode !== 200) {
    console.error(`Unexpected status code: ${response.statusCode}`)
    return
  }

  // Parsed document handle for cheerio queries; not queried yet in this script.
  const $ = cheerio.load(html)
  console.log(html)

  // Serve the fetched markup unchanged to every incoming request.
  http.createServer((req, res) => {
    res.writeHead(200, {'Content-Type': 'text/html'})
    res.write(html)
    res.end()
  }).listen(3000)
})
我猜这是某种保护层,但这是我得到的结果:
<html lang="he">
<head>
<meta charset="utf-8" />
<link rel="icon" id="header-icon" href="/web/favicon.ico">
<link rel="canonical" id="header-canonical">
<meta name="viewport" content="width=device-width,initial-scale=1" />
<meta name="description" content="מעל 38,000 מוצרים: מחשבים סלולר, בשמים, למטבח, למשרד טיפוח, פארם, צעצועים, נעלים ומיזוג" />
<link rel="manifest" href="/web/manifest.json" />
<script src='https://ksp.co.il/_cache/dictionary_site_only/all.js?ts=1640275687'></script>
<script src="/web/encoding.min.js"></script>
<script>
! function() {
if ("function" == typeof window.CustomEvent) return !1;
function t(t, e) {
e = e || {
bubbles: !1,
cancelable: !1,
detail: void 0
};
var n = document.createEvent("CustomEvent");
return n.initCustomEvent(t, e.bubbles, e.cancelable, e.detail), n
}
t.prototype = window.Event.prototype, window.CustomEvent = t
}()
</script>
<script src='https://ksp.co.il/_cache/menu_dev/menu.js?ts=1640280140'></script>
<style>
.lang_he svg.revert {
transform: scaleX(-1)
}
* {
box-sizing: border-box
}
a:visited {
color: inherit
}
</style>
<title>KSP</title>
<link rel="preload" href="/web/fonts/Assistant/w300en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w400en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w600en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w700en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w800en.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w300he.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w400he.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w600he.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w700he.woff2" as="font" crossorigin="anonymous">
<link rel="preload" href="/web/fonts/Assistant/w800he.woff2" as="font" crossorigin="anonymous">
<link rel="stylesheet" href="/web/fonts/Assistant/index.css">
<script>
! function(e, t, a, n, g) {
e[n] = e[n] || [], e[n].push({
"gtm.start": (new Date).getTime(),
event: "gtm.js"
});
var m = t.getElementsByTagName(a)[0],
r = t.createElement(a);
r.async = !0, r.src = "https://www.googletagmanager.com/gtm.js?id=GTM-59D9ZCV", m.parentNode.insertBefore(r, m)
}(window, document, "script", "dataLayer")
</script>
<link href="/web/static/css/2.f825bd5a.chunk.css" rel="stylesheet">
<link href="/web/static/css/main.4e4460ee.chunk.css" rel="stylesheet">
</head>
<body style="overflow-x:hidden"><noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-59D9ZCV" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript><noscript>You need to enable JavaScript to run this app.</noscript>
<div>
<div id="root"></div>
</div>
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-109261-1"></script>
<script async src="https://www.googletagmanager.com/gtag/js?id=AW-1032006858"></script>
<script>
function gtag() {
dataLayer.push(arguments)
}
window.dataLayer = window.dataLayer || [], gtag("js", new Date)
</script>
<script>
! function(e, t, n, c, o, a, f) {
e.fbq || (o = e.fbq = function() {
o.callMethod ? o.callMethod.apply(o, arguments) : o.queue.push(arguments)
}, e._fbq || (e._fbq = o), o.push = o, o.loaded = !0, o.version = "2.0", o.queue = [], (a = t.createElement(n)).async = !0, a.src = "https://connect.facebook.net/en_US/fbevents.js", (f = t.getElementsByTagName(n)[0]).parentNode.insertBefore(a, f))
}(window, document, "script"), fbq("init", "1179615532183839"), fbq("track", "PageView")
</script><noscript><img height="1" width="1" style="display:none" src="https://www.facebook.com/tr?id=1179615532183839&ev=PageView&noscript=1" /></noscript>
<script>
! function(e) {
function r(r) {
for (var n, l, f = r[0], i = r[1], a = r[2], p = 0, s = []; p < f.length; p++) l = f[p], Object.prototype.hasOwnProperty.call(o, l) && o[l] && s.push(o[l][0]), o[l] = 0;
for (n in i) Object.prototype.hasOwnProperty.call(i, n) && (e[n] = i[n]);
for (c && c(r); s.length;) s.shift()();
return u.push.apply(u, a || []), t()
}
function t() {
for (var e, r = 0; r < u.length; r++) {
for (var t = u[r], n = !0, f = 1; f < t.length; f++) {
var i = t[f];
0 !== o[i] && (n = !1)
}
n && (u.splice(r--, 1), e = l(l.s = t[0]))
}
return e
}
var n = {},
o = {
1: 0
},
u = [];
function l(r) {
if (n[r]) return n[r].exports;
var t = n[r] = {
i: r,
l: !1,
exports: {}
};
return e[r].call(t.exports, t, t.exports, l), t.l = !0, t.exports
}
l.m = e, l.c = n, l.d = function(e, r, t) {
l.o(e, r) || Object.defineProperty(e, r, {
enumerable: !0,
get: t
})
}, l.r = function(e) {
"undefined" != typeof Symbol && Symbol.toStringTag && Object.defineProperty(e, Symbol.toStringTag, {
value: "Module"
}), Object.defineProperty(e, "__esModule", {
value: !0
})
}, l.t = function(e, r) {
if (1 & r && (e = l(e)), 8 & r) return e;
if (4 & r && "object" == typeof e && e && e.__esModule) return e;
var t = Object.create(null);
if (l.r(t), Object.defineProperty(t, "default", {
enumerable: !0,
value: e
}), 2 & r && "string" != typeof e)
for (var n in e) l.d(t, n, function(r) {
return e[r]
}.bind(null, n));
return t
}, l.n = function(e) {
var r = e && e.__esModule ? function() {
return e.default
} : function() {
return e
};
return l.d(r, "a", r), r
}, l.o = function(e, r) {
return Object.prototype.hasOwnProperty.call(e, r)
}, l.p = "/web/";
var f = this.webpackJsonpcode = this.webpackJsonpcode || [],
i = f.push.bind(f);
f.push = r, f = f.slice();
for (var a = 0; a < f.length; a++) r(f[a]);
var c = i;
t()
}([])
</script>
<script src="/web/static/js/2.159a3d73.chunk.js"></script>
<script src="/web/static/js/main.1ab08410.chunk.js"></script>
</body>
</html>
知道我该如何克服这个问题吗? 谢谢大家
这可能不是爬虫保护。相反,该站点可能正在使用某种 Web 框架,在 JS 运行之后才加载可见数据和 DOM 元素。解决这个问题最简单的方法是使用像 puppeteer 这样的库,它会加载站点并像真实浏览器一样处理它。下面是一个您可能需要的基本示例:
const puppeteer = require('puppeteer');

// Open the page in a browser driven by puppeteer, then read data from the
// loaded page in one of three ways.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('[the full URL you want to scrape]');
    // once the page has loaded, you can find data in a few ways:
    // 1: querying
    const elements = await page.$$("[any JS selector]")
    // 2: evaluate
    const elements1 = await page.evaluate(() => {
      // run any code on the site and have its result returned to you
    });
    // 3: text
    const wholePage = await page.evaluate(() => document.querySelector("*").outerHTML);
    // this gives you the HTML markup of the whole page (outerHTML of the
    // first element matched by "*"), which you can then put in to cheerio
    // or any parser and use how you were using before
  } finally {
    // close the browser even when a step above throws, so the browser
    // process is not left running
    await browser.close();
  }
})().catch((error) => {
  // surface failures instead of leaving the promise rejection unhandled
  console.error(error);
  process.exitCode = 1;
});
您可以阅读更多关于 puppeteer 的总体介绍,以及上述方法 1、方法 2 和方法 3 的相关文档。