遍历一个复杂的 DOM 并抓取值
Traversing a complex DOM and scraping values
考虑 DOM 中的以下结构:
<div class="bodyCells">
<div style="foo">
<div style="foo">
<div style="foo">
<div style="foo1"> '1-contains the list of text elements I want to scrape'</div>
<div style="foo2"> '2-contains the list of text elements I want to scrape'</div>
</div>
<div style="foo">
<div style="foo3"> '3-contains the list of text elements I want to scrape'</div>
<div style="foo4"> '4-contains the list of text elements I want to scrape'</div>
</div>
</div>
</div>
</div>
通过 class 名称 bodyCells,我需要每次从一个 div 中抓取数据(即先从第一个 div 开始,然后是下一个 div,依此类推),并将结果分别存储在单独的数组中。我该如何做到这一点?(使用 Puppeteer)
注意:我曾尝试直接使用 class 名称来实现,但这样得到的是包含所有文本的单个数组。我需要分别从每个标签中获取数据,并将其存储在不同的数组中。
预期输出:
array1=["text present within style="foo1" div tag"]
array2=["text present within style="foo2" div tag"]
array3=["text present within style="foo3" div tag"]
array4=["text present within style="foo4" div tag"]
这是我目前所做的:
// Asker's attempt: gathers the textContent of every element whose class is
// "bodyCells". In the markup shown above only the outermost wrapper has that
// class, so the result holds one string with the text of ALL nested divs
// instead of one entry per innermost div.
var value=[];
value = await page1.evaluate(() =>{
// NOTE(review): presumably a no-op, since the callback runs inside the page
// where window.document should already exist.
if (!window.document){window.document = {};}
var textitems=[]
var extracted_items=[]
textitems = document.getElementsByClassName("bodyCells");
for (var i = 0; i < textitems.length; i++) {
// NOTE: `item` is assigned without a declaration, so in non-strict code it
// becomes an implicit global.
item=textitems[i].textContent
extracted_items.push(item);
}
return extracted_items;
});
不确定这是否是您需要的...
// Self-contained test page reproducing the nested <div> structure from the
// question, so the selector can be tried without access to the real site.
const html = `
<!doctype html>
<html>
<head><meta charset="UTF-8"><title>Test</title></head>
<body>
<div class="bodyCells">
<div style="foo">
<div style="foo">
<div style="foo">
<div style="foo1"> '1-contains the list of text elements I want to scrape'</div>
<div style="foo2"> '2-contains the list of text elements I want to scrape'</div>
</div>
<div style="foo">
<div style="foo3"> '3-contains the list of text elements I want to scrape'</div>
<div style="foo4"> '4-contains the list of text elements I want to scrape'</div>
</div>
</div>
</div>
</div>
</body>
</html>`;
const puppeteer = require('puppeteer');
/**
 * Loads the test markup into a browser page, extracts the text of each
 * innermost <div> under `.bodyCells` into its own single-element array,
 * and logs the resulting array of arrays.
 *
 * Any error (launch, page load, evaluation, or shutdown) is logged rather
 * than rethrown.
 */
(async function main() {
  try {
    const browser = await puppeteer.launch();
    try {
      const [page] = await browser.pages();

      // setContent injects the markup directly; interpolating raw HTML into a
      // data: URL breaks as soon as the markup contains `#` or `%`.
      await page.setContent(html);

      // The child combinators walk exactly four levels below .bodyCells, so
      // only the innermost divs match; each one becomes its own array.
      const data = await page.evaluate(() => Array.from(
        document.querySelectorAll('div.bodyCells > div > div > div > div'),
        div => [div.innerText],
      ));

      console.log(data);
    } finally {
      // Always shut the browser down, even when scraping fails; otherwise the
      // spawned browser process is left running.
      await browser.close();
    }
  } catch (err) {
    console.error(err);
  }
})();
输出:
[
[ "'1-contains the list of text elements I want to scrape'" ],
[ "'2-contains the list of text elements I want to scrape'" ],
[ "'3-contains the list of text elements I want to scrape'" ],
[ "'4-contains the list of text elements I want to scrape'" ]
]
考虑 DOM 中的以下结构:
<div class="bodyCells">
<div style="foo">
<div style="foo">
<div style="foo">
<div style="foo1"> '1-contains the list of text elements I want to scrape'</div>
<div style="foo2"> '2-contains the list of text elements I want to scrape'</div>
</div>
<div style="foo">
<div style="foo3"> '3-contains the list of text elements I want to scrape'</div>
<div style="foo4"> '4-contains the list of text elements I want to scrape'</div>
</div>
</div>
</div>
</div>
通过 class 名称 bodyCells,我需要每次从一个 div 中抓取数据(即先从第一个 div 开始,然后是下一个 div,依此类推),并将结果分别存储在单独的数组中。我该如何做到这一点?(使用 Puppeteer)
注意:我曾尝试直接使用 class 名称来实现,但这样得到的是包含所有文本的单个数组。我需要分别从每个标签中获取数据,并将其存储在不同的数组中。
预期输出:
array1=["text present within style="foo1" div tag"]
array2=["text present within style="foo2" div tag"]
array3=["text present within style="foo3" div tag"]
array4=["text present within style="foo4" div tag"]
这是我目前所做的:
// Asker's attempt: gathers the textContent of every element whose class is
// "bodyCells". In the markup shown above only the outermost wrapper has that
// class, so the result holds one string with the text of ALL nested divs
// instead of one entry per innermost div.
var value=[];
value = await page1.evaluate(() =>{
// NOTE(review): presumably a no-op, since the callback runs inside the page
// where window.document should already exist.
if (!window.document){window.document = {};}
var textitems=[]
var extracted_items=[]
textitems = document.getElementsByClassName("bodyCells");
for (var i = 0; i < textitems.length; i++) {
// NOTE: `item` is assigned without a declaration, so in non-strict code it
// becomes an implicit global.
item=textitems[i].textContent
extracted_items.push(item);
}
return extracted_items;
});
不确定这是否是您需要的...
// Self-contained test page reproducing the nested <div> structure from the
// question, so the selector can be tried without access to the real site.
const html = `
<!doctype html>
<html>
<head><meta charset="UTF-8"><title>Test</title></head>
<body>
<div class="bodyCells">
<div style="foo">
<div style="foo">
<div style="foo">
<div style="foo1"> '1-contains the list of text elements I want to scrape'</div>
<div style="foo2"> '2-contains the list of text elements I want to scrape'</div>
</div>
<div style="foo">
<div style="foo3"> '3-contains the list of text elements I want to scrape'</div>
<div style="foo4"> '4-contains the list of text elements I want to scrape'</div>
</div>
</div>
</div>
</div>
</body>
</html>`;
const puppeteer = require('puppeteer');
/**
 * Loads the test markup into a browser page, extracts the text of each
 * innermost <div> under `.bodyCells` into its own single-element array,
 * and logs the resulting array of arrays.
 *
 * Any error (launch, page load, evaluation, or shutdown) is logged rather
 * than rethrown.
 */
(async function main() {
  try {
    const browser = await puppeteer.launch();
    try {
      const [page] = await browser.pages();

      // setContent injects the markup directly; interpolating raw HTML into a
      // data: URL breaks as soon as the markup contains `#` or `%`.
      await page.setContent(html);

      // The child combinators walk exactly four levels below .bodyCells, so
      // only the innermost divs match; each one becomes its own array.
      const data = await page.evaluate(() => Array.from(
        document.querySelectorAll('div.bodyCells > div > div > div > div'),
        div => [div.innerText],
      ));

      console.log(data);
    } finally {
      // Always shut the browser down, even when scraping fails; otherwise the
      // spawned browser process is left running.
      await browser.close();
    }
  } catch (err) {
    console.error(err);
  }
})();
输出:
[
[ "'1-contains the list of text elements I want to scrape'" ],
[ "'2-contains the list of text elements I want to scrape'" ],
[ "'3-contains the list of text elements I want to scrape'" ],
[ "'4-contains the list of text elements I want to scrape'" ]
]