如何过滤网页抓取结果,使其不包含 \n 或制表符
How do I filter my web scraping results so that they contain no \n or tab characters?
我正在从一个招聘网站上抓取数据。我的抓取程序返回了我想要的结果,但结果中充满了 \n、\t 和大量空格。如何过滤结果,只保留内容本身?
这是我的代码:
// Launches a headless browser through puppeteer, opens the job-search result
// page(s) and, for listings 1 through 30 on each page, reads ten text fields
// into parallel arrays (one array per field).
// NOTE(review): the closing brace of start() is not part of this snippet, and
// the collected arrays are not returned or written anywhere in the lines shown.
async function start() {
const browser = await puppeteer.launch({
headless: true,
defaultViewport: null,
args: ['--ignore-certificate-errors']
});
// One array per field; element 0 is a label string (presumably a column
// header for a later export -- confirm against the code that consumes them).
var name = ["Job Name"];
var country = ["Country"];
var company = ["Company Name"];
var type = ["Job Type"];
var salary = ["Salary"];
var skills = ["Skills Require"];
var desc = ["Job Description"];
var req = ["Job Requirements"];
var resp = ["Job Responsibility"];
var industry = ["Industry"];
// j is the page number appended to the search URL; with these bounds only
// page 1 is visited.
for (var j = 1; j < 2; j++) {
const page = await browser.newPage();
// NOTE(review): per the Puppeteer docs a value of 0 disables the default
// navigation timeout; the goto() call below also passes its own timeout.
page.setDefaultNavigationTimeout(0);
await page.goto('https://startupjobs.asia/job/search?q=&job-list-dpl-page=' + j, {
waitUntil: "networkidle2",
timeout: 3000000
});
console.log('browsing page ' + j);
// i is the 1-based position of the listing's <li> in the XPath below.
for (var i = 1; i < 31; i++) {
// Wait for the i-th listing's title link and click it (presumably this
// loads that job's detail panel, which the absolute XPaths below read).
await page.waitForXPath("/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/h5/a")
var b = await page.$x("/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/h5/a")
await b[0].click();
// XPath plus result key for every field to read. job_name and company are
// taken from the i-th list item; the remaining paths do not depend on i.
const elementsToFind = [
{ xpath: "/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/h5/a", propName: 'job_name' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[1]/div[2]/div/h6[2]/a', propName: 'country' },
{ xpath: "/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/p[1]/a", propName: 'company' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[3]/p', propName: 'job_type' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[1]/p', propName: 'salary' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[4]/p', propName: 'skills' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[1]/div', propName: 'job_description' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[3]/div', propName: 'job_requirement' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[2]/div', propName: 'job_responsibility' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[2]/p', propName: 'industry' },
// ...
];
/*const elementsToFind2 = [
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/p/a', propName2: 'website' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[3]/div[2]/div[1]/div', propName2: 'about' },
// ...
];*/
// Maps each propName to the text read for the current listing.
var results = {};
for (var { xpath, propName } of elementsToFind) {
await page.waitForXPath(xpath);
// Only the first node matched by the XPath is used.
var [el] = await page.$x(xpath);
// The raw textContent is stored unmodified, so any newlines, tabs and
// padding inside the element end up in the result; 'Not Found' is stored
// when $x returned no node.
results[propName] = !el ? 'Not Found' : await (await el.getProperty('textContent')).jsonValue();
}
// Append this listing's values to the per-field arrays.
name.push(results['job_name']);
country.push(results['country']);
company.push(results['company']);
type.push(results['job_type']);
salary.push(results['salary']);
skills.push(results['skills']);
desc.push(results['job_description']);
req.push(results['job_requirement']);
resp.push(results['job_responsibility']);
industry.push(results['industry']);
//await page.evaluate(() => document.querySelector("#suj-single-jobdetail-wrapper > div.detail-body > div.row > div.col.s12.tabs-wrapper.suj-company-review-tabs-wrapper > ul > li:nth-child(2) > a").click())
}
// Close the tab once all listings on this page have been read.
await page.close();
}
await browser.close();
我得到的结果是这样的:
'\n' +
' \n' +
' Part-Time \n' +
' | \n' +
' \n' +
' Temporary \n' +
' | \n' +
' \n' +
' Contract \n' +
'
我希望结果只包含 Part-Time、Temporary 和 Contract。
您可以使用正则表达式 \s+ 匹配字符串中的所有空白字符(空格、制表符和换行符),并用 replace 将其删除。然后,您可以用 split 把结果拆分成数组,或者用 replace 把 | 字符替换为逗号:
// Sample of the scraped text: the three values are buried in newlines and padding.
const rawPieces = ['\n', ' \n', ' Part-Time \n', ' | \n', ' \n', ' Temporary \n', ' | \n', ' \n', ' Contract \n'];
let s = rawPieces.join('');
// Drop every run of whitespace (spaces, tabs, newlines), leaving only the
// values and the '|' separators between them.
const whitespaceRun = /\s+/g;
s = s.replace(whitespaceRun, '');
// Three ways to present the cleaned values.
const valueList = s.split('|');
console.log(valueList);
const pipePattern = /\|/g;
console.log(s.replace(pipePattern, ', ')); // regex form, works on older runtimes
console.log(s.replaceAll('|', ', ')); // relies on String.prototype.replaceAll
我正在从一个招聘网站上抓取数据。我的抓取程序返回了我想要的结果,但结果中充满了 \n、\t 和大量空格。如何过滤结果,只保留内容本身?
这是我的代码:
// Launches a headless browser through puppeteer, opens the job-search result
// page(s) and, for listings 1 through 30 on each page, reads ten text fields
// into parallel arrays (one array per field).
// NOTE(review): the closing brace of start() is not part of this snippet, and
// the collected arrays are not returned or written anywhere in the lines shown.
async function start() {
const browser = await puppeteer.launch({
headless: true,
defaultViewport: null,
args: ['--ignore-certificate-errors']
});
// One array per field; element 0 is a label string (presumably a column
// header for a later export -- confirm against the code that consumes them).
var name = ["Job Name"];
var country = ["Country"];
var company = ["Company Name"];
var type = ["Job Type"];
var salary = ["Salary"];
var skills = ["Skills Require"];
var desc = ["Job Description"];
var req = ["Job Requirements"];
var resp = ["Job Responsibility"];
var industry = ["Industry"];
// j is the page number appended to the search URL; with these bounds only
// page 1 is visited.
for (var j = 1; j < 2; j++) {
const page = await browser.newPage();
// NOTE(review): per the Puppeteer docs a value of 0 disables the default
// navigation timeout; the goto() call below also passes its own timeout.
page.setDefaultNavigationTimeout(0);
await page.goto('https://startupjobs.asia/job/search?q=&job-list-dpl-page=' + j, {
waitUntil: "networkidle2",
timeout: 3000000
});
console.log('browsing page ' + j);
// i is the 1-based position of the listing's <li> in the XPath below.
for (var i = 1; i < 31; i++) {
// Wait for the i-th listing's title link and click it (presumably this
// loads that job's detail panel, which the absolute XPaths below read).
await page.waitForXPath("/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/h5/a")
var b = await page.$x("/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/h5/a")
await b[0].click();
// XPath plus result key for every field to read. job_name and company are
// taken from the i-th list item; the remaining paths do not depend on i.
const elementsToFind = [
{ xpath: "/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/h5/a", propName: 'job_name' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[1]/div[2]/div/h6[2]/a', propName: 'country' },
{ xpath: "/html/body/div[1]/div[3]/div[1]/div/div[1]/ul/li[" + i + "]/div/div[1]/div/p[1]/a", propName: 'company' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[3]/p', propName: 'job_type' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[1]/p', propName: 'salary' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[4]/p', propName: 'skills' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[1]/div', propName: 'job_description' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[3]/div', propName: 'job_requirement' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/div[2]/div', propName: 'job_responsibility' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]/div[2]/p', propName: 'industry' },
// ...
];
/*const elementsToFind2 = [
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/p/a', propName2: 'website' },
{ xpath: '/html/body/div[1]/div[3]/div[2]/div[2]/div[1]/div[3]/div[2]/div[1]/div', propName2: 'about' },
// ...
];*/
// Maps each propName to the text read for the current listing.
var results = {};
for (var { xpath, propName } of elementsToFind) {
await page.waitForXPath(xpath);
// Only the first node matched by the XPath is used.
var [el] = await page.$x(xpath);
// The raw textContent is stored unmodified, so any newlines, tabs and
// padding inside the element end up in the result; 'Not Found' is stored
// when $x returned no node.
results[propName] = !el ? 'Not Found' : await (await el.getProperty('textContent')).jsonValue();
}
// Append this listing's values to the per-field arrays.
name.push(results['job_name']);
country.push(results['country']);
company.push(results['company']);
type.push(results['job_type']);
salary.push(results['salary']);
skills.push(results['skills']);
desc.push(results['job_description']);
req.push(results['job_requirement']);
resp.push(results['job_responsibility']);
industry.push(results['industry']);
//await page.evaluate(() => document.querySelector("#suj-single-jobdetail-wrapper > div.detail-body > div.row > div.col.s12.tabs-wrapper.suj-company-review-tabs-wrapper > ul > li:nth-child(2) > a").click())
}
// Close the tab once all listings on this page have been read.
await page.close();
}
await browser.close();
我得到的结果是这样的:
'\n' + ' \n' + ' Part-Time \n' + ' | \n' + ' \n' + ' Temporary \n' + ' | \n' + ' \n' + ' Contract \n' + '
我希望结果只包含 Part-Time、Temporary 和 Contract。
您可以使用正则表达式 \s+ 匹配字符串中的所有空白字符(空格、制表符和换行符),并用 replace 将其删除。然后,您可以用 split 把结果拆分成数组,或者用 replace 把 | 字符替换为逗号:
// Sample of the scraped text: the three values are buried in newlines and padding.
const rawPieces = ['\n', ' \n', ' Part-Time \n', ' | \n', ' \n', ' Temporary \n', ' | \n', ' \n', ' Contract \n'];
let s = rawPieces.join('');
// Drop every run of whitespace (spaces, tabs, newlines), leaving only the
// values and the '|' separators between them.
const whitespaceRun = /\s+/g;
s = s.replace(whitespaceRun, '');
// Three ways to present the cleaned values.
const valueList = s.split('|');
console.log(valueList);
const pipePattern = /\|/g;
console.log(s.replace(pipePattern, ', ')); // regex form, works on older runtimes
console.log(s.replaceAll('|', ', ')); // relies on String.prototype.replaceAll