使用 cheerio 和 node 解析嵌套的多个 <ul><li> 标记项
Parse nested multiple <ul><li> tagged items with cheerio and node
我正在尝试抓取一个网页,并将其中的部分内容转换为 JSON,以便之后可以通过 WordPress 读取。但是,我似乎无法正确地遍历列表项。
例如,我想遍历这个:
<div class="acalog-core">
<h4><a name="GEC01WrittenCommunication6Hours"></a><a name="gec01writtencommunication6hours" id="core_95236"></a>GEC 01. Written Communication (6 hours)</h4>
<hr>
<ul>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87026',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95236~;}'); return false;">ENG 101 - Composition One</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87027',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95236~;}'); return false;">ENG 102 - Composition Two</a> 3 hrs.</span></li>
</ul>
</div>
<div class="acalog-core">
<h4><a name="GEC02NaturalScience810HoursMinimum"></a><a name="gec02naturalscience810hoursminimum" id="core_95238"></a>GEC 02. Natural Science (8-10 hours minimum)</h4>
<hr>
<p>Select 2 courses with labs:</p>
<ul>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86280',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 111 - General Astronomy I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86281',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 111L - General Astronomy I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86282',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 112 - General Astronomy II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86283',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 112L - General Astronomy II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86284',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 103 - Biology and Society</a> 3 hrs. ◊ AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86285',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 103L - Biology and Society Laboratory</a> 1 hr ◊</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86290',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 110 - Principles of Biological Science I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86291',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 110L - Principles of Biological Science I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86292',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 111 - Principles of Biological Science II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86293',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 111L - Principles of Biological Science II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86299',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 250 - Human Anatomy and Physiology I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86300',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 250L - Human Anatomy and Physiology I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86301',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 251 - Human Anatomy and Physiology II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86302',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 251L - Human Anatomy and Physiology II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86484',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 104 - Chemistry and Our Environment</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86485',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 104L - Chemistry and Our Environment Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86486',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 106 - General Chemistry I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86487',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 106L - General Chemistry I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86488',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 107 - General Chemistry II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86489',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 107L - General Chemistry II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87208',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 104 - Weather and Climate</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87719',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 104L - Weather and Climate Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87209',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 105 - Land and Water</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87720',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 105L - Land and Water Lab</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87237',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 101 - Physical Geology</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87749',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 101L - Physical Geology Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87750',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 103 - Historical Geology</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87238',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 103L - Historical Geology Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87507',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">MAR 151 - Introduction to Ocean Science</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87508',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">MAR 151L - Introduction to Ocean Science Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88573',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 103 - Introductory Physics</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88574',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 103L - Introductory Physics Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88575',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 111 - General Physics I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88576',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 111L - General Physics I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88577',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 112 - General Physics II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88578',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 112L - General Physics II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88580',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 201 - General Physics I with Calculus</a> 4 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88581',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 201L - General Physics I with Calculus Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88582',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 202 - General Physics II with Calculus</a> 4 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88583',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 202L - General Physics II with Calculus Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88661',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PSC 190 - Living in a Material World</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88662',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PSC 190L - Lab for Living in a Material World</a> 1 hr.</span></li>
</ul>
</div>
<div class="acalog-core">
<h4><a name="GEC03Humanities9Hours"></a><a name="gec03humanities9hours" id="core_95240"></a>GEC 03. Humanities (9 hours)</h4>
<hr>
<ul>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87537',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95240~;}'); return false;">ENG 203 - World Literature</a> 3 hrs.</span></li>
</ul>
</div>
<div style="padding-left: 20px;">
<div class="acalog-core">
<h5><a name="Select2Courses1HistoryRequired"></a><a name="select2courses1historyrequired" id="core_95241"></a>Select 2 courses, 1 History required:</h5>
<hr>
<ul>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87272',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">HIS 101 - World Civilizations: Beginnings to 1500 C.E.</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87273',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">HIS 102 - World Civilizations: 1500 to the present</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88541',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">PHI 151 - Introduction to Philosophy</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88542',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">PHI 171 - Ethics and Good Living</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88756',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">REL 131 - Comparative Religion</a> 3 hrs.</span></li>
</ul>
</div>
请注意,这段 HTML 看起来非常凌乱,而且确实如此。但我必须解析每个列表,使用 <h4> 标题作为标题,然后把列表项分别放在各自所属的 <h4> 之下,以便 JSON 文件如下所示:
[
title: 'GEC 01',
courseid: [
'ENG 101',
'ENG 102',
... etc
],
coursetitle: [
'Composition One 3 hrs.',
'Composition Two 3 hrs.',
... etc
],
labid: [
//follows same format as above, if no lab this is empty
],
labtitle: [
//follows same format as above, if no lab this is empty
]
我目前是把对象逐个推入一个数组,待全部解析完成后,再把该数组输出为 JSON 文件。
我目前的循环结构如下:
// For every <h4> heading, build one `courses` record titled with the heading
// text, fill it from <li> elements, and push it onto `parsedResults`.
// `$`, `courses` and `parsedResults` are defined outside this snippet
// (presumably `$` is the cheerio root for the loaded page -- confirm).
$('h4').each(function(i, elem){ // -- Visit each <h4> element
let data = new courses('');
if($(this).text() !== '') { // -- Only overwrite the title when the <h4> has text
data.title = $(this).text(); // Set title to <h4> text
}
// NOTE(review): this selector is NOT scoped to the current <h4>. `$('li')`
// queries from the root, so every heading's record is filled from every <li>
// on the page -- this is the behavior the question is asking about.
$('li').each(function (j, ele) { // -- Visit each <li> element
// NOTE(review): `input`, `courseid` and `coursename` are assigned without a
// let/const/var declaration anywhere in this snippet.
input = $(ele).find('span').text().split(" - "); // -- Split the <span> text on " - " into input[0] and input[1]
courseid = input[0]; // -- e.g. 'ENG 101'
coursename = input[1]; // -- e.g. 'Composition One 3 hrs.'; undefined when the text has no " - "
// -- Skip items without a usable id/name pair (an <li> with no <span> yields
// -- empty text, so `coursename` is undefined and the item is ignored) --
if (coursename !== '' && courseid !== '' && coursename !== undefined && coursename !== null) {
// -- Lab branch: taken when the id contains "L" at index 7 or later
if (courseid.indexOf("L", 7) !== -1) { // 7 is the start offset of the search, e.g. the "L" in 'AST 111L'
data.addli(courseid); // Record the lab id
//=========================Check for And || or to remove=========================
// NOTE(review): indexOf("or") also matches "or" inside words (e.g. "Laboratory"),
// and replace() with a string pattern removes only the first occurrence.
if (coursename.indexOf("AND") !== -1 || coursename.indexOf("or") !== -1) {
if (coursename.indexOf("AND") !== -1) { // Remove the first ' AND'
data.addln(coursename.replace(' AND', ''));
}
else { // Otherwise remove the first 'or' substring
data.addln(coursename.replace('or', ''));
}
}
else { // -- Neither 'AND' nor 'or' present: push the lab name unchanged
data.addln(coursename);
}
//=========================END of check==========================================
}
// -- Non-lab branch: record the item as a regular course
else {
data.addci(courseid);
//=========================Check for And || or to remove=========================
// Same 'AND' / 'or' stripping as the lab branch, with the same caveats.
if (coursename.indexOf("AND") !== -1 || coursename.indexOf("or") !== -1) {
if (coursename.indexOf("AND") !== -1) { // Remove the first ' AND'
data.addcn(coursename.replace(' AND', ''));
}
else { // Otherwise remove the first 'or' substring
data.addcn(coursename.replace('or', ''));
}
}
else { // -- Neither 'AND' nor 'or' present: push the course name unchanged
data.addcn(coursename);
}
}
}
});
parsedResults.push(data); // One record per <h4>
});
我遇到的问题是:所有 li 项都被一次性读了进来,我似乎无法根据它们所属的 h4 将它们分开。所以我的问题是,有没有办法让嵌套的 each 循环在某一组 li 标签结束时停下来,然后转到下一个 ul 分组,再获取那一组的项目?希望我表达清楚了。
假设您要抓取 URL 为 abc.com 的网站
npm install x-ray
// Sketch of scraping the page with the x-ray package: map output keys to
// selectors, then handle the result in a callback.
var Xray = require('x-ray');
var x = Xray();
// The URL below is a placeholder for the site being scraped.
x('http:// abc .com', {
title: '.acalog-core>h4@text',
courseid : '.acalog-core li',
// NOTE: the next line is a placeholder for further key/selector pairs, not valid JavaScript.
and so on...
})(function(err, obj) {
// Do some light filtering on obj to strip out the HTML.
// Write to JSON; obj contains all the scraped content as key/value pairs.
})
我正在尝试抓取一个网页,并将其中的部分内容转换为 JSON,以便之后可以通过 WordPress 读取。但是,我似乎无法正确地遍历列表项。
例如,我想遍历这个:
<div class="acalog-core">
<h4><a name="GEC01WrittenCommunication6Hours"></a><a name="gec01writtencommunication6hours" id="core_95236"></a>GEC 01. Written Communication (6 hours)</h4>
<hr>
<ul>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87026',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95236~;}'); return false;">ENG 101 - Composition One</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87027',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95236~;}'); return false;">ENG 102 - Composition Two</a> 3 hrs.</span></li>
</ul>
</div>
<div class="acalog-core">
<h4><a name="GEC02NaturalScience810HoursMinimum"></a><a name="gec02naturalscience810hoursminimum" id="core_95238"></a>GEC 02. Natural Science (8-10 hours minimum)</h4>
<hr>
<p>Select 2 courses with labs:</p>
<ul>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86280',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 111 - General Astronomy I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86281',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 111L - General Astronomy I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86282',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 112 - General Astronomy II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86283',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 112L - General Astronomy II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86284',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 103 - Biology and Society</a> 3 hrs. ◊ AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86285',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 103L - Biology and Society Laboratory</a> 1 hr ◊</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86290',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 110 - Principles of Biological Science I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86291',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 110L - Principles of Biological Science I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86292',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 111 - Principles of Biological Science II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86293',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 111L - Principles of Biological Science II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86299',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 250 - Human Anatomy and Physiology I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86300',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 250L - Human Anatomy and Physiology I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86301',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 251 - Human Anatomy and Physiology II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86302',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 251L - Human Anatomy and Physiology II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86484',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 104 - Chemistry and Our Environment</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86485',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 104L - Chemistry and Our Environment Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86486',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 106 - General Chemistry I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86487',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 106L - General Chemistry I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86488',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 107 - General Chemistry II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86489',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 107L - General Chemistry II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87208',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 104 - Weather and Climate</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87719',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 104L - Weather and Climate Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87209',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 105 - Land and Water</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87720',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 105L - Land and Water Lab</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87237',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 101 - Physical Geology</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87749',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 101L - Physical Geology Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87750',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 103 - Historical Geology</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87238',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 103L - Historical Geology Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87507',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">MAR 151 - Introduction to Ocean Science</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87508',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">MAR 151L - Introduction to Ocean Science Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88573',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 103 - Introductory Physics</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88574',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 103L - Introductory Physics Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88575',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 111 - General Physics I</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88576',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 111L - General Physics I Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88577',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 112 - General Physics II</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88578',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 112L - General Physics II Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88580',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 201 - General Physics I with Calculus</a> 4 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88581',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 201L - General Physics I with Calculus Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88582',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 202 - General Physics II with Calculus</a> 4 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88583',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 202L - General Physics II with Calculus Laboratory</a> 1 hr.</span></li>
<li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
<p> </p>
</li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88661',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PSC 190 - Living in a Material World</a> 3 hrs. AND</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88662',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PSC 190L - Lab for Living in a Material World</a> 1 hr.</span></li>
</ul>
</div>
<div class="acalog-core">
<h4><a name="GEC03Humanities9Hours"></a><a name="gec03humanities9hours" id="core_95240"></a>GEC 03. Humanities (9 hours)</h4>
<hr>
<ul>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87537',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95240~;}'); return false;">ENG 203 - World Literature</a> 3 hrs.</span></li>
</ul>
</div>
<div style="padding-left: 20px;">
<div class="acalog-core">
<h5><a name="Select2Courses1HistoryRequired"></a><a name="select2courses1historyrequired" id="core_95241"></a>Select 2 courses, 1 History required:</h5>
<hr>
<ul>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87272',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">HIS 101 - World Civilizations: Beginnings to 1500 C.E.</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87273',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">HIS 102 - World Civilizations: 1500 to the present</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88541',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">PHI 151 - Introduction to Philosophy</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88542',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">PHI 171 - Ethics and Good Living</a> 3 hrs.</span></li>
<li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88756',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">REL 131 - Comparative Religion</a> 3 hrs.</span></li>
</ul>
</div>
请注意,这段 HTML 看起来非常凌乱,而且确实如此。但我必须解析每个列表,使用 <h4> 标题作为标题,然后把列表项分别放在各自所属的 <h4> 之下,以便 JSON 文件如下所示:
[
title: 'GEC 01',
courseid: [
'ENG 101',
'ENG 102',
... etc
],
coursetitle: [
'Composition One 3 hrs.',
'Composition Two 3 hrs.',
... etc
],
labid: [
//follows same format as above, if no lab this is empty
],
labtitle: [
//follows same format as above, if no lab this is empty
]
我目前是把对象逐个推入一个数组,待全部解析完成后,再把该数组输出为 JSON 文件。
我目前的循环结构如下:
// For every <h4> heading, build one `courses` record titled with the heading
// text, fill it from <li> elements, and push it onto `parsedResults`.
// `$`, `courses` and `parsedResults` are defined outside this snippet
// (presumably `$` is the cheerio root for the loaded page -- confirm).
$('h4').each(function(i, elem){ // -- Visit each <h4> element
let data = new courses('');
if($(this).text() !== '') { // -- Only overwrite the title when the <h4> has text
data.title = $(this).text(); // Set title to <h4> text
}
// NOTE(review): this selector is NOT scoped to the current <h4>. `$('li')`
// queries from the root, so every heading's record is filled from every <li>
// on the page -- this is the behavior the question is asking about.
$('li').each(function (j, ele) { // -- Visit each <li> element
// NOTE(review): `input`, `courseid` and `coursename` are assigned without a
// let/const/var declaration anywhere in this snippet.
input = $(ele).find('span').text().split(" - "); // -- Split the <span> text on " - " into input[0] and input[1]
courseid = input[0]; // -- e.g. 'ENG 101'
coursename = input[1]; // -- e.g. 'Composition One 3 hrs.'; undefined when the text has no " - "
// -- Skip items without a usable id/name pair (an <li> with no <span> yields
// -- empty text, so `coursename` is undefined and the item is ignored) --
if (coursename !== '' && courseid !== '' && coursename !== undefined && coursename !== null) {
// -- Lab branch: taken when the id contains "L" at index 7 or later
if (courseid.indexOf("L", 7) !== -1) { // 7 is the start offset of the search, e.g. the "L" in 'AST 111L'
data.addli(courseid); // Record the lab id
//=========================Check for And || or to remove=========================
// NOTE(review): indexOf("or") also matches "or" inside words (e.g. "Laboratory"),
// and replace() with a string pattern removes only the first occurrence.
if (coursename.indexOf("AND") !== -1 || coursename.indexOf("or") !== -1) {
if (coursename.indexOf("AND") !== -1) { // Remove the first ' AND'
data.addln(coursename.replace(' AND', ''));
}
else { // Otherwise remove the first 'or' substring
data.addln(coursename.replace('or', ''));
}
}
else { // -- Neither 'AND' nor 'or' present: push the lab name unchanged
data.addln(coursename);
}
//=========================END of check==========================================
}
// -- Non-lab branch: record the item as a regular course
else {
data.addci(courseid);
//=========================Check for And || or to remove=========================
// Same 'AND' / 'or' stripping as the lab branch, with the same caveats.
if (coursename.indexOf("AND") !== -1 || coursename.indexOf("or") !== -1) {
if (coursename.indexOf("AND") !== -1) { // Remove the first ' AND'
data.addcn(coursename.replace(' AND', ''));
}
else { // Otherwise remove the first 'or' substring
data.addcn(coursename.replace('or', ''));
}
}
else { // -- Neither 'AND' nor 'or' present: push the course name unchanged
data.addcn(coursename);
}
}
}
});
parsedResults.push(data); // One record per <h4>
});
我遇到的问题是:所有 li 项都被一次性读了进来,我似乎无法根据它们所属的 h4 将它们分开。所以我的问题是,有没有办法让嵌套的 each 循环在某一组 li 标签结束时停下来,然后转到下一个 ul 分组,再获取那一组的项目?希望我表达清楚了。
假设您要抓取 URL 为 abc.com 的网站
npm install x-ray
// Sketch of scraping the page with the x-ray package: map output keys to
// selectors, then handle the result in a callback.
var Xray = require('x-ray');
var x = Xray();
// The URL below is a placeholder for the site being scraped.
x('http:// abc .com', {
title: '.acalog-core>h4@text',
courseid : '.acalog-core li',
// NOTE: the next line is a placeholder for further key/selector pairs, not valid JavaScript.
and so on...
})(function(err, obj) {
// Do some light filtering on obj to strip out the HTML.
// Write to JSON; obj contains all the scraped content as key/value pairs.
})