DOM解析器 - children 不是 DOM objects
DOMParser - children are not DOM objects
DOMParser 有一个奇怪的行为。当我使用 "text/xml" 作为参数时,我得到我的 object 并且每次我使用 child(如 parentNodes)时,child 本身就是一个 DOM object。但是,当我使用 "text/html" 作为参数时, children 不是 DOM objects。为什么会这样,我怎样才能让所有 children 都拥有 DOM object?
我是这样做的:
parser = new DOMParser();
doc = parser.parseFromString(stringContainingHTMLSource, "text/html").getElementsByTagName('p');
console.log(doc[0].childNodes[0]);
My childNode returns 元素但不是 DOM object...
编辑:
这是我的递归函数:
var getParents = function(node, parentNodes){
if(node.nodeName == 'span'){
parentNodes.push(node.attributes[0].nodeValue);
} else if(node.nodeName == 'p' && node.attributes.length > 0) {
parentNodes.push(node.nodeName);
parentNodes.push(node.attributes[0].nodeValue);
} else {
parentNodes.push(node.nodeName);
}
if(node.parentNode.nodeName != '#document'){
getParents(node.parentNode, parentNodes);
}
return parentNodes;
};
var parse = function(node, vertical, horizontal, paragraph){
if(node.childNodes.length > 0){
for(var int = 0; int < node.childNodes.length; int++){
parse(node.childNodes[int], vertical, horizontal, paragraph);
}
} else{
var object = {};
var attributes = getParents(node, []);
for(var int = 0; int < attributes.length; int++) {
// right alignment
if(/text-align/i.test(attributes[int])){
object.alignment = attributes[int].split(": ")[1].replace(';','');
} else if (/color/i.test(attributes[int])) {
// color
object.color = attributes[int].split(":")[1];
} else if (attributes[int] == 'em') {
// italic
if (object.italics) {
delete object.bold;
object.bolditalics = true;
} else {
object.italics = true;
}
} else if (attributes[int] == 'strong') {
// bold
if (object.italics) {
delete object.italics;
object.bolditalics = true;
} else {
object.bold = true;
}
} else if (attributes[int] == 'u') {
// underline
object.decoration = 'underline';
} else if (attributes[int] == 's') {
// strike
object.decoration = 'lineThrough';
}
}
object.text = node.textContent;
pdfContent[vertical][horizontal].push(object);
}
};
for(var vertical = 0; vertical < payment.htmlContent.length; vertical++) {
for(var horizontal = 0; horizontal < payment.htmlContent[vertical].length; horizontal++) {
var parser = new DOMParser();
var paragraphs = parser.parseFromString(payment.htmlContent[vertical][horizontal], "text/xml").getElementsByTagName('p');
for (var paragraph = 0; paragraph < paragraphs.length; paragraph++) {
for (var num = 0; num < paragraphs[paragraph].childNodes.length; num++) {
parse(paragraphs[paragraph].childNodes[num], vertical, horizontal, paragraph);
}
}
}
}
我对这些值做了一些假设,在我向您的代码中添加了一些验证(例如 if(node.attributes.length>0)
之后,它似乎有效。
var payment={htmlContent:[['<p>some<em>text</em></p>', '<p>some<span>text<strong>here</strong></span></p>'],['<p>some<s>text</s></p>', '<p>some<span style="color:#FF00FF">text</span></p>']]};
var getParents = function(node, parentNodes){
if(node.nodeName == 'span'){
if(node.attributes.length>0)
parentNodes.push(node.attributes[0].nodeValue);
} else if(node.nodeName == 'p' && node.attributes.length > 0) {
parentNodes.push(node.nodeName);
if(node.attributes.length>0)
parentNodes.push(node.attributes[0].nodeValue);
} else {
parentNodes.push(node.nodeName);
}
if(node.parentNode.nodeName != '#document'){
getParents(node.parentNode, parentNodes);
}
return parentNodes;
};
var parse = function(node, vertical, horizontal, paragraph){
if(node.childNodes.length > 0){
for(var int = 0; int < node.childNodes.length; int++){
parse(node.childNodes[int], vertical, horizontal, paragraph);
}
} else{
var object = {};
var attributes = getParents(node, []);
console.log(attributes);
for(var int = 0; int < attributes.length; int++) {
// right alignment
if(/text-align/i.test(attributes[int])){
object.alignment = attributes[int].split(": ")[1].replace(';','');
} else if (/color/i.test(attributes[int])) {
// color
object.color = attributes[int].split(":")[1];
} else if (attributes[int] == 'em') {
// italic
if (object.italics) {
delete object.bold;
object.bolditalics = true;
} else {
object.italics = true;
}
} else if (attributes[int] == 'strong') {
// bold
if (object.italics) {
delete object.italics;
object.bolditalics = true;
} else {
object.bold = true;
}
} else if (attributes[int] == 'u') {
// underline
object.decoration = 'underline';
} else if (attributes[int] == 's') {
// strike
object.decoration = 'lineThrough';
}
}
object.text = node.textContent;
if(!pdfContent[vertical])pdfContent[vertical]=[];
if(!pdfContent[vertical][horizontal])
pdfContent[vertical][horizontal]=[];
pdfContent[vertical][horizontal].push(object);
}
};
var pdfContent = [];
for(var vertical = 0; vertical < payment.htmlContent.length; vertical++) {
for(var horizontal = 0; horizontal < payment.htmlContent[vertical].length; horizontal++) {
var parser = new DOMParser();
var paragraphs = parser.parseFromString(payment.htmlContent[vertical][horizontal], "text/xml").getElementsByTagName('p');
for (var paragraph = 0; paragraph < paragraphs.length; paragraph++) {
for (var num = 0; num < paragraphs[paragraph].childNodes.length; num++) {
parse(paragraphs[paragraph].childNodes[num], vertical, horizontal, paragraph);
}
}
}
}
for(var i=0; i<pdfContent.length; i++){
for(var j=0; j<pdfContent[i].length; j++){
document.querySelector('#log').textContent+=pdfContent[i][j].toSource();
}
}
<p id="log"></p>
DOMParser 有一个奇怪的行为。当我使用 "text/xml" 作为参数时,我得到我的 object 并且每次我使用 child(如 parentNodes)时,child 本身就是一个 DOM object。但是,当我使用 "text/html" 作为参数时, children 不是 DOM objects。为什么会这样,我怎样才能让所有 children 都拥有 DOM object?
我是这样做的:
parser = new DOMParser();
doc = parser.parseFromString(stringContainingHTMLSource, "text/html").getElementsByTagName('p');
console.log(doc[0].childNodes[0]);
My childNode returns 元素但不是 DOM object...
编辑: 这是我的递归函数:
var getParents = function(node, parentNodes){
if(node.nodeName == 'span'){
parentNodes.push(node.attributes[0].nodeValue);
} else if(node.nodeName == 'p' && node.attributes.length > 0) {
parentNodes.push(node.nodeName);
parentNodes.push(node.attributes[0].nodeValue);
} else {
parentNodes.push(node.nodeName);
}
if(node.parentNode.nodeName != '#document'){
getParents(node.parentNode, parentNodes);
}
return parentNodes;
};
var parse = function(node, vertical, horizontal, paragraph){
if(node.childNodes.length > 0){
for(var int = 0; int < node.childNodes.length; int++){
parse(node.childNodes[int], vertical, horizontal, paragraph);
}
} else{
var object = {};
var attributes = getParents(node, []);
for(var int = 0; int < attributes.length; int++) {
// right alignment
if(/text-align/i.test(attributes[int])){
object.alignment = attributes[int].split(": ")[1].replace(';','');
} else if (/color/i.test(attributes[int])) {
// color
object.color = attributes[int].split(":")[1];
} else if (attributes[int] == 'em') {
// italic
if (object.italics) {
delete object.bold;
object.bolditalics = true;
} else {
object.italics = true;
}
} else if (attributes[int] == 'strong') {
// bold
if (object.italics) {
delete object.italics;
object.bolditalics = true;
} else {
object.bold = true;
}
} else if (attributes[int] == 'u') {
// underline
object.decoration = 'underline';
} else if (attributes[int] == 's') {
// strike
object.decoration = 'lineThrough';
}
}
object.text = node.textContent;
pdfContent[vertical][horizontal].push(object);
}
};
for(var vertical = 0; vertical < payment.htmlContent.length; vertical++) {
for(var horizontal = 0; horizontal < payment.htmlContent[vertical].length; horizontal++) {
var parser = new DOMParser();
var paragraphs = parser.parseFromString(payment.htmlContent[vertical][horizontal], "text/xml").getElementsByTagName('p');
for (var paragraph = 0; paragraph < paragraphs.length; paragraph++) {
for (var num = 0; num < paragraphs[paragraph].childNodes.length; num++) {
parse(paragraphs[paragraph].childNodes[num], vertical, horizontal, paragraph);
}
}
}
}
我对这些值做了一些假设,在我向您的代码中添加了一些验证(例如 if(node.attributes.length>0)
之后,它似乎有效。
var payment={htmlContent:[['<p>some<em>text</em></p>', '<p>some<span>text<strong>here</strong></span></p>'],['<p>some<s>text</s></p>', '<p>some<span style="color:#FF00FF">text</span></p>']]};
var getParents = function(node, parentNodes){
if(node.nodeName == 'span'){
if(node.attributes.length>0)
parentNodes.push(node.attributes[0].nodeValue);
} else if(node.nodeName == 'p' && node.attributes.length > 0) {
parentNodes.push(node.nodeName);
if(node.attributes.length>0)
parentNodes.push(node.attributes[0].nodeValue);
} else {
parentNodes.push(node.nodeName);
}
if(node.parentNode.nodeName != '#document'){
getParents(node.parentNode, parentNodes);
}
return parentNodes;
};
var parse = function(node, vertical, horizontal, paragraph){
if(node.childNodes.length > 0){
for(var int = 0; int < node.childNodes.length; int++){
parse(node.childNodes[int], vertical, horizontal, paragraph);
}
} else{
var object = {};
var attributes = getParents(node, []);
console.log(attributes);
for(var int = 0; int < attributes.length; int++) {
// right alignment
if(/text-align/i.test(attributes[int])){
object.alignment = attributes[int].split(": ")[1].replace(';','');
} else if (/color/i.test(attributes[int])) {
// color
object.color = attributes[int].split(":")[1];
} else if (attributes[int] == 'em') {
// italic
if (object.italics) {
delete object.bold;
object.bolditalics = true;
} else {
object.italics = true;
}
} else if (attributes[int] == 'strong') {
// bold
if (object.italics) {
delete object.italics;
object.bolditalics = true;
} else {
object.bold = true;
}
} else if (attributes[int] == 'u') {
// underline
object.decoration = 'underline';
} else if (attributes[int] == 's') {
// strike
object.decoration = 'lineThrough';
}
}
object.text = node.textContent;
if(!pdfContent[vertical])pdfContent[vertical]=[];
if(!pdfContent[vertical][horizontal])
pdfContent[vertical][horizontal]=[];
pdfContent[vertical][horizontal].push(object);
}
};
var pdfContent = [];
for(var vertical = 0; vertical < payment.htmlContent.length; vertical++) {
for(var horizontal = 0; horizontal < payment.htmlContent[vertical].length; horizontal++) {
var parser = new DOMParser();
var paragraphs = parser.parseFromString(payment.htmlContent[vertical][horizontal], "text/xml").getElementsByTagName('p');
for (var paragraph = 0; paragraph < paragraphs.length; paragraph++) {
for (var num = 0; num < paragraphs[paragraph].childNodes.length; num++) {
parse(paragraphs[paragraph].childNodes[num], vertical, horizontal, paragraph);
}
}
}
}
for(var i=0; i<pdfContent.length; i++){
for(var j=0; j<pdfContent[i].length; j++){
document.querySelector('#log').textContent+=pdfContent[i][j].toSource();
}
}
<p id="log"></p>