获取元素的 innerHTML,但不是元素本身
Get the innerHTML of an element, but not the element itself
我正在尝试从一个两列的表格(table)中提取数据。第一列是变量名,第二列是该变量的数据。
我的代码基本可以正常工作,但有些数据可能包含 HTML,并且通常被包裹在一个 DIV 中。我想获取 DIV 内部的 HTML,而不包含 DIV 本身。我知道正则表达式可能是一种解决方案,但我想更好地理解 DOMDocument。
/**
 * Download the published spreadsheet and convert its two-column table
 * into an associative array.
 *
 * The first cell of each data row becomes the key (lower-cased, spaces
 * replaced by underscores) and the second cell becomes the value. When the
 * value cell contains DIV elements, their serialized HTML is stored, wrapper
 * DIV included; otherwise the plain text of the cell is stored.
 *
 * @return array<string, string> Map of variable name to value.
 *
 * @throws \RuntimeException When the page cannot be downloaded or has no table.
 */
private function readHtml()
{
    $url = "https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml";

    $curl = curl_init($url);
    // Without CURLOPT_RETURNTRANSFER, curl_exec() prints the body and returns true.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    $htmlData = curl_exec($curl);

    if (!is_string($htmlData) || $htmlData === '') {
        throw new \RuntimeException('Unable to download ' . $url);
    }

    $dom = new \DOMDocument();
    // This flag only has an effect when it is set before the document is loaded.
    $dom->preserveWhiteSpace = false;
    // Collect libxml parse warnings instead of emitting them while loading.
    $previous = libxml_use_internal_errors(true);
    $dom->loadHTML($htmlData);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $firstTable = $dom->getElementsByTagName('table')->item(0);
    if ($firstTable === null) {
        throw new \RuntimeException('No table found in ' . $url);
    }

    $table = [];
    $key = null;
    foreach ($firstTable->getElementsByTagName('tr') as $i => $row) {
        // Skip the heading rows.
        if ($i <= 1) {
            continue;
        }

        foreach ($row->getElementsByTagName('td') as $count => $node) {
            if ($count === 0) {
                $key = strtolower(str_replace(' ', '_', $node->textContent));
                continue;
            }

            $htmlNode = $node->getElementsByTagName('div');
            if ($htmlNode->length >= 1) {
                // saveHTML($node) serializes the node itself, so the DIV tag is part of the value.
                $value = '';
                foreach ($htmlNode as $innerNode) {
                    $value .= $dom->saveHTML($innerNode);
                }
            } else {
                $value = $node->textContent;
            }

            $table[$key] = $value;
        }
    }

    return $table;
}
我的输出是正确的,但对于包含 HTML 的数据,我不想把外层的包装 DIV 也包含进来:
[type] => raw
[direction] => north
[intro] => Welcome to the test.
[html_body] => <div class="softmerge-inner" style="width: 5653px; left: -1px;">Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut <span style="font-weight:bold;">aliquip</span> ex ea commodo consequat. Duis aute irure dolor in <span style="text-decoration:underline;">reprehenderit</span> in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, <span style="font-style:italic;">sunt in</span> culpa qui officia deserunt mollit anim id est laborum.</div>
[count] => 1003
/**
 * Load the published spreadsheet and return its rows as a key => value map.
 *
 * The first cell of each data row becomes the key (lower-cased, spaces
 * replaced by underscores). Every following cell is serialized to HTML and
 * its TD and DIV tags are removed with a regular expression, so other inline
 * markup inside the cell is kept.
 *
 * @return array<string, string> Map of variable name to value.
 *
 * @throws \RuntimeException When the page cannot be loaded or has no table.
 */
private function readHtml()
{
    # the url given in your example
    $url = "https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml";

    $dom = new \DOMDocument();
    $dom->preserveWhiteSpace = false;
    // Collect libxml parse warnings instead of emitting them while loading.
    $previous = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTMLFile($url);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if ($loaded === false) {
        throw new \RuntimeException('Unable to load ' . $url);
    }

    $firstTable = $dom->getElementsByTagName('table')->item(0);
    if ($firstTable === null) {
        throw new \RuntimeException('No table found in ' . $url);
    }

    $table = [];
    $key = null;
    foreach ($firstTable->getElementsByTagName('tr') as $i => $row) {
        // Skip the heading rows.
        if ($i <= 1) {
            continue;
        }

        foreach ($row->getElementsByTagName('td') as $count => $node) {
            if ($count === 0) {
                $key = strtolower(str_replace(' ', '_', $node->textContent));
                continue;
            }

            $value = $dom->saveHTML($node);
            // Remove opening and closing TD/DIV tags only. \b keeps longer tag
            // names that merely start with "div" or "td" intact, and [^>]*
            // also covers tags whose attributes span several lines.
            $value = preg_replace('/<\/?(?:div|td)\b[^>]*>/i', '', $value);
            $table[$key] = $value;
        }
    }

    return $table;
}
有关 preg_replace 及正则表达式的用法,请参阅 PHP 官方手册(原帖此处的两个链接在提取时丢失)。
或者!您可以像这样使用 simple_html_dom.php:
include 'simple_html_dom.php';//<--- Must download to current directory
$url = 'https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml';
$html = file_get_html( $url );
// Only iterate when the page was actually fetched and parsed.
if ( $html ) {
    // Print the inner markup of every DIV whose class is "softmerge-inner".
    foreach ( $html->find( "div[class=softmerge-inner]" ) as $element ) {
        echo $element->innertext;
    }
}
//See http://simplehtmldom.sourceforge.net/manual.htm for usage
你的思路是对的!下一步是学习功能非常强大的 XPath 查询语句,并将它与 DOMDocument 配合使用,例如:
# the url given in your example
$url = "https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml";
$doc = new \DOMDocument();
// Collect libxml parse warnings instead of emitting them while loading.
$previous = libxml_use_internal_errors(true);
// The document has to be loaded before XPath queries can match anything.
$doc->loadHTMLFile($url);
libxml_clear_errors();
libxml_use_internal_errors($previous);
$xpath = new \DOMXpath($doc);
# here comes the magic
$html_body = $xpath->query("//td[text()='html_body']")->item(0);
// item(0) is null when no cell matches; nextSibling is null for a last child.
if ($html_body !== null && $html_body->nextSibling !== null) {
    $div_text = $html_body->nextSibling->textContent;
    echo $div_text;
}
关键在于:在 DOM 中查询文本内容等于 html_body 的单元格,这是通过 //td[here comes the expression to filter on all columns in the dom] 完成的。之后,只需选取它的下一个兄弟节点即可。基于这一思路,您甚至可以用 foreach 遍历 waffle 表格中的所有行,从而重写整个函数:
// Visit every TR inside the table whose class attribute is "waffle".
foreach($xpath->query("//table[@class='waffle']//tr") as $row) {
    // do sth. useful here
}
$url = "https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml";
$doc = new \DOMDocument();
// Collect libxml parse warnings instead of emitting them while loading.
$previous = libxml_use_internal_errors(true);
// The document has to be loaded before XPath queries can match anything.
$doc->loadHTMLFile($url);
libxml_clear_errors();
libxml_use_internal_errors($previous);
$xpath = new \DOMXpath($doc);
foreach ($xpath->query("//table[@class='waffle']//tr") as $row) {
    // Query relative to the current row so only its own TD children are returned.
    $columns = $xpath->query("./td", $row);
    // A row needs both a key cell and a value cell to be printed as a pair.
    if ($columns->length < 2) {
        continue;
    }
    $key_td = $columns->item(0);
    $value_td = $columns->item(1);
    echo "[" . $key_td->nodeValue . "]: " . $value_td->nodeValue . "\n";
}
我正在尝试从一个两列的表格(table)中提取数据。第一列是变量名,第二列是该变量的数据。
我的代码基本可以正常工作,但有些数据可能包含 HTML,并且通常被包裹在一个 DIV 中。我想获取 DIV 内部的 HTML,而不包含 DIV 本身。我知道正则表达式可能是一种解决方案,但我想更好地理解 DOMDocument。
/**
 * Download the published spreadsheet and convert its two-column table
 * into an associative array.
 *
 * The first cell of each data row becomes the key (lower-cased, spaces
 * replaced by underscores) and the second cell becomes the value. When the
 * value cell contains DIV elements, their serialized HTML is stored, wrapper
 * DIV included; otherwise the plain text of the cell is stored.
 *
 * @return array<string, string> Map of variable name to value.
 *
 * @throws \RuntimeException When the page cannot be downloaded or has no table.
 */
private function readHtml()
{
    $url = "https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml";

    $curl = curl_init($url);
    // Without CURLOPT_RETURNTRANSFER, curl_exec() prints the body and returns true.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    $htmlData = curl_exec($curl);

    if (!is_string($htmlData) || $htmlData === '') {
        throw new \RuntimeException('Unable to download ' . $url);
    }

    $dom = new \DOMDocument();
    // This flag only has an effect when it is set before the document is loaded.
    $dom->preserveWhiteSpace = false;
    // Collect libxml parse warnings instead of emitting them while loading.
    $previous = libxml_use_internal_errors(true);
    $dom->loadHTML($htmlData);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $firstTable = $dom->getElementsByTagName('table')->item(0);
    if ($firstTable === null) {
        throw new \RuntimeException('No table found in ' . $url);
    }

    $table = [];
    $key = null;
    foreach ($firstTable->getElementsByTagName('tr') as $i => $row) {
        // Skip the heading rows.
        if ($i <= 1) {
            continue;
        }

        foreach ($row->getElementsByTagName('td') as $count => $node) {
            if ($count === 0) {
                $key = strtolower(str_replace(' ', '_', $node->textContent));
                continue;
            }

            $htmlNode = $node->getElementsByTagName('div');
            if ($htmlNode->length >= 1) {
                // saveHTML($node) serializes the node itself, so the DIV tag is part of the value.
                $value = '';
                foreach ($htmlNode as $innerNode) {
                    $value .= $dom->saveHTML($innerNode);
                }
            } else {
                $value = $node->textContent;
            }

            $table[$key] = $value;
        }
    }

    return $table;
}
我的输出是正确的,但对于包含 HTML 的数据,我不想把外层的包装 DIV 也包含进来:
Array
[type] => raw
[direction] => north
[intro] => Welcome to the test.
[html_body] => <div class="softmerge-inner" style="width: 5653px; left: -1px;">Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut <span style="font-weight:bold;">aliquip</span> ex ea commodo consequat. Duis aute irure dolor in <span style="text-decoration:underline;">reprehenderit</span> in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, <span style="font-style:italic;">sunt in</span> culpa qui officia deserunt mollit anim id est laborum.</div>
[count] => 1003
/**
 * Load the published spreadsheet and return its rows as a key => value map.
 *
 * The first cell of each data row becomes the key (lower-cased, spaces
 * replaced by underscores). Every following cell is serialized to HTML and
 * its TD and DIV tags are removed with a regular expression, so other inline
 * markup inside the cell is kept.
 *
 * @return array<string, string> Map of variable name to value.
 *
 * @throws \RuntimeException When the page cannot be loaded or has no table.
 */
private function readHtml()
{
    # the url given in your example
    $url = "https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml";

    $dom = new \DOMDocument();
    $dom->preserveWhiteSpace = false;
    // Collect libxml parse warnings instead of emitting them while loading.
    $previous = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTMLFile($url);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if ($loaded === false) {
        throw new \RuntimeException('Unable to load ' . $url);
    }

    $firstTable = $dom->getElementsByTagName('table')->item(0);
    if ($firstTable === null) {
        throw new \RuntimeException('No table found in ' . $url);
    }

    $table = [];
    $key = null;
    foreach ($firstTable->getElementsByTagName('tr') as $i => $row) {
        // Skip the heading rows.
        if ($i <= 1) {
            continue;
        }

        foreach ($row->getElementsByTagName('td') as $count => $node) {
            if ($count === 0) {
                $key = strtolower(str_replace(' ', '_', $node->textContent));
                continue;
            }

            $value = $dom->saveHTML($node);
            // Remove opening and closing TD/DIV tags only. \b keeps longer tag
            // names that merely start with "div" or "td" intact, and [^>]*
            // also covers tags whose attributes span several lines.
            $value = preg_replace('/<\/?(?:div|td)\b[^>]*>/i', '', $value);
            $table[$key] = $value;
        }
    }

    return $table;
}
有关 preg_replace 及正则表达式的用法,请参阅 PHP 官方手册(原帖此处的两个链接在提取时丢失)。
或者!您可以像这样使用 simple_html_dom.php:
include 'simple_html_dom.php';//<--- Must download to current directory
$url = 'https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml';
$html = file_get_html( $url );
// Only iterate when the page was actually fetched and parsed.
if ( $html ) {
    // Print the inner markup of every DIV whose class is "softmerge-inner".
    foreach ( $html->find( "div[class=softmerge-inner]" ) as $element ) {
        echo $element->innertext;
    }
}
//See http://simplehtmldom.sourceforge.net/manual.htm for usage
你的思路是对的!下一步是学习功能非常强大的 XPath 查询语句,并将它与 DOMDocument 配合使用,例如:
# the url given in your example
$url = "https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml";
$doc = new \DOMDocument();
// Collect libxml parse warnings instead of emitting them while loading.
$previous = libxml_use_internal_errors(true);
// The document has to be loaded before XPath queries can match anything.
$doc->loadHTMLFile($url);
libxml_clear_errors();
libxml_use_internal_errors($previous);
$xpath = new \DOMXpath($doc);
# here comes the magic
$html_body = $xpath->query("//td[text()='html_body']")->item(0);
// item(0) is null when no cell matches; nextSibling is null for a last child.
if ($html_body !== null && $html_body->nextSibling !== null) {
    $div_text = $html_body->nextSibling->textContent;
    echo $div_text;
}
关键在于:在 DOM 中查询文本内容等于 html_body 的单元格,这是通过 //td[here comes the expression to filter on all columns in the dom] 完成的。之后,只需选取它的下一个兄弟节点即可。基于这一思路,您甚至可以用 foreach 遍历 waffle 表格中的所有行,从而重写整个函数:
// Visit every TR inside the table whose class attribute is "waffle".
foreach($xpath->query("//table[@class='waffle']//tr") as $row) {
    // do sth. useful here
}
$url = "https://docs.google.com/spreadsheets/d/1Klpic32Gb_TDblDZDJQOkDedFGuNHAokxUXqrCPDFWE/pubhtml";
$doc = new \DOMDocument();
// Collect libxml parse warnings instead of emitting them while loading.
$previous = libxml_use_internal_errors(true);
// The document has to be loaded before XPath queries can match anything.
$doc->loadHTMLFile($url);
libxml_clear_errors();
libxml_use_internal_errors($previous);
$xpath = new \DOMXpath($doc);
foreach ($xpath->query("//table[@class='waffle']//tr") as $row) {
    // Query relative to the current row so only its own TD children are returned.
    $columns = $xpath->query("./td", $row);
    // A row needs both a key cell and a value cell to be printed as a pair.
    if ($columns->length < 2) {
        continue;
    }
    $key_td = $columns->item(0);
    $value_td = $columns->item(1);
    echo "[" . $key_td->nodeValue . "]: " . $value_td->nodeValue . "\n";
}