在已解析的 table 中更改/替换 url 的特定部分
change / replace specific part of url inside a parsed table
我想通过 simple_html_dom 解析 table。到目前为止,一切都很好。现在我想更改 table 单元格内的所有链接。它们目前以“.htm”结尾,应更改为“.php”,因此链接指向相同的文件名,但文件类型不同。由于源文件的内容不断变化,它必须独立于文件名工作。
示例:
<td><a href="www.website.com/site1.htm" ... --> <td><a href="www.website.com/site1.php"
这是当前代码:
// simple_html_dom.php has to be downloaded beforehand from
// http://simplehtmldom.sourceforge.net/
require_once 'simple_html_dom.php';
// Load and parse the HTML document. file_get_html() is the loader provided by
// simple_html_dom; the markup could equally be fetched with cURL or a crawling
// framework.
$html = file_get_html('mywebsite/example.htm');
// Table 1: the element at index 1 of the 'table' matches.
$table = $html->find('table', 1);
// Collect the inner markup of every cell, one list per table row.
$rowData = [];
foreach ($table->find('tr') as $tableRow) {
    $cells = [];
    // Data cells are gathered first, header cells second.
    foreach (['td', 'th'] as $tagName) {
        foreach ($tableRow->find($tagName) as $node) {
            $cells[] = $node->innertext;
        }
    }
    $rowData[] = $cells;
}
// Print each collected row, wrapping every value in a <td> element.
foreach ($rowData as $cells) {
    $markup = '<tr>';
    foreach ($cells as $content) {
        $markup .= '<td>' . $content . '</td>';
    }
    echo $markup . '</tr>';
}
来源看起来像:
<table><hr>
<tr><th>po</th><th>player</th><th>age</th><th>2ga</th><th>2g%</th><th>fta</th><th>ft%</th><th>3ga</th><th>3g%</th><th>orb</th><th>drb</th><th>ast</th><th>stl</th><th>to</th><th>blk</th><th>o-o</th><th>d-o</th><th>p-o</th><th>t-o</th><th>o-d</th><th>d-d</th><th>p-d</th><th>t-d</th></tr>
<tr><td CLASS=tdp>PG</td><td CLASS=tdp><a href="JamesHarden7.htm">James Harden </a></td><td>27</td><td>48</td><td>53</td><td>95</td><td>85</td><td>85</td><td>35</td><td>20</td><td>59</td><td>99</td><td>57</td><td>1</td><td>12</td><td>4</td><td>9</td><td>7</td><td>9</td><td>8</td><td>6</td><td>5</td><td>7</td></tr>
<tr><td CLASS=tdp>PG</td><td CLASS=tdp><a href="TerryRozier1.htm">Terry Rozier </a></td><td>22</td><td>31</td><td>41</td><td>15</td><td>77</td><td>43</td><td>32</td><td>18</td><td>42</td><td>31</td><td>46</td><td>79</td><td>8</td><td>5</td><td>4</td><td>4</td><td>2</td><td>6</td><td>5</td><td>4</td><td>6</td></tr>
<tr><td CLASS=tdp>SG</td><td CLASS=tdp><a href="DannyGreen6.htm">Danny Green
and so on...
您可以使用 find("td a") 获取示例中的锚点(<a> 元素)。然后使用 foreach 遍历结果,并使用例如 substr_replace 将 href 属性的最后 3 个字符替换为 php:
// Sample markup: a table whose player cells hold links ending in ".htm".
$html = <<<HTML
<table><hr>
<tr><th>po</th><th>player</th><th>age</th><th>2ga</th><th>2g%</th><th>fta</th><th>ft%</th><th>3ga</th><th>3g%</th><th>orb</th><th>drb</th><th>ast</th><th>stl</th><th>to</th><th>blk</th><th>o-o</th><th>d-o</th><th>p-o</th><th>t-o</th><th>o-d</th><th>d-d</th><th>p-d</th><th>t-d</th></tr>
<tr><td CLASS=tdp>PG</td><td CLASS=tdp><a href="JamesHarden7.htm">James Harden </a></td><td>27</td><td>48</td><td>53</td><td>95</td><td>85</td><td>85</td><td>35</td><td>20</td><td>59</td><td>99</td><td>57</td><td>1</td><td>12</td><td>4</td><td>9</td><td>7</td><td>9</td><td>8</td><td>6</td><td>5</td><td>7</td></tr>
<tr><td CLASS=tdp>PG</td><td CLASS=tdp><a href="TerryRozier1.htm">Terry Rozier </a></td><td>22</td><td>31</td><td>41</td><td>15</td><td>77</td><td>43</td><td>32</td><td>18</td><td>42</td><td>31</td><td>46</td><td>79</td><td>8</td><td>5</td><td>4</td><td>4</td><td>2</td><td>6</td><td>5</td><td>4</td><td>6</td></tr>
</table>
HTML;
// Parse the string with simple_html_dom (simple_html_dom.php must already be loaded).
$html = str_get_html($html);
// Visit every anchor that sits inside a table cell.
foreach ($html->find("td a") as $a) {
    $href = $a->href;
    // Only rewrite links that really end in ".htm" (case-insensitive). Blindly
    // replacing the last three characters would corrupt any other href, and
    // would invent an href on anchors that have none.
    if (is_string($href) && strtolower(substr($href, -4)) === '.htm') {
        $a->href = substr_replace($href, 'php', -3);
    }
}
您也可以改用 DOMDocument,并使用 XPath 查找所有对您网站的引用。此代码加载 HTML,查找所有 URL 中包含该网站的 href 属性,然后将其 htm 扩展名改为 php。
// Parse the markup held in $html with PHP's bundled DOM extension.
$dom = new DOMDocument();
$dom->loadHTML($html);
$xp = new DOMXPath($dom);
// Select every href attribute whose value mentions the site's host name.
$links = $xp->query("//@href[contains(.,'www.website.com')]");
foreach ( $links as $href ) {
    // Swap only a real ".htm" extension: one that ends the path, i.e. is
    // followed by the end of the URL, a query string or a fragment. A plain
    // str_replace() would also turn ".html" into ".phpl" and rewrite any
    // ".htm" that merely occurs somewhere inside the URL.
    $href->nodeValue = preg_replace('/^([^?#]*)\.htm(?=[?#]|$)/i', '${1}.php', $href->nodeValue);
}
// Emit the rewritten document.
echo $dom->saveHTML();
我想通过 simple_html_dom 解析 table。到目前为止,一切都很好。现在我想更改 table 单元格内的所有链接。它们目前以“.htm”结尾,应更改为“.php”,因此链接指向相同的文件名,但文件类型不同。由于源文件的内容不断变化,它必须独立于文件名工作。
示例:
<td><a href="www.website.com/site1.htm" ... --> <td><a href="www.website.com/site1.php"
这是当前代码:
// simple_html_dom.php has to be downloaded beforehand from
// http://simplehtmldom.sourceforge.net/
require_once 'simple_html_dom.php';
// Load and parse the HTML document. file_get_html() is the loader provided by
// simple_html_dom; the markup could equally be fetched with cURL or a crawling
// framework.
$html = file_get_html('mywebsite/example.htm');
// Table 1: the element at index 1 of the 'table' matches.
$table = $html->find('table', 1);
// Collect the inner markup of every cell, one list per table row.
$rowData = [];
foreach ($table->find('tr') as $tableRow) {
    $cells = [];
    // Data cells are gathered first, header cells second.
    foreach (['td', 'th'] as $tagName) {
        foreach ($tableRow->find($tagName) as $node) {
            $cells[] = $node->innertext;
        }
    }
    $rowData[] = $cells;
}
// Print each collected row, wrapping every value in a <td> element.
foreach ($rowData as $cells) {
    $markup = '<tr>';
    foreach ($cells as $content) {
        $markup .= '<td>' . $content . '</td>';
    }
    echo $markup . '</tr>';
}
来源看起来像:
<table><hr>
<tr><th>po</th><th>player</th><th>age</th><th>2ga</th><th>2g%</th><th>fta</th><th>ft%</th><th>3ga</th><th>3g%</th><th>orb</th><th>drb</th><th>ast</th><th>stl</th><th>to</th><th>blk</th><th>o-o</th><th>d-o</th><th>p-o</th><th>t-o</th><th>o-d</th><th>d-d</th><th>p-d</th><th>t-d</th></tr>
<tr><td CLASS=tdp>PG</td><td CLASS=tdp><a href="JamesHarden7.htm">James Harden </a></td><td>27</td><td>48</td><td>53</td><td>95</td><td>85</td><td>85</td><td>35</td><td>20</td><td>59</td><td>99</td><td>57</td><td>1</td><td>12</td><td>4</td><td>9</td><td>7</td><td>9</td><td>8</td><td>6</td><td>5</td><td>7</td></tr>
<tr><td CLASS=tdp>PG</td><td CLASS=tdp><a href="TerryRozier1.htm">Terry Rozier </a></td><td>22</td><td>31</td><td>41</td><td>15</td><td>77</td><td>43</td><td>32</td><td>18</td><td>42</td><td>31</td><td>46</td><td>79</td><td>8</td><td>5</td><td>4</td><td>4</td><td>2</td><td>6</td><td>5</td><td>4</td><td>6</td></tr>
<tr><td CLASS=tdp>SG</td><td CLASS=tdp><a href="DannyGreen6.htm">Danny Green
and so on...
您可以使用 find("td a") 获取示例中的锚点(<a> 元素)。然后使用 foreach 遍历结果,并使用例如 substr_replace 将 href 属性的最后 3 个字符替换为 php:
// Sample markup: a table whose player cells hold links ending in ".htm".
$html = <<<HTML
<table><hr>
<tr><th>po</th><th>player</th><th>age</th><th>2ga</th><th>2g%</th><th>fta</th><th>ft%</th><th>3ga</th><th>3g%</th><th>orb</th><th>drb</th><th>ast</th><th>stl</th><th>to</th><th>blk</th><th>o-o</th><th>d-o</th><th>p-o</th><th>t-o</th><th>o-d</th><th>d-d</th><th>p-d</th><th>t-d</th></tr>
<tr><td CLASS=tdp>PG</td><td CLASS=tdp><a href="JamesHarden7.htm">James Harden </a></td><td>27</td><td>48</td><td>53</td><td>95</td><td>85</td><td>85</td><td>35</td><td>20</td><td>59</td><td>99</td><td>57</td><td>1</td><td>12</td><td>4</td><td>9</td><td>7</td><td>9</td><td>8</td><td>6</td><td>5</td><td>7</td></tr>
<tr><td CLASS=tdp>PG</td><td CLASS=tdp><a href="TerryRozier1.htm">Terry Rozier </a></td><td>22</td><td>31</td><td>41</td><td>15</td><td>77</td><td>43</td><td>32</td><td>18</td><td>42</td><td>31</td><td>46</td><td>79</td><td>8</td><td>5</td><td>4</td><td>4</td><td>2</td><td>6</td><td>5</td><td>4</td><td>6</td></tr>
</table>
HTML;
// Parse the string with simple_html_dom (simple_html_dom.php must already be loaded).
$html = str_get_html($html);
// Visit every anchor that sits inside a table cell.
foreach ($html->find("td a") as $a) {
    $href = $a->href;
    // Only rewrite links that really end in ".htm" (case-insensitive). Blindly
    // replacing the last three characters would corrupt any other href, and
    // would invent an href on anchors that have none.
    if (is_string($href) && strtolower(substr($href, -4)) === '.htm') {
        $a->href = substr_replace($href, 'php', -3);
    }
}
您也可以改用 DOMDocument,并使用 XPath 查找所有对您网站的引用。此代码加载 HTML,查找所有 URL 中包含该网站的 href 属性,然后将其 htm 扩展名改为 php。
$dom = new DOMDocument();
// Parse the markup held in $html into the DOMDocument instance $dom.
$dom->loadHTML($html);
$xp = new DOMXPath($dom);
// Select every href attribute whose value mentions the site's host name.
$links = $xp->query("//@href[contains(.,'www.website.com')]");
foreach ( $links as $href ) {
    // Swap only a real ".htm" extension: one that ends the path, i.e. is
    // followed by the end of the URL, a query string or a fragment. A plain
    // str_replace() would also turn ".html" into ".phpl" and rewrite any
    // ".htm" that merely occurs somewhere inside the URL.
    $href->nodeValue = preg_replace('/^([^?#]*)\.htm(?=[?#]|$)/i', '${1}.php', $href->nodeValue);
}
// Emit the rewritten document.
echo $dom->saveHTML();