修正 SimpleHtmlDom 抓取网页中失效链接的函数
Function for correcting broken links in web page scraped by SimpleHtmlDom
我正在使用 SimpleHtmlDom 抓取 HTML。它按页面原样获取 HTML,导致图像和脚本出现很多失效链接,因为这些链接不包含指向资源位置的完整 URL,页面因此无法正确显示。
我目前的做法是把 src="/ 替换为 src="http://example.com/,但是当链接没有前导斜杠时就变得棘手了,因为很难判断它是本地(相对)链接还是完整链接。
例如:
<img src="images/pic.jpg">
我需要找到它并将其更正为:
<img src="http://example.com/images/pic.jpg">
是否有正则表达式或函数,可以找出 src=" 后面没有前导斜杠的情况?它还需要适用于所有类型的链接,例如 a href、script 等。
您可以使用正则表达式 <img src=\"(.+)\"> 进行匹配,
并检查 $1 是否包含“http”。
如果您使用的是 Simple HTML DOM,
则可以使用以下代码段来调整这些 URL:
<?php
require 'simple_html_dom.php';
/**
 * Loads an HTML page through simple_html_dom's file_get_html() and rewrites
 * relative URLs found in selected tag attributes into absolute URLs, using
 * the page's own URL as the base.
 *
 * Usage: construct with the page URL, call adjust() once per tag/attribute
 * pair (calls can be chained), then read the result with getHtml().
 */
class Parser {
    /** @var string URL of the page, exactly as passed to the constructor */
    protected $url;
    /** @var array|false Result of parse_url() on $url */
    protected $url_parts;
    /** @var object|null DOM returned by file_get_html(); null until first needed */
    protected $html_dom = null;
    /** @var string|null Cached result of getPath(); null until computed */
    protected $path = null;

    /**
     * @param string $url URL of the page to load and use as the base for relative links
     */
    public function __construct($url) {
        $this->setUrl($url);
    }

    /**
     * Stores the page URL together with its parse_url() breakdown.
     *
     * @param string $url
     * @return $this
     */
    protected function setUrl($url) {
        $this->url = $url;
        $this->url_parts = parse_url($url);
        return $this;
    }

    /** @return string The page URL as given to the constructor */
    protected function getUrl() {
        return $this->url;
    }

    /** @return array|false The parse_url() result for the page URL */
    protected function getUrlParts() {
        return $this->url_parts;
    }

    /**
     * Reads one component of the parsed page URL without raising a notice
     * when the component is absent or the URL could not be parsed at all.
     *
     * @param string $name    parse_url() component key, e.g. 'host'
     * @param mixed  $default Value returned when the component is missing
     * @return mixed
     */
    protected function getUrlPart($name, $default = null) {
        $parts = $this->getUrlParts();
        return (is_array($parts) && isset($parts[$name])) ? $parts[$name] : $default;
    }

    /** @return string Scheme of the page URL, defaulting to 'http' when it has none */
    protected function getScheme() {
        return $this->getUrlPart('scheme', 'http');
    }

    /**
     * Loads the DOM on first use and caches it.
     *
     * @return object The DOM object produced by file_get_html()
     * @throws RuntimeException When file_get_html() does not return an object
     *                          (previously this surfaced later as a fatal
     *                          "call to a member function on bool")
     */
    protected function getHtmlDom() {
        if ($this->html_dom === null) {
            $dom = file_get_html($this->getUrl());
            if (!is_object($dom)) {
                throw new RuntimeException('Unable to load HTML from '.$this->getUrl());
            }
            $this->html_dom = $dom;
        }
        return $this->html_dom;
    }

    /**
     * Returns the directory part of the page URL's path, without a trailing slash.
     *
     * The last path segment is always dropped:
     *  - path ends with /, e.g. /foo/bar/foo/ -> /foo/bar/foo
     *  - path doesn't end with /, e.g. /foo/bar/foo -> /foo/bar
     * A URL without a path yields an empty string.
     *
     * @return string
     */
    public function getPath() {
        if ($this->path === null) {
            $path = $this->getUrlPart('path');
            $this->path = ($path === null) ? '' : implode('/', explode('/', $path, -1));
        }
        return $this->path;
    }

    /**
     * Returns scheme://host[:port] of the page URL.
     *
     * The port is included when the page URL carries one, so that resolved
     * links keep pointing at the same server. A missing host yields an empty
     * host part instead of an undefined-index notice.
     *
     * @return string
     */
    public function getHost() {
        $origin = $this->getScheme().'://'.$this->getUrlPart('host', '');
        $port = $this->getUrlPart('port');
        return ($port === null) ? $origin : $origin.':'.$port;
    }

    /**
     * Resolves a single attribute value against the page URL.
     *
     * Returned unchanged:
     *  - non-string values (e.g. the false/true simple_html_dom is expected to
     *    yield for an absent or valueless attribute) and the empty string,
     *    so that tags such as inline <script> do not gain a bogus attribute;
     *  - fragment-only references (#top), which must stay in-page;
     *  - values that already have a scheme (http:, mailto:, data:, ...) and
     *    values parse_url() cannot parse.
     * Rewritten:
     *  - protocol-relative values (//host/x) get the page's scheme;
     *  - root-relative values (/x) get scheme://host[:port];
     *  - everything else is appended to the page's directory.
     *
     * Dot segments (./, ../) and query-only references are not normalised.
     *
     * @param mixed $value Raw attribute value
     * @return mixed Absolute URL, or $value itself when nothing needs to change
     */
    public function resolveUrl($value) {
        if (!is_string($value) || $value === '' || $value[0] === '#') {
            return $value;
        }
        // Must be tested before the scheme check: parse_url() reports no scheme for "//host/x".
        if (strpos($value, '//') === 0) {
            return $this->getScheme().':'.$value;
        }
        if (parse_url($value, PHP_URL_SCHEME) !== null) {
            return $value;
        }
        if ($value[0] === '/') {
            return $this->getHost().$value;
        }
        return $this->getHost().$this->getPath().'/'.$value;
    }

    /**
     * Rewrites $attribute on every $tag element of the page to an absolute URL.
     *
     * @param string $tag       Selector passed to the DOM's find(), e.g. 'img'
     * @param string $attribute Attribute holding the URL, e.g. 'src'
     * @return $this
     * @throws RuntimeException When the page cannot be loaded
     */
    public function adjust($tag, $attribute) {
        foreach ($this->getHtmlDom()->find($tag) as $element) {
            $current = $element->$attribute;
            $resolved = $this->resolveUrl($current);
            // Only write back real changes so absent attributes are never created.
            if ($resolved !== $current) {
                $element->$attribute = $resolved;
            }
        }
        return $this;
    }

    /**
     * @return string The (possibly adjusted) page serialised back to HTML
     * @throws RuntimeException When the page cannot be loaded
     */
    public function getHtml() {
        return (string)$this->getHtmlDom();
    }
}
// Demo: load the sample page, make the URLs of images, anchors, stylesheets
// and scripts absolute, then print the resulting HTML.
$parser = new Parser('https://www.darkbee.be/stack/images/index.html');

// adjust() returns the parser itself, so separate calls are equivalent to a fluent chain.
$parser->adjust('img', 'src');
$parser->adjust('a', 'href');
$parser->adjust('link', 'href');
$parser->adjust('script', 'src');

echo $parser->getHtml();
我正在使用 SimpleHtmlDom 抓取 HTML。它按页面原样获取 HTML,导致图像和脚本出现很多失效链接,因为这些链接不包含指向资源位置的完整 URL,页面因此无法正确显示。
我目前的做法是把 src="/ 替换为 src="http://example.com/,但是当链接没有前导斜杠时就变得棘手了,因为很难判断它是本地(相对)链接还是完整链接。
例如:
<img src="images/pic.jpg">
我需要找到它并将其更正为:
<img src="http://example.com/images/pic.jpg">
是否有正则表达式或函数,可以找出 src=" 后面没有前导斜杠的情况?它还需要适用于所有类型的链接,例如 a href、script 等。
您可以使用正则表达式 <img src=\"(.+)\"> 进行匹配,
并检查 $1 是否包含“http”。
如果您使用的是 Simple HTML DOM,
则可以使用以下代码段来调整这些 URL:
<?php
require 'simple_html_dom.php';
/**
 * Loads an HTML page through simple_html_dom's file_get_html() and rewrites
 * relative URLs found in selected tag attributes into absolute URLs, using
 * the page's own URL as the base.
 *
 * Usage: construct with the page URL, call adjust() once per tag/attribute
 * pair (calls can be chained), then read the result with getHtml().
 */
class Parser {
    /** @var string URL of the page, exactly as passed to the constructor */
    protected $url;
    /** @var array|false Result of parse_url() on $url */
    protected $url_parts;
    /** @var object|null DOM returned by file_get_html(); null until first needed */
    protected $html_dom = null;
    /** @var string|null Cached result of getPath(); null until computed */
    protected $path = null;

    /**
     * @param string $url URL of the page to load and use as the base for relative links
     */
    public function __construct($url) {
        $this->setUrl($url);
    }

    /**
     * Stores the page URL together with its parse_url() breakdown.
     *
     * @param string $url
     * @return $this
     */
    protected function setUrl($url) {
        $this->url = $url;
        $this->url_parts = parse_url($url);
        return $this;
    }

    /** @return string The page URL as given to the constructor */
    protected function getUrl() {
        return $this->url;
    }

    /** @return array|false The parse_url() result for the page URL */
    protected function getUrlParts() {
        return $this->url_parts;
    }

    /**
     * Reads one component of the parsed page URL without raising a notice
     * when the component is absent or the URL could not be parsed at all.
     *
     * @param string $name    parse_url() component key, e.g. 'host'
     * @param mixed  $default Value returned when the component is missing
     * @return mixed
     */
    protected function getUrlPart($name, $default = null) {
        $parts = $this->getUrlParts();
        return (is_array($parts) && isset($parts[$name])) ? $parts[$name] : $default;
    }

    /** @return string Scheme of the page URL, defaulting to 'http' when it has none */
    protected function getScheme() {
        return $this->getUrlPart('scheme', 'http');
    }

    /**
     * Loads the DOM on first use and caches it.
     *
     * @return object The DOM object produced by file_get_html()
     * @throws RuntimeException When file_get_html() does not return an object
     *                          (previously this surfaced later as a fatal
     *                          "call to a member function on bool")
     */
    protected function getHtmlDom() {
        if ($this->html_dom === null) {
            $dom = file_get_html($this->getUrl());
            if (!is_object($dom)) {
                throw new RuntimeException('Unable to load HTML from '.$this->getUrl());
            }
            $this->html_dom = $dom;
        }
        return $this->html_dom;
    }

    /**
     * Returns the directory part of the page URL's path, without a trailing slash.
     *
     * The last path segment is always dropped:
     *  - path ends with /, e.g. /foo/bar/foo/ -> /foo/bar/foo
     *  - path doesn't end with /, e.g. /foo/bar/foo -> /foo/bar
     * A URL without a path yields an empty string.
     *
     * @return string
     */
    public function getPath() {
        if ($this->path === null) {
            $path = $this->getUrlPart('path');
            $this->path = ($path === null) ? '' : implode('/', explode('/', $path, -1));
        }
        return $this->path;
    }

    /**
     * Returns scheme://host[:port] of the page URL.
     *
     * The port is included when the page URL carries one, so that resolved
     * links keep pointing at the same server. A missing host yields an empty
     * host part instead of an undefined-index notice.
     *
     * @return string
     */
    public function getHost() {
        $origin = $this->getScheme().'://'.$this->getUrlPart('host', '');
        $port = $this->getUrlPart('port');
        return ($port === null) ? $origin : $origin.':'.$port;
    }

    /**
     * Resolves a single attribute value against the page URL.
     *
     * Returned unchanged:
     *  - non-string values (e.g. the false/true simple_html_dom is expected to
     *    yield for an absent or valueless attribute) and the empty string,
     *    so that tags such as inline <script> do not gain a bogus attribute;
     *  - fragment-only references (#top), which must stay in-page;
     *  - values that already have a scheme (http:, mailto:, data:, ...) and
     *    values parse_url() cannot parse.
     * Rewritten:
     *  - protocol-relative values (//host/x) get the page's scheme;
     *  - root-relative values (/x) get scheme://host[:port];
     *  - everything else is appended to the page's directory.
     *
     * Dot segments (./, ../) and query-only references are not normalised.
     *
     * @param mixed $value Raw attribute value
     * @return mixed Absolute URL, or $value itself when nothing needs to change
     */
    public function resolveUrl($value) {
        if (!is_string($value) || $value === '' || $value[0] === '#') {
            return $value;
        }
        // Must be tested before the scheme check: parse_url() reports no scheme for "//host/x".
        if (strpos($value, '//') === 0) {
            return $this->getScheme().':'.$value;
        }
        if (parse_url($value, PHP_URL_SCHEME) !== null) {
            return $value;
        }
        if ($value[0] === '/') {
            return $this->getHost().$value;
        }
        return $this->getHost().$this->getPath().'/'.$value;
    }

    /**
     * Rewrites $attribute on every $tag element of the page to an absolute URL.
     *
     * @param string $tag       Selector passed to the DOM's find(), e.g. 'img'
     * @param string $attribute Attribute holding the URL, e.g. 'src'
     * @return $this
     * @throws RuntimeException When the page cannot be loaded
     */
    public function adjust($tag, $attribute) {
        foreach ($this->getHtmlDom()->find($tag) as $element) {
            $current = $element->$attribute;
            $resolved = $this->resolveUrl($current);
            // Only write back real changes so absent attributes are never created.
            if ($resolved !== $current) {
                $element->$attribute = $resolved;
            }
        }
        return $this;
    }

    /**
     * @return string The (possibly adjusted) page serialised back to HTML
     * @throws RuntimeException When the page cannot be loaded
     */
    public function getHtml() {
        return (string)$this->getHtmlDom();
    }
}
// Demo: load the sample page, make the URLs of images, anchors, stylesheets
// and scripts absolute, then print the resulting HTML.
$parser = new Parser('https://www.darkbee.be/stack/images/index.html');

// adjust() returns the parser itself, so separate calls are equivalent to a fluent chain.
$parser->adjust('img', 'src');
$parser->adjust('a', 'href');
$parser->adjust('link', 'href');
$parser->adjust('script', 'src');

echo $parser->getHtml();