如何抓取使用相同 CSS 的链接
How to crawl links that use the same CSS
我使用此代码抓取网站,但我希望把每个链接作为单独的结果取出。
我希望标签结果与艺术家分开,将它们放在变量中。
<?php
require 'vendor/autoload.php';
use Symfony\Component\DomCrawler\Crawler;
// Download the gallery page and hand its markup to DomCrawler.
$url = 'https://hentaifox.com/gallery/58091/';
$client = new \GuzzleHttp\Client();
$response = $client->request('GET', $url);
$html = (string) $response->getBody();
$crawler = new Crawler($html);

// Print the inner HTML of every element matched by the ".artists" selector,
// each followed by a <br>.
$matches = $crawler->filter('#content .left_content .info .artists');
foreach ($matches as $node) {
    $manga_tag = (new Crawler($node))->html();
    echo $manga_tag, "<br>";
}
我不知道如何使用 Symfony 的 DomCrawler 来做到这一点,但是 PHP 有不错的内置工具来解析 HTML,即 "DOMDocument" 和 "DOMXPath" ,在 DOMDocument 中它看起来像这样:
// Parse the downloaded markup with PHP's built-in DOM extension.
// loadHTML() is an instance method, so it must be called on a DOMDocument
// object. Real-world HTML usually triggers libxml parser warnings; they are
// collected in libxml's internal error buffer (and then discarded) rather
// than being silenced with the "@" operator.
$domd = new DOMDocument();
$previous_libxml_setting = libxml_use_internal_errors(true);
$domd->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_libxml_setting);
$xp = new DOMXPath($domd);

// Page the links were found on; relative hrefs are resolved against it.
$gallery_url = 'https://hentaifox.com/gallery/58091/';

// Both maps are keyed by the link's visible name (text of the first <span>
// inside the <a>) and hold the absolute URL of that link.
$tags = [];
$artists = [];
foreach ($xp->query("//a[contains(@href,'/tag/')]/span[1]") as $tag) {
    $tags[trim($tag->textContent)] = merge_relative_absolute_urls(
        $gallery_url,
        $tag->parentNode->getAttribute("href")
    );
}
foreach ($xp->query("//a[contains(@href,'/artist/')]/span[1]") as $artist) {
    $artists[trim($artist->textContent)] = merge_relative_absolute_urls(
        $gallery_url,
        $artist->parentNode->getAttribute("href")
    );
}
print_r([
    'artists' => $artists,
    'tags' => $tags
]);
/**
 * Resolve a link found on a page against the URL of that page.
 *
 * Handled cases:
 *  - $relative_url starting with "http://", "https://", "//" or "://" is
 *    returned unchanged;
 *  - $relative_url starting with "/" is appended to the base URL's
 *    scheme://host[:port];
 *  - anything else is appended to the directory part of the base URL.
 *
 * The query string and fragment of $base_url are ignored. "./" and "../"
 * segments are NOT normalised; the strings are simply concatenated.
 *
 * @param string $base_url     URL of the document that contained the link.
 * @param string $relative_url The href value to resolve.
 *
 * @return string The merged URL. A root-relative $relative_url is returned
 *                unchanged when $base_url has no host to anchor it to.
 */
function merge_relative_absolute_urls(string $base_url, string $relative_url): string
{
    // Already absolute (or protocol-relative): nothing to merge.
    if (
        0 === stripos($relative_url, "http://")
        || 0 === stripos($relative_url, "https://")
        || 0 === strpos($relative_url, "//")
        || 0 === strpos($relative_url, "://")
    ) {
        return $relative_url;
    }

    // Drop "?query" and "#fragment" from the base; a "/" inside either of
    // them must not be mistaken for a path separator below.
    $base_url = substr($base_url, 0, strcspn($base_url, "?#"));

    // Reduce the base to its directory ("/dir/file.php" -> "/dir/").
    // Only slashes located after "scheme://" count as path separators, so a
    // path-less base such as "https://example.com" is not cut down to
    // "https://"; it gets a trailing "/" instead.
    $scheme_pos = strpos($base_url, "://");
    $authority_start = (false === $scheme_pos) ? 0 : $scheme_pos + 3;
    $last_slash = strrpos($base_url, "/");
    if (false !== $last_slash && $last_slash >= $authority_start) {
        $base_url = substr($base_url, 0, $last_slash + 1);
    } elseif (false !== $scheme_pos) {
        $base_url .= "/";
    }

    // Root-relative link: keep only scheme, host and port of the base.
    if (substr($relative_url, 0, 1) === "/") {
        $info = parse_url($base_url);
        if (!is_array($info) || !isset($info['host'])) {
            // No host to anchor the path to; hand the link back untouched.
            return $relative_url;
        }
        $url = ($info['scheme'] ?? "") . "://" . $info['host'];
        if (isset($info['port'])) {
            $url .= ":" . $info['port'];
        }
        return $url . $relative_url;
    }

    // Document-relative link: append to the base directory.
    return $base_url . $relative_url;
}
输出:
$ php wtf3.php
Array
(
[artists] => Array
(
[Sahara-wataru] => https://hentaifox.com/artist/sahara-wataru/
)
[tags] => Array
(
[Big-breasts] => https://hentaifox.com/tag/big-breasts/
[Sole-male] => https://hentaifox.com/tag/sole-male/
[Nakadashi] => https://hentaifox.com/tag/nakadashi/
[Blowjob] => https://hentaifox.com/tag/blowjob/
[Full-color] => https://hentaifox.com/tag/full-color/
[Big-ass] => https://hentaifox.com/tag/big-ass/
[Blowjob-face] => https://hentaifox.com/tag/blowjob-face/
)
)
我使用此代码抓取网站,但我希望把每个链接作为单独的结果取出。
我希望标签结果与艺术家分开,将它们放在变量中。
<?php
require 'vendor/autoload.php';
use Symfony\Component\DomCrawler\Crawler;
// Download the gallery page and hand its markup to DomCrawler.
$url = 'https://hentaifox.com/gallery/58091/';
$client = new \GuzzleHttp\Client();
$response = $client->request('GET', $url);
$html = (string) $response->getBody();
$crawler = new Crawler($html);

// Print the inner HTML of every element matched by the ".artists" selector,
// each followed by a <br>.
$matches = $crawler->filter('#content .left_content .info .artists');
foreach ($matches as $node) {
    $manga_tag = (new Crawler($node))->html();
    echo $manga_tag, "<br>";
}
我不知道如何使用 Symfony 的 DomCrawler 来做到这一点,但是 PHP 有不错的内置工具来解析 HTML,即 "DOMDocument" 和 "DOMXPath" ,在 DOMDocument 中它看起来像这样:
// Parse the downloaded markup with PHP's built-in DOM extension.
// loadHTML() is an instance method, so it must be called on a DOMDocument
// object. Real-world HTML usually triggers libxml parser warnings; they are
// collected in libxml's internal error buffer (and then discarded) rather
// than being silenced with the "@" operator.
$domd = new DOMDocument();
$previous_libxml_setting = libxml_use_internal_errors(true);
$domd->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_libxml_setting);
$xp = new DOMXPath($domd);

// Page the links were found on; relative hrefs are resolved against it.
$gallery_url = 'https://hentaifox.com/gallery/58091/';

// Both maps are keyed by the link's visible name (text of the first <span>
// inside the <a>) and hold the absolute URL of that link.
$tags = [];
$artists = [];
foreach ($xp->query("//a[contains(@href,'/tag/')]/span[1]") as $tag) {
    $tags[trim($tag->textContent)] = merge_relative_absolute_urls(
        $gallery_url,
        $tag->parentNode->getAttribute("href")
    );
}
foreach ($xp->query("//a[contains(@href,'/artist/')]/span[1]") as $artist) {
    $artists[trim($artist->textContent)] = merge_relative_absolute_urls(
        $gallery_url,
        $artist->parentNode->getAttribute("href")
    );
}
print_r([
    'artists' => $artists,
    'tags' => $tags
]);
/**
 * Resolve a link found on a page against the URL of that page.
 *
 * Handled cases:
 *  - $relative_url starting with "http://", "https://", "//" or "://" is
 *    returned unchanged;
 *  - $relative_url starting with "/" is appended to the base URL's
 *    scheme://host[:port];
 *  - anything else is appended to the directory part of the base URL.
 *
 * The query string and fragment of $base_url are ignored. "./" and "../"
 * segments are NOT normalised; the strings are simply concatenated.
 *
 * @param string $base_url     URL of the document that contained the link.
 * @param string $relative_url The href value to resolve.
 *
 * @return string The merged URL. A root-relative $relative_url is returned
 *                unchanged when $base_url has no host to anchor it to.
 */
function merge_relative_absolute_urls(string $base_url, string $relative_url): string
{
    // Already absolute (or protocol-relative): nothing to merge.
    if (
        0 === stripos($relative_url, "http://")
        || 0 === stripos($relative_url, "https://")
        || 0 === strpos($relative_url, "//")
        || 0 === strpos($relative_url, "://")
    ) {
        return $relative_url;
    }

    // Drop "?query" and "#fragment" from the base; a "/" inside either of
    // them must not be mistaken for a path separator below.
    $base_url = substr($base_url, 0, strcspn($base_url, "?#"));

    // Reduce the base to its directory ("/dir/file.php" -> "/dir/").
    // Only slashes located after "scheme://" count as path separators, so a
    // path-less base such as "https://example.com" is not cut down to
    // "https://"; it gets a trailing "/" instead.
    $scheme_pos = strpos($base_url, "://");
    $authority_start = (false === $scheme_pos) ? 0 : $scheme_pos + 3;
    $last_slash = strrpos($base_url, "/");
    if (false !== $last_slash && $last_slash >= $authority_start) {
        $base_url = substr($base_url, 0, $last_slash + 1);
    } elseif (false !== $scheme_pos) {
        $base_url .= "/";
    }

    // Root-relative link: keep only scheme, host and port of the base.
    if (substr($relative_url, 0, 1) === "/") {
        $info = parse_url($base_url);
        if (!is_array($info) || !isset($info['host'])) {
            // No host to anchor the path to; hand the link back untouched.
            return $relative_url;
        }
        $url = ($info['scheme'] ?? "") . "://" . $info['host'];
        if (isset($info['port'])) {
            $url .= ":" . $info['port'];
        }
        return $url . $relative_url;
    }

    // Document-relative link: append to the base directory.
    return $base_url . $relative_url;
}
输出:
$ php wtf3.php
Array
(
[artists] => Array
(
[Sahara-wataru] => https://hentaifox.com/artist/sahara-wataru/
)
[tags] => Array
(
[Big-breasts] => https://hentaifox.com/tag/big-breasts/
[Sole-male] => https://hentaifox.com/tag/sole-male/
[Nakadashi] => https://hentaifox.com/tag/nakadashi/
[Blowjob] => https://hentaifox.com/tag/blowjob/
[Full-color] => https://hentaifox.com/tag/full-color/
[Big-ass] => https://hentaifox.com/tag/big-ass/
[Blowjob-face] => https://hentaifox.com/tag/blowjob-face/
)
)