Scraping data using simple html dom and simpleXML
I am trying to scrape data from several links that I retrieve from an XML file, but I keep getting an error that only seems to occur for some of the news items. Below is the output I get:
http://www.hltv.org/news/14971-rgn-pro-series-groups-drawnRGN Pro Series groups drawn
http://www.hltv.org/news/14969-k1ck-reveal-new-teamk1ck reveal new team
http://www.hltv.org/news/14968-world-championships-captains-unveiled
Fatal error: Call to a member function find() on a non-object in /app/scrape.php on line 266
This is line 266:
$hltv_full_text = $hltv_deep_link->find("//div[@class='rNewsContent']", 0);
Full code of the scrape function:
function scrape_hltv() {
    $hltv = "http://www.hltv.org/news.rss.php";
    $sxml = simplexml_load_file($hltv);
    global $con;
    foreach ($sxml->channel->item as $item)
    {
        $hltv_title = (string)$item->title;
        $hltv_link = (string)$item->link;
        $hltv_date = date('Y-m-d H:i:s', strtotime((string)$item->pubDate));
        echo $hltv_link;
        //if (date('Y-m-d', strtotime((string)$item->pubDate)) == date('Y-m-d')){
        if (strpos($hltv_title, 'Video:') === false) {
            $hltv_deep_link = file_get_html($hltv_link);
            $hltv_full_text = $hltv_deep_link->find("//div[@class='rNewsContent']", 0);
            echo $hltv_title . '<br><br>';
        }
        //}
    }
}
scrape_hltv();
Sometimes file_get_html() returns false.
See the source code here:
http://sourceforge.net/p/simplehtmldom/code/HEAD/tree/trunk/simple_html_dom.php#l79
if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
{
    return false;
}
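That false value is what ends up in $hltv_deep_link in your loop, and calling find() on it is what triggers the fatal error. One way to make the loop defensive (just a sketch, keeping your selector as it is) is to skip items where file_get_html() fails:

$hltv_deep_link = file_get_html($hltv_link);
// file_get_html() returned false for this item; skip it instead of calling find() on a non-object
if ($hltv_deep_link === false) {
    continue;
}
$hltv_full_text = $hltv_deep_link->find("//div[@class='rNewsContent']", 0);
echo $hltv_title . '<br><br>';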
For your link http://www.hltv.org/news/14968-world-championships-captains-unveiled I think this happens because the page content is larger than MAX_FILE_SIZE (600,000 bytes). The page is actually about 3 MB.
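You can check the size yourself with a quick strlen() call (rough sketch; the exact number will vary over time):

// how big is the page that makes file_get_html() return false?
$contents = file_get_contents('http://www.hltv.org/news/14968-world-championships-captains-unveiled');
echo strlen($contents) . " bytes\n"; // roughly 3 MB, well above MAX_FILE_SIZE (600000)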
If you also want to handle larger files, you can try a modified version of the function:
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");

function file_get_html_modified($url, $use_include_path = false, $context = null, $offset = -1, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    $contents = file_get_contents($url, $use_include_path, $context, $offset);
    if (empty($contents))
    {
        return false;
    }
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
The || strlen($contents) > MAX_FILE_SIZE check has been removed.
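Your loop can then call the modified helper instead of file_get_html(); again just a sketch, still keeping the guard against a false return:

$hltv_deep_link = file_get_html_modified($hltv_link);
if ($hltv_deep_link !== false) {
    $hltv_full_text = $hltv_deep_link->find("//div[@class='rNewsContent']", 0);
    echo $hltv_title . '<br><br>';
}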