多页抓取
Multiple page scraping
我正在寻找一种方法,让下面的脚本能够抓取数组中列出的多个页面,并将所选内容写入文本文件或 Excel 文档。
有什么想法吗?
这可能吗?
另一个问题是:为什么这个脚本在 localhost 上可以正常运行,但放到服务器上就不起作用?谢谢。
<?php
include_once('simple_html_dom.php');
/**
 * Scrape the detail table(s) of one or more pages into a list of records.
 *
 * Each <table> matched by '/html/body/table' yields one associative array
 * with the keys 'titlu' and 'tr2' .. 'tr12', holding the trimmed plain text
 * of the corresponding cell. A cell that cannot be found yields ''.
 *
 * @param array|string $urls One URL or a list of URLs to scrape. Defaults to
 *                           the single page the script was written for.
 * @return array List of records; empty when no page could be loaded or no
 *               table matched.
 */
function scraping($urls = array('http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=50/01/1150001435/1&judet=50')) {
    // Output key => selector, relative to each matched table.
    $selectors = array('titlu' => '/tbody/tr[1]/td/div');
    for ($row = 2; $row <= 11; $row++) {
        $selectors['tr' . $row] = '/tbody/tr[' . $row . ']/td[2]';
    }
    $selectors['tr12'] = '/tbody/tr[12]/td/div';

    // Always return an array so callers can iterate without checking for null.
    $ret = array();

    // (array) lets callers pass a single URL string as well as a list.
    foreach ((array) $urls as $url) {
        // create HTML DOM
        $html = file_get_html($url);

        // Skip pages that could not be fetched or parsed instead of aborting.
        // NOTE(review): a fetch that works locally but fails on a server may be
        // caused by the server's PHP configuration (e.g. allow_url_fopen) or an
        // outbound firewall rule for port 8080 -- confirm on the target host.
        if (!$html || !is_object($html) || !isset($html->nodes)) {
            continue;
        }

        // get article block
        foreach ($html->find('/html/body/table') as $article) {
            // Start from a clean record so values never leak between tables.
            $item = array();
            foreach ($selectors as $key => $selector) {
                $node = $article->find($selector, 0);
                // find(..., 0) gives no node when nothing matches; avoid
                // dereferencing it in that case.
                $item[$key] = is_object($node) ? trim($node->plaintext) : '';
            }
            $ret[] = $item;
        }

        // clean up memory
        $html->clear();
        unset($html);
    }

    return $ret;
}
// Print every scraped record, one field per line, inside a <pre> block.
echo '<pre>';
$ret = scraping();

// Field names in output order: the title followed by rows 2 through 12.
$fields = array('titlu');
for ($row = 2; $row <= 12; $row++) {
    $fields[] = 'tr' . $row;
}

// Guard against a non-array result so foreach never iterates over null.
if (is_array($ret)) {
    foreach ($ret as $v) {
        foreach ($fields as $field) {
            // A missing field prints as an empty line rather than raising a notice.
            echo (isset($v[$field]) ? $v[$field] : '') . '<br>';
        }
    }
}
echo '</pre>';
?>
我正在寻找一种方法,让下面的脚本能够抓取数组中列出的多个页面,并将所选内容写入文本文件或 Excel 文档。
有什么想法吗? 这可能吗? 另一个问题是:为什么这个脚本在 localhost 上可以正常运行,但放到服务器上就不起作用?谢谢。
<?php
include_once('simple_html_dom.php');
/**
 * Scrape the detail table(s) of one or more pages into a list of records.
 *
 * Each <table> matched by '/html/body/table' yields one associative array
 * with the keys 'titlu' and 'tr2' .. 'tr12', holding the trimmed plain text
 * of the corresponding cell. A cell that cannot be found yields ''.
 *
 * @param array|string $urls One URL or a list of URLs to scrape. Defaults to
 *                           the single page the script was written for.
 * @return array List of records; empty when no page could be loaded or no
 *               table matched.
 */
function scraping($urls = array('http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=50/01/1150001435/1&judet=50')) {
    // Output key => selector, relative to each matched table.
    $selectors = array('titlu' => '/tbody/tr[1]/td/div');
    for ($row = 2; $row <= 11; $row++) {
        $selectors['tr' . $row] = '/tbody/tr[' . $row . ']/td[2]';
    }
    $selectors['tr12'] = '/tbody/tr[12]/td/div';

    // Always return an array so callers can iterate without checking for null.
    $ret = array();

    // (array) lets callers pass a single URL string as well as a list.
    foreach ((array) $urls as $url) {
        // create HTML DOM
        $html = file_get_html($url);

        // Skip pages that could not be fetched or parsed instead of aborting.
        // NOTE(review): a fetch that works locally but fails on a server may be
        // caused by the server's PHP configuration (e.g. allow_url_fopen) or an
        // outbound firewall rule for port 8080 -- confirm on the target host.
        if (!$html || !is_object($html) || !isset($html->nodes)) {
            continue;
        }

        // get article block
        foreach ($html->find('/html/body/table') as $article) {
            // Start from a clean record so values never leak between tables.
            $item = array();
            foreach ($selectors as $key => $selector) {
                $node = $article->find($selector, 0);
                // find(..., 0) gives no node when nothing matches; avoid
                // dereferencing it in that case.
                $item[$key] = is_object($node) ? trim($node->plaintext) : '';
            }
            $ret[] = $item;
        }

        // clean up memory
        $html->clear();
        unset($html);
    }

    return $ret;
}
// Print every scraped record, one field per line, inside a <pre> block.
echo '<pre>';
$ret = scraping();

// Field names in output order: the title followed by rows 2 through 12.
$fields = array('titlu');
for ($row = 2; $row <= 12; $row++) {
    $fields[] = 'tr' . $row;
}

// Guard against a non-array result so foreach never iterates over null.
if (is_array($ret)) {
    foreach ($ret as $v) {
        foreach ($fields as $field) {
            // A missing field prints as an empty line rather than raising a notice.
            echo (isset($v[$field]) ? $v[$field] : '') . '<br>';
        }
    }
}
echo '</pre>';
?>