PHP 脚本在本地工作,但放在网络服务器上时不工作
PHP script working locally but not when placed on webserver
以下代码从给定网页中抓取链接列表,然后将它们放入另一个脚本中,该脚本从给定链接中抓取文本并将数据放入 csv 文档中。该代码在本地主机 (wampserver 5.5 php) 上运行完美,但在域上运行时会严重失败。
您可以在 http://miskai.tk/ANOFM/csv.php 查看脚本的功能。
此外,file_get_html 和 curl 均已在服务器上启用。
<?php
// Send the scraped data to the browser as a downloadable CSV file.
// FIX: 'application/excel' is a non-standard MIME type; RFC 4180 defines
// 'text/csv' for CSV payloads and browsers handle it correctly.
header('Content-Type: text/csv');
header('Content-Disposition: attachment; filename="Mehedinti.csv"');
include_once 'simple_html_dom.php'; // provides file_get_html()
include_once 'csv.php';             // provides scrape_main_page()
// List of detail-page URLs to scrape, one CSV row group per URL.
$urls = scrape_main_page();
/**
 * Scrape one detail page and return its table rows as associative arrays.
 *
 * @param string $url detail-page URL to fetch
 * @return array      list of items with keys 'titlu', 'tr2'..'tr12';
 *                    empty array when the page cannot be fetched/parsed
 */
function scraping($url) {
    // Always return an array so callers can foreach() the result safely
    // (the original returned null on fetch failure).
    $ret = array();
    // create HTML DOM
    $html = file_get_html($url);
    // get article block
    if ($html && is_object($html) && isset($html->nodes)) {
        foreach ($html->find('/html/body/table') as $article) {
            $item = array();
            // get title
            $item['titlu'] = trim($article->find('/tbody/tr[1]/td/div', 0)->plaintext);
            // get body: rows 2..11 all read the same cell position, so loop
            // instead of repeating ten near-identical lines
            for ($row = 2; $row <= 11; $row++) {
                $item['tr' . $row] = trim($article->find('/tbody/tr[' . $row . ']/td[2]', 0)->plaintext);
            }
            // BUG FIX: original selector was '/tbody/tr[12]/td/div/]' --
            // the stray '/]' made the query match nothing
            $item['tr12'] = trim($article->find('/tbody/tr[12]/td/div', 0)->plaintext);
            $ret[] = $item;
        }
        // clean up memory
        $html->clear();
        unset($html);
    }
    return $ret;
}
// Stream every scraped row straight to the HTTP response as CSV.
$output = fopen("php://output", "w");
foreach ($urls as $url) {
    $ret = scraping($url);
    // scraping() may yield nothing on fetch/parse failure; the (array) cast
    // keeps foreach from fataling if a null/false slips through
    foreach ((array) $ret as $v) {
        fputcsv($output, $v);
    }
}
fclose($output);
exit();
第二个文件
<?php
/**
 * Fetch a URL's response body with cURL.
 *
 * @param string $url URL to download
 * @return string|false response body, or false on transport failure
 */
function get_contents($url) {
    // curl (rather than file_get_contents) lets us bound the request time;
    // the original comment promised a timeout but never set one -- a slow
    // remote host would hang the request indefinitely.
    $ch = curl_init($url);
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => 10, // give up if we can't connect quickly
        CURLOPT_TIMEOUT        => 60, // hard cap on the whole transfer
    ));
    $content = curl_exec($ch);
    curl_close($ch);
    return $content;
}
/**
 * Scrape the job-listing index page and build the list of detail-page URLs.
 *
 * @return array list of detail URLs, one per data row (header row skipped)
 */
function scrape_main_page() {
    set_time_limit(300); // remote site is slow; allow up to 5 minutes
    libxml_use_internal_errors(true); // Prevent DOMDocument from spraying errors onto the page and hide those errors internally ;)
    $html = get_contents("http://lmvz.anofm.ro:8080/lmv/index2.jsp?judet=26");
    $dom = new DOMDocument();
    $dom->loadHTML($html);
    // BUG FIX: removed leftover debug line `die(var_dump($html));` -- it
    // aborted the whole script here, so the caller never received any URLs
    // (the "works locally, fails on the server" symptom).
    $xpath = new DOMXPath($dom);
    $results = $xpath->query("//table[@width=\"645\"]/tr");
    $urls = array();
    // start at 1 to skip the table's header row
    for ($i = 1; $i < $results->length; $i++) {
        $tr = $results->item($i);
        // first cell of each row holds the listing's unique id
        $id = $tr->childNodes->item(0)->textContent;
        // FIX: dropped the call to undefined scrape_detail_page() (its result
        // was never used and the function exists nowhere in this code), and
        // the pointless `new stdClass()` that was immediately overwritten.
        // FIX: urlencode($id) here too -- the original built the final URLs
        // from the raw id, so ids containing spaces produced broken URLs.
        $urls[] = "http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=" . urlencode($id) .
            "&judet=26";
    }
    return $urls;
}
scrape_main_page();
是的,这里的问题在于您的 php.ini 配置。请确保服务器支持 curl 和 fopen;如果不支持,可以考虑搭建您自己的 Linux 服务器。
以下代码从给定网页中抓取链接列表,然后将它们放入另一个脚本中,该脚本从给定链接中抓取文本并将数据放入 csv 文档中。该代码在本地主机 (wampserver 5.5 php) 上运行完美,但在域上运行时会严重失败。
您可以在 http://miskai.tk/ANOFM/csv.php 查看脚本的功能。 此外,file_get_html 和 curl 均已在服务器上启用。
<?php
// Send the scraped data to the browser as a downloadable CSV file.
// FIX: 'application/excel' is a non-standard MIME type; RFC 4180 defines
// 'text/csv' for CSV payloads and browsers handle it correctly.
header('Content-Type: text/csv');
header('Content-Disposition: attachment; filename="Mehedinti.csv"');
include_once 'simple_html_dom.php'; // provides file_get_html()
include_once 'csv.php';             // provides scrape_main_page()
// List of detail-page URLs to scrape, one CSV row group per URL.
$urls = scrape_main_page();
/**
 * Scrape one detail page and return its table rows as associative arrays.
 *
 * @param string $url detail-page URL to fetch
 * @return array      list of items with keys 'titlu', 'tr2'..'tr12';
 *                    empty array when the page cannot be fetched/parsed
 */
function scraping($url) {
    // Always return an array so callers can foreach() the result safely
    // (the original returned null on fetch failure).
    $ret = array();
    // create HTML DOM
    $html = file_get_html($url);
    // get article block
    if ($html && is_object($html) && isset($html->nodes)) {
        foreach ($html->find('/html/body/table') as $article) {
            $item = array();
            // get title
            $item['titlu'] = trim($article->find('/tbody/tr[1]/td/div', 0)->plaintext);
            // get body: rows 2..11 all read the same cell position, so loop
            // instead of repeating ten near-identical lines
            for ($row = 2; $row <= 11; $row++) {
                $item['tr' . $row] = trim($article->find('/tbody/tr[' . $row . ']/td[2]', 0)->plaintext);
            }
            // BUG FIX: original selector was '/tbody/tr[12]/td/div/]' --
            // the stray '/]' made the query match nothing
            $item['tr12'] = trim($article->find('/tbody/tr[12]/td/div', 0)->plaintext);
            $ret[] = $item;
        }
        // clean up memory
        $html->clear();
        unset($html);
    }
    return $ret;
}
// Stream every scraped row straight to the HTTP response as CSV.
$output = fopen("php://output", "w");
foreach ($urls as $url) {
    $ret = scraping($url);
    // scraping() may yield nothing on fetch/parse failure; the (array) cast
    // keeps foreach from fataling if a null/false slips through
    foreach ((array) $ret as $v) {
        fputcsv($output, $v);
    }
}
fclose($output);
exit();
第二个文件
<?php
/**
 * Fetch a URL's response body with cURL.
 *
 * @param string $url URL to download
 * @return string|false response body, or false on transport failure
 */
function get_contents($url) {
    // curl (rather than file_get_contents) lets us bound the request time;
    // the original comment promised a timeout but never set one -- a slow
    // remote host would hang the request indefinitely.
    $ch = curl_init($url);
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => 10, // give up if we can't connect quickly
        CURLOPT_TIMEOUT        => 60, // hard cap on the whole transfer
    ));
    $content = curl_exec($ch);
    curl_close($ch);
    return $content;
}
/**
 * Scrape the job-listing index page and build the list of detail-page URLs.
 *
 * @return array list of detail URLs, one per data row (header row skipped)
 */
function scrape_main_page() {
    set_time_limit(300); // remote site is slow; allow up to 5 minutes
    libxml_use_internal_errors(true); // Prevent DOMDocument from spraying errors onto the page and hide those errors internally ;)
    $html = get_contents("http://lmvz.anofm.ro:8080/lmv/index2.jsp?judet=26");
    $dom = new DOMDocument();
    $dom->loadHTML($html);
    // BUG FIX: removed leftover debug line `die(var_dump($html));` -- it
    // aborted the whole script here, so the caller never received any URLs
    // (the "works locally, fails on the server" symptom).
    $xpath = new DOMXPath($dom);
    $results = $xpath->query("//table[@width=\"645\"]/tr");
    $urls = array();
    // start at 1 to skip the table's header row
    for ($i = 1; $i < $results->length; $i++) {
        $tr = $results->item($i);
        // first cell of each row holds the listing's unique id
        $id = $tr->childNodes->item(0)->textContent;
        // FIX: dropped the call to undefined scrape_detail_page() (its result
        // was never used and the function exists nowhere in this code), and
        // the pointless `new stdClass()` that was immediately overwritten.
        // FIX: urlencode($id) here too -- the original built the final URLs
        // from the raw id, so ids containing spaces produced broken URLs.
        $urls[] = "http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=" . urlencode($id) .
            "&judet=26";
    }
    return $urls;
}
scrape_main_page();
是的,这里的问题在于您的 php.ini 配置。请确保服务器支持 curl 和 fopen;如果不支持,可以考虑搭建您自己的 Linux 服务器。