如何使用代理 运行 本地 PHP 简单 HTML DOM 解析器?
How Can I Run a Local PHP Simple HTML DOM Parser with a Proxy?
我在 MAMP 中有一个 PHP 简单 HTML DOM 本地解析器,它可以提取信息并且可以很好地与网站的日本版本配合使用,因为我位于日本。但是,我想从网站的英国版本中提取信息。最简单的方法是什么?
我尝试了文档中的以下方法,但没有用。
// Route the request through an HTTP forward proxy via a stream context.
// 'request_fulluri' makes PHP send the absolute URI, as proxies expect.
$proxySettings = array(
    'http' => array(
        'proxy'           => '212.82.126.32:80',
        'request_fulluri' => true,
    ),
);
$stream = stream_context_create($proxySettings);
$html = file_get_html('http://www.supremenewyork.com/shop/new', false, $stream);
我还尝试了经过修改的 curl 版本,因为该站点启用了安全模式。那也没用。
/**
 * Executes a cURL request while following HTTP redirects, even when
 * CURLOPT_FOLLOWLOCATION is forbidden (open_basedir or safe_mode active).
 *
 * @param resource $ch          cURL handle, already configured with a URL.
 * @param int|null $maxredirect Maximum redirects to follow (default 5).
 *                              Passed by reference: set to 0 when the limit
 *                              is exceeded (unless it was passed as null).
 *
 * @return string|bool curl_exec() result of the final request, or false when
 *                     the redirect limit was hit.
 */
function curl_exec_follow(/*resource*/ $ch, /*int*/ &$maxredirect = null) {
    $mr = $maxredirect === null ? 5 : intval($maxredirect);

    // PHP refuses CURLOPT_FOLLOWLOCATION when open_basedir or safe_mode is set.
    // BUG FIX: the original wrote ini_get('safe_mode' == 'Off') — the comparison
    // was inside the argument list, so it actually called ini_get(false).
    $canFollow = ini_get('open_basedir') == ''
        && !filter_var((string) ini_get('safe_mode'), FILTER_VALIDATE_BOOLEAN);

    if ($canFollow) {
        // libcurl can follow redirects natively.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $mr > 0);
        curl_setopt($ch, CURLOPT_MAXREDIRS, $mr);
    } else {
        // Follow redirects by hand: issue HEAD requests on a copied handle
        // until a non-redirect status is seen, then fetch the final URL.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        if ($mr > 0) {
            $newurl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

            $rch = curl_copy_handle($ch);
            curl_setopt($rch, CURLOPT_HEADER, true);
            curl_setopt($rch, CURLOPT_NOBODY, true);
            curl_setopt($rch, CURLOPT_FORBID_REUSE, false);
            curl_setopt($rch, CURLOPT_RETURNTRANSFER, true);
            do {
                curl_setopt($rch, CURLOPT_URL, $newurl);
                $header = curl_exec($rch);
                if (curl_errno($rch)) {
                    $code = 0; // transport error: stop looping
                } else {
                    $code = curl_getinfo($rch, CURLINFO_HTTP_CODE);
                    if ($code == 301 || $code == 302) {
                        // Header names are case-insensitive; trim() strips the \r.
                        // BUG FIX: guard the match — a redirect without a
                        // Location header previously hit array_pop on an
                        // empty/unset $matches.
                        if (preg_match('/Location:(.*?)\n/i', $header, $matches)) {
                            $newurl = trim(array_pop($matches));
                        } else {
                            $code = 0;
                        }
                    } else {
                        $code = 0; // not a redirect: done
                    }
                }
            } while ($code && --$mr);
            curl_close($rch);

            if (!$mr) {
                if ($maxredirect === null) {
                    trigger_error('Too many redirects. When following redirects, libcurl hit the maximum amount.', E_USER_WARNING);
                } else {
                    $maxredirect = 0;
                }
                return false;
            }
            // Point the original handle at the final URL for the real request.
            curl_setopt($ch, CURLOPT_URL, $newurl);
        }
    }
    return curl_exec($ch);
}
// Fetch the shop page through the proxy and hand the body to the DOM parser.
$url = 'http://www.supremenewyork.com/shop/new';
$proxy = '212.82.126.32:80';
$options = array(
    CURLOPT_PROXY           => $proxy,
    CURLOPT_HTTPPROXYTUNNEL => 0,
    CURLOPT_REFERER         => "http://www.google.com",
    CURLOPT_FOLLOWLOCATION  => true,
    CURLOPT_RETURNTRANSFER  => true,
    CURLOPT_USERAGENT       => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
    CURLOPT_CONNECTTIMEOUT  => 20,
    CURLOPT_TIMEOUT         => 20,
    CURLOPT_MAXREDIRS       => 10,
    // BUG FIX: was true — raw response headers would be prepended to the
    // body and fed into the HTML parser.
    CURLOPT_HEADER          => false,
);
$ch = curl_init($url);
// BUG FIX: this call was commented out, so the proxy (and every other
// option above) was never applied and the request went out directly —
// which is exactly why only local-region data came back.
curl_setopt_array($ch, $options);
$content = curl_exec_follow($ch);

$html = new simple_html_dom();
$html->load($content, true, false);
我也试过上传到美国和英国的服务器,但没有用,它只会提取美国数据。请帮忙?
无论安全模式是启用还是禁用,Curl 都能正常工作。
您的 Curl 脚本太复杂,请简化并重试。
// One call does all the fetching; simple_html_dom only parses the result.
$content = curl_exec_follow('http://www.supremenewyork.com/shop/new');

$html = new simple_html_dom();
// Second arg: lowercase tag/attribute names; third: keep \r\n as-is.
$html->load($content, true, false);
我修改了你的代码,你可以试试
// Cookie jar used by the crawler; the file must already exist and be writable.
define('CRAWLER_COOKIE_FILENAME', 'cookie.txt');

/**
 * Downloads a URL through the proxy with browser-like headers and a
 * persistent cookie jar, following redirects via libcurl.
 *
 * @param string $url Absolute URL to download.
 * @return string|false Response body, or false on a cURL transport error.
 */
function curl_exec_follow($url) {
    $proxy = '212.82.126.32:80';
    $agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1';

    // Some websites check the referrer, so send the site's own root URL.
    $host = parse_url($url, PHP_URL_HOST);
    $scheme = parse_url($url, PHP_URL_SCHEME);
    $referrer = $scheme . '://' . $host;

    $ch = curl_init();
    $curl_defaults = array(
        CURLOPT_HEADER         => 0,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_RETURNTRANSFER => 1,
    );
    curl_setopt_array($ch, $curl_defaults);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_PROXY, $proxy);
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    curl_setopt($ch, CURLOPT_REFERER, $referrer);

    if ( !file_exists(CRAWLER_COOKIE_FILENAME) || !is_writable(CRAWLER_COOKIE_FILENAME) ) {
        echo 'Cookie file is missing or not writable.';
        exit;
    }
    curl_setopt($ch, CURLOPT_COOKIESESSION, 0);
    curl_setopt($ch, CURLOPT_COOKIEFILE, CRAWLER_COOKIE_FILENAME);
    curl_setopt($ch, CURLOPT_COOKIEJAR, CRAWLER_COOKIE_FILENAME);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);

    // Allow crawling of https pages. NOTE(review): disabling certificate
    // verification is tolerable for a scraper, insecure for anything else.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);

    // Give up if the transfer stays below 1 byte/second for 30 seconds.
    curl_setopt($ch, CURLOPT_LOW_SPEED_LIMIT, 1);
    curl_setopt($ch, CURLOPT_LOW_SPEED_TIME, 30);

    $content = curl_exec($ch);
    // BUG FIX: the original tested the undefined variable $ret here, so
    // cURL errors were silently ignored.
    if ($content === FALSE) {
        echo curl_error($ch);
    }
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ( $code != '200' ) echo 'http error code: ' . $code;
    curl_close($ch);
    return $content;
}
我在 MAMP 中有一个 PHP 简单 HTML DOM 本地解析器,它可以提取信息并且可以很好地与网站的日本版本配合使用,因为我位于日本。但是,我想从网站的英国版本中提取信息。最简单的方法是什么?
我尝试了文档中的以下方法,但没有用。
// Route the request through an HTTP forward proxy via a stream context.
// 'request_fulluri' makes PHP send the absolute URI, as proxies expect.
$proxySettings = array(
    'http' => array(
        'proxy'           => '212.82.126.32:80',
        'request_fulluri' => true,
    ),
);
$stream = stream_context_create($proxySettings);
$html = file_get_html('http://www.supremenewyork.com/shop/new', false, $stream);
我还尝试了经过修改的 curl 版本,因为该站点启用了安全模式。那也没用。
/**
 * Executes a cURL request while following HTTP redirects, even when
 * CURLOPT_FOLLOWLOCATION is forbidden (open_basedir or safe_mode active).
 *
 * @param resource $ch          cURL handle, already configured with a URL.
 * @param int|null $maxredirect Maximum redirects to follow (default 5).
 *                              Passed by reference: set to 0 when the limit
 *                              is exceeded (unless it was passed as null).
 *
 * @return string|bool curl_exec() result of the final request, or false when
 *                     the redirect limit was hit.
 */
function curl_exec_follow(/*resource*/ $ch, /*int*/ &$maxredirect = null) {
    $mr = $maxredirect === null ? 5 : intval($maxredirect);

    // PHP refuses CURLOPT_FOLLOWLOCATION when open_basedir or safe_mode is set.
    // BUG FIX: the original wrote ini_get('safe_mode' == 'Off') — the comparison
    // was inside the argument list, so it actually called ini_get(false).
    $canFollow = ini_get('open_basedir') == ''
        && !filter_var((string) ini_get('safe_mode'), FILTER_VALIDATE_BOOLEAN);

    if ($canFollow) {
        // libcurl can follow redirects natively.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $mr > 0);
        curl_setopt($ch, CURLOPT_MAXREDIRS, $mr);
    } else {
        // Follow redirects by hand: issue HEAD requests on a copied handle
        // until a non-redirect status is seen, then fetch the final URL.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        if ($mr > 0) {
            $newurl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

            $rch = curl_copy_handle($ch);
            curl_setopt($rch, CURLOPT_HEADER, true);
            curl_setopt($rch, CURLOPT_NOBODY, true);
            curl_setopt($rch, CURLOPT_FORBID_REUSE, false);
            curl_setopt($rch, CURLOPT_RETURNTRANSFER, true);
            do {
                curl_setopt($rch, CURLOPT_URL, $newurl);
                $header = curl_exec($rch);
                if (curl_errno($rch)) {
                    $code = 0; // transport error: stop looping
                } else {
                    $code = curl_getinfo($rch, CURLINFO_HTTP_CODE);
                    if ($code == 301 || $code == 302) {
                        // Header names are case-insensitive; trim() strips the \r.
                        // BUG FIX: guard the match — a redirect without a
                        // Location header previously hit array_pop on an
                        // empty/unset $matches.
                        if (preg_match('/Location:(.*?)\n/i', $header, $matches)) {
                            $newurl = trim(array_pop($matches));
                        } else {
                            $code = 0;
                        }
                    } else {
                        $code = 0; // not a redirect: done
                    }
                }
            } while ($code && --$mr);
            curl_close($rch);

            if (!$mr) {
                if ($maxredirect === null) {
                    trigger_error('Too many redirects. When following redirects, libcurl hit the maximum amount.', E_USER_WARNING);
                } else {
                    $maxredirect = 0;
                }
                return false;
            }
            // Point the original handle at the final URL for the real request.
            curl_setopt($ch, CURLOPT_URL, $newurl);
        }
    }
    return curl_exec($ch);
}
// Fetch the shop page through the proxy and hand the body to the DOM parser.
$url = 'http://www.supremenewyork.com/shop/new';
$proxy = '212.82.126.32:80';
$options = array(
    CURLOPT_PROXY           => $proxy,
    CURLOPT_HTTPPROXYTUNNEL => 0,
    CURLOPT_REFERER         => "http://www.google.com",
    CURLOPT_FOLLOWLOCATION  => true,
    CURLOPT_RETURNTRANSFER  => true,
    CURLOPT_USERAGENT       => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
    CURLOPT_CONNECTTIMEOUT  => 20,
    CURLOPT_TIMEOUT         => 20,
    CURLOPT_MAXREDIRS       => 10,
    // BUG FIX: was true — raw response headers would be prepended to the
    // body and fed into the HTML parser.
    CURLOPT_HEADER          => false,
);
$ch = curl_init($url);
// BUG FIX: this call was commented out, so the proxy (and every other
// option above) was never applied and the request went out directly —
// which is exactly why only local-region data came back.
curl_setopt_array($ch, $options);
$content = curl_exec_follow($ch);

$html = new simple_html_dom();
$html->load($content, true, false);
我也试过上传到美国和英国的服务器,但没有用,它只会提取美国数据。请帮忙?
无论安全模式是启用还是禁用,Curl 都能正常工作。 您的 Curl 脚本太复杂,请简化并重试。
// One call does all the fetching; simple_html_dom only parses the result.
$content = curl_exec_follow('http://www.supremenewyork.com/shop/new');

$html = new simple_html_dom();
// Second arg: lowercase tag/attribute names; third: keep \r\n as-is.
$html->load($content, true, false);
我修改了你的代码,你可以试试
// Cookie jar used by the crawler; the file must already exist and be writable.
define('CRAWLER_COOKIE_FILENAME', 'cookie.txt');

/**
 * Downloads a URL through the proxy with browser-like headers and a
 * persistent cookie jar, following redirects via libcurl.
 *
 * @param string $url Absolute URL to download.
 * @return string|false Response body, or false on a cURL transport error.
 */
function curl_exec_follow($url) {
    $proxy = '212.82.126.32:80';
    $agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1';

    // Some websites check the referrer, so send the site's own root URL.
    $host = parse_url($url, PHP_URL_HOST);
    $scheme = parse_url($url, PHP_URL_SCHEME);
    $referrer = $scheme . '://' . $host;

    $ch = curl_init();
    $curl_defaults = array(
        CURLOPT_HEADER         => 0,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_RETURNTRANSFER => 1,
    );
    curl_setopt_array($ch, $curl_defaults);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_PROXY, $proxy);
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    curl_setopt($ch, CURLOPT_REFERER, $referrer);

    if ( !file_exists(CRAWLER_COOKIE_FILENAME) || !is_writable(CRAWLER_COOKIE_FILENAME) ) {
        echo 'Cookie file is missing or not writable.';
        exit;
    }
    curl_setopt($ch, CURLOPT_COOKIESESSION, 0);
    curl_setopt($ch, CURLOPT_COOKIEFILE, CRAWLER_COOKIE_FILENAME);
    curl_setopt($ch, CURLOPT_COOKIEJAR, CRAWLER_COOKIE_FILENAME);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);

    // Allow crawling of https pages. NOTE(review): disabling certificate
    // verification is tolerable for a scraper, insecure for anything else.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);

    // Give up if the transfer stays below 1 byte/second for 30 seconds.
    curl_setopt($ch, CURLOPT_LOW_SPEED_LIMIT, 1);
    curl_setopt($ch, CURLOPT_LOW_SPEED_TIME, 30);

    $content = curl_exec($ch);
    // BUG FIX: the original tested the undefined variable $ret here, so
    // cURL errors were silently ignored.
    if ($content === FALSE) {
        echo curl_error($ch);
    }
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ( $code != '200' ) echo 'http error code: ' . $code;
    curl_close($ch);
    return $content;
}