Multi-threaded cURL with SSL and redirect
I have a very simple scraper that does what I need for now, but it is very slow: it fetches 2 images in about 3 seconds, and what I need is to fetch at least 1000 images within a few seconds.
Here is the code I am using at the moment:
<?php
require_once('config.php');
//Calling PHasher class file.
include_once('classes/phasher.class.php');
$I = PHasher::Instance();
//Prevent execution timeout.
set_time_limit(0);
//Solving SSL Problem.
$arrContextOptions = array(
    "ssl" => array(
        "verify_peer" => false,
        "verify_peer_name" => false,
    ),
);
//Check if the database contains hashed pictures or if it's empty, then start from the latest hashed picture or start from 4.
$check = mysqli_query($con, "SELECT fid FROM images ORDER BY fid DESC LIMIT 1;");
if (mysqli_num_rows($check) > 0) {
    $max_fid = mysqli_fetch_row($check);
    $fid = $max_fid[0] + 1;
} else {
    $fid = 4;
}
$deletedProfile = "https://z-1-static.xx.fbcdn.net/rsrc.php/v2/yo/r/UlIqmHJn-SK.gif";
//Infinite while loop to fetch profile pictures and save them inside the avatar folder.
$initial = $fid;
while ($fid = $initial) {
    $url = 'https://graph.facebook.com/'.$fid.'/picture?width=378&height=378';
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);  // follow the redirects
    curl_setopt($ch, CURLOPT_HEADER, false);         // no need to pass the headers to the data stream
    curl_setopt($ch, CURLOPT_NOBODY, true);          // get the resource without a body
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // accept any server certificate
    curl_exec($ch);
    // get the last used URL
    $lastUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    curl_close($ch);
    if ($lastUrl == $deletedProfile) {
        $initial++;
    } else {
        $imageUrl = file_get_contents($url, false, stream_context_create($arrContextOptions));
        $savedImage = dirname(__FILE__).'/avatar/image.jpg';
        file_put_contents($savedImage, $imageUrl);
        //Exclude deleted profiles or corrupted pictures.
        if (getimagesize($savedImage) > 0) {
            //PHasher class call to hash the images to hexadecimal or binary values.
            $hash = $I->FastHashImage($savedImage);
            $hex = $I->HashAsString($hash);
            //Store Facebook id and hashed values for the images as hex values.
            mysqli_query($con, "INSERT INTO images(fid, hash) VALUES ('$fid', '$hex')");
            $initial++;
        } else {
            $initial++;
        }
    }
}
?>
I had not worked out how to approach this before; what I have in mind now is:
1 - On each pass, take a batch of 1000 profiles and store them in an array:
$items = array();
for ($i = $fid; $i <= $fid + 1000; $i++) {
    $url = 'https://graph.facebook.com/'.$i.'/picture?width=378&height=378';
    $items[$i] = array($url);
}
but the output of the array is not what I expect, and I would like to know how to fix it:
Array ( [28990] => Array ( [0] => https://graph.facebook.com/28990/picture?width=378&height=378 )
[28991] => Array ( [0] => https://graph.facebook.com/28991/picture?width=378&height=378 )
[28992] => Array ( [0] => https://graph.facebook.com/28992/picture?width=378&height=378 )
[28993] => Array ( [0] => https://graph.facebook.com/28993/picture?width=378&height=378 )
[28994] => Array ( [0] => https://graph.facebook.com/28994/picture?width=378&height=378 )
[28995] => Array ( [0] => https://graph.facebook.com/28995/picture?width=378&height=378 )
[28996] => Array ( [0] => https://graph.facebook.com/28996/picture?width=378&height=378 )
[28997] => Array ( [0] => https://graph.facebook.com/28997/picture?width=378&height=378 )
2 - Then I want to feed the resulting array into multi cURL, so that several cURL handles can be processed asynchronously.
3 - Check whether the effective URL equals the deleted-profile URL; if it does not, pass the image to PHasher, convert it into a hash value and store it in the database (a sketch of this is shown right after this list).
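A minimal sketch of those three steps using the plain curl_multi_* API (this is one possible reading of the intended flow, not the only way to do it); it reuses $con, $fid, $deletedProfile and the PHasher instance $I from the script above, and the batch size is just a placeholder:

// Step 1: build a flat id => url map for one batch (no nested array needed).
$batchSize = 1000; // placeholder, tune to what the server tolerates
$items = array();
for ($i = $fid; $i < $fid + $batchSize; $i++) {
    $items[$i] = 'https://graph.facebook.com/'.$i.'/picture?width=378&height=378';
}

// Step 2: add one easy handle per URL to a multi handle and run them concurrently.
$mh = curl_multi_init();
$handles = array();
foreach ($items as $i => $url) {
    $ch = curl_init($url);
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => true,   // capture the body instead of echoing it
        CURLOPT_FOLLOWLOCATION => true,   // follow the redirect to the CDN image
        CURLOPT_SSL_VERIFYPEER => false,  // same SSL shortcut as the original script
    ));
    curl_multi_add_handle($mh, $ch);
    $handles[$i] = $ch;
}

do {
    $status = curl_multi_exec($mh, $active);
    if ($active) {
        curl_multi_select($mh); // wait for activity instead of busy-looping
    }
} while ($active && $status == CURLM_OK);

// Step 3: skip profiles that redirected to the deleted-profile placeholder, hash the rest.
foreach ($handles as $i => $ch) {
    $lastUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    if ($lastUrl != $deletedProfile) {
        $savedImage = dirname(__FILE__).'/avatar/'.$i.'.jpg';
        file_put_contents($savedImage, curl_multi_getcontent($ch));
        if (getimagesize($savedImage) > 0) {
            $hash = $I->FastHashImage($savedImage);
            $hex  = $I->HashAsString($hash);
            mysqli_query($con, "INSERT INTO images(fid, hash) VALUES ('$i', '$hex')");
        }
    }
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
}
curl_multi_close($mh);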
I have just what you need, although I have not reached that kind of throughput (1000 parallel requests per second) myself.
I forget where I originally found this, but I am using it to download reddit content:
class ParallelCurl {

    public $max_requests;
    public $options;
    public $outstanding_requests;
    public $multi_handle;

    public function __construct($in_max_requests = 10, $in_options = array()) {
        $this->max_requests = $in_max_requests;
        $this->options = $in_options;

        $this->outstanding_requests = array();
        $this->multi_handle = curl_multi_init();
    }

    // Ensure all the requests finish nicely
    public function __destruct() {
        $this->finishAllRequests();
    }

    // Sets how many requests can be outstanding at once before we block and wait for one to
    // finish before starting the next one
    public function setMaxRequests($in_max_requests) {
        $this->max_requests = $in_max_requests;
    }

    // Sets the options to pass to curl, using the format of curl_setopt_array()
    public function setOptions($in_options) {
        $this->options = $in_options;
    }

    // Start a fetch from the $url address, calling the $callback function passing the optional
    // $user_data value. The callback should accept 4 arguments: the content, the url, the curl
    // handle and the user data, eg on_request_done($content, $url, $ch, $user_data);
    public function startRequest($url, $callback, $user_data = array(), $post_fields = null, $headers = null) {
        if ($this->max_requests > 0)
            $this->waitForOutstandingRequestsToDropBelow($this->max_requests);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt_array($ch, $this->options);
        curl_setopt($ch, CURLOPT_URL, $url);

        if (isset($post_fields)) {
            curl_setopt($ch, CURLOPT_POST, TRUE);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post_fields);
        }

        if (is_array($headers)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }

        curl_multi_add_handle($this->multi_handle, $ch);

        $ch_array_key = (int) $ch;
        $this->outstanding_requests[$ch_array_key] = array(
            'link_url'  => $url,
            'callback'  => $callback,
            'user_data' => $user_data,
        );

        $this->checkForCompletedRequests();
    }

    // You *MUST* call this function at the end of your script. It waits for any running requests
    // to complete, and calls their callback functions
    public function finishAllRequests() {
        $this->waitForOutstandingRequestsToDropBelow(1);
    }

    // Checks to see if any of the outstanding requests have finished
    private function checkForCompletedRequests() {
        /*
        // Call select to see if anything is waiting for us
        if (curl_multi_select($this->multi_handle, 0.0) === -1)
            return;

        // Since something's waiting, give curl a chance to process it
        do {
            $mrc = curl_multi_exec($this->multi_handle, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
        */
        // fix for https://bugs.php.net/bug.php?id=63411
        do {
            $mrc = curl_multi_exec($this->multi_handle, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        while ($active && $mrc == CURLM_OK) {
            if (curl_multi_select($this->multi_handle) != -1) {
                do {
                    $mrc = curl_multi_exec($this->multi_handle, $active);
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            } else {
                return;
            }
        }

        // Now grab the information about the completed requests
        while ($info = curl_multi_info_read($this->multi_handle)) {
            $ch = $info['handle'];
            $ch_array_key = (int) $ch;

            if (!isset($this->outstanding_requests[$ch_array_key])) {
                die("Error - handle wasn't found in requests: '$ch' in " .
                    print_r($this->outstanding_requests, true));
            }

            $request = $this->outstanding_requests[$ch_array_key];
            $url = $request['link_url'];
            $content = curl_multi_getcontent($ch);
            $callback = $request['callback'];
            $user_data = $request['user_data'];

            call_user_func($callback, $content, $url, $ch, $user_data);

            unset($this->outstanding_requests[$ch_array_key]);
            curl_multi_remove_handle($this->multi_handle, $ch);
        }
    }

    // Blocks until there's less than the specified number of requests outstanding
    private function waitForOutstandingRequestsToDropBelow($max) {
        while (1) {
            $this->checkForCompletedRequests();
            if (count($this->outstanding_requests) < $max)
                break;

            usleep(10000);
        }
    }
}
The way it works is that you pass ParallelCurl::startRequest() a URL and a callback function (which can be anonymous); the download for that URL is queued, and the callback is invoked once the download has finished.
$pcurl = new ParallelCurl(10, array(
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_FOLLOWLOCATION => 1,
    CURLOPT_SSL_VERIFYPEER => 1,
));

$pcurl->startRequest($url, function($data) {
    // download finished. $data is html or binary, whatever you requested
    echo $data;
});
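Applied to your avatar scraper, a rough sketch could look like the following; the closure parameters match the call_user_func() invocation inside checkForCompletedRequests(), while $con, $fid, the PHasher instance $I and $deletedProfile are assumed to exist exactly as in your script:

// Rough sketch: queue one request per Facebook id and hash the result in the callback.
// Assumes $con (mysqli), $I (PHasher) and $deletedProfile are defined as in the question.
$pcurl = new ParallelCurl(10, array(
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_FOLLOWLOCATION => 1,
    CURLOPT_SSL_VERIFYPEER => 0,   // same SSL shortcut as the question's script
));

for ($i = $fid; $i < $fid + 1000; $i++) {
    $url = 'https://graph.facebook.com/'.$i.'/picture?width=378&height=378';
    $pcurl->startRequest($url, function($content, $url, $ch, $user_data) use ($con, $I, $deletedProfile) {
        // The callback receives the body, the requested url, the curl handle and the user data.
        $lastUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        if ($lastUrl == $deletedProfile) {
            return; // deleted profile, nothing to hash
        }
        $savedImage = dirname(__FILE__).'/avatar/'.$user_data['fid'].'.jpg';
        file_put_contents($savedImage, $content);
        if (getimagesize($savedImage) > 0) {
            $hash = $I->FastHashImage($savedImage);
            $hex  = $I->HashAsString($hash);
            mysqli_query($con, "INSERT INTO images(fid, hash) VALUES ('".$user_data['fid']."', '$hex')");
        }
    }, array('fid' => $i));
}

// Wait for the remaining downloads and fire their callbacks.
$pcurl->finishAllRequests();

The constructor's first argument caps how many transfers are in flight at once, so you can raise it gradually instead of opening a thousand sockets in one go.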