使用 multi curl 获取所有 URL
Get all the URLs using multi curl
我正在开发一个应用程序,它从一组网站中获取所有 URL,并以数组或 JSON 的形式显示。
我可以用 for 循环来实现,但问题在于执行时间:当我尝试 10 个 URL 时,会报错提示 exceeds maximum execution time。
经过搜索,我发现了 multi curl。
我还找到了这篇文章:Fast PHP CURL Multiple Requests: Retrieve the content of multiple URLs using CURL。我尝试把它加进我的代码,但没有成功,因为我不知道如何使用其中的函数。
希望你能帮助我。
谢谢。
这是我的示例代码。
<?php
// Download every page in $urls concurrently with curl_multi, then print each
// absolute link found in the returned HTML as an anchor tag.
$urls = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

$mh = curl_multi_init();
$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    // Keep the response body in memory so it can be parsed after the transfer.
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, true);
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Drive all transfers. curl_multi_select() waits for socket activity so the
// loop does not spin at full CPU; when it reports -1 we sleep briefly instead.
do {
    $status = curl_multi_exec($mh, $active);
    if ($active && curl_multi_select($mh, 1.0) === -1) {
        usleep(10000);
    }
} while ($active && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

// Collect libxml parse warnings internally instead of emitting them.
$previousLibxml = libxml_use_internal_errors(true);
foreach ($urls as $i => $url) {
    $urlContent = curl_multi_getcontent($conn[$i]);
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);

    // A failed transfer leaves no body; DOMDocument::loadHTML rejects empty input.
    if (!is_string($urlContent) || $urlContent === '') {
        continue;
    }

    $dom = new DOMDocument();
    $dom->loadHTML($urlContent);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");
    // Separate loop variables: the outer $i / $url must not be overwritten.
    for ($j = 0; $j < $hrefs->length; $j++) {
        $link = filter_var($hrefs->item($j)->getAttribute('href'), FILTER_SANITIZE_URL);
        // validate url
        if (filter_var($link, FILTER_VALIDATE_URL) !== false) {
            // The href comes from a remote page, so escape it before printing.
            $safe = htmlspecialchars($link, ENT_QUOTES, 'UTF-8');
            echo '<a href="'.$safe.'">'.$safe.'</a><br />';
        }
    }
}
libxml_use_internal_errors($previousLibxml);
curl_multi_close($mh);
?>
您可能正在使用无限循环 - 如果没有,您可以在 php.ini 中增加最大执行时间或使用:
// Set the max_execution_time ini value to 600 seconds (10 minutes).
ini_set('max_execution_time', 600);
试试这个简化版本:
// Fetch each page in turn and print every absolute link it contains.
$urls = [
    'https://en.wikipedia.org/',
    'https://secure.php.net/',
];
// 0 disables the execution-time limit for this run.
set_time_limit(0);
// Collect libxml parse warnings internally instead of emitting them.
libxml_use_internal_errors(true);
// Every validated link is also appended here so it can be used as an array.
$hrefs = [];
foreach ($urls as $url) {
    $html = file_get_contents($url);
    // Skip pages that could not be downloaded; loadHTML rejects empty input.
    if ($html === false || $html === '') {
        continue;
    }
    $doc = new DOMDocument;
    $doc->loadHTML($html);
    libxml_clear_errors();
    foreach ($doc->getElementsByTagName('a') as $link) {
        $href = filter_var($link->getAttribute('href'), FILTER_SANITIZE_URL);
        if (filter_var($href, FILTER_VALIDATE_URL)) {
            $hrefs[] = $href;
            // The href comes from a remote page; escape quotes too because the
            // attribute below is single-quoted.
            $safe = htmlspecialchars($href, ENT_QUOTES, 'UTF-8');
            echo "<a href='{$safe}'>{$safe}</a><br/>\n";
        }
    }
}
首先,我知道 OP 问的是 multi_curl,但我只是提供另一种选择,以防 OP 改变主意。我在这里的做法是把这些 url 拆分成多次请求,每次只处理一个,这样 CPU 的占用不会那么高。如果 OP 仍然想使用 multi_curl,也许这里的 PHP 高手可以提供更好的解决方案。
<?php
// Processes one URL per HTTP request and chains to the next one with a
// meta refresh. The "num" query parameter is either absent/empty (start at
// the first URL), a zero-based index into $urls, or the literal 'done'.
$urls=array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

// Read the parameter defensively: it may be missing or not a plain string.
$rawNum = (isset($_GET['num']) && is_string($_GET['num']) && $_GET['num'] !== '')
    ? $_GET['num']
    : '0';

if ($rawNum === 'done')
{
    /*if all sites have been fetched, do something here*/
}
elseif (preg_match('/^[0-9]+$/', $rawNum) === 1 && !empty($urls[(int) $rawNum]))
{
    $num = (int) $rawNum;
    /* do your single curl stuff here and store its data here*/
    /*now redirect to the next url. dont use header location redirect, it would ends up too many redirect error in browser*/
    // Pass the next INDEX (or 'done'), not the URL itself, so the next request
    // can look the URL up again and the chain terminates.
    $next_num = !empty($urls[$num + 1]) ? (string) ($num + 1) : 'done';
    echo '<html>
<head>
<meta http-equiv="refresh" content="0;url=http://yourcodedomain.com/yourpath/yourcode.php?num='.$next_num.'" />
</head>
<body>
<p>Fetching: '.($num + 1).' / '.count($urls).'</p>
</body>
</html>';
}
else
{
    /*throws exception here*/
}
?>
这是我在编写代码后取得的成果,它有效但不确定这是否是最佳答案。请检查我的代码。
<?php
// Build a list holding the same URL ten times and dump what getUrls() returns for it.
$array = array_fill(0, 10, 'https://www.google.com/');
print_r(getUrls($array));
/**
 * Download every URL concurrently with curl_multi and collect the absolute
 * links found in each page.
 *
 * @param array $array URLs to fetch. Keys are only used to pair each URL with
 *                     its cURL handle.
 * @return array One entry per input URL, in input order. Each entry is a
 *               zero-indexed list of the unique hrefs that pass
 *               FILTER_VALIDATE_URL, excluding mailto: links. A page that
 *               could not be fetched yields an empty list.
 */
function getUrls($array) {
    $arrList = array();
    $curl_array = array();
    $ch = curl_multi_init();

    foreach ($array as $count => $url) {
        $handle = curl_init($url);
        if ($handle === false) {
            // No handle for this URL; it is reported as an empty list below.
            continue;
        }
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        curl_multi_add_handle($ch, $handle);
        $curl_array[$count] = $handle;
    }

    // Run all transfers; wait up to one second for activity between polls.
    do {
        $status = curl_multi_exec($ch, $exec);
        if ($exec) {
            curl_multi_select($ch, 1);
        }
    } while ($exec && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

    // Collect libxml parse warnings internally, restoring the caller's setting afterwards.
    $previousLibxml = libxml_use_internal_errors(true);
    foreach ($array as $count => $url) {
        $links = array();
        $html = isset($curl_array[$count]) ? curl_multi_getcontent($curl_array[$count]) : null;

        // A failed transfer has no body; DOMDocument::loadHTML rejects empty input.
        if (is_string($html) && $html !== '') {
            $dom = new DOMDocument();
            $dom->loadHTML($html);
            libxml_clear_errors();
            $xpath = new DOMXPath($dom);
            foreach ($xpath->evaluate("/html/body//a") as $anchor) {
                $link = filter_var($anchor->getAttribute('href'), FILTER_SANITIZE_URL);
                // validate url
                if (filter_var($link, FILTER_VALIDATE_URL) === false) {
                    continue;
                }
                // Only skip real mailto: links, not URLs that merely contain "mailto".
                if (strncasecmp($link, 'mailto:', 7) === 0) {
                    continue;
                }
                $links[] = $link;
            }
        }

        // array_unique keeps original keys; reindex so each entry is a plain list.
        $arrList[] = array_values(array_unique($links));
    }
    libxml_use_internal_errors($previousLibxml);

    foreach ($curl_array as $handle) {
        curl_multi_remove_handle($ch, $handle);
        curl_close($handle);
    }
    curl_multi_close($ch);

    return $arrList;
}
这是我整理的一个函数,它可以正确地使用 curl_multi_init()。它与您在 PHP.net 上能找到的函数大致相同,只是做了一些小的调整。我用它取得了很好的效果。
/**
 * Fetch many URLs in batches using curl_multi.
 *
 * @param array $urlArray    URLs to fetch. Keys are preserved in the result.
 * @param array $optionArray cURL options applied to every handle. CURLOPT_URL
 *                           is set per URL afterwards.
 * @param int   $nThreads    Maximum number of simultaneous transfers per
 *                           batch. Values below 1 are treated as 1.
 * @return array Whatever curl_multi_getcontent() returns for each handle,
 *               keyed like $urlArray. Empty array for empty input.
 */
function multi_thread_curl($urlArray, $optionArray, $nThreads) {
    // Always return an array, even when there is nothing to fetch.
    $results = array();

    //Group your urls into batches; array_chunk rejects sizes below 1.
    $batches = array_chunk($urlArray, max(1, (int) $nThreads), true);

    //Iterate through each batch of urls.
    foreach ($batches as $batch) {
        //Create the multiple cURL handler and one easy handle per url.
        $mh = curl_multi_init();
        $handles = array();
        foreach ($batch as $key => $url) {
            $handle = curl_init();
            curl_setopt_array($handle, $optionArray); //Set your main curl options.
            curl_setopt($handle, CURLOPT_URL, $url); //Set url.
            curl_multi_add_handle($mh, $handle);
            $handles[$key] = $handle;
        }

        //Execute the handles.
        $active = null;
        do {
            $mrc = curl_multi_exec($mh, $active);
            // Wait for socket activity between polls. When select reports -1,
            // sleep briefly and poll again instead of spinning without exec.
            if ($active && $mrc === CURLM_OK && curl_multi_select($mh) === -1) {
                usleep(1000);
            }
        } while ($mrc === CURLM_CALL_MULTI_PERFORM || ($active && $mrc === CURLM_OK));

        //Get your data and detach the handles.
        foreach ($handles as $key => $handle) {
            $results[$key] = curl_multi_getcontent($handle);
            curl_multi_remove_handle($mh, $handle);
        }

        //Close the multi handle.
        curl_multi_close($mh);
    }

    return $results;
}
//Add whatever options here. The CURLOPT_URL is left out intentionally.
//It will be added in later from the url array.
// The urls to download.
$urlArray = [
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/',
];

// Options shared by every request. CURLOPT_URL is deliberately absent here;
// it is filled in later from the url array.
$optionArray = [
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0', // Pick your user agent.
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_TIMEOUT => 10,
];

// How many urls are attempted at one time; tune this to see what works best.
$nThreads = 20;

// Run the function.
$results = multi_thread_curl($urlArray, $optionArray, $nThreads);
完成后,您将得到一个包含网站列表中所有 html 的数组。正是在这一点上,我将遍历它们并提取所有 url。
像这样:
// Walk every downloaded page in $results and print each absolute link it
// contains as an HTML anchor.
// Collect libxml parse warnings internally instead of silencing them with @.
$previousLibxml = libxml_use_internal_errors(true);
foreach ($results as $page) {
    // Failed downloads have no body; DOMDocument::loadHTML rejects empty input.
    if (!is_string($page) || $page === '') {
        continue;
    }
    $dom = new DOMDocument();
    $dom->loadHTML($page);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);
    foreach ($xpath->evaluate("/html/body//a") as $href) {
        $url = filter_var($href->getAttribute('href'), FILTER_SANITIZE_URL);
        // validate url
        if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
            // The href comes from a remote page, so escape it before printing.
            $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
            echo '<a href="'.$safeUrl.'">'.$safeUrl.'</a><br />';
        }
    }
}
libxml_use_internal_errors($previousLibxml);
另外值得记住的是,您可以延长脚本的最长运行时间。
如果您使用的是托管服务,那么无论您把最长执行时间设置为多少,都可能被限制在两分钟左右,这一点值得留意。
设置方法如下:
// Set the max_execution_time ini value to 120 seconds (2 minutes).
ini_set('max_execution_time', 120);
您当然可以尝试设置更长的时间,但只有实际计时之后才能知道它是否生效。
希望对您有所帮助。
我遇到了同样的问题,后来用 usleep() 解决了。请试一试,并告诉我结果:
// Drive the curl_multi transfers on $mh, pausing 10 ms between polls so the
// loop does not spin at full CPU while requests are still in flight.
do {
    usleep(10000);
    $n=curl_multi_exec($mh,$active);
} while ($active);
我正在开发一个应用程序,它从一组网站中获取所有 URL,并以数组或 JSON 的形式显示。
我可以用 for 循环来实现,但问题在于执行时间:当我尝试 10 个 URL 时,会报错提示 exceeds maximum execution time。
经过搜索,我发现了 multi curl。
我还找到了这篇文章:Fast PHP CURL Multiple Requests: Retrieve the content of multiple URLs using CURL。我尝试把它加进我的代码,但没有成功,因为我不知道如何使用其中的函数。
希望你能帮助我。
谢谢。
这是我的示例代码。
<?php
// Download every page in $urls concurrently with curl_multi, then print each
// absolute link found in the returned HTML as an anchor tag.
$urls = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

$mh = curl_multi_init();
$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    // Keep the response body in memory so it can be parsed after the transfer.
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, true);
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Drive all transfers. curl_multi_select() waits for socket activity so the
// loop does not spin at full CPU; when it reports -1 we sleep briefly instead.
do {
    $status = curl_multi_exec($mh, $active);
    if ($active && curl_multi_select($mh, 1.0) === -1) {
        usleep(10000);
    }
} while ($active && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

// Collect libxml parse warnings internally instead of emitting them.
$previousLibxml = libxml_use_internal_errors(true);
foreach ($urls as $i => $url) {
    $urlContent = curl_multi_getcontent($conn[$i]);
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);

    // A failed transfer leaves no body; DOMDocument::loadHTML rejects empty input.
    if (!is_string($urlContent) || $urlContent === '') {
        continue;
    }

    $dom = new DOMDocument();
    $dom->loadHTML($urlContent);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");
    // Separate loop variables: the outer $i / $url must not be overwritten.
    for ($j = 0; $j < $hrefs->length; $j++) {
        $link = filter_var($hrefs->item($j)->getAttribute('href'), FILTER_SANITIZE_URL);
        // validate url
        if (filter_var($link, FILTER_VALIDATE_URL) !== false) {
            // The href comes from a remote page, so escape it before printing.
            $safe = htmlspecialchars($link, ENT_QUOTES, 'UTF-8');
            echo '<a href="'.$safe.'">'.$safe.'</a><br />';
        }
    }
}
libxml_use_internal_errors($previousLibxml);
curl_multi_close($mh);
?>
您可能正在使用无限循环 - 如果没有,您可以在 php.ini 中增加最大执行时间或使用:
// Set the max_execution_time ini value to 600 seconds (10 minutes).
ini_set('max_execution_time', 600);
试试这个简化版本:
// Fetch each page in turn and print every absolute link it contains.
$urls = [
    'https://en.wikipedia.org/',
    'https://secure.php.net/',
];
// 0 disables the execution-time limit for this run.
set_time_limit(0);
// Collect libxml parse warnings internally instead of emitting them.
libxml_use_internal_errors(true);
// Every validated link is also appended here so it can be used as an array.
$hrefs = [];
foreach ($urls as $url) {
    $html = file_get_contents($url);
    // Skip pages that could not be downloaded; loadHTML rejects empty input.
    if ($html === false || $html === '') {
        continue;
    }
    $doc = new DOMDocument;
    $doc->loadHTML($html);
    libxml_clear_errors();
    foreach ($doc->getElementsByTagName('a') as $link) {
        $href = filter_var($link->getAttribute('href'), FILTER_SANITIZE_URL);
        if (filter_var($href, FILTER_VALIDATE_URL)) {
            $hrefs[] = $href;
            // The href comes from a remote page; escape quotes too because the
            // attribute below is single-quoted.
            $safe = htmlspecialchars($href, ENT_QUOTES, 'UTF-8');
            echo "<a href='{$safe}'>{$safe}</a><br/>\n";
        }
    }
}
首先,我知道 OP 问的是 multi_curl,但我只是提供另一种选择,以防 OP 改变主意。我在这里的做法是把这些 url 拆分成多次请求,每次只处理一个,这样 CPU 的占用不会那么高。如果 OP 仍然想使用 multi_curl,也许这里的 PHP 高手可以提供更好的解决方案。
<?php
// Processes one URL per HTTP request and chains to the next one with a
// meta refresh. The "num" query parameter is either absent/empty (start at
// the first URL), a zero-based index into $urls, or the literal 'done'.
$urls=array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

// Read the parameter defensively: it may be missing or not a plain string.
$rawNum = (isset($_GET['num']) && is_string($_GET['num']) && $_GET['num'] !== '')
    ? $_GET['num']
    : '0';

if ($rawNum === 'done')
{
    /*if all sites have been fetched, do something here*/
}
elseif (preg_match('/^[0-9]+$/', $rawNum) === 1 && !empty($urls[(int) $rawNum]))
{
    $num = (int) $rawNum;
    /* do your single curl stuff here and store its data here*/
    /*now redirect to the next url. dont use header location redirect, it would ends up too many redirect error in browser*/
    // Pass the next INDEX (or 'done'), not the URL itself, so the next request
    // can look the URL up again and the chain terminates.
    $next_num = !empty($urls[$num + 1]) ? (string) ($num + 1) : 'done';
    echo '<html>
<head>
<meta http-equiv="refresh" content="0;url=http://yourcodedomain.com/yourpath/yourcode.php?num='.$next_num.'" />
</head>
<body>
<p>Fetching: '.($num + 1).' / '.count($urls).'</p>
</body>
</html>';
}
else
{
    /*throws exception here*/
}
?>
这是我在编写代码后取得的成果,它有效但不确定这是否是最佳答案。请检查我的代码。
<?php
// Build a list holding the same URL ten times and dump what getUrls() returns for it.
$array = array_fill(0, 10, 'https://www.google.com/');
print_r(getUrls($array));
/**
 * Download every URL concurrently with curl_multi and collect the absolute
 * links found in each page.
 *
 * @param array $array URLs to fetch. Keys are only used to pair each URL with
 *                     its cURL handle.
 * @return array One entry per input URL, in input order. Each entry is a
 *               zero-indexed list of the unique hrefs that pass
 *               FILTER_VALIDATE_URL, excluding mailto: links. A page that
 *               could not be fetched yields an empty list.
 */
function getUrls($array) {
    $arrList = array();
    $curl_array = array();
    $ch = curl_multi_init();

    foreach ($array as $count => $url) {
        $handle = curl_init($url);
        if ($handle === false) {
            // No handle for this URL; it is reported as an empty list below.
            continue;
        }
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        curl_multi_add_handle($ch, $handle);
        $curl_array[$count] = $handle;
    }

    // Run all transfers; wait up to one second for activity between polls.
    do {
        $status = curl_multi_exec($ch, $exec);
        if ($exec) {
            curl_multi_select($ch, 1);
        }
    } while ($exec && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

    // Collect libxml parse warnings internally, restoring the caller's setting afterwards.
    $previousLibxml = libxml_use_internal_errors(true);
    foreach ($array as $count => $url) {
        $links = array();
        $html = isset($curl_array[$count]) ? curl_multi_getcontent($curl_array[$count]) : null;

        // A failed transfer has no body; DOMDocument::loadHTML rejects empty input.
        if (is_string($html) && $html !== '') {
            $dom = new DOMDocument();
            $dom->loadHTML($html);
            libxml_clear_errors();
            $xpath = new DOMXPath($dom);
            foreach ($xpath->evaluate("/html/body//a") as $anchor) {
                $link = filter_var($anchor->getAttribute('href'), FILTER_SANITIZE_URL);
                // validate url
                if (filter_var($link, FILTER_VALIDATE_URL) === false) {
                    continue;
                }
                // Only skip real mailto: links, not URLs that merely contain "mailto".
                if (strncasecmp($link, 'mailto:', 7) === 0) {
                    continue;
                }
                $links[] = $link;
            }
        }

        // array_unique keeps original keys; reindex so each entry is a plain list.
        $arrList[] = array_values(array_unique($links));
    }
    libxml_use_internal_errors($previousLibxml);

    foreach ($curl_array as $handle) {
        curl_multi_remove_handle($ch, $handle);
        curl_close($handle);
    }
    curl_multi_close($ch);

    return $arrList;
}
这是我整理的一个函数,它可以正确地使用 curl_multi_init()。它与您在 PHP.net 上能找到的函数大致相同,只是做了一些小的调整。我用它取得了很好的效果。
/**
 * Fetch many URLs in batches using curl_multi.
 *
 * @param array $urlArray    URLs to fetch. Keys are preserved in the result.
 * @param array $optionArray cURL options applied to every handle. CURLOPT_URL
 *                           is set per URL afterwards.
 * @param int   $nThreads    Maximum number of simultaneous transfers per
 *                           batch. Values below 1 are treated as 1.
 * @return array Whatever curl_multi_getcontent() returns for each handle,
 *               keyed like $urlArray. Empty array for empty input.
 */
function multi_thread_curl($urlArray, $optionArray, $nThreads) {
    // Always return an array, even when there is nothing to fetch.
    $results = array();

    //Group your urls into batches; array_chunk rejects sizes below 1.
    $batches = array_chunk($urlArray, max(1, (int) $nThreads), true);

    //Iterate through each batch of urls.
    foreach ($batches as $batch) {
        //Create the multiple cURL handler and one easy handle per url.
        $mh = curl_multi_init();
        $handles = array();
        foreach ($batch as $key => $url) {
            $handle = curl_init();
            curl_setopt_array($handle, $optionArray); //Set your main curl options.
            curl_setopt($handle, CURLOPT_URL, $url); //Set url.
            curl_multi_add_handle($mh, $handle);
            $handles[$key] = $handle;
        }

        //Execute the handles.
        $active = null;
        do {
            $mrc = curl_multi_exec($mh, $active);
            // Wait for socket activity between polls. When select reports -1,
            // sleep briefly and poll again instead of spinning without exec.
            if ($active && $mrc === CURLM_OK && curl_multi_select($mh) === -1) {
                usleep(1000);
            }
        } while ($mrc === CURLM_CALL_MULTI_PERFORM || ($active && $mrc === CURLM_OK));

        //Get your data and detach the handles.
        foreach ($handles as $key => $handle) {
            $results[$key] = curl_multi_getcontent($handle);
            curl_multi_remove_handle($mh, $handle);
        }

        //Close the multi handle.
        curl_multi_close($mh);
    }

    return $results;
}
//Add whatever options here. The CURLOPT_URL is left out intentionally.
//It will be added in later from the url array.
// The urls to download.
$urlArray = [
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/',
];

// Options shared by every request. CURLOPT_URL is deliberately absent here;
// it is filled in later from the url array.
$optionArray = [
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0', // Pick your user agent.
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_TIMEOUT => 10,
];

// How many urls are attempted at one time; tune this to see what works best.
$nThreads = 20;

// Run the function.
$results = multi_thread_curl($urlArray, $optionArray, $nThreads);
完成后,您将得到一个包含网站列表中所有 html 的数组。正是在这一点上,我将遍历它们并提取所有 url。
像这样:
// Walk every downloaded page in $results and print each absolute link it
// contains as an HTML anchor.
// Collect libxml parse warnings internally instead of silencing them with @.
$previousLibxml = libxml_use_internal_errors(true);
foreach ($results as $page) {
    // Failed downloads have no body; DOMDocument::loadHTML rejects empty input.
    if (!is_string($page) || $page === '') {
        continue;
    }
    $dom = new DOMDocument();
    $dom->loadHTML($page);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);
    foreach ($xpath->evaluate("/html/body//a") as $href) {
        $url = filter_var($href->getAttribute('href'), FILTER_SANITIZE_URL);
        // validate url
        if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
            // The href comes from a remote page, so escape it before printing.
            $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
            echo '<a href="'.$safeUrl.'">'.$safeUrl.'</a><br />';
        }
    }
}
libxml_use_internal_errors($previousLibxml);
另外值得记住的是,您可以延长脚本的最长运行时间。
如果您使用的是托管服务,那么无论您把最长执行时间设置为多少,都可能被限制在两分钟左右,这一点值得留意。
设置方法如下:
// Set the max_execution_time ini value to 120 seconds (2 minutes).
ini_set('max_execution_time', 120);
您当然可以尝试设置更长的时间,但只有实际计时之后才能知道它是否生效。
希望对您有所帮助。
我遇到了同样的问题,后来用 usleep() 解决了。请试一试,并告诉我结果:
do {
usleep(10000);
$n=curl_multi_exec($mh,$active);
}