Fastest way to check for remote file (image) existence
I've written a product sync script that runs between the local server hosting the merchant application and the remote web server that hosts the store's web shop...
For the full-sync option I need to sync roughly 5,000+ products, including their images and so on... Even with size variations of the same product (different sizes of the same item, e.g. shoes) sharing a single image, I still need to check whether roughly 3,500 images exist...
So, for the first run, I uploaded all of the product images via FTP except for a few of them, and then let the script run to check whether it would upload the missing images...
The problem is that the script ran for 4 hours, which is unacceptable... I mean, I wasn't re-uploading every image... it was only checking each image to decide whether to skip it or upload it (via ftp_put()).
This is how I perform the check:
if (stripos(get_headers(DESTINATION_URL . "{$path}/{$file}")[0], '200 OK') === false) {
That's reasonably fast, but obviously not fast enough for the sync to run in a reasonable amount of time...
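(A side note, not part of my original check: get_headers() issues a full GET request by default, so the server may start sending the image body even though only the status line matters. Forcing a HEAD request via a stream context avoids that. A minimal sketch, assuming the same DESTINATION_URL, $path and $file as above, and PHP 7.1+ for the context argument:)
<?php
// Hypothetical helper: check a single remote file with a HEAD request
// instead of get_headers()'s default GET, so no body is transferred.
function remote_file_exists(string $url): bool
{
    $context = stream_context_create([
        'http' => [
            'method'  => 'HEAD', // only the status line and headers are fetched
            'timeout' => 5,
        ],
    ]);
    $headers = @get_headers($url, false, $context);
    return $headers !== false && stripos($headers[0], '200') !== false;
}
// Usage, with the same constants/variables as in the question:
// if (!remote_file_exists(DESTINATION_URL . "{$path}/{$file}")) { /* upload via ftp_put() */ }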
How do you handle situations where you have to check the existence of a large number of remote files?
As a last resort, I've kept the option of using ftp_nlist() to download a listing of the remote files and then writing an algorithm to more or less diff the local files against the remote ones...
I tried it, and the recursive algorithm takes far too long to build the file list, well over 30 minutes in fact... You see, my files aren't all in one folder... the whole tree spans 1,956 folders, and the file list consists of 3,653 product image files and keeps growing... Also note that I'm not even using the file-size "trick" (in combination with ftp_nlist()) to work out whether an entry is a file or a folder; I'm using the newer ftp_mlsd(), which explicitly returns a type field holding that information... (you can read more about it in the PHP manual).
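(For illustration, the recursive listing I'm describing looks roughly like the sketch below; it assumes an already-connected $ftp handle and PHP 7.2+, where ftp_mlsd() returns one array per entry with "name" and "type" keys.)
<?php
// Hypothetical sketch: build a flat list of remote file paths with ftp_mlsd(),
// using its "type" field instead of the size trick to detect directories.
function list_remote_files($ftp, string $dir): array
{
    $files = [];
    $entries = ftp_mlsd($ftp, $dir);
    if ($entries === false) {
        return $files;
    }
    foreach ($entries as $entry) {
        $name = $entry['name'];
        if ($name === '.' || $name === '..') {
            continue;
        }
        if ($entry['type'] === 'dir') {
            // Recurse into subdirectories.
            $files = array_merge($files, list_remote_files($ftp, "$dir/$name"));
        } elseif ($entry['type'] === 'file') {
            $files[] = "$dir/$name";
        }
    }
    return $files;
}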
curl_multi is probably the fastest way. Unfortunately curl_multi is hard to use, and an example helped me a lot. Checking URLs between two 1 Gbps dedicated servers in two different Canadian datacenters, this script managed to check 3,000 URLs per second using 500 concurrent TCP connections (and it could be made even faster by re-using curl handles instead of opening and closing them).
<?php
declare(strict_types=1);

$urls = array();
for ($i = 0; $i < 100000; ++$i) {
    $urls[] = "http://ratma.net/";
}
validate_urls($urls, 500, 1000, false, false);

// if return_fault_reason is false, then the return is a simple array of strings of urls that validated.
// otherwise it's an array with the url as the key containing array(bool validated,int curl_error_code,string reason) for every url
function validate_urls(array $urls, int $max_connections, int $timeout_ms = 10000, bool $consider_http_300_redirect_as_error = true, bool $return_fault_reason = false): array
{
    if ($max_connections < 1) {
        throw new InvalidArgumentException("max_connections MUST be >=1");
    }
    foreach ($urls as $key => $foo) {
        if (!is_string($foo)) {
            throw new \InvalidArgumentException("all urls must be strings!");
        }
        if (empty($foo)) {
            unset($urls[$key]); //?
        }
    }
    unset($foo);
    // DISABLED for benchmarking purposes: $urls = array_unique($urls); // remove duplicates.
    $ret = array();
    $mh = curl_multi_init();
    $workers = array();
    $work = function () use (&$ret, &$workers, &$mh, &$return_fault_reason) {
        // > If an added handle fails very quickly, it may never be counted as a running_handle
        while (1) {
            curl_multi_exec($mh, $still_running);
            if ($still_running < count($workers)) {
                break;
            }
            $cms = curl_multi_select($mh, 10);
            //var_dump('sr: ' . $still_running . " c: " . count($workers)." cms: ".$cms);
        }
        while (false !== ($info = curl_multi_info_read($mh))) {
            //echo "NOT FALSE!";
            //var_dump($info);
            if ($info['msg'] !== CURLMSG_DONE) {
                continue;
            }
            if ($info['result'] !== CURLE_OK) {
                if ($return_fault_reason) {
                    $ret[$workers[(int)$info['handle']]] = array(false, $info['result'], "curl_exec error " . $info['result'] . ": " . curl_strerror($info['result']));
                }
            } elseif (CURLE_OK !== ($err = curl_errno($info['handle']))) {
                if ($return_fault_reason) {
                    $ret[$workers[(int)$info['handle']]] = array(false, $err, "curl error " . $err . ": " . curl_strerror($err));
                }
            } else {
                $code = (string)curl_getinfo($info['handle'], CURLINFO_HTTP_CODE);
                if ($code[0] === "3") {
                    if ($consider_http_300_redirect_as_error) {
                        if ($return_fault_reason) {
                            $ret[$workers[(int)$info['handle']]] = array(false, -1, "got a http " . $code . " redirect, which is considered an error");
                        }
                    } else {
                        if ($return_fault_reason) {
                            $ret[$workers[(int)$info['handle']]] = array(true, 0, "got a http " . $code . " redirect, which is considered a success");
                        } else {
                            $ret[] = $workers[(int)$info['handle']];
                        }
                    }
                } elseif ($code[0] === "2") {
                    if ($return_fault_reason) {
                        $ret[$workers[(int)$info['handle']]] = array(true, 0, "got a http " . $code . " code, which is considered a success");
                    } else {
                        $ret[] = $workers[(int)$info['handle']];
                    }
                } else {
                    // all non-2xx and non-3xx are always considered errors (500 internal server error, 400 client error, 404 not found, etcetc)
                    if ($return_fault_reason) {
                        $ret[$workers[(int)$info['handle']]] = array(false, -1, "got a http " . $code . " code, which is considered an error");
                    }
                }
            }
            curl_multi_remove_handle($mh, $info['handle']);
            assert(isset($workers[(int)$info['handle']]));
            unset($workers[(int)$info['handle']]);
            curl_close($info['handle']);
        }
        //echo "NO MORE INFO!";
    };
    foreach ($urls as $url) {
        while (count($workers) >= $max_connections) {
            //echo "TOO MANY WORKERS!\n";
            $work();
        }
        $neww = curl_init($url);
        if (!$neww) {
            trigger_error("curl_init() failed! probably means that max_connections is too high and you ran out of resources", E_USER_WARNING);
            if ($return_fault_reason) {
                $ret[$url] = array(false, -1, "curl_init() failed");
            }
            continue;
        }
        $workers[(int)$neww] = $url;
        curl_setopt_array($neww, array(
            CURLOPT_NOBODY => 1, // HEAD-style request: only the status code matters, skip the body
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_TIMEOUT_MS => $timeout_ms
        ));
        curl_multi_add_handle($mh, $neww);
        //curl_multi_exec($mh, $unused_here); LIKELY TO BE MUCH SLOWER IF DONE IN THIS LOOP: TOO MANY SYSCALLS
    }
    while (count($workers) > 0) {
        //echo "WAITING FOR WORKERS TO BECOME 0!";
        //var_dump(count($workers));
        $work();
    }
    curl_multi_close($mh);
    return $ret;
}
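Applied to the question's scenario, the glue code could look roughly like the sketch below; it is only an illustration, and the $images array, the $ftp connection handle and the local_dir field are assumptions, not part of the answer above:
<?php
// Hypothetical glue code: build the list of remote image URLs, validate them in
// bulk with validate_urls(), and upload only the images that are missing.
$urls = array();
foreach ($images as $image) { // $images = [['path' => ..., 'file' => ..., 'local_dir' => ...], ...]
    $urls[] = DESTINATION_URL . "{$image['path']}/{$image['file']}";
}
$existing = validate_urls($urls, 200, 5000, false, false); // URLs that answered with 2xx/3xx
$existing = array_flip($existing);                         // flip for O(1) lookups
foreach ($images as $image) {
    $url = DESTINATION_URL . "{$image['path']}/{$image['file']}";
    if (!isset($existing[$url])) {
        // Missing on the remote server: upload it over the existing FTP connection.
        ftp_put($ftp, "{$image['path']}/{$image['file']}", "{$image['local_dir']}/{$image['file']}", FTP_BINARY);
    }
}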