使用 php 从 url 中提取元数据,奇怪的字符?
Pulling metadate from url with php, strange characters?
我遇到了一些问题,我正在尝试从外部提取基本元数据 URL,我已经成功地做到了大部分,但它导致了一些字符问题当我调用图像 url 实际上是 mäenjaksa7-300x200.jpg 时,作为 Ä ä ö 的字母会像 mäenjaksa7-300x200.jpg 一样出现,我的代码在下面,感谢您的帮助。
function file_get_contents_curl($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data; }
$html = file_get_contents_curl($params['url']);
//parsing begins here:
$doc = new DOMDocument();
@$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$urltitle = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
if($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
if($meta->getAttribute('property') == 'og:image')
$ogimage = $meta->getAttribute('content');
if($meta->getAttribute('rel') == 'image_src')
$relimage = $meta->getAttribute('content');
}
if( empty($ogimage) ) {
$metaimage = $relimage;
} else {
$metaimage = $ogimage;
}
也许你必须确保你的 url header 有 content-type
-> charset
到 utf-8或合适的。您必须确保您的 url 不是内容 none Ascii 字符或确保您已正确设置适当的 "character’s encoder"。也许我没有很好地理解你的问题,但是看看这个与你的代码无关但可能有用的例子:
$url = "http://www.example.com/services/calculation";
$page = "/services/calculation";
$headers = array(
"POST ".$page." HTTP/1.0",
"Content-type: text/xml;charset=\"utf-8\"",
"Accept: text/xml",
"Cache-Control: no-cache",
"Pragma: no-cache",
"SOAPAction: \"run\"",
"Content-length: ".strlen($xml_data),
"Authorization: Basic " . base64_encode($credentials)
);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL,$url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
curl_setopt($ch, CURLOPT_USERAGENT, $defined_vars['HTTP_USER_AGENT']);
解决方法:
在下面添加
查找:
$html = file_get_contents_curl($url);
在下面添加:
//Change encoding to UTF-8 from ISO-8859-1
$html = iconv('UTF-8', 'ISO-8859-1//TRANSLIT', $html);
我遇到了一些问题,我正在尝试从外部提取基本元数据 URL,我已经成功地做到了大部分,但它导致了一些字符问题当我调用图像 url 实际上是 mäenjaksa7-300x200.jpg 时,作为 Ä ä ö 的字母会像 mäenjaksa7-300x200.jpg 一样出现,我的代码在下面,感谢您的帮助。
function file_get_contents_curl($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data; }
$html = file_get_contents_curl($params['url']);
//parsing begins here:
$doc = new DOMDocument();
@$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$urltitle = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
if($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
if($meta->getAttribute('property') == 'og:image')
$ogimage = $meta->getAttribute('content');
if($meta->getAttribute('rel') == 'image_src')
$relimage = $meta->getAttribute('content');
}
if( empty($ogimage) ) {
$metaimage = $relimage;
} else {
$metaimage = $ogimage;
}
也许你必须确保你的 url header 有 content-type
-> charset
到 utf-8或合适的。您必须确保您的 url 不是内容 none Ascii 字符或确保您已正确设置适当的 "character’s encoder"。也许我没有很好地理解你的问题,但是看看这个与你的代码无关但可能有用的例子:
$url = "http://www.example.com/services/calculation";
$page = "/services/calculation";
$headers = array(
"POST ".$page." HTTP/1.0",
"Content-type: text/xml;charset=\"utf-8\"",
"Accept: text/xml",
"Cache-Control: no-cache",
"Pragma: no-cache",
"SOAPAction: \"run\"",
"Content-length: ".strlen($xml_data),
"Authorization: Basic " . base64_encode($credentials)
);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL,$url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
curl_setopt($ch, CURLOPT_USERAGENT, $defined_vars['HTTP_USER_AGENT']);
解决方法: 在下面添加 查找:
$html = file_get_contents_curl($url);
在下面添加:
//Change encoding to UTF-8 from ISO-8859-1
$html = iconv('UTF-8', 'ISO-8859-1//TRANSLIT', $html);