使用 php 从每个页面的帖子中获取数据
Getting data from posts in each page using php
这是为了使用 curl 和 PHP Dom 从每个帖子(来自 forbes.com)获取标题和日期。当我在 foreach 中打印日期时,标题和时间显示正确的计数“30”。但我的问题是当我打印插入查询时,得到的结果是错误的。每个页面有 15 个帖子,下面的代码用于从两个页面获取数据。请检查下面的代码并帮助我解决这个问题?
<?php
require_once('dbconnect.php');
/**
 * Download a URL with cURL and hand back the response body.
 *
 * @param string $url Address to request.
 * @return string|bool Whatever curl_exec() yields with CURLOPT_RETURNTRANSFER
 *                     enabled: the body as a string, or false on failure.
 */
function getHTML($url)
{
    $handle = curl_init();
    // Same options, in the same order, as four individual curl_setopt() calls.
    curl_setopt_array($handle, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => "GET",
        CURLOPT_COOKIEFILE => '/cookies.txt',
    ));
    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
// Fetch the search results page whose URL ends in /0/ and parse it, in order to
// read the total number of results reported on that page.
$url = "http://www.forbes.com/search/post/REIT/15/All-time/0/";
$results = getHTML($url);
$dom_document = new DOMDocument();
$dom_document->loadHTML($results);
// Substrings stripped from the total-records text so only the digits remain.
$remove[] = ",";
$remove[] = " ";
$remove[] = "results";
/* ############################################################################ Total ###################################################################*/
$total_results = @$dom_document->getElementsByTagName('div');
// Scan every <div>; whenever its class attribute contains 'total_records', keep its
// cleaned text. If several divs match, the last one wins.
foreach ($total_results as $total_result) {
$total_result_class = $total_result->getAttribute('class');
if(strstr($total_result_class, 'total_records')){
$total_result_replace = str_replace($remove, '', $total_result->textContent);
}
}
// NOTE(review): $total_result_replace is never assigned when no div matches, so the
// division below would then operate on an undefined variable.
$pages_divide = $total_result_replace / 15; //338.4666
// NOTE(review): $pages_floor is computed here but the page loop that follows is
// hard-coded to two pages and does not use it.
$pages_floor = floor($pages_divide); //338
// For each of two result pages: download the page, scan its <h2> links for titles
// and its <time> elements for dates, then build and print a single INSERT statement.
for($i = 1; $i<= 2; $i++) { // $i<= 2 (two page)
$url_without_page = "http://www.forbes.com/search/post/REIT/15/All-time/";
$url_with_page = $url_without_page . $i . '/';
/*echo '<pre>';
print_r($url_with_page);*/
$url_pages = getHTML($url_with_page);
$dom_document_pages = new DOMDocument();
$dom_document_pages->loadHTML($url_pages);
/* ############################################################################ Title ###################################################################*/
$title_result = array();
$titles = @$dom_document_pages->getElementsByTagName('h2');
foreach ($titles as $title) {
foreach($title->childNodes as $nodes){
// Only direct <a> children of the <h2> are considered.
if($nodes->tagName == 'a'){
// NOTE(review): this assignment replaces $title_result (initialised as an array above)
// with a string on every match, so only the last title on the page survives the loop.
// Single quotes are removed, presumably so the text can sit inside the quoted SQL
// literal built below.
$title_result = str_replace("'", "", $nodes->textContent);
/* echo '<pre>';
print_r($title_result);*/
}
}
}
/* ############################################################################# Time ###################################################################*/
$time_result = array();
$times = @$dom_document_pages->getElementsByTagName('time');
foreach ($times as $time) {
$date = new DateTime();
// Drop the comma, parse the text with strtotime(), and reformat it as Y-m-d.
$date_replace = str_replace(",", "", $time->textContent); //Feb 10, 2016
$string_to_time = strtotime($date_replace);
$date->setTimestamp($string_to_time);
// NOTE(review): as with the titles, this overwrites the array with a single string,
// so only the last date on the page is kept.
$time_result = $date->format('Y-m-d');
/*echo '<pre>';
print_r($time_result);*/
}
// NOTE(review): the query is built once per page, after both inner loops have finished,
// from the last title and last date only; values are concatenated directly into the SQL.
$query_insert = "INSERT INTO article_forbes(title, datetime) VALUE ('".$title_result."', '".$time_result."')";
echo '<pre>';
echo $query_insert;
// The actual database write is disabled; the statement is only printed.
//$mysqli->query($query_insert);
}
您没有在循环中插入数据,所以只插入了每页的最后一个标题和最后一个时间。
你将$title_result
和$time_result
初始化为数组,所以我怀疑你原本打算把所有的标题和时间都收集到那里。但是,你并没有在循环中把它们追加到数组里,而是用字符串覆盖了数组。请将
$title_result = str_replace("'", "", $nodes->textContent);
改为
$title_result[] = $nodes->textContent;
以便您推入数组,并对 $time_result
进行类似的更改。
那么你应该在循环中插入。
// Insert every collected (title, date) pair with one reusable prepared statement.
// Assumes dbconnect.php exposes the open connection as $mysqli, as the question's
// commented-out $mysqli->query() call suggests.
$stmt = mysqli_prepare($mysqli, "INSERT INTO article_forbes(title, datetime) VALUES (?, ?)");
// Parameters are bound by reference, so assigning $title and $time inside the loop
// is enough to feed each execution.
mysqli_stmt_bind_param($stmt, "ss", $title, $time);
// Stop at the shorter list so a page with unequal numbers of titles and dates
// cannot read past the end of either array.
$row_count = min(count($title_result), count($time_result));
for ($i = 0; $i < $row_count; $i++) {
    $title = $title_result[$i];
    $time = $time_result[$i];
    mysqli_stmt_execute($stmt);
}
使用这样的预处理语句(prepared statement)意味着您不需要再从标题中删除特殊字符。
这是为了使用 curl 和 PHP Dom 从每个帖子(来自 forbes.com)获取标题和日期。当我在 foreach 中打印日期时,标题和时间显示正确的计数“30”。但我的问题是当我打印插入查询时,得到的结果是错误的。每个页面有 15 个帖子,下面的代码用于从两个页面获取数据。请检查下面的代码并帮助我解决这个问题?
<?php
require_once('dbconnect.php');
/**
 * Download a URL with cURL and hand back the response body.
 *
 * @param string $url Address to request.
 * @return string|bool Whatever curl_exec() yields with CURLOPT_RETURNTRANSFER
 *                     enabled: the body as a string, or false on failure.
 */
function getHTML($url)
{
    $handle = curl_init();
    // Same options, in the same order, as four individual curl_setopt() calls.
    curl_setopt_array($handle, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => "GET",
        CURLOPT_COOKIEFILE => '/cookies.txt',
    ));
    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
// Fetch the search results page whose URL ends in /0/ and parse it, in order to
// read the total number of results reported on that page.
$url = "http://www.forbes.com/search/post/REIT/15/All-time/0/";
$results = getHTML($url);
$dom_document = new DOMDocument();
$dom_document->loadHTML($results);
// Substrings stripped from the total-records text so only the digits remain.
$remove[] = ",";
$remove[] = " ";
$remove[] = "results";
/* ############################################################################ Total ###################################################################*/
$total_results = @$dom_document->getElementsByTagName('div');
// Scan every <div>; whenever its class attribute contains 'total_records', keep its
// cleaned text. If several divs match, the last one wins.
foreach ($total_results as $total_result) {
$total_result_class = $total_result->getAttribute('class');
if(strstr($total_result_class, 'total_records')){
$total_result_replace = str_replace($remove, '', $total_result->textContent);
}
}
// NOTE(review): $total_result_replace is never assigned when no div matches, so the
// division below would then operate on an undefined variable.
$pages_divide = $total_result_replace / 15; //338.4666
// NOTE(review): $pages_floor is computed here but the page loop that follows is
// hard-coded to two pages and does not use it.
$pages_floor = floor($pages_divide); //338
// For each of two result pages: download the page, scan its <h2> links for titles
// and its <time> elements for dates, then build and print a single INSERT statement.
for($i = 1; $i<= 2; $i++) { // $i<= 2 (two page)
$url_without_page = "http://www.forbes.com/search/post/REIT/15/All-time/";
$url_with_page = $url_without_page . $i . '/';
/*echo '<pre>';
print_r($url_with_page);*/
$url_pages = getHTML($url_with_page);
$dom_document_pages = new DOMDocument();
$dom_document_pages->loadHTML($url_pages);
/* ############################################################################ Title ###################################################################*/
$title_result = array();
$titles = @$dom_document_pages->getElementsByTagName('h2');
foreach ($titles as $title) {
foreach($title->childNodes as $nodes){
// Only direct <a> children of the <h2> are considered.
if($nodes->tagName == 'a'){
// NOTE(review): this assignment replaces $title_result (initialised as an array above)
// with a string on every match, so only the last title on the page survives the loop.
// Single quotes are removed, presumably so the text can sit inside the quoted SQL
// literal built below.
$title_result = str_replace("'", "", $nodes->textContent);
/* echo '<pre>';
print_r($title_result);*/
}
}
}
/* ############################################################################# Time ###################################################################*/
$time_result = array();
$times = @$dom_document_pages->getElementsByTagName('time');
foreach ($times as $time) {
$date = new DateTime();
// Drop the comma, parse the text with strtotime(), and reformat it as Y-m-d.
$date_replace = str_replace(",", "", $time->textContent); //Feb 10, 2016
$string_to_time = strtotime($date_replace);
$date->setTimestamp($string_to_time);
// NOTE(review): as with the titles, this overwrites the array with a single string,
// so only the last date on the page is kept.
$time_result = $date->format('Y-m-d');
/*echo '<pre>';
print_r($time_result);*/
}
// NOTE(review): the query is built once per page, after both inner loops have finished,
// from the last title and last date only; values are concatenated directly into the SQL.
$query_insert = "INSERT INTO article_forbes(title, datetime) VALUE ('".$title_result."', '".$time_result."')";
echo '<pre>';
echo $query_insert;
// The actual database write is disabled; the statement is only printed.
//$mysqli->query($query_insert);
}
您没有在循环中插入数据,所以只插入了每页的最后一个标题和最后一个时间。
你将$title_result
和$time_result
初始化为数组,所以我怀疑你原本打算把所有的标题和时间都收集到那里。但是,你并没有在循环中把它们追加到数组里,而是用字符串覆盖了数组。请将
$title_result = str_replace("'", "", $nodes->textContent);
改为
$title_result[] = $nodes->textContent;
以便您推入数组,并对 $time_result
进行类似的更改。
那么你应该在循环中插入。
// Insert every collected (title, date) pair with one reusable prepared statement.
// Assumes dbconnect.php exposes the open connection as $mysqli, as the question's
// commented-out $mysqli->query() call suggests.
$stmt = mysqli_prepare($mysqli, "INSERT INTO article_forbes(title, datetime) VALUES (?, ?)");
// Parameters are bound by reference, so assigning $title and $time inside the loop
// is enough to feed each execution.
mysqli_stmt_bind_param($stmt, "ss", $title, $time);
// Stop at the shorter list so a page with unequal numbers of titles and dates
// cannot read past the end of either array.
$row_count = min(count($title_result), count($time_result));
for ($i = 0; $i < $row_count; $i++) {
    $title = $title_result[$i];
    $time = $time_result[$i];
    mysqli_stmt_execute($stmt);
}
使用这样的预处理语句(prepared statement)意味着您不需要再从标题中删除特殊字符。