从命令行向脚本传递参数
Passing parameters from command line to script
我正在编写一个程序来抓取以下网站:https://filmstoon.in/
我想从中找到几部电影(蝙蝠侠前传、钢铁侠、敢死队 3)和一部电视剧(权力的游戏),并抓取标题、host URL(嵌入播放器的地址)和 meta URL。我已经做到了这一点,但是,目前的代码是针对特定标题手动写死的。代码:
// Third-party HTML parser; presumably the source of file_get_html() and simple_html_dom used below.
include ("simple_html_dom.php");
// A value of 0 removes PHP's execution time limit for this run.
ini_set('max_execution_time', 0);
// Timezone applied to the date() stamps written with every scraped row.
date_default_timezone_set('Europe/Vilnius');
// Series page; getting_url() reads it through the global $link.
$link = "https://filmstoon.in/series/game-of-thrones/";
// Movie pages handed one by one to get_content_movies() further down.
$link1 = "https://filmstoon.in/batman-begins/";
$link2 = "https://filmstoon.in/iron-man/";
$link3 = "https://filmstoon.in/expendables-3/";
//TV Series
/**
 * Plain value holder for one scraped episode page.
 *
 * Stores three independent values: the page title, the "host" URL (filled
 * from an iframe src by info_from_linking()) and the URL of the episode page
 * itself. Each field stays null until its setter has been called.
 */
class episode
{
    private $title;
    private $host_url;
    private $linking_url;

    /** @return mixed Value last given to setTitle(), or null. */
    public function getTitle()
    {
        return $this->title;
    }

    /** @param mixed $title Title text to remember. */
    public function setTitle($title)
    {
        $this->title = $title;
    }

    /** @return mixed Value last given to setHost_url(), or null. */
    public function getHost_url()
    {
        return $this->host_url;
    }

    /** @param mixed $host_url Host (iframe) URL to remember. */
    public function setHost_url($host_url)
    {
        $this->host_url = $host_url;
    }

    /** @return mixed Value last given to setLinking_url(), or null. */
    public function getLinking_url()
    {
        return $this->linking_url;
    }

    /** @param mixed $linking_url URL of the episode page to remember. */
    public function setLinking_url($linking_url)
    {
        $this->linking_url = $linking_url;
    }
}
/**
 * Scrape every episode of the configured series.
 *
 * Takes the episode links returned by getting_url(), loads each one through
 * info_from_linking(), then prints the resulting row and appends it to the
 * output file.
 */
function main()
{
    foreach (getting_url() as $episode_url) {
        $episode = info_from_linking($episode_url);

        $page_url   = $episode->getLinking_url();
        $player_url = $episode->getHost_url();
        $heading    = $episode->getTitle();

        echo_to_server($page_url, $player_url, $heading);
        writeToFile($page_url, $player_url, $heading);
    }
}
/**
 * Collect the episode links listed on a series page.
 *
 * @param string|null $series_url Series page to read. When omitted (the
 *                                original call style) the global $link is used.
 * @return array href values of every ".les-content a" element, in document
 *               order; an empty array when the page could not be loaded.
 */
function getting_url($series_url = null){
    if($series_url === null){
        global $link;
        $series_url = $link;
    }
    $html = file_get_html($series_url);
    // file_get_html() does not hand back a DOM object when the download
    // fails; calling ->find() on that result would be a fatal error.
    if(!is_object($html)){
        return array();
    }
    $array_url = array();
    foreach($html->find('.les-content a') as $anchor){
        $array_url[] = $anchor->href;
    }
    return $array_url;
}
/**
 * Load one episode page and extract its title and host (iframe) URL.
 *
 * The returned object always carries $episode_link as its linking URL. Title
 * and host URL stay null when the page cannot be loaded or does not contain
 * the expected elements, instead of aborting the whole run with a fatal
 * error on a null dereference.
 *
 * @param string $episode_link URL of the episode page.
 * @return episode
 */
function info_from_linking($episode_link){
    $class = new episode;
    $class->setLinking_url($episode_link);

    $inside_linking = file_get_html($episode_link);
    if(!is_object($inside_linking)){
        return $class;
    }
    $mainDiv = $inside_linking->find('div[class="main-content main-detail"]', 0);
    if(!$mainDiv){
        return $class;
    }
    $heading = $mainDiv->find('h3[itemprop="name"]', 0);
    if($heading){
        $class->setTitle($heading->plaintext);
    }
    // Index 1: the original code reads the second iframe inside the main div.
    $player = $mainDiv->find('iframe', 1);
    if($player){
        $class->setHost_url($player->src);
    }
    return $class;
}
/**
 * Print one scraped row to standard output.
 *
 * The row is the current timestamp followed by the three arguments, each
 * separated by " \t " and terminated by " \n".
 *
 * @param mixed $linking_url URL of the scraped page.
 * @param mixed $host_url    Host (iframe) URL found on that page.
 * @param mixed $title       Title found on that page.
 */
function echo_to_server($linking_url, $host_url, $title)
{
    $stamp = date('m/d/Y H:i', time());
    echo sprintf("%s \t %s \t %s \t %s \n", $stamp, $linking_url, $host_url, $title);
}
/**
 * Append one scraped row to scrape.txt in the current working directory.
 *
 * The row is stored as the print_r() dump of a four-element array:
 * timestamp, linking URL, host URL, title.
 *
 * @param mixed $linking_url URL of the scraped page.
 * @param mixed $host_url    Host (iframe) URL found on that page.
 * @param mixed $title       Title found on that page.
 */
function writeToFile($linking_url, $host_url, $title)
{
    $row = array(
        date('m/d/Y H:i', time()),
        $linking_url,
        $host_url,
        $title,
    );
    file_put_contents('scrape.txt', print_r($row, true), FILE_APPEND);
}
// Start the TV-series scrape: main() walks every episode link returned by getting_url().
main();
//Movies
/**
 * Scrape a single movie page, print the result and append it to scrape.txt.
 *
 * Nothing is printed or written when the page cannot be downloaded or lacks
 * the "main-content main-detail" div. A heading, iframe or og:url meta tag
 * that is missing yields an empty field instead of a fatal error.
 *
 * @param string $url URL of the movie page.
 * @return void
 */
function get_content_movies($url){
    $htmlContent = file_get_contents($url);
    if($htmlContent === false){
        return;
    }
    $dom = new simple_html_dom();
    $dom->load($htmlContent);

    // Query the container once instead of running the same selector twice.
    $file = $dom->find('div[class="main-content main-detail"]', 0);
    if(!$file){
        return;
    }

    $heading = $file->find('h3[itemprop="name"]', 0);
    // Index 1: the original code reads the second iframe inside the container.
    $player = $file->find('iframe', 1);
    $meta = $dom->find('meta[property="og:url"]', 0);

    $title = $heading ? $heading->plaintext : '';
    $host_url = $player ? $player->src : '';
    $meta_link = $meta ? $meta->content : '';

    $date = date('m/d/Y H:i', time());
    echo "{$date} \t {$host_url} \t {$meta_link} \t {$title} \n";
    $result = array($title, $host_url, $meta_link, $date);
    $output = 'scrape.txt';
    file_put_contents($output, print_r($result, true), FILE_APPEND);
}
// Scrape the three hard-coded movie pages one after another.
get_content_movies($link1);
get_content_movies($link2);
get_content_movies($link3);
一切正常,但是,我想这样写:
php crawler.php batman begins
或者换成任何其他标题;这样在命令行(cmd)中执行时,脚本就会专门找到对应的电影/电视剧,并执行我编写的抓取代码。
到目前为止,我唯一的想法是:抓取整个页面,将其存储在某种数据库中(例如 .txt 文件),然后使用 $argc 和 $argv 从中查找内容。或者——转到主页并使用搜索功能:我在命令行中输入的参数会被传递到搜索表单,然后再执行脚本。
但是,由于我是新手,我实在想不明白该如何实现这些想法。
/**
 * Join the command-line arguments that follow the script name.
 *
 * Element 0 of $argv (the script name) is skipped; the remaining elements
 * are glued together with the separator. When nothing follows the script
 * name an empty string is returned without touching a missing index.
 *
 * @param array  $argv      Argument list as PHP supplies it in $argv.
 * @param string $seperator Glue placed between consecutive arguments.
 * @return string
 */
function getArgumentValues($argv, $seperator){
    return implode($seperator, array_slice($argv, 1));
}
/**
 * Find a title on a listing page and scrape the page it links to.
 *
 * The words after the script name in $argv form the title. The listing page
 * is searched (case-insensitively) for a link whose oldtitle attribute
 * starts with that title. A link containing "series" is handed to main();
 * any other link is downloaded and its title, iframe data-lazy-src and
 * og:url are printed and appended to scrape.txt.
 *
 * @param string $linkMovies URL (or path) of the listing page to search.
 * @param array  $argv       Command-line arguments, script name first.
 * @return null Always null; results are reported through output and file.
 */
function get_content_movies($linkMovies, $argv){
    $htmlContent = file_get_contents($linkMovies);
    if($htmlContent === false){
        return null;
    }
    $argvValue = getArgumentValues($argv, " ");
    // An empty title would match the first link on the page.
    if($argvValue === ''){
        return null;
    }
    // The title is user input: escape regex metacharacters and the delimiter
    // so titles such as "Face/Off" do not break the pattern.
    $quotedTitle = preg_quote($argvValue, '/');
    if(!preg_match("/href\=\"(.*?)\".*?oldtitle\=\"$quotedTitle/i", $htmlContent, $search)){
        return null;
    }
    $key = $search[1];
    if(preg_match("/series/", $key)){
        // main() reads the series page from the global $link, so point it at
        // the match; otherwise the hard-coded series would be scraped.
        global $link;
        $link = $key;
        main();
        return null;
    }
    // Only movie pages are parsed here, so download the page in this branch only.
    $htmlContent = file_get_contents($key);
    if($htmlContent === false){
        return null;
    }
    preg_match('/\<h3 itemprop\=\"name\"\>(.*)<\/h3>/iSU', $htmlContent, $title);
    preg_match('/<iframe.*data-lazy-src=\"(.*)\".*><\/iframe>/iSU', $htmlContent, $embed_url);
    preg_match('/<meta.*property="og:url".*content="(.*)".*\/>/iSU', $htmlContent, $meta_url);
    // A pattern that did not match leaves no capture; fall back to an empty field.
    $titleText = isset($title[1]) ? $title[1] : '';
    $embedLink = isset($embed_url[1]) ? $embed_url[1] : '';
    $metaLink = isset($meta_url[1]) ? $meta_url[1] : '';
    $date = date('m/d/Y H:i', time());
    echo "{$date} \t {$embedLink} \t {$metaLink} \t {$titleText} \n";
    $result = array($date, $embedLink, $metaLink, $titleText);
    $output = 'scrape.txt';
    file_put_contents($output, print_r($result, true), FILE_APPEND);
    return null;
}
// Entry point of the command-line variant: $argv carries the title words typed after the script name.
// NOTE(review): $linkMovies is never assigned in the visible code — presumably the listing page URL to search; confirm.
get_content_movies($linkMovies, $argv);
我正在编写一个程序来抓取以下网站:https://filmstoon.in/
我想从中找到几部电影(蝙蝠侠前传、钢铁侠、敢死队 3)和一部电视剧(权力的游戏),并抓取标题、host URL(嵌入播放器的地址)和 meta URL。我已经做到了这一点,但是,目前的代码是针对特定标题手动写死的。代码:
// Third-party HTML parser; presumably the source of file_get_html() and simple_html_dom used below.
include ("simple_html_dom.php");
// A value of 0 removes PHP's execution time limit for this run.
ini_set('max_execution_time', 0);
// Timezone applied to the date() stamps written with every scraped row.
date_default_timezone_set('Europe/Vilnius');
// Series page; getting_url() reads it through the global $link.
$link = "https://filmstoon.in/series/game-of-thrones/";
// Movie pages handed one by one to get_content_movies() further down.
$link1 = "https://filmstoon.in/batman-begins/";
$link2 = "https://filmstoon.in/iron-man/";
$link3 = "https://filmstoon.in/expendables-3/";
//TV Series
/**
 * Plain value holder for one scraped episode page.
 *
 * Stores three independent values: the page title, the "host" URL (filled
 * from an iframe src by info_from_linking()) and the URL of the episode page
 * itself. Each field stays null until its setter has been called.
 */
class episode
{
    private $title;
    private $host_url;
    private $linking_url;

    /** @return mixed Value last given to setTitle(), or null. */
    public function getTitle()
    {
        return $this->title;
    }

    /** @param mixed $title Title text to remember. */
    public function setTitle($title)
    {
        $this->title = $title;
    }

    /** @return mixed Value last given to setHost_url(), or null. */
    public function getHost_url()
    {
        return $this->host_url;
    }

    /** @param mixed $host_url Host (iframe) URL to remember. */
    public function setHost_url($host_url)
    {
        $this->host_url = $host_url;
    }

    /** @return mixed Value last given to setLinking_url(), or null. */
    public function getLinking_url()
    {
        return $this->linking_url;
    }

    /** @param mixed $linking_url URL of the episode page to remember. */
    public function setLinking_url($linking_url)
    {
        $this->linking_url = $linking_url;
    }
}
/**
 * Scrape every episode of the configured series.
 *
 * Takes the episode links returned by getting_url(), loads each one through
 * info_from_linking(), then prints the resulting row and appends it to the
 * output file.
 */
function main()
{
    foreach (getting_url() as $episode_url) {
        $episode = info_from_linking($episode_url);

        $page_url   = $episode->getLinking_url();
        $player_url = $episode->getHost_url();
        $heading    = $episode->getTitle();

        echo_to_server($page_url, $player_url, $heading);
        writeToFile($page_url, $player_url, $heading);
    }
}
/**
 * Collect the episode links listed on a series page.
 *
 * @param string|null $series_url Series page to read. When omitted (the
 *                                original call style) the global $link is used.
 * @return array href values of every ".les-content a" element, in document
 *               order; an empty array when the page could not be loaded.
 */
function getting_url($series_url = null){
    if($series_url === null){
        global $link;
        $series_url = $link;
    }
    $html = file_get_html($series_url);
    // file_get_html() does not hand back a DOM object when the download
    // fails; calling ->find() on that result would be a fatal error.
    if(!is_object($html)){
        return array();
    }
    $array_url = array();
    foreach($html->find('.les-content a') as $anchor){
        $array_url[] = $anchor->href;
    }
    return $array_url;
}
/**
 * Load one episode page and extract its title and host (iframe) URL.
 *
 * The returned object always carries $episode_link as its linking URL. Title
 * and host URL stay null when the page cannot be loaded or does not contain
 * the expected elements, instead of aborting the whole run with a fatal
 * error on a null dereference.
 *
 * @param string $episode_link URL of the episode page.
 * @return episode
 */
function info_from_linking($episode_link){
    $class = new episode;
    $class->setLinking_url($episode_link);

    $inside_linking = file_get_html($episode_link);
    if(!is_object($inside_linking)){
        return $class;
    }
    $mainDiv = $inside_linking->find('div[class="main-content main-detail"]', 0);
    if(!$mainDiv){
        return $class;
    }
    $heading = $mainDiv->find('h3[itemprop="name"]', 0);
    if($heading){
        $class->setTitle($heading->plaintext);
    }
    // Index 1: the original code reads the second iframe inside the main div.
    $player = $mainDiv->find('iframe', 1);
    if($player){
        $class->setHost_url($player->src);
    }
    return $class;
}
/**
 * Print one scraped row to standard output.
 *
 * The row is the current timestamp followed by the three arguments, each
 * separated by " \t " and terminated by " \n".
 *
 * @param mixed $linking_url URL of the scraped page.
 * @param mixed $host_url    Host (iframe) URL found on that page.
 * @param mixed $title       Title found on that page.
 */
function echo_to_server($linking_url, $host_url, $title)
{
    $stamp = date('m/d/Y H:i', time());
    echo sprintf("%s \t %s \t %s \t %s \n", $stamp, $linking_url, $host_url, $title);
}
/**
 * Append one scraped row to scrape.txt in the current working directory.
 *
 * The row is stored as the print_r() dump of a four-element array:
 * timestamp, linking URL, host URL, title.
 *
 * @param mixed $linking_url URL of the scraped page.
 * @param mixed $host_url    Host (iframe) URL found on that page.
 * @param mixed $title       Title found on that page.
 */
function writeToFile($linking_url, $host_url, $title)
{
    $row = array(
        date('m/d/Y H:i', time()),
        $linking_url,
        $host_url,
        $title,
    );
    file_put_contents('scrape.txt', print_r($row, true), FILE_APPEND);
}
// Start the TV-series scrape: main() walks every episode link returned by getting_url().
main();
//Movies
/**
 * Scrape a single movie page, print the result and append it to scrape.txt.
 *
 * Nothing is printed or written when the page cannot be downloaded or lacks
 * the "main-content main-detail" div. A heading, iframe or og:url meta tag
 * that is missing yields an empty field instead of a fatal error.
 *
 * @param string $url URL of the movie page.
 * @return void
 */
function get_content_movies($url){
    $htmlContent = file_get_contents($url);
    if($htmlContent === false){
        return;
    }
    $dom = new simple_html_dom();
    $dom->load($htmlContent);

    // Query the container once instead of running the same selector twice.
    $file = $dom->find('div[class="main-content main-detail"]', 0);
    if(!$file){
        return;
    }

    $heading = $file->find('h3[itemprop="name"]', 0);
    // Index 1: the original code reads the second iframe inside the container.
    $player = $file->find('iframe', 1);
    $meta = $dom->find('meta[property="og:url"]', 0);

    $title = $heading ? $heading->plaintext : '';
    $host_url = $player ? $player->src : '';
    $meta_link = $meta ? $meta->content : '';

    $date = date('m/d/Y H:i', time());
    echo "{$date} \t {$host_url} \t {$meta_link} \t {$title} \n";
    $result = array($title, $host_url, $meta_link, $date);
    $output = 'scrape.txt';
    file_put_contents($output, print_r($result, true), FILE_APPEND);
}
// Scrape the three hard-coded movie pages one after another.
get_content_movies($link1);
get_content_movies($link2);
get_content_movies($link3);
一切正常,但是,我想这样写:
php crawler.php batman begins
或者换成任何其他标题;这样在命令行(cmd)中执行时,脚本就会专门找到对应的电影/电视剧,并执行我编写的抓取代码。
到目前为止,我唯一的想法是:抓取整个页面,将其存储在某种数据库中(例如 .txt 文件),然后使用 $argc 和 $argv 从中查找内容。或者——转到主页并使用搜索功能:我在命令行中输入的参数会被传递到搜索表单,然后再执行脚本。
但是,由于我是新手,我实在想不明白该如何实现这些想法。
/**
 * Join the command-line arguments that follow the script name.
 *
 * Element 0 of $argv (the script name) is skipped; the remaining elements
 * are glued together with the separator. When nothing follows the script
 * name an empty string is returned without touching a missing index.
 *
 * @param array  $argv      Argument list as PHP supplies it in $argv.
 * @param string $seperator Glue placed between consecutive arguments.
 * @return string
 */
function getArgumentValues($argv, $seperator){
    return implode($seperator, array_slice($argv, 1));
}
/**
 * Find a title on a listing page and scrape the page it links to.
 *
 * The words after the script name in $argv form the title. The listing page
 * is searched (case-insensitively) for a link whose oldtitle attribute
 * starts with that title. A link containing "series" is handed to main();
 * any other link is downloaded and its title, iframe data-lazy-src and
 * og:url are printed and appended to scrape.txt.
 *
 * @param string $linkMovies URL (or path) of the listing page to search.
 * @param array  $argv       Command-line arguments, script name first.
 * @return null Always null; results are reported through output and file.
 */
function get_content_movies($linkMovies, $argv){
    $htmlContent = file_get_contents($linkMovies);
    if($htmlContent === false){
        return null;
    }
    $argvValue = getArgumentValues($argv, " ");
    // An empty title would match the first link on the page.
    if($argvValue === ''){
        return null;
    }
    // The title is user input: escape regex metacharacters and the delimiter
    // so titles such as "Face/Off" do not break the pattern.
    $quotedTitle = preg_quote($argvValue, '/');
    if(!preg_match("/href\=\"(.*?)\".*?oldtitle\=\"$quotedTitle/i", $htmlContent, $search)){
        return null;
    }
    $key = $search[1];
    if(preg_match("/series/", $key)){
        // main() reads the series page from the global $link, so point it at
        // the match; otherwise the hard-coded series would be scraped.
        global $link;
        $link = $key;
        main();
        return null;
    }
    // Only movie pages are parsed here, so download the page in this branch only.
    $htmlContent = file_get_contents($key);
    if($htmlContent === false){
        return null;
    }
    preg_match('/\<h3 itemprop\=\"name\"\>(.*)<\/h3>/iSU', $htmlContent, $title);
    preg_match('/<iframe.*data-lazy-src=\"(.*)\".*><\/iframe>/iSU', $htmlContent, $embed_url);
    preg_match('/<meta.*property="og:url".*content="(.*)".*\/>/iSU', $htmlContent, $meta_url);
    // A pattern that did not match leaves no capture; fall back to an empty field.
    $titleText = isset($title[1]) ? $title[1] : '';
    $embedLink = isset($embed_url[1]) ? $embed_url[1] : '';
    $metaLink = isset($meta_url[1]) ? $meta_url[1] : '';
    $date = date('m/d/Y H:i', time());
    echo "{$date} \t {$embedLink} \t {$metaLink} \t {$titleText} \n";
    $result = array($date, $embedLink, $metaLink, $titleText);
    $output = 'scrape.txt';
    file_put_contents($output, print_r($result, true), FILE_APPEND);
    return null;
}
// Entry point of the command-line variant: $argv carries the title words typed after the script name.
// NOTE(review): $linkMovies is never assigned in the visible code — presumably the listing page URL to search; confirm.
get_content_movies($linkMovies, $argv);