使用 DOM 解析器在头部 HTML 中获取 <script>
Get <script> in head HTML using DOM Parser
我目前正在我的项目中使用 DOM 解析器。另外,我在 PHP 中使用 cURL 来抓取网站。我想从获取到的 HTML 的 head 部分的 script 标签中取出一个值,但我不知道该怎么做。如果运行如下代码:
// Parse the fetched markup ($html is expected to hold the page source
// downloaded earlier with cURL) using the simple_html_dom library.
$data_dom = new simple_html_dom();
$data_dom->load($html);
// Visit every <script> element found in the document.
foreach($data_dom->find('script') as $script){
// NOTE(review): this prints only "<br>" for each script element, i.e. plaintext
// comes back empty here. Presumably simple_html_dom does not expose script
// bodies through plaintext and innertext would be needed instead -- TODO confirm
// against the library documentation.
echo $script->plaintext."<br>";
}
结果是空的:我检查输出时只看到了 br 标签。我想获取所有 script 标签中的内容。下面是 head 部分的内容:
<head>
I will give you the script I want to get
.....
<script type="text/javascript">
var keysearch = {"departureLabel":"Surabaya (SUB : Juanda) Jawa Timur Indonesia","arrivalLabel":"Palangkaraya (PKY : Tjilik Riwut | Panarung) Kalimantan Tengah Indonesia","adultNum":"1","childNum":"0","infantNum":"0","departure":"SUB","arrival":"PKY","departDate":"20181115","roundTrip":0,"cabinType":-1,"departureCode":"ID-Surabaya-SUB","arrivalCode":"ID-Palangkaraya-PKY"};
(function(window, _gtm, keysearch){
if (window.gtmInstance){
var departureExp = keysearch.departureCode.split("-");
var arrivalExp = keysearch.arrivalCode.split("-");
gtmInstance.setFlightData({
'ITEM_TYPE': 'flight',
'FLY_OUTB_CODE': departureExp[2],
'FLY_OUTB_CITY': departureExp[1],
'FLY_OUTB_COUNTRYCODE': departureExp[0],
'FLY_OUTB_DATE': keysearch.departDate,
'FLY_INB_CODE': arrivalExp[2],
'FLY_INB_CITY': arrivalExp[1],
'FLY_INB_COUNTRYCODE': arrivalExp[0],
'FLY_INB_DATE': keysearch.returnDate,
'FLY_NBPAX_ADL': keysearch.adultNum,
'FLY_NBPAX_CHL': keysearch.childNum,
'FLY_NBPAX_INF': keysearch.infantNum,
});
gtmInstance.pushFlightSearchEvent();
}
}(window, gtmInstance, keysearch));
var key = "rkey=10fe7b6fd1f7fa1ef0f4fa538f917811dbc7f4628a791ba69962f2ed305fb72d061b67737afd843aaaeeee946f1442bb";
var staticRoot = 'http://sta.nusatrip.net';
$(function() {
$("#currencySelector").nusaCurrencyOptions({
selected: getCookie("curCode"),
});
});
</script>
</head>
我想获取变量 key 的值。我将用它从该网站获取数据。谢谢
根据其余标记的结构,您可以只使用 DOMDocument 和 XPath,然后用 preg_match 解析出该变量的值。此示例将输出变量 key 的值。
<?php
// Sample page markup. A nowdoc (<<<'END') is used instead of a heredoc so the
// "$" characters in the embedded jQuery code can never be picked up by PHP
// variable interpolation; the resulting string is unchanged.
$html = <<<'END'
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
<script type="text/javascript">
var keysearch = {"departureLabel":"Surabaya (SUB : Juanda) Jawa Timur Indonesia","arrivalLabel":"Palangkaraya (PKY : Tjilik Riwut | Panarung) Kalimantan Tengah Indonesia","adultNum":"1","childNum":"0","infantNum":"0","departure":"SUB","arrival":"PKY","departDate":"20181115","roundTrip":0,"cabinType":-1,"departureCode":"ID-Surabaya-SUB","arrivalCode":"ID-Palangkaraya-PKY"};
(function(window, _gtm, keysearch){
if (window.gtmInstance){
var departureExp = keysearch.departureCode.split("-");
var arrivalExp = keysearch.arrivalCode.split("-");
gtmInstance.setFlightData({
'ITEM_TYPE': 'flight',
'FLY_OUTB_CODE': departureExp[2],
'FLY_OUTB_CITY': departureExp[1],
'FLY_OUTB_COUNTRYCODE': departureExp[0],
'FLY_OUTB_DATE': keysearch.departDate,
'FLY_INB_CODE': arrivalExp[2],
'FLY_INB_CITY': arrivalExp[1],
'FLY_INB_COUNTRYCODE': arrivalExp[0],
'FLY_INB_DATE': keysearch.returnDate,
'FLY_NBPAX_ADL': keysearch.adultNum,
'FLY_NBPAX_CHL': keysearch.childNum,
'FLY_NBPAX_INF': keysearch.infantNum,
});
gtmInstance.pushFlightSearchEvent();
}
}(window, gtmInstance, keysearch));
var key = "rkey=10fe7b6fd1f7fa1ef0f4fa538f917811dbc7f4628a791ba69962f2ed305fb72d061b67737afd843aaaeeee946f1442bb";
var staticRoot = 'http://sta.nusatrip.net';
$(function() {
$("#currencySelector").nusaCurrencyOptions({
selected: getCookie("curCode"),
});
});
</script>
</head>
<body>foo</body>
</html>
END;

// Guarded so the snippet can be included more than once without a
// "cannot redeclare function" fatal error.
if (!function_exists('findJsStringVar')) {
    /**
     * Collect the value of a double-quoted JavaScript string variable
     * (`var <name> = "...";`) from the <script> elements of an HTML document.
     *
     * @param string $html    Raw HTML markup to search.
     * @param string $varName Name of the JavaScript variable to look for.
     *
     * @return string[] Captured values in document order: at most one entry per
     *                  <script> element (the first match inside that element).
     *                  The text between the quotes is returned as written, so
     *                  JavaScript escape sequences are not decoded. Empty array
     *                  when the input is empty, cannot be parsed, or has no match.
     */
    function findJsStringVar($html, $varName = 'key')
    {
        // DOMDocument::loadHTML() rejects an empty string (warning or exception,
        // depending on the PHP version), so bail out before calling it.
        if (!is_string($html) || trim($html) === '') {
            return [];
        }

        // Real-world pages are rarely valid HTML; collect libxml's parse
        // complaints internally instead of letting them surface as PHP
        // warnings, then restore whatever setting the caller had.
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if (!$loaded) {
            return [];
        }

        // - \s+ / \s* tolerate any spacing around the name and the "=".
        // - The capture stops at the first unescaped double quote, so further
        //   quoted strings on the same line are not swallowed (a greedy ".*"
        //   would run up to the last quote on the line).
        // - preg_quote() keeps names containing regex metacharacters (e.g. "$")
        //   literal.
        $pattern = '/\bvar\s+' . preg_quote($varName, '/') . '\s*=\s*"((?:[^"\\\\]|\\\\.)*)"/';

        $values = [];
        $xpath = new DOMXPath($dom);
        foreach ($xpath->query('//script') as $scriptTag) {
            // $matches[0] is the whole `var name = "..."` text,
            // $matches[1] is just the value between the quotes.
            if (preg_match($pattern, $scriptTag->nodeValue, $matches) === 1) {
                $values[] = $matches[1];
            }
        }

        return $values;
    }
}

// Print every value found for the JavaScript variable "key", one per line.
foreach (findJsStringVar($html, 'key') as $key) {
    echo $key.PHP_EOL;
}
我目前正在我的项目中使用 DOM 解析器。另外,我在 PHP 中使用 cURL 来抓取网站。我想从获取到的 HTML 的 head 部分的 script 标签中取出一个值,但我不知道该怎么做。如果运行如下代码:
// Parse the fetched markup ($html is expected to hold the page source
// downloaded earlier with cURL) using the simple_html_dom library.
$data_dom = new simple_html_dom();
$data_dom->load($html);
// Visit every <script> element found in the document.
foreach($data_dom->find('script') as $script){
// NOTE(review): this prints only "<br>" for each script element, i.e. plaintext
// comes back empty here. Presumably simple_html_dom does not expose script
// bodies through plaintext and innertext would be needed instead -- TODO confirm
// against the library documentation.
echo $script->plaintext."<br>";
}
结果是空的:我检查输出时只看到了 br 标签。我想获取所有 script 标签中的内容。下面是 head 部分的内容:
<head>
I will give you the script I want to get
.....
<script type="text/javascript">
var keysearch = {"departureLabel":"Surabaya (SUB : Juanda) Jawa Timur Indonesia","arrivalLabel":"Palangkaraya (PKY : Tjilik Riwut | Panarung) Kalimantan Tengah Indonesia","adultNum":"1","childNum":"0","infantNum":"0","departure":"SUB","arrival":"PKY","departDate":"20181115","roundTrip":0,"cabinType":-1,"departureCode":"ID-Surabaya-SUB","arrivalCode":"ID-Palangkaraya-PKY"};
(function(window, _gtm, keysearch){
if (window.gtmInstance){
var departureExp = keysearch.departureCode.split("-");
var arrivalExp = keysearch.arrivalCode.split("-");
gtmInstance.setFlightData({
'ITEM_TYPE': 'flight',
'FLY_OUTB_CODE': departureExp[2],
'FLY_OUTB_CITY': departureExp[1],
'FLY_OUTB_COUNTRYCODE': departureExp[0],
'FLY_OUTB_DATE': keysearch.departDate,
'FLY_INB_CODE': arrivalExp[2],
'FLY_INB_CITY': arrivalExp[1],
'FLY_INB_COUNTRYCODE': arrivalExp[0],
'FLY_INB_DATE': keysearch.returnDate,
'FLY_NBPAX_ADL': keysearch.adultNum,
'FLY_NBPAX_CHL': keysearch.childNum,
'FLY_NBPAX_INF': keysearch.infantNum,
});
gtmInstance.pushFlightSearchEvent();
}
}(window, gtmInstance, keysearch));
var key = "rkey=10fe7b6fd1f7fa1ef0f4fa538f917811dbc7f4628a791ba69962f2ed305fb72d061b67737afd843aaaeeee946f1442bb";
var staticRoot = 'http://sta.nusatrip.net';
$(function() {
$("#currencySelector").nusaCurrencyOptions({
selected: getCookie("curCode"),
});
});
</script>
</head>
我想获取变量 key 的值。我将用它从该网站获取数据。谢谢
根据其余标记的结构,您可以只使用 DOMDocument 和 XPath,然后用 preg_match 解析出该变量的值。此示例将输出变量 key 的值。
<?php
// Sample page markup. A nowdoc (<<<'END') is used instead of a heredoc so the
// "$" characters in the embedded jQuery code can never be picked up by PHP
// variable interpolation; the resulting string is unchanged.
$html = <<<'END'
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
<script type="text/javascript">
var keysearch = {"departureLabel":"Surabaya (SUB : Juanda) Jawa Timur Indonesia","arrivalLabel":"Palangkaraya (PKY : Tjilik Riwut | Panarung) Kalimantan Tengah Indonesia","adultNum":"1","childNum":"0","infantNum":"0","departure":"SUB","arrival":"PKY","departDate":"20181115","roundTrip":0,"cabinType":-1,"departureCode":"ID-Surabaya-SUB","arrivalCode":"ID-Palangkaraya-PKY"};
(function(window, _gtm, keysearch){
if (window.gtmInstance){
var departureExp = keysearch.departureCode.split("-");
var arrivalExp = keysearch.arrivalCode.split("-");
gtmInstance.setFlightData({
'ITEM_TYPE': 'flight',
'FLY_OUTB_CODE': departureExp[2],
'FLY_OUTB_CITY': departureExp[1],
'FLY_OUTB_COUNTRYCODE': departureExp[0],
'FLY_OUTB_DATE': keysearch.departDate,
'FLY_INB_CODE': arrivalExp[2],
'FLY_INB_CITY': arrivalExp[1],
'FLY_INB_COUNTRYCODE': arrivalExp[0],
'FLY_INB_DATE': keysearch.returnDate,
'FLY_NBPAX_ADL': keysearch.adultNum,
'FLY_NBPAX_CHL': keysearch.childNum,
'FLY_NBPAX_INF': keysearch.infantNum,
});
gtmInstance.pushFlightSearchEvent();
}
}(window, gtmInstance, keysearch));
var key = "rkey=10fe7b6fd1f7fa1ef0f4fa538f917811dbc7f4628a791ba69962f2ed305fb72d061b67737afd843aaaeeee946f1442bb";
var staticRoot = 'http://sta.nusatrip.net';
$(function() {
$("#currencySelector").nusaCurrencyOptions({
selected: getCookie("curCode"),
});
});
</script>
</head>
<body>foo</body>
</html>
END;

// Guarded so the snippet can be included more than once without a
// "cannot redeclare function" fatal error.
if (!function_exists('findJsStringVar')) {
    /**
     * Collect the value of a double-quoted JavaScript string variable
     * (`var <name> = "...";`) from the <script> elements of an HTML document.
     *
     * @param string $html    Raw HTML markup to search.
     * @param string $varName Name of the JavaScript variable to look for.
     *
     * @return string[] Captured values in document order: at most one entry per
     *                  <script> element (the first match inside that element).
     *                  The text between the quotes is returned as written, so
     *                  JavaScript escape sequences are not decoded. Empty array
     *                  when the input is empty, cannot be parsed, or has no match.
     */
    function findJsStringVar($html, $varName = 'key')
    {
        // DOMDocument::loadHTML() rejects an empty string (warning or exception,
        // depending on the PHP version), so bail out before calling it.
        if (!is_string($html) || trim($html) === '') {
            return [];
        }

        // Real-world pages are rarely valid HTML; collect libxml's parse
        // complaints internally instead of letting them surface as PHP
        // warnings, then restore whatever setting the caller had.
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if (!$loaded) {
            return [];
        }

        // - \s+ / \s* tolerate any spacing around the name and the "=".
        // - The capture stops at the first unescaped double quote, so further
        //   quoted strings on the same line are not swallowed (a greedy ".*"
        //   would run up to the last quote on the line).
        // - preg_quote() keeps names containing regex metacharacters (e.g. "$")
        //   literal.
        $pattern = '/\bvar\s+' . preg_quote($varName, '/') . '\s*=\s*"((?:[^"\\\\]|\\\\.)*)"/';

        $values = [];
        $xpath = new DOMXPath($dom);
        foreach ($xpath->query('//script') as $scriptTag) {
            // $matches[0] is the whole `var name = "..."` text,
            // $matches[1] is just the value between the quotes.
            if (preg_match($pattern, $scriptTag->nodeValue, $matches) === 1) {
                $values[] = $matches[1];
            }
        }

        return $values;
    }
}

// Print every value found for the JavaScript variable "key", one per line.
foreach (findJsStringVar($html, 'key') as $key) {
    echo $key.PHP_EOL;
}