抓取文件以转换为 json
Scraping file to convert to json
我没有像预期的那样从下半部分得到任何输出。我可以抓取顶部 table 的数据,但我也在尝试抓取底部 table 的数据,并将它们一起编码为 json。我需要抓取的列是:
1. 周 (week)、日期 (date)、主队 (home)、客队 (away)、地点 (at)、备注 (notes)
<?php
// Scrapes every <table> whose class attribute contains "report" from the
// fetched page, maps the rows of two of those tables onto $keys / $keys2,
// groups the rows in $pre, and prints one merged record as pretty JSON.

// Collect libxml parse errors internally instead of emitting PHP warnings
// for malformed HTML.
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$doc->loadHTML(file_get_contents('https://www.leagueleader.net/sharedreport.php?operatorid=98&code=bc155b01-7492-412d-aa75-3c1e357248f1'));
// NOTE(review): assigned after loadHTML() has already run, so it cannot
// influence the parse above.
$doc->strictErrorChecking = false;
// $pre: rows grouped by key; each group is a list of row arrays.
$pre = [];
// Column names applied to rows of the table at index 1 and index 2 respectively.
$keys = ['team', 'div', 'team-site-name', 'site-address', 'site-phone'];
$keys2 = ['week', 'date', 'home', 'away', 'at', 'notes'];
// $k is the table's position among ALL <table> elements in the document,
// not its position among the "report" tables.
foreach ($doc->getElementsByTagName('table') as $k => $table) {
if (strpos($table->getAttribute('class'), 'report') === false) {
continue;
}
foreach ($table->getElementsByTagName('tr') as $i => $tr) {
if ($tr->parentNode->nodeName === 'thead') continue; // skip headers
$row_values = [];
// childNodes contains text nodes as well as cells; every node whose value
// trims to '' is dropped.
// NOTE(review): empty cells are dropped too, so the remaining values shift
// left and may no longer line up with $keys / $keys2. trim() also leaves a
// leading non-breaking space (U+00A0) in place.
foreach ($tr->childNodes as $td) {
$text = trim($td->nodeValue);
if ($text === '') continue;
$row_values[] = $text;
}
// array_combine() needs both arrays to have the same number of elements
// (ValueError on PHP 8, false plus a warning on older versions).
if($k == 1 ){
$row_values = array_combine($keys, $row_values);
}else if($k == 2 ){
// NOTE(review): removes the second collected value before combining;
// presumably a column with no entry in $keys2 - confirm against the page markup.
unset($row_values[1]);
$row_values = array_combine($keys2, $row_values);
}
// NOTE(review): neither $keys nor $keys2 contains 'name', so this index is
// undefined and every row is appended to the single group keyed by ''.
$pre[$row_values['name']][] = $row_values;
}
}
$combined = [];
// $week receives the group key of $pre (not a week value from the table).
foreach($pre as $week => $row){
// NOTE(review): $name is never assigned anywhere in this script, so the
// result is stored under the '' key and overwritten on each iteration.
$combined[$name] = [
"week"=> $week,
// $row[0] is the first row stored in the group.
"team"=> $row[0]['team'],
"div"=> $row[0]['div'],
"team-site-name" => $row[0]['team-site-name'],
"site-address" => $row[0]['site-address'],
"site-phone" => $row[0]['site-phone'],
//"week" => $row[1]['week'],
// $row[1] is the second row stored in the group; it only has these keys
// if that row was combined with $keys2.
"date" => $row[1]['date'],
"home" => $row[1]['home'],
"away" => $row[1]['away'],
"at" => $row[1]['at'],
"notes" => $row[1]['notes']
];
}
echo '<pre>'.json_encode($combined, JSON_PRETTY_PRINT).'</pre>';
?>
这是输出
{
"": {
"week": "",
"team": "1",
"div": "A",
"team-site-name": "Team 01Freer Bar",
"site-address": "\u00a07355 Michigan Ave Detroit, MI 48210",
"site-phone": "\u00a03138993699",
"date": null,
"home": null,
"away": null,
"at": null,
"notes": null
}
}
为了从第二个 table 中获取比赛数据,我已将处理方式改为使用 XPath。它从第二个带有 class='report' 的 table 的正文 (tbody) 中提取 <tr> 标签(使用 //table[@class='report'][2]/tbody/tr)。
这样会返回 table 正文中的所有行。然后提取每一行中的所有 <td> 元素,并从中挑出该行的详细信息。如果该行包含 week/date,就用它覆盖当前保存的值;如果该行包含比赛详细信息,就在输出中创建一条记录...
// Extract the schedule rows from the second "report" table via XPath.
// Requires $doc (a DOMDocument that already holds the parsed page) in scope.
// Builds $matches: a list of arrays with the keys
// week, date, teamHome, teamAway, at and notes, then prints it.
$xpath = new DOMXPath($doc);
$reportRow = $xpath->query("//table[@class='report'][2]/tbody/tr");
$matches = [];
// Week and date are carried forward: a row that leaves those cells blank
// reuses the most recent non-blank week/date.
$week = '';
$date = '';
foreach ($reportRow as $row) {
    $cells = $row->getElementsByTagName("td");

    // Read the six expected columns. item() returns null for a missing cell,
    // so a short row yields '' instead of dereferencing null.
    $values = [];
    for ($col = 0; $col < 6; $col++) {
        $cell = $cells->item($col);
        $values[] = $cell === null ? '' : trim($cell->textContent);
    }
    list($rowWeek, $rowDate, $teamHome, $teamAway, $at, $notes) = $values;

    // Set week and date if present in the current row. Compare against ''
    // explicitly: a truthiness test would treat the string "0" as blank.
    if ($rowWeek !== '') {
        $week = $rowWeek;
    }
    if ($rowDate !== '') {
        $date = $rowDate;
    }

    // If there are some match details, then store them.
    if ($teamHome !== '') {
        $matches[] = [
            "week" => $week,
            "date" => $date,
            "teamHome" => $teamHome,
            "teamAway" => $teamAway,
            "at" => $at,
            "notes" => $notes,
        ];
    }
}
print_r($matches);
输出如下...
Array
(
[0] => Array
(
[week] => 1
[date] => 09/10/2019
[teamHome] => Team 01
[teamAway] => BYE
[at] => BYE
[notes] =>
)
我没有像预期的那样从下半部分得到任何输出。我可以抓取顶部 table 的数据,但我也在尝试抓取底部 table 的数据,并将它们一起编码为 json。我需要抓取的列是:
1. 周 (week)、日期 (date)、主队 (home)、客队 (away)、地点 (at)、备注 (notes)
<?php
// Scrapes every <table> whose class attribute contains "report" from the
// fetched page, maps the rows of two of those tables onto $keys / $keys2,
// groups the rows in $pre, and prints one merged record as pretty JSON.

// Collect libxml parse errors internally instead of emitting PHP warnings
// for malformed HTML.
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$doc->loadHTML(file_get_contents('https://www.leagueleader.net/sharedreport.php?operatorid=98&code=bc155b01-7492-412d-aa75-3c1e357248f1'));
// NOTE(review): assigned after loadHTML() has already run, so it cannot
// influence the parse above.
$doc->strictErrorChecking = false;
// $pre: rows grouped by key; each group is a list of row arrays.
$pre = [];
// Column names applied to rows of the table at index 1 and index 2 respectively.
$keys = ['team', 'div', 'team-site-name', 'site-address', 'site-phone'];
$keys2 = ['week', 'date', 'home', 'away', 'at', 'notes'];
// $k is the table's position among ALL <table> elements in the document,
// not its position among the "report" tables.
foreach ($doc->getElementsByTagName('table') as $k => $table) {
if (strpos($table->getAttribute('class'), 'report') === false) {
continue;
}
foreach ($table->getElementsByTagName('tr') as $i => $tr) {
if ($tr->parentNode->nodeName === 'thead') continue; // skip headers
$row_values = [];
// childNodes contains text nodes as well as cells; every node whose value
// trims to '' is dropped.
// NOTE(review): empty cells are dropped too, so the remaining values shift
// left and may no longer line up with $keys / $keys2. trim() also leaves a
// leading non-breaking space (U+00A0) in place.
foreach ($tr->childNodes as $td) {
$text = trim($td->nodeValue);
if ($text === '') continue;
$row_values[] = $text;
}
// array_combine() needs both arrays to have the same number of elements
// (ValueError on PHP 8, false plus a warning on older versions).
if($k == 1 ){
$row_values = array_combine($keys, $row_values);
}else if($k == 2 ){
// NOTE(review): removes the second collected value before combining;
// presumably a column with no entry in $keys2 - confirm against the page markup.
unset($row_values[1]);
$row_values = array_combine($keys2, $row_values);
}
// NOTE(review): neither $keys nor $keys2 contains 'name', so this index is
// undefined and every row is appended to the single group keyed by ''.
$pre[$row_values['name']][] = $row_values;
}
}
$combined = [];
// $week receives the group key of $pre (not a week value from the table).
foreach($pre as $week => $row){
// NOTE(review): $name is never assigned anywhere in this script, so the
// result is stored under the '' key and overwritten on each iteration.
$combined[$name] = [
"week"=> $week,
// $row[0] is the first row stored in the group.
"team"=> $row[0]['team'],
"div"=> $row[0]['div'],
"team-site-name" => $row[0]['team-site-name'],
"site-address" => $row[0]['site-address'],
"site-phone" => $row[0]['site-phone'],
//"week" => $row[1]['week'],
// $row[1] is the second row stored in the group; it only has these keys
// if that row was combined with $keys2.
"date" => $row[1]['date'],
"home" => $row[1]['home'],
"away" => $row[1]['away'],
"at" => $row[1]['at'],
"notes" => $row[1]['notes']
];
}
echo '<pre>'.json_encode($combined, JSON_PRETTY_PRINT).'</pre>';
?>
这是输出
{
"": {
"week": "",
"team": "1",
"div": "A",
"team-site-name": "Team 01Freer Bar",
"site-address": "\u00a07355 Michigan Ave Detroit, MI 48210",
"site-phone": "\u00a03138993699",
"date": null,
"home": null,
"away": null,
"at": null,
"notes": null
}
}
为了从第二个 table 中获取比赛数据,我已将处理方式改为使用 XPath。它从第二个带有 class='report' 的 table 的正文 (tbody) 中提取 <tr> 标签(使用 //table[@class='report'][2]/tbody/tr)。
这样会返回 table 正文中的所有行。然后提取每一行中的所有 <td> 元素,并从中挑出该行的详细信息。如果该行包含 week/date,就用它覆盖当前保存的值;如果该行包含比赛详细信息,就在输出中创建一条记录...
// Extract the schedule rows from the second "report" table via XPath.
// Requires $doc (a DOMDocument that already holds the parsed page) in scope.
// Builds $matches: a list of arrays with the keys
// week, date, teamHome, teamAway, at and notes, then prints it.
$xpath = new DOMXPath($doc);
$reportRow = $xpath->query("//table[@class='report'][2]/tbody/tr");
$matches = [];
// Week and date are carried forward: a row that leaves those cells blank
// reuses the most recent non-blank week/date.
$week = '';
$date = '';
foreach ($reportRow as $row) {
    $cells = $row->getElementsByTagName("td");

    // Read the six expected columns. item() returns null for a missing cell,
    // so a short row yields '' instead of dereferencing null.
    $values = [];
    for ($col = 0; $col < 6; $col++) {
        $cell = $cells->item($col);
        $values[] = $cell === null ? '' : trim($cell->textContent);
    }
    list($rowWeek, $rowDate, $teamHome, $teamAway, $at, $notes) = $values;

    // Set week and date if present in the current row. Compare against ''
    // explicitly: a truthiness test would treat the string "0" as blank.
    if ($rowWeek !== '') {
        $week = $rowWeek;
    }
    if ($rowDate !== '') {
        $date = $rowDate;
    }

    // If there are some match details, then store them.
    if ($teamHome !== '') {
        $matches[] = [
            "week" => $week,
            "date" => $date,
            "teamHome" => $teamHome,
            "teamAway" => $teamAway,
            "at" => $at,
            "notes" => $notes,
        ];
    }
}
print_r($matches);
输出如下...
Array
(
[0] => Array
(
[week] => 1
[date] => 09/10/2019
[teamHome] => Team 01
[teamAway] => BYE
[at] => BYE
[notes] =>
)