How to get the contents of this web page in CSV or TSV format with curl or wget
I have the following link:
I want to download all of the tables in this database and convert them to CSV or TSV files. Is there any curl or wget command that would let me parse this database?
Something like this:
wget -r -np -k http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet\?method\=loadAircraftConditionsResultPage\&enterpriseName\=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8\&licenceCode\=\&partsNumber\=\&partsName\=\&ataChaptersection\=
only gives me the site's source code, not the contents of the tables themselves.
If you want JSON output, the URL is
http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?method=aircraftConditionsQuery&iColumns=10&sColumns=&iDisplayStart=0&iDisplayLength=20&mDataProp_0=enterpriseName&mDataProp_1=licenceCode&mDataProp_2=partsNumber&mDataProp_3=partsName&mDataProp_4=ataChaptersection&mDataProp_5=manufacturers&mDataProp_6=6&mDataProp_7=fileToAccord&mDataProp_8=mainDevices&mDataProp_9=remark&enterpriseName=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&licenceCode=&partsNumber=&partsName=&ataChaptersection=
In the URL, pagination happens through the iDisplayStart parameter, which starts at 0 and moves in steps of 20 results.
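Purely as an illustration of that pagination scheme, here is a rough Python sketch that builds the first few page URLs (only the method and the two paging parameters are shown; the real request also carries the mDataProp_* and filter fields, and nothing else about the API is assumed):

from urllib.parse import urlencode

BASE = "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet"

def page_url(start, length=20):
    # iDisplayStart is the 0-based offset of the first row, iDisplayLength the rows per page
    params = {"method": "aircraftConditionsQuery", "iDisplayStart": start, "iDisplayLength": length}
    return f"{BASE}?{urlencode(params)}"

# the first six pages: offsets 0, 20, 40, 60, 80, 100
for url in [page_url(start) for start in range(0, 101, 20)]:
    print(url)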
Using GNU parallel, jq and Miller you can download everything and convert it to CSV.
For example, to download the first 6 JSON outputs (iDisplayStart from 0 to 100), you can run
echo {0..100..20} | tr " " "\n" | parallel -j0 'curl "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?method=aircraftConditionsQuery&iColumns=10&sColumns=&iDisplayStart={}&iDisplayLength=20&mDataProp_0=enterpriseName&mDataProp_1=licenceCode&mDataProp_2=partsNumber&mDataProp_3=partsName&mDataProp_4=ataChaptersection&mDataProp_5=manufacturers&mDataProp_6=6&mDataProp_7=fileToAccord&mDataProp_8=mainDevices&mDataProp_9=remark&enterpriseName=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&licenceCode=&partsNumber=&partsName=&ataChaptersection=" | jq -c '"'"'.aaData[]'"'"' >{}.jsonl'
As output you will have 6 JSONL files.
To merge them into a single CSV file, you can run
mlr --j2c unsparsify *.jsonl >output.csv
The result will look like https://gist.github.com/aborruso/777fce957865011c85a7a689cd0bd413
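If Miller is not at hand, a pandas equivalent of that merge step should also work (a sketch, assuming the *.jsonl files produced above sit in the current directory):

import glob
import pandas as pd

# Read every JSON-lines file and concatenate; keys missing from some
# records simply become empty cells, similar to mlr's unsparsify.
frames = [pd.read_json(path, lines=True) for path in sorted(glob.glob("*.jsonl"))]
pd.concat(frames, ignore_index=True).to_csv("output.csv", index=False)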
Since this has a web-scraping tag, I think you could try Python if you want to scrape ALL the tables from that site. By ALL I mean all 2347 pages combined into a single .csv file.
Before moving on to the code, I'd like to share a small disclaimer:
The code below relies heavily on multi-threading for requesting data from the server, which might result in connection refusal / forced interruption and/or getting banned from further connections.
Use it wisely and at your own risk.
Having said that, you don't actually have to run the code below, since the entire dump is available as caac_gov_merged_.csv. That way you won't put unnecessary strain on the server by over-scraping. However, if you want to do it yourself, check out the code.
The code:
import functools
import json
import pathlib
import random
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path, PurePath
from urllib.parse import urlencode

import pandas as pd
import requests

BATCH_DIR = "./batches"
RESULTS_DIR = "./batch_results"
DEFAULT_BATCH_SIZE = 250


def timer(func):
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = time.perf_counter()
        value = func(*args, **kwargs)
        end_time = time.perf_counter()
        run_time = end_time - start_time
        print(f"Finished {func.__name__!r} in {run_time:.4f} secs")
        return value
    return wrapper_timer


def build_payloads() -> list:
    return [
        [
            ('method', 'aircraftConditionsQuery'),
            ('iDisplayStart', page),
            ('iDisplayLength', '20'),
            ('enterpriseName', '武汉航达航空科技发展有限公司'),
        ] for page in [i for i in range(0, 46921, 10)]
    ]


def create_urls(payloads: list) -> list:
    base_url = "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?"
    return [f"{base_url}{urlencode(payload)}" for payload in payloads]


def urls_to_batches(urls: list, step: int = DEFAULT_BATCH_SIZE) -> list:
    return [urls[i:i+step] for i in range(0, len(urls), step)]


def prepare_batches(url_batches: list, batch_dir: str = BATCH_DIR):
    pathlib.Path(batch_dir).mkdir(parents=False, exist_ok=False)
    for number, batch in enumerate(url_batches, start=1):
        batch_file = PurePath(batch_dir).joinpath(f"batch_{number}.batch")
        with open(batch_file, "w") as out:
            out.writelines("\n".join(batch))


def previous_jobs() -> bool:
    return Path(BATCH_DIR).exists()


def read_files(batch_dir: str = BATCH_DIR) -> iter:
    yield from (path for path in Path(batch_dir).iterdir())


def open_batch_file(batch_file_path: Path) -> list:
    with open(batch_file_path) as batch_file:
        return [line.strip() for line in batch_file.readlines()]


def get_data(api_url: str) -> list:
    return requests.get(api_url).json()["aaData"]


@timer
def thread(batch_urls: list) -> list:
    results = []
    with ThreadPoolExecutor() as executor:
        for result in [executor.submit(get_data, url) for url in batch_urls]:
            results.extend(result.result())
    return results


def dump_thread_results(
        results: list,
        batch_file_path: Path,
        batch_results: str = RESULTS_DIR,
):
    pathlib.Path(batch_results).mkdir(parents=False, exist_ok=True)
    output_file = f"{batch_file_path.name.rsplit('.')[0]}.json"
    with open(PurePath(batch_results).joinpath(output_file), "w") as out:
        json.dump(results, out, indent=4, sort_keys=True)


def wait(start: int = 60, stop: int = 180):
    sleep_for = random.randint(start, stop)
    print(f"Sleeping for {sleep_for} seconds.")
    time.sleep(sleep_for)


def scrape_data():
    for batch in read_files():
        print(f"Processing {batch}...")
        print(f"There are {len(list(read_files()))} batches left.")
        try:
            dump_thread_results(thread(open_batch_file(batch)), batch)
            Path(batch).unlink()
            wait()
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.ChunkedEncodingError,
        ) as error:
            print(f"Connection failed: {error}")
            wait(start=180, stop=360)
            continue
    Path(BATCH_DIR).rmdir()


def merge_results(file_name: str = "caac_gov_merged_data.csv"):
    merged_batch_results = []
    for result_file in read_files(Path(RESULTS_DIR)):
        with open(result_file) as result:
            merged_batch_results.extend(json.load(result))
    pd.DataFrame(
        merged_batch_results
    ).drop(
        "pageScrollParas",
        axis=1,
    ).dropna(
        how="all",
        axis=1,
    ).to_csv(
        Path(RESULTS_DIR).joinpath(file_name),
        index=False,
    )


def run_scrapper():
    scrape_data()
    merge_results()


@timer
def main():
    if previous_jobs():
        run_scrapper()
    else:
        prepare_batches(urls_to_batches(create_urls(build_payloads())))
        run_scrapper()


if __name__ == "__main__":
    main()
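Given the disclaimer above about multi-threading, one easy way to be gentler on the server is to cap the number of worker threads. A possible variant of the thread() helper (the max_workers value of 8 is an arbitrary example; it reuses get_data() and the @timer decorator from the script above):

@timer
def thread(batch_urls: list, max_workers: int = 8) -> list:
    # Same logic as before, but the server never sees more than
    # `max_workers` concurrent connections at a time.
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for future in [executor.submit(get_data, url) for url in batch_urls]:
            results.extend(future.result())
    return results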
Is there any curl or wget command that allows me to parse this database?
No; neither curl nor wget knows how to parse JSON, and neither knows how to write CSV.
php-cli is an option, though. The command:
php -r 'function csv_quote(string $str): string { return '\''"'\'' . strtr($str, ['\''"'\'' => '\''""'\'']) . '\''"'\''; } function shittycvs(array $d): void { $str = '\'''\''; $fields = []; foreach ($d as $foo) { foreach ($foo as $key => $_) { $fields[$key] = true; } } $fields = array_keys($fields); $str .= implode(",", $fields) . "\n"; foreach ($d as $v) { foreach ($fields as $field) { if (is_array($v[$field] ?? "")) { $v[$field] = print_r($v[$field], true); } $str .= csv_quote($v[$field] ?? "") . ","; } $str = substr($str, 0, -1) . "\n"; } echo $str; } $ch = curl_init(); curl_setopt_array($ch, array(CURLOPT_RETURNTRANSFER => 1, CURLOPT_VERBOSE => 1, CURLOPT_ENCODING => '\'''\'')); $data = []; for ($i = 0; $i < 100; $i += 20) { $url = '\''http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?'\'' . http_build_query(array( '\''method'\'' => '\''aircraftConditionsQuery'\'', '\''iColumns'\'' => 10, '\''sColumns'\'' => '\'''\'', '\''mDataProp_0'\'' => '\''enterpriseName'\'', '\''mDataProp_1'\'' => '\''licenceCode'\'', '\''mDataProp_2'\'' => '\''partsNumber'\'', '\''mDataProp_3'\'' => '\''partsName'\'', '\''mDataProp_4'\'' => '\''ataChaptersection'\'', '\''mDataProp_5'\'' => '\''manufacturers'\'', '\''mDataProp_6'\'' => '\''6'\'', '\''mDataProp_7'\'' => '\''fileToAccord'\'', '\''mDataProp_8'\'' => '\''mainDevices'\'', '\''mDataProp_9'\'' => '\''remark'\'', '\''enterpriseName'\'' => '\'''\'', '\''licenceCode'\'' => '\'''\'', '\''partsNumber'\'' => '\'''\'', '\''partsName'\'' => '\'''\'', '\''ataChaptersection'\'' => '\'''\'', '\''iDisplayLength'\'' => 20, '\''iDisplayStart'\'' => $i, )); curl_setopt($ch, CURLOPT_URL, $url); $js = curl_exec($ch); $jsd = json_decode($js, true, 999, JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE); if (empty($jsd['\''aaData'\''])) { /*on last page aaData is empty..*/ break; } foreach ($jsd['\''aaData'\''] as $v) { $data[] = $v; } } shittycvs($data);'
Output: https://gist.github.com/divinity76/236d1df60a6b29783fcfb90fac12d7bf
It fetches the first 100 results, but there are over 46,000 results in total. To fetch them all, replace $i < 100 with $i < 999999 or something like that; from there it should break out of the loop on its own after a few hours.
Unpacked, the command looks like this:
<?php
function csv_quote(string $str): string
{
    return '"' . strtr($str, ['"' => '""']) . '"';
}

function shittycvs(array $d): void
{
    $str = '';
    $fields = [];
    foreach ($d as $foo) {
        foreach ($foo as $key => $_) {
            $fields[$key] = true;
        }
    }
    $fields = array_keys($fields);
    $str .= implode(",", $fields) . "\n";
    foreach ($d as $v) {
        foreach ($fields as $field) {
            if (is_array($v[$field] ?? "")) {
                $v[$field] = print_r($v[$field], true);
            }
            $str .= csv_quote($v[$field] ?? "") . ",";
        }
        $str = substr($str, 0, -1) . "\n";
    }
    echo $str;
}

$ch = curl_init();
curl_setopt_array($ch, array(CURLOPT_RETURNTRANSFER => 1, CURLOPT_VERBOSE => 1, CURLOPT_ENCODING => ''));
$data = [];
for ($i = 0; $i < 100; $i += 20) {
    $url = 'http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?' . http_build_query(array(
        'method' => 'aircraftConditionsQuery',
        'iColumns' => 10,
        'sColumns' => '',
        'mDataProp_0' => 'enterpriseName',
        'mDataProp_1' => 'licenceCode',
        'mDataProp_2' => 'partsNumber',
        'mDataProp_3' => 'partsName',
        'mDataProp_4' => 'ataChaptersection',
        'mDataProp_5' => 'manufacturers',
        'mDataProp_6' => '6',
        'mDataProp_7' => 'fileToAccord',
        'mDataProp_8' => 'mainDevices',
        'mDataProp_9' => 'remark',
        'enterpriseName' => '',
        'licenceCode' => '',
        'partsNumber' => '',
        'partsName' => '',
        'ataChaptersection' => '',
        'iDisplayLength' => 20,
        'iDisplayStart' => $i,
    ));
    curl_setopt($ch, CURLOPT_URL, $url);
    $js = curl_exec($ch);
    $jsd = json_decode($js, true, 999, JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE);
    if (empty($jsd['aaData'])) {
        /* on the last page aaData is empty */
        break;
    }
    foreach ($jsd['aaData'] as $v) {
        $data[] = $v;
    }
}
shittycvs($data);
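For completeness, the same stop-when-aaData-is-empty pagination idea as a Python sketch (not part of the original answer; it only assumes the requests library and the query parameters shown above):

import requests

rows, start = [], 0
while True:
    resp = requests.get(
        "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet",
        params={
            "method": "aircraftConditionsQuery",
            "iDisplayStart": start,
            "iDisplayLength": 20,
        },
        timeout=30,
    )
    batch = resp.json().get("aaData", [])
    if not batch:  # last page reached: aaData comes back empty
        break
    rows.extend(batch)
    start += 20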