How to get the contents of this web page in CSV or TSV format with curl or wget
I have the following link:
I want to download all of the tables in this database and convert them to CSV or TSV files. Is there any curl or wget command that would let me parse this database?
Something like this:
wget -r -np -k http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet\?method\=loadAircraftConditionsResultPage\&enterpriseName\=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8\&licenceCode\=\&partsNumber\=\&partsName\=\&ataChaptersection\=
only gives me the site's source code, not the contents of the tables themselves.
If you want JSON output, the URL is
http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?method=aircraftConditionsQuery&iColumns=10&sColumns=&iDisplayStart=0&iDisplayLength=20&mDataProp_0=enterpriseName&mDataProp_1=licenceCode&mDataProp_2=partsNumber&mDataProp_3=partsName&mDataProp_4=ataChaptersection&mDataProp_5=manufacturers&mDataProp_6=6&mDataProp_7=fileToAccord&mDataProp_8=mainDevices&mDataProp_9=remark&enterpriseName=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&licenceCode=&partsNumber=&partsName=&ataChaptersection=
In the URL, pagination happens through the iDisplayStart parameter, which starts at 0 and moves in steps of 20 results.
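Purely as an illustration of that pagination scheme, here is a rough Python sketch that builds the first few page URLs (only the method and the two paging parameters are shown; the real request also carries the mDataProp_* and filter fields, and nothing else about the API is assumed):

from urllib.parse import urlencode

BASE = "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet"

def page_url(start, length=20):
    # iDisplayStart is the 0-based offset of the first row, iDisplayLength the rows per page
    params = {"method": "aircraftConditionsQuery", "iDisplayStart": start, "iDisplayLength": length}
    return f"{BASE}?{urlencode(params)}"

# the first six pages: offsets 0, 20, 40, 60, 80, 100
for url in [page_url(start) for start in range(0, 101, 20)]:
    print(url)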
Using GNU parallel, jq and Miller you can download everything and convert it to CSV.
For example, to download the first 6 JSON outputs (iDisplayStart from 0 to 100), you can run
echo {0..100..20} | tr " " "\n" | parallel -j0 'curl "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?method=aircraftConditionsQuery&iColumns=10&sColumns=&iDisplayStart={}&iDisplayLength=20&mDataProp_0=enterpriseName&mDataProp_1=licenceCode&mDataProp_2=partsNumber&mDataProp_3=partsName&mDataProp_4=ataChaptersection&mDataProp_5=manufacturers&mDataProp_6=6&mDataProp_7=fileToAccord&mDataProp_8=mainDevices&mDataProp_9=remark&enterpriseName=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&licenceCode=&partsNumber=&partsName=&ataChaptersection=" | jq -c '"'"'.aaData[]'"'"' >{}.jsonl'
As output you will have 6 JSONL files.
To merge them into a single CSV file, you can run
mlr --j2c unsparsify *.jsonl >output.csv
The result will look like https://gist.github.com/aborruso/777fce957865011c85a7a689cd0bd413
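If Miller is not at hand, a pandas equivalent of that merge step should also work (a sketch, assuming the *.jsonl files produced above sit in the current directory):

import glob
import pandas as pd

# Read every JSON-lines file and concatenate; keys missing from some
# records simply become empty cells, similar to mlr's unsparsify.
frames = [pd.read_json(path, lines=True) for path in sorted(glob.glob("*.jsonl"))]
pd.concat(frames, ignore_index=True).to_csv("output.csv", index=False)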
Since this has a web-scraping tag, I think you could try Python if you want to scrape ALL the tables from that site. By ALL I mean all 2347 pages combined into a single .csv file.
Before moving on to the code, I'd like to share a small disclaimer:
The code below relies heavily on multi-threading for requesting data from the server, which might result in connection refusal / forced interruption and/or getting banned from further connections.
Use it wisely and at your own risk.
Having said that, you don't actually have to run the code below, since the entire dump is available as caac_gov_merged_.csv. That way you won't put unnecessary strain on the server by over-scraping. However, if you want to do it yourself, check out the code.
The code:
import functools
import json
import pathlib
import random
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path, PurePath
from urllib.parse import urlencode

import pandas as pd
import requests

BATCH_DIR = "./batches"
RESULTS_DIR = "./batch_results"
DEFAULT_BATCH_SIZE = 250


def timer(func):
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = time.perf_counter()
        value = func(*args, **kwargs)
        end_time = time.perf_counter()
        run_time = end_time - start_time
        print(f"Finished {func.__name__!r} in {run_time:.4f} secs")
        return value
    return wrapper_timer


def build_payloads() -> list:
    return [
        [
            ('method', 'aircraftConditionsQuery'),
            ('iDisplayStart', page),
            ('iDisplayLength', '20'),
            ('enterpriseName', '武汉航达航空科技发展有限公司'),
        ] for page in [i for i in range(0, 46921, 10)]
    ]


def create_urls(payloads: list) -> list:
    base_url = "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?"
    return [f"{base_url}{urlencode(payload)}" for payload in payloads]


def urls_to_batches(urls: list, step: int = DEFAULT_BATCH_SIZE) -> list:
    return [urls[i:i+step] for i in range(0, len(urls), step)]


def prepare_batches(url_batches: list, batch_dir: str = BATCH_DIR):
    pathlib.Path(batch_dir).mkdir(parents=False, exist_ok=False)
    for number, batch in enumerate(url_batches, start=1):
        batch_file = PurePath(batch_dir).joinpath(f"batch_{number}.batch")
        with open(batch_file, "w") as out:
            out.writelines("\n".join(batch))


def previous_jobs() -> bool:
    return Path(BATCH_DIR).exists()


def read_files(batch_dir: str = BATCH_DIR) -> iter:
    yield from (path for path in Path(batch_dir).iterdir())


def open_batch_file(batch_file_path: Path) -> list:
    with open(batch_file_path) as batch_file:
        return [line.strip() for line in batch_file.readlines()]


def get_data(api_url: str) -> list:
    return requests.get(api_url).json()["aaData"]


@timer
def thread(batch_urls: list) -> list:
    results = []
    with ThreadPoolExecutor() as executor:
        for result in [executor.submit(get_data, url) for url in batch_urls]:
            results.extend(result.result())
    return results


def dump_thread_results(
        results: list,
        batch_file_path: Path,
        batch_results: str = RESULTS_DIR,
):
    pathlib.Path(batch_results).mkdir(parents=False, exist_ok=True)
    output_file = f"{batch_file_path.name.rsplit('.')[0]}.json"
    with open(PurePath(batch_results).joinpath(output_file), "w") as out:
        json.dump(results, out, indent=4, sort_keys=True)


def wait(start: int = 60, stop: int = 180):
    sleep_for = random.randint(start, stop)
    print(f"Sleeping for {sleep_for} seconds.")
    time.sleep(sleep_for)


def scrape_data():
    for batch in read_files():
        print(f"Processing {batch}...")
        print(f"There are {len(list(read_files()))} batches left.")
        try:
            dump_thread_results(thread(open_batch_file(batch)), batch)
            Path(batch).unlink()
            wait()
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.ChunkedEncodingError,
        ) as error:
            print(f"Connection failed: {error}")
            wait(start=180, stop=360)
            continue
    Path(BATCH_DIR).rmdir()


def merge_results(file_name: str = "caac_gov_merged_data.csv"):
    merged_batch_results = []
    for result_file in read_files(Path(RESULTS_DIR)):
        with open(result_file) as result:
            merged_batch_results.extend(json.load(result))
    pd.DataFrame(
        merged_batch_results
    ).drop(
        "pageScrollParas",
        axis=1,
    ).dropna(
        how="all",
        axis=1,
    ).to_csv(
        Path(RESULTS_DIR).joinpath(file_name),
        index=False,
    )


def run_scrapper():
    scrape_data()
    merge_results()


@timer
def main():
    if previous_jobs():
        run_scrapper()
    else:
        prepare_batches(urls_to_batches(create_urls(build_payloads())))
        run_scrapper()


if __name__ == "__main__":
    main()
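Given the disclaimer above about multi-threading, one easy way to be gentler on the server is to cap the number of worker threads. A possible variant of the thread() helper (the max_workers value of 8 is an arbitrary example; it reuses get_data() and the @timer decorator from the script above):

@timer
def thread(batch_urls: list, max_workers: int = 8) -> list:
    # Same logic as before, but the server never sees more than
    # `max_workers` concurrent connections at a time.
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for future in [executor.submit(get_data, url) for url in batch_urls]:
            results.extend(future.result())
    return results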
Is there any curl or wget command that allows me to parse this database?
No; neither curl nor wget knows how to parse JSON, and neither knows how to write CSV.
php-cli is an option, though. The command:
php -r 'function csv_quote(string $str): string { return '\''"'\'' . strtr($str, ['\''"'\'' => '\''""'\'']) . '\''"'\''; } function shittycvs(array $d): void { $str = '\'''\''; $fields = []; foreach ($d as $foo) { foreach ($foo as $key => $_) { $fields[$key] = true; } } $fields = array_keys($fields); $str .= implode(",", $fields) . "\n"; foreach ($d as $v) { foreach ($fields as $field) { if (is_array($v[$field] ?? "")) { $v[$field] = print_r($v[$field], true); } $str .= csv_quote($v[$field] ?? "") . ","; } $str = substr($str, 0, -1) . "\n"; } echo $str; } $ch = curl_init(); curl_setopt_array($ch, array(CURLOPT_RETURNTRANSFER => 1, CURLOPT_VERBOSE => 1, CURLOPT_ENCODING => '\'''\'')); $data = []; for ($i = 0; $i < 100; $i += 20) { $url = '\''http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?'\'' . http_build_query(array( '\''method'\'' => '\''aircraftConditionsQuery'\'', '\''iColumns'\'' => 10, '\''sColumns'\'' => '\'''\'', '\''mDataProp_0'\'' => '\''enterpriseName'\'', '\''mDataProp_1'\'' => '\''licenceCode'\'', '\''mDataProp_2'\'' => '\''partsNumber'\'', '\''mDataProp_3'\'' => '\''partsName'\'', '\''mDataProp_4'\'' => '\''ataChaptersection'\'', '\''mDataProp_5'\'' => '\''manufacturers'\'', '\''mDataProp_6'\'' => '\''6'\'', '\''mDataProp_7'\'' => '\''fileToAccord'\'', '\''mDataProp_8'\'' => '\''mainDevices'\'', '\''mDataProp_9'\'' => '\''remark'\'', '\''enterpriseName'\'' => '\'''\'', '\''licenceCode'\'' => '\'''\'', '\''partsNumber'\'' => '\'''\'', '\''partsName'\'' => '\'''\'', '\''ataChaptersection'\'' => '\'''\'', '\''iDisplayLength'\'' => 20, '\''iDisplayStart'\'' => $i, )); curl_setopt($ch, CURLOPT_URL, $url); $js = curl_exec($ch); $jsd = json_decode($js, true, 999, JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE); if (empty($jsd['\''aaData'\''])) { /*on last page aaData is empty..*/ break; } foreach ($jsd['\''aaData'\''] as $v) { $data[] = $v; } } shittycvs($data);'
Output: https://gist.github.com/divinity76/236d1df60a6b29783fcfb90fac12d7bf
It fetches the first 100 results, but there are over 46,000 results in total. To fetch them all, replace $i < 100 with $i < 999999 or something like that; from there it should break out of the loop on its own after a few hours.
Unpacked, the command looks like this:
<?php
function csv_quote(string $str): string
{
    return '"' . strtr($str, ['"' => '""']) . '"';
}

function shittycvs(array $d): void
{
    $str = '';
    $fields = [];
    foreach ($d as $foo) {
        foreach ($foo as $key => $_) {
            $fields[$key] = true;
        }
    }
    $fields = array_keys($fields);
    $str .= implode(",", $fields) . "\n";
    foreach ($d as $v) {
        foreach ($fields as $field) {
            if (is_array($v[$field] ?? "")) {
                $v[$field] = print_r($v[$field], true);
            }
            $str .= csv_quote($v[$field] ?? "") . ",";
        }
        $str = substr($str, 0, -1) . "\n";
    }
    echo $str;
}

$ch = curl_init();
curl_setopt_array($ch, array(CURLOPT_RETURNTRANSFER => 1, CURLOPT_VERBOSE => 1, CURLOPT_ENCODING => ''));
$data = [];
for ($i = 0; $i < 100; $i += 20) {
    $url = 'http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?' . http_build_query(array(
        'method' => 'aircraftConditionsQuery',
        'iColumns' => 10,
        'sColumns' => '',
        'mDataProp_0' => 'enterpriseName',
        'mDataProp_1' => 'licenceCode',
        'mDataProp_2' => 'partsNumber',
        'mDataProp_3' => 'partsName',
        'mDataProp_4' => 'ataChaptersection',
        'mDataProp_5' => 'manufacturers',
        'mDataProp_6' => '6',
        'mDataProp_7' => 'fileToAccord',
        'mDataProp_8' => 'mainDevices',
        'mDataProp_9' => 'remark',
        'enterpriseName' => '',
        'licenceCode' => '',
        'partsNumber' => '',
        'partsName' => '',
        'ataChaptersection' => '',
        'iDisplayLength' => 20,
        'iDisplayStart' => $i,
    ));
    curl_setopt($ch, CURLOPT_URL, $url);
    $js = curl_exec($ch);
    $jsd = json_decode($js, true, 999, JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE);
    if (empty($jsd['aaData'])) {
        /* on the last page aaData is empty */
        break;
    }
    foreach ($jsd['aaData'] as $v) {
        $data[] = $v;
    }
}
shittycvs($data);
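For completeness, the same stop-when-aaData-is-empty pagination idea as a Python sketch (not part of the original answer; it only assumes the requests library and the query parameters shown above):

import requests

rows, start = [], 0
while True:
    resp = requests.get(
        "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet",
        params={
            "method": "aircraftConditionsQuery",
            "iDisplayStart": start,
            "iDisplayLength": 20,
        },
        timeout=30,
    )
    batch = resp.json().get("aaData", [])
    if not batch:  # last page reached: aaData comes back empty
        break
    rows.extend(batch)
    start += 20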