Elixir:提取大型结构化数据中的元素
Elixir: extract element in a large structured data
我用 HTTPoison 发出了一个 GET 请求,现在正在尝试解析 JSON 文件。为此,我使用了 Poison 库,但在从 JSON 对象中获取元素时遇到了一些问题。
这是我的代码:
# Looks up the establishment "SAFENERGY" through the Sirene API and reports the outcome.
#
# `start/0`, `get/1` and `decode!/1` are called unqualified; presumably they are
# imported from HTTPoison and Poison (the imports are not visible here).
#
# On HTTP 200 the decoded body is printed with IO.inspect/1 and returned. For the
# 404, 500 and 429 status codes, and for any unmatched response, a message is printed.
# On {:error, _} a message is printed and the reason is inspected.
def parse_json do
  IO.puts("\nLet's parse JSON file.")

  denomination = "SAFENERGY"

  url =
    "https://entreprise.data.gouv.fr/api/sirene/v3/etablissements/?page=1&per_page=1&etat_administratif=A&denomination_usuelle=" <>
      denomination

  start()
  response = get(url)

  case response do
    {:ok, %{status_code: 200, body: payload}} ->
      IO.puts("Success research.")

      payload
      |> decode!()
      # |> Map.to_list() NON-FUNCTIONAL
      # |> Enum.find(&match?(["etablissements" | _], &1)) NON-FUNCTIONAL
      # |> Enum.find_value(fn %{"siren" => siren} -> siren end) NON-FUNCTIONAL
      # |> IO.puts(&(&1.siren)) NON-FUNCTIONAL
      |> IO.inspect()

    {:ok, %{status_code: 404}} ->
      IO.puts("None match between establishment's name and Sirene API.")

    {:ok, %{status_code: 500}} ->
      IO.puts("Nonfunctional Sirene API's server (maintenance...).")

    {:ok, %{status_code: 429}} ->
      IO.puts("Exceeding the maximum call volume (7 requests/s maximum).")

    {:error, %{reason: cause}} ->
      IO.puts("Failure research.")
      IO.inspect(cause)

    _ ->
      IO.puts("Unknown error! Good luck to find it!")
  end
end
我的输出:
%{
"etablissements" => [
%{
"statut_diffusion" => "O",
"libelle_voie" => "MAS DES PERES",
"distribution_speciale_2" => nil,
"geo_ligne" => "G",
"libelle_voie_2" => nil,
"unite_legale_id" => 33014385,
"type_voie_2" => nil,
"geo_type" => "street",
"code_postal_2" => nil,
"libelle_pays_etranger" => nil,
"indice_repetition_2" => nil,
"libelle_commune" => "MAUGUIO",
"siret" => "48944519700036",
"id" => 70725092,
"distribution_speciale" => nil,
"indice_repetition" => nil,
"siren" => "489445197",
"complement_adresse" => nil,
"unite_legale" => %{
"statut_diffusion" => "O",
"nic_siege" => "00036",
"prenom_usuel" => nil,
"sigle" => nil,
"denomination" => "SAFENERGY",
"id" => 33014385,
"pseudonyme" => nil,
"nom_usage" => nil,
"siren" => "489445197",
"date_dernier_traitement" => "2019-11-13T15:06:19",
"annee_effectifs" => nil,
"categorie_entreprise" => "PME",
"tranche_effectifs" => nil,
"identifiant_association" => nil,
"sexe" => nil,
"prenom_2" => nil,
"caractere_employeur" => "N",
"nom" => nil,
"created_at" => "2020-07-02T02:56:19.773+02:00",
"economie_sociale_solidaire" => "N",
"prenom_4" => nil,
"date_fin" => nil,
"date_debut" => "2019-07-01",
"prenom_3" => nil,
"date_creation" => "2006-04-01",
"annee_categorie_entreprise" => "2017",
"denomination_usuelle_2" => nil,
"denomination_usuelle_3" => nil,
"denomination_usuelle_1" => nil,
...
},
"date_dernier_traitement" => "2019-11-13T15:06:19",
"annee_effectifs" => nil,
"denomination_usuelle" => "SAFENERGY",
"code_pays_etranger" => nil,
"complement_adresse_2" => nil,
"tranche_effectifs" => nil,
"enseigne_1" => nil,
"numero_voie_2" => nil,
"geo_id" => "34154_b163",
"activite_principale_registre_metiers" => nil,
"geo_l5" => nil,
"caractere_employeur" => "N",
"geo_l4" => "MAS DES PERES",
"nic" => "00036",
"code_postal" => "34130",
"libelle_cedex_2" => nil,
"created_at" => "2020-07-02T03:12:45.814+02:00",
"numero_voie" => "9002",
"longitude" => "3.967449",
"type_voie" => nil,
"date_debut" => "2019-07-01",
"code_cedex_2" => nil,
"date_creation" => "2019-07-01",
"code_commune_2" => nil,
"libelle_pays_etranger_2" => nil,
"libelle_commune_etranger" => nil,
"latitude" => "43.603798",
"code_cedex" => nil,
"geo_score" => "0.95",
...
}
],
"meta" => %{
"page" => 1,
"per_page" => 1,
"total_pages" => 1,
"total_results" => 1
}
}
正如您在上面看到的(被注释掉的那四行),我正在尝试提取例如 "siren" 这样的元素,但都失败了……我是不是偏离了目标?
我猜您想遍历所有“etablissements”并检索所有 SIREN 号码。您可以尝试类似的操作:
# Decode the JSON body, take the "etablissements" list, and collect the "siren"
# value of each entry into a list.
body
|> decode!()
|> Map.get("etablissements")
|> Enum.map(fn etablissement -> Map.get(etablissement, "siren") end)
假设数据结构如上所示。请注意,Enum.map/2 与 Map 模块无关,它会对 "etablissements" 列表中的每个条目运行函数 fn etablissement -> Map.get(etablissement, "siren") end。
这应该会返回一个包含所有 SIREN 号码的列表。如果可能存在重复条目,您可以使用 Enum.uniq/1 去重。
一般情况下可以使用 Access,这里具体可以使用 Kernel.get_in/2:
# Access.all() traverses every element of the "etablissements" list, so get_in/2
# returns a list holding the "siren" of each entry's nested "unite_legale" map.
body
|> decode!()
|> get_in(["etablissements", Access.all(), "unite_legale", "siren"])
#⇒ ["489445197"]
使用 Access,您可以过滤每一步的结果、获取所有分支等。这里我们使用 Access.all/0 来获取列表的所有元素。
我用 HTTPoison 发出了一个 GET 请求,现在正在尝试解析 JSON 文件。为此,我使用了 Poison 库,但在从 JSON 对象中获取元素时遇到了一些问题。
这是我的代码:
# Looks up the establishment "SAFENERGY" through the Sirene API and reports the outcome.
#
# `start/0`, `get/1` and `decode!/1` are called unqualified; presumably they are
# imported from HTTPoison and Poison (the imports are not visible here).
#
# On HTTP 200 the decoded body is printed with IO.inspect/1 and returned. For the
# 404, 500 and 429 status codes, and for any unmatched response, a message is printed.
# On {:error, _} a message is printed and the reason is inspected.
def parse_json do
  IO.puts("\nLet's parse JSON file.")

  denomination = "SAFENERGY"

  url =
    "https://entreprise.data.gouv.fr/api/sirene/v3/etablissements/?page=1&per_page=1&etat_administratif=A&denomination_usuelle=" <>
      denomination

  start()
  response = get(url)

  case response do
    {:ok, %{status_code: 200, body: payload}} ->
      IO.puts("Success research.")

      payload
      |> decode!()
      # |> Map.to_list() NON-FUNCTIONAL
      # |> Enum.find(&match?(["etablissements" | _], &1)) NON-FUNCTIONAL
      # |> Enum.find_value(fn %{"siren" => siren} -> siren end) NON-FUNCTIONAL
      # |> IO.puts(&(&1.siren)) NON-FUNCTIONAL
      |> IO.inspect()

    {:ok, %{status_code: 404}} ->
      IO.puts("None match between establishment's name and Sirene API.")

    {:ok, %{status_code: 500}} ->
      IO.puts("Nonfunctional Sirene API's server (maintenance...).")

    {:ok, %{status_code: 429}} ->
      IO.puts("Exceeding the maximum call volume (7 requests/s maximum).")

    {:error, %{reason: cause}} ->
      IO.puts("Failure research.")
      IO.inspect(cause)

    _ ->
      IO.puts("Unknown error! Good luck to find it!")
  end
end
我的输出:
%{
"etablissements" => [
%{
"statut_diffusion" => "O",
"libelle_voie" => "MAS DES PERES",
"distribution_speciale_2" => nil,
"geo_ligne" => "G",
"libelle_voie_2" => nil,
"unite_legale_id" => 33014385,
"type_voie_2" => nil,
"geo_type" => "street",
"code_postal_2" => nil,
"libelle_pays_etranger" => nil,
"indice_repetition_2" => nil,
"libelle_commune" => "MAUGUIO",
"siret" => "48944519700036",
"id" => 70725092,
"distribution_speciale" => nil,
"indice_repetition" => nil,
"siren" => "489445197",
"complement_adresse" => nil,
"unite_legale" => %{
"statut_diffusion" => "O",
"nic_siege" => "00036",
"prenom_usuel" => nil,
"sigle" => nil,
"denomination" => "SAFENERGY",
"id" => 33014385,
"pseudonyme" => nil,
"nom_usage" => nil,
"siren" => "489445197",
"date_dernier_traitement" => "2019-11-13T15:06:19",
"annee_effectifs" => nil,
"categorie_entreprise" => "PME",
"tranche_effectifs" => nil,
"identifiant_association" => nil,
"sexe" => nil,
"prenom_2" => nil,
"caractere_employeur" => "N",
"nom" => nil,
"created_at" => "2020-07-02T02:56:19.773+02:00",
"economie_sociale_solidaire" => "N",
"prenom_4" => nil,
"date_fin" => nil,
"date_debut" => "2019-07-01",
"prenom_3" => nil,
"date_creation" => "2006-04-01",
"annee_categorie_entreprise" => "2017",
"denomination_usuelle_2" => nil,
"denomination_usuelle_3" => nil,
"denomination_usuelle_1" => nil,
...
},
"date_dernier_traitement" => "2019-11-13T15:06:19",
"annee_effectifs" => nil,
"denomination_usuelle" => "SAFENERGY",
"code_pays_etranger" => nil,
"complement_adresse_2" => nil,
"tranche_effectifs" => nil,
"enseigne_1" => nil,
"numero_voie_2" => nil,
"geo_id" => "34154_b163",
"activite_principale_registre_metiers" => nil,
"geo_l5" => nil,
"caractere_employeur" => "N",
"geo_l4" => "MAS DES PERES",
"nic" => "00036",
"code_postal" => "34130",
"libelle_cedex_2" => nil,
"created_at" => "2020-07-02T03:12:45.814+02:00",
"numero_voie" => "9002",
"longitude" => "3.967449",
"type_voie" => nil,
"date_debut" => "2019-07-01",
"code_cedex_2" => nil,
"date_creation" => "2019-07-01",
"code_commune_2" => nil,
"libelle_pays_etranger_2" => nil,
"libelle_commune_etranger" => nil,
"latitude" => "43.603798",
"code_cedex" => nil,
"geo_score" => "0.95",
...
}
],
"meta" => %{
"page" => 1,
"per_page" => 1,
"total_pages" => 1,
"total_results" => 1
}
}
正如您在上面看到的(被注释掉的那四行),我正在尝试提取例如 "siren" 这样的元素,但都失败了……我是不是偏离了目标?
我猜您想遍历所有“etablissements”并检索所有 SIREN 号码。您可以尝试类似的操作:
# Decode the JSON body, take the "etablissements" list, and collect the "siren"
# value of each entry into a list.
body
|> decode!()
|> Map.get("etablissements")
|> Enum.map(fn etablissement -> Map.get(etablissement, "siren") end)
假设数据结构如上所示。请注意,Enum.map/2 与 Map 模块无关,它会对 "etablissements" 列表中的每个条目运行函数 fn etablissement -> Map.get(etablissement, "siren") end。
这应该会返回一个包含所有 SIREN 号码的列表。如果可能存在重复条目,您可以使用 Enum.uniq/1 去重。
一般情况下可以使用 Access,这里具体可以使用 Kernel.get_in/2:
# Access.all() traverses every element of the "etablissements" list, so get_in/2
# returns a list holding the "siren" of each entry's nested "unite_legale" map.
body
|> decode!()
|> get_in(["etablissements", Access.all(), "unite_legale", "siren"])
#⇒ ["489445197"]
使用 Access,您可以过滤每一步的结果、获取所有分支等。这里我们使用 Access.all/0 来获取列表的所有元素。