避免 url 在 R 中编码
Avoid url encoding in R
我正在尝试从 Google 地理编码 API 中获取 lat/lon,但是当地址中包含丹麦本地字符时请求失败。我怀疑这是因为 httr::GET 函数对 url 进行了编码,但我不确定我是否正确。
如果你把下面这个链接直接复制粘贴到浏览器中,会得到一个有效的结果:
http://maps.googleapis.com/maps/api/geocode/json?address=Søholmen+9,+4500+Denmark
但下面的代码无法正常工作,即使 url 在传递给 GET 函数之前与上面的链接完全相同。如果我使用不含本地字符的地址,它就能正常工作。
# Reproduction script: build a Google Geocoding API request URL from a
# free-text address, request it, and parse the JSON response.
library(httr)
library(jsonlite)
library(stringr)
# Address containing a non-ASCII character ("ø"); this is the failing case.
address <- "Søholmen 9, 4500 Denmark"
# ASCII-only address, reported to work, kept for comparison:
# address <- "Kronprinsesse Sofies Vej 6, 2000 Denmark"
base_url <- "http://maps.googleapis.com/maps/api/geocode/json?"
# An address OR components
# Spaces in the address are replaced with "+" before it is appended as the
# `address` query parameter.
geo_url <- paste0(base_url, "address=", str_replace_all(address, pattern = " ", replacement = "+"))
# Get the result
# NOTE(review): URLencode() is applied to the string in whatever encoding the
# session holds it; presumably in a non-UTF-8 locale the "ø" is not
# percent-encoded as UTF-8 bytes, which would explain the failure -- confirm.
temp_geo_results <- httr::GET(url = URLencode(URL = geo_url), verbose())
# Get the content
temp_geo_results <- httr::content(temp_geo_results, as = "text")
# Parse the JSON
temp_geo_results <- jsonlite::fromJSON(temp_geo_results)
这是我的 sessionInfo()
R version 3.1.2 (2014-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale:
[1] LC_COLLATE=Danish_Denmark.1252 LC_CTYPE=Danish_Denmark.1252 LC_MONETARY=Danish_Denmark.1252
[4] LC_NUMERIC=C LC_TIME=Danish_Denmark.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] stringr_0.6.2 jsonlite_0.9.10 httr_0.5
loaded via a namespace (and not attached):
[1] RCurl_1.95-4.3 tools_3.1.2
编辑:我删除了该问题不需要的一行代码并添加了我的 sessionInfo。
我可以分享我如何用我的语言解决同样问题的粗略方法:
#' Replace accented letters with their unaccented ASCII counterparts
#'
#' Handles the letters ā č ē ģ ī ķ ļ ņ š ū ž in both lower and upper case;
#' every other character is left untouched.
#'
#' @param text A character vector (or an object coercible to one).
#' @return A character vector of the same length as `text` with the accented
#'   letters replaced. Empty input yields an empty character vector.
deencode <- function(text) {
  # The character at position k of `accented` is replaced by the character at
  # position k of `plain`; upper-case letters map to upper-case letters.
  accented <- "āĀčČēĒģĢīĪķĶļĻņŅšŠūŪžŽ"
  plain <- "aAcCeEgGiIkKlLnNsSuUzZ"
  # chartr() is vectorized over `text`, so no explicit loop is needed.
  chartr(accented, plain, text)
}
经过这个简单的替换之后,一切都正常了,至少对 Google 地理编码 API 是如此。
这似乎是一个编码问题。
以下对我来说很好用:
# Convert the address to UTF-8 with enc2utf8() before building the request
# URL, then fetch the response and parse it as JSON.
address <- "Søholmen 9, 4500 Denmark"
u <- sprintf(
  "http://maps.googleapis.com/maps/api/geocode/json?address=%s",
  # "\\s+" is the regex \s+ (one or more whitespace characters); the backslash
  # must be doubled inside an R string literal.
  gsub("\\s+", "+", enc2utf8(address))
)
fromJSON(content(GET(u), as = "text"))
您可以使用 rvest 包
# Fetch the geocoding response through rvest and parse it with jsonlite.
library(rvest); library(jsonlite)
address <- "Søholmen 9, 4500 Denmark"
# address <- "Kronprinsesse Sofies Vej 6, 2000 Denmark"
base_url <- "http://maps.googleapis.com/maps/api/geocode/json?"
# An address OR components
# Base gsub() replaces the spaces with "+": this snippet does not load
# stringr, so str_replace_all() would not be found when it is run on its own.
geo_url <- paste0(base_url, "address=", gsub(" ", "+", address, fixed = TRUE))
# Re-encode the URL from the session's native encoding to UTF-8.
geo_url <- iconv(geo_url, to = "UTF-8")
# NOTE(review): html() was deprecated in later rvest releases in favour of
# read_html() -- confirm against the installed rvest version.
temp_geo_results <- html_text(html_nodes(html(geo_url), "p"))
temp_geo_results <- fromJSON(temp_geo_results)
我是这样解决类似问题的:在 rawToChar 和 fromJSON 之间设置 Encoding,如下所示(代码不可直接执行)。
# Template (not runnable as-is): fetch an API response with httr and mark its
# text as UTF-8 before parsing it as JSON.
library(httr)
library(jsonlite)
# "YOUR_URL" and "YOUR_KEY" are placeholders for the real endpoint and
# credential.
call_api <- GET("YOUR_URL",
add_headers(.headers=c(`Authorization` = "YOUR_KEY")))
# Turn the raw response body into a character string.
strange_characters <- rawToChar(call_api$content) # adjust to wherever the raw data is
# Encoding(strange_characters) reports "unknown" at this point, so declare the
# string to be UTF-8 explicitly before parsing.
Encoding(strange_characters) <- "UTF-8"
right_characters <- fromJSON(strange_characters)
我正在尝试从 Google 地理编码 API 中获取 lat/lon,但是当地址中包含丹麦本地字符时请求失败。我怀疑这是因为 httr::GET 函数对 url 进行了编码,但我不确定我是否正确。
如果你把下面这个链接直接复制粘贴到浏览器中,会得到一个有效的结果: http://maps.googleapis.com/maps/api/geocode/json?address=Søholmen+9,+4500+Denmark
但下面的代码无法正常工作,即使 url 在传递给 GET 函数之前与上面的链接完全相同。如果我使用不含本地字符的地址,它就能正常工作。
# Reproduction script: build a Google Geocoding API request URL from a
# free-text address, request it, and parse the JSON response.
library(httr)
library(jsonlite)
library(stringr)
# Address containing a non-ASCII character ("ø"); this is the failing case.
address <- "Søholmen 9, 4500 Denmark"
# ASCII-only address, reported to work, kept for comparison:
# address <- "Kronprinsesse Sofies Vej 6, 2000 Denmark"
base_url <- "http://maps.googleapis.com/maps/api/geocode/json?"
# An address OR components
# Spaces in the address are replaced with "+" before it is appended as the
# `address` query parameter.
geo_url <- paste0(base_url, "address=", str_replace_all(address, pattern = " ", replacement = "+"))
# Get the result
# NOTE(review): URLencode() is applied to the string in whatever encoding the
# session holds it; presumably in a non-UTF-8 locale the "ø" is not
# percent-encoded as UTF-8 bytes, which would explain the failure -- confirm.
temp_geo_results <- httr::GET(url = URLencode(URL = geo_url), verbose())
# Get the content
temp_geo_results <- httr::content(temp_geo_results, as = "text")
# Parse the JSON
temp_geo_results <- jsonlite::fromJSON(temp_geo_results)
这是我的 sessionInfo()
R version 3.1.2 (2014-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale:
[1] LC_COLLATE=Danish_Denmark.1252 LC_CTYPE=Danish_Denmark.1252 LC_MONETARY=Danish_Denmark.1252
[4] LC_NUMERIC=C LC_TIME=Danish_Denmark.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] stringr_0.6.2 jsonlite_0.9.10 httr_0.5
loaded via a namespace (and not attached):
[1] RCurl_1.95-4.3 tools_3.1.2
编辑:我删除了该问题不需要的一行代码并添加了我的 sessionInfo。
我可以分享我如何用我的语言解决同样问题的粗略方法:
#' Replace accented letters with their unaccented ASCII counterparts
#'
#' Handles the letters ā č ē ģ ī ķ ļ ņ š ū ž in both lower and upper case;
#' every other character is left untouched.
#'
#' @param text A character vector (or an object coercible to one).
#' @return A character vector of the same length as `text` with the accented
#'   letters replaced. Empty input yields an empty character vector.
deencode <- function(text) {
  # The character at position k of `accented` is replaced by the character at
  # position k of `plain`; upper-case letters map to upper-case letters.
  accented <- "āĀčČēĒģĢīĪķĶļĻņŅšŠūŪžŽ"
  plain <- "aAcCeEgGiIkKlLnNsSuUzZ"
  # chartr() is vectorized over `text`, so no explicit loop is needed.
  chartr(accented, plain, text)
}
经过这个简单的替换之后,一切都正常了,至少对 Google 地理编码 API 是如此。
这似乎是一个编码问题。
以下对我来说很好用:
# Convert the address to UTF-8 with enc2utf8() before building the request
# URL, then fetch the response and parse it as JSON.
address <- "Søholmen 9, 4500 Denmark"
u <- sprintf(
  "http://maps.googleapis.com/maps/api/geocode/json?address=%s",
  # "\\s+" is the regex \s+ (one or more whitespace characters); the backslash
  # must be doubled inside an R string literal.
  gsub("\\s+", "+", enc2utf8(address))
)
fromJSON(content(GET(u), as = "text"))
您可以使用 rvest 包
# Fetch the geocoding response through rvest and parse it with jsonlite.
library(rvest); library(jsonlite)
address <- "Søholmen 9, 4500 Denmark"
# address <- "Kronprinsesse Sofies Vej 6, 2000 Denmark"
base_url <- "http://maps.googleapis.com/maps/api/geocode/json?"
# An address OR components
# Base gsub() replaces the spaces with "+": this snippet does not load
# stringr, so str_replace_all() would not be found when it is run on its own.
geo_url <- paste0(base_url, "address=", gsub(" ", "+", address, fixed = TRUE))
# Re-encode the URL from the session's native encoding to UTF-8.
geo_url <- iconv(geo_url, to = "UTF-8")
# NOTE(review): html() was deprecated in later rvest releases in favour of
# read_html() -- confirm against the installed rvest version.
temp_geo_results <- html_text(html_nodes(html(geo_url), "p"))
temp_geo_results <- fromJSON(temp_geo_results)
我是这样解决类似问题的:在 rawToChar 和 fromJSON 之间设置 Encoding,如下所示(代码不可直接执行)。
# Template (not runnable as-is): fetch an API response with httr and mark its
# text as UTF-8 before parsing it as JSON.
library(httr)
library(jsonlite)
# "YOUR_URL" and "YOUR_KEY" are placeholders for the real endpoint and
# credential.
call_api <- GET("YOUR_URL",
add_headers(.headers=c(`Authorization` = "YOUR_KEY")))
# Turn the raw response body into a character string.
strange_characters <- rawToChar(call_api$content) # adjust to wherever the raw data is
# Encoding(strange_characters) reports "unknown" at this point, so declare the
# string to be UTF-8 explicitly before parsing.
Encoding(strange_characters) <- "UTF-8"
right_characters <- fromJSON(strange_characters)