创建一个向量函数来清理 Houston Crime Data 的地址数据
Create a vector function to clean address data for Houston Crime Data
已有很好的教程介绍如何把休斯顿犯罪数据绘制到地图上,但没有关于如何清理 HPD 提供的原始数据的简单示例。
https://github.com/hadley/ggplot2/wiki/Crime-in-Downtown-Houston,-Texas-:-Combining-ggplot2-and-Google-Maps
# Sample rows of the raw data: a block range ("min-max", or "UNK" when
# unknown), a street name and a date string per record.
d <- structure(list(BlockRange = c("5400-5499", "3700-3799", "2200-2299",
"1000-1099", "1200-1299", "UNK", "1900-1999", "500-599", "1200-1299"
), StreetName = c("BELL", "BELL", "BELL", "BELL", "BELL", "BELL",
"BELL", "BELL", "BELL"), Date = c("4/28/2015", "4/11/2015", "4/26/2015",
"4/9/2015", "4/9/2015", "4/21/2015", "4/26/2015", "4/26/2015",
"4/17/2015")), row.names = c(60L, 75L, 88L, 4972L, 4990L, 5096L,
5098L, 5099L, 5155L), class = "data.frame", .Names = c("BlockRange",
"StreetName", "Date"))
下面的调用会返回纬度和经度(按此顺序):
# Geocode a single address; gGeoCode() returns c(latitude, longitude).
x <- gGeoCode("1950 Bell St, Houston, TX")
#[1] 29.74800 -95.35926
但是,它需要一个函数来对整个数据库进行地理编码并为 Lon 和 Lat 添加列
下面是处理完成后的数据的部分示例。
# Sample of the desired result: one row per address with its lon and lat
# columns.
structure(list(address = c("9650 marlive ln", "4750 telephone rd",
"5050 wickview ln", "1050 ashland st", "8350 canyon", "9350 rowan ln",
"2550 southmore blvd", "6350 rupley cir", "5050 georgi ln", "10750 briar forest dr"
), lon = c(-95.4373883, -95.2988769, -95.455864, -95.4033373,
-95.3779081, -95.5483009, -95.3733977, -95.3156032, -95.4665841,
-95.565934), lat = c(29.6779015, 29.6917121, 29.5992174, 29.7902425,
29.6706341, 29.7022336, 29.7198936, 29.6902746, 29.8297359, 29.747596
)), row.names = 82729:82738, class = "data.frame", .Names = c("address",
"lon", "lat"))
以下是地理编码函数:
library(RCurl)
library(RJSONIO)
library(dplyr)
library(gdata)
#' Build the geocoding request URL for one or more addresses.
#'
#' @param address Character vector of free-form addresses to look up.
#' @param return.call Response format segment appended to the API root
#'   (default "json").
#' @param sensor Value sent as the `sensor` query parameter (default "false").
#' @return Character vector of request URLs, one per address.
construct.geocode.url <- function(address, return.call = "json", sensor = "false") {
  root <- "http://maps.google.com/maps/api/geocode/"
  # Percent-encode the address on its own, reserved characters included.
  # Encoding the assembled URL leaves "&", "#", "+" and "=" untouched, so an
  # address such as "Main St & 5th Ave" would spill into extra query
  # parameters.
  encoded_address <- URLencode(as.character(address), reserved = TRUE)
  paste0(root, return.call, "?address=", encoded_address, "&sensor=", sensor)
}
#' Geocode a single address.
#'
#' Builds the request URL with construct.geocode.url(), fetches it with
#' getURL() and parses the body with fromJSON(). Errors raised by the request
#' or by JSON parsing are not caught here.
#'
#' @param address Address string to look up.
#' @param verbose If TRUE, print the address before the request is made.
#' @return Numeric vector c(lat, lng) taken from the first result, or
#'   c(NA_real_, NA_real_) when the response status is not "OK" or carries no
#'   coordinates.
gGeoCode <- function(address, verbose = FALSE) {
  if (verbose) cat(address, "\n")
  # Typed NAs keep the result numeric even when every lookup fails.
  no_match <- c(NA_real_, NA_real_)

  request_url <- construct.geocode.url(address)
  response <- fromJSON(getURL(request_url), simplify = FALSE)

  # isTRUE() also covers a response without a `status` field, where a bare
  # `if (x$status == "OK")` would stop with "argument is of length zero".
  if (!isTRUE(response$status == "OK") || length(response$results) == 0L) {
    return(no_match)
  }

  location <- response$results[[1]]$geometry$location
  if (is.null(location$lat) || is.null(location$lng)) {
    return(no_match)
  }
  c(location$lat, location$lng)
}
我们如何使用 dplyr 或其他方法编写一个函数,把 [address, long, lat] 的输出作为 3 个新列添加到数据中?
即..
data.frame <- mutate(d, address = ConvertBlockRange(BlockRange) + StreetName, "Houston, TX"), Lon = geocode(address)[0] , lat = geocode(address)[1])
这是问题的阻塞点:
#function to convert - "2200-2299" to integer 2250.. i.e find the middle of the block.
library(stringr)
#' Convert block ranges such as "2200-2299" to the middle of the block (2250).
#'
#' Vectorized over `blockRange`, so it can be used directly on a data frame
#' column (e.g. inside mutate()).
#'
#' @param blockRange Character vector of ranges written as "min-max".
#' @return Numeric vector of the same length; entries that are not of the form
#'   "digits-digits" (for example "UNK" or NA) yield NA.
ConvertBlockRange <- function(blockRange) {
  ranges <- as.character(blockRange)
  midpoints <- rep(NA_real_, length(ranges))

  # Only well-formed ranges are converted; everything else stays NA.
  valid <- grepl("^[0-9]+-[0-9]+$", ranges)
  lower <- as.numeric(sub("-.*$", "", ranges[valid]))
  upper <- as.numeric(sub("^.*-", "", ranges[valid]))

  # The mean of e.g. 2200 and 2299 is 2249.5; adding 0.5 gives the block
  # midpoint 2250.
  midpoints[valid] <- (lower + upper) / 2 + 0.5
  midpoints
}
您可以通过拆分范围和平均来计算平均块范围:
例如
# Midpoint of a single block range: split on "-" and average the two bounds.
x <- '5400-5499'
mean(as.numeric(strsplit(x, '-')[[1]])) # 5449.5
要扩大规模,我们可以使用 tidyr 包中的 separate。它会做一些很酷的事情,比如自动将 blockrange 的 min/max 放入新列,并将类型从字符串转换为数字 (convert=T, type.convert=as.numeric)。我先用 filter 去掉了 "UNK" 地址 - 你必须单独处理它们。
library(dplyr)
library(tidyr)
d %>%
  # "UNK" block ranges cannot be split into bounds; handle them separately.
  filter(BlockRange != "UNK") %>%
  # Split "min-max" into blockmin/blockmax while keeping BlockRange.
  # `convert = TRUE` already turns the pieces into numbers; `type.convert` is
  # not an argument of separate(), so it is omitted.
  separate(BlockRange, c("blockmin", "blockmax"), sep = "-",
           convert = TRUE, remove = FALSE) %>%
  # Midpoint of the block (rounded down) and the resulting street address.
  mutate(block = floor((blockmin + blockmax) / 2),
         address = paste(block, StreetName))
# BlockRange blockmin blockmax StreetName Date block address
# 1 5400-5499 5400 5499 BELL 4/28/2015 5449 5449 BELL
# 2 3700-3799 3700 3799 BELL 4/11/2015 3749 3749 BELL
# 3 2200-2299 2200 2299 BELL 4/26/2015 2249 2249 BELL
# 4 1000-1099 1000 1099 BELL 4/9/2015 1049 1049 BELL
# 5 1200-1299 1200 1299 BELL 4/9/2015 1249 1249 BELL
# 6 1900-1999 1900 1999 BELL 4/26/2015 1949 1949 BELL
# 7 500-599 500 599 BELL 4/26/2015 549 549 BELL
# 8 1200-1299 1200 1299 BELL 4/17/2015 1249 1249 BELL
然后您可以用 %>% group_by(address) 获取唯一地址并进行地理编码(不过我会考虑如何在这里限制最大请求数等)。
关于同时添加输出纬度和经度列,我认为 dplyr 还没有这样做(参见 this feature request)。
如果你真的想在这里使用 dplyr 语法,最好的办法是修改 gGeoCode,使其矢量化,例如:
#' Geocode many addresses and return the coordinates as a data frame.
#'
#' @param addresses Either a data frame whose first column holds the
#'   addresses, or a plain vector of addresses.
#' @return Data frame with numeric columns `lat` and `lng`, one row per
#'   address (zero rows for empty input).
gGeoCode2 <- function(addresses) {
  address_vec <- if (is.data.frame(addresses)) addresses[[1]] else addresses
  address_vec <- as.character(address_vec)

  # vapply() always yields a 2 x n matrix, even for n = 0, whereas sapply()
  # returns list() for empty input and breaks the reshaping below.
  coords <- vapply(address_vec, gGeoCode, numeric(2), USE.NAMES = FALSE)
  data.frame(lat = coords[1, ], lng = coords[2, ])
}
# Geocode the address column and attach the lat/lng columns to the rows of d2.
d2 %>%
  select(address) %>%
  gGeoCode2() %>%
  bind_cols(d2, .)
但我真的认为您应该在这一步跳过 dplyr 的语法糖,改为手动循环并用 cbind 合并结果,这样可以更好地控制请求限制。
已有很好的教程介绍如何把休斯顿犯罪数据绘制到地图上,但没有关于如何清理 HPD 提供的原始数据的简单示例。 https://github.com/hadley/ggplot2/wiki/Crime-in-Downtown-Houston,-Texas-:-Combining-ggplot2-and-Google-Maps
# Sample rows of the raw data: a block range ("min-max", or "UNK" when
# unknown), a street name and a date string per record.
d <- structure(list(BlockRange = c("5400-5499", "3700-3799", "2200-2299",
"1000-1099", "1200-1299", "UNK", "1900-1999", "500-599", "1200-1299"
), StreetName = c("BELL", "BELL", "BELL", "BELL", "BELL", "BELL",
"BELL", "BELL", "BELL"), Date = c("4/28/2015", "4/11/2015", "4/26/2015",
"4/9/2015", "4/9/2015", "4/21/2015", "4/26/2015", "4/26/2015",
"4/17/2015")), row.names = c(60L, 75L, 88L, 4972L, 4990L, 5096L,
5098L, 5099L, 5155L), class = "data.frame", .Names = c("BlockRange",
"StreetName", "Date"))
下面的调用会返回纬度和经度(按此顺序):
# Geocode a single address; gGeoCode() returns c(latitude, longitude).
x <- gGeoCode("1950 Bell St, Houston, TX")
#[1] 29.74800 -95.35926
但是,它需要一个函数来对整个数据库进行地理编码并为 Lon 和 Lat 添加列
下面是处理完成后的数据的部分示例。
# Sample of the desired result: one row per address with its lon and lat
# columns.
structure(list(address = c("9650 marlive ln", "4750 telephone rd",
"5050 wickview ln", "1050 ashland st", "8350 canyon", "9350 rowan ln",
"2550 southmore blvd", "6350 rupley cir", "5050 georgi ln", "10750 briar forest dr"
), lon = c(-95.4373883, -95.2988769, -95.455864, -95.4033373,
-95.3779081, -95.5483009, -95.3733977, -95.3156032, -95.4665841,
-95.565934), lat = c(29.6779015, 29.6917121, 29.5992174, 29.7902425,
29.6706341, 29.7022336, 29.7198936, 29.6902746, 29.8297359, 29.747596
)), row.names = 82729:82738, class = "data.frame", .Names = c("address",
"lon", "lat"))
以下是地理编码函数:
library(RCurl)
library(RJSONIO)
library(dplyr)
library(gdata)
#' Build the geocoding request URL for one or more addresses.
#'
#' @param address Character vector of free-form addresses to look up.
#' @param return.call Response format segment appended to the API root
#'   (default "json").
#' @param sensor Value sent as the `sensor` query parameter (default "false").
#' @return Character vector of request URLs, one per address.
construct.geocode.url <- function(address, return.call = "json", sensor = "false") {
  root <- "http://maps.google.com/maps/api/geocode/"
  # Percent-encode the address on its own, reserved characters included.
  # Encoding the assembled URL leaves "&", "#", "+" and "=" untouched, so an
  # address such as "Main St & 5th Ave" would spill into extra query
  # parameters.
  encoded_address <- URLencode(as.character(address), reserved = TRUE)
  paste0(root, return.call, "?address=", encoded_address, "&sensor=", sensor)
}
#' Geocode a single address.
#'
#' Builds the request URL with construct.geocode.url(), fetches it with
#' getURL() and parses the body with fromJSON(). Errors raised by the request
#' or by JSON parsing are not caught here.
#'
#' @param address Address string to look up.
#' @param verbose If TRUE, print the address before the request is made.
#' @return Numeric vector c(lat, lng) taken from the first result, or
#'   c(NA_real_, NA_real_) when the response status is not "OK" or carries no
#'   coordinates.
gGeoCode <- function(address, verbose = FALSE) {
  if (verbose) cat(address, "\n")
  # Typed NAs keep the result numeric even when every lookup fails.
  no_match <- c(NA_real_, NA_real_)

  request_url <- construct.geocode.url(address)
  response <- fromJSON(getURL(request_url), simplify = FALSE)

  # isTRUE() also covers a response without a `status` field, where a bare
  # `if (x$status == "OK")` would stop with "argument is of length zero".
  if (!isTRUE(response$status == "OK") || length(response$results) == 0L) {
    return(no_match)
  }

  location <- response$results[[1]]$geometry$location
  if (is.null(location$lat) || is.null(location$lng)) {
    return(no_match)
  }
  c(location$lat, location$lng)
}
我们如何使用 dplyr 或其他方法编写一个函数,把 [address, long, lat] 的输出作为 3 个新列添加到数据中?
即..
data.frame <- mutate(d, address = ConvertBlockRange(BlockRange) + StreetName, "Houston, TX"), Lon = geocode(address)[0] , lat = geocode(address)[1])
这是问题的阻塞点:
#function to convert - "2200-2299" to integer 2250.. i.e find the middle of the block.
library(stringr)
#' Convert block ranges such as "2200-2299" to the middle of the block (2250).
#'
#' Vectorized over `blockRange`, so it can be used directly on a data frame
#' column (e.g. inside mutate()).
#'
#' @param blockRange Character vector of ranges written as "min-max".
#' @return Numeric vector of the same length; entries that are not of the form
#'   "digits-digits" (for example "UNK" or NA) yield NA.
ConvertBlockRange <- function(blockRange) {
  ranges <- as.character(blockRange)
  midpoints <- rep(NA_real_, length(ranges))

  # Only well-formed ranges are converted; everything else stays NA.
  valid <- grepl("^[0-9]+-[0-9]+$", ranges)
  lower <- as.numeric(sub("-.*$", "", ranges[valid]))
  upper <- as.numeric(sub("^.*-", "", ranges[valid]))

  # The mean of e.g. 2200 and 2299 is 2249.5; adding 0.5 gives the block
  # midpoint 2250.
  midpoints[valid] <- (lower + upper) / 2 + 0.5
  midpoints
}
您可以通过拆分范围和平均来计算平均块范围:
例如
# Midpoint of a single block range: split on "-" and average the two bounds.
x <- '5400-5499'
mean(as.numeric(strsplit(x, '-')[[1]])) # 5449.5
要扩大规模,我们可以使用 tidyr 包中的 separate。它会做一些很酷的事情,比如自动将 blockrange 的 min/max 放入新列,并将类型从字符串转换为数字 (convert=T, type.convert=as.numeric)。我先用 filter 去掉了 "UNK" 地址 - 你必须单独处理它们。
library(dplyr)
library(tidyr)
d %>%
  # "UNK" block ranges cannot be split into bounds; handle them separately.
  filter(BlockRange != "UNK") %>%
  # Split "min-max" into blockmin/blockmax while keeping BlockRange.
  # `convert = TRUE` already turns the pieces into numbers; `type.convert` is
  # not an argument of separate(), so it is omitted.
  separate(BlockRange, c("blockmin", "blockmax"), sep = "-",
           convert = TRUE, remove = FALSE) %>%
  # Midpoint of the block (rounded down) and the resulting street address.
  mutate(block = floor((blockmin + blockmax) / 2),
         address = paste(block, StreetName))
# BlockRange blockmin blockmax StreetName Date block address
# 1 5400-5499 5400 5499 BELL 4/28/2015 5449 5449 BELL
# 2 3700-3799 3700 3799 BELL 4/11/2015 3749 3749 BELL
# 3 2200-2299 2200 2299 BELL 4/26/2015 2249 2249 BELL
# 4 1000-1099 1000 1099 BELL 4/9/2015 1049 1049 BELL
# 5 1200-1299 1200 1299 BELL 4/9/2015 1249 1249 BELL
# 6 1900-1999 1900 1999 BELL 4/26/2015 1949 1949 BELL
# 7 500-599 500 599 BELL 4/26/2015 549 549 BELL
# 8 1200-1299 1200 1299 BELL 4/17/2015 1249 1249 BELL
然后您可以用 %>% group_by(address) 获取唯一地址并进行地理编码(不过我会考虑如何在这里限制最大请求数等)。
关于同时添加输出纬度和经度列,我认为 dplyr 还没有这样做(参见 this feature request)。
如果你真的想在这里使用 dplyr 语法,最好的办法是修改 gGeoCode,使其矢量化,例如:
#' Geocode many addresses and return the coordinates as a data frame.
#'
#' @param addresses Either a data frame whose first column holds the
#'   addresses, or a plain vector of addresses.
#' @return Data frame with numeric columns `lat` and `lng`, one row per
#'   address (zero rows for empty input).
gGeoCode2 <- function(addresses) {
  address_vec <- if (is.data.frame(addresses)) addresses[[1]] else addresses
  address_vec <- as.character(address_vec)

  # vapply() always yields a 2 x n matrix, even for n = 0, whereas sapply()
  # returns list() for empty input and breaks the reshaping below.
  coords <- vapply(address_vec, gGeoCode, numeric(2), USE.NAMES = FALSE)
  data.frame(lat = coords[1, ], lng = coords[2, ])
}
# Geocode the address column and attach the lat/lng columns to the rows of d2.
d2 %>%
  select(address) %>%
  gGeoCode2() %>%
  bind_cols(d2, .)
但我真的认为您应该在这一步跳过 dplyr 的语法糖,改为手动循环并用 cbind 合并结果,这样可以更好地控制请求限制。