R 中双变量空间相关图(双变量 LISA)
Map of bivariate spatial correlation in R (bivariate LISA)
我想创建一个地图来显示两个变量之间的双变量空间相关性。这可以通过制作双变量 Moran's I 空间相关性的 LISA 图或使用 Lee (2001) 提出的 L 指数来完成。
双变量 Moran's I 未在 spdep
库中实现,但 L 索引已实现,因此这是我尝试使用 L 索引但未成功的方法。显示基于莫兰的解决方案的答案我也非常欢迎!
正如您从下面的可重现示例中看到的那样,到目前为止,我已经设法计算了局部 L 索引。 我想做的 是估计伪 p 值和 create a map of the results like those maps we use in LISA spatial clusters with high-high, high-low, ..., low-low。
在这个例子中,目标是创建一个地图,其中包含黑人和白人之间的双变量 Lisa 关联。该地图应在 ggplot2
中创建,显示集群:
- 黑人比例高和白人比例高
- 黑人占比高,白人占比低
- 黑人比例低,白人比例高
- 黑人出现率低,白人出现率低
可重现的例子
library(UScensus2000tract)
library(ggplot2)
library(spdep)
library(sf)
# load data
data("oregon.tract")
# plot Census Tract map
plot(oregon.tract)
# Variables to use in the correlation: white and black population in each census track
x <- scale(oregon.tract$white)
y <- scale(oregon.tract$black)
# create Queen contiguity matrix and Spatial weights matrix
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb)
# Lee index
Lxy <-lee(x, y, lw, length(x), zero.policy=TRUE)
# Lee’s L statistic (Global)
Lxy[1]
#> -0.1865688811
# 10k permutations to estimate pseudo p-values
LMCxy <- lee.mc(x, y, nsim=10000, lw, zero.policy=TRUE, alternative="less")
# quik plot of local L
Lxy[[2]] %>% density() %>% plot() # Lee’s local L statistic (Local)
LMCxy[[7]] %>% density() %>% lines(col="red") # plot values simulated 10k times
# get confidence interval of 95% ( mean +- 2 standard deviations)
two_sd_above <- mean(LMCxy[[7]]) + 2 * sd(LMCxy[[7]])
two_sd_below <- mean(LMCxy[[7]]) - 2 * sd(LMCxy[[7]])
# convert spatial object to sf class for easier/faster use
oregon_sf <- st_as_sf(oregon.tract)
# add L index values to map object
oregon_sf$Lindex <- Lxy[[2]]
# identify significant local results
oregon_sf$sig <- if_else( oregon_sf$Lindex < 2*two_sd_below, 1, if_else( oregon_sf$Lindex > 2*two_sd_above, 1, 0))
# Map of Local L index but only the significant results
ggplot() + geom_sf(data=oregon_sf, aes(fill=ifelse( sig==T, Lindex, NA)), color=NA)
这个呢?
我使用的是常规的 Moran's I 而不是您建议的 Lee Index。但我认为潜在的推理几乎是相同的。
正如您可能在下面看到的那样——以这种方式产生的结果与来自 GeoDA 的结果非常相似
library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)
#======================================================
# load data
data("oregon.tract")
# Variables to use in the correlation: white and black population in each census track
x <- oregon.tract$white
y <- oregon.tract$black
#======================================================
# Programming some functions
# Bivariate Moran's I
moran_I <- function(x, y = NULL, W){
if(is.null(y)) y = x
xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
yp <- (y - mean(y, na.rm=T))/sd(y, na.rm=T)
W[which(is.na(W))] <- 0
n <- nrow(W)
global <- (xp%*%W%*%yp)/(n - 1)
local <- (xp*W%*%yp)
list(global = global, local = as.numeric(local))
}
# Permutations for the Bivariate Moran's I
simula_moran <- function(x, y = NULL, W, nsims = 1000){
if(is.null(y)) y = x
n = nrow(W)
IDs = 1:n
xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
W[which(is.na(W))] <- 0
global_sims = NULL
local_sims = matrix(NA, nrow = n, ncol=nsims)
ID_sample = sample(IDs, size = n*nsims, replace = T)
y_s = y[ID_sample]
y_s = matrix(y_s, nrow = n, ncol = nsims)
y_s <- (y_s - apply(y_s, 1, mean))/apply(y_s, 1, sd)
global_sims <- as.numeric( (xp%*%W%*%y_s)/(n - 1) )
local_sims <- (xp*W%*%y_s)
list(global_sims = global_sims,
local_sims = local_sims)
}
#======================================================
# Adjacency Matrix (Queen)
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W <- as(lw, "symmetricMatrix")
W <- as.matrix(W/rowSums(W))
W[which(is.na(W))] <- 0
#======================================================
# Calculating the index and its simulated distribution
# for global and local values
m <- moran_I(x, y, W)
m[[1]] # global value
m_i <- m[[2]] # local values
local_sims <- simula_moran(x, y, W)$local_sims
# Identifying the significant values
alpha <- .05 # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
intervals <- t( apply(local_sims, 1, function(x) quantile(x, probs=probs)))
sig <- ( m_i < intervals[,1] ) | ( m_i > intervals[,2] )
#======================================================
# Preparing for plotting
oregon.tract <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig
# Identifying the LISA patterns
xp <- (x-mean(x))/sd(x)
yp <- (y-mean(y))/sd(y)
patterns <- as.character( interaction(xp > 0, W%*%yp > 0) )
patterns <- patterns %>%
str_replace_all("TRUE","High") %>%
str_replace_all("FALSE","Low")
patterns[oregon.tract$sig==0] <- "Not significant"
oregon.tract$patterns <- patterns
# Plotting
ggplot() + geom_sf(data=oregon.tract, aes(fill=patterns), color="NA") +
scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) +
theme_minimal()
通过更改置信区间(例如使用 90% 而不是 95%),您可以获得更接近(但不完全相同)GeoDa 的结果。
我想剩下的差异来自于计算 Moran's I 的方法略有不同。我的版本给出了与包 spdep
中可用的函数 moran
相同的值。但GeoDa可能使用了另一种方法。
我想现在添加到线程中已经很晚了,但是 Lee 的 L 与您在这里所做的完全不同,这是 Wartenberg (1985) 的创新。这有一些潜在的缺点。主要是测试x和y[的滞后之间的关系 正如@RogerioJB 通过解释空间滞后 y 是通过将模拟 y 乘以邻接矩阵来计算的那样澄清的。 Lee (2001) 的创新非常不同,涉及 Pearson 的 r 和空间平滑标量 (SSS) 的集成,而是比较 x 之间的过程和 y 而不是 y 的 滞后。 @RogerioJB 采用的方法可以通过从 lee.mc 函数生成可能的局部 l 的分布来复制。反过来,可以以类似于 GeoDa-like high-high ... low-low 显着性聚类图的样式绘制结果。
根据@justin-k 的建议,我修改了@rogeriojb 的双变量 LISA 代码以计算 Lee 的 L 统计量。此方法使用点级数据创建修改后的 lee.mc() function from the spdep package to simulate the local L values. I provide another example in a GitHub gist。
library(boot)
library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)
#======================================================
# load data
data("oregon.tract")
# Variables to use in the correlation: white and black population in each census track
x <- oregon.tract$white
y <- oregon.tract$black
# ----------------------------------------------------- #
# Program a function
## Permutations for Lee's L statistic
## Modification of the lee.mc() function within the {spdep} package
## Saves 'localL' output instead of 'L' output
simula_lee <- function(x, y, listw, nsim = nsim, zero.policy = NULL, na.action = na.fail) {
if (deparse(substitute(na.action)) == "na.pass")
stop ("na.pass not permitted")
na.act <- attr(na.action(cbind(x, y)), "na.action")
x[na.act] <- NA
y[na.act] <- NA
x <- na.action(x)
y <- na.action(y)
if (!is.null(na.act)) {
subset <- !(1:length(listw$neighbours) %in% na.act)
listw <- subset(listw, subset, zero.policy = zero.policy)
}
n <- length(listw$neighbours)
if ((n != length(x)) | (n != length(y)))
stop ("objects of different length")
gamres <- suppressWarnings(nsim > gamma(n + 1))
if (gamres)
stop ("nsim too large for this number of observations")
if (nsim < 1)
stop ("nsim too small")
xy <- data.frame(x, y)
S2 <- sum((unlist(lapply(listw$weights, sum)))^2)
lee_boot <- function(var, i, ...) {
return(lee(x = var[i, 1], y = var[i, 2], ...)$localL)
}
res <- boot(xy, statistic = lee_boot, R = nsim, sim = "permutation",
listw = listw, n = n, S2 = S2, zero.policy = zero.policy)
}
# ----------------------------------------------------- #
# Adjacency Matrix
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W <- as(lw, "symmetricMatrix")
W <- as.matrix(W / rowSums(W))
W[which(is.na(W))] <- 0
# ----------------------------------------------------- #
# Calculate the index and its simulated distribution
# for global and local values
# Global Lee's L
lee.test(x = x, y = y, listw = lw, zero.policy = TRUE,
alternative = "two.sided", na.action = na.omit)
# Local Lee's L values
m <- lee(x = x, y = y, listw = lw, n = length(x),
zero.policy = TRUE, NAOK = TRUE)
# Local Lee's L simulations
local_sims <- simula_lee(x = x, y = y, listw = lw, nsim = 10000,
zero.policy = TRUE, na.action = na.omit)
m_i <- m[[2]] # local values
# Identify the significant values
alpha <- 0.05 # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
intervals <- t(apply(t(local_sims[[2]]), 1, function(x) quantile(x, probs = probs)))
sig <- (m_i < intervals[ , 1] ) | ( m_i > intervals[ , 2])
#======================================================
# Preparing for plotting
oregon.tract <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig
# Identifying the Lee's L patterns
xp <- scale(x)
yp <- scale(y)
patterns <- as.character(interaction(xp > 0, W%*%yp > 0))
patterns <- patterns %>%
str_replace_all("TRUE","High") %>%
str_replace_all("FALSE","Low")
patterns[oregon.tract$sig == 0] <- "Not significant"
oregon.tract$patterns <- patterns
# Plotting
ggplot() +
geom_sf(data = oregon.tract, aes(fill = patterns), color = "NA") +
scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) +
guides(fill = guide_legend(title = "Lee's L clusters")) +
theme_minimal()
Lee's L clusters for oregon.tract data
我想创建一个地图来显示两个变量之间的双变量空间相关性。这可以通过制作双变量 Moran's I 空间相关性的 LISA 图或使用 Lee (2001) 提出的 L 指数来完成。
双变量 Moran's I 未在 spdep
库中实现,但 L 索引已实现,因此这是我尝试使用 L 索引但未成功的方法。显示基于莫兰的解决方案的答案我也非常欢迎!
正如您从下面的可重现示例中看到的那样,到目前为止,我已经设法计算了局部 L 索引。 我想做的 是估计伪 p 值和 create a map of the results like those maps we use in LISA spatial clusters with high-high, high-low, ..., low-low。
在这个例子中,目标是创建一个地图,其中包含黑人和白人之间的双变量 Lisa 关联。该地图应在 ggplot2
中创建,显示集群:
- 黑人比例高和白人比例高
- 黑人占比高,白人占比低
- 黑人比例低,白人比例高
- 黑人出现率低,白人出现率低
可重现的例子
library(UScensus2000tract)
library(ggplot2)
library(spdep)
library(sf)
# load data
data("oregon.tract")
# plot Census Tract map
plot(oregon.tract)
# Variables to use in the correlation: white and black population in each census track
x <- scale(oregon.tract$white)
y <- scale(oregon.tract$black)
# create Queen contiguity matrix and Spatial weights matrix
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb)
# Lee index
Lxy <-lee(x, y, lw, length(x), zero.policy=TRUE)
# Lee’s L statistic (Global)
Lxy[1]
#> -0.1865688811
# 10k permutations to estimate pseudo p-values
LMCxy <- lee.mc(x, y, nsim=10000, lw, zero.policy=TRUE, alternative="less")
# quik plot of local L
Lxy[[2]] %>% density() %>% plot() # Lee’s local L statistic (Local)
LMCxy[[7]] %>% density() %>% lines(col="red") # plot values simulated 10k times
# get confidence interval of 95% ( mean +- 2 standard deviations)
two_sd_above <- mean(LMCxy[[7]]) + 2 * sd(LMCxy[[7]])
two_sd_below <- mean(LMCxy[[7]]) - 2 * sd(LMCxy[[7]])
# convert spatial object to sf class for easier/faster use
oregon_sf <- st_as_sf(oregon.tract)
# add L index values to map object
oregon_sf$Lindex <- Lxy[[2]]
# identify significant local results
oregon_sf$sig <- if_else( oregon_sf$Lindex < 2*two_sd_below, 1, if_else( oregon_sf$Lindex > 2*two_sd_above, 1, 0))
# Map of Local L index but only the significant results
ggplot() + geom_sf(data=oregon_sf, aes(fill=ifelse( sig==T, Lindex, NA)), color=NA)
这个呢?
我使用的是常规的 Moran's I 而不是您建议的 Lee Index。但我认为潜在的推理几乎是相同的。
正如您可能在下面看到的那样——以这种方式产生的结果与来自 GeoDA 的结果非常相似
library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)
#======================================================
# load data
data("oregon.tract")
# Variables to use in the correlation: white and black population in each census track
x <- oregon.tract$white
y <- oregon.tract$black
#======================================================
# Programming some functions
# Bivariate Moran's I
moran_I <- function(x, y = NULL, W){
if(is.null(y)) y = x
xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
yp <- (y - mean(y, na.rm=T))/sd(y, na.rm=T)
W[which(is.na(W))] <- 0
n <- nrow(W)
global <- (xp%*%W%*%yp)/(n - 1)
local <- (xp*W%*%yp)
list(global = global, local = as.numeric(local))
}
# Permutations for the Bivariate Moran's I
simula_moran <- function(x, y = NULL, W, nsims = 1000){
if(is.null(y)) y = x
n = nrow(W)
IDs = 1:n
xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
W[which(is.na(W))] <- 0
global_sims = NULL
local_sims = matrix(NA, nrow = n, ncol=nsims)
ID_sample = sample(IDs, size = n*nsims, replace = T)
y_s = y[ID_sample]
y_s = matrix(y_s, nrow = n, ncol = nsims)
y_s <- (y_s - apply(y_s, 1, mean))/apply(y_s, 1, sd)
global_sims <- as.numeric( (xp%*%W%*%y_s)/(n - 1) )
local_sims <- (xp*W%*%y_s)
list(global_sims = global_sims,
local_sims = local_sims)
}
#======================================================
# Adjacency Matrix (Queen)
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W <- as(lw, "symmetricMatrix")
W <- as.matrix(W/rowSums(W))
W[which(is.na(W))] <- 0
#======================================================
# Calculating the index and its simulated distribution
# for global and local values
m <- moran_I(x, y, W)
m[[1]] # global value
m_i <- m[[2]] # local values
local_sims <- simula_moran(x, y, W)$local_sims
# Identifying the significant values
alpha <- .05 # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
intervals <- t( apply(local_sims, 1, function(x) quantile(x, probs=probs)))
sig <- ( m_i < intervals[,1] ) | ( m_i > intervals[,2] )
#======================================================
# Preparing for plotting
oregon.tract <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig
# Identifying the LISA patterns
xp <- (x-mean(x))/sd(x)
yp <- (y-mean(y))/sd(y)
patterns <- as.character( interaction(xp > 0, W%*%yp > 0) )
patterns <- patterns %>%
str_replace_all("TRUE","High") %>%
str_replace_all("FALSE","Low")
patterns[oregon.tract$sig==0] <- "Not significant"
oregon.tract$patterns <- patterns
# Plotting
ggplot() + geom_sf(data=oregon.tract, aes(fill=patterns), color="NA") +
scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) +
theme_minimal()
通过更改置信区间(例如使用 90% 而不是 95%),您可以获得更接近(但不完全相同)GeoDa 的结果。
我想剩下的差异来自于计算 Moran's I 的方法略有不同。我的版本给出了与包 spdep
中可用的函数 moran
相同的值。但GeoDa可能使用了另一种方法。
我想现在添加到线程中已经很晚了,但是 Lee 的 L 与您在这里所做的完全不同,这是 Wartenberg (1985) 的创新。这有一些潜在的缺点。主要是测试x和y[的滞后之间的关系 正如@RogerioJB 通过解释空间滞后 y 是通过将模拟 y 乘以邻接矩阵来计算的那样澄清的。 Lee (2001) 的创新非常不同,涉及 Pearson 的 r 和空间平滑标量 (SSS) 的集成,而是比较 x 之间的过程和 y 而不是 y 的 滞后。 @RogerioJB 采用的方法可以通过从 lee.mc 函数生成可能的局部 l 的分布来复制。反过来,可以以类似于 GeoDa-like high-high ... low-low 显着性聚类图的样式绘制结果。
根据@justin-k 的建议,我修改了@rogeriojb 的双变量 LISA 代码以计算 Lee 的 L 统计量。此方法使用点级数据创建修改后的 lee.mc() function from the spdep package to simulate the local L values. I provide another example in a GitHub gist。
library(boot)
library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)
#======================================================
# load data
data("oregon.tract")
# Variables to use in the correlation: white and black population in each census track
x <- oregon.tract$white
y <- oregon.tract$black
# ----------------------------------------------------- #
# Program a function
## Permutations for Lee's L statistic
## Modification of the lee.mc() function within the {spdep} package
## Saves 'localL' output instead of 'L' output
simula_lee <- function(x, y, listw, nsim = nsim, zero.policy = NULL, na.action = na.fail) {
if (deparse(substitute(na.action)) == "na.pass")
stop ("na.pass not permitted")
na.act <- attr(na.action(cbind(x, y)), "na.action")
x[na.act] <- NA
y[na.act] <- NA
x <- na.action(x)
y <- na.action(y)
if (!is.null(na.act)) {
subset <- !(1:length(listw$neighbours) %in% na.act)
listw <- subset(listw, subset, zero.policy = zero.policy)
}
n <- length(listw$neighbours)
if ((n != length(x)) | (n != length(y)))
stop ("objects of different length")
gamres <- suppressWarnings(nsim > gamma(n + 1))
if (gamres)
stop ("nsim too large for this number of observations")
if (nsim < 1)
stop ("nsim too small")
xy <- data.frame(x, y)
S2 <- sum((unlist(lapply(listw$weights, sum)))^2)
lee_boot <- function(var, i, ...) {
return(lee(x = var[i, 1], y = var[i, 2], ...)$localL)
}
res <- boot(xy, statistic = lee_boot, R = nsim, sim = "permutation",
listw = listw, n = n, S2 = S2, zero.policy = zero.policy)
}
# ----------------------------------------------------- #
# Adjacency Matrix
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W <- as(lw, "symmetricMatrix")
W <- as.matrix(W / rowSums(W))
W[which(is.na(W))] <- 0
# ----------------------------------------------------- #
# Calculate the index and its simulated distribution
# for global and local values
# Global Lee's L
lee.test(x = x, y = y, listw = lw, zero.policy = TRUE,
alternative = "two.sided", na.action = na.omit)
# Local Lee's L values
m <- lee(x = x, y = y, listw = lw, n = length(x),
zero.policy = TRUE, NAOK = TRUE)
# Local Lee's L simulations
local_sims <- simula_lee(x = x, y = y, listw = lw, nsim = 10000,
zero.policy = TRUE, na.action = na.omit)
m_i <- m[[2]] # local values
# Identify the significant values
alpha <- 0.05 # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
intervals <- t(apply(t(local_sims[[2]]), 1, function(x) quantile(x, probs = probs)))
sig <- (m_i < intervals[ , 1] ) | ( m_i > intervals[ , 2])
#======================================================
# Preparing for plotting
oregon.tract <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig
# Identifying the Lee's L patterns
xp <- scale(x)
yp <- scale(y)
patterns <- as.character(interaction(xp > 0, W%*%yp > 0))
patterns <- patterns %>%
str_replace_all("TRUE","High") %>%
str_replace_all("FALSE","Low")
patterns[oregon.tract$sig == 0] <- "Not significant"
oregon.tract$patterns <- patterns
# Plotting
ggplot() +
geom_sf(data = oregon.tract, aes(fill = patterns), color = "NA") +
scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) +
guides(fill = guide_legend(title = "Lee's L clusters")) +
theme_minimal()
Lee's L clusters for oregon.tract data