使用 interp 推断数据不会产生准确的图像
Extrapolating data with interp not producing accurate image
我有一张图表,其中外推与初始内插不匹配。我希望热图填满整个图像。
一、插值代码:
library(akima)
library(reshape2)
xmin <- signif(min(CBLo2$MD1))
xmax <- signif(max(CBLo2$MD1))
ymin <- signif(min(CBLo2$MD2))
ymax <- signif(max(CBLo2$MD2))
gridint <- 100
fld <- with(CBLo2, interp(x = MD1, y = MD2, z = Abundance,
xo=seq(xmin, xmax, length=gridint), yo=seq(ymin, ymax, length=gridint) ))
df <- melt(fld$z, na.rm = TRUE)
names(df) <- c("MD1", "MD2", "Abundance")
df$MD1 <- fld$x[df$MD1]
df$MD2 <- fld$y[df$MD2]
contour(fld) # test plot
我不会 post 整个 ggplot 代码(用于下图),只是生成热图所需的代码:
ggplot() +
geom_tile(inherit.aes=FALSE,data = df, aes(x = MD1, y = MD2,fill = Abundance)) +
scale_fill_continuous(name = "Rain (mm)", low = "yellow", high = "green")
但是,当我尝试推断数据时(根据其他 post 的示例),我得到了以下图表,它与第一口井完全不匹配:
fld <- with(CBLo2, interp(x = MD1, y = MD2, z = Abundance, extrap=TRUE, linear=FALSE,
xo=seq(xmin, xmax, length=gridint), yo=seq(ymin, ymax, length=gridint) ))
这是数据:
Abundance MD1 MD2
9 -0.59042 0.76793119
42 -0.48544284 -0.09465043
13 0.51250586 -0.24599322
84 -0.30857525 -0.21529624
2 0.90449257 0.679926
16 0.24536209 0.24016424
52 -0.43144002 -0.75474149
4 1.23830339 -0.11985391
37 -1.10235817 0.33886773
79 0.01757236 -0.59635386
我做错了什么?我怎样才能使外推更准确?
TLDR 解决方案:
将linear = FALSE
添加到所有interp()
代码以保持一致性,并在scale_fill_continuous()
中指定相同的限制。
解释:
这里有两个问题。
问题1:生成第一个fld
的代码不包含参数linear = FALSE
,而用于第二个的代码。
让我们比较一下插值:
library(dplyr)
fld1 <- with(CBLo2,
interp(x = MD1, y = MD2, z = Abundance,
xo=seq(xmin, xmax, length=gridint),
yo=seq(ymin, ymax, length=gridint) ))
df1 <- melt(fld1$z, na.rm = TRUE) # 6426 obs
fld2 <- with(CBLo2,
interp(x = MD1, y = MD2, z = Abundance,
extrap = TRUE, linear = FALSE,
xo=seq(xmin, xmax, length=gridint),
yo=seq(ymin, ymax, length=gridint) ))
df2 <- melt(fld2$z, na.rm = TRUE) #1000 obs
df.combined <- left_join(df2, df1, by = c("Var1", "Var2"))
df.combined %>%
filter(!is.na(value.y)) %>% # compare for the overlapping range
mutate(diff = value.x - value.y) %>%
select(diff) %>%
summary()
diff
Min. :-303.360
1st Qu.: -42.399
Median : 8.763
Mean : -7.552
3rd Qu.: 36.132
Max. : 238.647
现在将linear = FALSE
添加到第一个fld
:
fld3 <- with(CBLo2,
interp(x = MD1, y = MD2, z = Abundance,
linear = FALSE,
xo=seq(xmin, xmax, length=gridint),
yo=seq(ymin, ymax, length=gridint) ))
df3 <- melt(fld3$z, na.rm = TRUE) # 6426 obs
df.combined <- left_join(df2, df3, by = c("Var1", "Var2"))
df.combined %>%
filter(!is.na(value.y)) %>%
mutate(diff = value.x - value.y) %>%
select(diff) %>%
summary()
diff
Min. :0
1st Qu.:0
Median :0
Mean :0
3rd Qu.:0
Max. :0
问题2:插值范围差别很大
# define column names
names(df2) <- c("MD1", "MD2", "Abundance")
names(df3) <- c("MD1", "MD2", "Abundance")
> range(df2$Abundance)
[1] -1136.341 420.369
> range(df3$Abundance)
[1] -297.9161 241.6618
我们可以看到,即使值在相同的 MD1/MD2 坐标处匹配,扩展的 df2 中的值范围也远远超过 df3 的范围。为了确保丰度值和颜色之间的映射相同,我们必须根据两者的组合范围来指定填充限制。
我将使用丑陋但视觉上明显不同的渐变来说明这一点:
library(gridExtra)
p <- ggplot() +
scale_fill_gradientn(name = "Rain (mm)", colours = rainbow(15),
limits = range(c(df2$Abundance, df3$Abundance)))
grid.arrange(p + geom_tile(data = df3, aes(x = MD1, y = MD2, fill = Abundance)),
p + geom_tile(data = df2, aes(x = MD1, y = MD2, fill = Abundance)),
nrow = 1)
如果我们叠加图,它们会完全重叠(调整透明度以显示 df3 的边缘):
p +
geom_tile(data = df3, aes(x = MD1, y = MD2, fill = Abundance), alpha = 0.5) +
geom_tile(data = df2, aes(x = MD1, y = MD2, fill = Abundance), alpha = 0.5)
我有一张图表,其中外推与初始内插不匹配。我希望热图填满整个图像。
一、插值代码:
library(akima)
library(reshape2)
xmin <- signif(min(CBLo2$MD1))
xmax <- signif(max(CBLo2$MD1))
ymin <- signif(min(CBLo2$MD2))
ymax <- signif(max(CBLo2$MD2))
gridint <- 100
fld <- with(CBLo2, interp(x = MD1, y = MD2, z = Abundance,
xo=seq(xmin, xmax, length=gridint), yo=seq(ymin, ymax, length=gridint) ))
df <- melt(fld$z, na.rm = TRUE)
names(df) <- c("MD1", "MD2", "Abundance")
df$MD1 <- fld$x[df$MD1]
df$MD2 <- fld$y[df$MD2]
contour(fld) # test plot
我不会 post 整个 ggplot 代码(用于下图),只是生成热图所需的代码:
ggplot() +
geom_tile(inherit.aes=FALSE,data = df, aes(x = MD1, y = MD2,fill = Abundance)) +
scale_fill_continuous(name = "Rain (mm)", low = "yellow", high = "green")
但是,当我尝试推断数据时(根据其他 post 的示例),我得到了以下图表,它与第一口井完全不匹配:
fld <- with(CBLo2, interp(x = MD1, y = MD2, z = Abundance, extrap=TRUE, linear=FALSE,
xo=seq(xmin, xmax, length=gridint), yo=seq(ymin, ymax, length=gridint) ))
这是数据:
Abundance MD1 MD2
9 -0.59042 0.76793119
42 -0.48544284 -0.09465043
13 0.51250586 -0.24599322
84 -0.30857525 -0.21529624
2 0.90449257 0.679926
16 0.24536209 0.24016424
52 -0.43144002 -0.75474149
4 1.23830339 -0.11985391
37 -1.10235817 0.33886773
79 0.01757236 -0.59635386
我做错了什么?我怎样才能使外推更准确?
TLDR 解决方案:
将linear = FALSE
添加到所有interp()
代码以保持一致性,并在scale_fill_continuous()
中指定相同的限制。
解释:
这里有两个问题。
问题1:生成第一个fld
的代码不包含参数linear = FALSE
,而用于第二个的代码。
让我们比较一下插值:
library(dplyr)
fld1 <- with(CBLo2,
interp(x = MD1, y = MD2, z = Abundance,
xo=seq(xmin, xmax, length=gridint),
yo=seq(ymin, ymax, length=gridint) ))
df1 <- melt(fld1$z, na.rm = TRUE) # 6426 obs
fld2 <- with(CBLo2,
interp(x = MD1, y = MD2, z = Abundance,
extrap = TRUE, linear = FALSE,
xo=seq(xmin, xmax, length=gridint),
yo=seq(ymin, ymax, length=gridint) ))
df2 <- melt(fld2$z, na.rm = TRUE) #1000 obs
df.combined <- left_join(df2, df1, by = c("Var1", "Var2"))
df.combined %>%
filter(!is.na(value.y)) %>% # compare for the overlapping range
mutate(diff = value.x - value.y) %>%
select(diff) %>%
summary()
diff
Min. :-303.360
1st Qu.: -42.399
Median : 8.763
Mean : -7.552
3rd Qu.: 36.132
Max. : 238.647
现在将linear = FALSE
添加到第一个fld
:
fld3 <- with(CBLo2,
interp(x = MD1, y = MD2, z = Abundance,
linear = FALSE,
xo=seq(xmin, xmax, length=gridint),
yo=seq(ymin, ymax, length=gridint) ))
df3 <- melt(fld3$z, na.rm = TRUE) # 6426 obs
df.combined <- left_join(df2, df3, by = c("Var1", "Var2"))
df.combined %>%
filter(!is.na(value.y)) %>%
mutate(diff = value.x - value.y) %>%
select(diff) %>%
summary()
diff
Min. :0
1st Qu.:0
Median :0
Mean :0
3rd Qu.:0
Max. :0
问题2:插值范围差别很大
# define column names
names(df2) <- c("MD1", "MD2", "Abundance")
names(df3) <- c("MD1", "MD2", "Abundance")
> range(df2$Abundance)
[1] -1136.341 420.369
> range(df3$Abundance)
[1] -297.9161 241.6618
我们可以看到,即使值在相同的 MD1/MD2 坐标处匹配,扩展的 df2 中的值范围也远远超过 df3 的范围。为了确保丰度值和颜色之间的映射相同,我们必须根据两者的组合范围来指定填充限制。
我将使用丑陋但视觉上明显不同的渐变来说明这一点:
library(gridExtra)
p <- ggplot() +
scale_fill_gradientn(name = "Rain (mm)", colours = rainbow(15),
limits = range(c(df2$Abundance, df3$Abundance)))
grid.arrange(p + geom_tile(data = df3, aes(x = MD1, y = MD2, fill = Abundance)),
p + geom_tile(data = df2, aes(x = MD1, y = MD2, fill = Abundance)),
nrow = 1)
如果我们叠加图,它们会完全重叠(调整透明度以显示 df3 的边缘):
p +
geom_tile(data = df3, aes(x = MD1, y = MD2, fill = Abundance), alpha = 0.5) +
geom_tile(data = df2, aes(x = MD1, y = MD2, fill = Abundance), alpha = 0.5)