R中cdplot()的密度计算问题
Problems with density calculation of cdplot() in R
(不确定这个问题应该属于 CrossValidated 还是 Stack Overflow)
我的数据子集:
mdat1 <- structure(list(Name = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Bilbao",
"San Sebastian", "Vitoria"), class = "factor"), PrecipTotal = c(0,
1.01600203200406, 0, 6.09601219202438, 73.4061468122936, 4.31800863601727,
0, 0.254000508001016, 7.8740157480315, 5.58801117602235, 0, 0,
0, 0, 2.03200406400813, 0, 0.254000508001016, 0, 2.03200406400813,
0, 0, 0, 57.9121158242316, 1.77800355600711, 0, 0.762001524003048,
6.3500127000254, 0, 0, 1.27000254000508, 8.89001778003556, 1.01600203200406,
0, 0, 0, 0, 0.762001524003048, 0, 8.89001778003556, 0, 0, 21.8440436880874,
0, 0.508001016002032, 0, 0.508001016002032, 0.508001016002032,
0, 0, 0, 14.4780289560579, 0.254000508001016, 0.508001016002032,
0, 23.3680467360935, 6.09601219202438, 0, 0, 0, 0, 28.1940563881128,
0, 0, 0, 3.04800609601219, 0, 0, 0, 0, 6.09601219202438, 0, 2.03200406400813,
0, 4.06400812801626, 0, 0.508001016002032, 0, 0, 0.508001016002032,
7.11201422402845, 34.0360680721361, 0, 0, 0, 7.8740157480315,
0, 4.06400812801626, 0, 0, 0.508001016002032, 5.08001016002032,
7.11201422402845, 7.11201422402845, 0, 0, 0, 1.01600203200406,
0, 0, 0), Hail = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Hail",
"NoHail"), class = "factor")), .Names = c("Name", "PrecipTotal",
"Hail"), row.names = c(43878L, 33821L, 40681L, 35121L, 45112L,
46428L, 45844L, 43199L, 34440L, 43184L, 32850L, 39220L, 38416L,
33860L, 34867L, 32737L, 43232L, 31772L, 35850L, 38894L, 39289L,
33148L, 32159L, 43197L, 43962L, 45068L, 41848L, 35929L, 34842L,
42069L, 39503L, 31747L, 43286L, 34919L, 43925L, 45368L, 42489L,
41686L, 43194L, 34747L, 37001L, 42923L, 45006L, 46170L, 33191L,
34392L, 44047L, 35859L, 42159L, 38843L, 45860L, 34180L, 33846L,
42810L, 46160L, 33523L, 34840L, 40226L, 42868L, 43576L, 46570L,
39980L, 42453L, 42063L, 38121L, 32822L, 40670L, 32859L, 46228L,
40239L, 32420L, 38874L, 39638L, 39523L, 31765L, 32753L, 33752L,
35574L, 36263L, 32871L, 32539L, 38455L, 41119L, 45124L, 34560L,
34144L, 41461L, 41449L, 35499L, 42783L, 34106L, 38151L, 36313L,
46593L, 39973L, 43928L, 35240L, 43626L, 46195L, 44388L), class = "data.frame")
使用以下代码
cdplot(mdat1 [, 2], mdat1 [, 3], ylab = "", main = "1",
xlab = "",
col = c("purple", "gray"))
会得到 cdplot()
的混乱输出(图 "1")。使用原始数据的另一个样本则会生成标有 "2"
的输出。
我猜这与 x 值的分布有关?如果它们非常偏斜(比如图 "1" 的情况),密度计算是否就会出问题?
我认为您可能需要考虑先对 PrecipTotal
变量做变换,然后再创建条件密度图。经过一番尝试之后,似乎对该变量取 sqrt
可能就足够了。我们可能还需要调整带宽(bandwidth,即 bw)
以获得更好看的图。
显然,做了这些变换和调整之后,我们在解释变量之间的关系时必须非常小心。
基础 R
使用 cdplot
cdplot(Hail ~ sqrt(PrecipTotal), data = mdat1)
ggplot2
使用 geom_density
和 position = 'fill'
library(ggplot2)
ggplot(mdat1, aes(sqrt(PrecipTotal)))+
geom_density(aes(fill = Hail), position = 'fill')+
theme_bw()
ggplot2
并设置一些额外选项
ggplot(mdat1, aes(sqrt(PrecipTotal)))+
geom_density(aes(fill = Hail), position = 'fill',
kernel = 'cosine', adjust = 1.1)+
theme_bw()
下面是我在不修改您的数据的情况下,仅仅调整 bw
参数时得到的结果,所以我的建议是:多试几个 bw
参数值即可。
cdplot(mdat1 [, 2], mdat1 [, 3], ylab = "",
xlab = "",
col = c("purple", "gray"), bw = 1)
cdplot(mdat1 [, 2], mdat1 [, 3], ylab = "",
xlab = "",
col = c("purple", "gray"), bw = 2)
我认为这就是一个 bug,不过帮助页面中的那句 "conditional densities are more reliable for high-density regions of x" 也算是一个相当隐晦的警告。请将上述所有尝试与 lattice 的 densityplot
得到的结果进行对比(在我看来,后者更加清晰、信息量也更大)。cdplot
和 ggplot
的做法似乎严重扭曲了数据。
library(lattice)
densityplot(~PrecipTotal, groups=Hail, mdat1, col = c("purple", "gray"))
您可以将上面这种数据展示方式,与下面这段代码得到的、看起来不那么病态的输出进行对比:
cdplot(Hail ~ PrecipTotal, data=mdat1, bw=2)
……但这仍然会给人留下这样的印象:两组在 45–65 区间的密度存在显著差异;而并排显示则表明,这一差异不过是其中一组的一个点和另一组的一个点,用随机波动来解释似乎要容易得多。
需要指出的一点是,lattice 的绘图参数约定是:在公式中包含分组变量会生成各自独立的图(面板),而使用 groups=
机制则会把各个分组画在同一个绘图区域内。
(不确定这个问题应该属于 CrossValidated 还是 Stack Overflow)
我的数据子集:
mdat1 <- structure(list(Name = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Bilbao",
"San Sebastian", "Vitoria"), class = "factor"), PrecipTotal = c(0,
1.01600203200406, 0, 6.09601219202438, 73.4061468122936, 4.31800863601727,
0, 0.254000508001016, 7.8740157480315, 5.58801117602235, 0, 0,
0, 0, 2.03200406400813, 0, 0.254000508001016, 0, 2.03200406400813,
0, 0, 0, 57.9121158242316, 1.77800355600711, 0, 0.762001524003048,
6.3500127000254, 0, 0, 1.27000254000508, 8.89001778003556, 1.01600203200406,
0, 0, 0, 0, 0.762001524003048, 0, 8.89001778003556, 0, 0, 21.8440436880874,
0, 0.508001016002032, 0, 0.508001016002032, 0.508001016002032,
0, 0, 0, 14.4780289560579, 0.254000508001016, 0.508001016002032,
0, 23.3680467360935, 6.09601219202438, 0, 0, 0, 0, 28.1940563881128,
0, 0, 0, 3.04800609601219, 0, 0, 0, 0, 6.09601219202438, 0, 2.03200406400813,
0, 4.06400812801626, 0, 0.508001016002032, 0, 0, 0.508001016002032,
7.11201422402845, 34.0360680721361, 0, 0, 0, 7.8740157480315,
0, 4.06400812801626, 0, 0, 0.508001016002032, 5.08001016002032,
7.11201422402845, 7.11201422402845, 0, 0, 0, 1.01600203200406,
0, 0, 0), Hail = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Hail",
"NoHail"), class = "factor")), .Names = c("Name", "PrecipTotal",
"Hail"), row.names = c(43878L, 33821L, 40681L, 35121L, 45112L,
46428L, 45844L, 43199L, 34440L, 43184L, 32850L, 39220L, 38416L,
33860L, 34867L, 32737L, 43232L, 31772L, 35850L, 38894L, 39289L,
33148L, 32159L, 43197L, 43962L, 45068L, 41848L, 35929L, 34842L,
42069L, 39503L, 31747L, 43286L, 34919L, 43925L, 45368L, 42489L,
41686L, 43194L, 34747L, 37001L, 42923L, 45006L, 46170L, 33191L,
34392L, 44047L, 35859L, 42159L, 38843L, 45860L, 34180L, 33846L,
42810L, 46160L, 33523L, 34840L, 40226L, 42868L, 43576L, 46570L,
39980L, 42453L, 42063L, 38121L, 32822L, 40670L, 32859L, 46228L,
40239L, 32420L, 38874L, 39638L, 39523L, 31765L, 32753L, 33752L,
35574L, 36263L, 32871L, 32539L, 38455L, 41119L, 45124L, 34560L,
34144L, 41461L, 41449L, 35499L, 42783L, 34106L, 38151L, 36313L,
46593L, 39973L, 43928L, 35240L, 43626L, 46195L, 44388L), class = "data.frame")
使用以下代码
cdplot(mdat1 [, 2], mdat1 [, 3], ylab = "", main = "1",
xlab = "",
col = c("purple", "gray"))
会得到 cdplot()
的混乱输出(图 "1")。使用原始数据的另一个样本则会生成标有 "2" 的输出。
我猜这与 x 值的分布有关?如果它们非常偏斜(比如图 "1" 的情况),密度计算是否就会出问题?
我认为您可能需要考虑先对 PrecipTotal
变量做变换,然后再创建条件密度图。经过一番尝试之后,似乎对该变量取 sqrt
可能就足够了。我们可能还需要调整带宽(bandwidth,即 bw)
以获得更好看的图。
显然,做了这些变换和调整之后,我们在解释变量之间的关系时必须非常小心。
基础 R
使用 cdplot
cdplot(Hail ~ sqrt(PrecipTotal), data = mdat1)
ggplot2
使用 geom_density
和 position = 'fill'
library(ggplot2)
ggplot(mdat1, aes(sqrt(PrecipTotal)))+
geom_density(aes(fill = Hail), position = 'fill')+
theme_bw()
ggplot2
并设置一些额外选项
ggplot(mdat1, aes(sqrt(PrecipTotal)))+
geom_density(aes(fill = Hail), position = 'fill',
kernel = 'cosine', adjust = 1.1)+
theme_bw()
下面是我在不修改您的数据的情况下,仅仅调整 bw
参数时得到的结果,所以我的建议是:多试几个 bw
参数值即可。
cdplot(mdat1 [, 2], mdat1 [, 3], ylab = "",
xlab = "",
col = c("purple", "gray"), bw = 1)
cdplot(mdat1 [, 2], mdat1 [, 3], ylab = "",
xlab = "",
col = c("purple", "gray"), bw = 2)
我认为这就是一个 bug,不过帮助页面中的那句 "conditional densities are more reliable for high-density regions of x" 也算是一个相当隐晦的警告。请将上述所有尝试与 lattice 的 densityplot
得到的结果进行对比(在我看来,后者更加清晰、信息量也更大)。cdplot
和 ggplot
的做法似乎严重扭曲了数据。
library(lattice)
densityplot(~PrecipTotal, groups=Hail, mdat1, col = c("purple", "gray"))
您可以将上面这种数据展示方式,与下面这段代码得到的、看起来不那么病态的输出进行对比:
cdplot(Hail ~ PrecipTotal, data=mdat1, bw=2)
……但这仍然会给人留下这样的印象:两组在 45–65 区间的密度存在显著差异;而并排显示则表明,这一差异不过是其中一组的一个点和另一组的一个点,用随机波动来解释似乎要容易得多。
需要指出的一点是,lattice 的绘图参数约定是:在公式中包含分组变量会生成各自独立的图(面板),而使用 groups=
机制则会把各个分组画在同一个绘图区域内。