R:如何可视化连续数据和分类数据之间的关系
R: how to visualize the relationship between continuous and categorical data
我有以下 data.frame,其中包含 3 个分类变量(不同类型的血管病理学分级)和 1 个连续变量(Output)。我想了解 Output 与不同类型血管病理之间的关系,即 Output 较高或较低是否与病理程度较轻(mild)或较重(severe)相关?
> dput(df)
structure(list(Vascular_Pathology_M = structure(c(1L, 2L, 3L,
1L, 1L, 2L, 4L, 3L, 1L, 2L), .Label = c("Absent", "Mild", "Mild/Moderate",
"Moderate/Severe", "Severe"), class = "factor"), Vascular_Pathology_F = structure(c(4L,
2L, 1L, 1L, 1L, 1L, 2L, 4L, 1L, 1L), .Label = c("Absent", "Mild",
"Mild/Moderate", "Moderate/Severe", "Severe"), class = "factor"),
Vascular_Pathology_O = structure(c(1L, 3L, 4L, 3L, 1L, 2L,
1L, 1L, 1L, 2L), .Label = c("Absent", "Mild", "Mild/Moderate",
"Moderate/Severe"), class = "factor"), Output = c(1.01789418758932,
1.05627630598801, 1.49233946102323, 1.38192374975672, 1.13097652937671,
0.861306979571144, 0.707820561413699, 1.16628243128399, 0.983163398006992,
1.23972603843843)), .Names = c("Vascular_Pathology_M", "Vascular_Pathology_F",
"Vascular_Pathology_O", "Output"), row.names = c(1L, 3L, 4L,
5L, 6L, 7L, 8L, 10L, 11L, 12L), class = "data.frame")
> df
Vascular_Pathology_M Vascular_Pathology_F Vascular_Pathology_O Output
1 Absent Moderate/Severe Absent 1.0178942
3 Mild Mild Mild/Moderate 1.0562763
4 Mild/Moderate Absent Moderate/Severe 1.4923395
5 Absent Absent Mild/Moderate 1.3819237
6 Absent Absent Absent 1.1309765
7 Mild Absent Mild 0.8613070
8 Moderate/Severe Mild Absent 0.7078206
10 Mild/Moderate Moderate/Severe Absent 1.1662824
11 Absent Absent Absent 0.9831634
12 Mild Absent Mild 1.2397260
您可以简单地根据分类变量绘制输出图
# Output against each pathology grading. With a factor on the right-hand side
# plot() draws one box per level. The formula interface selects columns by
# name instead of by position and labels the axes with the column names
# rather than with the literal expressions "df[, 1]" / "df[, 4]".
plot(Output ~ Vascular_Pathology_M, data = df)
plot(Output ~ Vascular_Pathology_F, data = df)
plot(Output ~ Vascular_Pathology_O, data = df)
您有一个 4 维数据集。一种选择是用小多图(small multiples,即分面,占一个维度)绘制散点图(x/y 占两个维度),并将输出变量映射到点的大小之类的视觉属性(第四个维度)。
例如,将数据放入名为 my_dat 的 data.frame 中(因为 df 在 R 中已经是一个函数的名称)。对点做抖动(jitter)以显示落在同一位置的多个观测值,并按 Y 轴类别着色,以便看清每个点属于哪个类别。
library(ggplot2)

# Facet labels: prefix every level of the third column with "Vasc Path O:".
# `levels` is passed explicitly so that `labels` always has the same length
# as the level set; without it factor() first drops unused levels and then
# rejects a `labels` vector built from the full levels(). `[[3]]` extracts
# the column as a vector for both data.frames and tibbles.
my_dat$O_with_labels <- factor(
  my_dat[[3]],
  levels = levels(my_dat[[3]]),
  labels = paste("Vasc Path O:", levels(my_dat[[3]]))
)

# x / y position: the M and F gradings; one panel per labelled O grading;
# point size encodes Output. Jitter separates observations that share a cell,
# and colour repeats the y category so jittered points stay attributable.
ggplot(my_dat, aes(x = Vascular_Pathology_M, y = Vascular_Pathology_F)) +
  geom_jitter(aes(size = Output, color = Vascular_Pathology_F)) +
  facet_wrap(~O_with_labels) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
您也可以查看各种病理分级的交互作用(组合)。例如,使用条形图:
library(dplyr)
library(ggplot2)  # ggplot() is used below, so load it in this snippet too

## Make the interaction variable: the three pathology gradings pasted
## together with "_". drop = TRUE keeps only the combinations that occur in
## the data instead of every possible product of levels.
df$interact <- interaction(
  df[c(
    "Vascular_Pathology_M",
    "Vascular_Pathology_F",
    "Vascular_Pathology_O"
  )],
  sep = "_",
  drop = TRUE
)

## Look at means of groups
means <- df %>%
  group_by(interact) %>%
  dplyr::summarise(Output = mean(Output))

# geom_col() takes bar heights straight from the data (the equivalent of
# geom_bar(stat = "identity")). Only the x-axis labels are rotated; the
# numeric y-axis labels stay horizontal.
ggplot(means, aes(interact, Output)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Interaction")
或者使用散点图:
# Individual observations per pathology combination, with the group means
# held in `means` drawn on top in red. Only the x-axis labels are rotated;
# the y axis is fixed to the range 0-1.6.
ggplot(df, aes(interact, Output)) +
  geom_point() +
  geom_point(data = means, col = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Interaction") +
  ylim(0, 1.6)
我有以下 data.frame,其中包含 3 个分类变量(不同类型的血管病理学分级)和 1 个连续变量(Output)。我想了解 Output 与不同类型血管病理之间的关系,即 Output 较高或较低是否与病理程度较轻(mild)或较重(severe)相关?
> dput(df)
structure(list(Vascular_Pathology_M = structure(c(1L, 2L, 3L,
1L, 1L, 2L, 4L, 3L, 1L, 2L), .Label = c("Absent", "Mild", "Mild/Moderate",
"Moderate/Severe", "Severe"), class = "factor"), Vascular_Pathology_F = structure(c(4L,
2L, 1L, 1L, 1L, 1L, 2L, 4L, 1L, 1L), .Label = c("Absent", "Mild",
"Mild/Moderate", "Moderate/Severe", "Severe"), class = "factor"),
Vascular_Pathology_O = structure(c(1L, 3L, 4L, 3L, 1L, 2L,
1L, 1L, 1L, 2L), .Label = c("Absent", "Mild", "Mild/Moderate",
"Moderate/Severe"), class = "factor"), Output = c(1.01789418758932,
1.05627630598801, 1.49233946102323, 1.38192374975672, 1.13097652937671,
0.861306979571144, 0.707820561413699, 1.16628243128399, 0.983163398006992,
1.23972603843843)), .Names = c("Vascular_Pathology_M", "Vascular_Pathology_F",
"Vascular_Pathology_O", "Output"), row.names = c(1L, 3L, 4L,
5L, 6L, 7L, 8L, 10L, 11L, 12L), class = "data.frame")
> df
Vascular_Pathology_M Vascular_Pathology_F Vascular_Pathology_O Output
1 Absent Moderate/Severe Absent 1.0178942
3 Mild Mild Mild/Moderate 1.0562763
4 Mild/Moderate Absent Moderate/Severe 1.4923395
5 Absent Absent Mild/Moderate 1.3819237
6 Absent Absent Absent 1.1309765
7 Mild Absent Mild 0.8613070
8 Moderate/Severe Mild Absent 0.7078206
10 Mild/Moderate Moderate/Severe Absent 1.1662824
11 Absent Absent Absent 0.9831634
12 Mild Absent Mild 1.2397260
您可以简单地根据分类变量绘制输出图
# Output against each pathology grading. With a factor on the right-hand side
# plot() draws one box per level. The formula interface selects columns by
# name instead of by position and labels the axes with the column names
# rather than with the literal expressions "df[, 1]" / "df[, 4]".
plot(Output ~ Vascular_Pathology_M, data = df)
plot(Output ~ Vascular_Pathology_F, data = df)
plot(Output ~ Vascular_Pathology_O, data = df)
您有一个 4 维数据集。一种选择是用小多图(small multiples,即分面,占一个维度)绘制散点图(x/y 占两个维度),并将输出变量映射到点的大小之类的视觉属性(第四个维度)。
例如,将数据放入名为 my_dat 的 data.frame 中(因为 df 在 R 中已经是一个函数的名称)。对点做抖动(jitter)以显示落在同一位置的多个观测值,并按 Y 轴类别着色,以便看清每个点属于哪个类别。
library(ggplot2)

# Facet labels: prefix every level of the third column with "Vasc Path O:".
# `levels` is passed explicitly so that `labels` always has the same length
# as the level set; without it factor() first drops unused levels and then
# rejects a `labels` vector built from the full levels(). `[[3]]` extracts
# the column as a vector for both data.frames and tibbles.
my_dat$O_with_labels <- factor(
  my_dat[[3]],
  levels = levels(my_dat[[3]]),
  labels = paste("Vasc Path O:", levels(my_dat[[3]]))
)

# x / y position: the M and F gradings; one panel per labelled O grading;
# point size encodes Output. Jitter separates observations that share a cell,
# and colour repeats the y category so jittered points stay attributable.
ggplot(my_dat, aes(x = Vascular_Pathology_M, y = Vascular_Pathology_F)) +
  geom_jitter(aes(size = Output, color = Vascular_Pathology_F)) +
  facet_wrap(~O_with_labels) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
您也可以查看各种病理分级的交互作用(组合)。例如,使用条形图:
library(dplyr)
library(ggplot2)  # ggplot() is used below, so load it in this snippet too

## Make the interaction variable: the three pathology gradings pasted
## together with "_". drop = TRUE keeps only the combinations that occur in
## the data instead of every possible product of levels.
df$interact <- interaction(
  df[c(
    "Vascular_Pathology_M",
    "Vascular_Pathology_F",
    "Vascular_Pathology_O"
  )],
  sep = "_",
  drop = TRUE
)

## Look at means of groups
means <- df %>%
  group_by(interact) %>%
  dplyr::summarise(Output = mean(Output))

# geom_col() takes bar heights straight from the data (the equivalent of
# geom_bar(stat = "identity")). Only the x-axis labels are rotated; the
# numeric y-axis labels stay horizontal.
ggplot(means, aes(interact, Output)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Interaction")
或者使用散点图:
# Individual observations per pathology combination, with the group means
# held in `means` drawn on top in red. Only the x-axis labels are rotated;
# the y axis is fixed to the range 0-1.6.
ggplot(df, aes(interact, Output)) +
  geom_point() +
  geom_point(data = means, col = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Interaction") +
  ylim(0, 1.6)