平行坐标图中每个级别的单独 y 轴
Individual y-axis for each level in parallel coordinates plot
我正在尝试生成平行坐标图,其中每个变量都有自己的轴。例如:
到目前为止,我已经使用了包 GGally
中的函数 ggparcoord()
。但是,据我所知,它不允许每个变量都像上面那样拥有自己的轴。
有谁知道如何使用 R
,最好使用 ggplot2
来完成此操作?提前致谢。
我不知道有什么包可以做到这一点,但在 ggplot 中自己绘制轴并不太难。
假设我们有一个与示例图中显示的数据集相似的数据集:
library(ggplot2)
library(dplyr)
cars <- mtcars %>%
select(c(2:4, 6:7, 1)) %>%
tibble::rownames_to_column("model") %>%
as_tibble()
cars
#> # A tibble: 32 x 7
#> model cyl disp hp wt qsec mpg
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 6 160 110 2.62 16.5 21
#> 2 Mazda RX4 Wag 6 160 110 2.88 17.0 21
#> 3 Datsun 710 4 108 93 2.32 18.6 22.8
#> 4 Hornet 4 Drive 6 258 110 3.22 19.4 21.4
#> 5 Hornet Sportabout 8 360 175 3.44 17.0 18.7
#> 6 Valiant 6 225 105 3.46 20.2 18.1
#> 7 Duster 360 8 360 245 3.57 15.8 14.3
#> 8 Merc 240D 4 147. 62 3.19 20 24.4
#> 9 Merc 230 4 141. 95 3.15 22.9 22.8
#> 10 Merc 280 6 168. 123 3.44 18.3 19.2
#> # ... with 22 more rows
我们可以使用一些简单的算法计算轴中断(并设置刻度线的坐标):
axis_df <- stack(cars[-1]) %>%
group_by(ind) %>%
summarize(breaks = pretty(values, n = 10),
yval = (breaks - min(breaks))/(max(values) - min(values))) %>%
mutate(xmin = as.numeric(ind) - 0.05,
xmax = as.numeric(ind),
x_text = as.numeric(ind) - 0.2)
我们实际轴线的坐标是这样的:
axis_line_df <- axis_df %>%
group_by(ind) %>%
summarize(min = min(yval), max = max(yval))
现在我们需要对原始数据进行整形和规范化:
lines_df <- cars[-1] %>%
mutate(across(everything(), function(x) (x - min(x))/(max(x) - min(x)))) %>%
stack() %>%
mutate(row = rep(cars$model, ncol(cars) - 1))
最后,绘图代码将类似于:
ggplot(lines_df, aes(ind, values, group = row)) +
geom_line(color = "orange", alpha = 0.5) +
geom_segment(data = axis_line_df, aes(x = ind, xend = ind, y = min, yend = max),
inherit.aes = FALSE) +
geom_segment(data = axis_df, aes(x = xmin, xend = xmax, y = yval, yend = yval),
inherit.aes = FALSE) +
geom_text(data = axis_df, aes(x = x_text, y = yval, label = breaks),
inherit.aes = FALSE) +
geom_text(data = axis_line_df, aes(x = ind, y = 1.2, label = ind),
size = 6, inherit.aes = FALSE, check_overlap = TRUE, hjust = 1) +
theme_void() +
theme(plot.margin = margin(50, 20, 50, 20))
由 reprex package (v2.0.0)
于 2021-10-24 创建
再次感谢@Allan Cameron 的精彩回答。我使用他的代码编写了一个模仿 GGally::ggparcoord()
的函数,但具有单独的 y 轴。这里,轴的归一化中断并完成数据,以便轴的高度相同。
我还添加了一个参数 truth
,这是一个可选的 data.frame
,包含要为每个变量绘制的点;在我的应用程序的上下文中,这些线对应于参数估计,truth
点是我们试图估计的真实值。
函数如下:
ggparcoord_ind_yaxis <- function(
data,
truth = NULL,
truthPointSize = 2,
columns = 1:ncol(data),
groupColumn = NULL,
alphaLines = 1,
nbreaks = 4,
axis_font_size = 3
) {
# select the variables to plot
data_subset <- data %>% select(columns)
# re-order truth to match columns
col_names <- data_subset %>% names
if (!is.null(truth)) {
truth <- truth %>% select(col_names)
data_subset <- data_subset %>% rbind(truth)
}
# Calculate the axis breaks for each variable on the *original* scale.
# Note that the breaks computed by pretty() are guaranteed to contain all of
# the data. We include truth in these breaks, just in case one of the true
# points falls outside the range of the data (can easily happen in the context
# of comparing parameter estimates to the true values).
breaks_df <- data_subset %>%
stack %>% # convert to long format
group_by(ind) %>% # group by the plotting variables
summarize(breaks = pretty(values, n = nbreaks))
# Normalise the breaks to be between 0 and 1, and set the coordinates of the
# tick marks. Importantly, if we want the axis heights to be the same, the
# breaks need to be normalised to be between exactly 0 and 1.
axis_df <- breaks_df %>%
mutate(yval = (breaks - min(breaks))/(max(breaks) - min(breaks))) %>%
mutate(xmin = as.numeric(ind) - 0.05,
xmax = as.numeric(ind),
x_text = as.numeric(ind) - 0.2)
# Calculate the co-ordinates for our axis lines:
axis_line_df <- axis_df %>%
group_by(ind) %>%
summarize(min = min(yval), max = max(yval))
# Getting the minimum/maximum breaks on the original scale, to scale the
# data in the same manner that we scaled the breaks
minmax_breaks <- breaks_df %>%
summarize(min_break = min(breaks), max_break = max(breaks)) %>%
tibble::column_to_rownames(var = "ind")
# Normalise the original data in the same way that the breaks were normalised.
# This ensures that the scaling is correct.
# Do the same for the truth points, if they exist.
lines_df <- data %>% select(columns)
for (col in col_names) {
lines_df[, col] <- (lines_df[, col] - minmax_breaks[col, "min_break"]) / ( minmax_breaks[col, "max_break"] - minmax_breaks[col, "min_break"])
if (!is.null(truth)) {
truth[, col] <- (truth[, col] - minmax_breaks[col, "min_break"]) / ( minmax_breaks[col, "max_break"] - minmax_breaks[col, "min_break"])
}
}
# Reshape original data (and truth):
lines_df <- lines_df %>%
mutate(row = row_number()) %>% # need row information to group individual rows
bind_cols(data[, groupColumn, drop = FALSE]) %>% # need groupColumn for colour aesthetic
reshape2::melt(id.vars = c("row", groupColumn),
# choose names that are consistent with stack() above:
value.name = "values", variable.name = "ind")
# Reshape truth, as above
if (!is.null(truth)) {
truth <- truth %>%
mutate(row = row_number()) %>% # need row information to group individual rows
reshape2::melt(id.vars = c("row"),
# choose names that are consistent with stack():
value.name = "values", variable.name = "ind")
}
# Now plot:
gg <- ggplot() +
geom_line(data = lines_df %>% sample_n(nrow(.)), # permute rows to prevent one group dominating over another
aes_string(x = "ind", y = "values", group = "row", colour = groupColumn),
alpha = alphaLines) +
geom_segment(data = axis_line_df, aes(x = ind, xend = ind, y = min, yend = max),
inherit.aes = FALSE) +
geom_segment(data = axis_df, aes(x = xmin, xend = xmax, y = yval, yend = yval),
inherit.aes = FALSE) +
geom_text(data = axis_df, aes(x = x_text, y = yval, label = breaks),
inherit.aes = FALSE, size = axis_font_size)
if (!is.null(truth)) {
gg <- gg + geom_point(data = truth, aes(x = ind, y = values),
inherit.aes = FALSE, colour = "red", size = truthPointSize)
}
gg <- gg + theme_bw() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank())
return(gg)
}
使用 iris
数据集的示例:
library("ggplot2")
library("dplyr")
library("tibble")
truth <- iris %>% select(4:1) %>% apply(2, median, simplify = FALSE) %>% data.frame
ggparcoord_ind_yaxis(iris, truth = truth, columns = 4:1, groupColumn = "Species", alphaLines = 0.5)
我正在尝试生成平行坐标图,其中每个变量都有自己的轴。例如:
到目前为止,我已经使用了包 GGally
中的函数 ggparcoord()
。但是,据我所知,它不允许每个变量都像上面那样拥有自己的轴。
有谁知道如何使用 R
,最好使用 ggplot2
来完成此操作?提前致谢。
我不知道有什么包可以做到这一点,但在 ggplot 中自己绘制轴并不太难。
假设我们有一个与示例图中显示的数据集相似的数据集:
library(ggplot2)
library(dplyr)
cars <- mtcars %>%
select(c(2:4, 6:7, 1)) %>%
tibble::rownames_to_column("model") %>%
as_tibble()
cars
#> # A tibble: 32 x 7
#> model cyl disp hp wt qsec mpg
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 6 160 110 2.62 16.5 21
#> 2 Mazda RX4 Wag 6 160 110 2.88 17.0 21
#> 3 Datsun 710 4 108 93 2.32 18.6 22.8
#> 4 Hornet 4 Drive 6 258 110 3.22 19.4 21.4
#> 5 Hornet Sportabout 8 360 175 3.44 17.0 18.7
#> 6 Valiant 6 225 105 3.46 20.2 18.1
#> 7 Duster 360 8 360 245 3.57 15.8 14.3
#> 8 Merc 240D 4 147. 62 3.19 20 24.4
#> 9 Merc 230 4 141. 95 3.15 22.9 22.8
#> 10 Merc 280 6 168. 123 3.44 18.3 19.2
#> # ... with 22 more rows
我们可以使用一些简单的算法计算轴中断(并设置刻度线的坐标):
axis_df <- stack(cars[-1]) %>%
group_by(ind) %>%
summarize(breaks = pretty(values, n = 10),
yval = (breaks - min(breaks))/(max(values) - min(values))) %>%
mutate(xmin = as.numeric(ind) - 0.05,
xmax = as.numeric(ind),
x_text = as.numeric(ind) - 0.2)
我们实际轴线的坐标是这样的:
axis_line_df <- axis_df %>%
group_by(ind) %>%
summarize(min = min(yval), max = max(yval))
现在我们需要对原始数据进行整形和规范化:
lines_df <- cars[-1] %>%
mutate(across(everything(), function(x) (x - min(x))/(max(x) - min(x)))) %>%
stack() %>%
mutate(row = rep(cars$model, ncol(cars) - 1))
最后,绘图代码将类似于:
ggplot(lines_df, aes(ind, values, group = row)) +
geom_line(color = "orange", alpha = 0.5) +
geom_segment(data = axis_line_df, aes(x = ind, xend = ind, y = min, yend = max),
inherit.aes = FALSE) +
geom_segment(data = axis_df, aes(x = xmin, xend = xmax, y = yval, yend = yval),
inherit.aes = FALSE) +
geom_text(data = axis_df, aes(x = x_text, y = yval, label = breaks),
inherit.aes = FALSE) +
geom_text(data = axis_line_df, aes(x = ind, y = 1.2, label = ind),
size = 6, inherit.aes = FALSE, check_overlap = TRUE, hjust = 1) +
theme_void() +
theme(plot.margin = margin(50, 20, 50, 20))
由 reprex package (v2.0.0)
于 2021-10-24 创建再次感谢@Allan Cameron 的精彩回答。我使用他的代码编写了一个模仿 GGally::ggparcoord()
的函数,但具有单独的 y 轴。这里,轴的归一化中断并完成数据,以便轴的高度相同。
我还添加了一个参数 truth
,这是一个可选的 data.frame
,包含要为每个变量绘制的点;在我的应用程序的上下文中,这些线对应于参数估计,truth
点是我们试图估计的真实值。
函数如下:
ggparcoord_ind_yaxis <- function(
data,
truth = NULL,
truthPointSize = 2,
columns = 1:ncol(data),
groupColumn = NULL,
alphaLines = 1,
nbreaks = 4,
axis_font_size = 3
) {
# select the variables to plot
data_subset <- data %>% select(columns)
# re-order truth to match columns
col_names <- data_subset %>% names
if (!is.null(truth)) {
truth <- truth %>% select(col_names)
data_subset <- data_subset %>% rbind(truth)
}
# Calculate the axis breaks for each variable on the *original* scale.
# Note that the breaks computed by pretty() are guaranteed to contain all of
# the data. We include truth in these breaks, just in case one of the true
# points falls outside the range of the data (can easily happen in the context
# of comparing parameter estimates to the true values).
breaks_df <- data_subset %>%
stack %>% # convert to long format
group_by(ind) %>% # group by the plotting variables
summarize(breaks = pretty(values, n = nbreaks))
# Normalise the breaks to be between 0 and 1, and set the coordinates of the
# tick marks. Importantly, if we want the axis heights to be the same, the
# breaks need to be normalised to be between exactly 0 and 1.
axis_df <- breaks_df %>%
mutate(yval = (breaks - min(breaks))/(max(breaks) - min(breaks))) %>%
mutate(xmin = as.numeric(ind) - 0.05,
xmax = as.numeric(ind),
x_text = as.numeric(ind) - 0.2)
# Calculate the co-ordinates for our axis lines:
axis_line_df <- axis_df %>%
group_by(ind) %>%
summarize(min = min(yval), max = max(yval))
# Getting the minimum/maximum breaks on the original scale, to scale the
# data in the same manner that we scaled the breaks
minmax_breaks <- breaks_df %>%
summarize(min_break = min(breaks), max_break = max(breaks)) %>%
tibble::column_to_rownames(var = "ind")
# Normalise the original data in the same way that the breaks were normalised.
# This ensures that the scaling is correct.
# Do the same for the truth points, if they exist.
lines_df <- data %>% select(columns)
for (col in col_names) {
lines_df[, col] <- (lines_df[, col] - minmax_breaks[col, "min_break"]) / ( minmax_breaks[col, "max_break"] - minmax_breaks[col, "min_break"])
if (!is.null(truth)) {
truth[, col] <- (truth[, col] - minmax_breaks[col, "min_break"]) / ( minmax_breaks[col, "max_break"] - minmax_breaks[col, "min_break"])
}
}
# Reshape original data (and truth):
lines_df <- lines_df %>%
mutate(row = row_number()) %>% # need row information to group individual rows
bind_cols(data[, groupColumn, drop = FALSE]) %>% # need groupColumn for colour aesthetic
reshape2::melt(id.vars = c("row", groupColumn),
# choose names that are consistent with stack() above:
value.name = "values", variable.name = "ind")
# Reshape truth, as above
if (!is.null(truth)) {
truth <- truth %>%
mutate(row = row_number()) %>% # need row information to group individual rows
reshape2::melt(id.vars = c("row"),
# choose names that are consistent with stack():
value.name = "values", variable.name = "ind")
}
# Now plot:
gg <- ggplot() +
geom_line(data = lines_df %>% sample_n(nrow(.)), # permute rows to prevent one group dominating over another
aes_string(x = "ind", y = "values", group = "row", colour = groupColumn),
alpha = alphaLines) +
geom_segment(data = axis_line_df, aes(x = ind, xend = ind, y = min, yend = max),
inherit.aes = FALSE) +
geom_segment(data = axis_df, aes(x = xmin, xend = xmax, y = yval, yend = yval),
inherit.aes = FALSE) +
geom_text(data = axis_df, aes(x = x_text, y = yval, label = breaks),
inherit.aes = FALSE, size = axis_font_size)
if (!is.null(truth)) {
gg <- gg + geom_point(data = truth, aes(x = ind, y = values),
inherit.aes = FALSE, colour = "red", size = truthPointSize)
}
gg <- gg + theme_bw() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank())
return(gg)
}
使用 iris
数据集的示例:
library("ggplot2")
library("dplyr")
library("tibble")
truth <- iris %>% select(4:1) %>% apply(2, median, simplify = FALSE) %>% data.frame
ggparcoord_ind_yaxis(iris, truth = truth, columns = 4:1, groupColumn = "Species", alphaLines = 0.5)