提高大数据量下分面网格图（facet grid）的绘图性能

Question

我有几个时间序列，需要为每一对组合绘制散点图。我之前已经发布过相关代码，在大家的帮助下，我弄清楚了如何使用 ggplot2 包中的 facet_grid() 把整张图很好地绘制出来。

现在的问题是性能。下面的例子规模很小。您可以设置 n <- 50000，以接近我需要处理的数据量的下限。我认为最耗时的部分是生成包含所有组合的 FACET 数据框，尤其是其中大量的重复数据。此外，由于要处理大量的数据行，绘图调用本身也需要很长时间。nrow(FACET) 等于 length(df) * length(df) * n，在我的实际案例中 n = 50000、length(df) = 10，即 500 万行。

library(tidyverse)
set.seed(214)

# Number of observations per series; the question suggests 50000 to
# approximate the real workload.
n <- 1000

# Six synthetic series: each column is the previous one plus uniform noise
# scaled by a growing factor (0.1 up to 0.5).
df <- tibble(
  v1 = runif(n),
  v2 = runif(n) * 0.1 + v1,
  v3 = runif(n) * 0.2 + v2,
  v4 = runif(n) * 0.3 + v3,
  v5 = runif(n) * 0.4 + v4,
  v6 = runif(n) * 0.5 + v5
)

# All ordered pairs of column positions in df (every possible combination).
# seq_along() is used instead of 1:length() so an empty df yields no pairs.
C <- crossing(w1 = seq_along(df), w2 = seq_along(df))

# One tibble per pair: the two series side by side, plus the names of the
# columns they came from (used further down as the facet variables).
FACET_LIST <- lapply(seq_len(nrow(C)), function(i) {
  col1 <- C$w1[i]
  col2 <- C$w2[i]
  tibble(
    a1 = df[[col1]],
    a2 = df[[col2]],
    name1 = names(df)[col1],
    name2 = names(df)[col2]
  )
})

# Stack the list of tibbles with bind_rows() rather than
# do.call(rbind.data.frame, ...), which is the slow way to combine many
# data frames.
FACET <- bind_rows(FACET_LIST)

# Factors keep the facets in order of first appearance.
FACET$name1 <- as_factor(FACET$name1)
FACET$name2 <- as_factor(FACET$name2)

# Every (name1, name2) pairing of df's column names: name1 repeats each name
# in a run, name2 cycles through the names.
dat_text <- tibble(
  name1 = rep(names(df), each = ncol(df)),
  name2 = rep(names(df), times = ncol(df))
)

# Scatter plot plus linear fit for each pair of series, one panel per pair.
p <- ggplot() +
  geom_point(data = FACET, mapping = aes(x = a1, y = a2), size = 0.5) +
  stat_smooth(data = FACET, mapping = aes(x = a1, y = a2), method = "lm") +
  facet_grid(rows = vars(name1), cols = vars(name2)) +
  coord_fixed()
p

是否有更高效的方法将所需的信息传递给 facet_grid() 绘图？或者有没有其他方法可以加速我的代码？

Answer 1

于是我在 n = 50000 的情况下运行了一些测试：

# Baseline: all points in a single panel, no facets and no smoother.
base <- system.time({
  p <- ggplot() +
    geom_point(data = FACET, mapping = aes(x = a1, y = a2), size = 0.5)
  print(p)
})

# Points only, split into the name1 x name2 facet grid.
facet <- system.time({
  p <- ggplot() +
    geom_point(data = FACET, mapping = aes(x = a1, y = a2), size = 0.5) +
    facet_grid(rows = vars(name1), cols = vars(name2)) +
    coord_fixed()
  print(p)
})

# Points plus linear fits in a single panel. The smoother is grouped by the
# (name1, name2) pair so that it estimates the same number of lines as the
# facetted variant does.
smooth <- system.time({
  p <- ggplot() +
    geom_point(data = FACET, mapping = aes(x = a1, y = a2), size = 0.5) +
    stat_smooth(
      data = FACET,
      mapping = aes(x = a1, y = a2, group = interaction(name1, name2)),
      method = "lm"
    )
  print(p)
})

# The full plot from the question: points, linear fits and the facet grid.
# The resulting `p` is reused by the staged timings further down.
smooth_facet <- system.time({
  p <- ggplot() +
    geom_point(data = FACET, mapping = aes(x = a1, y = a2), size = 0.5) +
    stat_smooth(data = FACET, mapping = aes(x = a1, y = a2), method = "lm") +
    facet_grid(rows = vars(name1), cols = vars(name2)) +
    coord_fixed()
  print(p)
})

# Stage 1 of printing: build the plot data from the plot specification `p`.
building <- system.time(
  pp <- ggplot_build(p)
)

# Stage 2 of printing: turn the already-built plot `pp` into a grob table.
# ggplot_gtable(pp) is used instead of ggplotGrob(pp$plot) because the latter
# runs ggplot_build() again internally, which would count the build step a
# second time in this measurement.
# NOTE(review): any "interpreting" timing recorded with ggplotGrob(pp$plot)
# includes that rebuild and should be re-measured.
interpreting <- system.time({
  ppp <- ggplot_gtable(pp)
})

library(grid)
# Stage 3 of printing: render the grob table `ppp` on a fresh page.
drawing <- system.time({
  grid.newpage()
  grid.draw(ppp)
})

# Same full plot as smooth_facet, but with shape = "." for the points
# instead of the default point shape.
alternative <- system.time({
  g <- ggplot() +
    geom_point(
      data = FACET,
      mapping = aes(x = a1, y = a2),
      size = 0.5,
      shape = "."
    ) +
    stat_smooth(data = FACET, mapping = aes(x = a1, y = a2), method = "lm") +
    facet_grid(rows = vars(name1), cols = vars(name2)) +
    coord_fixed()
  print(g)
})

这些是结果：

# One row per timed scenario; the columns are the system.time() components.
rbind(
  base = base,
  facet = facet,
  smooth = smooth,
  smooth_facet = smooth_facet,
  building = building,
  interpreting = interpreting,
  drawing = drawing,
  alternative = alternative
)

             user.self sys.self elapsed user.child sys.child
base              8.34    30.96   39.44         NA        NA
facet             8.56    30.48   39.12         NA        NA
smooth           10.00    31.14   41.18         NA        NA
smooth_facet     10.14    31.50   41.73         NA        NA
building          2.59     0.42    3.03         NA        NA
interpreting      5.08     0.61    5.76         NA        NA
drawing           5.13    30.23   35.39         NA        NA
alternative       7.58     8.23   15.86         NA        NA

在我看来，慢的并不是 ggplot 本身的代码，而是底层的绘图（渲染）代码，或者说是必须绘制大量点这一事实。

不过，如果不使用圆形点，而是在 geom_point() 语句中使用 shape = "."（如 'alternative' 测试），耗时会明显减少。反正这些点多半本来就会相互重叠（overplotting）。效果如下：

提高大数据量下分面网格图（facet grid）的绘图性能

Improve Performance for Facet Grid Plot on Big Data

performance

r

ggplot2

facet-grid