在 R 中从宽到长的复杂重塑(从原始变量名中提取多个东西)

Complex reshaping from wide to long in R (pulling multiple things from original variable name)

我有一个关于将复杂数据从宽格式重塑为长格式的问题。

"Prim_key" 是唯一标识。变量具有以下格式:"sn016_1_2"。我需要将第一个数字拉入一列并将其命名为 "S"(例如,此处为 1),第二个数字为名为 "T" 的列(例如,此处为 2 ),然后将值拉入按唯一 ID 分组的其他变量名称。前缀 sn016 也不是唯一的前缀。以下是变量:

[1] "prim_key"  "sn016_1_2" "sn016_1_3" "sn016_1_4" "sn016_1_5" "sn016_1_6" "sn016_1_7" "sn016_2_3"
 [9] "sn016_2_4" "sn016_2_5" "sn016_2_6" "sn016_2_7" "sn016_3_4" "sn016_3_5" "sn016_3_6" "sn016_3_7"
[17] "sn016_4_5" "sn016_4_6" "sn016_4_7" "sn016_5_6" "sn016_5_7" "sn016_6_7" "sn017_1_2" "sn017_1_3"
[25] "sn017_1_4" "sn017_1_5" "sn017_1_6" "sn017_1_7" "sn017_2_3" "sn017_2_4" "sn017_2_5" "sn017_2_6"
[33] "sn017_2_7" "sn017_3_4" "sn017_3_5" "sn017_3_6" "sn017_3_7" "sn017_4_5" "sn017_4_6" "sn017_4_7"
[41] "sn017_5_6" "sn017_5_7" "sn017_6_7"

"Prim_key" 是唯一标识。关于如何做到这一点的任何想法?感觉应该不难,但是却在躲避我

这是我正在寻找的示例: 这些变量:"prim_key" "sn016_1_2" "sn016_1_3" "sn016_2_6" "sn016_2_7" "sn016_3_4" "sn016_3_5"

prim_key   S     T      sn016
   1       1     2      value
   1       1     3      value
   1       2     6      value
   1       2     7      value
   1       3     4      value
   1       3     5      value

P.S. 目标长格式的示例未能正确显示,所以我附上了一张图片。

在此先感谢您的帮助!!

也许您可以尝试使用 tidyr 中的 pivot_longer

您可以指定:

  • 要加长的列(可以 select 以 "sn" 开头的列,例如 starts_with("sn"),或除 prim_key 之外的所有列)
  • 生成的新列的名称:初始的字母/数字组合(例如 sn016,用 ".value" 表示)、S 和 T
  • 以及一个正则表达式模式,用于把原列名拆分到这些新列中

代码如下:

library(tidyverse)

# Reshape every column except `prim_key` from wide to long.
# Column names look like "<prefix>_<S>_<T>" (e.g. "sn016_1_2"):
#   - ".value" keeps the prefix as the name of a value column (sn016, sn017)
#   - the two trailing numbers become the new columns "S" and "T".
# Backslashes must be doubled inside an R string literal; a bare "\w" or "\d"
# is rejected by the parser as an unrecognized escape.
df %>%
  pivot_longer(cols = -prim_key,
               names_to = c(".value", "S", "T"),
               names_pattern = "(\\w+)_(\\d+)_(\\d+)")

输出

# A tibble: 10 x 5
   prim_key S     T     sn016 sn017
      <dbl> <chr> <chr> <int> <int>
 1        1 1     2         5    NA
 2        1 1     3         2    NA
 3        1 2     6         5     3
 4        1 2     7         1     2
 5        1 3     5        NA     3
 6        1 1     2         2    NA
 7        1 1     3         3    NA
 8        1 2     6         3     4
 9        1 2     7         2     3
10        1 3     5        NA     5

数据

虚构的示例数据:

# Made-up example data: two records that share prim_key 1. Measurement columns
# are named "<prefix>_<S>_<T>"; the (S, T) pairs present differ between the
# sn016 and sn017 prefixes.
df <- data.frame(
  prim_key  = c(1, 1),
  sn016_1_2 = c(5L, 2L),
  sn016_1_3 = 2:3,
  sn016_2_6 = c(5L, 3L),
  sn016_2_7 = 1:2,
  sn017_2_6 = 3:4,
  sn017_2_7 = 2:3,
  sn017_3_5 = c(3L, 5L)
)

我们可以使用 data.table 中的 melt 和 dcast:

library(data.table)
# Reshape `df` from wide to long with data.table.
# NOTE: setDT() converts `df` to a data.table by reference, so `df` itself is
# changed in place.
#
# Step 1: melt every non-key column into (variable, value) pairs.
long <- melt(setDT(df), id.vars = "prim_key")
# Step 2: split names such as "sn016_1_2" on "_" into prefix (nm1), S and T.
long[, c("nm1", "S", "T") := tstrsplit(variable, "_")]
# Step 3: cast back to wide with one value column per prefix. rowid() numbers
# repeated (nm1, S, T) combinations so that records sharing a prim_key stay on
# separate rows; the column named "nm1" left in the dcast() result is dropped
# afterwards, and the trailing [] makes the result print.
dcast(
  long,
  rowid(nm1, S, T) + prim_key + S + T ~ nm1,
  value.var = "value"
)[, nm1 := NULL][]
#     prim_key S T sn016 sn017
# 1:        1 1 2     5    NA
# 2:        1 1 3     2    NA
# 3:        1 2 6     5     3
# 4:        1 2 7     1     2
# 5:        1 3 5    NA     3
# 6:        1 1 2     2    NA
# 7:        1 1 3     3    NA
# 8:        1 2 6     3     4
# 9:        1 2 7     2     3
#10:        1 3 5    NA     5

数据

# Made-up example data: two records that share prim_key 1. Measurement columns
# are named "<prefix>_<S>_<T>"; the (S, T) pairs present differ between the
# sn016 and sn017 prefixes.
df <- data.frame(
  prim_key  = c(1, 1),
  sn016_1_2 = c(5L, 2L),
  sn016_1_3 = 2:3,
  sn016_2_6 = c(5L, 3L),
  sn016_2_7 = 1:2,
  sn017_2_6 = 3:4,
  sn017_2_7 = 2:3,
  sn017_3_5 = c(3L, 5L)
)

使用外部包的答案可能是更简洁的方式。然而,有时用 base R 直接"暴力"写出你想要的解决方案也很有用。下面是一个例子。这种写法的一个好处是:可以把对 lapply 的调用替换为并行版本 parLapply 或 mclapply,这两个函数都来自随 R 一起发布的 parallel 包。

#### First make some example data ----

# The column names you gave
cnames <- c("prim_key", "sn016_1_2", "sn016_1_3", "sn016_1_4", "sn016_1_5",
            "sn016_1_6", "sn016_1_7", "sn016_2_3", "sn016_2_4", "sn016_2_5",
            "sn016_2_6", "sn016_2_7", "sn016_3_4", "sn016_3_5", "sn016_3_6",
            "sn016_3_7", "sn016_4_5", "sn016_4_6", "sn016_4_7", "sn016_5_6",
            "sn016_5_7", "sn016_6_7", "sn017_1_2", "sn017_1_3", "sn017_1_4",
            "sn017_1_5", "sn017_1_6", "sn017_1_7", "sn017_2_3", "sn017_2_4",
            "sn017_2_5", "sn017_2_6", "sn017_2_7", "sn017_3_4", "sn017_3_5",
            "sn017_3_6", "sn017_3_7", "sn017_4_5", "sn017_4_6", "sn017_4_7",
            "sn017_5_6", "sn017_5_7", "sn017_6_7")

# An example matrix with random data: 4 rows, one column per name
mat <- matrix(runif(length(cnames) * 4), nrow = 4)
# Make the column names correct
colnames(mat) <- cnames

#### Now pretend we already had the data ----

# Get the column names of the input matrix
cnames <- colnames(mat)
# The column names that are not your primary key
n_primkey <- cnames[cnames != "prim_key"]

# Split every name of the form "<prefix>_<S>_<T>" once, up front; the split
# does not depend on the row, so it does not belong inside the per-row loop.
name_parts <- strsplit(n_primkey, "_", fixed = TRUE)
if (any(lengths(name_parts) < 3L)) {
  stop("every non-key column must be named <prefix>_<S>_<T>", call. = FALSE)
}
pref_of <- vapply(name_parts, `[`, character(1), 1L)
s_of    <- vapply(name_parts, `[`, character(1), 2L)
t_of    <- vapply(name_parts, `[`, character(1), 3L)

# Get the unique set of prefixes for the non-primkey variables; each prefix
# becomes one value column of the output
prefix <- unique(pref_of)

# The distinct (S, T) pairs, in order of first appearance. Each pair becomes
# exactly ONE output row per input row, so values of different prefixes that
# share a pair end up side by side instead of on separate, half-empty rows.
st_id    <- paste(s_of, t_of, sep = "_")
st_first <- which(!duplicated(st_id))
st_key   <- st_id[st_first]

# Go row by row through the original matrix. lapply() can be swapped for
# parallel::parLapply() / parallel::mclapply() if the input is large.
dat <- lapply(seq_len(nrow(mat)), function(i) {
  # Skeleton for this input row: same prim_key on every line, one line per
  # (S, T) pair. S and T are kept as character, as split from the names.
  out <- data.frame(
    prim_key = rep(mat[i, "prim_key"], length(st_key)),
    S = s_of[st_first],
    T = t_of[st_first],
    stringsAsFactors = FALSE
  )

  # Fill one value column per prefix; pairs a prefix does not have stay NA
  for (p in prefix) {
    from_p <- pref_of == p
    vals <- rep(NA, length(st_key))
    vals[match(st_id[from_p], st_key)] <- mat[i, n_primkey[from_p]]
    out[[p]] <- vals
  }

  # Return the reshaped data for row i of the input data
  out
})
# dat is a list, so combine each element into a single data.frame
dat <- do.call(rbind, dat)

# Check a few: the first output rows come from the first input row
dat[1:2, ]
mat[1, ]