在 R 中从宽到长的复杂重塑(从原始变量名中提取多个东西)
Complex reshaping from wide to long in R (pulling multiple things from original variable name)
我有一个关于将复杂数据从宽格式重塑为长格式的问题。
"prim_key" 是唯一标识。变量名具有如下格式:"sn016_1_2"。我需要把第一个数字提取到名为 "S" 的列中(此例中为 1),把第二个数字提取到名为 "T" 的列中(此例中为 2),然后把对应的值放入以变量名前缀(如 sn016)命名的列中,并按唯一 ID 分组。sn016 也不是唯一的前缀。以下是变量:
[1] "prim_key" "sn016_1_2" "sn016_1_3" "sn016_1_4" "sn016_1_5" "sn016_1_6" "sn016_1_7" "sn016_2_3"
[9] "sn016_2_4" "sn016_2_5" "sn016_2_6" "sn016_2_7" "sn016_3_4" "sn016_3_5" "sn016_3_6" "sn016_3_7"
[17] "sn016_4_5" "sn016_4_6" "sn016_4_7" "sn016_5_6" "sn016_5_7" "sn016_6_7" "sn017_1_2" "sn017_1_3"
[25] "sn017_1_4" "sn017_1_5" "sn017_1_6" "sn017_1_7" "sn017_2_3" "sn017_2_4" "sn017_2_5" "sn017_2_6"
[33] "sn017_2_7" "sn017_3_4" "sn017_3_5" "sn017_3_6" "sn017_3_7" "sn017_4_5" "sn017_4_6" "sn017_4_7"
[41] "sn017_5_6" "sn017_5_7" "sn017_6_7"
"prim_key" 是唯一标识。关于如何做到这一点,有什么想法吗?感觉应该不难,但我就是想不出来。
这是我正在寻找的示例:
这些变量:"prim_key" "sn016_1_2" "sn016_1_3" "sn016_2_6" "sn016_2_7" "sn016_3_4" "sn016_3_5"
prim_key S T sn016
1 1 2 value
1 1 3 value
1 2 6 value
1 2 7 value
1 3 4 value
1 3 5 value
P.S. 目标长格式示例未能正确显示,所以我附上了一张图片。
在此先感谢您的帮助!!
也许您可以尝试使用 tidyr
中的 pivot_longer
。
您可以指定:
- 要加长的列(可以 select 以 "sn" 开头的列,例如
starts_with("sn")
,或除 prim_key
之外的所有列)
- 生成的新列的名称,其中包括初始 letter/number 组合(例如,
sn016
)、S
和 T
- 和一个正则表达式模式拆分成这些列
代码如下:
library(tidyverse)

# Reshape every column except prim_key from wide to long.
# Each column name such as "sn016_1_2" is split by the regex into three
# capture groups: the prefix, the S index and the T index. The ".value"
# sentinel turns the prefix into the name of an output value column, while
# the second and third groups populate the new S and T columns.
df %>%
  pivot_longer(
    cols = -prim_key,
    names_to = c(".value", "S", "T"),
    # Backslashes must be doubled inside an R string literal; a bare "\w" or
    # "\d" is rejected by the parser as an unrecognized escape.
    names_pattern = "(\\w+)_(\\d+)_(\\d+)"
  )
输出
# A tibble: 10 x 5
prim_key S T sn016 sn017
<dbl> <chr> <chr> <int> <int>
1 1 1 2 5 NA
2 1 1 3 2 NA
3 1 2 6 5 3
4 1 2 7 1 2
5 1 3 5 NA 3
6 1 1 2 2 NA
7 1 1 3 3 NA
8 1 2 6 3 4
9 1 2 7 2 3
10 1 3 5 NA 5
数据
虚构的示例数据:
# Small made-up example: two records that share prim_key 1, with four
# sn016_* columns and three sn017_* columns stored as integers.
df <- data.frame(
  prim_key = c(1, 1),
  sn016_1_2 = c(5L, 2L),
  sn016_1_3 = c(2L, 3L),
  sn016_2_6 = c(5L, 3L),
  sn016_2_7 = c(1L, 2L),
  sn017_2_6 = c(3L, 4L),
  sn017_2_7 = c(2L, 3L),
  sn017_3_5 = c(3L, 5L)
)
我们可以使用 data.table
中的 melt
library(data.table)

# Note: setDT() converts df to a data.table by reference, so df itself is
# modified by this call.
dcast(
  # Stack all non-key columns into (variable, value) pairs, then split each
  # original column name on "_" into its prefix (nm1), S and T parts.
  melt(setDT(df), id.vars = "prim_key")[
    , c("nm1", "S", "T") := tstrsplit(variable, "_")
  ],
  # rowid() numbers repeated (nm1, S, T) combinations so that input rows
  # sharing a prim_key stay on separate output rows; the prefixes in nm1
  # become the value columns.
  rowid(nm1, S, T) + prim_key + S + T ~ nm1,
  value.var = "value"
  # Drop the helper column called nm1 in the cast result (presumably the one
  # produced by the rowid() term); the trailing [] prints the result.
)[, nm1 := NULL][]
# prim_key S T sn016 sn017
# 1: 1 1 2 5 NA
# 2: 1 1 3 2 NA
# 3: 1 2 6 5 3
# 4: 1 2 7 1 2
# 5: 1 3 5 NA 3
# 6: 1 1 2 2 NA
# 7: 1 1 3 3 NA
# 8: 1 2 6 3 4
# 9: 1 2 7 2 3
#10: 1 3 5 NA 5
数据
# Small made-up example: two records that share prim_key 1, with four
# sn016_* columns and three sn017_* columns stored as integers.
df <- data.frame(
  prim_key = c(1, 1),
  sn016_1_2 = c(5L, 2L),
  sn016_1_3 = c(2L, 3L),
  sn016_2_6 = c(5L, 3L),
  sn016_2_7 = c(1L, 2L),
  sn017_2_6 = c(3L, 4L),
  sn017_2_7 = c(2L, 3L),
  sn017_3_5 = c(3L, 5L)
)
使用外部包的答案可能是更简洁的方式。然而,有时用 base R 以直接的方式实现想要的结果也很有用。下面是一个例子。这种做法的一个好处是,可以用并行版本 parLapply
或 mclapply
替换对 lapply
的调用,这两个函数都来自随 R 一起发布的 parallel
包。
#### First make some example data
# The column names given in the question
cnames <- c("prim_key", "sn016_1_2", "sn016_1_3", "sn016_1_4", "sn016_1_5",
            "sn016_1_6", "sn016_1_7", "sn016_2_3", "sn016_2_4", "sn016_2_5",
            "sn016_2_6", "sn016_2_7", "sn016_3_4", "sn016_3_5", "sn016_3_6",
            "sn016_3_7", "sn016_4_5", "sn016_4_6", "sn016_4_7", "sn016_5_6",
            "sn016_5_7", "sn016_6_7", "sn017_1_2", "sn017_1_3", "sn017_1_4",
            "sn017_1_5", "sn017_1_6", "sn017_1_7", "sn017_2_3", "sn017_2_4",
            "sn017_2_5", "sn017_2_6", "sn017_2_7", "sn017_3_4", "sn017_3_5",
            "sn017_3_6", "sn017_3_7", "sn017_4_5", "sn017_4_6", "sn017_4_7",
            "sn017_5_6", "sn017_5_7", "sn017_6_7")
# An example matrix with random data: four rows, one column per name
mat <- matrix(runif(length(cnames) * 4), nrow = 4)
# Make the column names correct
colnames(mat) <- cnames

### Now pretend we already had the data
# Get the column names of the input matrix
cnames <- colnames(mat)
# The column names that are not the primary key
n_primkey <- cnames[cnames != "prim_key"]
# Split every name on "_" once, up front, instead of once per input row:
# element 1 is the prefix, element 2 the "S" value, element 3 the "T" value
name_parts <- strsplit(n_primkey, "_", fixed = TRUE)
var_prefix <- vapply(name_parts, `[`, character(1), 1L)
s_vals <- vapply(name_parts, `[`, character(1), 2L)
t_vals <- vapply(name_parts, `[`, character(1), 3L)
# The unique set of prefixes for the non-prim_key variables; each becomes one
# value column of the output
prefix <- unique(var_prefix)

# Go row by row through the original matrix. Each input row yields one output
# row per non-prim_key variable. lapply() can be swapped for a parallel
# equivalent because the rows are processed independently.
dat <- lapply(seq_len(nrow(mat)), function(i) {
  # The row we're dealing with now (a named vector)
  row <- mat[i, ]
  # All output rows for this input row share the same prim_key value; S and T
  # are kept as character, exactly as they appear in the column names
  out <- data.frame(
    prim_key = rep_len(row[["prim_key"]], length(n_primkey)),
    S = s_vals,
    T = t_vals,
    stringsAsFactors = FALSE
  )
  # Fill each prefix column in one vectorized step: a variable's value goes
  # into the column named after its own prefix and stays NA in the others
  for (p in prefix) {
    vals <- rep_len(NA_real_, length(n_primkey))
    hit <- var_prefix == p
    vals[hit] <- row[n_primkey[hit]]
    out[[p]] <- vals
  }
  # Return the data for row i of the input data
  out
})
# dat is a list, so combine each element into a single data.frame
dat <- do.call(rbind, dat)
# Check a few
dat[1:2, ]
mat[1, ]
我有一个关于将复杂数据从宽格式重塑为长格式的问题。
"prim_key" 是唯一标识。变量名具有如下格式:"sn016_1_2"。我需要把第一个数字提取到名为 "S" 的列中(此例中为 1),把第二个数字提取到名为 "T" 的列中(此例中为 2),然后把对应的值放入以变量名前缀(如 sn016)命名的列中,并按唯一 ID 分组。sn016 也不是唯一的前缀。以下是变量:
[1] "prim_key" "sn016_1_2" "sn016_1_3" "sn016_1_4" "sn016_1_5" "sn016_1_6" "sn016_1_7" "sn016_2_3"
[9] "sn016_2_4" "sn016_2_5" "sn016_2_6" "sn016_2_7" "sn016_3_4" "sn016_3_5" "sn016_3_6" "sn016_3_7"
[17] "sn016_4_5" "sn016_4_6" "sn016_4_7" "sn016_5_6" "sn016_5_7" "sn016_6_7" "sn017_1_2" "sn017_1_3"
[25] "sn017_1_4" "sn017_1_5" "sn017_1_6" "sn017_1_7" "sn017_2_3" "sn017_2_4" "sn017_2_5" "sn017_2_6"
[33] "sn017_2_7" "sn017_3_4" "sn017_3_5" "sn017_3_6" "sn017_3_7" "sn017_4_5" "sn017_4_6" "sn017_4_7"
[41] "sn017_5_6" "sn017_5_7" "sn017_6_7"
"prim_key" 是唯一标识。关于如何做到这一点,有什么想法吗?感觉应该不难,但我就是想不出来。
这是我正在寻找的示例: 这些变量:"prim_key" "sn016_1_2" "sn016_1_3" "sn016_2_6" "sn016_2_7" "sn016_3_4" "sn016_3_5"
prim_key S T sn016
1 1 2 value
1 1 3 value
1 2 6 value
1 2 7 value
1 3 4 value
1 3 5 value
P.S. 目标长格式示例未能正确显示,所以我附上了一张图片。
在此先感谢您的帮助!!
也许您可以尝试使用 tidyr
中的 pivot_longer
。
您可以指定:
- 要加长的列(可以 select 以 "sn" 开头的列,例如
starts_with("sn")
,或除 prim_key
之外的所有列)
- 生成的新列的名称,其中包括初始 letter/number 组合(例如,
sn016
)、S
和T
- 和一个正则表达式模式拆分成这些列
代码如下:
library(tidyverse)

# Reshape every column except prim_key from wide to long.
# Each column name such as "sn016_1_2" is split by the regex into three
# capture groups: the prefix, the S index and the T index. The ".value"
# sentinel turns the prefix into the name of an output value column, while
# the second and third groups populate the new S and T columns.
df %>%
  pivot_longer(
    cols = -prim_key,
    names_to = c(".value", "S", "T"),
    # Backslashes must be doubled inside an R string literal; a bare "\w" or
    # "\d" is rejected by the parser as an unrecognized escape.
    names_pattern = "(\\w+)_(\\d+)_(\\d+)"
  )
输出
# A tibble: 10 x 5
prim_key S T sn016 sn017
<dbl> <chr> <chr> <int> <int>
1 1 1 2 5 NA
2 1 1 3 2 NA
3 1 2 6 5 3
4 1 2 7 1 2
5 1 3 5 NA 3
6 1 1 2 2 NA
7 1 1 3 3 NA
8 1 2 6 3 4
9 1 2 7 2 3
10 1 3 5 NA 5
数据
虚构的示例数据:
# Small made-up example: two records that share prim_key 1, with four
# sn016_* columns and three sn017_* columns stored as integers.
df <- data.frame(
  prim_key = c(1, 1),
  sn016_1_2 = c(5L, 2L),
  sn016_1_3 = c(2L, 3L),
  sn016_2_6 = c(5L, 3L),
  sn016_2_7 = c(1L, 2L),
  sn017_2_6 = c(3L, 4L),
  sn017_2_7 = c(2L, 3L),
  sn017_3_5 = c(3L, 5L)
)
我们可以使用 data.table
中的 melt
library(data.table)

# Note: setDT() converts df to a data.table by reference, so df itself is
# modified by this call.
dcast(
  # Stack all non-key columns into (variable, value) pairs, then split each
  # original column name on "_" into its prefix (nm1), S and T parts.
  melt(setDT(df), id.vars = "prim_key")[
    , c("nm1", "S", "T") := tstrsplit(variable, "_")
  ],
  # rowid() numbers repeated (nm1, S, T) combinations so that input rows
  # sharing a prim_key stay on separate output rows; the prefixes in nm1
  # become the value columns.
  rowid(nm1, S, T) + prim_key + S + T ~ nm1,
  value.var = "value"
  # Drop the helper column called nm1 in the cast result (presumably the one
  # produced by the rowid() term); the trailing [] prints the result.
)[, nm1 := NULL][]
# prim_key S T sn016 sn017
# 1: 1 1 2 5 NA
# 2: 1 1 3 2 NA
# 3: 1 2 6 5 3
# 4: 1 2 7 1 2
# 5: 1 3 5 NA 3
# 6: 1 1 2 2 NA
# 7: 1 1 3 3 NA
# 8: 1 2 6 3 4
# 9: 1 2 7 2 3
#10: 1 3 5 NA 5
数据
# Small made-up example: two records that share prim_key 1, with four
# sn016_* columns and three sn017_* columns stored as integers.
df <- data.frame(
  prim_key = c(1, 1),
  sn016_1_2 = c(5L, 2L),
  sn016_1_3 = c(2L, 3L),
  sn016_2_6 = c(5L, 3L),
  sn016_2_7 = c(1L, 2L),
  sn017_2_6 = c(3L, 4L),
  sn017_2_7 = c(2L, 3L),
  sn017_3_5 = c(3L, 5L)
)
使用外部包的答案可能是更简洁的方式。然而,有时用 base R 以直接的方式实现想要的结果也很有用。下面是一个例子。这种做法的一个好处是,可以用并行版本 parLapply
或 mclapply
替换对 lapply
的调用,这两个函数都来自随 R 一起发布的 parallel
包。
#### First make some example data
# The column names given in the question
cnames <- c("prim_key", "sn016_1_2", "sn016_1_3", "sn016_1_4", "sn016_1_5",
            "sn016_1_6", "sn016_1_7", "sn016_2_3", "sn016_2_4", "sn016_2_5",
            "sn016_2_6", "sn016_2_7", "sn016_3_4", "sn016_3_5", "sn016_3_6",
            "sn016_3_7", "sn016_4_5", "sn016_4_6", "sn016_4_7", "sn016_5_6",
            "sn016_5_7", "sn016_6_7", "sn017_1_2", "sn017_1_3", "sn017_1_4",
            "sn017_1_5", "sn017_1_6", "sn017_1_7", "sn017_2_3", "sn017_2_4",
            "sn017_2_5", "sn017_2_6", "sn017_2_7", "sn017_3_4", "sn017_3_5",
            "sn017_3_6", "sn017_3_7", "sn017_4_5", "sn017_4_6", "sn017_4_7",
            "sn017_5_6", "sn017_5_7", "sn017_6_7")
# An example matrix with random data: four rows, one column per name
mat <- matrix(runif(length(cnames) * 4), nrow = 4)
# Make the column names correct
colnames(mat) <- cnames

### Now pretend we already had the data
# Get the column names of the input matrix
cnames <- colnames(mat)
# The column names that are not the primary key
n_primkey <- cnames[cnames != "prim_key"]
# Split every name on "_" once, up front, instead of once per input row:
# element 1 is the prefix, element 2 the "S" value, element 3 the "T" value
name_parts <- strsplit(n_primkey, "_", fixed = TRUE)
var_prefix <- vapply(name_parts, `[`, character(1), 1L)
s_vals <- vapply(name_parts, `[`, character(1), 2L)
t_vals <- vapply(name_parts, `[`, character(1), 3L)
# The unique set of prefixes for the non-prim_key variables; each becomes one
# value column of the output
prefix <- unique(var_prefix)

# Go row by row through the original matrix. Each input row yields one output
# row per non-prim_key variable. lapply() can be swapped for a parallel
# equivalent because the rows are processed independently.
dat <- lapply(seq_len(nrow(mat)), function(i) {
  # The row we're dealing with now (a named vector)
  row <- mat[i, ]
  # All output rows for this input row share the same prim_key value; S and T
  # are kept as character, exactly as they appear in the column names
  out <- data.frame(
    prim_key = rep_len(row[["prim_key"]], length(n_primkey)),
    S = s_vals,
    T = t_vals,
    stringsAsFactors = FALSE
  )
  # Fill each prefix column in one vectorized step: a variable's value goes
  # into the column named after its own prefix and stays NA in the others
  for (p in prefix) {
    vals <- rep_len(NA_real_, length(n_primkey))
    hit <- var_prefix == p
    vals[hit] <- row[n_primkey[hit]]
    out[[p]] <- vals
  }
  # Return the data for row i of the input data
  out
})
# dat is a list, so combine each element into a single data.frame
dat <- do.call(rbind, dat)
# Check a few
dat[1:2, ]
mat[1, ]