无法将不平衡的 df 从宽格式转换为长格式
trouble getting unbalanced df from wide to long
我有一个不平衡的宽数据框,看起来像这样:
# Reproducible example: a wide data frame with two id columns and measurement
# columns named <stub>.<suffix>. Some stub/suffix combinations are left out
# on purpose (see the commented-out arguments), which makes it unbalanced.
set.seed(1)
df <- data.frame(
  id1  = seq_len(10),
  id2  = runif(10),
  v1.a = runif(10),
  v1.b = runif(10),
  v1.c = runif(10),
  v2.a = runif(10),
  v2.b = runif(10),
  v2.c = runif(10),
  v3.a = runif(10),
  # v3.b = runif(10),   # omitted: v3 has no "b" column
  v3.c = runif(10),
  v4.a = runif(10),
  v4.b = runif(10),
  v4.c = runif(10),
  # v5.a = runif(10),   # omitted: v5 has no "a" column
  # v5.b = runif(10),   # omitted: v5 has no "b" column
  v5.c = runif(10),
  v6.a = runif(10),
  v6.b = runif(10),
  v6.c = runif(10),
  # v7 is present for a, b and c but holds only NA
  v7.a = rep(NA, 10),
  v7.b = rep(NA, 10),
  v7.c = rep(NA, 10),
  # v8 is the only stub with a "d" suffix
  v8.d = runif(10)
)
我正在尝试将它变成长格式。 reshape
失败,因为并非每个变量在每个时间点都有对应的列,所以我转向 splitstackshape
中的 Reshape
。
library(splitstackshape)
# Positions of the time-varying columns: names ending in ".a", ".b", ".c"
# or ".d". The backslash is doubled because "\." is not a valid escape in an
# R string literal; "\\." reaches the regex engine as a literal dot.
vary <- grep("\\.a$|\\.b$|\\.c$|\\.d$", names(df))

# Stub of each varying column: everything before the first dot, de-duplicated.
stubs <- unique(sub("\\..*$", "", names(df)[vary]))

# Reshape to long form with splitstackshape::Reshape.
# NOTE(review): the question reports that this call misaligns stubs whose
# suffixes are incomplete (v3 has no ".b" column) -- see the output below.
df2 <- Reshape(df,
               id.vars = c("id1", "id2"),
               var.stubs = stubs,
               sep = ".")
不过,最后的结果似乎不太对。例如,v3
缺少 "b" 的输入,我假设它对应时间 2。在 df2
中,长格式的 v3
在时间 1 和时间 2 有值,而在时间 3 没有值。
id1 id2 time v1 v2 v3
1 1 0.26550866 1 0.20597457 0.82094629 0.3390729
2 2 0.37212390 1 0.17655675 0.64706019 0.8394404
3 3 0.57285336 1 0.68702285 0.78293276 0.3466835
4 4 0.90820779 1 0.38410372 0.55303631 0.3337749
5 5 0.20168193 1 0.76984142 0.52971958 0.4763512
6 6 0.89838968 1 0.49769924 0.78935623 0.8921983
7 7 0.94467527 1 0.71761851 0.02333120 0.8643395
8 8 0.66079779 1 0.99190609 0.47723007 0.3899895
9 9 0.62911404 1 0.38003518 0.73231374 0.7773207
10 10 0.06178627 1 0.77744522 0.69273156 0.9606180
11 1 0.26550866 2 0.93470523 0.47761962 0.4346595
12 2 0.37212390 2 0.21214252 0.86120948 0.7125147
13 3 0.57285336 2 0.65167377 0.43809711 0.3999944
14 4 0.90820779 2 0.12555510 0.24479728 0.3253522
15 5 0.20168193 2 0.26722067 0.07067905 0.7570871
16 6 0.89838968 2 0.38611409 0.09946616 0.2026923
17 7 0.94467527 2 0.01339033 0.31627171 0.7111212
18 8 0.66079779 2 0.38238796 0.51863426 0.1216919
19 9 0.62911404 2 0.86969085 0.66200508 0.2454885
20 10 0.06178627 2 0.34034900 0.40683019 0.1433044
21 1 0.26550866 3 0.48208012 0.91287592 NA
22 2 0.37212390 3 0.59956583 0.29360337 NA
23 3 0.57285336 3 0.49354131 0.45906573 NA
24 4 0.90820779 3 0.18621760 0.33239467 NA
25 5 0.20168193 3 0.82737332 0.65087047 NA
26 6 0.89838968 3 0.66846674 0.25801678 NA
27 7 0.94467527 3 0.79423986 0.47854525 NA
28 8 0.66079779 3 0.10794363 0.76631067 NA
29 9 0.62911404 3 0.72371095 0.08424691 NA
30 10 0.06178627 3 0.41127443 0.87532133 NA
我是不是搞错了?
使用 melt
或 gather
是否有更好的选择?我尝试了几种方法,但运气不佳。我的实际用例包括 1302 个我称之为 vary
的列、3 个时间段(a、b、c)和 821 个唯一 stubs
(显然不平衡)。
或许,我们可以使用 data.table
中的 melt
,它可以在 measure
中使用多个 patterns
。使用 data.table
更容易,因为它支持多个模式:
library(data.table)
# Turn df into a data.table; setDT is called for its effect on df, so the
# result is not assigned.
setDT(df)

# Split every measurement column name (all but the first two columns) on "."
# into two fields, read as columns V1 and V2.
d1 <- read.table(text = names(df)[-(1:2)], sep = ".")

# Add, filled with NA, every V1.V2 combination that is not yet a column.
df[, (setdiff(
  outer(d1$V1, d1$V2, FUN = paste, sep = "."),
  names(df)[-(1:2)]
)) := NA]

# With the columns in name order, melt using one pattern per stub v1..v8,
# naming the resulting value columns v1..v8.
melt(
  df[, order(names(df)), with = FALSE],
  measure = patterns(paste0("v", 1:8)),
  value.name = paste0("v", 1:8)
)
或者可以是melt/dcast
# Melt every non-id column to long form. `id.vars` is spelled out in full
# instead of relying on partial matching of `id.var`.
# NOTE(review): assumes df is already a data.table (setDT(df) in the previous
# snippet) -- confirm when running this snippet on its own.
long <- melt(df, id.vars = c("id1", "id2"))

# Split each former column name on the literal dot into var1 and var2
# (for "v1.a": var1 = "v1", var2 = "a").
long[, c("var1", "var2") := tstrsplit(variable, "[.]")]

# Cast back to wide: one row per id1/id2/var2, one column per var1 value.
res <- dcast(long, id1 + id2 + var2 ~ var1, value.var = "value")

# Show the result ordered by var2, then id1.
res[order(var2, id1)]
# id1 id2 var2 v1 v2 v3 v4 v5 v6 v7 v8
# 1: 1 0.26550866 a 0.20597457 0.82094629 0.3390729 0.23962942 NA 0.57487220 NA NA
# 2: 2 0.37212390 a 0.17655675 0.64706019 0.8394404 0.05893438 NA 0.07706438 NA NA
# 3: 3 0.57285336 a 0.68702285 0.78293276 0.3466835 0.64228826 NA 0.03554058 NA NA
# 4: 4 0.90820779 a 0.38410372 0.55303631 0.3337749 0.87626921 NA 0.64279549 NA NA
# 5: 5 0.20168193 a 0.76984142 0.52971958 0.4763512 0.77891468 NA 0.92861520 NA NA
# 6: 6 0.89838968 a 0.49769924 0.78935623 0.8921983 0.79730883 NA 0.59809242 NA NA
# 7: 7 0.94467527 a 0.71761851 0.02333120 0.8643395 0.45527445 NA 0.56090075 NA NA
# 8: 8 0.66079779 a 0.99190609 0.47723007 0.3899895 0.41008408 NA 0.52602772 NA NA
# 9: 9 0.62911404 a 0.38003518 0.73231374 0.7773207 0.81087024 NA 0.98509522 NA NA
#10: 10 0.06178627 a 0.77744522 0.69273156 0.9606180 0.60493329 NA 0.50764182 NA NA
#11: 1 0.26550866 b 0.93470523 0.47761962 NA 0.65472393 NA 0.68278808 NA NA
#12: 2 0.37212390 b 0.21214252 0.86120948 NA 0.35319727 NA 0.60154122 NA NA
#13: 3 0.57285336 b 0.65167377 0.43809711 NA 0.27026015 NA 0.23886868 NA NA
#14: 4 0.90820779 b 0.12555510 0.24479728 NA 0.99268406 NA 0.25816593 NA NA
#15: 5 0.20168193 b 0.26722067 0.07067905 NA 0.63349326 NA 0.72930962 NA NA
#16: 6 0.89838968 b 0.38611409 0.09946616 NA 0.21320814 NA 0.45257083 NA NA
#17: 7 0.94467527 b 0.01339033 0.31627171 NA 0.12937235 NA 0.17512677 NA NA
#18: 8 0.66079779 b 0.38238796 0.51863426 NA 0.47811803 NA 0.74669827 NA NA
#19: 9 0.62911404 b 0.86969085 0.66200508 NA 0.92407447 NA 0.10498764 NA NA
#20: 10 0.06178627 b 0.34034900 0.40683019 NA 0.59876097 NA 0.86454495 NA NA
#21: 1 0.26550866 c 0.48208012 0.91287592 0.4346595 0.97617069 0.9918386 0.61464497 NA NA
#22: 2 0.37212390 c 0.59956583 0.29360337 0.7125147 0.73179251 0.4955936 0.55715954 NA NA
#23: 3 0.57285336 c 0.49354131 0.45906573 0.3999944 0.35672691 0.4843495 0.32877732 NA NA
#24: 4 0.90820779 c 0.18621760 0.33239467 0.3253522 0.43147369 0.1734423 0.45313145 NA NA
#25: 5 0.20168193 c 0.82737332 0.65087047 0.7570871 0.14821156 0.7548209 0.50044097 NA NA
#26: 6 0.89838968 c 0.66846674 0.25801678 0.2026923 0.01307758 0.4538955 0.18086636 NA NA
#27: 7 0.94467527 c 0.79423986 0.47854525 0.7111212 0.71556607 0.5111698 0.52963060 NA NA
#28: 8 0.66079779 c 0.10794363 0.76631067 0.1216919 0.10318424 0.2075451 0.07527575 NA NA
#29: 9 0.62911404 c 0.72371095 0.08424691 0.2454885 0.44628435 0.2286581 0.27775593 NA NA
#30: 10 0.06178627 c 0.41127443 0.87532133 0.1433044 0.64010105 0.5957120 0.21269952 NA NA
#31: 1 0.26550866 d NA NA NA NA NA NA NA 0.28479048
#32: 2 0.37212390 d NA NA NA NA NA NA NA 0.89509410
#33: 3 0.57285336 d NA NA NA NA NA NA NA 0.44623532
#34: 4 0.90820779 d NA NA NA NA NA NA NA 0.77998489
#35: 5 0.20168193 d NA NA NA NA NA NA NA 0.88061903
#36: 6 0.89838968 d NA NA NA NA NA NA NA 0.41312421
#37: 7 0.94467527 d NA NA NA NA NA NA NA 0.06380848
#38: 8 0.66079779 d NA NA NA NA NA NA NA 0.33548749
#39: 9 0.62911404 d NA NA NA NA NA NA NA 0.72372595
#40: 10 0.06178627 d NA NA NA NA NA NA NA 0.33761533
试试这个,改编自其他链接的答案:
# Split each measurement column name (all but the first two columns) on the
# literal dot. The backslash is doubled because "\." is not a valid escape in
# an R string literal.
# NOTE(review): the `df[...] <- NA` step below assumes df is the plain
# data.frame from the question, not a data.table -- confirm before running.
spl <- strsplit(names(df)[-(1:2)], "\\.")

# Unique first parts (stubs) and second parts (suffixes) of the split names;
# vapply pins the result to a character vector regardless of input.
stub_names <- unique(vapply(spl, `[`, character(1), 1))
time_names <- unique(vapply(spl, `[`, character(1), 2))

# Every stub.suffix combination, flattened to a character vector.
allvars <- c(outer(stub_names, time_names, paste, sep = "."))

# Add the combinations that are not yet columns, filled with NA, so that
# every stub has a column for every suffix.
df[setdiff(allvars, names(df))] <- NA

reshape(df, direction = "long", sep = ".", varying = allvars)
# id1 id2 time v1 v2 v3 v4 v5 v6 v7 v8 id
#1.a 1 0.26550866 a 0.20597457 0.82094629 0.3390729 0.23962942 NA 0.57487220 NA NA 1
#2.a 2 0.37212390 a 0.17655675 0.64706019 0.8394404 0.05893438 NA 0.07706438 NA NA 2
#...
我认为你想用 tidyr 实现的是这样:
library(tidyr)
# NOTE: gather()/spread() are superseded in current tidyr by
# pivot_longer()/pivot_wider(); they are kept here to match the question.

# Stack every non-id column into key/value pairs: `var` holds the former
# column name and `val` the cell value.
long_df <- gather(df, var, val, -id1:-id2)

# Break each former column name into a variable part and a time part.
split_df <- separate(long_df, var, c("var", "time"))

# Spread the variable part back out into columns.
wide_df <- spread(split_df, var, val)

# Show the first rows of the result.
head(wide_df)
## id1 id2 time v1 v2 v3 v4 v5 v6 v7 v8
## 1 1 0.2655087 a 0.2059746 0.8209463 0.3390729 0.23962942 NA 0.57487220 NA NA
## 2 1 0.2655087 b 0.9347052 0.4776196 NA 0.65472393 NA 0.68278808 NA NA
## 3 1 0.2655087 c 0.4820801 0.9128759 0.4346595 0.97617069 0.9918386 0.61464497 NA NA
## 4 1 0.2655087 d NA NA NA NA NA NA NA 0.2847905
## 5 2 0.3721239 a 0.1765568 0.6470602 0.8394404 0.05893438 NA 0.07706438 NA NA
## 6 2 0.3721239 b 0.2121425 0.8612095 NA 0.35319727 NA 0.60154122 NA NA
我有一个不平衡的宽数据框,看起来像这样:
# Reproducible example: a wide data frame with two id columns and measurement
# columns named <stub>.<suffix>. Some stub/suffix combinations are left out
# on purpose (see the commented-out arguments), which makes it unbalanced.
set.seed(1)
df <- data.frame(
  id1  = seq_len(10),
  id2  = runif(10),
  v1.a = runif(10),
  v1.b = runif(10),
  v1.c = runif(10),
  v2.a = runif(10),
  v2.b = runif(10),
  v2.c = runif(10),
  v3.a = runif(10),
  # v3.b = runif(10),   # omitted: v3 has no "b" column
  v3.c = runif(10),
  v4.a = runif(10),
  v4.b = runif(10),
  v4.c = runif(10),
  # v5.a = runif(10),   # omitted: v5 has no "a" column
  # v5.b = runif(10),   # omitted: v5 has no "b" column
  v5.c = runif(10),
  v6.a = runif(10),
  v6.b = runif(10),
  v6.c = runif(10),
  # v7 is present for a, b and c but holds only NA
  v7.a = rep(NA, 10),
  v7.b = rep(NA, 10),
  v7.c = rep(NA, 10),
  # v8 is the only stub with a "d" suffix
  v8.d = runif(10)
)
我正在尝试将它变成长格式。 reshape
失败,因为并非每个变量在每个时间点都有对应的列,所以我转向 splitstackshape
中的 Reshape
。
library(splitstackshape)
# Positions of the time-varying columns: names ending in ".a", ".b", ".c"
# or ".d". The backslash is doubled because "\." is not a valid escape in an
# R string literal; "\\." reaches the regex engine as a literal dot.
vary <- grep("\\.a$|\\.b$|\\.c$|\\.d$", names(df))

# Stub of each varying column: everything before the first dot, de-duplicated.
stubs <- unique(sub("\\..*$", "", names(df)[vary]))

# Reshape to long form with splitstackshape::Reshape.
# NOTE(review): the question reports that this call misaligns stubs whose
# suffixes are incomplete (v3 has no ".b" column) -- see the output below.
df2 <- Reshape(df,
               id.vars = c("id1", "id2"),
               var.stubs = stubs,
               sep = ".")
不过,最后的结果似乎不太对。例如,v3
缺少 "b" 的输入,我假设它对应时间 2。在 df2
中,长格式的 v3
在时间 1 和时间 2 有值,而在时间 3 没有值。
id1 id2 time v1 v2 v3
1 1 0.26550866 1 0.20597457 0.82094629 0.3390729
2 2 0.37212390 1 0.17655675 0.64706019 0.8394404
3 3 0.57285336 1 0.68702285 0.78293276 0.3466835
4 4 0.90820779 1 0.38410372 0.55303631 0.3337749
5 5 0.20168193 1 0.76984142 0.52971958 0.4763512
6 6 0.89838968 1 0.49769924 0.78935623 0.8921983
7 7 0.94467527 1 0.71761851 0.02333120 0.8643395
8 8 0.66079779 1 0.99190609 0.47723007 0.3899895
9 9 0.62911404 1 0.38003518 0.73231374 0.7773207
10 10 0.06178627 1 0.77744522 0.69273156 0.9606180
11 1 0.26550866 2 0.93470523 0.47761962 0.4346595
12 2 0.37212390 2 0.21214252 0.86120948 0.7125147
13 3 0.57285336 2 0.65167377 0.43809711 0.3999944
14 4 0.90820779 2 0.12555510 0.24479728 0.3253522
15 5 0.20168193 2 0.26722067 0.07067905 0.7570871
16 6 0.89838968 2 0.38611409 0.09946616 0.2026923
17 7 0.94467527 2 0.01339033 0.31627171 0.7111212
18 8 0.66079779 2 0.38238796 0.51863426 0.1216919
19 9 0.62911404 2 0.86969085 0.66200508 0.2454885
20 10 0.06178627 2 0.34034900 0.40683019 0.1433044
21 1 0.26550866 3 0.48208012 0.91287592 NA
22 2 0.37212390 3 0.59956583 0.29360337 NA
23 3 0.57285336 3 0.49354131 0.45906573 NA
24 4 0.90820779 3 0.18621760 0.33239467 NA
25 5 0.20168193 3 0.82737332 0.65087047 NA
26 6 0.89838968 3 0.66846674 0.25801678 NA
27 7 0.94467527 3 0.79423986 0.47854525 NA
28 8 0.66079779 3 0.10794363 0.76631067 NA
29 9 0.62911404 3 0.72371095 0.08424691 NA
30 10 0.06178627 3 0.41127443 0.87532133 NA
我是不是搞错了?
使用 melt
或 gather
是否有更好的选择?我尝试了几种方法,但运气不佳。我的实际用例包括 1302 个我称之为 vary
的列、3 个时间段(a、b、c)和 821 个唯一 stubs
(显然不平衡)。
或许,我们可以使用 data.table
中的 melt
,它可以在 measure
中使用多个 patterns
。使用 data.table
更容易,因为它支持多个模式:
library(data.table)
# Turn df into a data.table; setDT is called for its effect on df, so the
# result is not assigned.
setDT(df)

# Split every measurement column name (all but the first two columns) on "."
# into two fields, read as columns V1 and V2.
d1 <- read.table(text = names(df)[-(1:2)], sep = ".")

# Add, filled with NA, every V1.V2 combination that is not yet a column.
df[, (setdiff(
  outer(d1$V1, d1$V2, FUN = paste, sep = "."),
  names(df)[-(1:2)]
)) := NA]

# With the columns in name order, melt using one pattern per stub v1..v8,
# naming the resulting value columns v1..v8.
melt(
  df[, order(names(df)), with = FALSE],
  measure = patterns(paste0("v", 1:8)),
  value.name = paste0("v", 1:8)
)
或者可以是melt/dcast
# Melt every non-id column to long form. `id.vars` is spelled out in full
# instead of relying on partial matching of `id.var`.
# NOTE(review): assumes df is already a data.table (setDT(df) in the previous
# snippet) -- confirm when running this snippet on its own.
long <- melt(df, id.vars = c("id1", "id2"))

# Split each former column name on the literal dot into var1 and var2
# (for "v1.a": var1 = "v1", var2 = "a").
long[, c("var1", "var2") := tstrsplit(variable, "[.]")]

# Cast back to wide: one row per id1/id2/var2, one column per var1 value.
res <- dcast(long, id1 + id2 + var2 ~ var1, value.var = "value")

# Show the result ordered by var2, then id1.
res[order(var2, id1)]
# id1 id2 var2 v1 v2 v3 v4 v5 v6 v7 v8
# 1: 1 0.26550866 a 0.20597457 0.82094629 0.3390729 0.23962942 NA 0.57487220 NA NA
# 2: 2 0.37212390 a 0.17655675 0.64706019 0.8394404 0.05893438 NA 0.07706438 NA NA
# 3: 3 0.57285336 a 0.68702285 0.78293276 0.3466835 0.64228826 NA 0.03554058 NA NA
# 4: 4 0.90820779 a 0.38410372 0.55303631 0.3337749 0.87626921 NA 0.64279549 NA NA
# 5: 5 0.20168193 a 0.76984142 0.52971958 0.4763512 0.77891468 NA 0.92861520 NA NA
# 6: 6 0.89838968 a 0.49769924 0.78935623 0.8921983 0.79730883 NA 0.59809242 NA NA
# 7: 7 0.94467527 a 0.71761851 0.02333120 0.8643395 0.45527445 NA 0.56090075 NA NA
# 8: 8 0.66079779 a 0.99190609 0.47723007 0.3899895 0.41008408 NA 0.52602772 NA NA
# 9: 9 0.62911404 a 0.38003518 0.73231374 0.7773207 0.81087024 NA 0.98509522 NA NA
#10: 10 0.06178627 a 0.77744522 0.69273156 0.9606180 0.60493329 NA 0.50764182 NA NA
#11: 1 0.26550866 b 0.93470523 0.47761962 NA 0.65472393 NA 0.68278808 NA NA
#12: 2 0.37212390 b 0.21214252 0.86120948 NA 0.35319727 NA 0.60154122 NA NA
#13: 3 0.57285336 b 0.65167377 0.43809711 NA 0.27026015 NA 0.23886868 NA NA
#14: 4 0.90820779 b 0.12555510 0.24479728 NA 0.99268406 NA 0.25816593 NA NA
#15: 5 0.20168193 b 0.26722067 0.07067905 NA 0.63349326 NA 0.72930962 NA NA
#16: 6 0.89838968 b 0.38611409 0.09946616 NA 0.21320814 NA 0.45257083 NA NA
#17: 7 0.94467527 b 0.01339033 0.31627171 NA 0.12937235 NA 0.17512677 NA NA
#18: 8 0.66079779 b 0.38238796 0.51863426 NA 0.47811803 NA 0.74669827 NA NA
#19: 9 0.62911404 b 0.86969085 0.66200508 NA 0.92407447 NA 0.10498764 NA NA
#20: 10 0.06178627 b 0.34034900 0.40683019 NA 0.59876097 NA 0.86454495 NA NA
#21: 1 0.26550866 c 0.48208012 0.91287592 0.4346595 0.97617069 0.9918386 0.61464497 NA NA
#22: 2 0.37212390 c 0.59956583 0.29360337 0.7125147 0.73179251 0.4955936 0.55715954 NA NA
#23: 3 0.57285336 c 0.49354131 0.45906573 0.3999944 0.35672691 0.4843495 0.32877732 NA NA
#24: 4 0.90820779 c 0.18621760 0.33239467 0.3253522 0.43147369 0.1734423 0.45313145 NA NA
#25: 5 0.20168193 c 0.82737332 0.65087047 0.7570871 0.14821156 0.7548209 0.50044097 NA NA
#26: 6 0.89838968 c 0.66846674 0.25801678 0.2026923 0.01307758 0.4538955 0.18086636 NA NA
#27: 7 0.94467527 c 0.79423986 0.47854525 0.7111212 0.71556607 0.5111698 0.52963060 NA NA
#28: 8 0.66079779 c 0.10794363 0.76631067 0.1216919 0.10318424 0.2075451 0.07527575 NA NA
#29: 9 0.62911404 c 0.72371095 0.08424691 0.2454885 0.44628435 0.2286581 0.27775593 NA NA
#30: 10 0.06178627 c 0.41127443 0.87532133 0.1433044 0.64010105 0.5957120 0.21269952 NA NA
#31: 1 0.26550866 d NA NA NA NA NA NA NA 0.28479048
#32: 2 0.37212390 d NA NA NA NA NA NA NA 0.89509410
#33: 3 0.57285336 d NA NA NA NA NA NA NA 0.44623532
#34: 4 0.90820779 d NA NA NA NA NA NA NA 0.77998489
#35: 5 0.20168193 d NA NA NA NA NA NA NA 0.88061903
#36: 6 0.89838968 d NA NA NA NA NA NA NA 0.41312421
#37: 7 0.94467527 d NA NA NA NA NA NA NA 0.06380848
#38: 8 0.66079779 d NA NA NA NA NA NA NA 0.33548749
#39: 9 0.62911404 d NA NA NA NA NA NA NA 0.72372595
#40: 10 0.06178627 d NA NA NA NA NA NA NA 0.33761533
试试这个,改编自其他链接的答案:
# Split each measurement column name (all but the first two columns) on the
# literal dot. The backslash is doubled because "\." is not a valid escape in
# an R string literal.
# NOTE(review): the `df[...] <- NA` step below assumes df is the plain
# data.frame from the question, not a data.table -- confirm before running.
spl <- strsplit(names(df)[-(1:2)], "\\.")

# Unique first parts (stubs) and second parts (suffixes) of the split names;
# vapply pins the result to a character vector regardless of input.
stub_names <- unique(vapply(spl, `[`, character(1), 1))
time_names <- unique(vapply(spl, `[`, character(1), 2))

# Every stub.suffix combination, flattened to a character vector.
allvars <- c(outer(stub_names, time_names, paste, sep = "."))

# Add the combinations that are not yet columns, filled with NA, so that
# every stub has a column for every suffix.
df[setdiff(allvars, names(df))] <- NA

reshape(df, direction = "long", sep = ".", varying = allvars)
# id1 id2 time v1 v2 v3 v4 v5 v6 v7 v8 id
#1.a 1 0.26550866 a 0.20597457 0.82094629 0.3390729 0.23962942 NA 0.57487220 NA NA 1
#2.a 2 0.37212390 a 0.17655675 0.64706019 0.8394404 0.05893438 NA 0.07706438 NA NA 2
#...
我认为你想用 tidyr 实现的是这样:
library(tidyr)
# NOTE: gather()/spread() are superseded in current tidyr by
# pivot_longer()/pivot_wider(); they are kept here to match the question.

# Stack every non-id column into key/value pairs: `var` holds the former
# column name and `val` the cell value.
long_df <- gather(df, var, val, -id1:-id2)

# Break each former column name into a variable part and a time part.
split_df <- separate(long_df, var, c("var", "time"))

# Spread the variable part back out into columns.
wide_df <- spread(split_df, var, val)

# Show the first rows of the result.
head(wide_df)
## id1 id2 time v1 v2 v3 v4 v5 v6 v7 v8
## 1 1 0.2655087 a 0.2059746 0.8209463 0.3390729 0.23962942 NA 0.57487220 NA NA
## 2 1 0.2655087 b 0.9347052 0.4776196 NA 0.65472393 NA 0.68278808 NA NA
## 3 1 0.2655087 c 0.4820801 0.9128759 0.4346595 0.97617069 0.9918386 0.61464497 NA NA
## 4 1 0.2655087 d NA NA NA NA NA NA NA 0.2847905
## 5 2 0.3721239 a 0.1765568 0.6470602 0.8394404 0.05893438 NA 0.07706438 NA NA
## 6 2 0.3721239 b 0.2121425 0.8612095 NA 0.35319727 NA 0.60154122 NA NA