在 R 中用几组列名熔化数据
Melting data with several groups of column names in R
我正在处理一些数据,对多个协变量进行重复测量。示例数据格式是这样的
# Reproducible example data: three subjects, twelve normally distributed
# repeated measures (a1..a4, b1..b4, c1..c4) and three uniformly distributed
# covariates that are measured only once (d, e, f).
set.seed(123)
df <- data.frame(
  id = 1001:1003,
  matrix(rnorm(36), nrow = 3, ncol = 12),
  d = runif(3),
  e = runif(3),
  f = runif(3)
)
# The identifier column is deliberately renamed to "df" here; the snippets
# further down refer to it by that name.
names(df) <- c(
  "df",
  paste0(rep(c("a", "b", "c"), each = 4), 1:4),
  "d", "e", "f"
)
df
id a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d e f
1001 -0.5604756 0.07050839 ......
1002 -0.2301775 0.12928774
1003 1.5587083 1.71506499 ...
我希望我的数据在每个协变量中都转换为长格式,例如
id time a b c d e f
1001 1 -0.5604756 0.4007715 ..
1001 2 0.07050839
1001 3
1001 4
1002 1
1002 2
1002 3
1002 4
1003 1
1003 2
1003 3
1003 4
非常感谢。
有点乱七八糟,但确实有效。
library(reshape)
# Covariate prefixes to reshape; each one becomes a column of the long result.
vars <- letters[1:6]
# Split the wide data frame into a list of data frames, one per covariate,
# each holding the id column "df" plus that covariate's columns.
# The backslash has to be doubled inside an R string literal ("\d" is an
# unrecognized escape and fails at parse time). The pattern is anchored so
# that only whole column names such as "a1" or "d" are matched.
dfs <- lapply(vars, function(x)
  df[grep(paste0("^(df|", x, "\\d?)$"), colnames(df))])
# Melt each piece to long form (reshape's melt); the former column name ends
# up in the factor column "time".
dfs <- lapply(dfs, function(piece)
  melt(piece, id.vars = "df", variable_name = "time"))
# Replace the "time" factor by its integer level code and rename the "value"
# column (third column) to the covariate's letter.
for (i in seq_along(dfs)) {
  dfs[[i]]$time <- as.numeric(dfs[[i]]$time)
  colnames(dfs[[i]])[3] <- vars[i]
}
# Full outer join of all pieces on (df, time); covariates with a single
# measurement are NA for the time points they do not have.
df.final <- Reduce(
  function(x, y) merge(x = x, y = y, all = TRUE, by = c("df", "time")),
  dfs
)
df.final
df time a b c d e f
1 1001 1 -0.56047565 0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
2 1001 2 0.07050839 1.7869131 0.1533731 NA NA NA
3 1001 3 0.46091621 0.7013559 0.4264642 NA NA NA
4 1001 4 -0.44566197 -0.2179749 0.8781335 NA NA NA
5 1002 1 -0.23017749 0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
6 1002 2 0.12928774 0.4978505 -1.1381369 NA NA NA
7 1002 3 -1.26506123 -0.4727914 -0.2950715 NA NA NA
8 1002 4 1.22408180 -1.0260044 0.8215811 NA NA NA
9 1003 1 1.55870831 -0.5558411 0.8377870 0.4753165741 0.6127710 0.2436195
10 1003 2 1.71506499 -1.9666172 1.2538149 NA NA NA
11 1003 3 -0.68685285 -1.0678237 0.8951257 NA NA NA
12 1003 4 0.35981383 -0.7288912 0.6886403 NA NA NA
这是使用最近发布的 tidyr 包的方法。请注意,我稍微改动了您的示例数据(把 d、e、f 列改名为 d1、e1、f1);您是否需要它与原来完全一致?
示例数据:
# Same example data as in the question, except that the once-measured
# covariates are named d1, e1, f1 so every measure column ends in a digit.
set.seed(123)
df <- data.frame(
  id = 1001:1003,
  matrix(rnorm(36), nrow = 3, ncol = 12),
  d = runif(3), e = runif(3), f = runif(3)
)
colnames(df) <- c(
  "df",
  paste0("a", 1:4), paste0("b", 1:4), paste0("c", 1:4),
  paste0(c("d", "e", "f"), 1)
)
df
代码:
library(tidyr)
library(dplyr)
# Wide -> long -> wide-by-letter:
#   1. stack every column except the id column "df" into key/value pairs,
#   2. split each key (e.g. "a1") into its letter and its digit,
#   3. spread the letters back out so each covariate gets its own column.
df %>%
  gather(key = "key", value = "value", -df) %>%
  extract(
    col = key,
    into = c("letter", "number"),
    regex = "([[:alpha:]])([[:digit:]])"
  ) %>%
  spread(key = letter, value = value)
结果:
df number a b c d e f
1 1001 1 -0.56047565 0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
2 1001 2 0.07050839 1.7869131 0.1533731 NA NA NA
3 1001 3 0.46091621 0.7013559 0.4264642 NA NA NA
4 1001 4 -0.44566197 -0.2179749 0.8781335 NA NA NA
5 1002 1 -0.23017749 0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
6 1002 2 0.12928774 0.4978505 -1.1381369 NA NA NA
7 1002 3 -1.26506123 -0.4727914 -0.2950715 NA NA NA
8 1002 4 1.22408180 -1.0260044 0.8215811 NA NA NA
9 1003 1 1.55870831 -0.5558411 0.8377870 0.4753165741 0.6127710 0.2436195
10 1003 2 1.71506499 -1.9666172 1.2538149 NA NA NA
11 1003 3 -0.68685285 -1.0678237 0.8951257 NA NA NA
12 1003 4 0.35981383 -0.7288912 0.6886403 NA NA NA
data.table(v1.9.5 及以上版本)的 melt 可以同时熔化为多列。您可以按照这里的说明进行安装。
# library() stops with an error when the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(data.table) # v1.9.5+
# Melt all covariates at once: each regular expression selects the columns of
# one covariate (e.g. a1..a4, or just d / d1), and value.name supplies the
# name of the resulting column for each pattern.
# as.data.table() works on a copy, so `df` itself stays a plain data.frame
# (setDT() would convert it in place and change how df[-(14:16)] behaves).
melt(as.data.table(df),
     measure.vars = patterns(paste0("^", letters[1:6], "[0-9]*$")),
     value.name = letters[1:6])
# df variable a b c d e f
# 1: 1001 1 -0.56047565 0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
# 2: 1002 1 -0.23017749 0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
# 3: 1003 1 1.55870831 -0.5558411 0.8377870 0.4753165741 0.6127710 0.2436195
# 4: 1001 2 0.07050839 1.7869131 0.1533731 NA NA NA
# 5: 1002 2 0.12928774 0.4978505 -1.1381369 NA NA NA
# 6: 1003 2 1.71506499 -1.9666172 1.2538149 NA NA NA
# 7: 1001 3 0.46091621 0.7013559 0.4264642 NA NA NA
# 8: 1002 3 -1.26506123 -0.4727914 -0.2950715 NA NA NA
# 9: 1003 3 -0.68685285 -1.0678237 0.8951257 NA NA NA
# 10: 1001 4 -0.44566197 -0.2179749 0.8781335 NA NA NA
# 11: 1002 4 1.22408180 -1.0260044 0.8215811 NA NA NA
# 12: 1003 4 0.35981383 -0.7288912 0.6886403 NA NA NA
这将为您提供宽列的长格式:
# Reshape the repeated measures to long format with base R's reshape().
# Columns 14:16 are dropped from the input, and column 1 (the id "df") plus
# columns 14:16 are excluded from the set of varying columns.
res <- reshape(
  df[-(14:16)],
  varying = names(df)[-c(1, 14:16)],
  sep = "", # the default is "." but the column names have no separator
  idvar = "df",
  direction = "long"
)
res
df time a b c
1001.1 1001 1 -0.56047565 0.4007715 -0.6250393
1002.1 1002 1 -0.23017749 0.1106827 -1.6866933
1003.1 1003 1 1.55870831 -0.5558411 0.8377870
1001.2 1001 2 0.07050839 1.7869131 0.1533731
1002.2 1002 2 0.12928774 0.4978505 -1.1381369
1003.2 1003 2 1.71506499 -1.9666172 1.2538149
1001.3 1001 3 0.46091621 0.7013559 0.4264642
1002.3 1002 3 -1.26506123 -0.4727914 -0.2950715
1003.3 1003 3 -0.68685285 -1.0678237 0.8951257
1001.4 1001 4 -0.44566197 -0.2179749 0.8781335
1002.4 1002 4 1.22408180 -1.0260044 0.8215811
1003.4 1003 4 0.35981383 -0.7288912 0.6886403
您可以将其与不重复的列合并:
# Join the long result with the id column and columns 14:16 of the wide data;
# with no `by` given, merge() joins on the column names the two inputs share.
merge(x = res, y = df[c(1, 14:16)])
df time a b c d
1 1001 1 -0.56047565 0.4007715 -0.6250393 0.7101824014
2 1001 4 -0.44566197 -0.2179749 0.8781335 0.7101824014
3 1001 3 0.46091621 0.7013559 0.4264642 0.7101824014
4 1001 2 0.07050839 1.7869131 0.1533731 0.7101824014
5 1002 2 0.12928774 0.4978505 -1.1381369 0.0006247733
6 1002 1 -0.23017749 0.1106827 -1.6866933 0.0006247733
7 1002 4 1.22408180 -1.0260044 0.8215811 0.0006247733
8 1002 3 -1.26506123 -0.4727914 -0.2950715 0.0006247733
9 1003 3 -0.68685285 -1.0678237 0.8951257 0.4753165741
10 1003 2 1.71506499 -1.9666172 1.2538149 0.4753165741
11 1003 1 1.55870831 -0.5558411 0.8377870 0.4753165741
12 1003 4 0.35981383 -0.7288912 0.6886403 0.4753165741
e f
1 0.2201189 0.3517979
2 0.2201189 0.3517979
3 0.2201189 0.3517979
4 0.2201189 0.3517979
5 0.3798165 0.1111354
6 0.3798165 0.1111354
7 0.3798165 0.1111354
8 0.3798165 0.1111354
9 0.6127710 0.2436195
10 0.6127710 0.2436195
11 0.6127710 0.2436195
12 0.6127710 0.2436195
我正在处理一些数据,对多个协变量进行重复测量。示例数据格式是这样的
# Reproducible example data: three subjects, twelve normally distributed
# repeated measures (a1..a4, b1..b4, c1..c4) and three uniformly distributed
# covariates that are measured only once (d, e, f).
set.seed(123)
df <- data.frame(
  id = 1001:1003,
  matrix(rnorm(36), nrow = 3, ncol = 12),
  d = runif(3),
  e = runif(3),
  f = runif(3)
)
# The identifier column is deliberately renamed to "df" here; the snippets
# further down refer to it by that name.
names(df) <- c(
  "df",
  paste0(rep(c("a", "b", "c"), each = 4), 1:4),
  "d", "e", "f"
)
df
id a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d e f
1001 -0.5604756 0.07050839 ......
1002 -0.2301775 0.12928774
1003 1.5587083 1.71506499 ...
我希望我的数据在每个协变量中都转换为长格式,例如
id time a b c d e f
1001 1 -0.5604756 0.4007715 ..
1001 2 0.07050839
1001 3
1001 4
1002 1
1002 2
1002 3
1002 4
1003 1
1003 2
1003 3
1003 4
非常感谢。
有点乱七八糟,但确实有效。
library(reshape)
# Covariate prefixes to reshape; each one becomes a column of the long result.
vars <- letters[1:6]
# Split the wide data frame into a list of data frames, one per covariate,
# each holding the id column "df" plus that covariate's columns.
# The backslash has to be doubled inside an R string literal ("\d" is an
# unrecognized escape and fails at parse time). The pattern is anchored so
# that only whole column names such as "a1" or "d" are matched.
dfs <- lapply(vars, function(x)
  df[grep(paste0("^(df|", x, "\\d?)$"), colnames(df))])
# Melt each piece to long form (reshape's melt); the former column name ends
# up in the factor column "time".
dfs <- lapply(dfs, function(piece)
  melt(piece, id.vars = "df", variable_name = "time"))
# Replace the "time" factor by its integer level code and rename the "value"
# column (third column) to the covariate's letter.
for (i in seq_along(dfs)) {
  dfs[[i]]$time <- as.numeric(dfs[[i]]$time)
  colnames(dfs[[i]])[3] <- vars[i]
}
# Full outer join of all pieces on (df, time); covariates with a single
# measurement are NA for the time points they do not have.
df.final <- Reduce(
  function(x, y) merge(x = x, y = y, all = TRUE, by = c("df", "time")),
  dfs
)
df.final
df time a b c d e f
1 1001 1 -0.56047565 0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
2 1001 2 0.07050839 1.7869131 0.1533731 NA NA NA
3 1001 3 0.46091621 0.7013559 0.4264642 NA NA NA
4 1001 4 -0.44566197 -0.2179749 0.8781335 NA NA NA
5 1002 1 -0.23017749 0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
6 1002 2 0.12928774 0.4978505 -1.1381369 NA NA NA
7 1002 3 -1.26506123 -0.4727914 -0.2950715 NA NA NA
8 1002 4 1.22408180 -1.0260044 0.8215811 NA NA NA
9 1003 1 1.55870831 -0.5558411 0.8377870 0.4753165741 0.6127710 0.2436195
10 1003 2 1.71506499 -1.9666172 1.2538149 NA NA NA
11 1003 3 -0.68685285 -1.0678237 0.8951257 NA NA NA
12 1003 4 0.35981383 -0.7288912 0.6886403 NA NA NA
这是使用最近发布的 tidyr 包的方法。请注意,我稍微改动了您的示例数据(把 d、e、f 列改名为 d1、e1、f1);您是否需要它与原来完全一致?
示例数据:
# Same example data as in the question, except that the once-measured
# covariates are named d1, e1, f1 so every measure column ends in a digit.
set.seed(123)
df <- data.frame(
  id = 1001:1003,
  matrix(rnorm(36), nrow = 3, ncol = 12),
  d = runif(3), e = runif(3), f = runif(3)
)
colnames(df) <- c(
  "df",
  paste0("a", 1:4), paste0("b", 1:4), paste0("c", 1:4),
  paste0(c("d", "e", "f"), 1)
)
df
代码:
library(tidyr)
library(dplyr)
# Wide -> long -> wide-by-letter:
#   1. stack every column except the id column "df" into key/value pairs,
#   2. split each key (e.g. "a1") into its letter and its digit,
#   3. spread the letters back out so each covariate gets its own column.
df %>%
  gather(key = "key", value = "value", -df) %>%
  extract(
    col = key,
    into = c("letter", "number"),
    regex = "([[:alpha:]])([[:digit:]])"
  ) %>%
  spread(key = letter, value = value)
结果:
df number a b c d e f
1 1001 1 -0.56047565 0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
2 1001 2 0.07050839 1.7869131 0.1533731 NA NA NA
3 1001 3 0.46091621 0.7013559 0.4264642 NA NA NA
4 1001 4 -0.44566197 -0.2179749 0.8781335 NA NA NA
5 1002 1 -0.23017749 0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
6 1002 2 0.12928774 0.4978505 -1.1381369 NA NA NA
7 1002 3 -1.26506123 -0.4727914 -0.2950715 NA NA NA
8 1002 4 1.22408180 -1.0260044 0.8215811 NA NA NA
9 1003 1 1.55870831 -0.5558411 0.8377870 0.4753165741 0.6127710 0.2436195
10 1003 2 1.71506499 -1.9666172 1.2538149 NA NA NA
11 1003 3 -0.68685285 -1.0678237 0.8951257 NA NA NA
12 1003 4 0.35981383 -0.7288912 0.6886403 NA NA NA
data.table(v1.9.5 及以上版本)的 melt 可以同时熔化为多列。您可以按照这里的说明进行安装。
# library() stops with an error when the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(data.table) # v1.9.5+
# Melt all covariates at once: each regular expression selects the columns of
# one covariate (e.g. a1..a4, or just d / d1), and value.name supplies the
# name of the resulting column for each pattern.
# as.data.table() works on a copy, so `df` itself stays a plain data.frame
# (setDT() would convert it in place and change how df[-(14:16)] behaves).
melt(as.data.table(df),
     measure.vars = patterns(paste0("^", letters[1:6], "[0-9]*$")),
     value.name = letters[1:6])
# df variable a b c d e f
# 1: 1001 1 -0.56047565 0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
# 2: 1002 1 -0.23017749 0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
# 3: 1003 1 1.55870831 -0.5558411 0.8377870 0.4753165741 0.6127710 0.2436195
# 4: 1001 2 0.07050839 1.7869131 0.1533731 NA NA NA
# 5: 1002 2 0.12928774 0.4978505 -1.1381369 NA NA NA
# 6: 1003 2 1.71506499 -1.9666172 1.2538149 NA NA NA
# 7: 1001 3 0.46091621 0.7013559 0.4264642 NA NA NA
# 8: 1002 3 -1.26506123 -0.4727914 -0.2950715 NA NA NA
# 9: 1003 3 -0.68685285 -1.0678237 0.8951257 NA NA NA
# 10: 1001 4 -0.44566197 -0.2179749 0.8781335 NA NA NA
# 11: 1002 4 1.22408180 -1.0260044 0.8215811 NA NA NA
# 12: 1003 4 0.35981383 -0.7288912 0.6886403 NA NA NA
这将为您提供宽列的长格式:
# Reshape the repeated measures to long format with base R's reshape().
# Columns 14:16 are dropped from the input, and column 1 (the id "df") plus
# columns 14:16 are excluded from the set of varying columns.
res <- reshape(
  df[-(14:16)],
  varying = names(df)[-c(1, 14:16)],
  sep = "", # the default is "." but the column names have no separator
  idvar = "df",
  direction = "long"
)
res
df time a b c
1001.1 1001 1 -0.56047565 0.4007715 -0.6250393
1002.1 1002 1 -0.23017749 0.1106827 -1.6866933
1003.1 1003 1 1.55870831 -0.5558411 0.8377870
1001.2 1001 2 0.07050839 1.7869131 0.1533731
1002.2 1002 2 0.12928774 0.4978505 -1.1381369
1003.2 1003 2 1.71506499 -1.9666172 1.2538149
1001.3 1001 3 0.46091621 0.7013559 0.4264642
1002.3 1002 3 -1.26506123 -0.4727914 -0.2950715
1003.3 1003 3 -0.68685285 -1.0678237 0.8951257
1001.4 1001 4 -0.44566197 -0.2179749 0.8781335
1002.4 1002 4 1.22408180 -1.0260044 0.8215811
1003.4 1003 4 0.35981383 -0.7288912 0.6886403
您可以将其与不重复的列合并:
# Join the long result with the id column and columns 14:16 of the wide data;
# with no `by` given, merge() joins on the column names the two inputs share.
merge(x = res, y = df[c(1, 14:16)])
df time a b c d
1 1001 1 -0.56047565 0.4007715 -0.6250393 0.7101824014
2 1001 4 -0.44566197 -0.2179749 0.8781335 0.7101824014
3 1001 3 0.46091621 0.7013559 0.4264642 0.7101824014
4 1001 2 0.07050839 1.7869131 0.1533731 0.7101824014
5 1002 2 0.12928774 0.4978505 -1.1381369 0.0006247733
6 1002 1 -0.23017749 0.1106827 -1.6866933 0.0006247733
7 1002 4 1.22408180 -1.0260044 0.8215811 0.0006247733
8 1002 3 -1.26506123 -0.4727914 -0.2950715 0.0006247733
9 1003 3 -0.68685285 -1.0678237 0.8951257 0.4753165741
10 1003 2 1.71506499 -1.9666172 1.2538149 0.4753165741
11 1003 1 1.55870831 -0.5558411 0.8377870 0.4753165741
12 1003 4 0.35981383 -0.7288912 0.6886403 0.4753165741
e f
1 0.2201189 0.3517979
2 0.2201189 0.3517979
3 0.2201189 0.3517979
4 0.2201189 0.3517979
5 0.3798165 0.1111354
6 0.3798165 0.1111354
7 0.3798165 0.1111354
8 0.3798165 0.1111354
9 0.6127710 0.2436195
10 0.6127710 0.2436195
11 0.6127710 0.2436195
12 0.6127710 0.2436195