将 HRS 数据从宽格式重塑为长格式并创建时间变量
Reshaping the HRS data from wide to long format and creating a time variable
我有以下数据集(除 Weight = W 和 Height = H 外,还包含大约 25 个变量),时间跨度均为 10 年。
数据目前为以下形式,并且没有时间索引。
# Example wide-format data: one row per individual (factor column `data`,
# levels "Ind_1".."Ind_4") and one column per measurement, named
# r<index><variable> (r1weight..r5weight, r1height..r5height).
# Some height values are NA.
df <- structure(list(data = structure(1:4, .Label = c("Ind_1", "Ind_2",
"Ind_3", "Ind_4"), class = "factor"), r1weight = c(56, 76, 87, 64
), r2weight = c(57, 75, 88, 66), r3weight = c(56, 76, 87, 65), r4weight = c(56L,
73L, 85L, 63L), r5weight = c(55L, 77L, 84L, 65L), r1height = c(151L, 163L,
173L, 153L), r2height = c(154L, 164L, NA, 154L), r3height = c(NA, 165L, NA,
152L), r4height = c(153L, 162L, 172L, 154L), r5height = c(152,161,171,154)), class =
"data.frame", row.names = c(NA,
-4L))
data r1w r2w r3w r4w r5w r1h r2h r3h r4h r5h
1 Ind_1 56 57 56 56 55 151 154 NA 153 152
2 Ind_2 76 75 76 73 77 163 164 165 162 161
3 Ind_3 87 88 87 85 84 173 NA NA 172 171
4 Ind_4 64 66 65 63 65 153 154 152 154 154`
我需要添加时间变量并重塑为长格式,希望得到这样的结果。
# Desired long-format result: one row per individual and time point.
#   time - factor with levels "1".."5" (time index within each individual)
#   Ind  - numeric individual id (1..4)
#   W, H - weight and height at that time point (H contains NA)
# The time factor is built with factor(rep(...)) so that it has exactly five
# unique levels; a factor with integer codes 1:20 and repeated labels is not a
# valid factor (duplicated levels).
dflong <- data.frame(
  time = factor(rep(1:5, times = 4)),
  Ind = rep(c(1, 2, 3, 4), each = 5),
  W = c(56, 57, 56, 56, 55, 76, 75, 76, 73, 77,
        87, 88, 87, 85, 84, 64, 66, 65, 63, 65),
  H = c(151, 154, NA, 153, 152, 163, 164, 165, 162, 161,
        173, NA, NA, 172, 171, 153, 154, 152, 154, 154)
)
结果应如下所示:
time Ind W H
1 1 1 56 151
2 2 1 57 154
3 3 1 56 NA
4 4 1 56 153
5 5 1 55 152
6 1 2 76 163
7 2 2 75 164
8 3 2 76 165
9 4 2 73 162
10 5 2 77 161
11 1 3 87 173
12 2 3 88 NA
13 3 3 87 NA
14 4 3 85 172
15 5 3 84 171
16 1 4 64 153
17 2 4 66 154
18 3 4 65 152
19 4 4 63 154
20 5 4 65 154`
我尝试使用 reshape2 的命令,到目前为止我得到了:
library(reshape2)
dflong <- melt(df,id.vars = c("idhhpn",r1w-r10w, r1h-r10h (help writing compactly),
time(needs help constructing) )`
我不想逐个写出 "r1w, r2w, r3w",而是希望用类似 r1weight-r10weight 的紧凑写法,这样就不必为全部 25 个变量逐一写出所有 10 个时间点。
目前我用以下代码做到了这一步:
# Attempt to reshape every variable to long format with one melt() call:
# each pattern selects one group of columns, `value.name` names the resulting
# value columns, and `time` indexes the position within each group.
# The regex is written "\\D+" (escaped backslash): a bare "\D" is not a valid
# escape in an R string literal and the code would not parse.
# NOTE(review): the result is assigned to `melt`, the same name as the function.
# NOTE(review): "idhhpn" is used both as a measure pattern and as the id column
# created below - presumably it should be an id variable; confirm.
# NOTE(review): sub() is applied to `HRSdata` (the whole table), not to a
# column - presumably the id column was intended; confirm.
melt <- melt(setDT(HRSdata), measure = patterns("idhhpn", "srhlt", "highbp", "diabetes", "cancer", "lungev", "heartp", "strokev", "psychev", "arth", "obese", "agey", "marpart", "male", "black", "hispan", "logass", "logdebt", "atotal", "debt", "lths", "hsorged", "somehs", "scorAA", "bachelor", "graduate", "works62", "works65", "momagey", "dadagey", "dadalive", "momalive", "vigact3", "smokesn"),
value.name = c("idhhpn", "srhlt", "highbp", "diabetes", "cancer", "lungev", "heartp", "strokev", "psychev", "arth", "obese", "agey", "marpart", "male", "black", "hispan", "logass", "logdebt", "atotal", "debt", "lths", "hsorged", "somehs", "scorAA", "bachelor", "graduate", "works62", "works65", "momagey", "dadagey", "dadalive", "momalive", "vigact3", "smokesn"),
variable.name = "time")[,
idhhpn := as.integer(sub("\\D+", "", HRSdata))][order(idhhpn)][, .(time, idhhpn, srhlt, highbp, diabetes, cancer, lungev, heartp, strokev, psychev, arth, obese, agey, marpart, male, black, hispan, logass, logdebt, atotal, debt, lths, hsorged, somehs, scorAA, bachelor, graduate, works62, works65, momagey, dadagey, dadalive, momalive, vigact3, smokesn )]
您可以使用 data.table 中的 melt 函数,然后用 cbind 合并结果:
# Reshape by melting twice and binding the two halves side by side.
# NOTE(review): the "^H_" / "^W_" filters expect columns named like H_1, W_1;
# columns named r1height / r1weight would not match - adjust the patterns
# to the actual column names.
setDT(df)
# Melt once per variable, keep the rows of that variable, rename `value`
# to H / W, then column-bind the two results.
df <- cbind(setnames(melt(df)[grep("^H_",variable),],"value","H"),
setnames(melt(df)[grep("^W_",variable),],"value","W"))
# Ind: drop everything up to and including the last underscore of `data`.
df <- df[,Ind:=gsub(".*_","",data)] ##cleaning Ind_
# time: running row index within each individual; this presumably relies on
# the melted rows being in time order within each Ind - verify.
df <- df[, time:=1:.N, by = .(Ind)]
# Keep only the four columns of interest.
df <- df[,.(time,W,H,Ind)]
输出-
> df
time W H Ind
1: 1 56 151 1
2: 1 76 163 2
3: 1 87 173 3
4: 1 64 153 4
5: 2 57 154 1
6: 2 75 164 2
7: 2 88 NA 3
8: 2 66 154 4
9: 3 56 NA 1
10: 3 76 165 2
11: 3 87 NA 3
12: 3 65 152 4
13: 4 56 153 1
14: 4 73 162 2
15: 4 85 172 3
16: 4 63 154 4
17: 5 55 152 1
18: 5 77 161 2
19: 5 84 171 3
20: 5 65 154 4
使用 gather 和 spread 的 tidyverse 方法是:
library(tidyverse)
# Long-then-wide reshape:
#   gather()   stacks every column except `data` into key `time` / value `ind`
#   separate() splits the key into `indName` and `time`
#   spread()   turns each `indName` back into its own column
# NOTE(review): separate() is called without `sep`, so the key is presumably
# expected to contain a non-alphanumeric separator (e.g. "H_1", "W_1", as the
# H / W columns in the output suggest); names like "r1weight" have none.
df %>%
gather(time, ind, -data) %>%
separate(time, into = c("indName", "time")) %>%
spread(indName, ind)
# data time H W
#1 Ind_1 1 151 56
#2 Ind_1 2 154 57
#3 Ind_1 3 NA 56
#4 Ind_1 4 153 56
#5 Ind_1 5 152 55
#6 Ind_2 1 163 76
#7 Ind_2 2 164 75
#8 Ind_2 3 165 76
#9 Ind_2 4 162 73
#10 Ind_2 5 161 77
#11 Ind_3 1 173 87
#12 Ind_3 2 NA 88
#13 Ind_3 3 NA 87
#14 Ind_3 4 172 85
#15 Ind_3 5 171 84
#16 Ind_4 1 153 64
#17 Ind_4 2 154 66
#18 Ind_4 3 152 65
#19 Ind_4 4 154 63
#20 Ind_4 5 154 65
相同的解决方案,但修改后的变量名称为 "r[num][varname]"(@iod):
# Same long-then-wide reshape for column names of the form r<num><varname>
# (e.g. r1weight, r10height).
# The mutate() step rewrites "r<num><varname>" to "<num>_<varname>" so that
# separate() can split the key at the underscore.
#   - "\\1" (escaped backslash) is the regex backreference; a bare "\1" in an
#     R string is the control character U+0001.
#   - "[0-9]+" keeps multi-digit indices such as r10 intact.
df %>%
  gather(time, ind, -data) %>%
  mutate(time = sub("^r([0-9]+)", "\\1_", time)) %>%
  separate(time, into = c("time", "indName")) %>%
  spread(indName, ind)
data time height weight
1 Ind_1 1 151 56
2 Ind_1 2 154 57
3 Ind_1 3 NA 56
4 Ind_1 4 153 56
5 Ind_1 5 152 55
6 Ind_2 1 163 76
7 Ind_2 2 164 75
8 Ind_2 3 165 76
9 Ind_2 4 162 73
10 Ind_2 5 161 77
11 Ind_3 1 173 87
12 Ind_3 2 NA 88
13 Ind_3 3 NA 87
14 Ind_3 4 172 85
15 Ind_3 5 171 84
16 Ind_4 1 153 64
17 Ind_4 2 154 66
18 Ind_4 3 152 65
19 Ind_4 4 154 63
20 Ind_4 5 154 65
另一种做法是使用 data.table 的 melt,并借助 measure 参数中的 patterns()。本例中,列名带有 'weight'、'height' 这样的共同模式,我们在 measure 参数中指定这些模式,将数据转换为"长"格式,然后用 sub 提取 data 列中的数字部分来创建 'Ind'。
library(data.table)
# Melt the weight and height column groups in one call: each pattern selects
# one group, `value.name` names the two value columns, and `time` indexes the
# position of the source column within its group.
# The patterns are anchored (^r<digits><name>$) so that a variable whose name
# contains another variable's name cannot end up in the wrong group.
# "\\D+" (escaped backslash) strips the leading non-digit run of `data`
# ("Ind_1" -> "1"); a bare "\D" is not a valid R string escape.
melt(setDT(df), measure = patterns("^r[0-9]+weight$", "^r[0-9]+height$"),
     value.name = c("W", "H"), variable.name = "time")[,
  Ind := as.integer(sub("\\D+", "", data))][order(Ind)][, .(time, Ind, W, H)]
# time Ind W H
# 1: 1 1 56 151
# 2: 2 1 57 154
# 3: 3 1 56 NA
# 4: 4 1 56 153
# 5: 5 1 55 152
# 6: 1 2 76 163
# 7: 2 2 75 164
# 8: 3 2 76 165
# 9: 4 2 73 162
#10: 5 2 77 161
#11: 1 3 87 173
#12: 2 3 88 NA
#13: 3 3 87 NA
#14: 4 3 85 172
#15: 5 3 84 171
#16: 1 4 64 153
#17: 2 4 66 154
#18: 3 4 65 152
#19: 4 4 63 154
#20: 5 4 65 154
我有以下数据集(除 Weight = W 和 Height = H 外,还包含大约 25 个变量),时间跨度均为 10 年。
数据目前为以下形式,并且没有时间索引。
# Example wide-format data: one row per individual (factor column `data`,
# levels "Ind_1".."Ind_4") and one column per measurement, named
# r<index><variable> (r1weight..r5weight, r1height..r5height).
# Some height values are NA.
df <- structure(list(data = structure(1:4, .Label = c("Ind_1", "Ind_2",
"Ind_3", "Ind_4"), class = "factor"), r1weight = c(56, 76, 87, 64
), r2weight = c(57, 75, 88, 66), r3weight = c(56, 76, 87, 65), r4weight = c(56L,
73L, 85L, 63L), r5weight = c(55L, 77L, 84L, 65L), r1height = c(151L, 163L,
173L, 153L), r2height = c(154L, 164L, NA, 154L), r3height = c(NA, 165L, NA,
152L), r4height = c(153L, 162L, 172L, 154L), r5height = c(152,161,171,154)), class =
"data.frame", row.names = c(NA,
-4L))
data r1w r2w r3w r4w r5w r1h r2h r3h r4h r5h
1 Ind_1 56 57 56 56 55 151 154 NA 153 152
2 Ind_2 76 75 76 73 77 163 164 165 162 161
3 Ind_3 87 88 87 85 84 173 NA NA 172 171
4 Ind_4 64 66 65 63 65 153 154 152 154 154`
我需要添加时间变量并重塑为长格式,希望得到这样的结果。
# Desired long-format result: one row per individual and time point.
#   time - factor with levels "1".."5" (time index within each individual)
#   Ind  - numeric individual id (1..4)
#   W, H - weight and height at that time point (H contains NA)
# The time factor is built with factor(rep(...)) so that it has exactly five
# unique levels; a factor with integer codes 1:20 and repeated labels is not a
# valid factor (duplicated levels).
dflong <- data.frame(
  time = factor(rep(1:5, times = 4)),
  Ind = rep(c(1, 2, 3, 4), each = 5),
  W = c(56, 57, 56, 56, 55, 76, 75, 76, 73, 77,
        87, 88, 87, 85, 84, 64, 66, 65, 63, 65),
  H = c(151, 154, NA, 153, 152, 163, 164, 165, 162, 161,
        173, NA, NA, 172, 171, 153, 154, 152, 154, 154)
)
结果应如下所示:
time Ind W H
1 1 1 56 151
2 2 1 57 154
3 3 1 56 NA
4 4 1 56 153
5 5 1 55 152
6 1 2 76 163
7 2 2 75 164
8 3 2 76 165
9 4 2 73 162
10 5 2 77 161
11 1 3 87 173
12 2 3 88 NA
13 3 3 87 NA
14 4 3 85 172
15 5 3 84 171
16 1 4 64 153
17 2 4 66 154
18 3 4 65 152
19 4 4 63 154
20 5 4 65 154`
我尝试使用 reshape2 的命令,到目前为止我得到了:
library(reshape2)
dflong <- melt(df,id.vars = c("idhhpn",r1w-r10w, r1h-r10h (help writing compactly),
time(needs help constructing) )`
我不想逐个写出 "r1w, r2w, r3w",而是希望用类似 r1weight-r10weight 的紧凑写法,这样就不必为全部 25 个变量逐一写出所有 10 个时间点。
目前我用以下代码做到了这一步:
# Attempt to reshape every variable to long format with one melt() call:
# each pattern selects one group of columns, `value.name` names the resulting
# value columns, and `time` indexes the position within each group.
# The regex is written "\\D+" (escaped backslash): a bare "\D" is not a valid
# escape in an R string literal and the code would not parse.
# NOTE(review): the result is assigned to `melt`, the same name as the function.
# NOTE(review): "idhhpn" is used both as a measure pattern and as the id column
# created below - presumably it should be an id variable; confirm.
# NOTE(review): sub() is applied to `HRSdata` (the whole table), not to a
# column - presumably the id column was intended; confirm.
melt <- melt(setDT(HRSdata), measure = patterns("idhhpn", "srhlt", "highbp", "diabetes", "cancer", "lungev", "heartp", "strokev", "psychev", "arth", "obese", "agey", "marpart", "male", "black", "hispan", "logass", "logdebt", "atotal", "debt", "lths", "hsorged", "somehs", "scorAA", "bachelor", "graduate", "works62", "works65", "momagey", "dadagey", "dadalive", "momalive", "vigact3", "smokesn"),
value.name = c("idhhpn", "srhlt", "highbp", "diabetes", "cancer", "lungev", "heartp", "strokev", "psychev", "arth", "obese", "agey", "marpart", "male", "black", "hispan", "logass", "logdebt", "atotal", "debt", "lths", "hsorged", "somehs", "scorAA", "bachelor", "graduate", "works62", "works65", "momagey", "dadagey", "dadalive", "momalive", "vigact3", "smokesn"),
variable.name = "time")[,
idhhpn := as.integer(sub("\\D+", "", HRSdata))][order(idhhpn)][, .(time, idhhpn, srhlt, highbp, diabetes, cancer, lungev, heartp, strokev, psychev, arth, obese, agey, marpart, male, black, hispan, logass, logdebt, atotal, debt, lths, hsorged, somehs, scorAA, bachelor, graduate, works62, works65, momagey, dadagey, dadalive, momalive, vigact3, smokesn )]
您可以使用 data.table 中的 melt 函数,然后用 cbind 合并结果:
# Reshape by melting twice and binding the two halves side by side.
# NOTE(review): the "^H_" / "^W_" filters expect columns named like H_1, W_1;
# columns named r1height / r1weight would not match - adjust the patterns
# to the actual column names.
setDT(df)
# Melt once per variable, keep the rows of that variable, rename `value`
# to H / W, then column-bind the two results.
df <- cbind(setnames(melt(df)[grep("^H_",variable),],"value","H"),
setnames(melt(df)[grep("^W_",variable),],"value","W"))
# Ind: drop everything up to and including the last underscore of `data`.
df <- df[,Ind:=gsub(".*_","",data)] ##cleaning Ind_
# time: running row index within each individual; this presumably relies on
# the melted rows being in time order within each Ind - verify.
df <- df[, time:=1:.N, by = .(Ind)]
# Keep only the four columns of interest.
df <- df[,.(time,W,H,Ind)]
输出-
> df
time W H Ind
1: 1 56 151 1
2: 1 76 163 2
3: 1 87 173 3
4: 1 64 153 4
5: 2 57 154 1
6: 2 75 164 2
7: 2 88 NA 3
8: 2 66 154 4
9: 3 56 NA 1
10: 3 76 165 2
11: 3 87 NA 3
12: 3 65 152 4
13: 4 56 153 1
14: 4 73 162 2
15: 4 85 172 3
16: 4 63 154 4
17: 5 55 152 1
18: 5 77 161 2
19: 5 84 171 3
20: 5 65 154 4
使用 gather 和 spread 的 tidyverse 方法是:
library(tidyverse)
# Long-then-wide reshape:
#   gather()   stacks every column except `data` into key `time` / value `ind`
#   separate() splits the key into `indName` and `time`
#   spread()   turns each `indName` back into its own column
# NOTE(review): separate() is called without `sep`, so the key is presumably
# expected to contain a non-alphanumeric separator (e.g. "H_1", "W_1", as the
# H / W columns in the output suggest); names like "r1weight" have none.
df %>%
gather(time, ind, -data) %>%
separate(time, into = c("indName", "time")) %>%
spread(indName, ind)
# data time H W
#1 Ind_1 1 151 56
#2 Ind_1 2 154 57
#3 Ind_1 3 NA 56
#4 Ind_1 4 153 56
#5 Ind_1 5 152 55
#6 Ind_2 1 163 76
#7 Ind_2 2 164 75
#8 Ind_2 3 165 76
#9 Ind_2 4 162 73
#10 Ind_2 5 161 77
#11 Ind_3 1 173 87
#12 Ind_3 2 NA 88
#13 Ind_3 3 NA 87
#14 Ind_3 4 172 85
#15 Ind_3 5 171 84
#16 Ind_4 1 153 64
#17 Ind_4 2 154 66
#18 Ind_4 3 152 65
#19 Ind_4 4 154 63
#20 Ind_4 5 154 65
相同的解决方案,但修改后的变量名称为 "r[num][varname]"(@iod):
# Same long-then-wide reshape for column names of the form r<num><varname>
# (e.g. r1weight, r10height).
# The mutate() step rewrites "r<num><varname>" to "<num>_<varname>" so that
# separate() can split the key at the underscore.
#   - "\\1" (escaped backslash) is the regex backreference; a bare "\1" in an
#     R string is the control character U+0001.
#   - "[0-9]+" keeps multi-digit indices such as r10 intact.
df %>%
  gather(time, ind, -data) %>%
  mutate(time = sub("^r([0-9]+)", "\\1_", time)) %>%
  separate(time, into = c("time", "indName")) %>%
  spread(indName, ind)
data time height weight
1 Ind_1 1 151 56
2 Ind_1 2 154 57
3 Ind_1 3 NA 56
4 Ind_1 4 153 56
5 Ind_1 5 152 55
6 Ind_2 1 163 76
7 Ind_2 2 164 75
8 Ind_2 3 165 76
9 Ind_2 4 162 73
10 Ind_2 5 161 77
11 Ind_3 1 173 87
12 Ind_3 2 NA 88
13 Ind_3 3 NA 87
14 Ind_3 4 172 85
15 Ind_3 5 171 84
16 Ind_4 1 153 64
17 Ind_4 2 154 66
18 Ind_4 3 152 65
19 Ind_4 4 154 63
20 Ind_4 5 154 65
另一种做法是使用 data.table 的 melt,并借助 measure 参数中的 patterns()。本例中,列名带有 'weight'、'height' 这样的共同模式,我们在 measure 参数中指定这些模式,将数据转换为"长"格式,然后用 sub 提取 data 列中的数字部分来创建 'Ind'。
library(data.table)
# Melt the weight and height column groups in one call: each pattern selects
# one group, `value.name` names the two value columns, and `time` indexes the
# position of the source column within its group.
# The patterns are anchored (^r<digits><name>$) so that a variable whose name
# contains another variable's name cannot end up in the wrong group.
# "\\D+" (escaped backslash) strips the leading non-digit run of `data`
# ("Ind_1" -> "1"); a bare "\D" is not a valid R string escape.
melt(setDT(df), measure = patterns("^r[0-9]+weight$", "^r[0-9]+height$"),
     value.name = c("W", "H"), variable.name = "time")[,
  Ind := as.integer(sub("\\D+", "", data))][order(Ind)][, .(time, Ind, W, H)]
# time Ind W H
# 1: 1 1 56 151
# 2: 2 1 57 154
# 3: 3 1 56 NA
# 4: 4 1 56 153
# 5: 5 1 55 152
# 6: 1 2 76 163
# 7: 2 2 75 164
# 8: 3 2 76 165
# 9: 4 2 73 162
#10: 5 2 77 161
#11: 1 3 87 173
#12: 2 3 88 NA
#13: 3 3 87 NA
#14: 4 3 85 172
#15: 5 3 84 171
#16: 1 4 64 153
#17: 2 4 66 154
#18: 3 4 65 152
#19: 4 4 63 154
#20: 5 4 65 154