Reshape/Melt 两行变量名的数据
Reshape/Melt data with two rows of variable names
我有一个大数据集需要reshape/melt。
我遇到的问题是第一行和第二行都是变量名(即第一行是每个人的 ID,第二行列出了这个人的四个属性)。除此之外,第一列记录了收集这些值的日期。要查看我的数据集示例,请查看以下 google sheet (https://docs.google.com/spreadsheets/d/19b_4hB6aM9JXReG67i9EF_sJVDHkNFHf4iShH8yrfOc/edit?usp=sharing) 中的 Sheet1。我想重塑我的数据,使其看起来像上述 google sheet 中的 Sheet2。
有没有办法在我的 melt 命令中指定有两个 ID。第一个 id 是第一行(从第二列开始),第二个 id 是第一列(从第二行开始)。如果我可以为我想做的事情写一个伪 R 脚本,我会写这样的东西。
melt(dt, id=c("Dates from A2:A6", "Person from B1:I1")
谢谢!
下面是一个 data.table 方法,为了方便起见使用了 readr::type_convert:
# Example input: eight character columns (four per person, so the column
# names repeat). The first row holds the variable labels and the row names
# hold the "Dates" label followed by the observation dates.
df <- list(
  person_A = c("var1", "45.0413", "50.4132", "53.719", "53.719"),
  person_A = c("var2", "43.8596", "52.6316", "49.1228", "52.6316"),
  person_A = c("var3", "67.8571", "67.8571", "67.8571", "73.2143"),
  person_A = c("var4", "35.6589", "41.8605", "49.6124", "45.7364"),
  person_B = c("var1", "40.4959", "41.7355", "41.3223", "29.7521"),
  person_B = c("var2", "38.5965", "43.8596", "38.5965", "33.3333"),
  person_B = c("var3", "60.7143", "42.8571", "48.2143", "57.1429"),
  person_B = c("var4", "32.5581", "40.3101", "39.5349", "16.2791")
)
# Attach the data.frame attributes directly so the duplicated column names
# are kept exactly as written.
attr(df, "row.names") <- c(
  "Dates", "2021-05-01", "2021-05-02", "2021-05-03", "2021-05-04"
)
class(df) <- "data.frame"
library(data.table)
# Split the data.frame column-wise into one piece per person.
# The backslash in the regex must be doubled: '\d' is not a valid escape
# sequence in an R string literal and stops the script at parse time.
dfl <- split.default(df, sub("\\d+", "", names(df)))
# For each person, the first row supplies the column labels and the
# remaining rows supply the values; keep.rownames = TRUE turns the row
# names into a regular first column in both cases.
dfl <- lapply(dfl, function(x) {
  labels <- as.character(data.table(x[1, ], keep.rownames = TRUE))
  values <- readr::type_convert(data.table(x[-1, ], keep.rownames = TRUE))
  setnames(values, labels)
})
# Stack the per-person tables (the list names go into "Person"), then swap
# the first two columns so the date column comes first.
setcolorder(rbindlist(dfl, idcol = "Person"), c(2, 1))[]
#> Dates Person var1 var2 var3 var4
#> 1: 2021-05-01 person_A 45.0413 43.8596 67.8571 35.6589
#> 2: 2021-05-02 person_A 50.4132 52.6316 67.8571 41.8605
#> 3: 2021-05-03 person_A 53.7190 49.1228 67.8571 49.6124
#> 4: 2021-05-04 person_A 53.7190 52.6316 73.2143 45.7364
#> 5: 2021-05-01 person_B 40.4959 38.5965 60.7143 32.5581
#> 6: 2021-05-02 person_B 41.7355 43.8596 42.8571 40.3101
#> 7: 2021-05-03 person_B 41.3223 38.5965 48.2143 39.5349
#> 8: 2021-05-04 person_B 29.7521 33.3333 57.1429 16.2791
由 reprex 包 (v2.0.0) 于 2021-05-05 创建。
要使用 base R 的 reshape(),关键是把两级名称(变量名和人员 ID)合并成一个字符串,中间用点分隔,并用它重命名各列。
使用上面的 data.frame。
# Combine the two header levels into names of the form
# "<variable>.<person>": the variable labels sit in the first data row and
# the person IDs are the current column names. The "." separator matters
# because reshape() splits the names on it by default.
varA <- as.character(df[1, ])
varB <- names(df)
newNames <- paste(varA, varB, sep = ".")

# Remove the label row and apply the combined names in one step, then copy
# the row names (the dates) into an ordinary column appended at the end.
df <- setNames(df[-1, ], newNames)
df[["dates"]] <- rownames(df)

# Every column except the trailing `dates` column varies by person.
lastVar <- ncol(df) - 1
df <- reshape(df, varying = seq_len(lastVar), direction = "long")

# Tidy the result: reset the row names, call the `time` column `person`,
# and drop the last column of the reshaped frame.
rownames(df) <- NULL
names(df)[names(df) == "time"] <- "person"
df <- df[, -ncol(df)]
df
> dates person var1 var2 var3 var4
> 1 2021-05-01 person_A 45.0413 43.8596 67.8571 35.6589
> 2 2021-05-02 person_A 50.4132 52.6316 67.8571 41.8605
> 3 2021-05-03 person_A 53.719 49.1228 67.8571 49.6124
> 4 2021-05-04 person_A 53.719 52.6316 73.2143 45.7364
> 5 2021-05-01 person_B 40.4959 38.5965 60.7143 32.5581
> 6 2021-05-02 person_B 41.7355 43.8596 42.8571 40.3101
> 7 2021-05-03 person_B 41.3223 38.5965 48.2143 39.5349
> 8 2021-05-04 person_B 29.7521 33.3333 57.1429 16.2791
以下方法可以完成您的工作:
library(tidyverse)
# Read only the first two rows of the file (the two header levels) as data.
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
name <- readr::read_csv("csvs1.csv", col_names = FALSE, n_max = 2)
name
# A tibble: 2 x 9
X1 X2 X3 X4 X5 X6 X7 X8 X9
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 NA person_A person_A person_A person_A person_B person_B person_B person_B
2 Dates var1 var2 var3 var4 var1 var2 var3 var4
# Join the two header rows column by column into "<row 1>-<row 2>" labels.
# The header tibble read earlier is stored in `name`; `names` is the base R
# function and cannot be indexed with [1, ].
nm <- paste(name[1, ], name[2, ], sep = "-")
nm
[1] "NA-Dates" "person_A-var1" "person_A-var2" "person_A-var3" "person_A-var4" "person_B-var1" "person_B-var2"
[8] "person_B-var3" "person_B-var4"
# Read the data rows, skipping the two header rows, and label the columns
# with the combined names built from those rows.
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
data <- readr::read_csv("csvs1.csv", col_names = FALSE, skip = 2)
names(data) <- nm
data
# A tibble: 4 x 9
`NA-Dates` `person_A-var1` `person_A-var2` `person_A-var3` `person_A-var4` `person_B-var1` `person_B-var2`
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021-05-01 45.0 43.9 67.9 35.7 40.5 38.6
2 2021-05-02 50.4 52.6 67.9 41.9 41.7 43.9
3 2021-05-03 53.7 49.1 67.9 49.6 41.3 38.6
4 2021-05-04 53.7 52.6 73.2 45.7 29.8 33.3
# ... with 2 more variables: person_B-var3 <dbl>, person_B-var4 <dbl>
# Pivot every column except the date column. Each name is split on "-":
# the first part (with the "person_" prefix stripped) goes to `person`, and
# the second part names the output value columns via ".value".
# The console continuation prompt "+" was removed from the second line; run
# as code it would be parsed as a unary plus and the pipe would fail.
data %>%
  pivot_longer(
    !`NA-Dates`,
    names_to = c("person", ".value"),
    names_sep = "-",
    names_prefix = "person_"
  )
# A tibble: 8 x 6
`NA-Dates` person var1 var2 var3 var4
<date> <chr> <dbl> <dbl> <dbl> <dbl>
1 2021-05-01 A 45.0 43.9 67.9 35.7
2 2021-05-01 B 40.5 38.6 60.7 32.6
3 2021-05-02 A 50.4 52.6 67.9 41.9
4 2021-05-02 B 41.7 43.9 42.9 40.3
5 2021-05-03 A 53.7 49.1 67.9 49.6
6 2021-05-03 B 41.3 38.6 48.2 39.5
7 2021-05-04 A 53.7 52.6 73.2 45.7
8 2021-05-04 B 29.8 33.3 57.1 16.3
我有一个大数据集需要 reshape/melt。我遇到的问题是第一行和第二行都是变量名(即第一行是每个人的 ID,第二行列出了这个人的四个属性)。除此之外,第一列记录了收集这些值的日期。要查看我的数据集示例,请查看以下 google sheet (https://docs.google.com/spreadsheets/d/19b_4hB6aM9JXReG67i9EF_sJVDHkNFHf4iShH8yrfOc/edit?usp=sharing) 中的 Sheet1。我想重塑我的数据,使其看起来像上述 google sheet 中的 Sheet2。
有没有办法在我的 melt 命令中指定两个 ID?第一个 id 是第一行(从第二列开始),第二个 id 是第一列(从第二行开始)。如果我可以为我想做的事情写一个伪 R 脚本,我会写这样的东西。
melt(dt, id=c("Dates from A2:A6", "Person from B1:I1")
谢谢!
下面是一个 data.table 方法,为了方便起见使用了 readr::type_convert:
# Example input: eight character columns (four per person, so the column
# names repeat). The first row holds the variable labels and the row names
# hold the "Dates" label followed by the observation dates.
df <- list(
  person_A = c("var1", "45.0413", "50.4132", "53.719", "53.719"),
  person_A = c("var2", "43.8596", "52.6316", "49.1228", "52.6316"),
  person_A = c("var3", "67.8571", "67.8571", "67.8571", "73.2143"),
  person_A = c("var4", "35.6589", "41.8605", "49.6124", "45.7364"),
  person_B = c("var1", "40.4959", "41.7355", "41.3223", "29.7521"),
  person_B = c("var2", "38.5965", "43.8596", "38.5965", "33.3333"),
  person_B = c("var3", "60.7143", "42.8571", "48.2143", "57.1429"),
  person_B = c("var4", "32.5581", "40.3101", "39.5349", "16.2791")
)
# Attach the data.frame attributes directly so the duplicated column names
# are kept exactly as written.
attr(df, "row.names") <- c(
  "Dates", "2021-05-01", "2021-05-02", "2021-05-03", "2021-05-04"
)
class(df) <- "data.frame"
library(data.table)
# Split the data.frame column-wise into one piece per person.
# The backslash in the regex must be doubled: '\d' is not a valid escape
# sequence in an R string literal and stops the script at parse time.
dfl <- split.default(df, sub("\\d+", "", names(df)))
# For each person, the first row supplies the column labels and the
# remaining rows supply the values; keep.rownames = TRUE turns the row
# names into a regular first column in both cases.
dfl <- lapply(dfl, function(x) {
  labels <- as.character(data.table(x[1, ], keep.rownames = TRUE))
  values <- readr::type_convert(data.table(x[-1, ], keep.rownames = TRUE))
  setnames(values, labels)
})
# Stack the per-person tables (the list names go into "Person"), then swap
# the first two columns so the date column comes first.
setcolorder(rbindlist(dfl, idcol = "Person"), c(2, 1))[]
#> Dates Person var1 var2 var3 var4
#> 1: 2021-05-01 person_A 45.0413 43.8596 67.8571 35.6589
#> 2: 2021-05-02 person_A 50.4132 52.6316 67.8571 41.8605
#> 3: 2021-05-03 person_A 53.7190 49.1228 67.8571 49.6124
#> 4: 2021-05-04 person_A 53.7190 52.6316 73.2143 45.7364
#> 5: 2021-05-01 person_B 40.4959 38.5965 60.7143 32.5581
#> 6: 2021-05-02 person_B 41.7355 43.8596 42.8571 40.3101
#> 7: 2021-05-03 person_B 41.3223 38.5965 48.2143 39.5349
#> 8: 2021-05-04 person_B 29.7521 33.3333 57.1429 16.2791
由 reprex 包 (v2.0.0) 于 2021-05-05 创建。
要使用 base R 的 reshape(),关键是把两级名称(变量名和人员 ID)合并成一个字符串,中间用点分隔,并用它重命名各列。
使用上面的 data.frame。
# Combine the two header levels into names of the form
# "<variable>.<person>": the variable labels sit in the first data row and
# the person IDs are the current column names. The "." separator matters
# because reshape() splits the names on it by default.
varA <- as.character(df[1, ])
varB <- names(df)
newNames <- paste(varA, varB, sep = ".")

# Remove the label row and apply the combined names in one step, then copy
# the row names (the dates) into an ordinary column appended at the end.
df <- setNames(df[-1, ], newNames)
df[["dates"]] <- rownames(df)

# Every column except the trailing `dates` column varies by person.
lastVar <- ncol(df) - 1
df <- reshape(df, varying = seq_len(lastVar), direction = "long")

# Tidy the result: reset the row names, call the `time` column `person`,
# and drop the last column of the reshaped frame.
rownames(df) <- NULL
names(df)[names(df) == "time"] <- "person"
df <- df[, -ncol(df)]
df
> dates person var1 var2 var3 var4
> 1 2021-05-01 person_A 45.0413 43.8596 67.8571 35.6589
> 2 2021-05-02 person_A 50.4132 52.6316 67.8571 41.8605
> 3 2021-05-03 person_A 53.719 49.1228 67.8571 49.6124
> 4 2021-05-04 person_A 53.719 52.6316 73.2143 45.7364
> 5 2021-05-01 person_B 40.4959 38.5965 60.7143 32.5581
> 6 2021-05-02 person_B 41.7355 43.8596 42.8571 40.3101
> 7 2021-05-03 person_B 41.3223 38.5965 48.2143 39.5349
> 8 2021-05-04 person_B 29.7521 33.3333 57.1429 16.2791
以下方法可以完成您的工作:
library(tidyverse)
# Read only the first two rows of the file (the two header levels) as data.
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
name <- readr::read_csv("csvs1.csv", col_names = FALSE, n_max = 2)
name
# A tibble: 2 x 9
X1 X2 X3 X4 X5 X6 X7 X8 X9
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 NA person_A person_A person_A person_A person_B person_B person_B person_B
2 Dates var1 var2 var3 var4 var1 var2 var3 var4
# Join the two header rows column by column into "<row 1>-<row 2>" labels.
# The header tibble read earlier is stored in `name`; `names` is the base R
# function and cannot be indexed with [1, ].
nm <- paste(name[1, ], name[2, ], sep = "-")
nm
[1] "NA-Dates" "person_A-var1" "person_A-var2" "person_A-var3" "person_A-var4" "person_B-var1" "person_B-var2"
[8] "person_B-var3" "person_B-var4"
# Read the data rows, skipping the two header rows, and label the columns
# with the combined names built from those rows.
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
data <- readr::read_csv("csvs1.csv", col_names = FALSE, skip = 2)
names(data) <- nm
data
# A tibble: 4 x 9
`NA-Dates` `person_A-var1` `person_A-var2` `person_A-var3` `person_A-var4` `person_B-var1` `person_B-var2`
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021-05-01 45.0 43.9 67.9 35.7 40.5 38.6
2 2021-05-02 50.4 52.6 67.9 41.9 41.7 43.9
3 2021-05-03 53.7 49.1 67.9 49.6 41.3 38.6
4 2021-05-04 53.7 52.6 73.2 45.7 29.8 33.3
# ... with 2 more variables: person_B-var3 <dbl>, person_B-var4 <dbl>
# Pivot every column except the date column. Each name is split on "-":
# the first part (with the "person_" prefix stripped) goes to `person`, and
# the second part names the output value columns via ".value".
# The console continuation prompt "+" was removed from the second line; run
# as code it would be parsed as a unary plus and the pipe would fail.
data %>%
  pivot_longer(
    !`NA-Dates`,
    names_to = c("person", ".value"),
    names_sep = "-",
    names_prefix = "person_"
  )
# A tibble: 8 x 6
`NA-Dates` person var1 var2 var3 var4
<date> <chr> <dbl> <dbl> <dbl> <dbl>
1 2021-05-01 A 45.0 43.9 67.9 35.7
2 2021-05-01 B 40.5 38.6 60.7 32.6
3 2021-05-02 A 50.4 52.6 67.9 41.9
4 2021-05-02 B 41.7 43.9 42.9 40.3
5 2021-05-03 A 53.7 49.1 67.9 49.6
6 2021-05-03 B 41.3 38.6 48.2 39.5
7 2021-05-04 A 53.7 52.6 73.2 45.7
8 2021-05-04 B 29.8 33.3 57.1 16.3