Pivot by group for unequal data size
I have the following DF:
DF = structure(list(ID = c(21785L, 21785L, 21785L), V1 = c(0.828273303,
6.404590021, 0.775568448), V2 = c(2L, 3L, 2L), V3 = c(NA, 1.122899914,
0.850113234), V4 = c(NA, 4L, 3L), V5 = c(NA, 0.866757168, 0.868943246
), V6 = c(NA, 5L, 4L), V7 = c(NA, 0.563804788, 0.728656328),
V8 = c(NA, 6L, 5L), V9 = c(NA, 0.888109208, 0.823803733),
V10 = c(NA, 7L, 6L), V11 = c(NA, 0.578834113, 0.863467391
), V12 = c(NA, 1L, 7L), V13 = c(NA, NA, 0.939920869)), class = "data.frame", row.names = c(5L,
163L, 167L))
Output:
Row ID V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
5 21785 0.8282733 2 NA NA NA NA NA NA NA NA NA NA NA
163 21785 6.4045900 3 1.1228999 4 0.8667572 5 0.5638048 6 0.8881092 7 0.5788341 1 NA
167 21785 0.7755684 2 0.8501132 3 0.8689432 4 0.7286563 5 0.8238037 6 0.8634674 7 0.9399209
The data can be split into 3 parts:
- an ID for each participant
- the odd columns, which hold the standard heart rate (HR)
- the even columns, which hold the day of the week (1 = Sunday)
I have 100+ unique participants and about 3000 rows, with unequal amounts of data per day, hence the NAs.
I want to reshape the data into one column per part
- so: col1 = ID, col2 = HR, col3 = Weekday (see the small illustration below)
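To make the target concrete, the first few rows of the desired long format can be written out by hand. This is only an illustration of the intended mapping, not a solution:
# Hand-built illustration: wide row 5 contributes a single (HR, Weekday) pair,
# and wide row 163 contributes one pair per non-NA (odd, even) column pair.
data.frame(
  ID      = c(21785L, 21785L, 21785L),
  HR      = c(0.828273303, 6.404590021, 1.122899914),
  Weekday = c(2L, 3L, 4L)
)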
I have tried several approaches based on similar questions, for example:
library(dplyr)     # %>%, mutate, group_by, select
library(tidyr)     # spread

# melt the data frame to put all the metrics in a single column
DF2 = reshape2::melt(DF, id.vars = c("ID"))
# split the data by ID
DF3 = split(DF2, DF2$ID)
# allocate empty DF with 3 columns for future appending
DF_Organized = data.frame()[1, 3]
# make the data into 3 new columns, 1 for ID, HR, weekday
for (m in 1:length(DF3)) {
  DF_tmp = DF3[m] %>%
    data.frame() %>%
    na.omit() %>%                                    # convert to DF, remove NAs
    setNames(., c("ID", "colx", "Value")) %>%        # set names for clarity
    mutate(ind = rep(c(1, 2), length.out = n())) %>% # assign 1 to HR and 2 to day values in each row
    group_by(ind) %>%                                # group by value type
    mutate(id = row_number()) %>%                    # position of each value within its group
    spread(ind, Value) %>%                           # organize data by new ID
    select(-id)                                      # clean
  # reorganize the NAs to the bottom
  DF_tmp2 = setNames(do.call(function(...) rowr::cbind.fill(..., fill = NA),
                             lapply(DF_tmp, na.omit)), colnames(DF_tmp)) %>%
    na.omit() %>%
    select(-colx) %>%
    setNames(., c("ID", "HR", "Weekday"))            # set names for clarity
}
I get close, but it is not accurate:
Actual output:
> DF_tmp2
ID HR Weekday
1 21785 0.8282733 6.4045900
2 21785 0.7755684 2.0000000
3 21785 3.0000000 2.0000000
4 21785 1.1228999 0.8501132
.
.
.
There are misalignments and incorrect pairings. Any help is appreciated.
Expected output:
> DF_tmp2
ID HR Weekday
1 21785 0.8282733 2.0000000
2 21785 6.4045900 3.0000000
3 21785 1.1228999 4.0000000
4 21785 0.8667572 5.0000000
5 21785 0.5638048 6.0000000
.
.
.
for (m in 1:length(DF3)) {
  DF_tmp = DF3[m] %>%
    data.frame() %>%
    na.omit() %>%
    setNames(., c("ID", "colx", "Value")) %>%                   # set names for clarity
    mutate(ind = substring(colx, 2) %>% as.numeric() %% 2) %>%  # odd columns (HR) get 1, even columns (day) get 0
    group_by(ind) %>%                                           # group by value type
    mutate(id = row_number()) %>%                               # position of each value within its group
    ungroup() %>%
    select(-colx) %>%
    spread(ind, Value)
  # reorganize the NAs to the bottom
  DF_tmp2 = setNames(do.call(function(...) rowr::cbind.fill(..., fill = NA),
                             lapply(DF_tmp, na.omit)), colnames(DF_tmp)) %>%
    na.omit() %>%
    select(-id) %>%
    setNames(., c("ID", "Weekday", "HR")) %>%                   # set names for clarity
    arrange(Weekday)
}
library(dplyr)
library(tidyr)
library(stringr)   # str_ends

DF %>%
  gather(col, val, -ID) %>%
  mutate(col = if_else(str_ends(col, "0|2|4|6|8"), "Weekday", "HR")) %>%
  group_by(col) %>%
  mutate(instance = row_number()) %>%
  spread(col, val) %>%
  filter(!is.na(HR))
# A tibble: 14 x 4
# ID instance HR Weekday
# <int> <int> <dbl> <dbl>
# 1 21785 1 0.828 2
# 2 21785 2 6.40 3
# 3 21785 3 0.776 2
# 4 21785 5 1.12 4
# 5 21785 6 0.850 3
# 6 21785 8 0.867 5
# 7 21785 9 0.869 4
# 8 21785 11 0.564 6
# 9 21785 12 0.729 5
#10 21785 14 0.888 7
#11 21785 15 0.824 6
#12 21785 17 0.579 1
#13 21785 18 0.863 7
#14 21785 21 0.940 NA
1) pivot_longer Define the column names v.names and the number of pairs k. Then add V14, since V13 appears to be unpaired, and change the names so that they identify the columns, i.e. ID, HR 1, Weekday 1, HR 2, Weekday 2, and so on. With those names we can use pivot_longer.
library(dplyr)
library(purrr)
library(tidyr)
v.names <- c("HR", "Weekday")
k <- ncol(DF) %/% 2L # 7L = no. of (HR, Weekday) pairs
DF %>%
  mutate(V14 = V12 %% 7L + 1L, n = 1:n()) %>%
  set_names("ID", cross2(v.names, 1:k) %>% map(lift(paste)), "n") %>%
  pivot_longer(-c(ID, n), names_to = c(".value", "Num"), names_sep = " ") %>%
  drop_na %>%
  arrange(n, Num) %>%
  select(-n, -Num)
giving:
# A tibble: 14 x 3
ID HR Weekday
<int> <dbl> <dbl>
1 21785 0.828 2
2 21785 6.40 3
3 21785 1.12 4
4 21785 0.867 5
5 21785 0.564 6
6 21785 0.888 7
7 21785 0.579 1
8 21785 0.776 2
9 21785 0.850 3
10 21785 0.869 4
11 21785 0.729 5
12 21785 0.824 6
13 21785 0.863 7
14 21785 0.940 1
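The set_names() step above relies on cross2() varying the metric fastest, so that the generated names line up with V1 through V14 in order. A quick check of those names (not part of the answer; the expected result is only sketched in a comment):
library(purrr)
v.names <- c("HR", "Weekday")   # as defined above
k <- 7L                         # number of (HR, Weekday) pairs
# The metric varies fastest, so the first few names should be
# "HR 1" "Weekday 1" "HR 2" "Weekday 2"
head(map_chr(cross2(v.names, 1:k), lift(paste)), 4)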
2) Base R Alternately, we can do much the same thing in base R using reshape. v.names and k are taken from above. Note that reshape automatically adds an id column giving the row number in the original data frame, so we do not have to add it ourselves as we did in (1).
DF2 <- transform(DF, V14 = V12 %% 7L + 1L)
names(DF2)[-1] <- outer(v.names, 1:k, paste)
long <- na.omit(reshape(DF2, dir = "long",
  varying = lapply(v.names, grep, names(DF2)), v.names = v.names))
long[order(long$id, long$time), c("ID", "HR", "Weekday")]
3) data.table Using DF2 from (2):
library(data.table)
DT2 <- data.table(DF2)[, row := .I]
DT2 <- na.omit(melt(DT2, id.vars = c("ID", "row"),
  measure.vars = sapply(v.names, grep, names(DT2), simplify = FALSE)))
setkey(DT2, row, Weekday)
DT2[, c("ID", "HR", "Weekday")]