在 R 的数据框中按 ID 折叠数据
Collapse data by ID in data frame in R
我正在尝试按 ID 和性别折叠数据。
我有数据框:
# Example data: 13 rows covering 4 IDs. Each ID's Test1 and Test2 score
# appears on only one of that ID's rows.
ID <- rep(c(1, 2, 3, 4), times = c(4, 3, 3, 3))
Gender <- rep(c("M", "F", "M"), times = c(4, 6, 3))
# Note: missing scores are stored as the literal string "NA", not as a real NA.
Test1 <- c(
  "70", "NA", "NA", "NA",
  "NA", "85", "NA",
  "NA", "90", "NA",
  "NA", "NA", "90"
)
Test2 <- c(
  "NA", "60", "NA", "NA",
  "NA", "NA", "82",
  "NA", "NA", "87",
  "NA", "88", "NA"
)
df <- data.frame(ID, Gender, Test1, Test2)
ID Gender Test1 Test2
1 1 M 70 NA
2 1 M NA 60
3 1 M NA NA
4 1 M NA NA
5 2 F NA NA
6 2 F 85 NA
7 2 F NA 82
8 3 F NA NA
9 3 F 90 NA
10 3 F NA 87
11 4 M NA NA
12 4 M NA 88
13 4 M 90 NA
我希望获得有关如何跨 ID 和性别折叠数据的帮助,以便每个 ID 可以有 1 行。看起来像这样:
id gender test1 test2
1 1 M 70 60
2 2 F 85 82
3 3 F 90 87
4 4 M 90 88
如有任何帮助,我们将不胜感激!
谢谢!
library(dplyr)

# Convert the score columns to integer. Going through as.character() first
# makes this correct whether the columns are factors or character vectors;
# the literal "NA" strings become real NA values.
# This step is not necessary if the columns are created as integers.
df$Test1 <- as.integer(as.character(df$Test1))
df$Test2 <- as.integer(as.character(df$Test2))

# Largest non-missing value of x, or a typed NA when every value is missing.
# Plain max(x, na.rm = TRUE) would return -Inf (with a warning) in that case.
max_or_na <- function(x) {
  present <- x[!is.na(x)]
  if (length(present) == 0L) {
    # Indexing with NA yields a single NA of the same type as x.
    return(present[NA_integer_])
  }
  max(present)
}

# Choose the non-NA value for each (ID, Gender) combination.
# max is interchangeable with other summaries; only the NA treatment matters.
df %>%
  group_by(ID, Gender) %>%
  summarise(
    Test1 = max_or_na(Test1),
    Test2 = max_or_na(Test2)
  ) %>%
  ungroup()
# # A tibble: 4 x 4
# ID Gender Test1 Test2
# <dbl> <fct> <int> <int>
# 1 1 M 70 60
# 2 2 F 85 82
# 3 3 F 90 87
# 4 4 M 90 88
# (Gender prints as <chr> when df holds character rather than factor columns.)
进行了一些类型调整:
library(dplyr)

# Create the example data with suitable column types: numeric scores and
# real NA values instead of "NA" strings. tibble() replaces the deprecated
# data_frame().
df <- tibble(
  ID = c(rep(1, 4), rep(2:4, each = 3)),
  Gender = c(rep("M", 4), rep("F", 6), rep("M", 3)),
  Test1 = c(70, rep(NA, 4), 85, rep(NA, 2), 90, rep(NA, 3), 90),
  Test2 = c(NA, 60, rep(NA, 4), 82, rep(NA, 2), 87, NA, 88, NA)
)

# Largest non-missing value of x, or a typed NA when every value is missing.
# Plain max(x, na.rm = TRUE) would return -Inf (with a warning) in that case.
max_or_na <- function(x) {
  present <- x[!is.na(x)]
  if (length(present) == 0L) {
    # Indexing with NA yields a single NA of the same type as x.
    return(present[NA_integer_])
  }
  max(present)
}

# Collapse to one row per (ID, Gender), keeping the non-missing score.
df %>%
  group_by(ID, Gender) %>%
  summarise(
    Test1 = max_or_na(Test1),
    Test2 = max_or_na(Test2)
  ) %>%
  ungroup()
我对您的数据做了一处修改:在 data.frame() 调用中添加了参数 stringsAsFactors = FALSE。请告诉我此解决方案是否适合您:
# Rebuild the example data, keeping the text columns as character vectors
# (stringsAsFactors = FALSE) so the "NA" strings can be compared directly.
df <- data.frame(
  ID = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  Gender = c("M", "M", "M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "M"),
  Test1 = c("70", "NA", "NA", "NA", "NA", "85", "NA", "NA", "90", "NA", "NA", "NA", "90"),
  Test2 = c("NA", "60", "NA", "NA", "NA", "NA", "82", "NA", "NA", "87", "NA", "88", "NA"),
  stringsAsFactors = FALSE
)

library(dplyr)
library(tidyr)

# Reshape to long form (one row per ID/Gender/test), drop the placeholder
# "NA" strings, then reshape back to one column per test. pivot_longer() and
# pivot_wider() replace the superseded gather() and spread().
# The scores remain character strings, as in the input.
new_df <- df %>%
  pivot_longer(
    cols = c(Test1, Test2),
    names_to = "test_num",
    values_to = "score"
  ) %>%
  filter(score != "NA") %>%
  pivot_wider(names_from = test_num, values_from = score)
这是一个将 Test1 和 Test2 的值粘贴在一起的解决方案。即使同一个 ID 对应多种性别或多个 Test1、Test2 的值,它也能正常工作,但结果中的值会保留为因子(factor),而不是数值。
# Turn both score columns into integers. as.character() comes first so that
# a factor column is converted by its labels rather than its internal codes.
score_cols <- c("Test1", "Test2")
df[score_cols] <- lapply(df[score_cols], function(column) {
  as.integer(as.character(column))
})
rm(score_cols)

# Build one single-row data frame per ID: the distinct genders and the
# distinct non-missing scores are each pasted into one string. The pieces
# are then stacked into the final result.
xy <- do.call(
  rbind,
  lapply(split(df, f = df$ID), function(rows) {
    data.frame(
      ID = unique(rows$ID),
      Gender = paste(unique(rows$Gender), collapse = ", "),
      Test1 = paste(unique(rows$Test1[!is.na(rows$Test1)]), collapse = ","),
      Test2 = paste(unique(rows$Test2[!is.na(rows$Test2)]), collapse = ",")
    )
  })
)
ID Gender Test1 Test2
1 1 M 70 60
2 2 F 85 82
3 3 F 90 87
4 4 M 90 88
我正在尝试按 ID 和性别折叠数据。 我有数据框:
# Example data: 13 rows covering 4 IDs. Each ID's Test1 and Test2 score
# appears on only one of that ID's rows.
ID <- rep(c(1, 2, 3, 4), times = c(4, 3, 3, 3))
Gender <- rep(c("M", "F", "M"), times = c(4, 6, 3))
# Note: missing scores are stored as the literal string "NA", not as a real NA.
Test1 <- c(
  "70", "NA", "NA", "NA",
  "NA", "85", "NA",
  "NA", "90", "NA",
  "NA", "NA", "90"
)
Test2 <- c(
  "NA", "60", "NA", "NA",
  "NA", "NA", "82",
  "NA", "NA", "87",
  "NA", "88", "NA"
)
df <- data.frame(ID, Gender, Test1, Test2)
ID Gender Test1 Test2
1 1 M 70 NA
2 1 M NA 60
3 1 M NA NA
4 1 M NA NA
5 2 F NA NA
6 2 F 85 NA
7 2 F NA 82
8 3 F NA NA
9 3 F 90 NA
10 3 F NA 87
11 4 M NA NA
12 4 M NA 88
13 4 M 90 NA
我希望获得有关如何跨 ID 和性别折叠数据的帮助,以便每个 ID 可以有 1 行。看起来像这样:
id gender test1 test2
1 1 M 70 60
2 2 F 85 82
3 3 F 90 87
4 4 M 90 88
如有任何帮助,我们将不胜感激! 谢谢!
library(dplyr)

# Convert the score columns to integer. Going through as.character() first
# makes this correct whether the columns are factors or character vectors;
# the literal "NA" strings become real NA values.
# This step is not necessary if the columns are created as integers.
df$Test1 <- as.integer(as.character(df$Test1))
df$Test2 <- as.integer(as.character(df$Test2))

# Largest non-missing value of x, or a typed NA when every value is missing.
# Plain max(x, na.rm = TRUE) would return -Inf (with a warning) in that case.
max_or_na <- function(x) {
  present <- x[!is.na(x)]
  if (length(present) == 0L) {
    # Indexing with NA yields a single NA of the same type as x.
    return(present[NA_integer_])
  }
  max(present)
}

# Choose the non-NA value for each (ID, Gender) combination.
# max is interchangeable with other summaries; only the NA treatment matters.
df %>%
  group_by(ID, Gender) %>%
  summarise(
    Test1 = max_or_na(Test1),
    Test2 = max_or_na(Test2)
  ) %>%
  ungroup()
# # A tibble: 4 x 4
# ID Gender Test1 Test2
# <dbl> <fct> <int> <int>
# 1 1 M 70 60
# 2 2 F 85 82
# 3 3 F 90 87
# 4 4 M 90 88
# (Gender prints as <chr> when df holds character rather than factor columns.)
进行了一些类型调整:
library(dplyr)

# Create the example data with suitable column types: numeric scores and
# real NA values instead of "NA" strings. tibble() replaces the deprecated
# data_frame().
df <- tibble(
  ID = c(rep(1, 4), rep(2:4, each = 3)),
  Gender = c(rep("M", 4), rep("F", 6), rep("M", 3)),
  Test1 = c(70, rep(NA, 4), 85, rep(NA, 2), 90, rep(NA, 3), 90),
  Test2 = c(NA, 60, rep(NA, 4), 82, rep(NA, 2), 87, NA, 88, NA)
)

# Largest non-missing value of x, or a typed NA when every value is missing.
# Plain max(x, na.rm = TRUE) would return -Inf (with a warning) in that case.
max_or_na <- function(x) {
  present <- x[!is.na(x)]
  if (length(present) == 0L) {
    # Indexing with NA yields a single NA of the same type as x.
    return(present[NA_integer_])
  }
  max(present)
}

# Collapse to one row per (ID, Gender), keeping the non-missing score.
df %>%
  group_by(ID, Gender) %>%
  summarise(
    Test1 = max_or_na(Test1),
    Test2 = max_or_na(Test2)
  ) %>%
  ungroup()
我对您的数据做了一处修改:在 data.frame() 调用中添加了参数 stringsAsFactors = FALSE。请告诉我此解决方案是否适合您:
# Rebuild the example data, keeping the text columns as character vectors
# (stringsAsFactors = FALSE) so the "NA" strings can be compared directly.
df <- data.frame(
  ID = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  Gender = c("M", "M", "M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "M"),
  Test1 = c("70", "NA", "NA", "NA", "NA", "85", "NA", "NA", "90", "NA", "NA", "NA", "90"),
  Test2 = c("NA", "60", "NA", "NA", "NA", "NA", "82", "NA", "NA", "87", "NA", "88", "NA"),
  stringsAsFactors = FALSE
)

library(dplyr)
library(tidyr)

# Reshape to long form (one row per ID/Gender/test), drop the placeholder
# "NA" strings, then reshape back to one column per test. pivot_longer() and
# pivot_wider() replace the superseded gather() and spread().
# The scores remain character strings, as in the input.
new_df <- df %>%
  pivot_longer(
    cols = c(Test1, Test2),
    names_to = "test_num",
    values_to = "score"
  ) %>%
  filter(score != "NA") %>%
  pivot_wider(names_from = test_num, values_from = score)
这是一个将 Test1 和 Test2 的值粘贴在一起的解决方案。即使同一个 ID 对应多种性别或多个 Test1、Test2 的值,它也能正常工作,但结果中的值会保留为因子(factor),而不是数值。
# Turn both score columns into integers. as.character() comes first so that
# a factor column is converted by its labels rather than its internal codes.
score_cols <- c("Test1", "Test2")
df[score_cols] <- lapply(df[score_cols], function(column) {
  as.integer(as.character(column))
})
rm(score_cols)

# Build one single-row data frame per ID: the distinct genders and the
# distinct non-missing scores are each pasted into one string. The pieces
# are then stacked into the final result.
xy <- do.call(
  rbind,
  lapply(split(df, f = df$ID), function(rows) {
    data.frame(
      ID = unique(rows$ID),
      Gender = paste(unique(rows$Gender), collapse = ", "),
      Test1 = paste(unique(rows$Test1[!is.na(rows$Test1)]), collapse = ","),
      Test2 = paste(unique(rows$Test2[!is.na(rows$Test2)]), collapse = ",")
    )
  })
)
ID Gender Test1 Test2
1 1 M 70 60
2 2 F 85 82
3 3 F 90 87
4 4 M 90 88