R:根据 NA 位置合并来自相同 data.frame 的列
R: merge columns from same data.frame based on NA positions
我有这样一个数据框:
df <- data.frame(theme1=c("hello",NA,NA,NA), theme2=c(NA,"world",NA,NA), theme3=c(NA,NA,"good_morning",NA), theme4=c(NA,NA,NA,"good_evening"))
theme1 theme2 theme3 theme4
1 hello NA NA NA
2 NA world NA NA
3 NA NA good_morning NA
4 NA NA NA good_evening
现在我想获得保留行顺序的一列:
**Theme_merged**
hello
world
good_morning
good_evening
我的尝试:
merge_themes <- data.frame(cbind(mycol = na.omit(unlist(data2_tst[18:23]))), stringsAsFactors = F)
上面的代码可以运行,但不保留行顺序,所以当我想把得到的向量放回原始数据框时,它与各行不再对应。
真实数据:
dput(head(data2_tst[18:23], n = 50))
structure(list(Theme1 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "%Bedrukken%", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "%Bedrukken%", NA, NA, NA, NA, NA, NA, NA, NA), Theme2 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
"%Nieuwste|Nieuwe|201[6:7]%", "%Nieuwste|Nieuwe|201[6:7]%", NA,
NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%", "%Nieuwste|Nieuwe|201[6:7]%",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
"%Nieuwste|Nieuwe|201[6:7]%"), Theme3 = c("%Nodig%", NA, "%Nodig%",
"%Nodig%", "%Nodig%", NA, NA, "%Nodig%", NA, "%Nodig%", NA, NA,
NA, NA, "%Nodig%", "%Nodig%", "%Nodig%", NA, NA, NA, NA, NA,
NA, "%Nodig%", "%Nodig%", NA, NA, "%Nodig%", NA, "%Nodig%", "%Nodig%",
"%Nodig%", NA, "%Nodig%", "%Nodig%", "%Nodig%", NA, NA, NA, "%Nodig%",
"%Nodig%", NA, "%Nodig%", NA, "%Nodig%", "%Nodig%", NA, "%Nodig%",
NA, NA), Theme4 = c(NA, "%Kopen%", NA, NA, NA, "%Kopen%", "%Kopen%",
NA, "%Kopen%", NA, NA, NA, NA, NA, NA, NA, NA, "%Kopen%", "%Kopen%",
NA, NA, "%Kopen%", "%Kopen%", NA, NA, "%Kopen%", "%Kopen%", NA,
"%Kopen%", NA, NA, NA, NA, NA, NA, NA, "%Kopen%", "%Kopen%",
"%Kopen%", NA, NA, NA, NA, "%Kopen%", NA, NA, "%Kopen%", NA,
NA, NA), Theme5 = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), Theme6 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), .Names = c("Theme1",
"Theme2", "Theme3", "Theme4", "Theme5", "Theme6"), row.names = 3:52, class = "data.frame")
这是一个 tidyverse 解决方案(使用 dplyr 和 tidyr,或者直接加载 tidyverse):
library(tidyverse)
> df <- df %>%
gather("theme", "theme_merged", 1:4) %>%
filter(!is.na(theme_merged)) %>%
select(theme_merged)
> df
theme_merged
1 hello
2 world
3 good_morning
4 good_evening
在 SQL 中,这对应于 COALESCE 函数。在 R 中可以按行取第一个非 NA 值:
apply(df, 1, function(r) c(na.omit(r), NA)[1])
# [1] "hello" "world" "good_morning" "good_evening"
df <- data.frame(
theme1=c("hello",NA,NA,NA),
theme2=c(NA,"world",NA,NA),
theme3=c(NA,NA,"good_morning",NA),
theme4=c(NA,NA,NA,"good_evening"),
stringsAsFactors = FALSE
)
na.omit(unlist(df2, use.names = FALSE)) 在您的示例数据上可以正常工作,但如果某一行只包含 NA 值,它就会失败:
df2 <- data.frame(
theme1=c("hello",NA,NA,NA,NA),
theme2=c(NA,"world",NA,NA,NA),
theme3=c(NA,NA,"good_morning",NA,NA),
theme4=c(NA,NA,NA,"good_evening",NA),
theme5=c(NA_character_,NA_character_,NA_character_,
NA_character_,NA_character_),
stringsAsFactors = FALSE
)
df2$X <- na.omit(unlist(df2, use.names = FALSE))
# Error in `$<-.data.frame`(`*tmp*`, "X", value = c("hello", "world", "good_morning", :
# replacement has 4 rows, data has 5
df2$X <- apply(df2, 1, function(r) c(na.omit(r), NA)[1])
# theme1 theme2 theme3 theme4 theme5 X
# 1 hello <NA> <NA> <NA> <NA> hello
# 2 <NA> world <NA> <NA> <NA> world
# 3 <NA> <NA> good_morning <NA> <NA> good_morning
# 4 <NA> <NA> <NA> good_evening <NA> good_evening
# 5 <NA> <NA> <NA> <NA> <NA> <NA>
另一个选项可以是 df2$X <- df2[cbind(1:nrow(df2), max.col(!is.na(df2)))]
dplyr 0.5.0 版本引入了 coalesce() 函数:
This version of dplyr gains a number of vector functions inspired by SQL. Two functions make it a little easier to eliminate or generate missing values:
Given a set of vectors, coalesce() finds the first non-missing value in each position.
要将此应用于示例数据框,您可以使用:
df <- mutate_all(df, .funs = as.character)
df$merged <- with(df, coalesce(theme1, theme2, theme3, theme4))
我发现有必要将因子转换为字符以避免 'invalid factor levels' 错误。
您的真实数据无需转换:
df$merged <- with(df, coalesce(Theme1, Theme2, Theme3, Theme4, Theme5, Theme6))
这应该适用于您的数据:
new_df = c(as.matrix(df))
此行首先将 df 转换为矩阵,然后用 c() 把所有列依次拼接成一个向量。
new_df <- new_df[!is.na(new_df)]
现在我们只保留非 NA 的条目。如果您愿意,可以将其转换回数据框:
new_df <- data.frame(new_df);names(new_df) <- "Themes"
我有这样一个数据框:
df <- data.frame(theme1=c("hello",NA,NA,NA), theme2=c(NA,"world",NA,NA), theme3=c(NA,NA,"good_morning",NA), theme4=c(NA,NA,NA,"good_evening"))
theme1 theme2 theme3 theme4
1 hello NA NA NA
2 NA world NA NA
3 NA NA good_morning NA
4 NA NA NA good_evening
现在我想获得保留行顺序的一列:
**Theme_merged**
hello
world
good_morning
good_evening
我的尝试:
merge_themes <- data.frame(cbind(mycol = na.omit(unlist(data2_tst[18:23]))), stringsAsFactors = F)
上面的代码可以运行,但不保留行顺序,所以当我想把得到的向量放回原始数据框时,它与各行不再对应。
真实数据:
dput(head(data2_tst[18:23], n = 50))
structure(list(Theme1 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "%Bedrukken%", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "%Bedrukken%", NA, NA, NA, NA, NA, NA, NA, NA), Theme2 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
"%Nieuwste|Nieuwe|201[6:7]%", "%Nieuwste|Nieuwe|201[6:7]%", NA,
NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%", "%Nieuwste|Nieuwe|201[6:7]%",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
"%Nieuwste|Nieuwe|201[6:7]%"), Theme3 = c("%Nodig%", NA, "%Nodig%",
"%Nodig%", "%Nodig%", NA, NA, "%Nodig%", NA, "%Nodig%", NA, NA,
NA, NA, "%Nodig%", "%Nodig%", "%Nodig%", NA, NA, NA, NA, NA,
NA, "%Nodig%", "%Nodig%", NA, NA, "%Nodig%", NA, "%Nodig%", "%Nodig%",
"%Nodig%", NA, "%Nodig%", "%Nodig%", "%Nodig%", NA, NA, NA, "%Nodig%",
"%Nodig%", NA, "%Nodig%", NA, "%Nodig%", "%Nodig%", NA, "%Nodig%",
NA, NA), Theme4 = c(NA, "%Kopen%", NA, NA, NA, "%Kopen%", "%Kopen%",
NA, "%Kopen%", NA, NA, NA, NA, NA, NA, NA, NA, "%Kopen%", "%Kopen%",
NA, NA, "%Kopen%", "%Kopen%", NA, NA, "%Kopen%", "%Kopen%", NA,
"%Kopen%", NA, NA, NA, NA, NA, NA, NA, "%Kopen%", "%Kopen%",
"%Kopen%", NA, NA, NA, NA, "%Kopen%", NA, NA, "%Kopen%", NA,
NA, NA), Theme5 = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), Theme6 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), .Names = c("Theme1",
"Theme2", "Theme3", "Theme4", "Theme5", "Theme6"), row.names = 3:52, class = "data.frame")
这是一个 tidyverse 解决方案(使用 dplyr 和 tidyr,或者直接加载 tidyverse):
library(tidyverse)
> df <- df %>%
gather("theme", "theme_merged", 1:4) %>%
filter(!is.na(theme_merged)) %>%
select(theme_merged)
> df
theme_merged
1 hello
2 world
3 good_morning
4 good_evening
在 SQL 中,这对应于 COALESCE 函数。在 R 中可以按行取第一个非 NA 值:
apply(df, 1, function(r) c(na.omit(r), NA)[1])
# [1] "hello" "world" "good_morning" "good_evening"
df <- data.frame(
theme1=c("hello",NA,NA,NA),
theme2=c(NA,"world",NA,NA),
theme3=c(NA,NA,"good_morning",NA),
theme4=c(NA,NA,NA,"good_evening"),
stringsAsFactors = FALSE
)
na.omit(unlist(df2, use.names = FALSE)) 在您的示例数据上可以正常工作,但如果某一行只包含 NA 值,它就会失败:
df2 <- data.frame(
theme1=c("hello",NA,NA,NA,NA),
theme2=c(NA,"world",NA,NA,NA),
theme3=c(NA,NA,"good_morning",NA,NA),
theme4=c(NA,NA,NA,"good_evening",NA),
theme5=c(NA_character_,NA_character_,NA_character_,
NA_character_,NA_character_),
stringsAsFactors = FALSE
)
df2$X <- na.omit(unlist(df2, use.names = FALSE))
# Error in `$<-.data.frame`(`*tmp*`, "X", value = c("hello", "world", "good_morning", :
# replacement has 4 rows, data has 5
df2$X <- apply(df2, 1, function(r) c(na.omit(r), NA)[1])
# theme1 theme2 theme3 theme4 theme5 X
# 1 hello <NA> <NA> <NA> <NA> hello
# 2 <NA> world <NA> <NA> <NA> world
# 3 <NA> <NA> good_morning <NA> <NA> good_morning
# 4 <NA> <NA> <NA> good_evening <NA> good_evening
# 5 <NA> <NA> <NA> <NA> <NA> <NA>
另一个选项可以是 df2$X <- df2[cbind(1:nrow(df2), max.col(!is.na(df2)))]
dplyr 0.5.0 版本引入了 coalesce() 函数:
This version of dplyr gains a number of vector functions inspired by SQL. Two functions make it a little easier to eliminate or generate missing values:
Given a set of vectors, coalesce() finds the first non-missing value in each position.
要将此应用于示例数据框,您可以使用:
df <- mutate_all(df, .funs = as.character)
df$merged <- with(df, coalesce(theme1, theme2, theme3, theme4))
我发现有必要将因子转换为字符以避免 'invalid factor levels' 错误。
您的真实数据无需转换:
df$merged <- with(df, coalesce(Theme1, Theme2, Theme3, Theme4, Theme5, Theme6))
这应该适用于您的数据:
new_df = c(as.matrix(df))
此行首先将 df 转换为矩阵,然后用 c() 把所有列依次拼接成一个向量。
new_df <- new_df[!is.na(new_df)]
现在我们只保留非 NA 的条目。如果您愿意,可以将其转换回数据框:
new_df <- data.frame(new_df);names(new_df) <- "Themes"