R - 一行中数据框列中的相同值
R - Identical values in columns of dataframe in one row
我有一个包含 3 列非整数值的数据框。各列中的值有时会与同一数据框中另外一列或两列中的值相同。如果列之间存在匹配项,我希望它们位于同一行。
请参阅下面的 subset_df 与 expected_subset_df 进行说明。
Notice that the values ending in "348:-" are in the same row in expected_subset_df but not in subset_df.
Summary: values in col1 can also be in col2 and/or col3. If the values between columns do match I want them on the same row.
> subset_df
col1 col2 col3
1 20:31722330:- 20:31722330:- 20:31722330:-
2 20:31722348:- 20:31724051:- 20:31724051:-
3 FALSE 20:31722348:- 20:31722348:-
> expected_subset_df
col1 col2 col3
1 20:31722330:- 20:31722330:- 20:31722330:-
2 20:31722348:- 20:31722348:- 20:31722348:-
3 FALSE 20:31724051:- 20:31724051:-
我尝试过的
# Attempt: reorder col1 and col3 so that each row lines up with the value in col2.
library(dplyr)
subset_df %>%
# Convert every column to character.
mutate_all(as.character) %>%
# match(x, table) gives, for each element of x, the position of its first
# match in table, or NA when there is none. Indexing col1 with it reorders
# col1 to follow col2 and yields NA where col2's value is absent from col1.
# Note: these expressions read the original subset_df, not the piped data.
mutate(col1 = subset_df$col1[match(subset_df$col2, subset_df$col1)],
# Same alignment for col3, again relative to col2.
col3 = subset_df$col3[match(subset_df$col2, subset_df$col3)])
输出结果:
col1 col2 col3
1 20:31722330:- 20:31722330:- 20:31722330:-
2 <NA> 20:31724051:- 20:31724051:-
3 20:31722348:- 20:31722348:- 20:31722348:-
Is this method robust? Is there a better alternative?
编辑:
假设数据框 breakpoint 看起来像这样:
> breakpoint
col1 col2 col3
1 20:31722330:- 20:31722344:- FALSE
2 21:15014555:- 21:15014555:- FALSE
3 21:15014767:- 21:15014767:- 21:15014767:-
如何将数据框 breakpoint 变成这样:
> expected_breakpoint
col1 col2 col3
1 20:31722330:- <NA> <NA>
2 <NA> 20:31722344:- <NA>
3 21:15014555:- 21:15014555:- <NA>
4 <NA> <NA> FALSE
5 <NA> <NA> FALSE
6 21:15014767:- 21:15014767:- 21:15014767:-
编辑 2:在分析之前,FALSE 已被转换为 <NA>。
假设数据帧 breakpoint_new 看起来像这样:
> breakpoint_new
col1 col2 col3
1 20:31722330:- 20:31722344:- <NA>
2 21:15014555:- 21:15014555:- <NA>
3 21:15014767:- 21:15014767:- 21:15014767:-
如何将数据框 breakpoint_new 变成这样:
> expected_breakpoint_new
col1 col2 col3
1 20:31722330:- <NA> <NA>
2 <NA> 20:31722344:- <NA>
3 21:15014555:- 21:15014555:- <NA>
4 21:15014767:- 21:15014767:- 21:15014767:-
以下函数解决了我的问题:
#' Align identical values from three columns onto shared rows
#'
#' Every value found in the input becomes one row of the result, written into
#' the column position(s) it occurs in; the remaining positions hold `nomatch`.
#' Rows are emitted in this order: values shared by columns 1 and 2 (all three
#' positions are filled when the value is also in column 3), values shared
#' only by columns 1 and 3, values shared only by columns 2 and 3, then values
#' found in column 1 only, column 2 only and column 3 only. A value that
#' occurs in a single column is emitted once per occurrence.
#'
#' @param df A data frame with exactly three columns. Columns named `object`,
#'   `object.1` and `object.2` are used when all three are present; otherwise
#'   the three columns are taken by position.
#' @param nomatch Length-one placeholder meaning "no value here". Input
#'   entries equal to it, as well as `NA` entries, are skipped, and it fills
#'   the empty cells of the output. May be `NA`.
#' @return A three-column matrix without dimnames, one row per aligned value
#'   (zero rows when nothing qualifies).
match_columns <- function(df, nomatch = FALSE) {
  if (ncol(df) != 3) {
    stop("Input DataFrame needs to have 3 columns")
  }

  # Keep supporting the historical column names, but fall back to column
  # positions so that data frames with other names (e.g. col1..col3) work
  # instead of silently producing an empty result.
  legacy_names <- c("object", "object.1", "object.2")
  cols <- if (all(legacy_names %in% names(df))) legacy_names else 1:3

  # NA-safe test: `x == nomatch` alone yields NA for missing values (or for
  # nomatch = NA), which cannot be used as an `if` condition.
  is_placeholder <- function(x) {
    is.na(x) | (!is.na(nomatch) & x == nomatch)
  }

  values_of <- function(col) {
    x <- df[[col]]
    if (is.factor(x)) {
      x <- as.character(x)
    }
    x[!is_placeholder(x)]
  }

  first <- values_of(cols[[1]])
  second <- values_of(cols[[2]])
  third <- values_of(cols[[3]])

  in12 <- intersect(first, second)
  in23 <- intersect(second, third)
  in13 <- intersect(first, third)

  # Collect all rows first and bind once, rather than growing a matrix with
  # rbind() inside loops.
  rows <- c(
    lapply(in12, function(item) {
      if (item %in% in23) rep(item, 3) else c(item, item, nomatch)
    }),
    lapply(in13[!(in13 %in% in12)], function(item) {
      c(item, nomatch, item)
    }),
    lapply(in23[!(in23 %in% in13)], function(item) {
      c(nomatch, item, item)
    }),
    lapply(first[!(first %in% in12) & !(first %in% in13)], function(item) {
      c(item, nomatch, nomatch)
    }),
    lapply(second[!(second %in% in12) & !(second %in% in23)], function(item) {
      c(nomatch, item, nomatch)
    }),
    lapply(third[!(third %in% in13) & !(third %in% in23)], function(item) {
      c(nomatch, nomatch, item)
    })
  )

  # The leading 0 x 3 matrix fixes the column count and guarantees a matrix
  # result even when there are no rows; unname() keeps list names from
  # turning into row names.
  do.call(rbind, c(list(matrix(ncol = 3, nrow = 0)), unname(rows)))
}
该函数把各列中的值与其他列中的相同值对齐到同一行;如果某个值并非在三列中都出现,则在缺失的位置填入 FALSE(即 nomatch 的默认值)。
我有一个包含 3 列非整数值的数据框。各列中的值有时会与同一数据框中另外一列或两列中的值相同。如果列之间存在匹配项,我希望它们位于同一行。
请参阅下面的 subset_df 与 expected_subset_df 进行说明。
Notice that the values ending in "348:-" are in the same row in expected_subset_df but not in subset_df.
Summary: values in col1 can also be in col2 and/or col3. If the values between columns do match I want them on the same row.
> subset_df
col1 col2 col3
1 20:31722330:- 20:31722330:- 20:31722330:-
2 20:31722348:- 20:31724051:- 20:31724051:-
3 FALSE 20:31722348:- 20:31722348:-
> expected_subset_df
col1 col2 col3
1 20:31722330:- 20:31722330:- 20:31722330:-
2 20:31722348:- 20:31722348:- 20:31722348:-
3 FALSE 20:31724051:- 20:31724051:-
我尝试过的
# Attempt: reorder col1 and col3 so that each row lines up with the value in col2.
library(dplyr)
subset_df %>%
# Convert every column to character.
mutate_all(as.character) %>%
# match(x, table) gives, for each element of x, the position of its first
# match in table, or NA when there is none. Indexing col1 with it reorders
# col1 to follow col2 and yields NA where col2's value is absent from col1.
# Note: these expressions read the original subset_df, not the piped data.
mutate(col1 = subset_df$col1[match(subset_df$col2, subset_df$col1)],
# Same alignment for col3, again relative to col2.
col3 = subset_df$col3[match(subset_df$col2, subset_df$col3)])
输出结果:
col1 col2 col3
1 20:31722330:- 20:31722330:- 20:31722330:-
2 <NA> 20:31724051:- 20:31724051:-
3 20:31722348:- 20:31722348:- 20:31722348:-
Is this method robust? Is there a better alternative?
编辑:
假设数据框 breakpoint 看起来像这样:
> breakpoint
col1 col2 col3
1 20:31722330:- 20:31722344:- FALSE
2 21:15014555:- 21:15014555:- FALSE
3 21:15014767:- 21:15014767:- 21:15014767:-
如何将数据框 breakpoint 变成这样:
> expected_breakpoint
col1 col2 col3
1 20:31722330:- <NA> <NA>
2 <NA> 20:31722344:- <NA>
3 21:15014555:- 21:15014555:- <NA>
4 <NA> <NA> FALSE
5 <NA> <NA> FALSE
6 21:15014767:- 21:15014767:- 21:15014767:-
编辑 2:在分析之前,FALSE 已被转换为 <NA>。
假设数据帧 breakpoint_new 看起来像这样:
> breakpoint_new
col1 col2 col3
1 20:31722330:- 20:31722344:- <NA>
2 21:15014555:- 21:15014555:- <NA>
3 21:15014767:- 21:15014767:- 21:15014767:-
如何将数据框 breakpoint_new 变成这样:
> expected_breakpoint_new
col1 col2 col3
1 20:31722330:- <NA> <NA>
2 <NA> 20:31722344:- <NA>
3 21:15014555:- 21:15014555:- <NA>
4 21:15014767:- 21:15014767:- 21:15014767:-
以下函数解决了我的问题:
#' Align identical values from three columns onto shared rows
#'
#' Every value found in the input becomes one row of the result, written into
#' the column position(s) it occurs in; the remaining positions hold `nomatch`.
#' Rows are emitted in this order: values shared by columns 1 and 2 (all three
#' positions are filled when the value is also in column 3), values shared
#' only by columns 1 and 3, values shared only by columns 2 and 3, then values
#' found in column 1 only, column 2 only and column 3 only. A value that
#' occurs in a single column is emitted once per occurrence.
#'
#' @param df A data frame with exactly three columns. Columns named `object`,
#'   `object.1` and `object.2` are used when all three are present; otherwise
#'   the three columns are taken by position.
#' @param nomatch Length-one placeholder meaning "no value here". Input
#'   entries equal to it, as well as `NA` entries, are skipped, and it fills
#'   the empty cells of the output. May be `NA`.
#' @return A three-column matrix without dimnames, one row per aligned value
#'   (zero rows when nothing qualifies).
match_columns <- function(df, nomatch = FALSE) {
  if (ncol(df) != 3) {
    stop("Input DataFrame needs to have 3 columns")
  }

  # Keep supporting the historical column names, but fall back to column
  # positions so that data frames with other names (e.g. col1..col3) work
  # instead of silently producing an empty result.
  legacy_names <- c("object", "object.1", "object.2")
  cols <- if (all(legacy_names %in% names(df))) legacy_names else 1:3

  # NA-safe test: `x == nomatch` alone yields NA for missing values (or for
  # nomatch = NA), which cannot be used as an `if` condition.
  is_placeholder <- function(x) {
    is.na(x) | (!is.na(nomatch) & x == nomatch)
  }

  values_of <- function(col) {
    x <- df[[col]]
    if (is.factor(x)) {
      x <- as.character(x)
    }
    x[!is_placeholder(x)]
  }

  first <- values_of(cols[[1]])
  second <- values_of(cols[[2]])
  third <- values_of(cols[[3]])

  in12 <- intersect(first, second)
  in23 <- intersect(second, third)
  in13 <- intersect(first, third)

  # Collect all rows first and bind once, rather than growing a matrix with
  # rbind() inside loops.
  rows <- c(
    lapply(in12, function(item) {
      if (item %in% in23) rep(item, 3) else c(item, item, nomatch)
    }),
    lapply(in13[!(in13 %in% in12)], function(item) {
      c(item, nomatch, item)
    }),
    lapply(in23[!(in23 %in% in13)], function(item) {
      c(nomatch, item, item)
    }),
    lapply(first[!(first %in% in12) & !(first %in% in13)], function(item) {
      c(item, nomatch, nomatch)
    }),
    lapply(second[!(second %in% in12) & !(second %in% in23)], function(item) {
      c(nomatch, item, nomatch)
    }),
    lapply(third[!(third %in% in13) & !(third %in% in23)], function(item) {
      c(nomatch, nomatch, item)
    })
  )

  # The leading 0 x 3 matrix fixes the column count and guarantees a matrix
  # result even when there are no rows; unname() keeps list names from
  # turning into row names.
  do.call(rbind, c(list(matrix(ncol = 3, nrow = 0)), unname(rows)))
}
该函数把各列中的值与其他列中的相同值对齐到同一行;如果某个值并非在三列中都出现,则在缺失的位置填入 FALSE(即 nomatch 的默认值)。