根据另一个自定义排序的列对数据框中的列对进行排序
Sort pairs of columns in a dataframe, based on another customly ordered column
我已经通过网络进行了搜索,但找不到解决我的问题的方法。我有如下数据
# Example data. V1 is the reference column that fixes the wanted order;
# (V2, V3) and (V4, V5) are two name/probe pairs. Shorter columns are padded
# with empty strings so that every column has 13 entries.
df <- data.frame(
  V1 = c(
    "ATP1A2", "CAPRIN1", "ATP1A1", "CBX3", "AUP1", "LARS2", "MTHFD1",
    "VDAC2", "PRKCSH", "ATP1B1", "B3GNT3", rep("", 2)
  ),
  V2 = c(
    "ATP1A1", "ATP1A2", "ATP1B1", "AUP1", "B3GNT3", "CAPRIN1", "CAPRIN1",
    "CBX3", rep("", 5)
  ),
  V3 = c(
    "220948_s_at", "203296_s_at", "201243_s_at", "220525_s_at",
    "204856_at", "200722_s_at", "200723_s_at", "200037_s_at", rep("", 5)
  ),
  V4 = c("LARS2", "MTHFD1", "PRKCSH", "PRKCSH", "VDAC2", rep("", 8)),
  V5 = c(
    "204016_at", "202309_at", "200707_at", "214080_x_at", "211662_s_at",
    rep("", 8)
  ),
  stringsAsFactors = FALSE
)
我想做的是根据第一列对第 2 列和第 4 列进行排序。但是,如果我对第二列进行排序,则第三列将根据 V2 进行更改,第五列将根据 V4 进行更改。
预期输出如下
V1 V2 V3 V4 V5
1 ATP1A2 ATP1A2 203296_s_at - -
2 CAPRIN1 CAPRIN1 200722_s_at - -
3 - CAPRIN1 200723_s_at - -
4 ATP1A1 ATP1A1 220948_s_at - -
5 CBX3 CBX3 200037_s_at - -
6 AUP1 AUP1 220525_s_at - -
7 LARS2 - - LARS2 204016_at
8 MTHFD1 - - MTHFD1 202309_at
9 VDAC2 - - VDAC2 211662_s_at
10 PRKCSH - - PRKCSH 200707_at
11 - - - PRKCSH 214080_x_at
12 ATP1B1 ATP1B1 201243_s_at - -
13 B3GNT3 B3GNT3 204856_at - -
我想根据V1 对V2 和V4 进行排序。请注意,V3 对应于 V2,V5 对应于 V4。例如,如果 V2 更改 V3 更改等等。
我确实尝试了以下方法,但对我没有帮助
# Reorder whole rows of df by the sort order of V1 (all columns move together)
df2<- df[with(df, order(V1)), ]
我也尝试了以下但没有用
# library() stops with an error if data.table is missing; require() would
# only return FALSE and let the next line fail with a less clear message.
library(data.table)
# Build a data.table from df with V1 set as its key
df2 <- data.table(df, key = "V1")
以下同样无效
# Reorder whole rows of df by the sort order of df$V1
df2<- df[order(df$V1),]
这没有给出预期的输出,但我认为它更有意义:
# Record the original row order so it can be restored after merging.
# seq_len() is safe even when df has zero rows (1:0 would give c(1, 0)).
df$rn <- seq_len(nrow(df))
# Merge twice: attach the V3 values, then the V5 values, to every non-empty
# V1 entry. Columns are selected by name so the code does not depend on
# their positions in df.
x <- merge(subset(df, V1 != "", select = V1),
           subset(df, V2 != "", select = c(V2, V3)),
           by.x = "V1", by.y = "V2", all = TRUE)
res <- merge(x,
             subset(df, V4 != "", select = c(V4, V5)),
             by.x = "V1", by.y = "V4", all = TRUE)
# Merge the row number back in and order the result like `df`
res <- merge(subset(df, select = c(V1, rn)), res, by = "V1")
res <- res[order(res$rn), ]
# Add pretty column names
colnames(res) <- c("Gene", "RowNum", "Probe1", "Probe2")
# Output:
res
# Gene RowNum Probe1 Probe2
# 2 ATP1A2 1 203296_s_at <NA>
# 6 CAPRIN1 2 200723_s_at <NA>
# 7 CAPRIN1 2 200722_s_at <NA>
# 1 ATP1A1 3 220948_s_at <NA>
# 8 CBX3 4 200037_s_at <NA>
# 4 AUP1 5 220525_s_at <NA>
# 9 LARS2 6 <NA> 204016_at
# 10 MTHFD1 7 <NA> 202309_at
# 13 VDAC2 8 <NA> 211662_s_at
# 11 PRKCSH 9 <NA> 200707_at
# 12 PRKCSH 9 <NA> 214080_x_at
# 3 ATP1B1 10 201243_s_at <NA>
# 5 B3GNT3 11 204856_at <NA>
在预期的输出中,V1、V2、V4是基因名称,在我的输出中合并成一列V1=基因列。
you removed the second and fourth columns
要获取第 2 列或第 4 列:
library(dplyr)
# Genes that have a Probe1 value (the original column 2).
# Columns are referenced by bare name so the filter always applies to the
# data flowing through the pipe, not to the global `res`.
res %>%
  filter(!is.na(Probe1)) %>%
  select(Gene) %>%
  distinct()
# Genes that have a Probe2 value (the original column 4)
res %>%
  filter(!is.na(Probe2)) %>%
  select(Gene) %>%
  distinct()
好的,也许存在更优雅的解决方案,但下面这个方法可以用(注意:我从 V1 中删除了空字符串;V2 到 V5 中的空字符串我保留了,但它们同样可以去掉):
# Reference order (V1) and the two name/probe pairs (V2, V3) and (V4, V5).
# The empty strings have been dropped from V1; V2..V5 keep their "" padding.
V1 <- c("ATP1A2", "CAPRIN1", "ATP1A1", "CBX3", "AUP1", "LARS2", "MTHFD1",
        "VDAC2", "PRKCSH", "ATP1B1", "B3GNT3")
V2 <- c("ATP1A1", "ATP1A2", "ATP1B1", "AUP1", "B3GNT3", "CAPRIN1", "CAPRIN1",
        "CBX3", "", "", "", "", "")
V3 <- c("220948_s_at", "203296_s_at", "201243_s_at", "220525_s_at",
        "204856_at", "200722_s_at", "200723_s_at", "200037_s_at",
        "", "", "", "", "")
V4 <- c("LARS2", "MTHFD1", "PRKCSH", "PRKCSH", "VDAC2",
        "", "", "", "", "", "", "", "")
V5 <- c("204016_at", "202309_at", "200707_at", "214080_x_at", "211662_s_at",
        "", "", "", "", "", "", "", "")

# For each distinct V1 value, in order of first appearance, collect the
# matching (V2, V3) rows, or failing that the matching (V4, V5) rows, and
# blank out the other pair. Iterating over the unique values themselves
# (rather than indexing V1 by the position within unique(V1)) keeps the
# result correct when V1 contains duplicates.
pieces <- lapply(unique(V1), function(gene) {
  hits2 <- which(V2 == gene)
  hits4 <- which(V4 == gene)
  if (length(hits2) > 0) {
    blank <- rep("", length(hits2))
    list(V2 = V2[hits2], V3 = V3[hits2], V4 = blank, V5 = blank)
  } else if (length(hits4) > 0) {
    blank <- rep("", length(hits4))
    list(V2 = blank, V3 = blank, V4 = V4[hits4], V5 = V5[hits4])
  } else {
    NULL # value found in neither V2 nor V4: contributes no rows
  }
})

# Concatenate the per-value pieces once instead of growing four vectors
# inside a loop.
V2.final <- unlist(lapply(pieces, `[[`, "V2"))
V3.final <- unlist(lapply(pieces, `[[`, "V3"))
V4.final <- unlist(lapply(pieces, `[[`, "V4"))
V5.final <- unlist(lapply(pieces, `[[`, "V5"))

cbind(V2.final, V3.final, V4.final, V5.final)
V2.final V3.final V4.final V5.final
[1,] "ATP1A2" "203296_s_at" "" ""
[2,] "CAPRIN1" "200722_s_at" "" ""
[3,] "CAPRIN1" "200723_s_at" "" ""
[4,] "ATP1A1" "220948_s_at" "" ""
[5,] "CBX3" "200037_s_at" "" ""
[6,] "AUP1" "220525_s_at" "" ""
[7,] "" "" "LARS2" "204016_at"
[8,] "" "" "MTHFD1" "202309_at"
[9,] "" "" "VDAC2" "211662_s_at"
[10,] "" "" "PRKCSH" "200707_at"
[11,] "" "" "PRKCSH" "214080_x_at"
[12,] "ATP1B1" "201243_s_at" "" ""
[13,] "B3GNT3" "204856_at" "" ""
我同意 zx8754 的看法:你想做的其实是以 V1 为键,分别与 (V2,V3) 和 (V4,V5) 进行合并(相当于 SQL 中的 join)。如果你的数据一开始就以分开的表的形式存放在数据库中,这件事会简单得多:
# Row numbers used to restore the original order at the end.
# seq_len() is safe for a zero-row df, where 1:nrow(df) would give c(1, 0).
df$RowNum <- seq_len(nrow(df))
1) 使用 data.table,如果您有大表,可能会有用
library("data.table")
dt <- as.data.table(df)
# The data are essentially three different tables, so split them up,
# removing the useless empty ("") cells.
ref <- dt[which(dt$V1 != ""), c("V1", "RowNum"), with = FALSE]
# with = FALSE is necessary for the second argument to be understood
# as column names in a data.table
setkey(ref, "V1") # the column used for the merges
tab1 <- dt[which(dt$V2 != ""), c("V2", "V3"), with = FALSE]
setkey(tab1, "V2")
tab2 <- dt[which(dt$V4 != ""), c("V4", "V5"), with = FALSE]
setkey(tab2, "V4")
# Merge tab1 to ref, and tab2 to the product, using the data.table X[Y]
# join formalism. TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
df3 <- tab2[tab1[ref, allow.cartesian = TRUE], allow.cartesian = TRUE]
# allow.cartesian = TRUE is important to keep all rows of ref
# and to get exactly the same output
setkey(df3, "RowNum") # order df3 by RowNum
df3 <- within(df3, {
  V2 <- V1 <- V4 # copy V4 into new V1 and V2 columns
  V4[is.na(V5)] <- NA # put back NA where there is no V5 value
  V2[is.na(V3)] <- NA # put back NA where there is no V3 value
})
setcolorder(df3, sort(names(df3))) # order the columns by name
2) 使用基础 R
# As above, split df into three tables, dropping the empty ("") cells.
# The names are `ref` and `which`: the merge step that follows reads `ref`.
ref <- df[which(df$V1 != ""), c("V1", "RowNum")]
tab1 <- df[which(df$V2 != ""), c("V2", "V3")]
tab2 <- df[which(df$V4 != ""), c("V4", "V5")]
然后你合并:
# Two full outer joins on the name column: first attach tab1 (V2/V3) to ref,
# then attach tab2 (V4/V5) to that result.
df2 <- merge(
  merge(ref, tab1, by.x = "V1", by.y = "V2", all = TRUE),
  tab2,
  by.x = "V1", by.y = "V4", all = TRUE
)
如果你想要完全相同的输出:
# Put the rows back in the order recorded in RowNum
df2 <- df2[order(df2$RowNum), ]
# Rebuild V2 and V4 as copies of V1, set to NA on the rows that have no
# V3 value (for V2) or no V5 value (for V4)
df2$V2 <- replace(df2$V1, is.na(df2$V3), NA)
df2$V4 <- replace(df2$V1, is.na(df2$V5), NA)
# Keep only V1..V5, in that order
df2 <- df2[, paste0("V", 1:5)]
我已经通过网络进行了搜索,但找不到解决我的问题的方法。我有如下数据
# Example data. V1 is the reference column that fixes the wanted order;
# (V2, V3) and (V4, V5) are two name/probe pairs. Shorter columns are padded
# with empty strings so that every column has 13 entries.
df <- data.frame(
  V1 = c(
    "ATP1A2", "CAPRIN1", "ATP1A1", "CBX3", "AUP1", "LARS2", "MTHFD1",
    "VDAC2", "PRKCSH", "ATP1B1", "B3GNT3", rep("", 2)
  ),
  V2 = c(
    "ATP1A1", "ATP1A2", "ATP1B1", "AUP1", "B3GNT3", "CAPRIN1", "CAPRIN1",
    "CBX3", rep("", 5)
  ),
  V3 = c(
    "220948_s_at", "203296_s_at", "201243_s_at", "220525_s_at",
    "204856_at", "200722_s_at", "200723_s_at", "200037_s_at", rep("", 5)
  ),
  V4 = c("LARS2", "MTHFD1", "PRKCSH", "PRKCSH", "VDAC2", rep("", 8)),
  V5 = c(
    "204016_at", "202309_at", "200707_at", "214080_x_at", "211662_s_at",
    rep("", 8)
  ),
  stringsAsFactors = FALSE
)
我想做的是根据第一列对第 2 列和第 4 列进行排序。但是,如果我对第二列进行排序,则第三列将根据 V2 进行更改,第五列将根据 V4 进行更改。
预期输出如下
V1 V2 V3 V4 V5
1 ATP1A2 ATP1A2 203296_s_at - -
2 CAPRIN1 CAPRIN1 200722_s_at - -
3 - CAPRIN1 200723_s_at - -
4 ATP1A1 ATP1A1 220948_s_at - -
5 CBX3 CBX3 200037_s_at - -
6 AUP1 AUP1 220525_s_at - -
7 LARS2 - - LARS2 204016_at
8 MTHFD1 - - MTHFD1 202309_at
9 VDAC2 - - VDAC2 211662_s_at
10 PRKCSH - - PRKCSH 200707_at
11 - - - PRKCSH 214080_x_at
12 ATP1B1 ATP1B1 201243_s_at - -
13 B3GNT3 B3GNT3 204856_at - -
我想根据V1 对V2 和V4 进行排序。请注意,V3 对应于 V2,V5 对应于 V4。例如,如果 V2 更改 V3 更改等等。
我确实尝试了以下方法,但对我没有帮助
# Reorder whole rows of df by the sort order of V1 (all columns move together)
df2<- df[with(df, order(V1)), ]
我也尝试了以下但没有用
# library() stops with an error if data.table is missing; require() would
# only return FALSE and let the next line fail with a less clear message.
library(data.table)
# Build a data.table from df with V1 set as its key
df2 <- data.table(df, key = "V1")
以下同样无效
# Reorder whole rows of df by the sort order of df$V1
df2<- df[order(df$V1),]
这没有给出预期的输出,但我认为它更有意义:
# Record the original row order so it can be restored after merging.
# seq_len() is safe even when df has zero rows (1:0 would give c(1, 0)).
df$rn <- seq_len(nrow(df))
# Merge twice: attach the V3 values, then the V5 values, to every non-empty
# V1 entry. Columns are selected by name so the code does not depend on
# their positions in df.
x <- merge(subset(df, V1 != "", select = V1),
           subset(df, V2 != "", select = c(V2, V3)),
           by.x = "V1", by.y = "V2", all = TRUE)
res <- merge(x,
             subset(df, V4 != "", select = c(V4, V5)),
             by.x = "V1", by.y = "V4", all = TRUE)
# Merge the row number back in and order the result like `df`
res <- merge(subset(df, select = c(V1, rn)), res, by = "V1")
res <- res[order(res$rn), ]
# Add pretty column names
colnames(res) <- c("Gene", "RowNum", "Probe1", "Probe2")
# Output:
res
# Gene RowNum Probe1 Probe2
# 2 ATP1A2 1 203296_s_at <NA>
# 6 CAPRIN1 2 200723_s_at <NA>
# 7 CAPRIN1 2 200722_s_at <NA>
# 1 ATP1A1 3 220948_s_at <NA>
# 8 CBX3 4 200037_s_at <NA>
# 4 AUP1 5 220525_s_at <NA>
# 9 LARS2 6 <NA> 204016_at
# 10 MTHFD1 7 <NA> 202309_at
# 13 VDAC2 8 <NA> 211662_s_at
# 11 PRKCSH 9 <NA> 200707_at
# 12 PRKCSH 9 <NA> 214080_x_at
# 3 ATP1B1 10 201243_s_at <NA>
# 5 B3GNT3 11 204856_at <NA>
在预期的输出中,V1、V2、V4是基因名称,在我的输出中合并成一列V1=基因列。
you removed the second and fourth columns
要获取第 2 列或第 4 列:
library(dplyr)
# Genes that have a Probe1 value (the original column 2).
# Columns are referenced by bare name so the filter always applies to the
# data flowing through the pipe, not to the global `res`.
res %>%
  filter(!is.na(Probe1)) %>%
  select(Gene) %>%
  distinct()
# Genes that have a Probe2 value (the original column 4)
res %>%
  filter(!is.na(Probe2)) %>%
  select(Gene) %>%
  distinct()
好的,也许存在更优雅的解决方案,但下面这个方法可以用(注意:我从 V1 中删除了空字符串;V2 到 V5 中的空字符串我保留了,但它们同样可以去掉):
# Reference order (V1) and the two name/probe pairs (V2, V3) and (V4, V5).
# The empty strings have been dropped from V1; V2..V5 keep their "" padding.
V1 <- c("ATP1A2", "CAPRIN1", "ATP1A1", "CBX3", "AUP1", "LARS2", "MTHFD1",
        "VDAC2", "PRKCSH", "ATP1B1", "B3GNT3")
V2 <- c("ATP1A1", "ATP1A2", "ATP1B1", "AUP1", "B3GNT3", "CAPRIN1", "CAPRIN1",
        "CBX3", "", "", "", "", "")
V3 <- c("220948_s_at", "203296_s_at", "201243_s_at", "220525_s_at",
        "204856_at", "200722_s_at", "200723_s_at", "200037_s_at",
        "", "", "", "", "")
V4 <- c("LARS2", "MTHFD1", "PRKCSH", "PRKCSH", "VDAC2",
        "", "", "", "", "", "", "", "")
V5 <- c("204016_at", "202309_at", "200707_at", "214080_x_at", "211662_s_at",
        "", "", "", "", "", "", "", "")

# For each distinct V1 value, in order of first appearance, collect the
# matching (V2, V3) rows, or failing that the matching (V4, V5) rows, and
# blank out the other pair. Iterating over the unique values themselves
# (rather than indexing V1 by the position within unique(V1)) keeps the
# result correct when V1 contains duplicates.
pieces <- lapply(unique(V1), function(gene) {
  hits2 <- which(V2 == gene)
  hits4 <- which(V4 == gene)
  if (length(hits2) > 0) {
    blank <- rep("", length(hits2))
    list(V2 = V2[hits2], V3 = V3[hits2], V4 = blank, V5 = blank)
  } else if (length(hits4) > 0) {
    blank <- rep("", length(hits4))
    list(V2 = blank, V3 = blank, V4 = V4[hits4], V5 = V5[hits4])
  } else {
    NULL # value found in neither V2 nor V4: contributes no rows
  }
})

# Concatenate the per-value pieces once instead of growing four vectors
# inside a loop.
V2.final <- unlist(lapply(pieces, `[[`, "V2"))
V3.final <- unlist(lapply(pieces, `[[`, "V3"))
V4.final <- unlist(lapply(pieces, `[[`, "V4"))
V5.final <- unlist(lapply(pieces, `[[`, "V5"))

cbind(V2.final, V3.final, V4.final, V5.final)
V2.final V3.final V4.final V5.final
[1,] "ATP1A2" "203296_s_at" "" ""
[2,] "CAPRIN1" "200722_s_at" "" ""
[3,] "CAPRIN1" "200723_s_at" "" ""
[4,] "ATP1A1" "220948_s_at" "" ""
[5,] "CBX3" "200037_s_at" "" ""
[6,] "AUP1" "220525_s_at" "" ""
[7,] "" "" "LARS2" "204016_at"
[8,] "" "" "MTHFD1" "202309_at"
[9,] "" "" "VDAC2" "211662_s_at"
[10,] "" "" "PRKCSH" "200707_at"
[11,] "" "" "PRKCSH" "214080_x_at"
[12,] "ATP1B1" "201243_s_at" "" ""
[13,] "B3GNT3" "204856_at" "" ""
我同意 zx8754 的看法:你想做的其实是以 V1 为键,分别与 (V2,V3) 和 (V4,V5) 进行合并(相当于 SQL 中的 join)。如果你的数据一开始就以分开的表的形式存放在数据库中,这件事会简单得多:
# Row numbers used to restore the original order at the end.
# seq_len() is safe for a zero-row df, where 1:nrow(df) would give c(1, 0).
df$RowNum <- seq_len(nrow(df))
1) 使用 data.table,如果您有大表,可能会有用
library("data.table")
dt <- as.data.table(df)
# The data are essentially three different tables, so split them up,
# removing the useless empty ("") cells.
ref <- dt[which(dt$V1 != ""), c("V1", "RowNum"), with = FALSE]
# with = FALSE is necessary for the second argument to be understood
# as column names in a data.table
setkey(ref, "V1") # the column used for the merges
tab1 <- dt[which(dt$V2 != ""), c("V2", "V3"), with = FALSE]
setkey(tab1, "V2")
tab2 <- dt[which(dt$V4 != ""), c("V4", "V5"), with = FALSE]
setkey(tab2, "V4")
# Merge tab1 to ref, and tab2 to the product, using the data.table X[Y]
# join formalism. TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
df3 <- tab2[tab1[ref, allow.cartesian = TRUE], allow.cartesian = TRUE]
# allow.cartesian = TRUE is important to keep all rows of ref
# and to get exactly the same output
setkey(df3, "RowNum") # order df3 by RowNum
df3 <- within(df3, {
  V2 <- V1 <- V4 # copy V4 into new V1 and V2 columns
  V4[is.na(V5)] <- NA # put back NA where there is no V5 value
  V2[is.na(V3)] <- NA # put back NA where there is no V3 value
})
setcolorder(df3, sort(names(df3))) # order the columns by name
# 2) Using base R
# As above, split df into three tables, dropping the empty ("") cells.
# The names are `ref` and `which`: the merge step that follows reads `ref`.
ref <- df[which(df$V1 != ""), c("V1", "RowNum")]
tab1 <- df[which(df$V2 != ""), c("V2", "V3")]
tab2 <- df[which(df$V4 != ""), c("V4", "V5")]
然后你合并:
# Two full outer joins on the name column: first attach tab1 (V2/V3) to ref,
# then attach tab2 (V4/V5) to that result.
df2 <- merge(
  merge(ref, tab1, by.x = "V1", by.y = "V2", all = TRUE),
  tab2,
  by.x = "V1", by.y = "V4", all = TRUE
)
如果你想要完全相同的输出:
# Put the rows back in the order recorded in RowNum
df2 <- df2[order(df2$RowNum), ]
# Rebuild V2 and V4 as copies of V1, set to NA on the rows that have no
# V3 value (for V2) or no V5 value (for V4)
df2$V2 <- replace(df2$V1, is.na(df2$V3), NA)
df2$V4 <- replace(df2$V1, is.na(df2$V5), NA)
# Keep only V1..V5, in that order
df2 <- df2[, paste0("V", 1:5)]