使用列作为行名称并将其他列设置为基于字符串的二进制值
Using a column as row names and set the other columns as binary values based on strings
我需要把其中一列设为行名并消除该列中的重复值;其余各列中的每个唯一值都应拆分成一个新列,新列用二进制值(0/1)表示该值是否与对应的行名相关。
示例:
原始数据框
df <- data.frame(module = c("M1","M1","M1","M1","M1","M2"),
GO = c("inflama","inflama","ciclo","inflama","ciclo","sinapse"),
gene = c("PPARG","RELA","RELA","IRF5","ACKR1","GATA3"))
> df
module GO gene
1 M1 inflama PPARG
2 M1 inflama RELA
3 M1 ciclo RELA
4 M1 inflama IRF5
5 M1 ciclo ACKR1
6 M2 sinapse GATA3
最终数据框
df2 <- data.frame(gene = c("PPARG","RELA","IRF5","ACKR1","GATA3"),
M1 = c(1,1,1,1,0),
M2 = c(0,0,0,0,1),
inflama = c(1,1,1,0,0),
ciclo = c(0,1,0,1,0), sinapse = c(0,0,0,0,1))
> df2
gene M1 M2 inflama ciclo sinapse
1 PPARG 1 0 1 0 0
2 RELA 1 0 1 1 0
3 IRF5 1 0 1 0 0
4 ACKR1 1 0 0 1 0
5 GATA3 0 1 0 0 1
预先感谢您的帮助!
我们可以使用 pivot_wider
(来自 tidyr
)重塑或使用 dummy_cols
(来自 fastDummies
)
library(fastDummies)
library(dplyr)
library(stringr)
# Build 0/1 indicator columns for every level of `module` and `GO`, then
# collapse to one row per gene: a cell is 1 when the gene occurs with that
# level in at least one row of `df`.
df %>%
  dummy_cols(c("module", "GO"), remove_selected_columns = TRUE) %>%
  group_by(gene) %>%
  summarise(across(everything(), ~ +(any(.x)))) %>%
  # Strip only the "<column>_" prefix added by dummy_cols(). The anchored
  # pattern keeps level names that themselves contain "_" intact, which a
  # greedy ".*_" would truncate.
  rename_with(~ str_remove(.x, "^(module|GO)_"), matches("^(module|GO)_"))
-输出
# A tibble: 5 × 6
gene M1 M2 ciclo inflama sinapse
<chr> <int> <int> <int> <int> <int>
1 ACKR1 1 0 1 0 0
2 GATA3 0 1 0 0 1
3 IRF5 1 0 0 1 0
4 PPARG 1 0 0 1 0
5 RELA 1 0 1 1 0
您应该了解一下 dplyr、tidyr 等 tidyverse 库,它们可用于在 R 中进行数据整理。
## Loading the required libraries
library(dplyr)
library(tidyr)
library(tidyverse)
## Creating a pivot table
# Reshape `df` into one row per gene with a 0/1 indicator column for every
# distinct value found in the remaining columns.
df_pivot <- df %>%
  # Stack all non-gene columns into (gene, AttributeValue) pairs.
  pivot_longer(!gene, names_to = "Attributes", values_to = "AttributeValue") %>%
  group_by(gene, AttributeValue) %>%
  # Record presence (1) rather than the number of occurrences, so a gene that
  # appears in several rows with the same value still yields 1. `.groups`
  # drops the grouping so the result is a plain, ungrouped tibble.
  summarise(Present = 1L, .groups = "drop") %>%
  # Gene/value combinations that never occur are filled with 0 directly.
  pivot_wider(
    names_from = AttributeValue,
    values_from = Present,
    values_fill = 0L
  )
df_pivot
# A tibble: 5 x 6
gene ciclo M1 M2 sinapse inflama
<chr> <int> <int> <int> <int> <int>
1 ACKR1 1 1 0 0 0
2 GATA3 0 0 1 1 0
3 IRF5 0 1 0 0 1
4 PPARG 0 1 0 0 1
5 RELA 1 1 0 0 1
我认为在这里调用 pivot_wider
两次就足够了。
library(tidyr)
# Spread `module` and then `GO` into indicator columns: each call turns the
# distinct values of one column into new columns that hold 1 where the value
# occurs and 0 elsewhere.
df |>
  pivot_wider(
    names_from = module, values_from = module,
    values_fn = function(v) 1, values_fill = 0
  ) |>
  pivot_wider(
    names_from = GO, values_from = GO,
    values_fn = function(v) 1, values_fill = 0
  )
# A tibble: 5 × 6
gene M1 M2 inflama ciclo sinapse
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 PPARG 1 0 1 0 0
2 RELA 1 0 1 1 0
3 IRF5 1 0 1 0 0
4 ACKR1 1 0 0 1 0
5 GATA3 0 1 0 0 1
如果我们想让它更简单一点,我们可以定义一个透视(pivot)函数:
# Spread column `y` of data frame `x` into one 0/1 indicator column per
# distinct value.
#
# x: a data frame.
# y: name of the column to spread, given as a string.
# Returns `x` with column `y` replaced by indicator columns (1 where the
# value occurs for the remaining columns, 0 otherwise).
pivoting <- function(x, y) {
  pivot_wider(
    x,
    # all_of() marks `y` as a variable holding a column name, so it cannot be
    # confused with a column of `x` that happens to be called "y".
    names_from = all_of(y),
    values_from = all_of(y),
    values_fill = 0,
    # Any non-empty cell becomes 1, however many rows fall into it.
    values_fn = \(v) 1
  )
}

df %>%
  pivoting("module") %>%
  pivoting("GO")
但我认为这太麻烦了,因为已经有一个包。正如@akrun 所提到的,我会选择 dummy_cols
Matrix
包有一个非常有效的 fac2sparse
函数用于此目的。如果你从 CRAN 下载 R,那么你已经安装了它。以下是其用法示例:
library("Matrix")
f <- gl(3, 2L, labels = letters[1:3])
f
## [1] a a b b c c
## Levels: a b c
fac2sparse(f)
## 3 x 6 sparse Matrix of class "dgCMatrix"
##
## a 1 1 . . . .
## b . . 1 1 . .
## c . . . . 1 1
要将生成的矩阵包含在长格式数据框中,您可以使用 t 将其转置,并使用 as(Class="matrix") 将其从稀疏格式强制转换为密集格式。
# Expand a factor into a dense indicator matrix: fac2sparse() puts levels in
# rows, so the result is transposed before being coerced to a base matrix.
fac2dense <- function(x) {
  indicators <- t(fac2sparse(x))
  as(indicators, "matrix")
}
data.frame(f, fac2dense(f))
## f a b c
## 1 a 1 0 0
## 2 a 1 0 0
## 3 b 0 1 0
## 4 b 0 1 0
## 5 c 0 0 1
## 6 c 0 0 1
将其付诸实践,我将分两步转换您的数据框 df
,如下所示:
# Bind the gene column to the indicator matrices for `module` and `GO`. This
# still has one row per row of `df`, so a gene may appear more than once.
df_bin <- data.frame(df["gene"], fac2dense(df[["module"]]), fac2dense(df[["GO"]]))
df_bin
gene M1 M2 ciclo inflama sinapse
1 PPARG 1 0 0 1 0
2 RELA 1 0 0 1 0
3 RELA 1 0 1 0 0
4 IRF5 1 0 0 1 0
5 ACKR1 1 0 1 0 0
6 GATA3 0 1 0 0 1
# Collapse to one row per gene, keeping the maximum of each indicator column
# (1 if the gene had that value in any row).
aggregate(
  df_bin[setdiff(names(df_bin), "gene")],
  by = df_bin["gene"],
  FUN = max
)
gene M1 M2 ciclo inflama sinapse
1 ACKR1 1 0 1 0 0
2 GATA3 0 1 0 0 1
3 IRF5 1 0 0 1 0
4 PPARG 1 0 0 1 0
5 RELA 1 0 1 1 0
使用data.table
也是一种选择:
library(data.table)
setDT(df)
使用dcast()
聚合:
# One row per gene with a 0/1 column per GO term (DT1) and per module (DT2).
# The aggregator returns a single value per gene/level cell: 1L whenever the
# cell is non-empty, regardless of how many rows fall into it. Combinations
# that never occur are filled with 0L.
# Only presence matters, so `value.var` just names the column not used in the
# formula; stating it explicitly stops dcast() from guessing.
DT1 <- dcast(
  df, gene ~ GO,
  value.var = "module",
  fun.aggregate = \(x) as.integer(length(x) > 0L),
  fill = 0L
)
DT2 <- dcast(
  df, gene ~ module,
  value.var = "GO",
  fun.aggregate = \(x) as.integer(length(x) > 0L),
  fill = 0L
)
那么您可以加入表格:
DT3 = DT1[DT2, on = "gene"]
DT3
输出
gene ciclo inflama sinapse M1 M2
1: ACKR1 1 0 0 1 0
2: GATA3 0 0 1 0 1
3: IRF5 0 1 0 1 0
4: PPARG 0 1 0 1 0
5: RELA 1 1 0 1 0
我需要把其中一列设为行名并消除该列中的重复值;其余各列中的每个唯一值都应拆分成一个新列,新列用二进制值(0/1)表示该值是否与对应的行名相关。
示例:
原始数据框
df <- data.frame(module = c("M1","M1","M1","M1","M1","M2"),
GO = c("inflama","inflama","ciclo","inflama","ciclo","sinapse"),
gene = c("PPARG","RELA","RELA","IRF5","ACKR1","GATA3"))
> df
module GO gene
1 M1 inflama PPARG
2 M1 inflama RELA
3 M1 ciclo RELA
4 M1 inflama IRF5
5 M1 ciclo ACKR1
6 M2 sinapse GATA3
最终数据框
df2 <- data.frame(gene = c("PPARG","RELA","IRF5","ACKR1","GATA3"),
M1 = c(1,1,1,1,0),
M2 = c(0,0,0,0,1),
inflama = c(1,1,1,0,0),
ciclo = c(0,1,0,1,0), sinapse = c(0,0,0,0,1))
> df2
gene M1 M2 inflama ciclo sinapse
1 PPARG 1 0 1 0 0
2 RELA 1 0 1 1 0
3 IRF5 1 0 1 0 0
4 ACKR1 1 0 0 1 0
5 GATA3 0 1 0 0 1
预先感谢您的帮助!
我们可以使用 pivot_wider
(来自 tidyr
)重塑或使用 dummy_cols
(来自 fastDummies
)
library(fastDummies)
library(dplyr)
library(stringr)
# Build 0/1 indicator columns for every level of `module` and `GO`, then
# collapse to one row per gene: a cell is 1 when the gene occurs with that
# level in at least one row of `df`.
df %>%
  dummy_cols(c("module", "GO"), remove_selected_columns = TRUE) %>%
  group_by(gene) %>%
  summarise(across(everything(), ~ +(any(.x)))) %>%
  # Strip only the "<column>_" prefix added by dummy_cols(). The anchored
  # pattern keeps level names that themselves contain "_" intact, which a
  # greedy ".*_" would truncate.
  rename_with(~ str_remove(.x, "^(module|GO)_"), matches("^(module|GO)_"))
-输出
# A tibble: 5 × 6
gene M1 M2 ciclo inflama sinapse
<chr> <int> <int> <int> <int> <int>
1 ACKR1 1 0 1 0 0
2 GATA3 0 1 0 0 1
3 IRF5 1 0 0 1 0
4 PPARG 1 0 0 1 0
5 RELA 1 0 1 1 0
您应该了解一下 dplyr、tidyr 等 tidyverse 库,它们可用于在 R 中进行数据整理。
## Loading the required libraries
library(dplyr)
library(tidyr)
library(tidyverse)
## Creating a pivot table
# Reshape `df` into one row per gene with a 0/1 indicator column for every
# distinct value found in the remaining columns.
df_pivot <- df %>%
  # Stack all non-gene columns into (gene, AttributeValue) pairs.
  pivot_longer(!gene, names_to = "Attributes", values_to = "AttributeValue") %>%
  group_by(gene, AttributeValue) %>%
  # Record presence (1) rather than the number of occurrences, so a gene that
  # appears in several rows with the same value still yields 1. `.groups`
  # drops the grouping so the result is a plain, ungrouped tibble.
  summarise(Present = 1L, .groups = "drop") %>%
  # Gene/value combinations that never occur are filled with 0 directly.
  pivot_wider(
    names_from = AttributeValue,
    values_from = Present,
    values_fill = 0L
  )
df_pivot
# A tibble: 5 x 6
gene ciclo M1 M2 sinapse inflama
<chr> <int> <int> <int> <int> <int>
1 ACKR1 1 1 0 0 0
2 GATA3 0 0 1 1 0
3 IRF5 0 1 0 0 1
4 PPARG 0 1 0 0 1
5 RELA 1 1 0 0 1
我认为在这里调用 pivot_wider
两次就足够了。
library(tidyr)
# Spread `module` and then `GO` into indicator columns: each call turns the
# distinct values of one column into new columns that hold 1 where the value
# occurs and 0 elsewhere.
df |>
  pivot_wider(
    names_from = module, values_from = module,
    values_fn = function(v) 1, values_fill = 0
  ) |>
  pivot_wider(
    names_from = GO, values_from = GO,
    values_fn = function(v) 1, values_fill = 0
  )
# A tibble: 5 × 6
gene M1 M2 inflama ciclo sinapse
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 PPARG 1 0 1 0 0
2 RELA 1 0 1 1 0
3 IRF5 1 0 1 0 0
4 ACKR1 1 0 0 1 0
5 GATA3 0 1 0 0 1
如果我们想让它更简单一点,我们可以定义一个透视(pivot)函数:
# Spread column `y` of data frame `x` into one 0/1 indicator column per
# distinct value.
#
# x: a data frame.
# y: name of the column to spread, given as a string.
# Returns `x` with column `y` replaced by indicator columns (1 where the
# value occurs for the remaining columns, 0 otherwise).
pivoting <- function(x, y) {
  pivot_wider(
    x,
    # all_of() marks `y` as a variable holding a column name, so it cannot be
    # confused with a column of `x` that happens to be called "y".
    names_from = all_of(y),
    values_from = all_of(y),
    values_fill = 0,
    # Any non-empty cell becomes 1, however many rows fall into it.
    values_fn = \(v) 1
  )
}

df %>%
  pivoting("module") %>%
  pivoting("GO")
但我认为这太麻烦了,因为已经有一个包。正如@akrun 所提到的,我会选择 dummy_cols
Matrix
包有一个非常有效的 fac2sparse
函数用于此目的。如果你从 CRAN 下载 R,那么你已经安装了它。以下是其用法示例:
library("Matrix")
f <- gl(3, 2L, labels = letters[1:3])
f
## [1] a a b b c c
## Levels: a b c
fac2sparse(f)
## 3 x 6 sparse Matrix of class "dgCMatrix"
##
## a 1 1 . . . .
## b . . 1 1 . .
## c . . . . 1 1
要将生成的矩阵包含在长格式数据框中,您可以使用 t 将其转置,并使用 as(Class="matrix") 将其从稀疏格式强制转换为密集格式。
# Expand a factor into a dense indicator matrix: fac2sparse() puts levels in
# rows, so the result is transposed before being coerced to a base matrix.
fac2dense <- function(x) {
  indicators <- t(fac2sparse(x))
  as(indicators, "matrix")
}
data.frame(f, fac2dense(f))
## f a b c
## 1 a 1 0 0
## 2 a 1 0 0
## 3 b 0 1 0
## 4 b 0 1 0
## 5 c 0 0 1
## 6 c 0 0 1
将其付诸实践,我将分两步转换您的数据框 df
,如下所示:
# Bind the gene column to the indicator matrices for `module` and `GO`. This
# still has one row per row of `df`, so a gene may appear more than once.
df_bin <- data.frame(df["gene"], fac2dense(df[["module"]]), fac2dense(df[["GO"]]))
df_bin
gene M1 M2 ciclo inflama sinapse
1 PPARG 1 0 0 1 0
2 RELA 1 0 0 1 0
3 RELA 1 0 1 0 0
4 IRF5 1 0 0 1 0
5 ACKR1 1 0 1 0 0
6 GATA3 0 1 0 0 1
# Collapse to one row per gene, keeping the maximum of each indicator column
# (1 if the gene had that value in any row).
aggregate(
  df_bin[setdiff(names(df_bin), "gene")],
  by = df_bin["gene"],
  FUN = max
)
gene M1 M2 ciclo inflama sinapse
1 ACKR1 1 0 1 0 0
2 GATA3 0 1 0 0 1
3 IRF5 1 0 0 1 0
4 PPARG 1 0 0 1 0
5 RELA 1 0 1 1 0
使用data.table
也是一种选择:
library(data.table)
setDT(df)
使用dcast()
聚合:
# One row per gene with a 0/1 column per GO term (DT1) and per module (DT2).
# The aggregator returns a single value per gene/level cell: 1L whenever the
# cell is non-empty, regardless of how many rows fall into it. Combinations
# that never occur are filled with 0L.
# Only presence matters, so `value.var` just names the column not used in the
# formula; stating it explicitly stops dcast() from guessing.
DT1 <- dcast(
  df, gene ~ GO,
  value.var = "module",
  fun.aggregate = \(x) as.integer(length(x) > 0L),
  fill = 0L
)
DT2 <- dcast(
  df, gene ~ module,
  value.var = "GO",
  fun.aggregate = \(x) as.integer(length(x) > 0L),
  fill = 0L
)
那么您可以加入表格:
DT3 = DT1[DT2, on = "gene"]
DT3
输出
gene ciclo inflama sinapse M1 M2
1: ACKR1 1 0 0 1 0
2: GATA3 0 0 1 0 1
3: IRF5 0 1 0 1 0
4: PPARG 0 1 0 1 0
5: RELA 1 1 0 1 0