使用列作为行名称并将其他列设置为基于字符串的二进制值

Question

我需要将其中一列设置为行名，以解决重复问题，如果其他列与行名相关，则应使用唯一值将其分隔为假定为二进制值的新列。

示例：

原始数据框

# Example input in long format: one row per (module, GO term, gene) association.
# A gene can appear on several rows (RELA appears twice).
df <- data.frame(module = c("M1","M1","M1","M1","M1","M2"),
                  GO = c("inflama","inflama","ciclo","inflama","ciclo","sinapse"),
                  gene = c("PPARG","RELA","RELA","IRF5","ACKR1","GATA3"))

> df
  module      GO  gene
1     M1 inflama PPARG
2     M1 inflama  RELA
3     M1   ciclo  RELA
4     M1 inflama  IRF5
5     M1   ciclo ACKR1
6     M2 sinapse GATA3

最终数据框

# Desired result: one row per unique gene, with a 0/1 indicator column for
# every module and every GO term that occurs in df.
df2 <- data.frame(gene = c("PPARG","RELA","IRF5","ACKR1","GATA3"),
                   M1 = c(1,1,1,1,0),
                   M2 = c(0,0,0,0,1),
                   inflama = c(1,1,1,0,0),
                   ciclo = c(0,1,0,1,0), sinapse = c(0,0,0,0,1))
> df2
   gene M1 M2 inflama ciclo sinapse
1 PPARG  1  0       1     0       0
2  RELA  1  0       1     1       0
3  IRF5  1  0       1     0       0
4 ACKR1  1  0       0     1       0
5 GATA3  0  1       0     0       1

预先感谢您的帮助！

Answer 1

我们可以使用 pivot_wider（来自 tidyr）重塑或使用 dummy_cols（来自 fastDummies）

library(fastDummies)
library(dplyr)
library(stringr)
# Build one 0/1 indicator column per module and per GO term, then collapse
# the rows so that each gene appears exactly once.
df %>%
  # dummy_cols() names every indicator "<source column>_<level>".
  dummy_cols(c("module", "GO"), remove_selected_columns = TRUE) %>%
  group_by(gene) %>%
  # A gene gets 1 as soon as any of its rows has the indicator set.
  summarise(across(everything(), ~ as.integer(any(.x)))) %>%
  # Strip only the leading "module_" / "GO_" prefix, so level names that
  # themselves contain an underscore are kept intact.
  rename_with(
    ~ str_remove(.x, "^(module|GO)_"),
    starts_with(c("module_", "GO_"), ignore.case = FALSE)
  )

-输出

# A tibble: 5 × 6
  gene     M1    M2 ciclo inflama sinapse
  <chr> <int> <int> <int>   <int>   <int>
1 ACKR1     1     0     1       0       0
2 GATA3     0     1     0       0       1
3 IRF5      1     0     0       1       0
4 PPARG     1     0     0       1       0
5 RELA      1     0     1       1       0

Answer 2

您应该了解一下 dplyr、tidyr 和 tidyverse 这几个库，它们适合在 R 中进行数据整理。

## Loading the required libraries 
library(dplyr)
library(tidyr)
library(tidyverse)

## Creating a binary (0/1) pivot table
df_pivot <- df %>%
  ## Wider to longer: one row per (gene, attribute value) occurrence
  pivot_longer(!gene, names_to = "Attributes", values_to = "AttributeValue") %>%
  ## Grouping
  group_by(gene, AttributeValue) %>%
  ## Aggregation: flag presence with 1 instead of counting rows with n(), so a
  ## gene listed several times for the same value still gets 1, not a count
  summarise(Count = 1L, .groups = "drop_last") %>%
  ## Longer to wider; combinations that never occur are filled with 0, not NA
  pivot_wider(names_from = AttributeValue, values_from = Count,
              values_fill = 0L)

df_pivot
# A tibble: 5 x 6
# Groups:   gene [5]
  gene  ciclo    M1    M2 sinapse inflama
  <chr> <int> <int> <int>   <int>   <int>
1 ACKR1     1     1     0       0       0
2 GATA3     0     0     1       1       0
3 IRF5      0     1     0       0       1
4 PPARG     0     1     0       0       1
5 RELA      1     1     0       0       1

Answer 3

我认为在这里调用 pivot_wider 两次就足够了。

library(tidyr)

# Spread the module labels into indicator columns, then do the same for the
# GO terms: every existing cell becomes 1 and every absent one is filled with 0.
df |>
  pivot_wider(
    names_from = module,
    values_from = module,
    values_fn = function(cell) 1,
    values_fill = 0
  ) |>
  pivot_wider(
    names_from = GO,
    values_from = GO,
    values_fn = function(cell) 1,
    values_fill = 0
  )

# A tibble: 5 × 6
  gene     M1    M2 inflama ciclo sinapse
  <chr> <dbl> <dbl>   <dbl> <dbl>   <dbl>
1 PPARG     1     0       1     0       0
2 RELA      1     0       1     1       0
3 IRF5      1     0       1     0       0
4 ACKR1     1     0       0     1       0
5 GATA3     0     1       0     0       1

如果我们想让它更简单一点，可以定义一个用于透视（pivot）的辅助函数：

# Spread the column named by `y` into one indicator column per distinct value.
#   x: a data frame.
#   y: a string naming the column of `x` to spread.
# Returns the widened data frame: 1 where a value was present, 0 elsewhere.
pivoting <- function(x, y) {
  pivot_wider(
    x,
    # all_of() marks `y` as an external character vector, so it cannot be
    # mistaken for a column of `x` that happens to be called "y".
    names_from = all_of(y),
    values_from = all_of(y),
    values_fill = 0,
    values_fn = \(cell) 1
  )
}
df %>%
    pivoting('module') %>%
    pivoting('GO')

但我认为这太麻烦了，因为已经有一个包。正如@akrun 所提到的，我会选择 dummy_cols

Answer 4

Matrix 包有一个非常有效的 fac2sparse 函数用于此目的。如果你从 CRAN 下载 R，那么你已经安装了它。以下是其用法示例：

library("Matrix")
f <- gl(3, 2L, labels = letters[1:3])
f
## [1] a a b b c c
## Levels: a b c

fac2sparse(f)
## 3 x 6 sparse Matrix of class "dgCMatrix"
##              
## a 1 1 . . . .
## b . . 1 1 . .
## c . . . . 1 1

要将生成的矩阵包含在长格式数据框中，您可以先用 t 将其转置，再用 as(Class="matrix") 将其从稀疏格式强制转换为密集格式：

# Turn a factor into a dense 0/1 indicator matrix with one row per element
# and one column per level.
fac2dense <- function(x) {
  by_level <- fac2sparse(x)    # sparse, one row per level
  by_element <- t(by_level)    # transposed: one row per element
  as(by_element, "matrix")     # coerce from sparse to dense
}
data.frame(f, fac2dense(f))
##   f a b c
## 1 a 1 0 0
## 2 a 1 0 0
## 3 b 0 1 0
## 4 b 0 1 0
## 5 c 0 0 1
## 6 c 0 0 1

将其付诸实践，我将分两步转换您的数据框 df，如下所示：

# Bind the gene column to the module and GO indicator matrices. The result
# still has one row per row of df, so repeated genes are not merged yet.
df_bin <- data.frame(df["gene"], fac2dense(df[["module"]]), fac2dense(df[["GO"]]))
df_bin

   gene M1 M2 ciclo inflama sinapse
1 PPARG  1  0     0       1       0
2  RELA  1  0     0       1       0
3  RELA  1  0     1       0       0
4  IRF5  1  0     0       1       0
5 ACKR1  1  0     1       0       0
6 GATA3  0  1     0       0       1

# Collapse to one row per gene, keeping the largest value of every indicator.
aggregate(. ~ gene, data = df_bin, FUN = max)

   gene M1 M2 ciclo inflama sinapse
1 ACKR1  1  0     1       0       0
2 GATA3  0  1     0       0       1
3  IRF5  1  0     0       1       0
4 PPARG  1  0     0       1       0
5  RELA  1  0     1       1       0

Answer 5

使用data.table也是一种选择：

library(data.table)

setDT(df)

使用dcast()聚合：

# Presence/absence of each GO term per gene: every gene/GO pair that occurs
# in df yields 1, and fill = 0L covers the pairs that never occur.
DT1 <- dcast(df, gene ~ GO, value.var = "gene", fill = 0L,
             fun.aggregate = \(x) as.integer(length(x) > 0L))

# Same for modules. The aggregating function must return a single value per
# cell; an element-wise function such as ifelse() would return one value per
# matching row instead (two for RELA in M1).
DT2 <- dcast(df, gene ~ module, value.var = "gene", fill = 0L,
             fun.aggregate = \(x) as.integer(length(x) > 0L))

然后您可以按 gene 列连接（join）这两个表：

DT3 = DT1[DT2, on = "gene"]

DT3

输出

    gene ciclo inflama sinapse M1 M2
1: ACKR1     1       0       0  1  0
2: GATA3     0       0       1  0  1
3:  IRF5     0       1       0  1  0
4: PPARG     0       1       0  1  0
5:  RELA     1       1       0  1  0

使用列作为行名称并将其他列设置为基于字符串的二进制值

Using a column as row names and set the other columns as binary values based on strings

r

reshape

dataframe