计算每组的 p 值
Calculate p-values for each group
我有一个像这样的长数据框:
# Example data: 20 measurements of genes A, B and C in two tumor types.
df <- structure(
  list(
    # Factor codes 1 and 2, ten rows each; "healthy" and "sarcoma" are
    # declared levels that no row uses.
    Tumor_type = structure(
      rep(1:2, each = 10L),
      .Label = c("epithelial", "lymphoma", "healthy", "sarcoma"),
      class = "factor"
    ),
    # Genes cycle A, B, C over the ten rows of each tumor type.
    Gene = rep(rep_len(c("A", "B", "C"), 10L), times = 2L),
    value = c(
      55.9228661170814, 63.4145784380524, 207.26299945198,
      14.3567918830159, 55.3521592504006, 185.331104118272,
      27.6007163612577, 239.913726358976, 153.875266787649,
      13.5284557778013, 22.5884252969717, 81.9416366341296,
      197.881154317385, 13.3558001634159, 167.58802932121,
      79.1822202852964, 37.9496510246124, 85.245769235316,
      116.807115796469, 21.8110255970795
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
>df
Tumor_type Gene value
<fct> <chr> <dbl>
epithelial A 55.92287
epithelial B 63.41458
epithelial C 207.26300
epithelial A 14.35679
epithelial B 55.35216
epithelial C 185.33110
epithelial A 27.60072
epithelial B 239.91373
epithelial C 153.87527
epithelial A 13.52846
lymphoma A 22.58843
lymphoma B 81.94164
lymphoma C 197.88115
lymphoma A 13.35580
lymphoma B 167.58803
lymphoma C 79.18222
lymphoma A 37.94965
lymphoma B 85.24577
lymphoma C 116.80712
lymphoma A 21.81103
我想用 Log2FC 和来自 t 检验的相应 p 值创建一个数据框。现在我只能使用 Log2FC 值创建数据框,如下所示:
# Per-gene log2 fold change: mean lymphoma value over mean epithelial value.
# Each gene needs exactly one mean per tumor type for the lookup below.
LFC <- df %>%
  group_by(Tumor_type, Gene) %>%
  # .groups = "drop" returns an ungrouped tibble without the regrouping message
  summarise(mean = mean(value), .groups = "drop") %>%
  group_by(Gene) %>%
  # Select the two means by tumor type rather than by row position, so the
  # result does not depend on factor level order or on extra tumor types.
  mutate(
    lfc = log2(
      mean[Tumor_type == "lymphoma"] / mean[Tumor_type == "epithelial"]
    )
  ) %>%
  ungroup() %>%
  # Keep one row per gene, labelled with the reference tumor type.
  filter(Tumor_type == "epithelial") %>%
  select(-mean)
>LFC
Tumor_type Gene lfc
<fct> <chr> <dbl>
epithelial A -0.2191989
epithelial B -0.0995055
epithelial C -0.4724193
我想再添加一列相应的 p 值;这些 p 值我可以像下面这样逐个手动计算:
# Welch two-sample t-test for gene A: epithelial vs. lymphoma values.
with(
  df,
  t.test(
    value[Tumor_type == "epithelial" & Gene == "A"],
    value[Tumor_type == "lymphoma" & Gene == "A"]
  )
)$p.value
0.74
所以问题是:怎样才能创建出带有这样一列 p 值的最终表格?
Tumor_type Gene lfc p-value
<fct> <chr> <dbl> <dbl>
epithelial A -0.2191989 0.7404867
epithelial B -0.0995055 0.9125193
epithelial C -0.4724193 0.2834448
一个更简单的做法是先用 pivot_wider 重塑为“宽”格式,然后按 Gene 分组并用 summarise 进行汇总
library(dplyr)
library(tidyr)
library(data.table)
df %>%
  # Number the replicates within each Tumor_type/Gene pair so the two tumor
  # types line up per gene regardless of the row order in df.
  mutate(rn = rowid(Tumor_type, Gene)) %>%
  # Reshape to wide: one value column per tumor type.
  pivot_wider(names_from = Tumor_type, values_from = value) %>%
  group_by(Gene) %>%
  summarise(
    # log2 of mean(lymphoma) / mean(epithelial); na.rm = TRUE skips the NA
    # padding pivot_wider adds when the replicate counts differ, matching
    # t.test(), which drops NAs itself.
    lfc = log2(mean(lymphoma, na.rm = TRUE) / mean(epithelial, na.rm = TRUE)),
    # Welch two-sample t-test between the two tumor-type columns.
    p_value = t.test(epithelial, lymphoma)$p.value
  ) %>%
  # Label every row with the reference tumor type, as the first column.
  mutate(Tumor_type = "epithelial", .before = 1)
# A tibble: 3 × 4
Tumor_type Gene lfc p_value
<chr> <chr> <dbl> <dbl>
1 epithelial A -0.219 0.740
2 epithelial B -0.0995 0.913
3 epithelial C -0.472 0.283
此外,另一种可能的解决方案是将所有值堆叠为一个列表,以便调用 t.test。这有点像使用 pivot_wider 但代码更少。
# Collect the values of every Tumor_type/Gene group into a list column, then
# compare the two tumor types within each gene.
df %>%
  group_by(Tumor_type, Gene) %>%
  # .groups = "drop" returns an ungrouped tibble without the regrouping message
  summarise(values = list(value), .groups = "drop") %>%
  group_by(Gene) %>%
  # Pick each value vector by tumor type rather than by list position, so the
  # result does not depend on factor level order or on extra tumor types.
  summarise(
    lfc = log2(
      mean(values[[which(Tumor_type == "lymphoma")]]) /
        mean(values[[which(Tumor_type == "epithelial")]])
    ),
    p_value = t.test(
      values[[which(Tumor_type == "epithelial")]],
      values[[which(Tumor_type == "lymphoma")]]
    )$p.value
  )
输出:
# A tibble: 3 x 3
Gene lfc p_value
<chr> <dbl> <dbl>
1 A -0.219 0.740
2 B -0.0995 0.913
3 C -0.472 0.283
我有一个像这样的长数据框:
# Example data: 20 measurements of genes A, B and C in two tumor types.
df <- structure(
  list(
    # Factor codes 1 and 2, ten rows each; "healthy" and "sarcoma" are
    # declared levels that no row uses.
    Tumor_type = structure(
      rep(1:2, each = 10L),
      .Label = c("epithelial", "lymphoma", "healthy", "sarcoma"),
      class = "factor"
    ),
    # Genes cycle A, B, C over the ten rows of each tumor type.
    Gene = rep(rep_len(c("A", "B", "C"), 10L), times = 2L),
    value = c(
      55.9228661170814, 63.4145784380524, 207.26299945198,
      14.3567918830159, 55.3521592504006, 185.331104118272,
      27.6007163612577, 239.913726358976, 153.875266787649,
      13.5284557778013, 22.5884252969717, 81.9416366341296,
      197.881154317385, 13.3558001634159, 167.58802932121,
      79.1822202852964, 37.9496510246124, 85.245769235316,
      116.807115796469, 21.8110255970795
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
>df
Tumor_type Gene value
<fct> <chr> <dbl>
epithelial A 55.92287
epithelial B 63.41458
epithelial C 207.26300
epithelial A 14.35679
epithelial B 55.35216
epithelial C 185.33110
epithelial A 27.60072
epithelial B 239.91373
epithelial C 153.87527
epithelial A 13.52846
lymphoma A 22.58843
lymphoma B 81.94164
lymphoma C 197.88115
lymphoma A 13.35580
lymphoma B 167.58803
lymphoma C 79.18222
lymphoma A 37.94965
lymphoma B 85.24577
lymphoma C 116.80712
lymphoma A 21.81103
我想用 Log2FC 和来自 t 检验的相应 p 值创建一个数据框。现在我只能使用 Log2FC 值创建数据框,如下所示:
# Per-gene log2 fold change: mean lymphoma value over mean epithelial value.
# Each gene needs exactly one mean per tumor type for the lookup below.
LFC <- df %>%
  group_by(Tumor_type, Gene) %>%
  # .groups = "drop" returns an ungrouped tibble without the regrouping message
  summarise(mean = mean(value), .groups = "drop") %>%
  group_by(Gene) %>%
  # Select the two means by tumor type rather than by row position, so the
  # result does not depend on factor level order or on extra tumor types.
  mutate(
    lfc = log2(
      mean[Tumor_type == "lymphoma"] / mean[Tumor_type == "epithelial"]
    )
  ) %>%
  ungroup() %>%
  # Keep one row per gene, labelled with the reference tumor type.
  filter(Tumor_type == "epithelial") %>%
  select(-mean)
>LFC
Tumor_type Gene lfc
<fct> <chr> <dbl>
epithelial A -0.2191989
epithelial B -0.0995055
epithelial C -0.4724193
我想再添加一列相应的 p 值;这些 p 值我可以像下面这样逐个手动计算:
# Welch two-sample t-test for gene A: epithelial vs. lymphoma values.
with(
  df,
  t.test(
    value[Tumor_type == "epithelial" & Gene == "A"],
    value[Tumor_type == "lymphoma" & Gene == "A"]
  )
)$p.value
0.74
所以问题是:怎样才能创建出带有这样一列 p 值的最终表格?
Tumor_type Gene lfc p-value
<fct> <chr> <dbl> <dbl>
epithelial A -0.2191989 0.7404867
epithelial B -0.0995055 0.9125193
epithelial C -0.4724193 0.2834448
一个更简单的做法是先用 pivot_wider 重塑为“宽”格式,然后按 Gene 分组并用 summarise 进行汇总
library(dplyr)
library(tidyr)
library(data.table)
df %>%
  # Number the replicates within each Tumor_type/Gene pair so the two tumor
  # types line up per gene regardless of the row order in df.
  mutate(rn = rowid(Tumor_type, Gene)) %>%
  # Reshape to wide: one value column per tumor type.
  pivot_wider(names_from = Tumor_type, values_from = value) %>%
  group_by(Gene) %>%
  summarise(
    # log2 of mean(lymphoma) / mean(epithelial); na.rm = TRUE skips the NA
    # padding pivot_wider adds when the replicate counts differ, matching
    # t.test(), which drops NAs itself.
    lfc = log2(mean(lymphoma, na.rm = TRUE) / mean(epithelial, na.rm = TRUE)),
    # Welch two-sample t-test between the two tumor-type columns.
    p_value = t.test(epithelial, lymphoma)$p.value
  ) %>%
  # Label every row with the reference tumor type, as the first column.
  mutate(Tumor_type = "epithelial", .before = 1)
# A tibble: 3 × 4
Tumor_type Gene lfc p_value
<chr> <chr> <dbl> <dbl>
1 epithelial A -0.219 0.740
2 epithelial B -0.0995 0.913
3 epithelial C -0.472 0.283
此外,另一种可能的解决方案是将所有值堆叠为一个列表,以便调用 t.test。这有点像使用 pivot_wider 但代码更少。
# Collect the values of every Tumor_type/Gene group into a list column, then
# compare the two tumor types within each gene.
df %>%
  group_by(Tumor_type, Gene) %>%
  # .groups = "drop" returns an ungrouped tibble without the regrouping message
  summarise(values = list(value), .groups = "drop") %>%
  group_by(Gene) %>%
  # Pick each value vector by tumor type rather than by list position, so the
  # result does not depend on factor level order or on extra tumor types.
  summarise(
    lfc = log2(
      mean(values[[which(Tumor_type == "lymphoma")]]) /
        mean(values[[which(Tumor_type == "epithelial")]])
    ),
    p_value = t.test(
      values[[which(Tumor_type == "epithelial")]],
      values[[which(Tumor_type == "lymphoma")]]
    )$p.value
  )
输出:
# A tibble: 3 x 3
Gene lfc p_value
<chr> <dbl> <dbl>
1 A -0.219 0.740
2 B -0.0995 0.913
3 C -0.472 0.283