使用 R 按条件统计特定字符串的出现次数,并据此生成新的数据框
Conditional occurrences of specific strings and relatively making a new data frame using R
我有一个包含 4 列和许多行的大数据框(示例已附上)。
# What I have: example input, one row per (Arm, Family, Sequence, Star_seq)
# record.
Arm <- c(
  "5prime", "3prime", "5prime", "CoMature",
  "3prime", "5prime", "3prime", "3prime"
)
Family <- c(
  "LET-7", "LET-7", "LET-7", "MIR-10",
  "MIR-103", "MIR-124", "MIR-124", "MIR-124"
)
Sequence <- c(
  "ATCGGCA", "ATGCTAC", "ATCGGCA", "ATCGTTT",
  "TGAGGAG", "TGATCAG", "AATTCAG", "AATTCAG"
)
Star_seq <- c(
  "TTCAGGT", "TATACTG", "TTCAGGT", "GAGATCA",
  "CAAAAGC", "CACATGC", "AATATGC", "AATATGC"
)
# Column names are spelled out; they equal the names of the vectors above.
my_data_frame <- data.frame(
  Arm = Arm,
  Family = Family,
  Sequence = Sequence,
  Star_seq = Star_seq
)
我想做的是:对 Family 列中的每个取值,统计 Arm 列中 '5prime'、'3prime' 和 'CoMature' 各自出现的次数;然后针对出现次数最多的那个臂('5prime'、'3prime' 或 'CoMature'),取出它在第三列和第四列(Sequence 和 Star_seq)中对应的值。总而言之,我需要一个最终结果,给出每个 Family 中出现最频繁的臂(Arm,第一列),以及该臂在第三列和第四列中对应的序列。
# What I want as output: one row per Family, holding the per-arm counts, the
# most frequent arm, and the Sequence / Star_seq that go with that arm.
# Counts are integers (not quoted strings) so they can be compared and summed.
five_prime_counts <- c(2L, 0L, 0L, 1L)
three_prime_counts <- c(1L, 0L, 1L, 2L)
CoMature_counts <- c(0L, 1L, 0L, 0L)
Arm_new <- c("5prime", "CoMature", "3prime", "3prime")
Family_new <- c("LET-7", "MIR-10", "MIR-103", "MIR-124")
Sequence_new <- c("ATCGGCA", "ATCGTTT", "TGAGGAG", "AATTCAG")
Star_seq_new <- c("TTCAGGT", "GAGATCA", "CAAAAGC", "AATATGC")
my_data_frame_new <- data.frame(
  five_prime_counts,
  three_prime_counts,
  CoMature_counts,
  Arm_new,
  Family_new,
  Sequence_new,
  Star_seq_new
)
我们可以为每个 Family 和 Arm 的组合添加一个计数变量,在每个 Family 内取计数最大值所对应的 Sequence、Star_seq 和 Arm,然后把数据转换为宽格式。
library(dplyr)
# For each Family, keep the Sequence / Star_seq / Arm of its most frequent arm
# and spread the per-arm counts into one column per arm.
my_data_frame %>%
  # n = number of rows sharing the same Family and Arm
  add_count(Family, Arm) %>%
  group_by(Family) %>%
  # which.max() returns the first maximum, so ties go to the arm seen first
  mutate(
    Sequence = Sequence[which.max(n)],
    Star_seq = Star_seq[which.max(n)],
    Arm_new = Arm[which.max(n)]
  ) %>%
  # drop the grouping so the result is not left grouped by Family
  ungroup() %>%
  distinct() %>%
  # one count column per Arm value; arms absent from a Family are filled with 0
  tidyr::pivot_wider(
    names_from = Arm,
    values_from = n,
    values_fill = list(n = 0)
  )
# Family Sequence Star_seq Arm_new `5prime` `3prime` CoMature
# <fct> <fct> <fct> <fct> <int> <int> <int>
#1 LET-7 ATCGGCA TTCAGGT 5prime 2 1 0
#2 MIR-10 ATCGTTT GAGATCA CoMature 0 0 1
#3 MIR-103 TGAGGAG CAAAAGC 3prime 0 1 0
#4 MIR-124 AATTCAG AATATGC 3prime 1 2 0
我有一个包含 4 列和许多行的大数据框(示例已附上)。
# What I have: example input, one row per (Arm, Family, Sequence, Star_seq)
# record.
Arm <- c(
  "5prime", "3prime", "5prime", "CoMature",
  "3prime", "5prime", "3prime", "3prime"
)
Family <- c(
  "LET-7", "LET-7", "LET-7", "MIR-10",
  "MIR-103", "MIR-124", "MIR-124", "MIR-124"
)
Sequence <- c(
  "ATCGGCA", "ATGCTAC", "ATCGGCA", "ATCGTTT",
  "TGAGGAG", "TGATCAG", "AATTCAG", "AATTCAG"
)
Star_seq <- c(
  "TTCAGGT", "TATACTG", "TTCAGGT", "GAGATCA",
  "CAAAAGC", "CACATGC", "AATATGC", "AATATGC"
)
# Column names are spelled out; they equal the names of the vectors above.
my_data_frame <- data.frame(
  Arm = Arm,
  Family = Family,
  Sequence = Sequence,
  Star_seq = Star_seq
)
我想做的是:对 Family 列中的每个取值,统计 Arm 列中 '5prime'、'3prime' 和 'CoMature' 各自出现的次数;然后针对出现次数最多的那个臂('5prime'、'3prime' 或 'CoMature'),取出它在第三列和第四列(Sequence 和 Star_seq)中对应的值。总而言之,我需要一个最终结果,给出每个 Family 中出现最频繁的臂(Arm,第一列),以及该臂在第三列和第四列中对应的序列。
# What I want as output: one row per Family, holding the per-arm counts, the
# most frequent arm, and the Sequence / Star_seq that go with that arm.
# Counts are integers (not quoted strings) so they can be compared and summed.
five_prime_counts <- c(2L, 0L, 0L, 1L)
three_prime_counts <- c(1L, 0L, 1L, 2L)
CoMature_counts <- c(0L, 1L, 0L, 0L)
Arm_new <- c("5prime", "CoMature", "3prime", "3prime")
Family_new <- c("LET-7", "MIR-10", "MIR-103", "MIR-124")
Sequence_new <- c("ATCGGCA", "ATCGTTT", "TGAGGAG", "AATTCAG")
Star_seq_new <- c("TTCAGGT", "GAGATCA", "CAAAAGC", "AATATGC")
my_data_frame_new <- data.frame(
  five_prime_counts,
  three_prime_counts,
  CoMature_counts,
  Arm_new,
  Family_new,
  Sequence_new,
  Star_seq_new
)
我们可以为每个 Family 和 Arm 的组合添加一个计数变量,在每个 Family 内取计数最大值所对应的 Sequence、Star_seq 和 Arm,然后把数据转换为宽格式。
library(dplyr)
# For each Family, keep the Sequence / Star_seq / Arm of its most frequent arm
# and spread the per-arm counts into one column per arm.
my_data_frame %>%
  # n = number of rows sharing the same Family and Arm
  add_count(Family, Arm) %>%
  group_by(Family) %>%
  # which.max() returns the first maximum, so ties go to the arm seen first
  mutate(
    Sequence = Sequence[which.max(n)],
    Star_seq = Star_seq[which.max(n)],
    Arm_new = Arm[which.max(n)]
  ) %>%
  # drop the grouping so the result is not left grouped by Family
  ungroup() %>%
  distinct() %>%
  # one count column per Arm value; arms absent from a Family are filled with 0
  tidyr::pivot_wider(
    names_from = Arm,
    values_from = n,
    values_fill = list(n = 0)
  )
# Family Sequence Star_seq Arm_new `5prime` `3prime` CoMature
# <fct> <fct> <fct> <fct> <int> <int> <int>
#1 LET-7 ATCGGCA TTCAGGT 5prime 2 1 0
#2 MIR-10 ATCGTTT GAGATCA CoMature 0 0 1
#3 MIR-103 TGAGGAG CAAAAGC 3prime 0 1 0
#4 MIR-124 AATTCAG AATATGC 3prime 1 2 0