将成对距离 table 转换为仅两列中个体的距离列表
Convert pairwise distance table to list of distance for individuals in only two columns
我想将成对距离表(每行的观察值分布在 2 列中)转换为按个体逐行列出的表(观察值只在 1 列中)。基本上,成对关系的信息会丢失(反正这与我的分析无关),并且每个距离值需要在对应的两行中各出现一次。
我可以用这段代码分隔字符串:
# Split every comparison string on single spaces into exactly four pieces and
# put the distances back next to them (columns X1..X4 and pairwise.dist).
pairwise_readout <- data.frame(
  str_split_fixed(pairwise[, 1], " ", 4),
  pairwise.dist = pairwise$dist
)
但不知道如何继续将 table 重新排列到更少的列中。所有搜索结果仅显示成对的 table 相关解决方案。
这是一个示例数据集:
需要注意的重要一点是,我也对每次观察的字符串中包含的 'gr#' 感兴趣。
# Sample data: each row holds one pairwise comparison encoded as
# "<individual 1> <group 1> <individual 2> <group 2>" plus its distance.
pairwise <- data.frame(
  ind_comp = c(
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP5046___e gr5 OP5048___g gr2",
    "OP2413iiia gr1 OP5048___g gr2",
    "OP5043___b gr1 OP5048___g gr2",
    "OP3088i___a gr1 OP5048___g gr2",
    "OP5046___a gr5 OP5048___g gr2",
    "OP5048___b gr5 OP5048___g gr2",
    "OP5043___a gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5044___c gr2",
    "OP2413iiib gr4 OP5048___g gr2",
    "OP5046___c gr1 OP5048___g gr2"
  ),
  dist = c(
    7.590363, 6.449676, 6.419955, 6.349918, 6.182623, 6.162655,
    6.154232, 6.140147, 6.058633, 5.962923, 5.943956, 5.863753
  )
)
基本上我想要一个遵循这种形式的table:
# Desired result: one row per individual, with its group and the distance of
# the pair it came from (so every distance appears on two consecutive rows).
pairwise_table_less_columns <- data.frame(
  ind_comp = c(
    "OP2645ii_d", "OP5048___g",
    "OP5046___e", "OP5048___g",
    "OP2413iiia", "OP5048___g",
    "OP5043___b", "OP5048___g",
    "OP3088i___a", "OP5048___g",
    "OP5046___a", "OP5048___g",
    "OP5048___b", "OP5048___g",
    "OP5043___a", "OP5048___g",
    "OP2645ii_d", "OP5048___g",
    "OP2645ii_d", "OP5044___c",
    "OP2413iiib", "OP5048___g",
    "OP5046___c", "OP5048___g"
  ),
  gr = c(
    "gr3", "gr2", "gr5", "gr2", "gr1", "gr2", "gr1", "gr2",
    "gr1", "gr2", "gr5", "gr2", "gr5", "gr2", "gr3", "gr2",
    "gr3", "gr2", "gr3", "gr2", "gr4", "gr2", "gr1", "gr2"
  ),
  # Each pair distance is listed twice in a row, once per individual.
  dist = rep(
    c(
      7.590363, 6.449676, 6.419955, 6.349918, 6.182623, 6.162655,
      6.154232, 6.140147, 6.058633, 5.962923, 5.943956, 5.863753
    ),
    each = 2
  )
)
我们可以使用 dplyr 和 tidyr。首先用 separate 将 ind_comp 按空格拆分成 4 个不同的列,用 gather 转换为长格式,从 key 列中删除数字,使它们具有相同的名称,再使用 row_number() 创建一个公共标识符,然后用 spread 转换为宽格式。
library(dplyr)
library(tidyr)
# Reshape the pairwise table into one row per individual (columns dist, gr,
# ind_comp); each distance is repeated for both members of its pair.
pairwise %>%
  # Split "id1 grA id2 grB" on runs of whitespace into four columns. The
  # backslash is doubled because "\s" is not a valid escape in an R string.
  separate(ind_comp, c("ind_comp1", "gr1", "ind_comp2", "gr2"), sep = "\\s+") %>%
  # Stack the four columns into key/value pairs, keeping dist on every row.
  gather(key, value, -dist) %>%
  # Drop the trailing digit so the keys collapse to "ind_comp" and "gr".
  mutate(key = sub("\\d+", "", key)) %>%
  # Number the rows within each key; the n-th "ind_comp" and the n-th "gr"
  # belong to the same individual, so this index pairs them up again.
  group_by(key) %>%
  mutate(row = row_number()) %>%
  # Back to wide format: one column per key, one row per (dist, row).
  spread(key, value) %>%
  dplyr::select(-row)
# A tibble: 24 x 3
# dist gr ind_comp
# <dbl> <chr> <chr>
# 1 5.86 gr1 OP5046___c
# 2 5.86 gr2 OP5048___g
# 3 5.94 gr4 OP2413iiib
# 4 5.94 gr2 OP5048___g
# 5 5.96 gr3 OP2645ii_d
# 6 5.96 gr2 OP5044___c
# 7 6.06 gr3 OP2645ii_d
# 8 6.06 gr2 OP5048___g
# 9 6.14 gr3 OP5043___a
#10 6.14 gr2 OP5048___g
# … with 14 more rows
这是一个基本的 R 解决方案。将数据框 pairwise_readout 按列分成两部分,然后用 rbind 将它们合并。中间有一些步骤用于确保列名一致并对结果排序。
# First individual of every pair: columns 1-2 plus the distance (column 5).
tmp1 <- setNames(pairwise_readout[c(1, 2, 5)], c("ind_comp", "gr", "dist"))
# Second individual of every pair: columns 3-4 plus the same distance.
tmp2 <- setNames(pairwise_readout[c(3, 4, 5)], c("ind_comp", "gr", "dist"))

# Tag both halves with the index of the pair they came from.
tmp1$id <- seq_len(nrow(tmp1))
tmp2$id <- seq_len(nrow(tmp1))

# Stack the halves, then sort by pair index so the two members of a pair end
# up on adjacent rows; the helper id column is left out of the result.
tmp <- rbind(tmp1, tmp2)
result <- tmp[order(tmp$id), c("ind_comp", "gr", "dist")]
最后清理。
# Remove the intermediate data frames; only `result` is kept.
rm(list = c("tmp", "tmp1", "tmp2"))
另一个想法是把第二个空格替换成另一个分隔符,然后按该分隔符拆分成多行,即
library(dplyr)
library(tidyr)
# Turn every pair into two rows by replacing the space between the two
# "individual group" halves with "|" and splitting on it.
pairwise %>%
  # Capture "<id> <gr>" followed by a space and put it back with "|" in place
  # of that space. The backreference must be written '\\1': a bare '\1' is the
  # octal escape for the character with code 1, which would discard the
  # captured text.
  mutate(ind_comp = gsub('([^ ]+ [^ ]+) ', '\\1|', ind_comp)) %>%
  # One output row per "|"-separated piece; dist is repeated for each piece.
  separate_rows(ind_comp, sep = '[|]')
这给出了,
ind_comp dist
1 OP2645ii_d gr3 7.590363
2 OP5048___g gr2 7.590363
3 OP5046___e gr5 6.449676
4 OP5048___g gr2 6.449676
5 OP2413iiia gr1 6.419955
6 OP5048___g gr2 6.419955
7 OP5043___b gr1 6.349918
8 OP5048___g gr2 6.349918
9 OP3088i___a gr1 6.182623
10 OP5048___g gr2 6.182623
11 OP5046___a gr5 6.162655
12 OP5048___g gr2 6.162655
13 OP5048___b gr5 6.154232
14 OP5048___g gr2 6.154232
15 OP5043___a gr3 6.140147
16 OP5048___g gr2 6.140147
17 OP2645ii_d gr3 6.058633
18 OP5048___g gr2 6.058633
19 OP2645ii_d gr3 5.962923
20 OP5044___c gr2 5.962923
21 OP2413iiib gr4 5.943956
22 OP5048___g gr2 5.943956
23 OP5046___c gr1 5.863753
24 OP5048___g gr2 5.863753
我来晚了,但这是我的解决方案:
library("stringr") #For str_split
# Sample data: each row holds one pairwise comparison encoded as
# "<individual 1> <group 1> <individual 2> <group 2>" plus its distance.
pairwise <- data.frame(
  ind_comp = c(
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP5046___e gr5 OP5048___g gr2",
    "OP2413iiia gr1 OP5048___g gr2",
    "OP5043___b gr1 OP5048___g gr2",
    "OP3088i___a gr1 OP5048___g gr2",
    "OP5046___a gr5 OP5048___g gr2",
    "OP5048___b gr5 OP5048___g gr2",
    "OP5043___a gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5044___c gr2",
    "OP2413iiib gr4 OP5048___g gr2",
    "OP5046___c gr1 OP5048___g gr2"
  ),
  dist = c(
    7.590363, 6.449676, 6.419955, 6.349918, 6.182623, 6.162655,
    6.154232, 6.140147, 6.058633, 5.962923, 5.943956, 5.863753
  )
)
# Work on plain character strings (data.frame() produced factors before R 4.0).
pairwise$ind_comp <- as.character(pairwise$ind_comp)

# Split every "id1 grA id2 grB" string at the whitespace that follows the first
# "gr#" token, giving the halves "id1 grA" and "id2 grB". Backslashes are
# doubled because "\s" is not a valid escape in an R string literal.
halves <- str_split(pairwise$ind_comp, "(?<=\\s[a-z]{2}[0-9]{1})\\s")
pairwise$ind_comp2 <- vapply(halves, "[", character(1), 2) # second individual
pairwise$ind_comp <- vapply(halves, "[", character(1), 1) # first individual

# Stack the second individuals (with their distances) below the first ones.
tmp_pairwise <- data.frame(
  ind_comp = pairwise$ind_comp2,
  dist = as.numeric(pairwise$dist),
  stringsAsFactors = FALSE
)
pairwise <- pairwise[, -3] # drop the helper column ind_comp2 (third column)
pairwise <- rbind(pairwise, tmp_pairwise)

# Separate "id grA" into the individual and its group. Splitting on the
# whitespace itself (rather than on a zero-width lookbehind after it) keeps the
# separator out of both pieces, so ind_comp carries no trailing space.
id_and_gr <- str_split(pairwise$ind_comp, "\\s")
pairwise$gr <- vapply(id_and_gr, "[", character(1), 2) # group column
pairwise$ind_comp <- vapply(id_and_gr, "[", character(1), 1) # individual only

# Remove the temporary objects; only `pairwise` is kept.
rm(tmp_pairwise, halves, id_and_gr)
head(pairwise)
ind_comp dist gr
1 OP2645ii_d 7.590363 gr3
2 OP5046___e 6.449676 gr5
3 OP2413iiia 6.419955 gr1
4 OP5043___b 6.349918 gr1
5 OP3088i___a 6.182623 gr1
6 OP5046___a 6.162655 gr5
我想将成对距离表(每行的观察值分布在 2 列中)转换为按个体逐行列出的表(观察值只在 1 列中)。基本上,成对关系的信息会丢失(反正这与我的分析无关),并且每个距离值需要在对应的两行中各出现一次。
我可以用这段代码分隔字符串:
# Split every comparison string on single spaces into exactly four pieces and
# put the distances back next to them (columns X1..X4 and pairwise.dist).
pairwise_readout <- data.frame(
  str_split_fixed(pairwise[, 1], " ", 4),
  pairwise.dist = pairwise$dist
)
但不知道如何继续将 table 重新排列到更少的列中。所有搜索结果仅显示成对的 table 相关解决方案。
这是一个示例数据集:
需要注意的重要一点是,我也对每次观察的字符串中包含的 'gr#' 感兴趣。
# Sample data: each row holds one pairwise comparison encoded as
# "<individual 1> <group 1> <individual 2> <group 2>" plus its distance.
pairwise <- data.frame(
  ind_comp = c(
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP5046___e gr5 OP5048___g gr2",
    "OP2413iiia gr1 OP5048___g gr2",
    "OP5043___b gr1 OP5048___g gr2",
    "OP3088i___a gr1 OP5048___g gr2",
    "OP5046___a gr5 OP5048___g gr2",
    "OP5048___b gr5 OP5048___g gr2",
    "OP5043___a gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5044___c gr2",
    "OP2413iiib gr4 OP5048___g gr2",
    "OP5046___c gr1 OP5048___g gr2"
  ),
  dist = c(
    7.590363, 6.449676, 6.419955, 6.349918, 6.182623, 6.162655,
    6.154232, 6.140147, 6.058633, 5.962923, 5.943956, 5.863753
  )
)
基本上我想要一个遵循这种形式的table:
# Desired result: one row per individual, with its group and the distance of
# the pair it came from (so every distance appears on two consecutive rows).
pairwise_table_less_columns <- data.frame(
  ind_comp = c(
    "OP2645ii_d", "OP5048___g",
    "OP5046___e", "OP5048___g",
    "OP2413iiia", "OP5048___g",
    "OP5043___b", "OP5048___g",
    "OP3088i___a", "OP5048___g",
    "OP5046___a", "OP5048___g",
    "OP5048___b", "OP5048___g",
    "OP5043___a", "OP5048___g",
    "OP2645ii_d", "OP5048___g",
    "OP2645ii_d", "OP5044___c",
    "OP2413iiib", "OP5048___g",
    "OP5046___c", "OP5048___g"
  ),
  gr = c(
    "gr3", "gr2", "gr5", "gr2", "gr1", "gr2", "gr1", "gr2",
    "gr1", "gr2", "gr5", "gr2", "gr5", "gr2", "gr3", "gr2",
    "gr3", "gr2", "gr3", "gr2", "gr4", "gr2", "gr1", "gr2"
  ),
  # Each pair distance is listed twice in a row, once per individual.
  dist = rep(
    c(
      7.590363, 6.449676, 6.419955, 6.349918, 6.182623, 6.162655,
      6.154232, 6.140147, 6.058633, 5.962923, 5.943956, 5.863753
    ),
    each = 2
  )
)
我们可以使用 dplyr 和 tidyr。首先用 separate 将 ind_comp 按空格拆分成 4 个不同的列,用 gather 转换为长格式,从 key 列中删除数字,使它们具有相同的名称,再使用 row_number() 创建一个公共标识符,然后用 spread 转换为宽格式。
library(dplyr)
library(tidyr)
# Reshape the pairwise table into one row per individual (columns dist, gr,
# ind_comp); each distance is repeated for both members of its pair.
pairwise %>%
  # Split "id1 grA id2 grB" on runs of whitespace into four columns. The
  # backslash is doubled because "\s" is not a valid escape in an R string.
  separate(ind_comp, c("ind_comp1", "gr1", "ind_comp2", "gr2"), sep = "\\s+") %>%
  # Stack the four columns into key/value pairs, keeping dist on every row.
  gather(key, value, -dist) %>%
  # Drop the trailing digit so the keys collapse to "ind_comp" and "gr".
  mutate(key = sub("\\d+", "", key)) %>%
  # Number the rows within each key; the n-th "ind_comp" and the n-th "gr"
  # belong to the same individual, so this index pairs them up again.
  group_by(key) %>%
  mutate(row = row_number()) %>%
  # Back to wide format: one column per key, one row per (dist, row).
  spread(key, value) %>%
  dplyr::select(-row)
# A tibble: 24 x 3
# dist gr ind_comp
# <dbl> <chr> <chr>
# 1 5.86 gr1 OP5046___c
# 2 5.86 gr2 OP5048___g
# 3 5.94 gr4 OP2413iiib
# 4 5.94 gr2 OP5048___g
# 5 5.96 gr3 OP2645ii_d
# 6 5.96 gr2 OP5044___c
# 7 6.06 gr3 OP2645ii_d
# 8 6.06 gr2 OP5048___g
# 9 6.14 gr3 OP5043___a
#10 6.14 gr2 OP5048___g
# … with 14 more rows
这是一个基本的 R 解决方案。将数据框 pairwise_readout 按列分成两部分,然后用 rbind 将它们合并。中间有一些步骤用于确保列名一致并对结果排序。
# First individual of every pair: columns 1-2 plus the distance (column 5).
tmp1 <- setNames(pairwise_readout[c(1, 2, 5)], c("ind_comp", "gr", "dist"))
# Second individual of every pair: columns 3-4 plus the same distance.
tmp2 <- setNames(pairwise_readout[c(3, 4, 5)], c("ind_comp", "gr", "dist"))

# Tag both halves with the index of the pair they came from.
tmp1$id <- seq_len(nrow(tmp1))
tmp2$id <- seq_len(nrow(tmp1))

# Stack the halves, then sort by pair index so the two members of a pair end
# up on adjacent rows; the helper id column is left out of the result.
tmp <- rbind(tmp1, tmp2)
result <- tmp[order(tmp$id), c("ind_comp", "gr", "dist")]
最后清理。
# Remove the intermediate data frames; only `result` is kept.
rm(list = c("tmp", "tmp1", "tmp2"))
另一个想法是把第二个空格替换成另一个分隔符,然后按该分隔符拆分成多行,即
library(dplyr)
library(tidyr)
# Turn every pair into two rows by replacing the space between the two
# "individual group" halves with "|" and splitting on it.
pairwise %>%
  # Capture "<id> <gr>" followed by a space and put it back with "|" in place
  # of that space. The backreference must be written '\\1': a bare '\1' is the
  # octal escape for the character with code 1, which would discard the
  # captured text.
  mutate(ind_comp = gsub('([^ ]+ [^ ]+) ', '\\1|', ind_comp)) %>%
  # One output row per "|"-separated piece; dist is repeated for each piece.
  separate_rows(ind_comp, sep = '[|]')
这给出了,
         ind_comp     dist
1  OP2645ii_d gr3 7.590363
2  OP5048___g gr2 7.590363
3  OP5046___e gr5 6.449676
4  OP5048___g gr2 6.449676
5  OP2413iiia gr1 6.419955
6  OP5048___g gr2 6.419955
7  OP5043___b gr1 6.349918
8  OP5048___g gr2 6.349918
9  OP3088i___a gr1 6.182623
10 OP5048___g gr2 6.182623
11 OP5046___a gr5 6.162655
12 OP5048___g gr2 6.162655
13 OP5048___b gr5 6.154232
14 OP5048___g gr2 6.154232
15 OP5043___a gr3 6.140147
16 OP5048___g gr2 6.140147
17 OP2645ii_d gr3 6.058633
18 OP5048___g gr2 6.058633
19 OP2645ii_d gr3 5.962923
20 OP5044___c gr2 5.962923
21 OP2413iiib gr4 5.943956
22 OP5048___g gr2 5.943956
23 OP5046___c gr1 5.863753
24 OP5048___g gr2 5.863753
我来晚了,但这是我的解决方案:
library("stringr") #For str_split
# Sample data: each row holds one pairwise comparison encoded as
# "<individual 1> <group 1> <individual 2> <group 2>" plus its distance.
pairwise <- data.frame(
  ind_comp = c(
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP5046___e gr5 OP5048___g gr2",
    "OP2413iiia gr1 OP5048___g gr2",
    "OP5043___b gr1 OP5048___g gr2",
    "OP3088i___a gr1 OP5048___g gr2",
    "OP5046___a gr5 OP5048___g gr2",
    "OP5048___b gr5 OP5048___g gr2",
    "OP5043___a gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5044___c gr2",
    "OP2413iiib gr4 OP5048___g gr2",
    "OP5046___c gr1 OP5048___g gr2"
  ),
  dist = c(
    7.590363, 6.449676, 6.419955, 6.349918, 6.182623, 6.162655,
    6.154232, 6.140147, 6.058633, 5.962923, 5.943956, 5.863753
  )
)
# Work on plain character strings (data.frame() produced factors before R 4.0).
pairwise$ind_comp <- as.character(pairwise$ind_comp)

# Split every "id1 grA id2 grB" string at the whitespace that follows the first
# "gr#" token, giving the halves "id1 grA" and "id2 grB". Backslashes are
# doubled because "\s" is not a valid escape in an R string literal.
halves <- str_split(pairwise$ind_comp, "(?<=\\s[a-z]{2}[0-9]{1})\\s")
pairwise$ind_comp2 <- vapply(halves, "[", character(1), 2) # second individual
pairwise$ind_comp <- vapply(halves, "[", character(1), 1) # first individual

# Stack the second individuals (with their distances) below the first ones.
tmp_pairwise <- data.frame(
  ind_comp = pairwise$ind_comp2,
  dist = as.numeric(pairwise$dist),
  stringsAsFactors = FALSE
)
pairwise <- pairwise[, -3] # drop the helper column ind_comp2 (third column)
pairwise <- rbind(pairwise, tmp_pairwise)

# Separate "id grA" into the individual and its group. Splitting on the
# whitespace itself (rather than on a zero-width lookbehind after it) keeps the
# separator out of both pieces, so ind_comp carries no trailing space.
id_and_gr <- str_split(pairwise$ind_comp, "\\s")
pairwise$gr <- vapply(id_and_gr, "[", character(1), 2) # group column
pairwise$ind_comp <- vapply(id_and_gr, "[", character(1), 1) # individual only

# Remove the temporary objects; only `pairwise` is kept.
rm(tmp_pairwise, halves, id_and_gr)
head(pairwise)
ind_comp dist gr
1 OP2645ii_d 7.590363 gr3
2 OP5046___e 6.449676 gr5
3 OP2413iiia 6.419955 gr1
4 OP5043___b 6.349918 gr1
5 OP3088i___a 6.182623 gr1
6 OP5046___a 6.162655 gr5