在 R 中根据数据框的字符串元素创建坐标向量
Create a coordinate vector based on the string of elements of a data frame R
我正在尝试创建一个坐标向量,以便稍后使用 ggplot 绘图。
假设我有一个如下所示的数据框:
# Sample input: sixteen 2-letter keys (note that "aa" and "gg" each occur
# twice) paired with one numeric value per key.
keys <- c(
  "aa", "aa", "ac", "ag", "gg", "at", "ca", "gc",
  "cc", "cg", "gt", "gg", "tt", "ta", "ga", "tg"
)
values <- c(
  9.318796e-05, 1.863759e-04, 5.591278e-04, 1.863759e-04,
  2.795639e-04, 9.318796e-05, 9.318796e-05, 1.863759e-04,
  1.863759e-04, 2.795639e-04, 2.795639e-04, 1.863759e-04,
  2.795639e-04, 9.318796e-05, 9.318796e-05, 5.591278e-04
)
# Column names are spelled out; they match the names data.frame() would
# derive from the variable names.
df <- data.frame(keys = keys, values = values)
现在我想创建一个矩阵,让每个字母都有自己的位置(象限),具体来说:
A(-1,1) [左上],
T(1,-1)[右下],
G(1,1)[右上]和
C(-1,-1)[左下]
为此我做了:
# Asker's work-in-progress attempt at computing plot coordinates for each key.
# The asker states below that it is incomplete and partly incorrect; the
# NOTE(review) comments flag the concrete problems without changing the code.
# NOTE(review): `k` is never assigned in this snippet; the trailing comment
# says it is meant to be 2.
array_size = sqrt(4^k) #Where k = 2
graph_coord = c()
# NOTE(review): range(x) returns c(min(x), max(x)), not a 1..n sequence, so
# for a scalar array_size this loop runs exactly twice. seq_len(array_size)
# was presumably intended.
for(i in range(array_size)){
graph_coord = append(graph_coord, array_size[1])
} ##Give the graph_coord its size
maxx = array_size
maxy = array_size
posx = 1
posy = 1
for(i in df$keys){
##This part is for getting each individual letter of each element of keys.
# NOTE(review): i[[1]] is the whole key string (e.g. "aa"), not its letters,
# so this inner loop runs once per key. strsplit(i, "")[[1]] would yield the
# individual characters.
for(j in i[[1]]){
##If the individual letter is a T then the actual position on x should be maxx/2
# NOTE(review): the tests below compare `i` (the whole key) instead of `j`,
# and use upper-case letters while the keys in df are lower-case, so none of
# the branches can match.
if (i == "T"){
posx = maxx/2
}else if(i == "C"){
posy = maxy/2
}else if(i == "G"){
posx =maxx/2
posy =maxy/2
}
###Up until this point I think that the code is doing well,
###I can grab individual letters of each element of key and
###see which one they are and then decide to move them according
###to the initial coordinate maxx and maxy. The next part escapes me:
maxx = maxx/2
# NOTE(review): R has no compound assignment operators; `maxy /=2` is a
# syntax error. The R equivalent is `maxy = maxy/2`.
maxy /=2 ##This /= is customary to python what would be the R equivalent?
##Append the graph coordinates with the df$values.
# NOTE(review): append() accepts (x, values, after), so this four-argument
# call fails; `prob` is also not defined anywhere in this snippet.
graph_coord = append(graph_coord, posy-1, posx-1, prob) ##This part is especially hard for me to grasp and as such I have left the idea, but the code snippet is absolutely incorrect.
}
# NOTE(review): the outer `for(i in df$keys){` loop is never closed here.
此代码仍在开发中。我正在尝试重新创建这里所做的事情:Frequency table extracted from Chaos Game Representation
这里有一个 tidyverse 方法:
library(tidyverse)
# Corner coordinates for each base, following the layout given in the question:
#   a = (-1,  1) top-left      g = (1,  1) top-right
#   c = (-1, -1) bottom-left   t = (1, -1) bottom-right
# The x values for "t" and "c" were previously swapped relative to that layout.
pos <- data.frame(
  key = c("a", "g", "t", "c"),
  x = c(-1, 1, 1, -1),
  y = c(1, 1, -1, -1)
)
# k = 2 case: split every key into its two letters, look up the corner of
# each letter in `pos`, and combine them into one tile position per key.
df %>%
  separate(keys, into = c("M1", "M2"), sep = 1, remove = FALSE) %>%
  # First lookup contributes plain `x` / `y` columns (corner of letter 1).
  left_join(pos, by = c(M1 = "key")) %>%
  # Second lookup collides on x / y, so the columns from the first join get
  # the ".x" suffix and the ones from this join get ".y".
  left_join(pos, by = c(M2 = "key"), suffix = c(".x", ".y")) %>%
  # The first letter moves by a full unit, the second by half a unit.
  mutate(
    x = x.x + 0.5 * x.y,
    y = y.x + 0.5 * y.y
  ) %>%
  ggplot(mapping = aes(x = x, y = y, fill = values, label = keys)) +
  geom_tile(alpha = 0.2) +
  geom_text()
您的示例数据中有两个 aa 和两个 gg,它们会落在相同的坐标上并相互重叠,因此输出如下:
编辑:下面是适用于任意 k(粒度)的通用方法。这里有两个示例数据框,分别对应 k = 3 和 k = 4。
# Alphabet used to build every possible key.
key <- c("a", "g", "t", "c")

# Example for k = 3: all 4^3 three-letter keys with dummy sine values.
df2 <- expand_grid(key1 = key, key2 = key, key3 = key) %>%
  mutate(
    keys = str_c(key1, key2, key3),
    values = sin(seq(0, 60, length.out = n())),
    .keep = "none"
  )

# Example for k = 4: all 4^4 four-letter keys. This overwrites the k = 3
# data above, so run only the variant you want to plot.
df2 <- expand_grid(key1 = key, key2 = key, key3 = key, key4 = key) %>%
  mutate(
    keys = str_c(key1, key2, key3, key4),
    values = sin(seq(0, 60, length.out = n())),
    .keep = "none"
  )
然后我们可以将它们插入到以下代码中:
# Number of letters per key (k). Assumes every key in df2 has the same
# length; the longest key determines how many letter columns are created.
array_size <- max(str_length(df2$keys))

df2 %>%
  # reshape: one column per letter position, then one row per (key, position)
  separate(
    keys,
    into = paste0("col", seq_len(array_size)),
    # seq_len() yields no split points when k = 1, whereas
    # 1:(array_size - 1) would produce c(1, 0).
    sep = seq_len(array_size - 1),
    remove = FALSE
  ) %>%
  # id keeps duplicated keys apart when regrouping below
  mutate(id = row_number()) %>%
  pivot_longer(-c(keys, id, values)) %>%
  # calculate coordinates: the letter at position i contributes corner / 2^i
  left_join(pos, by = c("value" = "key")) %>%
  mutate(
    divisor = 2^parse_number(name),
    across(c(x, y), ~ . / divisor)
  ) %>%
  group_by(keys, id, values) %>%
  # .groups = "drop" returns ungrouped data and silences the regrouping message
  summarize(x = sum(x), y = sum(y), .groups = "drop") %>%
  # plotting
  ggplot(aes(x, y, fill = values, label = keys)) +
  geom_tile(alpha = 0.5) +
  geom_text(angle = 20)
产生以下输出:
我正在尝试创建一个坐标向量,以便稍后使用 ggplot 绘图。
假设我有一个如下所示的数据框:
# Sample input: sixteen 2-letter keys (note that "aa" and "gg" each occur
# twice) paired with one numeric value per key.
keys <- c(
  "aa", "aa", "ac", "ag", "gg", "at", "ca", "gc",
  "cc", "cg", "gt", "gg", "tt", "ta", "ga", "tg"
)
values <- c(
  9.318796e-05, 1.863759e-04, 5.591278e-04, 1.863759e-04,
  2.795639e-04, 9.318796e-05, 9.318796e-05, 1.863759e-04,
  1.863759e-04, 2.795639e-04, 2.795639e-04, 1.863759e-04,
  2.795639e-04, 9.318796e-05, 9.318796e-05, 5.591278e-04
)
# Column names are spelled out; they match the names data.frame() would
# derive from the variable names.
df <- data.frame(keys = keys, values = values)
现在我想创建一个矩阵,让每个字母都有自己的位置(象限),具体来说:
A(-1,1) [左上],
T(1,-1)[右下],
G(1,1)[右上]和
C(-1,-1)[左下]
为此我做了:
# Asker's work-in-progress attempt at computing plot coordinates for each key.
# The asker states below that it is incomplete and partly incorrect; the
# NOTE(review) comments flag the concrete problems without changing the code.
# NOTE(review): `k` is never assigned in this snippet; the trailing comment
# says it is meant to be 2.
array_size = sqrt(4^k) #Where k = 2
graph_coord = c()
# NOTE(review): range(x) returns c(min(x), max(x)), not a 1..n sequence, so
# for a scalar array_size this loop runs exactly twice. seq_len(array_size)
# was presumably intended.
for(i in range(array_size)){
graph_coord = append(graph_coord, array_size[1])
} ##Give the graph_coord its size
maxx = array_size
maxy = array_size
posx = 1
posy = 1
for(i in df$keys){
##This part is for getting each individual letter of each element of keys.
# NOTE(review): i[[1]] is the whole key string (e.g. "aa"), not its letters,
# so this inner loop runs once per key. strsplit(i, "")[[1]] would yield the
# individual characters.
for(j in i[[1]]){
##If the individual letter is a T then the actual position on x should be maxx/2
# NOTE(review): the tests below compare `i` (the whole key) instead of `j`,
# and use upper-case letters while the keys in df are lower-case, so none of
# the branches can match.
if (i == "T"){
posx = maxx/2
}else if(i == "C"){
posy = maxy/2
}else if(i == "G"){
posx =maxx/2
posy =maxy/2
}
###Up until this point I think that the code is doing well,
###I can grab individual letters of each element of key and
###see which one they are and then decide to move them according
###to the initial coordinate maxx and maxy. The next part escapes me:
maxx = maxx/2
# NOTE(review): R has no compound assignment operators; `maxy /=2` is a
# syntax error. The R equivalent is `maxy = maxy/2`.
maxy /=2 ##This /= is customary to python what would be the R equivalent?
##Append the graph coordinates with the df$values.
# NOTE(review): append() accepts (x, values, after), so this four-argument
# call fails; `prob` is also not defined anywhere in this snippet.
graph_coord = append(graph_coord, posy-1, posx-1, prob) ##This part is especially hard for me to grasp and as such I have left the idea, but the code snippet is absolutely incorrect.
}
# NOTE(review): the outer `for(i in df$keys){` loop is never closed here.
此代码仍在开发中。我正在尝试重新创建这里所做的事情:Frequency table extracted from Chaos Game Representation
这里有一个 tidyverse 方法:
library(tidyverse)
# Corner coordinates for each base, following the layout given in the question:
#   a = (-1,  1) top-left      g = (1,  1) top-right
#   c = (-1, -1) bottom-left   t = (1, -1) bottom-right
# The x values for "t" and "c" were previously swapped relative to that layout.
pos <- data.frame(
  key = c("a", "g", "t", "c"),
  x = c(-1, 1, 1, -1),
  y = c(1, 1, -1, -1)
)
# k = 2 case: split every key into its two letters, look up the corner of
# each letter in `pos`, and combine them into one tile position per key.
df %>%
  separate(keys, into = c("M1", "M2"), sep = 1, remove = FALSE) %>%
  # First lookup contributes plain `x` / `y` columns (corner of letter 1).
  left_join(pos, by = c(M1 = "key")) %>%
  # Second lookup collides on x / y, so the columns from the first join get
  # the ".x" suffix and the ones from this join get ".y".
  left_join(pos, by = c(M2 = "key"), suffix = c(".x", ".y")) %>%
  # The first letter moves by a full unit, the second by half a unit.
  mutate(
    x = x.x + 0.5 * x.y,
    y = y.x + 0.5 * y.y
  ) %>%
  ggplot(mapping = aes(x = x, y = y, fill = values, label = keys)) +
  geom_tile(alpha = 0.2) +
  geom_text()
您的示例数据中有两个 aa 和两个 gg,它们会落在相同的坐标上并相互重叠,因此输出如下:
编辑:下面是适用于任意 k(粒度)的通用方法。这里有两个示例数据框,分别对应 k = 3 和 k = 4。
# Alphabet used to build every possible key.
key <- c("a", "g", "t", "c")

# Example for k = 3: all 4^3 three-letter keys with dummy sine values.
df2 <- expand_grid(key1 = key, key2 = key, key3 = key) %>%
  mutate(
    keys = str_c(key1, key2, key3),
    values = sin(seq(0, 60, length.out = n())),
    .keep = "none"
  )

# Example for k = 4: all 4^4 four-letter keys. This overwrites the k = 3
# data above, so run only the variant you want to plot.
df2 <- expand_grid(key1 = key, key2 = key, key3 = key, key4 = key) %>%
  mutate(
    keys = str_c(key1, key2, key3, key4),
    values = sin(seq(0, 60, length.out = n())),
    .keep = "none"
  )
然后我们可以将它们插入到以下代码中:
# Number of letters per key (k). Assumes every key in df2 has the same
# length; the longest key determines how many letter columns are created.
array_size <- max(str_length(df2$keys))

df2 %>%
  # reshape: one column per letter position, then one row per (key, position)
  separate(
    keys,
    into = paste0("col", seq_len(array_size)),
    # seq_len() yields no split points when k = 1, whereas
    # 1:(array_size - 1) would produce c(1, 0).
    sep = seq_len(array_size - 1),
    remove = FALSE
  ) %>%
  # id keeps duplicated keys apart when regrouping below
  mutate(id = row_number()) %>%
  pivot_longer(-c(keys, id, values)) %>%
  # calculate coordinates: the letter at position i contributes corner / 2^i
  left_join(pos, by = c("value" = "key")) %>%
  mutate(
    divisor = 2^parse_number(name),
    across(c(x, y), ~ . / divisor)
  ) %>%
  group_by(keys, id, values) %>%
  # .groups = "drop" returns ungrouped data and silences the regrouping message
  summarize(x = sum(x), y = sum(y), .groups = "drop") %>%
  # plotting
  ggplot(aes(x, y, fill = values, label = keys)) +
  geom_tile(alpha = 0.5) +
  geom_text(angle = 20)
产生以下输出: