如何使用函数从数据框的每一行生成多行,然后再把它们 join/merge 回原数据框?
How to generate multiple rows from each single row of a dataframe with a function(s) and then join/merge them back in?
我从一个数据框开始,其中每一行都有一个长字符串,代表 1D 中的 2D 环境(我们称之为景观)。在实际情况下,这些大约有 6 个值高和 80 个值长,因此在 1D 中,字符串的长度为 480 个字符。我在示例中缩短了这些。每行还有一个唯一名称,是每个景观的 shorthand 标识符。
我有一个函数可以获取每一行,将字符串分成 6 条,并对每条进行分析。在这个例子中,函数的核心动作是压缩条带并获取压缩长度。此函数生成一个 6 行数据框,我需要将其与原始数据框组合,结果是最终数据框,原始数据框的每 1 行有 6 行。
library(dplyr)
library(tibble)
# Example input: one row per landscape, keyed by a unique shorthand id.
master_df <- tibble(
  land_id = c("v1-few_bs", "v2-more_bs"),
  land_string = c("aaaaaaaaaabaaaaabaaabababaabab",
                  "aaaaaaaaaababbabbbabbababaabbb")
)
#' Measure a string before and after gzip compression.
#'
#' @param txt Character vector to measure.
#' @return A list with `len_raw` (total number of characters in `txt`) and
#'   `len_gz` (number of bytes returned by `memCompress()`).
compress_it <- function(txt) {
  chars_per_element <- nchar(txt)
  # "g" is the single-letter abbreviation of "gzip" accepted by memCompress().
  compressed <- memCompress(txt, "g")
  list(
    "len_raw" = sum(chars_per_element),
    "len_gz" = length(compressed)
  )
}
#' Split a landscape string into 5-character strips and measure each strip.
#'
#' @param land_id Identifier of the landscape; copied onto every output row.
#' @param land_string The landscape encoded as a single string; it is cut into
#'   consecutive 5-character strips.
#' @return A data frame with one row per strip and the columns
#'   `land_substring`, `land_id`, `substr_id`, `len_raw` and `len_gz`.
get_strip_data <- function(land_id, land_string) {
  # "\\1" is the regex backreference to the captured 5-character group. With a
  # single backslash ("\1") R parses the control character \001 instead, which
  # would overwrite every strip rather than keep it.
  with_spaces <- gsub("(.{5})", "\\1 ", land_string)
  chars_on_lines <- stringr::str_replace_all(with_spaces, pattern = " ", "\n")
  # One strip per text line, so read.table() yields one row per strip (V1).
  prob_matrix <- read.table(text = chars_on_lines, header = FALSE, sep = " ",
                            stringsAsFactors = FALSE)
  prob_matrix <- mutate(prob_matrix,
                        land_id = land_id,
                        substr_id = seq_len(nrow(prob_matrix)))
  prob_matrix <- rename(prob_matrix, land_substring = V1)
  # compress_it() returns a named list per strip; unnest_wider() spreads those
  # names into the len_raw and len_gz columns.
  mutate(prob_matrix,
         new = purrr::map(land_substring, compress_it)) %>%
    tidyr::unnest_wider(c(new))
}
# Quick check of get_strip_data() on the second landscape.
get_strip_data(master_df$land_id[[2]], master_df$land_string[[2]])
从这里开始是伪代码/拼凑代码(pseudocode/klugecode),用来说明我想做的事情。
首先,我制作了一个空数据框。
# Placeholder table with the target columns. Note that the single row of
# "" / NA values means this table has one row, not zero rows.
subchunks_df <-
tribble(~land_id, ~land_string, ~land_substring, ~substr_id, ~len_raw, ~len_gz,
"", "", "", NA, NA, NA)
尝试 for 循环:
# Attempt 1: fill subchunks_df one landscape at a time.
# NOTE(review): each get_strip_data() call returns a multi-row data frame, but
# `subchunks_df[i, ]` addresses a single row, so the shapes do not line up.
for ( i in 1:nrow(master_df) ) {
subchunks_df[i, ] <- get_strip_data(master_df$land_id[[i]], master_df$land_string[[i]])
}
改为尝试映射:
# Attempt 2: call get_strip_data() once per (land_id, land_string) pair.
# NOTE(review): mapply() simplifies its result by default (SIMPLIFY = TRUE), so
# this presumably yields a matrix of list cells rather than one row-bound data
# frame -- confirm by inspecting the result.
subchunks_df <- mapply(get_strip_data,
land_id = master_df$land_id,
land_string = master_df$land_string)
都不行。说得客气一点,我的这些尝试只能算是 "close but no cigar"(差一点就成功了)。
如果我能得到 subchunks_df 正确的形状,我会 right_join:
# Attach the per-strip rows to their parent landscape, keeping every strip.
final_df <- master_df %>%
  right_join(subchunks_df, by = "land_id")
将 master_df 的每一行传入函数后,期望得到的输出如下:
# Desired result, written out by hand: six strip rows for each row of
# master_df, carrying the original columns plus the per-strip measurements.
final_df <-
tribble(~land_id, ~land_string, ~land_substring, ~substr_id, ~len_raw, ~len_gz,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "aaaaa", 1, 5, 11,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "aaaaa", 2, 5, 11,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "baaaa", 3, 5, 11,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "abaaa", 4, 5, 13,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "babab", 5, 5, 13,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "aabab", 6, 5, 13,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "aaaaa", 1, 5, 11,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "aaaaa", 2, 5, 11,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "babba", 3, 5, 13,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "bbbab", 4, 5, 13,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "babab", 5, 5, 13,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "aabbb", 6, 5, 13)
一如既往,我希望能同时看到 dplyr 和 base R 的解法。我也不确定我那个接收条带并计算压缩长度的函数写得好不好,只是我找不到更简单的写法。不过真正的麻烦在于最后这一步。
map 系列函数是 apply 系列的 tidyverse 版本。这里 map_dfr 函数按向量 master_df$land_id 的下标逐个迭代,可以把它看作一个 for 循环。它会返回您在调用 right_join 时所需的数据框。
library(tidyverse)
# Walk the row positions of master_df like a for loop; map_dfr() row-binds the
# data frame returned for each position.
subchunks_df <- seq_along(master_df$land_id) %>%
  map_dfr(function(i) {
    get_strip_data(land_id = master_df$land_id[[i]],
                   land_string = master_df$land_string[[i]])
  })
# Attach the strips to their parent landscape rows.
final_df <- master_df %>%
  right_join(subchunks_df, by = "land_id")
您可以使用 Map,然后用 do.call + rbind 将数据绑定在一起:
# Base R: Map() builds one data frame per landscape, then do.call() hands the
# whole list to a single rbind() call.
subchunks_df <- do.call(
  what = rbind,
  args = Map(f = get_strip_data, master_df$land_id, master_df$land_string)
)
或者,如果您更喜欢 tidyverse,可以使用 map2_df:
# Iterate over ids and strings in parallel and row-bind the per-landscape
# results.
subchunks_df <- purrr::map2_df(
  .x = master_df$land_id,
  .y = master_df$land_string,
  .f = get_strip_data
)
# A tibble: 12 x 5
# land_substring land_id substr_id len_raw len_gz
# <chr> <chr> <int> <int> <int>
# 1 aaaaa v1-few_bs 1 5 11
# 2 aaaaa v1-few_bs 2 5 11
# 3 baaaa v1-few_bs 3 5 11
# 4 abaaa v1-few_bs 4 5 13
# 5 babab v1-few_bs 5 5 13
# 6 aabab v1-few_bs 6 5 13
# 7 aaaaa v2-more_bs 1 5 11
# 8 aaaaa v2-more_bs 2 5 11
# 9 babba v2-more_bs 3 5 13
#10 bbbab v2-more_bs 4 5 13
#11 babab v2-more_bs 5 5 13
#12 aabbb v2-more_bs 6 5 13
然后使用 right_join:
# Attach the per-strip rows to their parent landscape, keeping every strip.
final_df <- dplyr::right_join(
  x = master_df,
  y = subchunks_df,
  by = "land_id"
)
我从一个数据框开始,其中每一行都有一个长字符串,代表 1D 中的 2D 环境(我们称之为景观)。在实际情况下,这些大约有 6 个值高和 80 个值长,因此在 1D 中,字符串的长度为 480 个字符。我在示例中缩短了这些。每行还有一个唯一名称,是每个景观的 shorthand 标识符。
我有一个函数可以获取每一行,将字符串分成 6 条,并对每条进行分析。在这个例子中,函数的核心动作是压缩条带并获取压缩长度。此函数生成一个 6 行数据框,我需要将其与原始数据框组合,结果是最终数据框,原始数据框的每 1 行有 6 行。
library(dplyr)
library(tibble)
# Example input: one row per landscape, keyed by a unique shorthand id.
master_df <- tibble(
  land_id = c("v1-few_bs", "v2-more_bs"),
  land_string = c("aaaaaaaaaabaaaaabaaabababaabab",
                  "aaaaaaaaaababbabbbabbababaabbb")
)
#' Measure a string before and after gzip compression.
#'
#' @param txt Character vector to measure.
#' @return A list with `len_raw` (total number of characters in `txt`) and
#'   `len_gz` (number of bytes returned by `memCompress()`).
compress_it <- function(txt) {
  chars_per_element <- nchar(txt)
  # "g" is the single-letter abbreviation of "gzip" accepted by memCompress().
  compressed <- memCompress(txt, "g")
  list(
    "len_raw" = sum(chars_per_element),
    "len_gz" = length(compressed)
  )
}
#' Split a landscape string into 5-character strips and measure each strip.
#'
#' @param land_id Identifier of the landscape; copied onto every output row.
#' @param land_string The landscape encoded as a single string; it is cut into
#'   consecutive 5-character strips.
#' @return A data frame with one row per strip and the columns
#'   `land_substring`, `land_id`, `substr_id`, `len_raw` and `len_gz`.
get_strip_data <- function(land_id, land_string) {
  # "\\1" is the regex backreference to the captured 5-character group. With a
  # single backslash ("\1") R parses the control character \001 instead, which
  # would overwrite every strip rather than keep it.
  with_spaces <- gsub("(.{5})", "\\1 ", land_string)
  chars_on_lines <- stringr::str_replace_all(with_spaces, pattern = " ", "\n")
  # One strip per text line, so read.table() yields one row per strip (V1).
  prob_matrix <- read.table(text = chars_on_lines, header = FALSE, sep = " ",
                            stringsAsFactors = FALSE)
  prob_matrix <- mutate(prob_matrix,
                        land_id = land_id,
                        substr_id = seq_len(nrow(prob_matrix)))
  prob_matrix <- rename(prob_matrix, land_substring = V1)
  # compress_it() returns a named list per strip; unnest_wider() spreads those
  # names into the len_raw and len_gz columns.
  mutate(prob_matrix,
         new = purrr::map(land_substring, compress_it)) %>%
    tidyr::unnest_wider(c(new))
}
# Quick check of get_strip_data() on the second landscape.
get_strip_data(master_df$land_id[[2]], master_df$land_string[[2]])
从这里开始是伪代码/拼凑代码(pseudocode/klugecode),用来说明我想做的事情。
首先,我制作了一个空数据框。
# Placeholder table with the target columns. Note that the single row of
# "" / NA values means this table has one row, not zero rows.
subchunks_df <-
tribble(~land_id, ~land_string, ~land_substring, ~substr_id, ~len_raw, ~len_gz,
"", "", "", NA, NA, NA)
尝试 for 循环:
# Attempt 1: fill subchunks_df one landscape at a time.
# NOTE(review): each get_strip_data() call returns a multi-row data frame, but
# `subchunks_df[i, ]` addresses a single row, so the shapes do not line up.
for ( i in 1:nrow(master_df) ) {
subchunks_df[i, ] <- get_strip_data(master_df$land_id[[i]], master_df$land_string[[i]])
}
改为尝试映射:
# Attempt 2: call get_strip_data() once per (land_id, land_string) pair.
# NOTE(review): mapply() simplifies its result by default (SIMPLIFY = TRUE), so
# this presumably yields a matrix of list cells rather than one row-bound data
# frame -- confirm by inspecting the result.
subchunks_df <- mapply(get_strip_data,
land_id = master_df$land_id,
land_string = master_df$land_string)
都不行。说得客气一点,我的这些尝试只能算是 "close but no cigar"(差一点就成功了)。
如果我能得到 subchunks_df 正确的形状,我会 right_join:
# Attach the per-strip rows to their parent landscape, keeping every strip.
final_df <- master_df %>%
  right_join(subchunks_df, by = "land_id")
将 master_df 的每一行传入函数后,期望得到的输出如下:
# Desired result, written out by hand: six strip rows for each row of
# master_df, carrying the original columns plus the per-strip measurements.
final_df <-
tribble(~land_id, ~land_string, ~land_substring, ~substr_id, ~len_raw, ~len_gz,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "aaaaa", 1, 5, 11,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "aaaaa", 2, 5, 11,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "baaaa", 3, 5, 11,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "abaaa", 4, 5, 13,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "babab", 5, 5, 13,
"v1-few_bs", "aaaaaaaaaabaaaaabaaabababaabab", "aabab", 6, 5, 13,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "aaaaa", 1, 5, 11,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "aaaaa", 2, 5, 11,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "babba", 3, 5, 13,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "bbbab", 4, 5, 13,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "babab", 5, 5, 13,
"v2-more_bs", "aaaaaaaaaababbabbbabbababaabbb", "aabbb", 6, 5, 13)
一如既往,我希望能同时看到 dplyr 和 base R 的解法。我也不确定我那个接收条带并计算压缩长度的函数写得好不好,只是我找不到更简单的写法。不过真正的麻烦在于最后这一步。
map 系列函数是 apply 系列的 tidyverse 版本。这里 map_dfr 函数按向量 master_df$land_id 的下标逐个迭代,可以把它看作一个 for 循环。它会返回您在调用 right_join 时所需的数据框。
library(tidyverse)
# Walk the row positions of master_df like a for loop; map_dfr() row-binds the
# data frame returned for each position.
subchunks_df <- seq_along(master_df$land_id) %>%
  map_dfr(function(i) {
    get_strip_data(land_id = master_df$land_id[[i]],
                   land_string = master_df$land_string[[i]])
  })
# Attach the strips to their parent landscape rows.
final_df <- master_df %>%
  right_join(subchunks_df, by = "land_id")
您可以使用 Map,然后用 do.call + rbind 将数据绑定在一起:
# Base R: Map() builds one data frame per landscape, then do.call() hands the
# whole list to a single rbind() call.
subchunks_df <- do.call(
  what = rbind,
  args = Map(f = get_strip_data, master_df$land_id, master_df$land_string)
)
或者,如果您更喜欢 tidyverse,可以使用 map2_df:
# Iterate over ids and strings in parallel and row-bind the per-landscape
# results.
subchunks_df <- purrr::map2_df(
  .x = master_df$land_id,
  .y = master_df$land_string,
  .f = get_strip_data
)
# A tibble: 12 x 5
# land_substring land_id substr_id len_raw len_gz
# <chr> <chr> <int> <int> <int>
# 1 aaaaa v1-few_bs 1 5 11
# 2 aaaaa v1-few_bs 2 5 11
# 3 baaaa v1-few_bs 3 5 11
# 4 abaaa v1-few_bs 4 5 13
# 5 babab v1-few_bs 5 5 13
# 6 aabab v1-few_bs 6 5 13
# 7 aaaaa v2-more_bs 1 5 11
# 8 aaaaa v2-more_bs 2 5 11
# 9 babba v2-more_bs 3 5 13
#10 bbbab v2-more_bs 4 5 13
#11 babab v2-more_bs 5 5 13
#12 aabbb v2-more_bs 6 5 13
然后使用 right_join:
# Attach the per-strip rows to their parent landscape, keeping every strip.
final_df <- dplyr::right_join(
  x = master_df,
  y = subchunks_df,
  by = "land_id"
)