从 dplyr 函数中保存切除的数据:如何防止 conflict/overwrite
Saving excised data from dplyr functions: how to prevent conflict/overwrite
编辑:根据要求提供更完整的解释:
如果一开始不清楚我想做什么,下面的内容会比较难理解。在使用管道进行链式操作时,很多时候只想把操作集中在数据的一部分上(某些行、汇总表、列名等)。完成后,通常又希望把这些改动合并回原始数据。但是,如果不打断管道链,这是做不到的。下面这些函数可以让你做到这一点。不过,原始数据的副本保存在一个名为 bank 的变量中。此外,为了正确地把行重新合并回去,save_rows 和 load_rows 使用了一个名为 index 的新列。bank 和 index 都可能在链中被意外修改,从而导致问题。另外,如果已经存在名为 bank 和 index 的变量怎么办?有办法避免这些问题吗?
许多 dplyr 操作会丢弃部分数据。下面是一个可以保存并恢复这些数据的系统。参数 save 是一个字符串,用于标识一块被切除的数据。这些数据以列表的形式存储在 bank 中。请注意,这个想法受到 skranz 的 mutate_if 的启发。
我的问题是:如何更好地处理 bank 和 index,使这些变量既不会被覆盖,也不会干扰已有的变量?
library(dplyr)
library(magrittr)
# Global store for excised data: a list keyed by the `save` label that the
# save_*/load_* helpers write to (via `<<-`) and read from.
bank <- NULL

# Wrap a row-dropping verb so that the rows it removes are kept in `bank`.
#
# FUN: a verb called as FUN(data, ...) that returns a subset of the rows of
#   `data` (e.g. filter, slice).
# Returns a function (data, save, ...) that tags each row with a temporary
# `index` column, applies FUN, stores the rows FUN dropped in bank[[save]],
# and returns the rows FUN kept (still carrying `index`).
save_rows <- function(FUN) {
  function(data, save, ...) {
    # seq_len() rather than 1:nrow(): for zero-row input 1:0 is c(1, 0),
    # which cannot be assigned as a column.
    data$index <- seq_len(nrow(data))
    results <- data %>% FUN(...)
    # The excised rows are those whose index did not survive FUN. The join
    # key is spelled out so it never depends on which other columns happen
    # to be shared.
    bank[[save]] <<- data %>%
      anti_join(results %>% select(index), by = "index")
    results
  }
}
# Put rows stashed by a save_rows() wrapper back into `data`.
#
# data: data still carrying the temporary `index` column.
# save: label under which the excised rows were stored in `bank`.
# Returns `data` and bank[[save]] stacked, sorted by `index`, with the
# `index` column dropped.
load_rows <- function(data, save) {
  combined <- bind_rows(data, bank[[save]])
  ordered <- arrange(combined, index)
  select(ordered, -index)
}
# Wrap a verb so that the complete input is stashed before the verb runs.
#
# FUN: a verb called as FUN(data, ...).
# Returns a function (data, save, ...) that stores `data` unchanged in
# bank[[save]] and returns FUN(data, ...).
save_all <- function(FUN) {
  function(data, save, ...) {
    bank[[save]] <<- data
    FUN(data, ...)
  }
}
# Merge the data stashed under `save` with the current pipeline value.
#
# Returns the full join of bank[[save]] (left) with `data` (right); no `by`
# is given, so the join uses the columns the two tables share.
load_all <- function(data, save) {
  stashed <- bank[[save]]
  full_join(stashed, data)
}
# Stash `data` under `save` and continue the pipeline with its column names.
#
# Returns the column names of `data`.
change_names <- function(data, save) {
  bank[[save]] <<- data
  colnames(data)
}
# Apply a vector of column names to the data stashed under `save`.
#
# colnames: the new column names.
# Returns bank[[save]] with its column names replaced by `colnames`.
attach_names_to <- function(colnames, save) {
  stashed <- bank[[save]]
  set_colnames(stashed, colnames)
}
# Example: each save_*/change_names step stashes data in `bank` under the
# given label, and the matching load_*/attach_names_to step merges it back.
library(stringr)
a = c(1, 1, 2, 3)
b = c("my_momma", "my_momma", "takes_care", "of_me")
c = c("you", "you", "and", "me")
data_frame(a, b, c) %>%
# Keep the rows with a == 1; the other rows are stored in bank[["filter"]].
save_rows(filter)("filter", a == 1) %>%
mutate(c = paste("baby", c, sep = "_")) %>%
# Re-append the stored rows, sort by the temporary index, then drop it.
load_rows("filter") %>%
# Same pattern with slice(1) as the row-dropping verb.
save_rows(slice)("slice", 1) %>%
mutate(b = "my_papa") %>%
load_rows("slice") %>%
group_by(a) %>%
# Store the full grouped data and continue with the per-group summary ...
save_all(summarize)("summarize", n = n()) %>%
# ... then full_join the stored data with that summary.
load_all("summarize") %>%
save_all(distinct)("distinct", a) %>%
mutate(B = b %>% str_replace_all("_", " ")) %>%
select(a, B) %>%
load_all("distinct") %>%
# Store the data and continue with just its column names.
change_names("data") %>%
paste("2", sep = "") %>%
# Apply the edited names to the stored data.
attach_names_to("data")
我看不出你的方法比 dplyrExtras 所采用的方法有什么优势。你的大部分代码都可以用 dplyr 或 dplyrExtras 重写。请参阅下文,了解如何在不定义额外函数的情况下实现你的示例。
此外,使用 <<- 通常不是一个好主意。如果你真的想进一步开发你的方法,也许可以使用类似于 options() 的做法。相关示例请参见 ?igraph:::igraph.options。
另外,你的重命名方式不会保留分组。在你的例子中,最终的分组依据是 a,但数据中已经没有 a。
最后,这里有一个替代示例:
# to get dplyrExtras
library(devtools)
install_github(repo="skranz/dplyrExtras")
require(dplyrExtras)
# The code that does the same as your example (except the final grouping).
# Relies on the vectors a, b and c from the question's example.
data_frame(a, b, c) %>%
# mutate_if() comes from dplyrExtras (not shown here); presumably it applies
# the assignment only to the rows selected by its first argument.
mutate_if(a == 1, c = paste("baby", c, sep = "_")) %>%
mutate_if(1, b = "my_papa") %>%
group_by(a) %>%
mutate(n = n()) %>%
mutate(B = b %>% str_replace_all("_", " ")) %>%
ungroup() %>%
# Append "2" to every column name, then group by the renamed column a2.
do(set_colnames(., paste0(colnames(.), "2"))) %>%
group_by(a2)
您的策略不必要地复杂,在所有情况下都存在更简单的方法。而不是
# Question's approach: set aside the rows where a == 1 is not TRUE, edit the
# remaining rows, then merge the stored rows back.
data_frame(a, b, c) %>%
save_rows(filter)("filter", a == 1) %>%
mutate(c = paste("baby", c, sep = "_")) %>%
load_rows("filter")
使用类似的东西:
# Conditional mutate: only rows with a == 1 get the "baby_" prefix; all other
# rows keep their current value of c.
d <- data_frame(a, b, c)
d %>%
mutate(c = ifelse(a == 1, paste("baby", c, sep = "_"), c))
而不是
# Question's approach: keep only row 1, edit it, then merge the other rows
# back.
d %>%
save_rows(slice)("slice", 1) %>%
mutate(b = "my_papa") %>%
load_rows("slice")
使用类似这样的写法:
# Overwrite b in the first row only. row_number() is used instead of 1:n()
# because 1:n() evaluates to c(1, 0) when the data has no rows.
d %>%
  mutate(b = ifelse(row_number() == 1, "my_papa", b))
而不是
# Question's approach: stash the grouped data, summarize, then full_join the
# stashed data with the summary.
d %>%
group_by(a) %>%
save_all(summarize)("summarize", n = n()) %>%
load_all("summarize")
使用
# A grouped mutate adds the per-group row count n as a column directly.
d %>%
group_by(a) %>%
mutate(n = n())
我仍然认为 mutate_if 所采用的方法是更好的做法。不过,对于某些示例,先汇总再重新合并确实可能更快。我倾向于用一个类似于 mutate_if 的函数来处理这种情况。下面给出我实现的 mutate_group 函数以及一些基准测试。
require(dplyr)
# mutate_group: evaluate mutate-style expressions on the distinct values of
# the named columns and join the results back onto every row.
#
# df:  a data frame (possibly grouped).
# ...: named expressions such as x = gsub('a', 't', .group.x). For each name
#      `nm`, a helper column `.group.nm` (a copy of column `nm`) is available
#      to the expression.
# Returns the joined result with the helper columns removed and the incoming
# grouping of `df` re-applied.
mutate_group <- function(df, ...){
# Capture the expressions unevaluated.
.dots = lazyeval::lazy_dots(...)
# Remember the incoming grouping so it can be restored at the end.
groups <- groups(df)
# Add one `.group.<nm>` column per named expression, copied from column <nm>.
mdf <- df %>%
mutate_(.dots = setNames(names(.dots), paste0(".group.", names(.dots))))
mdf %>%
# Group by the helper columns and summarize with no expressions, which
# presumably leaves one row per distinct combination — TODO confirm.
group_by_(.dots = paste0(".group.", names(.dots))) %>%
summarize_() %>%
ungroup %>%
# Evaluate the user's expressions on that reduced table.
mutate_(.dots=.dots) %>%
# Bring the results back to every original row via the helper columns.
right_join(mdf, by=paste0(".group.", names(.dots))) %>%
# Drop the helper columns and the `.y` copies (the old values from mdf) ...
select_(.dots = c(paste0("-.group.", names(.dots)), paste0("-", names(.dots), ".y"))) %>%
# ... and give the `.x` copies (the new values) their plain names back.
rename_(.dots = setNames(paste0(names(.dots), ".x"), names(.dots))) %>%
group_by_(.dots=groups)
}
现在进行基准测试:
require(microbenchmark)
# Test 1: one column rewritten, ungrouped data, one million rows with four
# distinct values in x.
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = runif(1e6))
#
# r1 uses plain mutate(); r2 uses mutate_group() on the same expression.
microbenchmark(
r1 <- df %>% mutate(x = gsub('a', 't', x))
,
r2 <- df %>% mutate_group(x = gsub('a', 't', .group.x))
)
#
# Recorded output:
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% mutate(x = gsub("a", "t", x)) 324.9036 328.7171 337.6389 330.2874 345.2245 415.6200 100 b
## r2 <- df %>% mutate_group(x = gsub("a", "t", .group.x)) 117.0220 120.1766 128.9403 121.8053 135.4410 208.5801 100 a
#
# Compare the two results, ignoring attributes.
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
#
# Test 2: two columns rewritten in one call, ungrouped data.
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = sample(letters[1:4], 1e6, replace=TRUE),
z = runif(1e6))
# r1 uses plain mutate(); r2 uses mutate_group() on the same expressions.
microbenchmark(
r1 <- df %>% mutate(x = gsub('a', 't', x),
y = gsub('b', 's', y))
,
r2 <- df %>% mutate_group(x = gsub('a', 't', .group.x),
y = gsub('b', 's', .group.y))
)
# Recorded output:
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% mutate(x = gsub("a", "t", x), y = gsub("b", "s", y)) 665.9306 674.2292 691.7966 682.0874 695.8887 776.9765 100 b
## r2 <- df %>% mutate_group(x = gsub("a", "t", .group.x), y = gsub("b", "s", .group.y)) 150.9971 156.5910 177.6797 171.7907 177.9938 279.1329 100 a
#
# Compare the two results, ignoring attributes.
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
#
# Test 3: one column rewritten on data that is already grouped by y.
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = sample(letters[1:4], 1e6, replace=TRUE),
z = runif(1e6))
#
# r1 uses a grouped mutate(); r2 uses mutate_group() on the grouped data.
microbenchmark(
r1 <- df %>% group_by(y) %>% mutate(x = gsub('a', 't', x))
,
r2 <- df %>% group_by(y) %>% mutate_group(x = gsub('a', 't', .group.x))
)
#
# Recorded output:
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% group_by(y) %>% mutate(x = gsub("a", "t", x)) 399.8765 405.0650 415.0338 408.3451 423.2523 494.0247 100 b
## r2 <- df %>% group_by(y) %>% mutate_group(x = gsub("a", "t", .group.x)) 224.0281 231.9769 247.7521 244.8441 248.5926 319.9048 100 a
#
# Compare the two results, ignoring attributes.
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
好的,这是一个新版本。语法更精简了,但仍然没有解决意外干扰的问题。
library(stringr)
library(stringi)
library(dplyr)
library(dplyrExtras)
library(pipeR)
# Stack of stashed data frames; it starts with a single placeholder entry.
bank <- list("dummy" = NULL)

# Position of the entry currently on top of the stack.
oldBank <- function() {
  length(bank)
}

# Position the next stashed entry will take.
newBank <- function() {
  length(bank) + 1
}

# Name of the index column that belongs to the top entry.
oldIndex <- function() {
  paste0("index", oldBank())
}

# Name of the index column the next entry will receive.
newIndex <- function() {
  paste0("index", newBank())
}

# Keep only the index column of the top entry.
selectOldIndex <- function(data) {
  select_(data, oldIndex())
}

# Drop the index column of the top entry.
removeOldIndex <- function(data) {
  select_(data, paste0("- ", oldIndex()))
}
# Stash a snapshot of `data` on top of `bank` before the pipeline narrows it.
#
# A fresh index column (named by newIndex()) numbering the rows is added
# first, so the snapshot and the returned data share the same row ids.
# Returns `data` with that index column attached.
focus <- function(data) {
  # seq_len() rather than 1:nrow(): for zero-row input 1:0 is c(1, 0), which
  # cannot be assigned as a column.
  data[[newIndex()]] <- seq_len(nrow(data))
  bank[[newBank()]] <<- data
  data
}
# Pop the top snapshot off `bank` and join the focused result onto it.
#
# zoom: the pipeline value produced since the matching focus().
# Returns the snapshot (minus its index column) left-joined with `zoom`.
restore <- function(zoom) {
  top <- oldBank()
  snapshot <- removeOldIndex(bank[[top]])
  restoreData <- left_join(snapshot, zoom)
  # Remove the snapshot so the previous focus() becomes the top entry.
  bank[[top]] <<- NULL
  restoreData
}
# Pop the top snapshot off `bank` and re-append the rows missing from `zoom`.
#
# zoom: the pipeline value produced since the matching focus(); it must still
#   carry the index column that focus() added.
# Returns the snapshot rows whose index is absent from `zoom`, followed by
# the rows of `zoom`, with the index column dropped.
restore_rows <- function(zoom) {
  top <- oldBank()
  index_col <- oldIndex()
  restoreData <- bank[[top]] %>>%
    # Match on the index column only. If `zoom` is grouped, its grouping
    # columns may also come through selectOldIndex(), and an implicit join
    # on them could wrongly treat edited rows as missing.
    anti_join(zoom %>>% selectOldIndex(), by = index_col) %>>%
    bind_rows(zoom) %>>%
    removeOldIndex()
  # Remove the snapshot so the previous focus() becomes the top entry.
  bank[[top]] <<- NULL
  restoreData
}
# Example: expand abbreviations for the "character" values only, then merge
# the result back into the full data with focus()/restore_rows()/restore().
options(stringsAsFactors = FALSE)
characters <- c("1st", "2nd", "3rd", "other_value") %>>%
  rep(10) %>>%
  (data.frame(value = ., type = "character"))
numerics <- c("1", "2", "3", ".") %>>%
  rep(10) %>>%
  (data.frame(value = ., type = "numeric"))
data <- bind_rows(characters, numerics)
# Lookup table from abbreviation to full label.
abbrevs <- data_frame(
  value = c("1st", "2nd", "3rd"),
  full = c("first_value", "second_value", "third_value"))
results <-
  data %>>%
  # Snapshot the full data before collapsing it to one row per value.
  focus %>>%
  group_by(value) %>>%
  # n() counts the rows of each group; nrow(.) would give the size of the
  # whole piped table for every group.
  summarize(n = n(), type = first(type)) %>>%
  # Snapshot the summary before narrowing it to the "character" rows.
  focus %>>%
  filter(type == "character") %>>%
  left_join(abbrevs) %>>%
  mutate_if(is.na(full), full = value) %>>%
  mutate(full = full %>>%
    str_replace_all("_", " ") %>>%
    stri_trans_totitle()) %>>%
  # Re-append the summary rows that the filter removed.
  restore_rows %>>%
  # Join the summary back onto the original rows.
  restore %>>%
  mutate_if(!is.na(full), value = full) %>>%
  select(-full)
好的,这是版本 3。通过允许用户为 bank 和 index 指定自己的名称,我解决了干扰问题。
library(stringr)
library(stringi)
library(dplyr)
library(dplyrExtras)
library(pipeR)
library(lazyeval)
library(gtools)
# Macro that defines a stash plus the focus()/restore()/restore_rows() helpers
# under caller-chosen names.
#
# Built with gtools::defmacro, which substitutes the arguments into `expr`
# and evaluates the result where the macro is called.
#
# bank:  unquoted name of the list variable to use as the stash.
# index: string prefix for the temporary index columns; the stash position
#        is appended to it.
construct_bank_index <- defmacro(bank, index, expr = {
  # Stack of stashed data frames; it starts with a single placeholder entry.
  bank <- list("NULL" = NULL)
  # Position of the top entry / of the entry that would be pushed next.
  old_bank <- function() length(bank)
  new_bank <- function() length(bank) + 1
  # Index-column name of the top entry / of the next entry.
  old_index <- function() paste0(index, old_bank())
  newIndex <- function() paste0(index, new_bank())
  select_old_index <- function(data) {
    select_(data, old_index())
  }
  remove_old_index <- function(data) {
    select_(data, paste0("- ", old_index()))
  }
  # Number the rows in a fresh index column, push a snapshot, return the data.
  focus <- function(data) {
    # seq_len() rather than 1:nrow(): 1:0 is c(1, 0) for zero-row input.
    data[[newIndex()]] <- seq_len(nrow(data))
    bank[[new_bank()]] <<- data
    data
  }
  # Pop the top snapshot and left-join the focused result onto it.
  restore <- function(zoom) {
    restoreData <-
      bank[[old_bank()]] %>>%
      remove_old_index %>>%
      left_join(zoom)
    bank[[old_bank()]] <<- NULL
    restoreData
  }
  # Pop the top snapshot and re-append the rows that are missing from `zoom`.
  restore_rows <- function(zoom) {
    restoreData <-
      bank[[old_bank()]] %>>%
      # Match on the index column only, so grouping columns that may come
      # along with `zoom` cannot affect which rows count as missing.
      anti_join(zoom %>>% select_old_index(), by = old_index()) %>>%
      bind_rows(zoom) %>>%
      remove_old_index()
    bank[[old_bank()]] <<- NULL
    restoreData
  }
})
# Example: same workflow as version 2, with the helpers generated by the macro.
# Define `bank` plus focus()/restore()/restore_rows(), using "index" as the
# prefix for the temporary index columns.
construct_bank_index(bank, "index")
options(stringsAsFactors = FALSE)
characters = c("1st", "2nd", "3rd", "other_value") %>>%
rep(10) %>>%
(data.frame(value = ., type = "character"))
numerics = c("1", "2", "3", ".") %>>%
rep(10) %>>%
(data.frame(value = ., type = "numeric"))
data = bind_rows(characters, numerics)
# Lookup table from abbreviation to full label.
abbrevs = data_frame(
value = c("1st", "2nd", "3rd"),
full = c("first_value", "second_value", "third_value"))
results =
data %>>%
# Snapshot the full data before collapsing it to one row per value.
focus %>>%
group_by(value) %>>%
summarize(n = n(), type = first(type)) %>>%
# Snapshot the summary before narrowing it to the "character" rows.
focus %>>%
filter(type == "character") %>>%
left_join(abbrevs) %>>%
mutate_if(is.na(full), full = value) %>>%
mutate(full = full %>>%
str_replace_all("_", " ") %>>%
stri_trans_totitle()) %>>%
# Re-append the summary rows that the filter removed.
restore_rows %>>%
# Join the summary back onto the original rows.
restore %>>%
mutate_if(!is.na(full), value = full) %>>%
select(-full)
编辑:根据要求提供更完整的解释: 如果一开始不清楚我想做什么,下面的内容会比较难理解。在使用管道进行链式操作时,很多时候只想把操作集中在数据的一部分上(某些行、汇总表、列名等)。完成后,通常又希望把这些改动合并回原始数据。但是,如果不打断管道链,这是做不到的。下面这些函数可以让你做到这一点。不过,原始数据的副本保存在一个名为 bank 的变量中。此外,为了正确地把行重新合并回去,save_rows 和 load_rows 使用了一个名为 index 的新列。bank 和 index 都可能在链中被意外修改,从而导致问题。另外,如果已经存在名为 bank 和 index 的变量怎么办?有办法避免这些问题吗?
许多 dplyr 操作会丢弃部分数据。下面是一个可以保存并恢复这些数据的系统。参数 save 是一个字符串,用于标识一块被切除的数据。这些数据以列表的形式存储在 bank 中。请注意,这个想法受到 skranz 的 mutate_if 的启发。
我的问题是:如何更好地处理 bank 和 index,使这些变量既不会被覆盖,也不会干扰已有的变量?
library(dplyr)
library(magrittr)
# Global store for excised data: a list keyed by the `save` label that the
# save_*/load_* helpers write to (via `<<-`) and read from.
bank <- NULL

# Wrap a row-dropping verb so that the rows it removes are kept in `bank`.
#
# FUN: a verb called as FUN(data, ...) that returns a subset of the rows of
#   `data` (e.g. filter, slice).
# Returns a function (data, save, ...) that tags each row with a temporary
# `index` column, applies FUN, stores the rows FUN dropped in bank[[save]],
# and returns the rows FUN kept (still carrying `index`).
save_rows <- function(FUN) {
  function(data, save, ...) {
    # seq_len() rather than 1:nrow(): for zero-row input 1:0 is c(1, 0),
    # which cannot be assigned as a column.
    data$index <- seq_len(nrow(data))
    results <- data %>% FUN(...)
    # The excised rows are those whose index did not survive FUN. The join
    # key is spelled out so it never depends on which other columns happen
    # to be shared.
    bank[[save]] <<- data %>%
      anti_join(results %>% select(index), by = "index")
    results
  }
}
# Put rows stashed by a save_rows() wrapper back into `data`.
#
# data: data still carrying the temporary `index` column.
# save: label under which the excised rows were stored in `bank`.
# Returns `data` and bank[[save]] stacked, sorted by `index`, with the
# `index` column dropped.
load_rows <- function(data, save) {
  combined <- bind_rows(data, bank[[save]])
  ordered <- arrange(combined, index)
  select(ordered, -index)
}
# Wrap a verb so that the complete input is stashed before the verb runs.
#
# FUN: a verb called as FUN(data, ...).
# Returns a function (data, save, ...) that stores `data` unchanged in
# bank[[save]] and returns FUN(data, ...).
save_all <- function(FUN) {
  function(data, save, ...) {
    bank[[save]] <<- data
    FUN(data, ...)
  }
}
# Merge the data stashed under `save` with the current pipeline value.
#
# Returns the full join of bank[[save]] (left) with `data` (right); no `by`
# is given, so the join uses the columns the two tables share.
load_all <- function(data, save) {
  stashed <- bank[[save]]
  full_join(stashed, data)
}
# Stash `data` under `save` and continue the pipeline with its column names.
#
# Returns the column names of `data`.
change_names <- function(data, save) {
  bank[[save]] <<- data
  colnames(data)
}
# Apply a vector of column names to the data stashed under `save`.
#
# colnames: the new column names.
# Returns bank[[save]] with its column names replaced by `colnames`.
attach_names_to <- function(colnames, save) {
  stashed <- bank[[save]]
  set_colnames(stashed, colnames)
}
# Example: each save_*/change_names step stashes data in `bank` under the
# given label, and the matching load_*/attach_names_to step merges it back.
library(stringr)
a = c(1, 1, 2, 3)
b = c("my_momma", "my_momma", "takes_care", "of_me")
c = c("you", "you", "and", "me")
data_frame(a, b, c) %>%
# Keep the rows with a == 1; the other rows are stored in bank[["filter"]].
save_rows(filter)("filter", a == 1) %>%
mutate(c = paste("baby", c, sep = "_")) %>%
# Re-append the stored rows, sort by the temporary index, then drop it.
load_rows("filter") %>%
# Same pattern with slice(1) as the row-dropping verb.
save_rows(slice)("slice", 1) %>%
mutate(b = "my_papa") %>%
load_rows("slice") %>%
group_by(a) %>%
# Store the full grouped data and continue with the per-group summary ...
save_all(summarize)("summarize", n = n()) %>%
# ... then full_join the stored data with that summary.
load_all("summarize") %>%
save_all(distinct)("distinct", a) %>%
mutate(B = b %>% str_replace_all("_", " ")) %>%
select(a, B) %>%
load_all("distinct") %>%
# Store the data and continue with just its column names.
change_names("data") %>%
paste("2", sep = "") %>%
# Apply the edited names to the stored data.
attach_names_to("data")
我看不出你的方法比 dplyrExtras 所采用的方法有什么优势。你的大部分代码都可以用 dplyr 或 dplyrExtras 重写。请参阅下文,了解如何在不定义额外函数的情况下实现你的示例。
此外,使用 <<- 通常不是一个好主意。如果你真的想进一步开发你的方法,也许可以使用类似于 options() 的做法。相关示例请参见 ?igraph:::igraph.options。
另外,你的重命名方式不会保留分组。在你的例子中,最终的分组依据是 a,但数据中已经没有 a。
最后,这里有一个替代示例:
# to get dplyrExtras
library(devtools)
install_github(repo="skranz/dplyrExtras")
require(dplyrExtras)
# The code that does the same as your example (except the final grouping).
# Relies on the vectors a, b and c from the question's example.
data_frame(a, b, c) %>%
# mutate_if() comes from dplyrExtras (not shown here); presumably it applies
# the assignment only to the rows selected by its first argument.
mutate_if(a == 1, c = paste("baby", c, sep = "_")) %>%
mutate_if(1, b = "my_papa") %>%
group_by(a) %>%
mutate(n = n()) %>%
mutate(B = b %>% str_replace_all("_", " ")) %>%
ungroup() %>%
# Append "2" to every column name, then group by the renamed column a2.
do(set_colnames(., paste0(colnames(.), "2"))) %>%
group_by(a2)
您的策略不必要地复杂,在所有情况下都存在更简单的方法。而不是
# Question's approach: set aside the rows where a == 1 is not TRUE, edit the
# remaining rows, then merge the stored rows back.
data_frame(a, b, c) %>%
save_rows(filter)("filter", a == 1) %>%
mutate(c = paste("baby", c, sep = "_")) %>%
load_rows("filter")
使用类似的东西:
# Conditional mutate: only rows with a == 1 get the "baby_" prefix; all other
# rows keep their current value of c.
d <- data_frame(a, b, c)
d %>%
mutate(c = ifelse(a == 1, paste("baby", c, sep = "_"), c))
而不是
# Question's approach: keep only row 1, edit it, then merge the other rows
# back.
d %>%
save_rows(slice)("slice", 1) %>%
mutate(b = "my_papa") %>%
load_rows("slice")
使用类似这样的写法:
d %>%
# row_number() instead of 1:n(): 1:n() is c(1, 0) when the data has no rows.
mutate(b = ifelse(row_number() == 1, "my_papa", b))
而不是
# Question's approach: stash the grouped data, summarize, then full_join the
# stashed data with the summary.
d %>%
group_by(a) %>%
save_all(summarize)("summarize", n = n()) %>%
load_all("summarize")
使用
# A grouped mutate adds the per-group row count n as a column directly.
d %>%
group_by(a) %>%
mutate(n = n())
我仍然认为 mutate_if 所采用的方法是更好的做法。不过,对于某些示例,先汇总再重新合并确实可能更快。我倾向于用一个类似于 mutate_if 的函数来处理这种情况。下面给出我实现的 mutate_group 函数以及一些基准测试。
require(dplyr)
# mutate_group: evaluate mutate-style expressions on the distinct values of
# the named columns and join the results back onto every row.
#
# df:  a data frame (possibly grouped).
# ...: named expressions such as x = gsub('a', 't', .group.x). For each name
#      `nm`, a helper column `.group.nm` (a copy of column `nm`) is available
#      to the expression.
# Returns the joined result with the helper columns removed and the incoming
# grouping of `df` re-applied.
mutate_group <- function(df, ...){
# Capture the expressions unevaluated.
.dots = lazyeval::lazy_dots(...)
# Remember the incoming grouping so it can be restored at the end.
groups <- groups(df)
# Add one `.group.<nm>` column per named expression, copied from column <nm>.
mdf <- df %>%
mutate_(.dots = setNames(names(.dots), paste0(".group.", names(.dots))))
mdf %>%
# Group by the helper columns and summarize with no expressions, which
# presumably leaves one row per distinct combination — TODO confirm.
group_by_(.dots = paste0(".group.", names(.dots))) %>%
summarize_() %>%
ungroup %>%
# Evaluate the user's expressions on that reduced table.
mutate_(.dots=.dots) %>%
# Bring the results back to every original row via the helper columns.
right_join(mdf, by=paste0(".group.", names(.dots))) %>%
# Drop the helper columns and the `.y` copies (the old values from mdf) ...
select_(.dots = c(paste0("-.group.", names(.dots)), paste0("-", names(.dots), ".y"))) %>%
# ... and give the `.x` copies (the new values) their plain names back.
rename_(.dots = setNames(paste0(names(.dots), ".x"), names(.dots))) %>%
group_by_(.dots=groups)
}
现在进行基准测试:
require(microbenchmark)
# Test 1: one column rewritten, ungrouped data, one million rows with four
# distinct values in x.
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = runif(1e6))
#
# r1 uses plain mutate(); r2 uses mutate_group() on the same expression.
microbenchmark(
r1 <- df %>% mutate(x = gsub('a', 't', x))
,
r2 <- df %>% mutate_group(x = gsub('a', 't', .group.x))
)
#
# Recorded output:
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% mutate(x = gsub("a", "t", x)) 324.9036 328.7171 337.6389 330.2874 345.2245 415.6200 100 b
## r2 <- df %>% mutate_group(x = gsub("a", "t", .group.x)) 117.0220 120.1766 128.9403 121.8053 135.4410 208.5801 100 a
#
# Compare the two results, ignoring attributes.
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
#
# Test 2: two columns rewritten in one call, ungrouped data.
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = sample(letters[1:4], 1e6, replace=TRUE),
z = runif(1e6))
# r1 uses plain mutate(); r2 uses mutate_group() on the same expressions.
microbenchmark(
r1 <- df %>% mutate(x = gsub('a', 't', x),
y = gsub('b', 's', y))
,
r2 <- df %>% mutate_group(x = gsub('a', 't', .group.x),
y = gsub('b', 's', .group.y))
)
# Recorded output:
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% mutate(x = gsub("a", "t", x), y = gsub("b", "s", y)) 665.9306 674.2292 691.7966 682.0874 695.8887 776.9765 100 b
## r2 <- df %>% mutate_group(x = gsub("a", "t", .group.x), y = gsub("b", "s", .group.y)) 150.9971 156.5910 177.6797 171.7907 177.9938 279.1329 100 a
#
# Compare the two results, ignoring attributes.
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
#
# Test 3: one column rewritten on data that is already grouped by y.
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = sample(letters[1:4], 1e6, replace=TRUE),
z = runif(1e6))
#
# r1 uses a grouped mutate(); r2 uses mutate_group() on the grouped data.
microbenchmark(
r1 <- df %>% group_by(y) %>% mutate(x = gsub('a', 't', x))
,
r2 <- df %>% group_by(y) %>% mutate_group(x = gsub('a', 't', .group.x))
)
#
# Recorded output:
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% group_by(y) %>% mutate(x = gsub("a", "t", x)) 399.8765 405.0650 415.0338 408.3451 423.2523 494.0247 100 b
## r2 <- df %>% group_by(y) %>% mutate_group(x = gsub("a", "t", .group.x)) 224.0281 231.9769 247.7521 244.8441 248.5926 319.9048 100 a
#
# Compare the two results, ignoring attributes.
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
好的,这是一个新版本。语法更精简了,但仍然没有解决意外干扰的问题。
library(stringr)
library(stringi)
library(dplyr)
library(dplyrExtras)
library(pipeR)
# Stack of stashed data frames; it starts with a single placeholder entry.
bank <- list("dummy" = NULL)

# Position of the entry currently on top of the stack.
oldBank <- function() {
  length(bank)
}

# Position the next stashed entry will take.
newBank <- function() {
  length(bank) + 1
}

# Name of the index column that belongs to the top entry.
oldIndex <- function() {
  paste0("index", oldBank())
}

# Name of the index column the next entry will receive.
newIndex <- function() {
  paste0("index", newBank())
}

# Keep only the index column of the top entry.
selectOldIndex <- function(data) {
  select_(data, oldIndex())
}

# Drop the index column of the top entry.
removeOldIndex <- function(data) {
  select_(data, paste0("- ", oldIndex()))
}
# Stash a snapshot of `data` on top of `bank` before the pipeline narrows it.
#
# A fresh index column (named by newIndex()) numbering the rows is added
# first, so the snapshot and the returned data share the same row ids.
# Returns `data` with that index column attached.
focus <- function(data) {
  # seq_len() rather than 1:nrow(): for zero-row input 1:0 is c(1, 0), which
  # cannot be assigned as a column.
  data[[newIndex()]] <- seq_len(nrow(data))
  bank[[newBank()]] <<- data
  data
}
# Pop the top snapshot off `bank` and join the focused result onto it.
#
# zoom: the pipeline value produced since the matching focus().
# Returns the snapshot (minus its index column) left-joined with `zoom`.
restore <- function(zoom) {
  top <- oldBank()
  snapshot <- removeOldIndex(bank[[top]])
  restoreData <- left_join(snapshot, zoom)
  # Remove the snapshot so the previous focus() becomes the top entry.
  bank[[top]] <<- NULL
  restoreData
}
# Pop the top snapshot off `bank` and re-append the rows missing from `zoom`.
#
# zoom: the pipeline value produced since the matching focus(); it must still
#   carry the index column that focus() added.
# Returns the snapshot rows whose index is absent from `zoom`, followed by
# the rows of `zoom`, with the index column dropped.
restore_rows <- function(zoom) {
  top <- oldBank()
  index_col <- oldIndex()
  restoreData <- bank[[top]] %>>%
    # Match on the index column only. If `zoom` is grouped, its grouping
    # columns may also come through selectOldIndex(), and an implicit join
    # on them could wrongly treat edited rows as missing.
    anti_join(zoom %>>% selectOldIndex(), by = index_col) %>>%
    bind_rows(zoom) %>>%
    removeOldIndex()
  # Remove the snapshot so the previous focus() becomes the top entry.
  bank[[top]] <<- NULL
  restoreData
}
# Example: expand abbreviations for the "character" values only, then merge
# the result back into the full data with focus()/restore_rows()/restore().
options(stringsAsFactors = FALSE)
characters <- c("1st", "2nd", "3rd", "other_value") %>>%
  rep(10) %>>%
  (data.frame(value = ., type = "character"))
numerics <- c("1", "2", "3", ".") %>>%
  rep(10) %>>%
  (data.frame(value = ., type = "numeric"))
data <- bind_rows(characters, numerics)
# Lookup table from abbreviation to full label.
abbrevs <- data_frame(
  value = c("1st", "2nd", "3rd"),
  full = c("first_value", "second_value", "third_value"))
results <-
  data %>>%
  # Snapshot the full data before collapsing it to one row per value.
  focus %>>%
  group_by(value) %>>%
  # n() counts the rows of each group; nrow(.) would give the size of the
  # whole piped table for every group.
  summarize(n = n(), type = first(type)) %>>%
  # Snapshot the summary before narrowing it to the "character" rows.
  focus %>>%
  filter(type == "character") %>>%
  left_join(abbrevs) %>>%
  mutate_if(is.na(full), full = value) %>>%
  mutate(full = full %>>%
    str_replace_all("_", " ") %>>%
    stri_trans_totitle()) %>>%
  # Re-append the summary rows that the filter removed.
  restore_rows %>>%
  # Join the summary back onto the original rows.
  restore %>>%
  mutate_if(!is.na(full), value = full) %>>%
  select(-full)
好的,这是版本 3。通过允许用户为 bank 和 index 指定自己的名称,我解决了干扰问题。
library(stringr)
library(stringi)
library(dplyr)
library(dplyrExtras)
library(pipeR)
library(lazyeval)
library(gtools)
# Macro that defines a stash plus the focus()/restore()/restore_rows() helpers
# under caller-chosen names.
#
# Built with gtools::defmacro, which substitutes the arguments into `expr`
# and evaluates the result where the macro is called.
#
# bank:  unquoted name of the list variable to use as the stash.
# index: string prefix for the temporary index columns; the stash position
#        is appended to it.
construct_bank_index <- defmacro(bank, index, expr = {
  # Stack of stashed data frames; it starts with a single placeholder entry.
  bank <- list("NULL" = NULL)
  # Position of the top entry / of the entry that would be pushed next.
  old_bank <- function() length(bank)
  new_bank <- function() length(bank) + 1
  # Index-column name of the top entry / of the next entry.
  old_index <- function() paste0(index, old_bank())
  newIndex <- function() paste0(index, new_bank())
  select_old_index <- function(data) {
    select_(data, old_index())
  }
  remove_old_index <- function(data) {
    select_(data, paste0("- ", old_index()))
  }
  # Number the rows in a fresh index column, push a snapshot, return the data.
  focus <- function(data) {
    # seq_len() rather than 1:nrow(): 1:0 is c(1, 0) for zero-row input.
    data[[newIndex()]] <- seq_len(nrow(data))
    bank[[new_bank()]] <<- data
    data
  }
  # Pop the top snapshot and left-join the focused result onto it.
  restore <- function(zoom) {
    restoreData <-
      bank[[old_bank()]] %>>%
      remove_old_index %>>%
      left_join(zoom)
    bank[[old_bank()]] <<- NULL
    restoreData
  }
  # Pop the top snapshot and re-append the rows that are missing from `zoom`.
  restore_rows <- function(zoom) {
    restoreData <-
      bank[[old_bank()]] %>>%
      # Match on the index column only, so grouping columns that may come
      # along with `zoom` cannot affect which rows count as missing.
      anti_join(zoom %>>% select_old_index(), by = old_index()) %>>%
      bind_rows(zoom) %>>%
      remove_old_index()
    bank[[old_bank()]] <<- NULL
    restoreData
  }
})
# Example: same workflow as version 2, with the helpers generated by the macro.
# Define `bank` plus focus()/restore()/restore_rows(), using "index" as the
# prefix for the temporary index columns.
construct_bank_index(bank, "index")
options(stringsAsFactors = FALSE)
characters = c("1st", "2nd", "3rd", "other_value") %>>%
rep(10) %>>%
(data.frame(value = ., type = "character"))
numerics = c("1", "2", "3", ".") %>>%
rep(10) %>>%
(data.frame(value = ., type = "numeric"))
data = bind_rows(characters, numerics)
# Lookup table from abbreviation to full label.
abbrevs = data_frame(
value = c("1st", "2nd", "3rd"),
full = c("first_value", "second_value", "third_value"))
results =
data %>>%
# Snapshot the full data before collapsing it to one row per value.
focus %>>%
group_by(value) %>>%
summarize(n = n(), type = first(type)) %>>%
# Snapshot the summary before narrowing it to the "character" rows.
focus %>>%
filter(type == "character") %>>%
left_join(abbrevs) %>>%
mutate_if(is.na(full), full = value) %>>%
mutate(full = full %>>%
str_replace_all("_", " ") %>>%
stri_trans_totitle()) %>>%
# Re-append the summary rows that the filter removed.
restore_rows %>>%
# Join the summary back onto the original rows.
restore %>>%
mutate_if(!is.na(full), value = full) %>>%
select(-full)