R 中的 gather 函数丢失了一列
Gather function in R dropping column
我正在利用从 Project Gutenberg 网站下载的数据,比较几位作者的用词,但在处理 tibble 时遇到了一些问题。我的最终目标是绘制一张图,把 Herman Melville 和 Lewis Carroll 的单词使用频率与 Washington Irving 的进行比较。但是,我的 tibble 中没有 Irving 这一列,因此当我在 ggplot 中引用它时就会出错。
我希望我的 frequency
tibble 看起来像
# A tibble: 72,984 x 4
word Irving author proportion
<chr> <dbl> <chr> <dbl>
1 a'dale 0.00000907 Melville NA
2 aa NA Melville 0.0000246
3 ab NA Melville NA
4 aback NA Melville 0.0000369
5 abana NA Melville 0.0000123
6 abandon 0.0000363 Melville 0.0000861
7 abandoned 0.000163 Melville 0.000172
8 abandoning 0.0000181 Melville NA
9 abandonment 0.00000907 Melville 0.0000123
10 abasement 0.0000181 Melville 0.0000123
# ... with 72,974 more rows
但它看起来像
# A tibble: 72,984 x 3
word author proportion
<chr> <chr> <dbl>
1 a'dale Melville NA
2 aa Melville 0.0000246
3 ab Melville NA
4 aback Melville 0.0000369
5 abana Melville 0.0000123
6 abandon Melville 0.0000861
7 abandoned Melville 0.000172
8 abandoning Melville NA
9 abandonment Melville 0.0000123
10 abasement Melville 0.0000123
# ... with 72,974 more rows
我不确定在用 gather 生成 frequency tibble 时自己做错了什么。
代码
# Import libraries
library(tidyverse) # dplyr, tidyr, stringr, ggplot2
library(tidytext)
library(gutenbergr)
# Project Gutenberg IDs of the four works sampled for each author
irving_ids <- c(49872, 41, 14228, 13514)
melville_ids <- c(15, 4045, 28656, 2694)
carroll_ids <- c(19033, 620, 12, 4763)

# Download the works, keeping one result object per author
wirving <- gutenberg_download(irving_ids)
hmelville <- gutenberg_download(melville_ids)
lcarroll <- gutenberg_download(carroll_ids)
# Tidy each author's downloads with one shared helper.
# The helper tokenises the `text` column into a `word` column, reduces each
# token to its first run of lowercase letters/apostrophes, and drops every
# row whose word is listed in `stop_words`.
tidy_books <- function(books) {
  tokens <- unnest_tokens(books, word, text)
  cleaned <- mutate(tokens, word = str_extract(word, "[a-z']+"))
  anti_join(cleaned, stop_words, by = "word")
}

tidy_wirving <- tidy_books(wirving)
tidy_hmelville <- tidy_books(hmelville)
tidy_lcarroll <- tidy_books(lcarroll)
# Word frequency per author, spread into one column per author.

# Stack the three tidy tables, tagging every row with its author
all_author_words <- bind_rows(
  mutate(tidy_wirving, author = "Irving"),
  mutate(tidy_hmelville, author = "Melville"),
  mutate(tidy_lcarroll, author = "Carroll")
)

# Count how often each author uses each (re-extracted) word
word_counts <- all_author_words %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word)

# Turn the counts into each word's share of that author's total
word_shares <- word_counts %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n)

# One row per word, one proportion column per author
frequency_by_word_across_authors <- spread(word_shares, author, proportion)
# Compare the frequency of Melville and Carroll against Irving: gather the
# author columns into `author` / `proportion` pairs (long form).
# NOTE(review): spread() presumably leaves the author columns in
# alphabetical order (Carroll, Irving, Melville), so the range
# `Melville`:`Carroll` would also cover Irving and gather it away, which
# matches the 3-column tibble reported above. Confirm with
# names(frequency_by_word_across_authors).
frequency <- frequency_by_word_across_authors %>%
gather(author, proportion,`Melville`:`Carroll`)
# Scatter plot of each word's proportion for the gathered authors (x)
# against its proportion for Irving (y), one facet per author, with both
# axes on a log10 scale and labelled as percentages.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Irving`,
    color = abs(`Irving` - proportion)
  )
) +
  # Dashed reference line (default slope 1, intercept 0)
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  # percent_format() belongs to the scales package, which none of the
  # library() calls above attach, so it is called with its namespace.
  scale_x_log10(labels = scales::percent_format()) +
  scale_y_log10(labels = scales::percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = "none") +
  # Author's name in its usual order (was "Irving Washington")
  labs(y = "Washington Irving", x = NULL)
# Error reported when running the plot:
# Error in FUN(X[[i]], ...) : object 'Irving' not found
问题在于您使用 gather() 的方式:您想要收集的两列(Carroll 和 Melville)并不相邻,所以不应使用 `:` 来选择列范围,而应逐一列出列名:
# Gather only the Carroll and Melville columns, naming each one explicitly,
# so that Irving remains a column of its own.
frequency <- gather(
  frequency_by_word_across_authors,
  key = "author",
  value = "proportion",
  Carroll, Melville
)
# Scatter plot of each word's proportion for Carroll/Melville (x) against
# its proportion for Irving (y), one facet per author, with both axes on a
# log10 scale and labelled as percentages.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = Irving,
    color = abs(Irving - proportion)
  )
) +
  # Dashed reference line (default slope 1, intercept 0)
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  # percent_format() belongs to the scales package, which
  # library(tidyverse) does not attach, so it is called with its namespace.
  scale_x_log10(labels = scales::percent_format()) +
  scale_y_log10(labels = scales::percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = "none") +
  # Author's name in its usual order (was "Irving Washington")
  labs(y = "Washington Irving", x = NULL)
创建于 2019-11-01,由 reprex package (v0.3.0) 生成
我正在利用从 Project Gutenberg 网站下载的数据,比较几位作者的用词,但在处理 tibble 时遇到了一些问题。我的最终目标是绘制一张图,把 Herman Melville 和 Lewis Carroll 的单词使用频率与 Washington Irving 的进行比较。但是,我的 tibble 中没有 Irving 这一列,因此当我在 ggplot 中引用它时就会出错。
我希望我的 frequency
tibble 看起来像
# A tibble: 72,984 x 4
word Irving author proportion
<chr> <dbl> <chr> <dbl>
1 a'dale 0.00000907 Melville NA
2 aa NA Melville 0.0000246
3 ab NA Melville NA
4 aback NA Melville 0.0000369
5 abana NA Melville 0.0000123
6 abandon 0.0000363 Melville 0.0000861
7 abandoned 0.000163 Melville 0.000172
8 abandoning 0.0000181 Melville NA
9 abandonment 0.00000907 Melville 0.0000123
10 abasement 0.0000181 Melville 0.0000123
# ... with 72,974 more rows
但它看起来像
# A tibble: 72,984 x 3
word author proportion
<chr> <chr> <dbl>
1 a'dale Melville NA
2 aa Melville 0.0000246
3 ab Melville NA
4 aback Melville 0.0000369
5 abana Melville 0.0000123
6 abandon Melville 0.0000861
7 abandoned Melville 0.000172
8 abandoning Melville NA
9 abandonment Melville 0.0000123
10 abasement Melville 0.0000123
# ... with 72,974 more rows
我不确定在用 gather 生成 frequency tibble 时自己做错了什么。
代码
# Import libraries
library(tidyverse) # dplyr, tidyr, stringr, ggplot2
library(tidytext)
library(gutenbergr)
# Project Gutenberg IDs of the four works sampled for each author
irving_ids <- c(49872, 41, 14228, 13514)
melville_ids <- c(15, 4045, 28656, 2694)
carroll_ids <- c(19033, 620, 12, 4763)

# Download the works, keeping one result object per author
wirving <- gutenberg_download(irving_ids)
hmelville <- gutenberg_download(melville_ids)
lcarroll <- gutenberg_download(carroll_ids)
# Tidy each author's downloads with one shared helper.
# The helper tokenises the `text` column into a `word` column, reduces each
# token to its first run of lowercase letters/apostrophes, and drops every
# row whose word is listed in `stop_words`.
tidy_books <- function(books) {
  tokens <- unnest_tokens(books, word, text)
  cleaned <- mutate(tokens, word = str_extract(word, "[a-z']+"))
  anti_join(cleaned, stop_words, by = "word")
}

tidy_wirving <- tidy_books(wirving)
tidy_hmelville <- tidy_books(hmelville)
tidy_lcarroll <- tidy_books(lcarroll)
# Word frequency per author, spread into one column per author.

# Stack the three tidy tables, tagging every row with its author
all_author_words <- bind_rows(
  mutate(tidy_wirving, author = "Irving"),
  mutate(tidy_hmelville, author = "Melville"),
  mutate(tidy_lcarroll, author = "Carroll")
)

# Count how often each author uses each (re-extracted) word
word_counts <- all_author_words %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word)

# Turn the counts into each word's share of that author's total
word_shares <- word_counts %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n)

# One row per word, one proportion column per author
frequency_by_word_across_authors <- spread(word_shares, author, proportion)
# Compare the frequency of Melville and Carroll against Irving: gather the
# author columns into `author` / `proportion` pairs (long form).
# NOTE(review): spread() presumably leaves the author columns in
# alphabetical order (Carroll, Irving, Melville), so the range
# `Melville`:`Carroll` would also cover Irving and gather it away, which
# matches the 3-column tibble reported above. Confirm with
# names(frequency_by_word_across_authors).
frequency <- frequency_by_word_across_authors %>%
gather(author, proportion,`Melville`:`Carroll`)
# Scatter plot of each word's proportion for the gathered authors (x)
# against its proportion for Irving (y), one facet per author, with both
# axes on a log10 scale and labelled as percentages.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Irving`,
    color = abs(`Irving` - proportion)
  )
) +
  # Dashed reference line (default slope 1, intercept 0)
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  # percent_format() belongs to the scales package, which none of the
  # library() calls above attach, so it is called with its namespace.
  scale_x_log10(labels = scales::percent_format()) +
  scale_y_log10(labels = scales::percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = "none") +
  # Author's name in its usual order (was "Irving Washington")
  labs(y = "Washington Irving", x = NULL)
# Error reported when running the plot:
# Error in FUN(X[[i]], ...) : object 'Irving' not found
问题在于您使用 gather() 的方式:您想要收集的两列(Carroll 和 Melville)并不相邻,所以不应使用 `:` 来选择列范围,而应逐一列出列名:
# Gather only the Carroll and Melville columns, naming each one explicitly,
# so that Irving remains a column of its own.
frequency <- gather(
  frequency_by_word_across_authors,
  key = "author",
  value = "proportion",
  Carroll, Melville
)
# Scatter plot of each word's proportion for Carroll/Melville (x) against
# its proportion for Irving (y), one facet per author, with both axes on a
# log10 scale and labelled as percentages.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = Irving,
    color = abs(Irving - proportion)
  )
) +
  # Dashed reference line (default slope 1, intercept 0)
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  # percent_format() belongs to the scales package, which
  # library(tidyverse) does not attach, so it is called with its namespace.
  scale_x_log10(labels = scales::percent_format()) +
  scale_y_log10(labels = scales::percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = "none") +
  # Author's name in its usual order (was "Irving Washington")
  labs(y = "Washington Irving", x = NULL)
创建于 2019-11-01,由 reprex package (v0.3.0) 生成