当列名为长字符串时,在 R 中转换为长格式并生成频率表
Convert to long and make frequency table when column names are strings, R
ID What color is this item? What color is this item?_2 What is the shape of this item? What is the shape of this item?_2 size
55 red blue circle triangle small
83 blue yellow circle NA large
78 red yellow square circle large
43 green NA square circle small
29 yellow green circle triangle medium
我想获得这样的频率 table:
Variable Level Freq Percent
color blue 2 22.22
red 2 22.22
yellow 3 33.33
green 2 22.22
total 9 100.00
shape circle 5 50.0
triangle 3 30.0
square 2 20.0
total 10 100.0
size small 2 33.3
medium 2 33.3
large 2 33.3
total 6 100.0
但是当我尝试转换为长格式时,我无法匹配我的列名,因为它们是很长的字符串。根据之前的一个问题,我知道可以这样做:
# Question snippet: build a per-question frequency table from wide data.
# Print numbers with 3 significant digits.
options(digits = 3)
# NOTE(review): df2 is not defined in the visible text; presumably it is the
# wide survey data frame shown above -- confirm.
df1 <- df2 %>%
# Stack every column except ID into Question (old column name) / Response pairs.
pivot_longer(
-ID,
names_to = "Question",
values_to = "Response"
) %>%
# Placeholder: the pattern is deliberately left empty; the regex that should
# reduce each long column name to a short label is what is being asked for.
mutate(Question = str_extract(Question, '')) %>%
# Count how often each Response occurs within each Question.
group_by(Question, Response) %>%
count(Response, name = "Freq") %>%
# Drop rows containing NA (e.g. missing responses).
na.omit() %>%
# Percentages are computed within each Question.
group_by(Question) %>%
mutate(Percent = Freq/sum(Freq)*100) %>%
# Split into one piece per Question, add totals to each piece, then restack.
# NOTE(review): adorn_totals() is presumably janitor's -- confirm it is loaded.
group_split() %>%
adorn_totals() %>%
bind_rows() %>%
# Wherever Response equals the last row's Response, use the last row's
# Question value instead.
mutate(Response = ifelse(Response == last(Response), last(Question), Response)) %>%
# Blank out (NA) repeated Question labels and the "Total" label.
mutate(Question = ifelse(duplicated(Question) |
Question == "Total", NA, Question))
但是我找不到正确的正则表达式来放入行中:
mutate(Question = str_extract(Question, '')) %>%
如果有人知道另一种方法,那就太好了!
如果打算提取自定义的单词列表,我们可以用 | 将这些单词拼接成单个字符串,并将其作为 str_extract 中的正则表达式(regex)使用
library(dplyr)
library(tidyr)
library(janitor)
library(stringr)
library(flextable)
# Keywords that identify each question. Joined with "|" they form the
# alternation regex that pulls the keyword out of each long column name.
pat_words <- c("color", "shape", "size")

out <- df %>%
  # Wide -> long: one row per (ID, question column, response).
  pivot_longer(-ID, names_to = "Question", values_to = "Response") %>%
  # Reduce each long column name to the first keyword it contains.
  mutate(Question = str_extract(Question, str_c(pat_words, collapse = "|"))) %>%
  # Frequency of every response within every question; NA rows are dropped.
  group_by(Question, Response) %>%
  count(Response, name = "Freq") %>%
  na.omit() %>%
  # Percentages are relative to the question's own total.
  group_by(Question) %>%
  mutate(Percent = round(Freq / sum(Freq) * 100, 2)) %>%
  # One piece per question, totals added to each piece, then stacked again.
  group_split() %>%
  adorn_totals() %>%
  bind_rows() %>%
  # Move the total label into Response and show each question label only once.
  mutate(
    Response = ifelse(Response == last(Response), last(Question), Response),
    Question = ifelse(duplicated(Question) | Question == "Total", NA, Question)
  ) %>%
  as.data.frame()

# Render the result as a formatted table.
flextable(out)
-输出
数据
# Example survey data: one row per item, two answer columns for the color and
# shape questions, one for size. check.names = FALSE keeps the question text
# (spaces, "?") as the column names.
df <- data.frame(
  ID = c(55L, 83L, 78L, 43L, 29L),
  `What color is this item?` = c("red", "blue", "red", "green", "yellow"),
  `What color is this item?_2` = c("blue", "yellow", "yellow", NA, "green"),
  `What is the shape of this item?` = c(
    "circle", "circle", "square", "square", "circle"
  ),
  `What is the shape of this item?_2` = c(
    "triangle", NA, "circle", "circle", "triangle"
  ),
  size = c("small", "large", "large", "small", "medium"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
有必要对列的内容做出假设(appl),即给出重要的关键字。
然后根据列创建数据框
# Map each keyword to the indices of the columns whose names contain it.
# lapply() always returns a (named) list. sapply() would collapse the result
# to a matrix whenever every keyword matches the same number of columns, and
# appl[[x]] / names(appl[x]) below would then pick single cells.
appl <- lapply(
  c(color = "color", shape = "shape", size = "size"),
  function(x) grep(x, colnames(dat))
)
# Build one character matrix per keyword (one row per level plus an unnamed
# total row), stack them, and convert to a data frame. The unnamed total rows
# get their row names generated by data.frame().
data.frame(do.call(rbind, lapply(seq_along(appl), function(x) {
  # drop = FALSE keeps a data frame even when only one column matches;
  # table() ignores NA responses.
  tbl <- table(unlist(dat[, appl[[x]], drop = FALSE]))
  rbind(
    cbind(
      Variable = names(appl[x]),
      Freq = tbl,
      Percent = round(tbl / sum(tbl) * 100, digits = 2)
    ),
    cbind(Variable = names(appl[x]), sum(tbl), sum(tbl / sum(tbl) * 100))
  )
})))
Variable Freq Percent
blue color 2 22.22
green color 2 22.22
red color 2 22.22
yellow color 3 33.33
X color 9 100
circle shape 5 55.56
square shape 2 22.22
triangle shape 2 22.22
X.1 shape 9 100
large size 2 40
medium size 1 20
small size 2 40
X.2 size 5 100
数据
# Example survey data with syntactic (dot-separated) column names: two answer
# columns each for the color and shape questions, one for size.
dat <- data.frame(
  ID = c(55L, 83L, 78L, 43L, 29L),
  What.color.is.this.item. = c("red", "blue", "red", "green", "yellow"),
  What.color.is.this.item._2 = c("blue", "yellow", "yellow", NA, "green"),
  What.is.the.shape.of.this.item. = c(
    "circle", "circle", "square", "square", "circle"
  ),
  What.is.the.shape.of.this.item._2 = c(
    "triangle", NA, "circle", "circle", "triangle"
  ),
  size = c("small", "large", "large", "small", "medium"),
  stringsAsFactors = FALSE
)
首先,编码时应使用更规范的列名
# Rename the four question columns to color.1, color.2, shape.1, shape.2.
names(dat)[2:5] <- paste(rep(c("color", "shape"), each = 2), 1:2, sep = ".")
现在我们很容易将数据转换成长格式。
# Wide -> long: columns 2-5 are the varying measurements, ID identifies a row.
dat_l <- reshape(dat, varying = 2:5, idvar = "ID", direction = "long")
之后我们就可以使用 base R 中的 table() 及其相关函数了,
# Columns to tabulate, listed by name. Indexing the unnamed character vector
# names(dat_l) with strings yields NA for every entry, so the names are given
# directly.
vars <- c("size", "color", "shape")
# For each variable build a two-column matrix: counts with a "Sum" margin and
# the matching percentages, rounded to 2 decimals. table() drops NA responses.
# NOTE(review): size is not one of the reshaped columns, so it appears to be
# repeated once per time point in the long data; its counts are then doubled
# while the percentages are unaffected -- confirm this is acceptable.
tbl <- lapply(vars, \(x) {
  freq <- table(dat_l[, x])
  round(
    cbind(
      Freq = addmargins(freq),
      Percent = addmargins(proportions(freq)) * 100
    ),
    2
  )
}) |>
  setNames(vars)
为控制台获得一个不错的table。
tbl
# $size
# Freq Percent
# large 4 40
# medium 2 20
# small 4 40
# Sum 10 100
#
# $color
# Freq Percent
# blue 2 22.22
# green 2 22.22
# red 2 22.22
# yellow 3 33.33
# Sum 9 100.00
#
# $shape
# Freq Percent
# circle 5 55.56
# square 2 22.22
# triangle 2 22.22
# Sum 9 100.00
# [1] "R version 4.1.2 (2021-11-01)"
数据
# Example survey data with syntactic (dot-separated) column names: two answer
# columns each for the color and shape questions, one for size.
dat <- data.frame(
  ID = c(55L, 83L, 78L, 43L, 29L),
  What.color.is.this.item. = c("red", "blue", "red", "green", "yellow"),
  What.color.is.this.item._2 = c("blue", "yellow", "yellow", NA, "green"),
  What.is.the.shape.of.this.item. = c(
    "circle", "circle", "square", "square", "circle"
  ),
  What.is.the.shape.of.this.item._2 = c(
    "triangle", NA, "circle", "circle", "triangle"
  ),
  size = c("small", "large", "large", "small", "medium"),
  stringsAsFactors = FALSE
)
ID What color is this item? What color is this item?_2 What is the shape of this item? What is the shape of this item?_2 size
55 red blue circle triangle small
83 blue yellow circle NA large
78 red yellow square circle large
43 green NA square circle small
29 yellow green circle triangle medium
我想获得这样的频率 table:
Variable Level Freq Percent
color blue 2 22.22
red 2 22.22
yellow 3 33.33
green 2 22.22
total 9 100.00
shape circle 5 50.0
triangle 3 30.0
square 2 20.0
total 10 100.0
size small 2 33.3
medium 2 33.3
large 2 33.3
total 6 100.0
但是当我尝试转换为长格式时,我无法匹配我的列名,因为它们是很长的字符串。根据之前的一个问题,我知道可以这样做:
# Question snippet: build a per-question frequency table from wide data.
# Print numbers with 3 significant digits.
options(digits = 3)
# NOTE(review): df2 is not defined in the visible text; presumably it is the
# wide survey data frame shown above -- confirm.
df1 <- df2 %>%
# Stack every column except ID into Question (old column name) / Response pairs.
pivot_longer(
-ID,
names_to = "Question",
values_to = "Response"
) %>%
# Placeholder: the pattern is deliberately left empty; the regex that should
# reduce each long column name to a short label is what is being asked for.
mutate(Question = str_extract(Question, '')) %>%
# Count how often each Response occurs within each Question.
group_by(Question, Response) %>%
count(Response, name = "Freq") %>%
# Drop rows containing NA (e.g. missing responses).
na.omit() %>%
# Percentages are computed within each Question.
group_by(Question) %>%
mutate(Percent = Freq/sum(Freq)*100) %>%
# Split into one piece per Question, add totals to each piece, then restack.
# NOTE(review): adorn_totals() is presumably janitor's -- confirm it is loaded.
group_split() %>%
adorn_totals() %>%
bind_rows() %>%
# Wherever Response equals the last row's Response, use the last row's
# Question value instead.
mutate(Response = ifelse(Response == last(Response), last(Question), Response)) %>%
# Blank out (NA) repeated Question labels and the "Total" label.
mutate(Question = ifelse(duplicated(Question) |
Question == "Total", NA, Question))
但是我找不到正确的正则表达式来放入行中:
mutate(Question = str_extract(Question, '')) %>%
如果有人知道另一种方法,那就太好了!
如果打算提取自定义的单词列表,我们可以用 | 将这些单词拼接成单个字符串,并将其作为 str_extract 中的正则表达式(regex)使用
library(dplyr)
library(tidyr)
library(janitor)
library(stringr)
library(flextable)
# Keywords that identify each question. Joined with "|" they form the
# alternation regex that pulls the keyword out of each long column name.
pat_words <- c("color", "shape", "size")

out <- df %>%
  # Wide -> long: one row per (ID, question column, response).
  pivot_longer(-ID, names_to = "Question", values_to = "Response") %>%
  # Reduce each long column name to the first keyword it contains.
  mutate(Question = str_extract(Question, str_c(pat_words, collapse = "|"))) %>%
  # Frequency of every response within every question; NA rows are dropped.
  group_by(Question, Response) %>%
  count(Response, name = "Freq") %>%
  na.omit() %>%
  # Percentages are relative to the question's own total.
  group_by(Question) %>%
  mutate(Percent = round(Freq / sum(Freq) * 100, 2)) %>%
  # One piece per question, totals added to each piece, then stacked again.
  group_split() %>%
  adorn_totals() %>%
  bind_rows() %>%
  # Move the total label into Response and show each question label only once.
  mutate(
    Response = ifelse(Response == last(Response), last(Question), Response),
    Question = ifelse(duplicated(Question) | Question == "Total", NA, Question)
  ) %>%
  as.data.frame()

# Render the result as a formatted table.
flextable(out)
-输出
数据
# Example survey data: one row per item, two answer columns for the color and
# shape questions, one for size. check.names = FALSE keeps the question text
# (spaces, "?") as the column names.
df <- data.frame(
  ID = c(55L, 83L, 78L, 43L, 29L),
  `What color is this item?` = c("red", "blue", "red", "green", "yellow"),
  `What color is this item?_2` = c("blue", "yellow", "yellow", NA, "green"),
  `What is the shape of this item?` = c(
    "circle", "circle", "square", "square", "circle"
  ),
  `What is the shape of this item?_2` = c(
    "triangle", NA, "circle", "circle", "triangle"
  ),
  size = c("small", "large", "large", "small", "medium"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
有必要对列的内容做出假设(appl),即给出重要的关键字。
然后根据列创建数据框
# Map each keyword to the indices of the columns whose names contain it.
# lapply() always returns a (named) list. sapply() would collapse the result
# to a matrix whenever every keyword matches the same number of columns, and
# appl[[x]] / names(appl[x]) below would then pick single cells.
appl <- lapply(
  c(color = "color", shape = "shape", size = "size"),
  function(x) grep(x, colnames(dat))
)
# Build one character matrix per keyword (one row per level plus an unnamed
# total row), stack them, and convert to a data frame. The unnamed total rows
# get their row names generated by data.frame().
data.frame(do.call(rbind, lapply(seq_along(appl), function(x) {
  # drop = FALSE keeps a data frame even when only one column matches;
  # table() ignores NA responses.
  tbl <- table(unlist(dat[, appl[[x]], drop = FALSE]))
  rbind(
    cbind(
      Variable = names(appl[x]),
      Freq = tbl,
      Percent = round(tbl / sum(tbl) * 100, digits = 2)
    ),
    cbind(Variable = names(appl[x]), sum(tbl), sum(tbl / sum(tbl) * 100))
  )
})))
Variable Freq Percent
blue color 2 22.22
green color 2 22.22
red color 2 22.22
yellow color 3 33.33
X color 9 100
circle shape 5 55.56
square shape 2 22.22
triangle shape 2 22.22
X.1 shape 9 100
large size 2 40
medium size 1 20
small size 2 40
X.2 size 5 100
数据
# Example survey data with syntactic (dot-separated) column names: two answer
# columns each for the color and shape questions, one for size.
dat <- data.frame(
  ID = c(55L, 83L, 78L, 43L, 29L),
  What.color.is.this.item. = c("red", "blue", "red", "green", "yellow"),
  What.color.is.this.item._2 = c("blue", "yellow", "yellow", NA, "green"),
  What.is.the.shape.of.this.item. = c(
    "circle", "circle", "square", "square", "circle"
  ),
  What.is.the.shape.of.this.item._2 = c(
    "triangle", NA, "circle", "circle", "triangle"
  ),
  size = c("small", "large", "large", "small", "medium"),
  stringsAsFactors = FALSE
)
首先,编码时应使用更规范的列名
# Rename the four question columns to color.1, color.2, shape.1, shape.2.
names(dat)[2:5] <- paste(rep(c("color", "shape"), each = 2), 1:2, sep = ".")
现在我们很容易将数据转换成长格式。
# Wide -> long: columns 2-5 are the varying measurements, ID identifies a row.
dat_l <- reshape(dat, varying = 2:5, idvar = "ID", direction = "long")
之后我们就可以使用 base R 中的 table() 及其相关函数了,
# Columns to tabulate, listed by name. Indexing the unnamed character vector
# names(dat_l) with strings yields NA for every entry, so the names are given
# directly.
vars <- c("size", "color", "shape")
# For each variable build a two-column matrix: counts with a "Sum" margin and
# the matching percentages, rounded to 2 decimals. table() drops NA responses.
# NOTE(review): size is not one of the reshaped columns, so it appears to be
# repeated once per time point in the long data; its counts are then doubled
# while the percentages are unaffected -- confirm this is acceptable.
tbl <- lapply(vars, \(x) {
  freq <- table(dat_l[, x])
  round(
    cbind(
      Freq = addmargins(freq),
      Percent = addmargins(proportions(freq)) * 100
    ),
    2
  )
}) |>
  setNames(vars)
为控制台获得一个不错的table。
tbl
# $size
# Freq Percent
# large 4 40
# medium 2 20
# small 4 40
# Sum 10 100
#
# $color
# Freq Percent
# blue 2 22.22
# green 2 22.22
# red 2 22.22
# yellow 3 33.33
# Sum 9 100.00
#
# $shape
# Freq Percent
# circle 5 55.56
# square 2 22.22
# triangle 2 22.22
# Sum 9 100.00
# [1] "R version 4.1.2 (2021-11-01)"
数据
# Example survey data with syntactic (dot-separated) column names: two answer
# columns each for the color and shape questions, one for size.
dat <- data.frame(
  ID = c(55L, 83L, 78L, 43L, 29L),
  What.color.is.this.item. = c("red", "blue", "red", "green", "yellow"),
  What.color.is.this.item._2 = c("blue", "yellow", "yellow", NA, "green"),
  What.is.the.shape.of.this.item. = c(
    "circle", "circle", "square", "square", "circle"
  ),
  What.is.the.shape.of.this.item._2 = c(
    "triangle", NA, "circle", "circle", "triangle"
  ),
  size = c("small", "large", "large", "small", "medium"),
  stringsAsFactors = FALSE
)