R中按列汇总数据
Summary data by column in R
我有以下数据
# Example data: 17 rows with four distinct pt_id values, one numeric column
# (Tob_pk) and three character columns that contain missing values.
pt_id <- rep(c(1, 2, 3, 4), times = c(5, 3, 5, 4))
Tob_pk <- c(2, 5, 7, 1, 8, 12, 14, 3, 6, 8, 10, 20, 13, 5, 4, 12, 10)
Tobacco <- c(
  "Once", "Twice", "Never", NA, NA, NA, NA, NA, "Once",
  "Twice", "Quit", "Once", NA, NA, "Never", NA, "Never"
)
Alcohol <- c(
  "Twice", "Once", NA, NA, "Never", NA, NA, "Once", NA,
  "Quit", "Twice", NA, "Once", NA, NA, "Never", "Never"
)
PA <- c(
  "Once", NA, "Never", NA, NA, NA, NA, NA, "Once",
  NA, "Quit", "Once", NA, NA, "Never", NA, NA
)
# Assemble the vectors into one data frame; column names match the vectors.
mydata <- data.frame(
  pt_id = pt_id, Tob_pk = Tob_pk,
  Tobacco = Tobacco, Alcohol = Alcohol, PA = PA
)
# Print the data frame.
mydata
我想要数据集中每个变量的摘要/比例,我尝试使用以下代码获取每个变量的摘要/比例
# Attempt from the question: apply prop.table() directly to columns 3-5.
# This call fails with the error quoted below, because prop.table() is handed
# the raw character column (e.g. `Tobacco`) instead of a table of counts.
data_summ <- mydata %>%
summarize_at(.vars=3:5, funs(prop.table(.)))
但是,我收到以下错误
Error: Problem with `summarise()` input `Tobacco`.
x invalid 'type' (character) of argument
ℹ Input `Tobacco` is `prop.table(Tobacco)`.
Run `rlang::last_error()` to see where the error occurred.
我不确定哪里出了问题。如果有人能建议如何得到下面的输出(并且还包含 NA 的百分比),那将非常有帮助。
Tobacco Alcohol PA
Never 0.3333333 Never 0.3333333 Never 0.3333333
Once 0.3333333 Once 0.3333333 Once 0.5000000
Quit 0.1111111 Quit 0.1111111 Quit 0.1666667
Twice 0.2222222 Twice 0.2222222
提前致谢!
使用base
# Example data: 17 rows, one numeric column (Tob_pk) and three character
# columns that contain missing values.
pt_id <- c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4)
Tob_pk <- c(2, 5, 7, 1, 8, 12, 14, 3, 6, 8, 10, 20, 13, 5, 4, 12, 10)
Tobacco <- c(
  "Once", "Twice", "Never", NA, NA, NA, NA, NA, "Once",
  "Twice", "Quit", "Once", NA, NA, "Never", NA, "Never"
)
Alcohol <- c(
  "Twice", "Once", NA, NA, "Never", NA, NA, "Once", NA,
  "Quit", "Twice", NA, "Once", NA, NA, "Never", "Never"
)
PA <- c(
  "Once", NA, "Never", NA, NA, NA, NA, NA, "Once",
  NA, "Quit", "Once", NA, NA, "Never", NA, NA
)
mydata <- data.frame(pt_id, Tob_pk, Tobacco, Alcohol, PA)

# Proportion of each category (NA included as its own category) for columns
# 3-5. lapply() iterates over the columns directly and always returns a named
# list; apply() would first coerce the data frame to a matrix and would return
# a matrix instead of a list whenever all tables happen to have equal length.
lapply(mydata[3:5], function(x) prop.table(table(x, useNA = "ifany")))
$Tobacco
x
Never Once Quit Twice <NA>
0.17647059 0.17647059 0.05882353 0.11764706 0.47058824
$Alcohol
x
Never Once Quit Twice <NA>
0.17647059 0.17647059 0.05882353 0.11764706 0.47058824
$PA
x
Never Once Quit <NA>
0.11764706 0.17647059 0.05882353 0.64705882
由 reprex 包 (v0.3.0) 于 2021 年 1 月 18 日创建
使用tidyverse
# Attach the tidyverse packages (map_dfr() comes from purrr).
library(tidyverse)
# For each of columns 3-5, tabulate the values and convert the counts to
# proportions; map_dfr() row-binds the three results into the tibble printed
# below. table() is called without useNA, so missing values are not tabulated,
# and a category a column never takes (PA has no "Twice") shows up as NA.
map_dfr(mydata[3:5], ~prop.table(table(.x)))
#> # A tibble: 3 x 4
#> Never Once Quit Twice
#> <table> <table> <table> <table>
#> 1 0.3333333 0.3333333 0.1111111 0.2222222
#> 2 0.3333333 0.3333333 0.1111111 0.2222222
#> 3 0.3333333 0.5000000 0.1666667 NA
由 reprex 包 (v0.3.0) 于 2021 年 1 月 18 日创建
使用 dplyr:
library(tidyverse)

# Long-format summary: one row per (variable, category) pair with its count
# and its share within that variable. Missing values form their own category.
df <- mydata %>%
  # Keep only the three categorical columns.
  select(3:5) %>%
  # Stack them into name/value pairs; pivot_longer() replaces the superseded
  # gather().
  pivot_longer(everything(), names_to = "Your_vice", values_to = "freq") %>%
  group_by(Your_vice, freq) %>%
  # Count each pair. Only the `freq` grouping is dropped, so the result stays
  # grouped by `Your_vice`; stating it explicitly also avoids the regrouping
  # message.
  summarize(n = n(), .groups = "drop_last") %>%
  # sum(n) is therefore taken within each `Your_vice` group.
  mutate(perc = n / sum(n))
df
# A tibble: 14 x 4
# Groups: Your_vice [3]
Your_vice freq n perc
<chr> <chr> <int> <dbl>
1 Alcohol Never 3 0.176
2 Alcohol Once 3 0.176
3 Alcohol Quit 1 0.0588
4 Alcohol Twice 2 0.118
5 Alcohol NA 8 0.471
6 PA Never 2 0.118
7 PA Once 3 0.176
8 PA Quit 1 0.0588
9 PA NA 11 0.647
10 Tobacco Never 3 0.176
11 Tobacco Once 3 0.176
12 Tobacco Quit 1 0.0588
13 Tobacco Twice 2 0.118
14 Tobacco NA 8 0.471
您可以将变量转换为具有相应水平的因子,然后使用带选项 useNA="ifany" 的 table 来显示 NA。
# Recode columns 3-5 as factors over one fixed level set, so each table below
# reports all four categories even when a column never takes one of them.
mydata[3:5] <- lapply(
  mydata[3:5],
  function(column) {
    factor(column, levels = c("Never", "Once", "Quit", "Twice"))
  }
)

# Proportions among the non-missing values of each column.
res1 <- sapply(mydata[3:5], function(column) prop.table(table(column)))
res1
# Tobacco Alcohol PA
# Never 0.3333333 0.3333333 0.3333333
# Once 0.3333333 0.3333333 0.5000000
# Quit 0.1111111 0.1111111 0.1666667
# Twice 0.2222222 0.2222222 0.0000000

# Same proportions, with missing values counted as a category of their own.
res2 <- sapply(
  mydata[3:5],
  function(column) prop.table(table(column, useNA = "ifany"))
)
res2
# Tobacco Alcohol PA
# Never 0.17647059 0.17647059 0.11764706
# Once 0.17647059 0.17647059 0.17647059
# Quit 0.05882353 0.05882353 0.05882353
# Twice 0.11764706 0.11764706 0.00000000
# <NA> 0.47058824 0.47058824 0.64705882
这是一个 base R 方案,使用 prop.table + table + na.omit + factor:
# Build one proportion matrix (rows = categories, columns = variables).
# Each column is recoded against the non-missing values found anywhere in
# columns 3-5, so all per-column tables share the same rows and can be
# column-bound; missing values are dropped before tabulating.
do.call(
  cbind,
  lapply(
    mydata[3:5],
    function(column) {
      shared_levels <- unique(na.omit(unlist(mydata[3:5])))
      recoded <- factor(column, levels = shared_levels)
      prop.table(table(na.omit(recoded)))
    }
  )
)
这给出了
Tobacco Alcohol PA
Once 0.3333333 0.3333333 0.5000000
Twice 0.2222222 0.2222222 0.0000000
Never 0.3333333 0.3333333 0.3333333
Quit 0.1111111 0.1111111 0.1666667
我有以下数据
# Example data: 17 rows with four distinct pt_id values, one numeric column
# (Tob_pk) and three character columns that contain missing values.
pt_id <- rep(c(1, 2, 3, 4), times = c(5, 3, 5, 4))
Tob_pk <- c(2, 5, 7, 1, 8, 12, 14, 3, 6, 8, 10, 20, 13, 5, 4, 12, 10)
Tobacco <- c(
  "Once", "Twice", "Never", NA, NA, NA, NA, NA, "Once",
  "Twice", "Quit", "Once", NA, NA, "Never", NA, "Never"
)
Alcohol <- c(
  "Twice", "Once", NA, NA, "Never", NA, NA, "Once", NA,
  "Quit", "Twice", NA, "Once", NA, NA, "Never", "Never"
)
PA <- c(
  "Once", NA, "Never", NA, NA, NA, NA, NA, "Once",
  NA, "Quit", "Once", NA, NA, "Never", NA, NA
)
# Assemble the vectors into one data frame; column names match the vectors.
mydata <- data.frame(
  pt_id = pt_id, Tob_pk = Tob_pk,
  Tobacco = Tobacco, Alcohol = Alcohol, PA = PA
)
# Print the data frame.
mydata
我想要数据集中每个变量的摘要/比例,我尝试使用以下代码获取每个变量的摘要/比例
# Attempt from the question: apply prop.table() directly to columns 3-5.
# This call fails with the error quoted below, because prop.table() is handed
# the raw character column (e.g. `Tobacco`) instead of a table of counts.
data_summ <- mydata %>%
summarize_at(.vars=3:5, funs(prop.table(.)))
但是,我收到以下错误
Error: Problem with `summarise()` input `Tobacco`.
x invalid 'type' (character) of argument
ℹ Input `Tobacco` is `prop.table(Tobacco)`.
Run `rlang::last_error()` to see where the error occurred.
我不确定哪里出了问题。如果有人能建议如何得到下面的输出(并且还包含 NA 的百分比),那将非常有帮助。
Tobacco Alcohol PA
Never 0.3333333 Never 0.3333333 Never 0.3333333
Once 0.3333333 Once 0.3333333 Once 0.5000000
Quit 0.1111111 Quit 0.1111111 Quit 0.1666667
Twice 0.2222222 Twice 0.2222222
提前致谢!
使用base
# Example data: 17 rows, one numeric column (Tob_pk) and three character
# columns that contain missing values.
pt_id <- c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4)
Tob_pk <- c(2, 5, 7, 1, 8, 12, 14, 3, 6, 8, 10, 20, 13, 5, 4, 12, 10)
Tobacco <- c(
  "Once", "Twice", "Never", NA, NA, NA, NA, NA, "Once",
  "Twice", "Quit", "Once", NA, NA, "Never", NA, "Never"
)
Alcohol <- c(
  "Twice", "Once", NA, NA, "Never", NA, NA, "Once", NA,
  "Quit", "Twice", NA, "Once", NA, NA, "Never", "Never"
)
PA <- c(
  "Once", NA, "Never", NA, NA, NA, NA, NA, "Once",
  NA, "Quit", "Once", NA, NA, "Never", NA, NA
)
mydata <- data.frame(pt_id, Tob_pk, Tobacco, Alcohol, PA)

# Proportion of each category (NA included as its own category) for columns
# 3-5. lapply() iterates over the columns directly and always returns a named
# list; apply() would first coerce the data frame to a matrix and would return
# a matrix instead of a list whenever all tables happen to have equal length.
lapply(mydata[3:5], function(x) prop.table(table(x, useNA = "ifany")))
$Tobacco
x
Never Once Quit Twice <NA>
0.17647059 0.17647059 0.05882353 0.11764706 0.47058824
$Alcohol
x
Never Once Quit Twice <NA>
0.17647059 0.17647059 0.05882353 0.11764706 0.47058824
$PA
x
Never Once Quit <NA>
0.11764706 0.17647059 0.05882353 0.64705882
由 reprex 包 (v0.3.0) 于 2021 年 1 月 18 日创建
使用 tidyverse
# Attach the tidyverse packages (map_dfr() comes from purrr).
library(tidyverse)
# For each of columns 3-5, tabulate the values and convert the counts to
# proportions; map_dfr() row-binds the three results into the tibble printed
# below. table() is called without useNA, so missing values are not tabulated,
# and a category a column never takes (PA has no "Twice") shows up as NA.
map_dfr(mydata[3:5], ~prop.table(table(.x)))
#> # A tibble: 3 x 4
#> Never Once Quit Twice
#> <table> <table> <table> <table>
#> 1 0.3333333 0.3333333 0.1111111 0.2222222
#> 2 0.3333333 0.3333333 0.1111111 0.2222222
#> 3 0.3333333 0.5000000 0.1666667 NA
由 reprex 包 (v0.3.0) 于 2021 年 1 月 18 日创建
使用 dplyr:
library(tidyverse)

# Long-format summary: one row per (variable, category) pair with its count
# and its share within that variable. Missing values form their own category.
df <- mydata %>%
  # Keep only the three categorical columns.
  select(3:5) %>%
  # Stack them into name/value pairs; pivot_longer() replaces the superseded
  # gather().
  pivot_longer(everything(), names_to = "Your_vice", values_to = "freq") %>%
  group_by(Your_vice, freq) %>%
  # Count each pair. Only the `freq` grouping is dropped, so the result stays
  # grouped by `Your_vice`; stating it explicitly also avoids the regrouping
  # message.
  summarize(n = n(), .groups = "drop_last") %>%
  # sum(n) is therefore taken within each `Your_vice` group.
  mutate(perc = n / sum(n))
df
# A tibble: 14 x 4
# Groups: Your_vice [3]
Your_vice freq n perc
<chr> <chr> <int> <dbl>
1 Alcohol Never 3 0.176
2 Alcohol Once 3 0.176
3 Alcohol Quit 1 0.0588
4 Alcohol Twice 2 0.118
5 Alcohol NA 8 0.471
6 PA Never 2 0.118
7 PA Once 3 0.176
8 PA Quit 1 0.0588
9 PA NA 11 0.647
10 Tobacco Never 3 0.176
11 Tobacco Once 3 0.176
12 Tobacco Quit 1 0.0588
13 Tobacco Twice 2 0.118
14 Tobacco NA 8 0.471
您可以将变量转换为具有相应水平的因子,然后使用带选项 useNA="ifany" 的 table 来显示 NA。
# Recode columns 3-5 as factors over one fixed level set, so each table below
# reports all four categories even when a column never takes one of them.
mydata[3:5] <- lapply(
  mydata[3:5],
  function(column) {
    factor(column, levels = c("Never", "Once", "Quit", "Twice"))
  }
)

# Proportions among the non-missing values of each column.
res1 <- sapply(mydata[3:5], function(column) prop.table(table(column)))
res1
# Tobacco Alcohol PA
# Never 0.3333333 0.3333333 0.3333333
# Once 0.3333333 0.3333333 0.5000000
# Quit 0.1111111 0.1111111 0.1666667
# Twice 0.2222222 0.2222222 0.0000000

# Same proportions, with missing values counted as a category of their own.
res2 <- sapply(
  mydata[3:5],
  function(column) prop.table(table(column, useNA = "ifany"))
)
res2
# Tobacco Alcohol PA
# Never 0.17647059 0.17647059 0.11764706
# Once 0.17647059 0.17647059 0.17647059
# Quit 0.05882353 0.05882353 0.05882353
# Twice 0.11764706 0.11764706 0.00000000
# <NA> 0.47058824 0.47058824 0.64705882
这是一个 base R 方案,使用 prop.table + table + na.omit + factor:
# Build one proportion matrix (rows = categories, columns = variables).
# Each column is recoded against the non-missing values found anywhere in
# columns 3-5, so all per-column tables share the same rows and can be
# column-bound; missing values are dropped before tabulating.
do.call(
  cbind,
  lapply(
    mydata[3:5],
    function(column) {
      shared_levels <- unique(na.omit(unlist(mydata[3:5])))
      recoded <- factor(column, levels = shared_levels)
      prop.table(table(na.omit(recoded)))
    }
  )
)
这给出了
Tobacco Alcohol PA
Once 0.3333333 0.3333333 0.5000000
Twice 0.2222222 0.2222222 0.0000000
Never 0.3333333 0.3333333 0.3333333
Quit 0.1111111 0.1111111 0.1666667