在 R 中按列汇总数据框
summarizing data frame by columns in R
我有这个数据框 df:
# Example input data: App and Server are factors, Process1-Process5 are the
# columns to be summed. Process3 and Process4 contain only NA (logical),
# Process5 is partly NA.
df <- data.frame(
  App = factor(
    c("Web", "Web", "Mid", "Mid", "End", "End", "DB"),
    levels = c("DB", "End", "Mid", "Web")
  ),
  Server = factor(
    c("Server101", "Server102", "GServer101", "Hserver103",
      "JServer100", "Kserver200", "Xdb101"),
    levels = c("GServer101", "Hserver103", "JServer100", "Kserver200",
               "Server101", "Server102", "Xdb101")
  ),
  Process1 = c(1L, 5L, 1L, 1L, 1L, 1L, 1L),
  Process2 = c(1L, 1L, 1L, 4L, 1L, 1L, 1L),
  Process3 = rep(NA, 7L),
  Process4 = rep(NA, 7L),
  Process5 = c(NA, NA, NA, 1L, 1L, 1L, 1L)
)
我希望按 App 对 df 数据框进行汇总,对每个 Process 列分别求和,得到如下所示的结果。也就是说,我需要知道每个应用程序在各个 Process 列下分别有多少个进程。在 R 中应该如何实现?
# Expected result: one row per App with integer totals for Process1-Process5.
end <- data.frame(
  App = factor(
    c("Web", "Mid", "End", "DB"),
    levels = c("DB", "End", "Mid", "Web")
  ),
  Process1 = c(6L, 2L, 2L, 1L),
  Process2 = c(2L, 5L, 2L, 1L),
  Process3 = integer(4L),
  Process4 = integer(4L),
  Process5 = c(0L, 1L, 2L, 1L)
)
您可以使用 dplyr:
library(dplyr)
# Sum every column whose name starts with "Process" within each App,
# dropping NA values before summing.
# across() is used instead of summarize_at() + funs(): funs() has been
# deprecated since dplyr 0.8.0 and the *_at() variants are superseded.
df %>%
  group_by(App) %>%
  summarize(across(starts_with("Process"), ~ sum(.x, na.rm = TRUE)))
# A tibble: 4 × 6
# App Process1 Process2 Process3 Process4 Process5
# <fctr> <int> <int> <int> <int> <int>
#1 DB 1 1 0 0 1
#2 End 2 2 0 0 2
#3 Mid 2 5 0 0 1
#4 Web 6 2 0 0 0
或者,如果更倾向于按列位置选择,可以把位置传给 .cols 参数(在较新版本的 dplyr 中,该参数已更名为 .vars):
# Select the columns by position (3 to 7) instead of by name.
# The positions are turned into column names on the ungrouped data first, so
# the selection does not depend on how dplyr counts columns once App has
# become a grouping variable. This also avoids the deprecated `.cols`
# argument and the deprecated funs() helper.
process_cols <- names(df)[3:7]
df %>%
  group_by(App) %>%
  summarize(across(all_of(process_cols), ~ sum(.x, na.rm = TRUE)))
# A tibble: 4 × 6
# App Process1 Process2 Process3 Process4 Process5
# <fctr> <int> <int> <int> <int> <int>
#1 DB 1 1 0 0 1
#2 End 2 2 0 0 2
#3 Mid 2 5 0 0 1
#4 Web 6 2 0 0 0
这是一个使用 data.table 的方法:
library(data.table)
# Turn df into a data.table so the [i, j, by] syntax below is available.
setDT(df)
# Per-App totals of Process1 through Process5, with NA values dropped.
df[, lapply(.SD, sum, na.rm = TRUE),
   by = App,
   .SDcols = Process1:Process5]
App Process1 Process2 Process3 Process4 Process5
1: Web 6 2 0 0 0
2: Mid 2 5 0 0 1
3: End 2 2 0 0 2
4: DB 1 1 0 0 1
或使用列位置代替列名
# Same per-App totals, with the summed columns picked by position (3 to 7).
df[, lapply(.SD, sum, na.rm = TRUE), by = App, .SDcols = 3:7]
App Process1 Process2 Process3 Process4 Process5
1: Web 6 2 0 0 0
2: Mid 2 5 0 0 1
3: End 2 2 0 0 2
4: DB 1 1 0 0 1
如果您是第一次接触这种写法,这里简单拆解一下:lapply(.SD, sum, na.rm=TRUE) 表示对所选的每一列调用 sum 并设置 na.rm=TRUE;.SDcols=3:7 或 .SDcols=Process1:Process5 把该操作限定在所需的列上;by=App 则按 App 对操作进行分组。
我有这个数据框 df:
# Example input data: App and Server are factors, Process1-Process5 are the
# columns to be summed. Process3 and Process4 contain only NA (logical),
# Process5 is partly NA.
df <- data.frame(
  App = factor(
    c("Web", "Web", "Mid", "Mid", "End", "End", "DB"),
    levels = c("DB", "End", "Mid", "Web")
  ),
  Server = factor(
    c("Server101", "Server102", "GServer101", "Hserver103",
      "JServer100", "Kserver200", "Xdb101"),
    levels = c("GServer101", "Hserver103", "JServer100", "Kserver200",
               "Server101", "Server102", "Xdb101")
  ),
  Process1 = c(1L, 5L, 1L, 1L, 1L, 1L, 1L),
  Process2 = c(1L, 1L, 1L, 4L, 1L, 1L, 1L),
  Process3 = rep(NA, 7L),
  Process4 = rep(NA, 7L),
  Process5 = c(NA, NA, NA, 1L, 1L, 1L, 1L)
)
我希望按 App 对 df 数据框进行汇总,对每个 Process 列分别求和,得到如下所示的结果。也就是说,我需要知道每个应用程序在各个 Process 列下分别有多少个进程。在 R 中应该如何实现?
# Expected result: one row per App with integer totals for Process1-Process5.
end <- data.frame(
  App = factor(
    c("Web", "Mid", "End", "DB"),
    levels = c("DB", "End", "Mid", "Web")
  ),
  Process1 = c(6L, 2L, 2L, 1L),
  Process2 = c(2L, 5L, 2L, 1L),
  Process3 = integer(4L),
  Process4 = integer(4L),
  Process5 = c(0L, 1L, 2L, 1L)
)
您可以使用 dplyr:
library(dplyr)
# Sum every column whose name starts with "Process" within each App,
# dropping NA values before summing.
# across() is used instead of summarize_at() + funs(): funs() has been
# deprecated since dplyr 0.8.0 and the *_at() variants are superseded.
df %>%
  group_by(App) %>%
  summarize(across(starts_with("Process"), ~ sum(.x, na.rm = TRUE)))
# A tibble: 4 × 6
# App Process1 Process2 Process3 Process4 Process5
# <fctr> <int> <int> <int> <int> <int>
#1 DB 1 1 0 0 1
#2 End 2 2 0 0 2
#3 Mid 2 5 0 0 1
#4 Web 6 2 0 0 0
或者,如果更倾向于按列位置选择,可以把位置传给 .cols 参数(在较新版本的 dplyr 中,该参数已更名为 .vars):
# Select the columns by position (3 to 7) instead of by name.
# The positions are turned into column names on the ungrouped data first, so
# the selection does not depend on how dplyr counts columns once App has
# become a grouping variable. This also avoids the deprecated `.cols`
# argument and the deprecated funs() helper.
process_cols <- names(df)[3:7]
df %>%
  group_by(App) %>%
  summarize(across(all_of(process_cols), ~ sum(.x, na.rm = TRUE)))
# A tibble: 4 × 6
# App Process1 Process2 Process3 Process4 Process5
# <fctr> <int> <int> <int> <int> <int>
#1 DB 1 1 0 0 1
#2 End 2 2 0 0 2
#3 Mid 2 5 0 0 1
#4 Web 6 2 0 0 0
这是一个使用 data.table 的方法:
library(data.table)
# Turn df into a data.table so the [i, j, by] syntax below is available.
setDT(df)
# Per-App totals of Process1 through Process5, with NA values dropped.
df[, lapply(.SD, sum, na.rm = TRUE),
   by = App,
   .SDcols = Process1:Process5]
App Process1 Process2 Process3 Process4 Process5
1: Web 6 2 0 0 0
2: Mid 2 5 0 0 1
3: End 2 2 0 0 2
4: DB 1 1 0 0 1
或使用列位置代替列名
# Same per-App totals, with the summed columns picked by position (3 to 7).
df[, lapply(.SD, sum, na.rm = TRUE), by = App, .SDcols = 3:7]
App Process1 Process2 Process3 Process4 Process5
1: Web 6 2 0 0 0
2: Mid 2 5 0 0 1
3: End 2 2 0 0 2
4: DB 1 1 0 0 1
如果您是第一次接触这种写法,这里简单拆解一下:lapply(.SD, sum, na.rm=TRUE) 表示对所选的每一列调用 sum 并设置 na.rm=TRUE;.SDcols=3:7 或 .SDcols=Process1:Process5 把该操作限定在所需的列上;by=App 则按 App 对操作进行分组。