使用 lapply 的数据集摘要

Summary of Dataset using lapply

这是一个新手问题,但是,我发现很难理解如何正确使用 lapply,尤其是当使用的 ID 不是数字时。

可能有更好的方法来得到我想要的摘要,但现在我想尝试使用 lapply。本质上,我有一个包含 17 列的大型 df,其中两列是 ID 和日期。并非每个 ID 在每一列中都有记录值。我想知道每一列中可用(非 NA)的总行数,以及在该列中有记录的唯一 ID 的数量。我附上了一个 dput 示例,以便把问题说得更清楚。例如,Var8 只有 6 行数据可用,因此它有 6 个唯一 ID;而 Var15 有 20 行和 12 个唯一 ID。但我想知道全部 15 个 Var 列的情况。我可以使用下面的代码手动执行此操作:

# Keep only the rows of df where Var8 was actually recorded.
Var8 <- df[!is.na(df$Var8), ]
# Number of rows available for Var8 (count on the subset, not on df).
nrow(Var8)
# Number of distinct IDs that have a Var8 value.
length(unique(Var8$ID))
# Drop the temporary subset.
rm(Var8)

但尝试自动化:

# Attempted automation: for each column name in COL.NAMES, subset df to the
# rows where that column is not NA, then count the rows and the distinct IDs.
# NOTE(review): as written this does not work; see the inline notes.
lapply(COL.NAMES, function(x){
       # `df$paste(x)` is parsed as `(df$paste)(x)`: it looks up an element
       # literally named "paste" and then tries to call it, which is the
       # source of the "attempt to apply non-function" error. A column whose
       # name is held in a string is selected with `df[[x]]`.
       temp=df[!(is.na(df$paste(x))),]
       rows=length(temp$ID)
       num_comp=length(unique(temp$ID)) 
       # The function exits at the first return(), so the two statements
       # after it are never reached; only `rows` would be returned.
       return(rows)
       return(num_comp)
       remove(temp)
})

却得到一个错误:attempt to apply non-function(尝试应用非函数)。

# Names of the 15 measurement columns: "Var1" ... "Var15".
COL.NAMES <- paste0("Var", 1:15)


structure(list(ID = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 
11L, 12L, 2L, 3L, 4L, 1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L), .Label = c("Comp1", 
"Comp10", "Comp11", "Comp12", "Comp2", "Comp3", "Comp4", "Comp5", 
"Comp6", "Comp7", "Comp8", "Comp9"), class = "factor"), Date = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("0/1/2014", "0/1/2015"), class = "factor"), 
    Var1 = c(0.57, 0.34, 0.38, 0.93, 0.54, 0.17, 0.08, 0.28, 
    0.99, 1, 0.61, 0.73, 0.15, 0.09, 0.64, 0.3, 0.12, 0.79, 0.79, 
    0.15), Var2 = c(0.7, 0.77, 0.93, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.46, 0.26), Var3 = c(0.65, 
    0.7, 0.83, 0.7, 0.43, 0.81, 0.21, 0.44, 0.25, 0.77, 0.24, 
    0.29, 0.87, 0.42, 1, NA, NA, NA, NA, 0.79), Var4 = c(1, 0.7, 
    0.69, NA, NA, NA, NA, 0.2, 0.61, 0.89, 0.45, 0.02, 0.97, 
    0.33, 0.34, 0.81, 0.99, 0.35, 0.48, 0.33), Var5 = c(0.47, 
    0.95, 0.38, 0.69, 0.84, 0.21, 0.62, 0.59, 0.45, 0.63, 0.18, 
    0.49, NA, NA, NA, NA, 0.17, 0.15, 0.6, 0.44), Var6 = c(NA, 
    NA, NA, NA, 0.24, 0.07, 0.75, 0.24, 0.82, 0.14, 0.86, 0.63, 
    0.82, 0.92, 0.55, 0.22, 0.87, 0.69, 0.64, 0.73), Var7 = c(0.2, 
    0.11, 0.82, 0.31, 0.97, NA, NA, NA, NA, 0.83, 0.84, 0.81, 
    0.72, 0.36, 0.09, 0.15, 0.46, 0.79, 0.75, 0.39), Var8 = c(0.28, 
    0.55, NA, NA, NA, NA, 0.56, 0.89, 0.92, 0.46, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), Var9 = c(0.11, 0.36, 1, 0.44, 
    0.53, 0.6, 0.24, 0.56, 0.6, 0.55, 0.55, 0.05, 0.77, 0.9, 
    NA, NA, NA, NA, 0.4, 0.33), Var10 = c(0.74, 0.13, 0.09, 0.61, 
    NA, NA, NA, NA, 0.27, 0.71, 0.56, 0.3, 0.36, 0.44, 0.78, 
    0.9, 0.46, 0.49, 0.87, 0.36), Var11 = c(0.58, 0.99, 0.07, 
    0.83, 0.45, 0.07, 0.16, 0.43, 0.34, 0.31, 0.06, 0.67, 0.02, 
    0.52, 0.19, 0.49, 0.31, 0.02, 0.62, 0.21), Var12 = c(0.93, 
    0.26, 0.77, 0.8, 0.67, 0.83, 0.12, 0.39, 0.78, 0.75, 0.44, 
    NA, NA, NA, NA, 0.42, 0.49, 0.06, 0.8, 0.54), Var13 = c(0.44, 
    0.75, NA, NA, NA, NA, 0.58, 0.3, 0.47, 0.88, 0.36, 0.21, 
    0.87, 0.33, 0.12, 0.31, 0.95, 0.59, 0.18, 0.43), Var14 = c(0.55, 
    0.03, 0.37, 0.66, NA, 0.91, 0.78, 0.84, 0.96, 0.34, 0.25, 
    0.92, 0.71, 0.41, 0.23, 0.54, 0.8, 0.87, 0.3, 0.37), Var15 = c(0.71, 
    0.66, 0.01, 0.7, 0.4, 0.04, 0.3, 1, 0.59, 0.69, 0.88, 0.28, 
    0.44, 0.51, 0.2, 0.17, 0.6, 0.11, 0.85, 0.04)), .Names = c("ID", 
"Date", "Var1", "Var2", "Var3", "Var4", "Var5", "Var6", "Var7", 
"Var8", "Var9", "Var10", "Var11", "Var12", "Var13", "Var14", 
"Var15"), class = "data.frame", row.names = c(NA, -20L))

我不确定我是否理解正确,但这可能是您的解决方案。 x 是你的数据框

# Summarise one column: count its non-missing entries and its distinct
# non-missing values.
#
# df: a single column (atomic vector or factor), e.g. as supplied by
#     lapply() over a data frame.
# Returns a list with elements "x" (number of non-NA entries) and
# "y" (number of distinct non-NA values).
try1 <- function(df) {
  present <- !is.na(df)
  # Drop NA before calling unique(); otherwise NA is counted as a value.
  list("x" = sum(present), "y" = length(unique(df[present])))
}

> lapply(x,try1)

这是一个data.table解决方案

library(data.table)
# Convert the example data frame x into a data.table.
dd <- as.data.table(x)

# Names of the measurement columns to summarise: "Var1" ... "Var15".
COL.NAMES <- paste0("Var", 1:15)

# Apply try1 to each of the selected columns via .SD.
dd[, lapply(.SD, try1), .SDcols = COL.NAMES]

我建议您熟悉使用 dplyr 进行数据整理。它所提供的 magrittr 管道 %>% 将帮助您理解 apply 系列函数的用法。

以下是我会如何修改您的函数:

library(dplyr)
# For every variable, keep the ID column plus that variable and drop the
# rows containing a missing value; the result is one data frame per variable.
tmp <- lapply(COL.NAMES, function(x) {
  df[, c("ID", x)] %>% na.omit()
})
# Number of complete rows per variable.
rows <- sapply(tmp, nrow)
# Number of distinct IDs among the complete rows of each variable.
num_comp <- sapply(tmp, function(d) length(unique(d[["ID"]])))

我没有使用 lapply,但下面这个解决方案确实有效:

# Per-column availability summary of a data frame.
#
# df: a data frame. If it has a column named "ID", distinct IDs are counted
#     among the rows where each column is non-missing; otherwise the distinct
#     non-missing values of the column itself are counted.
# Returns a data frame with one row per column of df (row names are the
# column names), where
#   V1 = number of non-NA rows in that column,
#   V2 = number of distinct IDs (or values, see above) on those rows.
find.uniques <- function(df) {
  has_id <- "ID" %in% names(df)
  # One logical mask per column marking the rows that hold a value.
  present <- lapply(df, function(col) !is.na(col))
  n_rows <- vapply(present, sum, integer(1), USE.NAMES = FALSE)
  # seq_along() is safe for a zero-column data frame, unlike 1:ncol(df).
  n_ids <- vapply(seq_along(df), function(i) {
    keys <- if (has_id) df[["ID"]] else df[[i]]
    length(unique(keys[present[[i]]]))
  }, integer(1))
  # Build the whole result at once instead of re-creating it in a loop.
  data.frame(V1 = n_rows, V2 = n_ids, row.names = names(df))
}

结果是一个 data.frame,其中 V1 表示有多少行可用,V2 表示每列有多少个 ID。 您还可以 return(as.data.frame(t(uniques))) 将行更改为列以查看每列可用的内容。

另一种方法是,

# Measurement columns only (drop ID and Date, the first two columns).
vars <- df[, -(1:2)]
# Per variable: number of non-NA rows, and number of distinct IDs that have
# a non-NA value for that variable.
df1 <- data.frame(
  n_rows = colSums(!is.na(vars), na.rm = TRUE),
  unique_IDs = sapply(vars, function(i) length(unique(df$ID[!is.na(i)])))
)
head(df1)
#     n_rows unique_IDs
#Var1     20         12
#Var2      5          5
#Var3     16         12
#Var4     16         12
#Var5     16         12
#Var6     16         12