在 R 中使用 class 的最小值来估算缺失值
impute missing values using minimum of a class in R
我是 R 的新手,需要帮助填补(插补)当前所处理数据集中某一列的缺失值。下图显示了我想要插补的缺失值以及其他几列。
我想用该客户以往记录中数量的最小值来填补缺失值,因为我认为这最适合我的情况和数据。例如,在图像中,我应该能够用 1(即 1、5、2 中的最小值)填充缺失值。
在我的搜索过程中,我主要遇到了对给定 class 使用平均值的方法,而不是最小值或最大值。
非常感谢任何帮助或指点。
编辑:这是 dput 的输出。
structure(list(YEAR = c(2011L, 2012L, 2014L, 2015L, 2011L, 2012L
), CustomerId = c("00000063", "00000063", "00000063", "00000063",
"00000065", "00000065"), MemberType = structure(c(2L, 2L, 2L,
2L, 2L, 2L), .Label = c("GROUP", "INDIVIDUAL", "PARTNER"), class = "factor"),
MembershipTypeCode = structure(c(6L, 6L, 6L, 10L, 6L, 6L), .Label = c("EGROUP",
"EINDIV", "EINDIV2", "EPARTNER", "GROUP", "INDIV", "INDIV2",
"INDIV3", "PARTNER", "PLUS", "PLUS2", "PLUS20", "PLUS3",
"PLUSENTERPRI", "PLUSGROUP", "PLUSGROUP2", "PROF_ENTERPR",
"PROF_GROUP", "PROF_GROUP2", "PROF_INDIV", "PROF_INDIV2",
"PROF_INDIV3"), class = "factor"), MembershipPeriodBegin = structure(c(15279,
15677, 16071, 16436, 15006, 15371), class = "Date"), MembershipPeriodEnd = structure(c(15644,
16070, 16435, 16800, 15370, 15736), class = "Date"), ConsecutiveYearsAsMember = c(14L,
15L, 17L, 18L, 8L, 9L), AllocationUsage = c(0, 0, 0, 0, 0,
0), SetCOPPreference = structure(c(2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Y", "N"), class = "factor"), Purchase.Qty = c(2L,
5L, 1L, NA, 7L, 27L), Webcast.Registration = c(0L, 0L, 0L,
0L, 0L, 1L), Web.Visits = c(0L, 0L, 42L, 0L, 0L, 0L), Web.Page.Views = c(0L,
0L, 98L, 0L, 0L, 0L), Blog.Visits = c(0L, 0L, 3L, 0L, 0L,
0L), Blog.Page.Views = c(0L, 0L, 4L, 0L, 0L, 0L), Forum.Visits = c(0L,
0L, 45L, 0L, 0L, 0L), Forum.Page.Views = c(0L, 0L, 102L,
0L, 0L, 0L), ParatureTickets = c(0L, 0L, 0L, 0L, 0L, 0L),
ParatureChats = c(0L, 0L, 0L, 0L, 0L, 0L), Registered.for.Edu = c(0L,
0L, 0L, 0L, 0L, 0L), Attended.ICE = structure(c(2L, 2L, 2L,
2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"), Attended.TK = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"),
Frugal = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Y",
"N"), class = "factor"), Chapter.Board = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"),
Retained = structure(c(5L, 5L, 5L, 1L, 5L, 5L), .Label = c("Active",
"Awaiting Renewal", "Future Dated", "Lost", "Retained"), class = "factor"),
ProfileCompletion = c(60, 60, 60, 60, 60, 60), NumberofLogins = c(1L,
1L, 15L, 0L, 0L, 4L), Downloads = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), ForumMember = structure(c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), .Label = "N", class = "factor"), FreeUpgrade = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Y", "N"), class = "factor")), .Names = c("YEAR",
"CustomerId", "MemberType", "MembershipTypeCode", "MembershipPeriodBegin",
"MembershipPeriodEnd", "ConsecutiveYearsAsMember", "AllocationUsage",
"SetCOPPreference", "Purchase.Qty", "Webcast.Registration", "Web.Visits",
"Web.Page.Views", "Blog.Visits", "Blog.Page.Views", "Forum.Visits",
"Forum.Page.Views", "ParatureTickets", "ParatureChats", "Registered.for.Edu",
"Attended.ICE", "Attended.TK", "Frugal", "Chapter.Board", "Retained",
"ProfileCompletion", "NumberofLogins", "Downloads", "ForumMember",
"FreeUpgrade"), row.names = c(NA, 6L), class = "data.frame")
谢谢,
普拉蒂克
由于您未提供任何数据,下面用一个玩具示例来演示我会如何在 base R 中实现:
# simple sample data
data <- data.frame(a = rep(10:12, each = 4), b = 12:1)
data[c(3, 5, 12), "b"] <- NA

# Replace each NA in `values` with the minimum of the non-missing entries
# that share its `groups` label; returns a vector the same length as `values`.
# A group with no observed value has no minimum to borrow, so its entries
# stay NA (a bare min(..., na.rm = TRUE) would warn and write Inf there).
impute_group_min <- function(values, groups) {
  ave(values, groups, FUN = function(x) {
    missing <- is.na(x)
    if (any(missing) && !all(missing)) {
      x[missing] <- min(x[!missing])
    }
    x
  })
}

# fill the NAs in column b with the minimum b of the matching a group
data$b <- impute_group_min(data$b, data$a)
data
a b
1 10 12
2 10 11
3 10 9
4 10 9
5 11 5
6 11 7
7 11 6
8 11 5
9 12 4
10 12 3
11 12 2
12 12 2
我们可以使用 `na.aggregate` 并设置 `FUN = min`。我们将 'data.frame' 转换为 'data.table'(`setDT(df1)`),按 'CustomerID' 分组,对 'PurchaseQty' 应用 `na.aggregate`,并将输出赋值(`:=`)回 'PurchaseQty'。
library(data.table)
library(zoo)
# Convert df1 to a data.table by reference, then, within each CustomerID
# group, fill the NA entries of PurchaseQty with that group's minimum.
setDT(df1)
df1[, PurchaseQty := na.aggregate(PurchaseQty, FUN = min), by = CustomerID]
数据
# Example input: two customers with four purchases each, one NA per customer.
df1 <- data.frame(
  CustomerID = rep(c(1L, 2L), each = 4),
  PurchaseQty = c(4, 3, NA, 3, 1, 9, NA, 4)
)
我是 R 的新手,需要帮助填补(插补)当前所处理数据集中某一列的缺失值。下图显示了我想要插补的缺失值以及其他几列。
我想用该客户以往记录中数量的最小值来填补缺失值,因为我认为这最适合我的情况和数据。例如,在图像中,我应该能够用 1(即 1、5、2 中的最小值)填充缺失值。
在我的搜索过程中,我主要遇到了对给定 class 使用平均值的方法,而不是最小值或最大值。
非常感谢任何帮助或指点。
编辑:这是 dput 的输出。
structure(list(YEAR = c(2011L, 2012L, 2014L, 2015L, 2011L, 2012L
), CustomerId = c("00000063", "00000063", "00000063", "00000063",
"00000065", "00000065"), MemberType = structure(c(2L, 2L, 2L,
2L, 2L, 2L), .Label = c("GROUP", "INDIVIDUAL", "PARTNER"), class = "factor"),
MembershipTypeCode = structure(c(6L, 6L, 6L, 10L, 6L, 6L), .Label = c("EGROUP",
"EINDIV", "EINDIV2", "EPARTNER", "GROUP", "INDIV", "INDIV2",
"INDIV3", "PARTNER", "PLUS", "PLUS2", "PLUS20", "PLUS3",
"PLUSENTERPRI", "PLUSGROUP", "PLUSGROUP2", "PROF_ENTERPR",
"PROF_GROUP", "PROF_GROUP2", "PROF_INDIV", "PROF_INDIV2",
"PROF_INDIV3"), class = "factor"), MembershipPeriodBegin = structure(c(15279,
15677, 16071, 16436, 15006, 15371), class = "Date"), MembershipPeriodEnd = structure(c(15644,
16070, 16435, 16800, 15370, 15736), class = "Date"), ConsecutiveYearsAsMember = c(14L,
15L, 17L, 18L, 8L, 9L), AllocationUsage = c(0, 0, 0, 0, 0,
0), SetCOPPreference = structure(c(2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Y", "N"), class = "factor"), Purchase.Qty = c(2L,
5L, 1L, NA, 7L, 27L), Webcast.Registration = c(0L, 0L, 0L,
0L, 0L, 1L), Web.Visits = c(0L, 0L, 42L, 0L, 0L, 0L), Web.Page.Views = c(0L,
0L, 98L, 0L, 0L, 0L), Blog.Visits = c(0L, 0L, 3L, 0L, 0L,
0L), Blog.Page.Views = c(0L, 0L, 4L, 0L, 0L, 0L), Forum.Visits = c(0L,
0L, 45L, 0L, 0L, 0L), Forum.Page.Views = c(0L, 0L, 102L,
0L, 0L, 0L), ParatureTickets = c(0L, 0L, 0L, 0L, 0L, 0L),
ParatureChats = c(0L, 0L, 0L, 0L, 0L, 0L), Registered.for.Edu = c(0L,
0L, 0L, 0L, 0L, 0L), Attended.ICE = structure(c(2L, 2L, 2L,
2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"), Attended.TK = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"),
Frugal = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Y",
"N"), class = "factor"), Chapter.Board = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"),
Retained = structure(c(5L, 5L, 5L, 1L, 5L, 5L), .Label = c("Active",
"Awaiting Renewal", "Future Dated", "Lost", "Retained"), class = "factor"),
ProfileCompletion = c(60, 60, 60, 60, 60, 60), NumberofLogins = c(1L,
1L, 15L, 0L, 0L, 4L), Downloads = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), ForumMember = structure(c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), .Label = "N", class = "factor"), FreeUpgrade = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Y", "N"), class = "factor")), .Names = c("YEAR",
"CustomerId", "MemberType", "MembershipTypeCode", "MembershipPeriodBegin",
"MembershipPeriodEnd", "ConsecutiveYearsAsMember", "AllocationUsage",
"SetCOPPreference", "Purchase.Qty", "Webcast.Registration", "Web.Visits",
"Web.Page.Views", "Blog.Visits", "Blog.Page.Views", "Forum.Visits",
"Forum.Page.Views", "ParatureTickets", "ParatureChats", "Registered.for.Edu",
"Attended.ICE", "Attended.TK", "Frugal", "Chapter.Board", "Retained",
"ProfileCompletion", "NumberofLogins", "Downloads", "ForumMember",
"FreeUpgrade"), row.names = c(NA, 6L), class = "data.frame")
谢谢,
普拉蒂克
由于您未提供任何数据,下面用一个玩具示例来演示我会如何在 base R 中实现:
# simple sample data
data <- data.frame(a = rep(10:12, each = 4), b = 12:1)
data[c(3, 5, 12), "b"] <- NA

# Replace each NA in `values` with the minimum of the non-missing entries
# that share its `groups` label; returns a vector the same length as `values`.
# A group with no observed value has no minimum to borrow, so its entries
# stay NA (a bare min(..., na.rm = TRUE) would warn and write Inf there).
impute_group_min <- function(values, groups) {
  ave(values, groups, FUN = function(x) {
    missing <- is.na(x)
    if (any(missing) && !all(missing)) {
      x[missing] <- min(x[!missing])
    }
    x
  })
}

# fill the NAs in column b with the minimum b of the matching a group
data$b <- impute_group_min(data$b, data$a)
data
a b
1 10 12
2 10 11
3 10 9
4 10 9
5 11 5
6 11 7
7 11 6
8 11 5
9 12 4
10 12 3
11 12 2
12 12 2
我们可以使用 `na.aggregate` 并设置 `FUN = min`。我们将 'data.frame' 转换为 'data.table'(`setDT(df1)`),按 'CustomerID' 分组,对 'PurchaseQty' 应用 `na.aggregate`,并将输出赋值(`:=`)回 'PurchaseQty'。
library(data.table)
library(zoo)
# Convert df1 to a data.table by reference, then, within each CustomerID
# group, fill the NA entries of PurchaseQty with that group's minimum.
setDT(df1)
df1[, PurchaseQty := na.aggregate(PurchaseQty, FUN = min), by = CustomerID]
数据
# Example input: two customers with four purchases each, one NA per customer.
df1 <- data.frame(
  CustomerID = rep(c(1L, 2L), each = 4),
  PurchaseQty = c(4, 3, NA, 3, 1, 9, NA, 4)
)