计算熵
to calculate the Entropy
我是R新手,无法计算熵。
在 Whosebug 上有一个类似的问题和答案,但我想知道为什么这段代码不起作用。这是来自同一问题的复制粘贴数据。
其中一个答案提到 "The part I think you are missing is the calculation of the class frequencies and you will get your answer",但我该如何解决这个问题。我尝试了大部分选项,但仍然没有得到任何输出。它运行没有任何错误。
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
数据集如下,
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
我们将年龄从分类更改为数字
干杯,杰克
需要计算"no"和"yes"在"buys"中的比例,"fair"和"excellent"在"credit"中的比例, 等等。这是一种方法:
data <- list(
buys = c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),
credit = c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"),
student = c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"),
income = c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"),
age = c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
)
freq <- lapply( data, function(x){rowMeans(outer(unique(x),x,"=="))})
.
> freq
$buys
[1] 0.3571429 0.6428571
$credit
[1] 0.5714286 0.4285714
$student
[1] 0.5 0.5
$income
[1] 0.2857143 0.4285714 0.2857143
$age
[1] 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
[14] 0.07142857
这样的比例永远不可能为0,所以把if(freq.class[[i]] != 0){ # zero check in class
改成if(length(freq.class[[i]]) != 0){ # zero check in class
:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(length(freq.class[[i]]) != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
.
> info(freq)
[1] 8.289526
> info(freq$buys)
[1] 0.940286
> info(freq$age)
[1] 3.807355
>
我是R新手,无法计算熵。 在 Whosebug 上有一个类似的问题和答案,但我想知道为什么这段代码不起作用。这是来自同一问题的复制粘贴数据。
其中一个答案提到 "The part I think you are missing is the calculation of the class frequencies and you will get your answer",但我该如何解决这个问题。我尝试了大部分选项,但仍然没有得到任何输出。它运行没有任何错误。
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
数据集如下,
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
我们将年龄从分类更改为数字
干杯,杰克
需要计算"no"和"yes"在"buys"中的比例,"fair"和"excellent"在"credit"中的比例, 等等。这是一种方法:
data <- list(
buys = c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),
credit = c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"),
student = c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"),
income = c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"),
age = c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
)
freq <- lapply( data, function(x){rowMeans(outer(unique(x),x,"=="))})
.
> freq
$buys
[1] 0.3571429 0.6428571
$credit
[1] 0.5714286 0.4285714
$student
[1] 0.5 0.5
$income
[1] 0.2857143 0.4285714 0.2857143
$age
[1] 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
[14] 0.07142857
这样的比例永远不可能为0,所以把if(freq.class[[i]] != 0){ # zero check in class
改成if(length(freq.class[[i]]) != 0){ # zero check in class
:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(length(freq.class[[i]]) != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
.
> info(freq)
[1] 8.289526
> info(freq$buys)
[1] 0.940286
> info(freq$age)
[1] 3.807355
>