链式 CostSensitiveClassifier
Chained CostSensitiveClassifier
我错误地把一个 CostSensitiveClassifier 嵌套在另一个 CostSensitiveClassifier 内部使用,两者各有不同的权重矩阵。其中一个是
(matrix 1a)
0 3
1 0
另一个是
(matrix 1b)
0 1
3 0
并且取得了很好的效果。当我发现错误时,我将我的权重矩阵更改为:
(matrix 2)
0 1
1 0
但无法获得与以前相同的结果。我也试过了
(matrix 3)
0 3
3 0
我原以为,把一个 CostSensitiveClassifier 套在另一个 CostSensitiveClassifier 之上时,使用矩阵 1a 和 1b 得到的结果会与使用矩阵 2、甚至矩阵 3 得到的结果相同,但结果却大不相同。
除了使用指定的权重之外,CostSensitiveClassifier 是否还会以其他方式改变成本值?
谢谢
--
我写了一个单元测试来说明这一点。它应该适用于任何以最后一个属性作为类别(class)属性、并且只有两个类别的数据集。
import static org.junit.Assert.*;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.apache.log4j.lf5.util.Resource;
import org.junit.Test;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.meta.CostSensitiveClassifier;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;
/**
 * Illustrates that nesting one {@link CostSensitiveClassifier} inside another does not give
 * the same evaluation figures as a single cost-sensitive wrapper around {@link J48}.
 *
 * <p>All models are trained and evaluated on the same data set, whose last attribute is used
 * as the class attribute.
 *
 * <p>NOTE(review): {@code CostMatrix} is used below but does not appear in the visible import
 * list; confirm that {@code weka.classifiers.CostMatrix} is imported.
 */
public class WekaFacadeTest {

  /** Tolerance used when comparing error rates and average costs. */
  private static final double DELTA = 0.0001;

  private static final Logger logger = Logger.getLogger(WekaFacadeTest.class);

  /**
   * Builds a 2x2 cost matrix with a zero diagonal.
   *
   * @param weightFalsePositive value stored in cell (1, 0)
   * @param weightFalseNegative value stored in cell (0, 1)
   * @return the populated cost matrix
   */
  private static CostMatrix createCostMatrix(
      double weightFalsePositive, double weightFalseNegative) {
    CostMatrix costMatrix = new CostMatrix(2);
    costMatrix.setCell(0, 0, 0.0);
    costMatrix.setCell(1, 0, weightFalsePositive);
    costMatrix.setCell(0, 1, weightFalseNegative);
    costMatrix.setCell(1, 1, 0.0);
    return costMatrix;
  }

  /** Creates an untrained cost-sensitive wrapper around a fresh J48 tree. */
  private static CostSensitiveClassifier costSensitiveJ48(CostMatrix costMatrix) {
    CostSensitiveClassifier classifier = new CostSensitiveClassifier();
    classifier.setClassifier(new J48());
    classifier.setCostMatrix(costMatrix);
    return classifier;
  }

  /** Creates an untrained cost-sensitive wrapper around another cost-sensitive J48 wrapper. */
  private static CostSensitiveClassifier chainedCostSensitiveJ48(
      CostMatrix outerMatrix, CostMatrix innerMatrix) {
    CostSensitiveClassifier outer = new CostSensitiveClassifier();
    outer.setClassifier(costSensitiveJ48(innerMatrix));
    outer.setCostMatrix(outerMatrix);
    return outer;
  }

  /** Trains the classifier on {@code data} and evaluates it on the same data with its own costs. */
  private static Evaluation trainAndEvaluate(CostSensitiveClassifier classifier, Instances data)
      throws Exception {
    classifier.buildClassifier(data);
    Evaluation evaluation = new Evaluation(data, classifier.getCostMatrix());
    evaluation.evaluateModel(classifier, data);
    return evaluation;
  }

  /**
   * Compares plain J48, single cost-sensitive J48 and chained cost-sensitive J48.
   *
   * <p>Only the first comparison is reported by the author to pass; the remaining assertions
   * document the unexpected differences and are expected to fail.
   *
   * @throws Exception if loading the data, training or evaluation fails
   */
  @Test
  public void testDoubleCost() throws Exception {
    Instances data = WekaFacade.loadArff("test.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // c1 => cost sensitive classifier applied to j48, cost = 1.0
    CostSensitiveClassifier c1 = costSensitiveJ48(createCostMatrix(1.0, 1.0));
    Evaluation ec1 = trainAndEvaluate(c1, data);

    // c2 => no cost sensitive classifier, straight j48
    J48 c2 = new J48();
    c2.buildClassifier(data);
    Evaluation ec2 = new Evaluation(data);
    ec2.evaluateModel(c2, data);

    // With unit costs, c1 should have the same error rate as plain J48 (reported: success).
    logger.info(String.format("ErrorRate c1=%f, c2=%f", ec1.errorRate(), ec2.errorRate()));
    assertEquals(ec2.errorRate(), ec1.errorRate(), DELTA);

    // c3 => cost sensitive classifier applied to cost sensitive classifier applied to j48,
    // cost = 1.0 at both levels
    CostSensitiveClassifier c3 =
        chainedCostSensitiveJ48(createCostMatrix(1.0, 1.0), createCostMatrix(1.0, 1.0));
    // Evaluate c3 against its own cost matrix rather than borrowing c1's.
    Evaluation ec3 = trainAndEvaluate(c3, data);

    // Reported to fail.
    logger.info(String.format("Cost c3=%f, c1=%f", ec3.avgCost(), ec1.avgCost()));
    assertEquals(ec1.avgCost(), ec3.avgCost(), DELTA);

    // Reported to fail.
    logger.info(String.format("ErrorRate c3=%f, c2=%f", ec3.errorRate(), ec2.errorRate()));
    assertEquals(ec2.errorRate(), ec3.errorRate(), DELTA);

    // d => cost sensitive classifier applied to j48, normal situation
    CostSensitiveClassifier d = costSensitiveJ48(createCostMatrix(3.0, 1.0));
    Evaluation ed = trainAndEvaluate(d, data);

    // c => cost sensitive classifier applied to another cost sensitive classifier,
    // abnormal situation: unit costs outside, 3:1 costs inside
    CostSensitiveClassifier c =
        chainedCostSensitiveJ48(createCostMatrix(1.0, 1.0), createCostMatrix(3.0, 1.0));
    Evaluation ec = trainAndEvaluate(c, data);

    // Should the average cost of c match that of d? Reported to fail.
    logger.info(String.format("Cost c=%f, d=%f", ec.avgCost(), ed.avgCost()));
    assertEquals(ed.avgCost(), ec.avgCost(), DELTA);
  }
}
到目前为止我发现了什么
CostSensitiveClassifier 有两种操作模式:它可以在样本上设置显式权重(通过使用 .weight() 方法),也可以进行有放回的重采样(resampling with replacement)。在我的具体情况下,它使用的是后一种方法。
因此,上述嵌套的分类器结构会对原始样本进行两次重采样。由于重采样是一个随机过程,两次重采样的结果不会与单次重采样的结果相同。
我错误地把一个 CostSensitiveClassifier 嵌套在另一个 CostSensitiveClassifier 内部使用,两者各有不同的权重矩阵。其中一个是
(matrix 1a)
0 3
1 0
另一个是
(matrix 1b)
0 1
3 0
并且取得了很好的效果。当我发现错误时,我将我的权重矩阵更改为:
(matrix 2)
0 1
1 0
但无法获得与以前相同的结果。我也试过了
(matrix 3)
0 3
3 0
我原以为,把一个 CostSensitiveClassifier 套在另一个 CostSensitiveClassifier 之上时,使用矩阵 1a 和 1b 得到的结果会与使用矩阵 2、甚至矩阵 3 得到的结果相同,但结果却大不相同。
除了使用指定的权重之外,CostSensitiveClassifier 是否还会以其他方式改变成本值?
谢谢
--
我写了一个单元测试来说明这一点。它应该适用于任何以最后一个属性作为类别(class)属性、并且只有两个类别的数据集。
import static org.junit.Assert.*;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.apache.log4j.lf5.util.Resource;
import org.junit.Test;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.meta.CostSensitiveClassifier;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;
/**
 * Illustrates that nesting one {@link CostSensitiveClassifier} inside another does not give
 * the same evaluation figures as a single cost-sensitive wrapper around {@link J48}.
 *
 * <p>All models are trained and evaluated on the same data set, whose last attribute is used
 * as the class attribute.
 *
 * <p>NOTE(review): {@code CostMatrix} is used below but does not appear in the visible import
 * list; confirm that {@code weka.classifiers.CostMatrix} is imported.
 */
public class WekaFacadeTest {

  /** Tolerance used when comparing error rates and average costs. */
  private static final double DELTA = 0.0001;

  private static final Logger logger = Logger.getLogger(WekaFacadeTest.class);

  /**
   * Builds a 2x2 cost matrix with a zero diagonal.
   *
   * @param weightFalsePositive value stored in cell (1, 0)
   * @param weightFalseNegative value stored in cell (0, 1)
   * @return the populated cost matrix
   */
  private static CostMatrix createCostMatrix(
      double weightFalsePositive, double weightFalseNegative) {
    CostMatrix costMatrix = new CostMatrix(2);
    costMatrix.setCell(0, 0, 0.0);
    costMatrix.setCell(1, 0, weightFalsePositive);
    costMatrix.setCell(0, 1, weightFalseNegative);
    costMatrix.setCell(1, 1, 0.0);
    return costMatrix;
  }

  /** Creates an untrained cost-sensitive wrapper around a fresh J48 tree. */
  private static CostSensitiveClassifier costSensitiveJ48(CostMatrix costMatrix) {
    CostSensitiveClassifier classifier = new CostSensitiveClassifier();
    classifier.setClassifier(new J48());
    classifier.setCostMatrix(costMatrix);
    return classifier;
  }

  /** Creates an untrained cost-sensitive wrapper around another cost-sensitive J48 wrapper. */
  private static CostSensitiveClassifier chainedCostSensitiveJ48(
      CostMatrix outerMatrix, CostMatrix innerMatrix) {
    CostSensitiveClassifier outer = new CostSensitiveClassifier();
    outer.setClassifier(costSensitiveJ48(innerMatrix));
    outer.setCostMatrix(outerMatrix);
    return outer;
  }

  /** Trains the classifier on {@code data} and evaluates it on the same data with its own costs. */
  private static Evaluation trainAndEvaluate(CostSensitiveClassifier classifier, Instances data)
      throws Exception {
    classifier.buildClassifier(data);
    Evaluation evaluation = new Evaluation(data, classifier.getCostMatrix());
    evaluation.evaluateModel(classifier, data);
    return evaluation;
  }

  /**
   * Compares plain J48, single cost-sensitive J48 and chained cost-sensitive J48.
   *
   * <p>Only the first comparison is reported by the author to pass; the remaining assertions
   * document the unexpected differences and are expected to fail.
   *
   * @throws Exception if loading the data, training or evaluation fails
   */
  @Test
  public void testDoubleCost() throws Exception {
    Instances data = WekaFacade.loadArff("test.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // c1 => cost sensitive classifier applied to j48, cost = 1.0
    CostSensitiveClassifier c1 = costSensitiveJ48(createCostMatrix(1.0, 1.0));
    Evaluation ec1 = trainAndEvaluate(c1, data);

    // c2 => no cost sensitive classifier, straight j48
    J48 c2 = new J48();
    c2.buildClassifier(data);
    Evaluation ec2 = new Evaluation(data);
    ec2.evaluateModel(c2, data);

    // With unit costs, c1 should have the same error rate as plain J48 (reported: success).
    logger.info(String.format("ErrorRate c1=%f, c2=%f", ec1.errorRate(), ec2.errorRate()));
    assertEquals(ec2.errorRate(), ec1.errorRate(), DELTA);

    // c3 => cost sensitive classifier applied to cost sensitive classifier applied to j48,
    // cost = 1.0 at both levels
    CostSensitiveClassifier c3 =
        chainedCostSensitiveJ48(createCostMatrix(1.0, 1.0), createCostMatrix(1.0, 1.0));
    // Evaluate c3 against its own cost matrix rather than borrowing c1's.
    Evaluation ec3 = trainAndEvaluate(c3, data);

    // Reported to fail.
    logger.info(String.format("Cost c3=%f, c1=%f", ec3.avgCost(), ec1.avgCost()));
    assertEquals(ec1.avgCost(), ec3.avgCost(), DELTA);

    // Reported to fail.
    logger.info(String.format("ErrorRate c3=%f, c2=%f", ec3.errorRate(), ec2.errorRate()));
    assertEquals(ec2.errorRate(), ec3.errorRate(), DELTA);

    // d => cost sensitive classifier applied to j48, normal situation
    CostSensitiveClassifier d = costSensitiveJ48(createCostMatrix(3.0, 1.0));
    Evaluation ed = trainAndEvaluate(d, data);

    // c => cost sensitive classifier applied to another cost sensitive classifier,
    // abnormal situation: unit costs outside, 3:1 costs inside
    CostSensitiveClassifier c =
        chainedCostSensitiveJ48(createCostMatrix(1.0, 1.0), createCostMatrix(3.0, 1.0));
    Evaluation ec = trainAndEvaluate(c, data);

    // Should the average cost of c match that of d? Reported to fail.
    logger.info(String.format("Cost c=%f, d=%f", ec.avgCost(), ed.avgCost()));
    assertEquals(ed.avgCost(), ec.avgCost(), DELTA);
  }
}
到目前为止我发现了什么
CostSensitiveClassifier 有两种操作模式:它可以在样本上设置显式权重(通过使用 .weight() 方法),也可以进行有放回的重采样(resampling with replacement)。在我的具体情况下,它使用的是后一种方法。
因此,上述嵌套的分类器结构会对原始样本进行两次重采样。由于重采样是一个随机过程,两次重采样的结果不会与单次重采样的结果相同。