链式 CostSensitiveClassifier

Chained CostSensitiveClassifier

我错误地把一个 CostSensitiveClassifier 嵌套在了另一个 CostSensitiveClassifier 内部,两者各自使用不同的权重(成本)矩阵。一个是

(matrix 1a)
0 3
1 0

另一个是

(matrix 1b)
0 1
3 0

并且取得了很好的效果。当我发现错误时,我将我的权重矩阵更改为:

(matrix 2)
0 1
1 0

但无法获得与以前相同的结果。我也试过了

(matrix 3)
0 3
3 0

我原以为,把一个成本敏感分类器套在另一个成本敏感分类器之上(分别使用矩阵 1a 和 1b),得到的结果会与使用矩阵 2 甚至矩阵 3 时相同,但实际结果却大不相同。

除了使用指定的权重之外,CostSensitiveClassifier 是否还会以其他方式改变成本值?

谢谢

--

我写了一个单元测试来说明这个问题。它应该适用于任何以最后一个属性作为类别(class)属性、并且只有两个类别的数据集。

import static org.junit.Assert.*;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.apache.log4j.lf5.util.Resource;
import org.junit.Test;

import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.meta.CostSensitiveClassifier;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;

/**
 * Demonstrates how wrapping a {@link CostSensitiveClassifier} inside another
 * {@link CostSensitiveClassifier} changes the evaluation results compared with a single
 * cost-sensitive wrapper around J48 (or with plain J48).
 *
 * <p>The test loads {@code test.arff} through {@code WekaFacade.loadArff} and uses the last
 * attribute as the class attribute. All cost matrices are 2x2, so the data set is expected to
 * have exactly two classes.
 *
 * <p>Several assertions below were reported by the original author to fail; they are kept
 * on purpose because they illustrate the behavior under discussion.
 *
 * <p>NOTE(review): {@code CostMatrix} is not among the visible imports of this file;
 * presumably {@code weka.classifiers.CostMatrix} is intended. Confirm and import it.
 */
public class WekaFacadeTest {

    /** Shared logger; one instance per class is sufficient. */
    private static final Logger logger = Logger.getLogger(WekaFacadeTest.class);

    /** Absolute tolerance used when comparing error rates and average costs. */
    private static final double DELTA = 0.0001;

    /**
     * Builds a 2x2 cost matrix with a zero diagonal.
     *
     * @param weightFalsePositive value stored in cell (row 1, column 0)
     * @param weightFalseNegative value stored in cell (row 0, column 1)
     * @return the populated cost matrix
     */
    private static CostMatrix createCostMatrix(double weightFalsePositive,
                                               double weightFalseNegative) {
        CostMatrix costMatrix = new CostMatrix(2);
        costMatrix.setCell(0, 0, 0.0);
        costMatrix.setCell(1, 0, weightFalsePositive);
        costMatrix.setCell(0, 1, weightFalseNegative);
        costMatrix.setCell(1, 1, 0.0);
        return costMatrix;
    }

    /**
     * Creates an untrained cost-sensitive wrapper around a fresh J48.
     *
     * @param costMatrix cost matrix assigned to the wrapper
     * @return the configured, not yet built, classifier
     */
    private static CostSensitiveClassifier costSensitiveJ48(CostMatrix costMatrix) {
        CostSensitiveClassifier wrapper = new CostSensitiveClassifier();
        wrapper.setClassifier(new J48());
        wrapper.setCostMatrix(costMatrix);
        return wrapper;
    }

    /**
     * Creates an untrained cost-sensitive wrapper whose base classifier is itself a
     * cost-sensitive wrapper around a fresh J48.
     *
     * @param outerMatrix cost matrix assigned to the outer wrapper
     * @param innerMatrix cost matrix assigned to the inner wrapper
     * @return the configured, not yet built, outer classifier
     */
    private static CostSensitiveClassifier chainedCostSensitiveJ48(CostMatrix outerMatrix,
                                                                   CostMatrix innerMatrix) {
        CostSensitiveClassifier outer = new CostSensitiveClassifier();
        outer.setClassifier(costSensitiveJ48(innerMatrix));
        outer.setCostMatrix(outerMatrix);
        return outer;
    }

    /**
     * Evaluates an already built classifier on {@code data} using the given cost matrix.
     *
     * @param classifier trained classifier to evaluate
     * @param data       instances used both to initialise the evaluation and as the test set
     * @param costMatrix cost matrix passed to the {@link Evaluation} constructor
     * @return the completed evaluation
     * @throws Exception if Weka fails to create or run the evaluation
     */
    private static Evaluation evaluate(Classifier classifier, Instances data,
                                       CostMatrix costMatrix) throws Exception {
        Evaluation evaluation = new Evaluation(data, costMatrix);
        evaluation.evaluateModel(classifier, data);
        return evaluation;
    }

    /**
     * Compares single and chained cost-sensitive classifiers against each other and
     * against plain J48, training and evaluating on the same data set.
     *
     * @throws Exception if loading the data, training or evaluation fails
     */
    @Test
    public void testDoubleCost() throws Exception {
        Instances data = WekaFacade.loadArff("test.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // c1 => cost sensitive classifier applied to J48, all misclassification costs = 1.0
        CostSensitiveClassifier c1 = costSensitiveJ48(createCostMatrix(1.0, 1.0));
        c1.buildClassifier(data);
        Evaluation ec1 = evaluate(c1, data, c1.getCostMatrix());

        // c2 => no cost sensitive classifier, straight J48
        J48 c2 = new J48();
        c2.buildClassifier(data);
        Evaluation ec2 = new Evaluation(data);
        ec2.evaluateModel(c2, data);

        // Should the error rate of c1 equal that of c2? (reported: passes)
        logger.info(String.format("ErrorRate ec1=%f, ec2=%f", ec1.errorRate(), ec2.errorRate()));
        assertEquals(ec2.errorRate(), ec1.errorRate(), DELTA);

        // c3 => cost sensitive classifier applied to a cost sensitive classifier applied
        // to J48, all misclassification costs = 1.0 on both levels
        CostSensitiveClassifier c3 =
                chainedCostSensitiveJ48(createCostMatrix(1.0, 1.0), createCostMatrix(1.0, 1.0));
        c3.buildClassifier(data);
        // Evaluate c3 with its own matrix, like every other scenario in this test.
        Evaluation ec3 = evaluate(c3, data, c3.getCostMatrix());

        // reported: fails
        logger.info(String.format("Cost c3=%f, c1=%f", ec3.avgCost(), ec1.avgCost()));
        assertEquals(ec1.avgCost(), ec3.avgCost(), DELTA);

        // reported: fails
        logger.info(String.format("ErrorRate c3=%f, c2=%f", ec3.errorRate(), ec2.errorRate()));
        assertEquals(ec2.errorRate(), ec3.errorRate(), DELTA);

        // d => cost sensitive classifier applied to J48, the normal situation
        CostSensitiveClassifier d = costSensitiveJ48(createCostMatrix(3.0, 1.0));
        d.buildClassifier(data);
        Evaluation ed = evaluate(d, data, d.getCostMatrix());

        // c => cost sensitive classifier applied to another cost sensitive classifier,
        // the abnormal situation: neutral outer matrix, weighted inner matrix
        CostSensitiveClassifier c =
                chainedCostSensitiveJ48(createCostMatrix(1.0, 1.0), createCostMatrix(3.0, 1.0));
        c.buildClassifier(data);
        Evaluation ec = evaluate(c, data, c.getCostMatrix());

        // Should the average cost of c be the same as that of d? (reported: fails)
        logger.info(String.format("Cost c=%f, d=%f", ec.avgCost(), ed.avgCost()));
        assertEquals(ed.avgCost(), ec.avgCost(), DELTA);
    }
}

到目前为止我发现了什么

CostSensitiveClassifier 有两种工作模式:它可以在样本上设置显式权重(通过 .weight() 方法),也可以对样本进行有放回的重采样。在我的具体情况下,它使用的是后一种方式。

因此,上述嵌套的类结构会对原始样本进行两次重采样。由于重采样是一个随机过程,两次重采样的结果不应与单次重采样的结果相同。