在 WEKA API 中为单个文档计算 TF-IDF 以预测分类
Calculate TF-IDF in WEKA API for single document to predict classification
出于某种原因,我正在使用 WEKA API...
我已经为一组文档生成了 tf-idf 分数,
// Build a TF-IDF representation of the document corpus.
// NOTE(review): all options are configured BEFORE setInputFormat(data) --
// the original called setInputFormat first; WEKA filter options should be
// set before the input format is determined, or later option changes may
// be ignored.
StringToWordVector filter = new StringToWordVector();
filter.setIDFTransform(true);
filter.setStopwordsHandler(new StopWordsHandlerEN()); // just a simple handler for stop words I created
filter.setLowerCaseTokens(true);
filter.setStemmer(new MyStemmer()); // a stemmer I created
filter.setWordsToKeep(words2keep);
filter.setInputFormat(data);
Instances result = Filter.useFilter(data, filter);
然后将它们分成训练和测试子集,进行训练、测试等等...
一旦我有了一个训练好的现成分类模型,我想创建一个简单的 API 来对任何传入的文档进行分类。但问题是,新文档的 tf-idf 分数需要根据起始文档集的词汇表和 idf 值来计算,对吧?换句话说,如果我没记错的话,我需要加载 scikit-learn 的 TfidfVectorizer 在 WEKA 中的对应物。
我在 WEKA 中找不到类似的东西...有吗?..
StringToWordVector 过滤器在幕后使用 weka.core.DictionaryBuilder 类来进行 TF/IDF 计算。
只要您使用要转换的文本创建一个 weka.core.Instance
对象,就可以使用生成器的 vectorizeInstance(Instance)
方法来实现。
编辑 1:
下面是一个基于您的代码(但使用 Weka 自带的类)的示例,它展示了如何使用过滤器或 DictionaryBuilder 进行 TF/IDF 转换。两者都经过序列化、反序列化并重新使用,以证明这些类是可序列化的:
import weka.core.DictionaryBuilder;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils;
import weka.core.stemmers.LovinsStemmer;
import weka.core.stopwords.Rainbow;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;
/**
 * Demonstrates how to reuse the TF/IDF dictionary built by
 * StringToWordVector so that single unseen documents can be vectorized
 * (the WEKA counterpart of scikit-learn's fitted TfidfVectorizer).
 * Also shows that both the filter and its DictionaryBuilder survive
 * serialization and produce identical output after deserialization.
 */
public class TFIDF {

    /**
     * Exposes the protected DictionaryBuilder member of StringToWordVector,
     * so the trained dictionary can be used directly via
     * {@code vectorizeInstance(Instance)}.
     */
    public static class StringToWordVectorExposed extends StringToWordVector {
        public DictionaryBuilder getDictionary() {
            return m_dictionaryBuilder;
        }
    }

    public static void main(String[] args) throws Exception {
        // load train/test data; class attribute is assumed to be the last one
        Instances train = ConverterUtils.DataSource.read("/some/where/train.arff");
        train.setClassIndex(train.numAttributes() - 1);
        Instances test = ConverterUtils.DataSource.read("/some/where/test.arff");
        test.setClassIndex(test.numAttributes() - 1);

        // init filter: set all options BEFORE calling setInputFormat(...)
        // (the original called setInputFormat twice, once before the options --
        // that first call was redundant and has been removed)
        StringToWordVectorExposed filter = new StringToWordVectorExposed();
        int words2keep = 100;
        filter.setIDFTransform(true);
        filter.setStopwordsHandler(new Rainbow());
        filter.setLowerCaseTokens(true);
        filter.setStemmer(new LovinsStemmer());
        filter.setWordsToKeep(words2keep);
        filter.setInputFormat(train);

        // filtering the first batch initializes the internal dictionary
        Instances trainFiltered = Filter.useFilter(train, filter);
        DictionaryBuilder builder = filter.getDictionary();

        // apply filter and dictionary to unseen data; both routes should
        // yield the same TF/IDF vector for the same document
        Instances testFiltered = Filter.useFilter(test, filter);
        System.out.println(testFiltered.instance(0));
        Instance tfidf = builder.vectorizeInstance(test.instance(0));
        System.out.println(tfidf);

        // serialize the trained filter and its dictionary for later reuse
        SerializationHelper.write("/some/where/filter.ser", filter);
        SerializationHelper.write("/some/where/dictionary.ser", filter.getDictionary());

        // deserialize
        StringToWordVectorExposed filter2 =
            (StringToWordVectorExposed) SerializationHelper.read("/some/where/filter.ser");
        DictionaryBuilder builder2 =
            (DictionaryBuilder) SerializationHelper.read("/some/where/dictionary.ser");

        // re-apply: output must match the pre-serialization results
        testFiltered = Filter.useFilter(test, filter2);
        System.out.println(testFiltered.instance(0));
        tfidf = builder2.vectorizeInstance(test.instance(0));
        System.out.println(tfidf);
    }
}
出于某种原因,我正在使用 WEKA API...
我已经为一组文档生成了 tf-idf 分数,
// Build a TF-IDF representation of the document corpus.
// NOTE(review): all options are configured BEFORE setInputFormat(data) --
// the original called setInputFormat first; WEKA filter options should be
// set before the input format is determined, or later option changes may
// be ignored.
StringToWordVector filter = new StringToWordVector();
filter.setIDFTransform(true);
filter.setStopwordsHandler(new StopWordsHandlerEN()); // just a simple handler for stop words I created
filter.setLowerCaseTokens(true);
filter.setStemmer(new MyStemmer()); // a stemmer I created
filter.setWordsToKeep(words2keep);
filter.setInputFormat(data);
Instances result = Filter.useFilter(data, filter);
然后将它们分成训练和测试子集,进行训练、测试等等...
一旦我有了一个训练好的现成分类模型,我想创建一个简单的 API 来对任何传入的文档进行分类。但问题是,新文档的 tf-idf 分数需要根据起始文档集的词汇表和 idf 值来计算,对吧?换句话说,如果我没记错的话,我需要加载 scikit-learn 的 TfidfVectorizer 在 WEKA 中的对应物。
我在 WEKA 中找不到类似的东西...有吗?..
StringToWordVector 过滤器在幕后使用 weka.core.DictionaryBuilder 类来进行 TF/IDF 计算。
只要您使用要转换的文本创建一个 weka.core.Instance
对象,就可以使用生成器的 vectorizeInstance(Instance)
方法来实现。
编辑 1:
下面是一个基于您的代码(但使用 Weka 自带的类)的示例,它展示了如何使用过滤器或 DictionaryBuilder 进行 TF/IDF 转换。两者都经过序列化、反序列化并重新使用,以证明这些类是可序列化的:
import weka.core.DictionaryBuilder;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils;
import weka.core.stemmers.LovinsStemmer;
import weka.core.stopwords.Rainbow;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;
/**
 * Demonstrates how to reuse the TF/IDF dictionary built by
 * StringToWordVector so that single unseen documents can be vectorized
 * (the WEKA counterpart of scikit-learn's fitted TfidfVectorizer).
 * Also shows that both the filter and its DictionaryBuilder survive
 * serialization and produce identical output after deserialization.
 */
public class TFIDF {

    /**
     * Exposes the protected DictionaryBuilder member of StringToWordVector,
     * so the trained dictionary can be used directly via
     * {@code vectorizeInstance(Instance)}.
     */
    public static class StringToWordVectorExposed extends StringToWordVector {
        public DictionaryBuilder getDictionary() {
            return m_dictionaryBuilder;
        }
    }

    public static void main(String[] args) throws Exception {
        // load train/test data; class attribute is assumed to be the last one
        Instances train = ConverterUtils.DataSource.read("/some/where/train.arff");
        train.setClassIndex(train.numAttributes() - 1);
        Instances test = ConverterUtils.DataSource.read("/some/where/test.arff");
        test.setClassIndex(test.numAttributes() - 1);

        // init filter: set all options BEFORE calling setInputFormat(...)
        // (the original called setInputFormat twice, once before the options --
        // that first call was redundant and has been removed)
        StringToWordVectorExposed filter = new StringToWordVectorExposed();
        int words2keep = 100;
        filter.setIDFTransform(true);
        filter.setStopwordsHandler(new Rainbow());
        filter.setLowerCaseTokens(true);
        filter.setStemmer(new LovinsStemmer());
        filter.setWordsToKeep(words2keep);
        filter.setInputFormat(train);

        // filtering the first batch initializes the internal dictionary
        Instances trainFiltered = Filter.useFilter(train, filter);
        DictionaryBuilder builder = filter.getDictionary();

        // apply filter and dictionary to unseen data; both routes should
        // yield the same TF/IDF vector for the same document
        Instances testFiltered = Filter.useFilter(test, filter);
        System.out.println(testFiltered.instance(0));
        Instance tfidf = builder.vectorizeInstance(test.instance(0));
        System.out.println(tfidf);

        // serialize the trained filter and its dictionary for later reuse
        SerializationHelper.write("/some/where/filter.ser", filter);
        SerializationHelper.write("/some/where/dictionary.ser", filter.getDictionary());

        // deserialize
        StringToWordVectorExposed filter2 =
            (StringToWordVectorExposed) SerializationHelper.read("/some/where/filter.ser");
        DictionaryBuilder builder2 =
            (DictionaryBuilder) SerializationHelper.read("/some/where/dictionary.ser");

        // re-apply: output must match the pre-serialization results
        testFiltered = Filter.useFilter(test, filter2);
        System.out.println(testFiltered.instance(0));
        tfidf = builder2.vectorizeInstance(test.instance(0));
        System.out.println(tfidf);
    }
}