将训练数据添加到现有模型(bin 文件)
Add training data to existing model (bin file)
我正在尝试使用 OpenNLP
将额外的训练数据添加到我的 nl-personTest.bin
文件中。
我的问题是:当我运行代码添加额外的训练数据时,它会删除已有的数据,只保留我新添加的数据。
我怎样才能添加额外的训练数据而不是替换它?
我确实使用了以下代码,(从 Open NLP NER is not properly trained 获得)
public class TrainNames
{
public static void main(String[] args)
{
train("nl", "person", "namen.txt", "nl-ner-personTest.bin");
}
public static String train(String lang, String entity,InputStreamFactory inputStream, FileOutputStream modelStream) {
Charset charset = Charset.forName("UTF-8");
TokenNameFinderModel model = null;
ObjectStream<NameSample> sampleStream = null;
try {
ObjectStream<String> lineStream = new PlainTextByLineStream(inputStream, charset);
sampleStream = new NameSampleDataStream(lineStream);
TokenNameFinderFactory nameFinderFactory = new TokenNameFinderFactory();
model = NameFinderME.train("nl", "person", sampleStream, TrainingParameters.defaultParams(),
nameFinderFactory);
} catch (FileNotFoundException fio) {
} catch (IOException io) {
} finally {
try {
sampleStream.close();
} catch (IOException io) {
}
}
BufferedOutputStream modelOut = null;
try {
modelOut = new BufferedOutputStream(modelStream);
model.serialize(modelOut);
} catch (IOException io) {
} finally {
if (modelOut != null) {
try {
modelOut.close();
} catch (IOException io) {
}
}
}
return "Something goes wrong with training module.";
}
public static String train(String lang, String entity, String taggedCoprusFile,
String modelFile) {
try {
InputStreamFactory inputStream = new InputStreamFactory() {
FileInputStream fileInputStream = new FileInputStream("namen.txt");
public InputStream createInputStream() throws IOException {
return fileInputStream;
}
};
return train(lang, entity, inputStream,
new FileOutputStream(modelFile));
} catch (Exception e) {
e.printStackTrace();
}
return "Something goes wrong with training module.";
} }
有人知道如何解决这个问题吗?
因为文档中说,如果想要得到一个准确的训练集,至少需要 15K 个句子。
我认为 OpenNLP 不支持扩展现有的二进制 NLP 模型。
如果您拥有所有可用的训练数据,请将它们全部收集起来,然后一次性进行训练。您可以使用 SequenceInputStream。我修改了您的示例,改用另一个 InputStreamFactory:
/**
 * Excerpt of a {@code train} variant that builds its line stream from several training
 * files instead of a single one. Parts marked "// ..." are elided, so this snippet does not
 * compile on its own.
 *
 * NOTE(review): in the visible lines the {@code inputStream} parameter is not used; the line
 * stream is built from three hard-coded file names. {@code charset} is not declared in the
 * visible lines - presumably it is declared in the elided part; confirm.
 */
public String train(String lang, String entity, InputStreamFactory inputStream, FileOutputStream modelStream) {
// ....
try {
// One line stream over all three files, supplied by trainingDataInputStreamFactory.
ObjectStream<String> lineStream = new PlainTextByLineStream(trainingDataInputStreamFactory(Arrays.asList(
new File("trainingdata1.txt"),
new File("trainingdata2.txt"),
new File("trainingdata3.txt")
)), charset);
// ...
}
// ...
}
/**
 * Creates a factory whose stream is the concatenation of all given training files, in list
 * order.
 *
 * <p>Opening fails fast: if any file cannot be opened, the streams opened so far are closed
 * and the exception is propagated, so a model is never trained on a silently reduced data set.
 *
 * <p>The files are concatenated byte for byte; each file should end with a line terminator,
 * otherwise its last line runs into the first line of the next file.
 *
 * @param trainingFiles the training files to concatenate
 * @return a factory that opens all files anew on every {@code createInputStream()} call
 */
private InputStreamFactory trainingDataInputStreamFactory(List<File> trainingFiles) {
    return new InputStreamFactory() {
        @Override
        public InputStream createInputStream() throws IOException {
            // Vector is used because SequenceInputStream requires an Enumeration.
            Vector<InputStream> opened = new Vector<>(trainingFiles.size());
            try {
                for (File trainingFile : trainingFiles) {
                    opened.add(new FileInputStream(trainingFile));
                }
            } catch (IOException | RuntimeException e) {
                // Release the files that were already opened before reporting the failure.
                for (InputStream stream : opened) {
                    try {
                        stream.close();
                    } catch (IOException closeFailure) {
                        e.addSuppressed(closeFailure);
                    }
                }
                throw e;
            }
            return new SequenceInputStream(opened.elements());
        }
    };
}
我正在尝试使用 OpenNLP
将额外的训练数据添加到我的 nl-personTest.bin
文件中。
我的问题是:当我运行代码添加额外的训练数据时,它会删除已有的数据,只保留我新添加的数据。
我怎样才能添加额外的训练数据而不是替换它?
我确实使用了以下代码,(从 Open NLP NER is not properly trained 获得)
public class TrainNames
{
public static void main(String[] args)
{
train("nl", "person", "namen.txt", "nl-ner-personTest.bin");
}
public static String train(String lang, String entity,InputStreamFactory inputStream, FileOutputStream modelStream) {
Charset charset = Charset.forName("UTF-8");
TokenNameFinderModel model = null;
ObjectStream<NameSample> sampleStream = null;
try {
ObjectStream<String> lineStream = new PlainTextByLineStream(inputStream, charset);
sampleStream = new NameSampleDataStream(lineStream);
TokenNameFinderFactory nameFinderFactory = new TokenNameFinderFactory();
model = NameFinderME.train("nl", "person", sampleStream, TrainingParameters.defaultParams(),
nameFinderFactory);
} catch (FileNotFoundException fio) {
} catch (IOException io) {
} finally {
try {
sampleStream.close();
} catch (IOException io) {
}
}
BufferedOutputStream modelOut = null;
try {
modelOut = new BufferedOutputStream(modelStream);
model.serialize(modelOut);
} catch (IOException io) {
} finally {
if (modelOut != null) {
try {
modelOut.close();
} catch (IOException io) {
}
}
}
return "Something goes wrong with training module.";
}
public static String train(String lang, String entity, String taggedCoprusFile,
String modelFile) {
try {
InputStreamFactory inputStream = new InputStreamFactory() {
FileInputStream fileInputStream = new FileInputStream("namen.txt");
public InputStream createInputStream() throws IOException {
return fileInputStream;
}
};
return train(lang, entity, inputStream,
new FileOutputStream(modelFile));
} catch (Exception e) {
e.printStackTrace();
}
return "Something goes wrong with training module.";
} }
有人知道如何解决这个问题吗?
因为文档中说,如果想要得到一个准确的训练集,至少需要 15K 个句子。
我认为 OpenNLP 不支持扩展现有的二进制 NLP 模型。
如果您拥有所有可用的训练数据,请将它们全部收集起来,然后一次性进行训练。您可以使用 SequenceInputStream。我修改了您的示例,改用另一个 InputStreamFactory:
/**
 * Excerpt of a {@code train} variant that builds its line stream from several training
 * files instead of a single one. Parts marked "// ..." are elided, so this snippet does not
 * compile on its own.
 *
 * NOTE(review): in the visible lines the {@code inputStream} parameter is not used; the line
 * stream is built from three hard-coded file names. {@code charset} is not declared in the
 * visible lines - presumably it is declared in the elided part; confirm.
 */
public String train(String lang, String entity, InputStreamFactory inputStream, FileOutputStream modelStream) {
// ....
try {
// One line stream over all three files, supplied by trainingDataInputStreamFactory.
ObjectStream<String> lineStream = new PlainTextByLineStream(trainingDataInputStreamFactory(Arrays.asList(
new File("trainingdata1.txt"),
new File("trainingdata2.txt"),
new File("trainingdata3.txt")
)), charset);
// ...
}
// ...
}
/**
 * Creates a factory whose stream is the concatenation of all given training files, in list
 * order.
 *
 * <p>Opening fails fast: if any file cannot be opened, the streams opened so far are closed
 * and the exception is propagated, so a model is never trained on a silently reduced data set.
 *
 * <p>The files are concatenated byte for byte; each file should end with a line terminator,
 * otherwise its last line runs into the first line of the next file.
 *
 * @param trainingFiles the training files to concatenate
 * @return a factory that opens all files anew on every {@code createInputStream()} call
 */
private InputStreamFactory trainingDataInputStreamFactory(List<File> trainingFiles) {
    return new InputStreamFactory() {
        @Override
        public InputStream createInputStream() throws IOException {
            // Vector is used because SequenceInputStream requires an Enumeration.
            Vector<InputStream> opened = new Vector<>(trainingFiles.size());
            try {
                for (File trainingFile : trainingFiles) {
                    opened.add(new FileInputStream(trainingFile));
                }
            } catch (IOException | RuntimeException e) {
                // Release the files that were already opened before reporting the failure.
                for (InputStream stream : opened) {
                    try {
                        stream.close();
                    } catch (IOException closeFailure) {
                        e.addSuppressed(closeFailure);
                    }
                }
                throw e;
            }
            return new SequenceInputStream(opened.elements());
        }
    };
}