使用 GATE 进行文本分割
Text Segmentation using Gate
我正在尝试用 Java 编写自己的程序,把一组文本文件分割成句子。我搜索了可用的 NLP 工具并找到了 GATE,但我不知道如何只用它的管道来做句子分割。
- 任何关于如何限制管道功能的想法
- 任何可以帮助我编写程序的代码
改编自另一个回答:
import gate.*;
import gate.creole.SerialAnalyserController;
import java.io.File;
import java.util.*;
public class Segmenter {
public static void main(String[] args) throws Exception {
Gate.setGateHome(new File("C:\Program Files\GATE_Developer_8.0"));
Gate.init();
regiterGatePlugin("ANNIE");
SerialAnalyserController pipeline = (SerialAnalyserController) Factory.createResource("gate.creole.SerialAnalyserController");
pipeline.add((ProcessingResource) Factory.createResource("gate.creole.tokeniser.DefaultTokeniser"));
pipeline.add((ProcessingResource) Factory.createResource("gate.creole.splitter.SentenceSplitter"));
Corpus corpus = Factory.newCorpus("SegmenterCorpus");
Document document = Factory.newDocument("Text to be segmented.");
corpus.add(document);
pipeline.setCorpus(corpus);
pipeline.execute();
AnnotationSet defaultAS = document.getAnnotations();
AnnotationSet sentences = defaultAS.get("Sentence");
for (Annotation sentence : sentences) {
System.err.println(Utils.stringFor(document, sentence));
}
//Clean up
Factory.deleteResource(document);
Factory.deleteResource(corpus);
for (ProcessingResource pr : pipeline.getPRs()) {
Factory.deleteResource(pr);
}
Factory.deleteResource(pipeline);
}
public static void regiterGatePlugin(String name) throws Exception {
Gate.getCreoleRegister().registerDirectories(new File(Gate.getPluginsHome(), name).toURI().toURL());
}
}
我正在尝试用 Java 编写自己的程序,把一组文本文件分割成句子。我搜索了可用的 NLP 工具并找到了 GATE,但我不知道如何只用它的管道来做句子分割。
- 任何关于如何限制管道功能的想法
- 任何可以帮助我编写程序的代码
改编自另一个回答:
import gate.*;
import gate.creole.SerialAnalyserController;
import java.io.File;
import java.util.*;
public class Segmenter {
public static void main(String[] args) throws Exception {
Gate.setGateHome(new File("C:\Program Files\GATE_Developer_8.0"));
Gate.init();
regiterGatePlugin("ANNIE");
SerialAnalyserController pipeline = (SerialAnalyserController) Factory.createResource("gate.creole.SerialAnalyserController");
pipeline.add((ProcessingResource) Factory.createResource("gate.creole.tokeniser.DefaultTokeniser"));
pipeline.add((ProcessingResource) Factory.createResource("gate.creole.splitter.SentenceSplitter"));
Corpus corpus = Factory.newCorpus("SegmenterCorpus");
Document document = Factory.newDocument("Text to be segmented.");
corpus.add(document);
pipeline.setCorpus(corpus);
pipeline.execute();
AnnotationSet defaultAS = document.getAnnotations();
AnnotationSet sentences = defaultAS.get("Sentence");
for (Annotation sentence : sentences) {
System.err.println(Utils.stringFor(document, sentence));
}
//Clean up
Factory.deleteResource(document);
Factory.deleteResource(corpus);
for (ProcessingResource pr : pipeline.getPRs()) {
Factory.deleteResource(pr);
}
Factory.deleteResource(pipeline);
}
public static void regiterGatePlugin(String name) throws Exception {
Gate.getCreoleRegister().registerDirectories(new File(Gate.getPluginsHome(), name).toURI().toURL());
}
}