使用 GATE 进行文本分割

Text Segmentation using Gate

我正在尝试用 Java 编写自己的程序,把一组文本文件分割成句子。我搜索了可用的 NLP 工具,找到了 GATE,但我不知道如何只用它的管道(pipeline)来完成句子切分。

  1. 任何关于如何限制管道功能的想法
  2. 任何可以帮助我编写程序的代码

改编自另一个回答:

import gate.*;
import gate.creole.SerialAnalyserController;
import java.io.File;
import java.util.*;

public class Segmenter {
    public static void main(String[] args) throws Exception {
        Gate.setGateHome(new File("C:\Program Files\GATE_Developer_8.0"));
        Gate.init();
        regiterGatePlugin("ANNIE");

        SerialAnalyserController pipeline = (SerialAnalyserController) Factory.createResource("gate.creole.SerialAnalyserController");
        pipeline.add((ProcessingResource) Factory.createResource("gate.creole.tokeniser.DefaultTokeniser"));
        pipeline.add((ProcessingResource) Factory.createResource("gate.creole.splitter.SentenceSplitter"));

        Corpus corpus = Factory.newCorpus("SegmenterCorpus");
        Document document = Factory.newDocument("Text to be segmented.");
        corpus.add(document); 
        pipeline.setCorpus(corpus); 
        pipeline.execute();

        AnnotationSet defaultAS = document.getAnnotations();
        AnnotationSet sentences = defaultAS.get("Sentence");

        for (Annotation sentence : sentences) {
            System.err.println(Utils.stringFor(document, sentence));
        }

        //Clean up
        Factory.deleteResource(document);
        Factory.deleteResource(corpus);
        for (ProcessingResource pr : pipeline.getPRs()) {
            Factory.deleteResource(pr);
        }
        Factory.deleteResource(pipeline);
    }

    public static void regiterGatePlugin(String name) throws Exception {
        Gate.getCreoleRegister().registerDirectories(new File(Gate.getPluginsHome(), name).toURI().toURL());
    }
}