在 Java 中将标记(token)写入文件

Write tokens to a file in Java

我有一段标记化(tokenization)代码,想把结果写入一个 .txt 文件,以便之后利用该文件去除所有标点符号。

public static void tokenization() throws FileNotFoundException, IOException{

    String msg;

    int numberOfTotalTokens =0;
    int numberOfMessages = 0;

    String data = "E:\Data\SMSSpamCollection.txt";

    FileInputStream fisData = new FileInputStream(data);
    BufferedReader readBufferData = new BufferedReader(new InputStreamReader(fisData));

    try{
        while ((msg =readBufferData.readLine()) != null) {
            int numberOfTokens = 0;
            StringTokenizer tokens = new StringTokenizer(msg);
            StringBuilder sb = new StringBuilder();
            System.out.println("Before: "+msg);

            System.out.print("After : ");
            while (tokens.hasMoreTokens()) {
                msg = tokens.nextToken();
                String msgLower = msg.toLowerCase();
                numberOfTokens++;
                numberOfTotalTokens++; 
            }

            System.out.println("");
            System.out.println("Total Tokens: "+numberOfTokens);
            System.out.println("\n");
            numberOfTokens++;
            numberOfMessages++;
        }
        System.out.println("Total Tokens: "+numberOfTotalTokens);
        System.out.println("Total Messages: "+numberOfMessages);
    }
    catch (Exception e){
        System.out.println("Error Exception: "+e.getMessage());
    } 
}

代码的结果是一组标记: 例子: 我的 姓名 是 计算器

这组标记需要写入一个 .txt 文件,我该如何把它们写入 .txt 文件?

我在您的代码中添加了更改和注释。使用此代码,您可以使用 BufferedWriter 每行将一个标记写入文件 output.txt

一些备注:

  • 您的代码只做了最基本的标记化(参见文档):仅按空白字符切分,不会处理停用词,也不会去除标点符号
  • StringTokenizer 是出于向后兼容而保留的遗留类;新代码建议改用 String.split 并配合正则表达式
  • 对于更高级的标记化,您可以使用 Apache Lucene 库

希望对您有所帮助!

public static void tokenization() throws FileNotFoundException, IOException{

String msg;

int numberOfTotalTokens =0;
int numberOfMessages = 0;

String data = "E:\Data\SMSSpamCollection.txt";
// create a new output file output.txt
String outfilename = "E:\output.txt"; 
File file =new File(outfilename);
file.createNewFile();

FileInputStream fisData = new FileInputStream(data);
BufferedReader readBufferData = new BufferedReader(new InputStreamReader(fisData));

// create a buffer writer tokDataB
FileWriter tokData = new FileWriter(outfilename,true);
BufferedWriter tokDataB = new BufferedWriter(tokData);

try{
    while ((msg =readBufferData.readLine()) != null) {
        int numberOfTokens = 0;
        StringTokenizer tokens = new StringTokenizer(msg);
        StringBuilder sb = new StringBuilder();
        System.out.println("Before: "+msg);

        System.out.print("After : ");
        while (tokens.hasMoreTokens()) {
            msg = tokens.nextToken();
            String msgLower = msg.toLowerCase();

            // write one token per line to output file
            tokDataB.write(msgLower);
            tokDataB.write("\n");

            numberOfTokens++;
            numberOfTotalTokens++; 
        }

        System.out.println("");
        System.out.println("Total Tokens: "+numberOfTokens);
        System.out.println("\n");
        numberOfTokens++;
        numberOfMessages++;
    }

    // close output writer
    tokDataB.close();                    

    System.out.println("Total Tokens: "+numberOfTotalTokens);
    System.out.println("Total Messages: "+numberOfMessages);
}
catch (Exception e){
    System.out.println("Error Exception: "+e.getMessage());
} 

}