将标记(token)写入 Java 中的文件
write token into file in java
我有一段标记化(分词)代码,想把结果写入一个 .txt 文件,这样之后就可以利用该文件去除所有标点符号。
public static void tokenization() throws FileNotFoundException, IOException{
String msg;
int numberOfTotalTokens =0;
int numberOfMessages = 0;
String data = "E:\Data\SMSSpamCollection.txt";
FileInputStream fisData = new FileInputStream(data);
BufferedReader readBufferData = new BufferedReader(new InputStreamReader(fisData));
try{
while ((msg =readBufferData.readLine()) != null) {
int numberOfTokens = 0;
StringTokenizer tokens = new StringTokenizer(msg);
StringBuilder sb = new StringBuilder();
System.out.println("Before: "+msg);
System.out.print("After : ");
while (tokens.hasMoreTokens()) {
msg = tokens.nextToken();
String msgLower = msg.toLowerCase();
numberOfTokens++;
numberOfTotalTokens++;
}
System.out.println("");
System.out.println("Total Tokens: "+numberOfTokens);
System.out.println("\n");
numberOfTokens++;
numberOfMessages++;
}
System.out.println("Total Tokens: "+numberOfTotalTokens);
System.out.println("Total Messages: "+numberOfMessages);
}
catch (Exception e){
System.out.println("Error Exception: "+e.getMessage());
}
}
代码的输出是一组标记,
例如:
我的
姓名
是
计算器
这组标记需要写入一个 .txt 文件。
我该如何将其写入 .txt 文件?
我在您的代码中添加了一些改动和注释。使用这段代码,您可以通过 BufferedWriter 将标记逐行(每行一个)写入文件 output.txt。
一些备注:
- 您的代码只做了一些基本的标记化(参见 StringTokenizer 的文档),不会处理停用词、标点符号等
- StringTokenizer 是为了向后兼容而保留的遗留类,建议改用 String.split 配合正则表达式
- 对于更高级的标记化,您可以使用 Apache Lucene 库
希望对您有所帮助!
public static void tokenization() throws FileNotFoundException, IOException{
String msg;
int numberOfTotalTokens =0;
int numberOfMessages = 0;
String data = "E:\Data\SMSSpamCollection.txt";
// create a new output file output.txt
String outfilename = "E:\output.txt";
File file =new File(outfilename);
file.createNewFile();
FileInputStream fisData = new FileInputStream(data);
BufferedReader readBufferData = new BufferedReader(new InputStreamReader(fisData));
// create a buffer writer tokDataB
FileWriter tokData = new FileWriter(outfilename,true);
BufferedWriter tokDataB = new BufferedWriter(tokData);
try{
while ((msg =readBufferData.readLine()) != null) {
int numberOfTokens = 0;
StringTokenizer tokens = new StringTokenizer(msg);
StringBuilder sb = new StringBuilder();
System.out.println("Before: "+msg);
System.out.print("After : ");
while (tokens.hasMoreTokens()) {
msg = tokens.nextToken();
String msgLower = msg.toLowerCase();
// write one token per line to output file
tokDataB.write(msgLower);
tokDataB.write("\n");
numberOfTokens++;
numberOfTotalTokens++;
}
System.out.println("");
System.out.println("Total Tokens: "+numberOfTokens);
System.out.println("\n");
numberOfTokens++;
numberOfMessages++;
}
// close output writer
tokDataB.close();
System.out.println("Total Tokens: "+numberOfTotalTokens);
System.out.println("Total Messages: "+numberOfMessages);
}
catch (Exception e){
System.out.println("Error Exception: "+e.getMessage());
}
}
我有一段标记化(分词)代码,想把结果写入一个 .txt 文件,这样之后就可以利用该文件去除所有标点符号。
public static void tokenization() throws FileNotFoundException, IOException{
String msg;
int numberOfTotalTokens =0;
int numberOfMessages = 0;
String data = "E:\Data\SMSSpamCollection.txt";
FileInputStream fisData = new FileInputStream(data);
BufferedReader readBufferData = new BufferedReader(new InputStreamReader(fisData));
try{
while ((msg =readBufferData.readLine()) != null) {
int numberOfTokens = 0;
StringTokenizer tokens = new StringTokenizer(msg);
StringBuilder sb = new StringBuilder();
System.out.println("Before: "+msg);
System.out.print("After : ");
while (tokens.hasMoreTokens()) {
msg = tokens.nextToken();
String msgLower = msg.toLowerCase();
numberOfTokens++;
numberOfTotalTokens++;
}
System.out.println("");
System.out.println("Total Tokens: "+numberOfTokens);
System.out.println("\n");
numberOfTokens++;
numberOfMessages++;
}
System.out.println("Total Tokens: "+numberOfTotalTokens);
System.out.println("Total Messages: "+numberOfMessages);
}
catch (Exception e){
System.out.println("Error Exception: "+e.getMessage());
}
}
代码的输出是一组标记,例如:我的 姓名 是 计算器
这组标记需要写入一个 .txt 文件。我该如何将其写入 .txt 文件?
我在您的代码中添加了一些改动和注释。使用这段代码,您可以通过 BufferedWriter 将标记逐行(每行一个)写入文件 output.txt。
一些备注:
- 您的代码只做了一些基本的标记化(参见 StringTokenizer 的文档),不会处理停用词、标点符号等
- StringTokenizer 是为了向后兼容而保留的遗留类,建议改用 String.split 配合正则表达式
- 对于更高级的标记化,您可以使用 Apache Lucene 库
希望对您有所帮助!
public static void tokenization() throws FileNotFoundException, IOException{
String msg;
int numberOfTotalTokens =0;
int numberOfMessages = 0;
String data = "E:\Data\SMSSpamCollection.txt";
// create a new output file output.txt
String outfilename = "E:\output.txt";
File file =new File(outfilename);
file.createNewFile();
FileInputStream fisData = new FileInputStream(data);
BufferedReader readBufferData = new BufferedReader(new InputStreamReader(fisData));
// create a buffer writer tokDataB
FileWriter tokData = new FileWriter(outfilename,true);
BufferedWriter tokDataB = new BufferedWriter(tokData);
try{
while ((msg =readBufferData.readLine()) != null) {
int numberOfTokens = 0;
StringTokenizer tokens = new StringTokenizer(msg);
StringBuilder sb = new StringBuilder();
System.out.println("Before: "+msg);
System.out.print("After : ");
while (tokens.hasMoreTokens()) {
msg = tokens.nextToken();
String msgLower = msg.toLowerCase();
// write one token per line to output file
tokDataB.write(msgLower);
tokDataB.write("\n");
numberOfTokens++;
numberOfTotalTokens++;
}
System.out.println("");
System.out.println("Total Tokens: "+numberOfTokens);
System.out.println("\n");
numberOfTokens++;
numberOfMessages++;
}
// close output writer
tokDataB.close();
System.out.println("Total Tokens: "+numberOfTotalTokens);
System.out.println("Total Messages: "+numberOfMessages);
}
catch (Exception e){
System.out.println("Error Exception: "+e.getMessage());
}
}