读取文件时忽略某些单词
Ignore certain words when reading a file
我的程序读取一个文本文件并列出文件中每个单词的出现频率。接下来我需要做的是在读取文件时忽略某些单词,例如 'the'、'an'。我已经创建了这些单词的列表,但不确定如何在 while 循环中实现它。谢谢
// Words the author wants left out of the frequency count.
// NOTE(review): readWordFile below never reads this array.
public static String [] ConnectingWords = {"and", "it", "you"};
/**
 * Reads /Applications/text.txt, splits it on single spaces, lower-cases each
 * token and stores an incremented count for it in {@code wordcount}.
 *
 * If the file cannot be opened, the exception is printed to System.err and the
 * method returns without touching the map.
 *
 * NOTE(review): the Scanner opened here is never closed inside this method.
 *
 * @param wordcount map from word to occurrence count; updated in place
 */
public static void readWordFile(LinkedHashMap<String, Integer> wordcount) {
// FileReader fileReader = null;
Scanner wordFile;
String word; // A word read from the file
Integer count; // The number of occurrences of the word
// LinkedHashMap <String, Integer> wordcount = new LinkedHashMap<String, Integer> ();
try {
wordFile = new Scanner(new FileReader("/Applications/text.txt"));
// Tokens are separated by exactly one space character.
wordFile.useDelimiter(" ");
} catch (FileNotFoundException e) {
System.err.println(e);
return;
}
while (wordFile.hasNext()) {
word = wordFile.next();
word = word.toLowerCase();
// NOTE(review): this matches any token that contains the substring "the" and
// re-stores its current count (+ 0). It does not skip the token: execution
// falls through to the increment below, so the word is still counted.
if (word.contains("the")) {
// getCount is defined elsewhere; presumably it returns the stored count, or 0
// when the word is absent - verify against its definition.
count = getCount(word, wordcount) + 0;
wordcount.put(word, count);
}
// Get the current count of this word, add one, and then store the
// new count:
count = getCount(word, wordcount) + 1;
wordcount.put(word, count);
}
}
维护一个包含需要排除的单词的列表,并在更新计数之前先检查当前单词是否在排除列表中。
public static void readWordFile (LinkedHashMap<String, Integer> wordcount) {
List<String> excludeList = new ArrayList<>();
excludeList.add("the"); // and so on
// FileReader fileReader = null;
Scanner wordFile;
String word; // A word read from the file
Integer count; // The number of occurrences of the word
// LinkedHashMap <String, Integer> wordcount = new LinkedHashMap <String, Integer> ();
try
{
wordFile = new Scanner(new FileReader("/Applications/text.txt"));
wordFile.useDelimiter(" ");
}
catch (FileNotFoundException e)
{
System.err.println(e);
return;
}
while (wordFile.hasNext())
{
word = wordFile.next( );
word = word.toLowerCase();
if(!excludeList.contains(word)) {
count = wordcount.get(word) + 1;
wordcount.put(word, count);
}
}
创建一个包含需要忽略的单词的列表:
// Words that should be left out of the frequency count.
List<String> ignoreAll= Arrays.asList("and","it", "you");
然后在 while 循环中添加一个条件,跳过出现在该列表中的单词,例如:
// Skip the current word when it is on the ignore list; this check belongs
// inside the loop that reads the words.
if(ignoreAll.contains(word)){
continue;
}
您可以试试下面的代码。
/** Words that are skipped when counting. */
public static HashSet<String> connectingWords;
/** Maps each counted word to the number of times it has been seen. */
public static Map<String,Integer> frequencyMap;

static {
    connectingWords = new HashSet<>();
    connectingWords.add("and");
    connectingWords.add("it");
    connectingWords.add("you");
    frequencyMap = new HashMap<>();
}

/**
 * Reads src/files/temp2.txt line by line, splits every line on '-', and
 * records in {@code frequencyMap} how often each word occurs, skipping the
 * words in {@code connectingWords}. The counts are printed at the end.
 *
 * Matching is case-sensitive. An I/O failure (including a missing file) is
 * reported with a stack trace and whatever was counted so far is printed.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // try-with-resources closes the reader on every path. The previous
    // finally { reader.close(); } did not compile (unhandled IOException)
    // and would have thrown a NullPointerException if the file failed to open.
    try (BufferedReader reader = new BufferedReader(new FileReader("src/files/temp2.txt"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            for (String word : line.split("-")) {
                // Blank lines and doubled separators produce empty tokens; they are not words.
                if (word.isEmpty() || connectingWords.contains(word)) {
                    continue;
                }
                // The first occurrence counts as 1 (it used to be stored as 0).
                frequencyMap.merge(word, 1, Integer::sum);
            }
        }
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so one handler covers both.
        e.printStackTrace();
    }
    System.out.println(frequencyMap.values());
}
最好将连接词存储在 HashSet 中,因为每次对文件中的单词调用 contains 时,它都能提供快速查找。另外,单词及其出现频率可以保存在 Map 中。此外,我假设单词的分隔符是 -;如果是其他分隔符,您可以相应地修改代码。同样,如果您对大小写(case)有特殊要求,也可以修改代码。我已经用内容为 What-the-hell-is-going-on-and-it-is-good 的文件进行了测试,它工作正常。
我的程序读取一个文本文件并列出文件中每个单词的出现频率。接下来我需要做的是在读取文件时忽略某些单词,例如 'the'、'an'。我已经创建了这些单词的列表,但不确定如何在 while 循环中实现它。谢谢
// Words the author wants left out of the frequency count.
// NOTE(review): readWordFile below never reads this array.
public static String [] ConnectingWords = {"and", "it", "you"};
/**
 * Reads /Applications/text.txt, splits it on single spaces, lower-cases each
 * token and stores an incremented count for it in {@code wordcount}.
 *
 * If the file cannot be opened, the exception is printed to System.err and the
 * method returns without touching the map.
 *
 * NOTE(review): the Scanner opened here is never closed inside this method.
 *
 * @param wordcount map from word to occurrence count; updated in place
 */
public static void readWordFile(LinkedHashMap<String, Integer> wordcount) {
// FileReader fileReader = null;
Scanner wordFile;
String word; // A word read from the file
Integer count; // The number of occurrences of the word
// LinkedHashMap <String, Integer> wordcount = new LinkedHashMap<String, Integer> ();
try {
wordFile = new Scanner(new FileReader("/Applications/text.txt"));
// Tokens are separated by exactly one space character.
wordFile.useDelimiter(" ");
} catch (FileNotFoundException e) {
System.err.println(e);
return;
}
while (wordFile.hasNext()) {
word = wordFile.next();
word = word.toLowerCase();
// NOTE(review): this matches any token that contains the substring "the" and
// re-stores its current count (+ 0). It does not skip the token: execution
// falls through to the increment below, so the word is still counted.
if (word.contains("the")) {
// getCount is defined elsewhere; presumably it returns the stored count, or 0
// when the word is absent - verify against its definition.
count = getCount(word, wordcount) + 0;
wordcount.put(word, count);
}
// Get the current count of this word, add one, and then store the
// new count:
count = getCount(word, wordcount) + 1;
wordcount.put(word, count);
}
}
维护一个包含需要排除的单词的列表,并在更新计数之前先检查当前单词是否在排除列表中。
public static void readWordFile (LinkedHashMap<String, Integer> wordcount) {
List<String> excludeList = new ArrayList<>();
excludeList.add("the"); // and so on
// FileReader fileReader = null;
Scanner wordFile;
String word; // A word read from the file
Integer count; // The number of occurrences of the word
// LinkedHashMap <String, Integer> wordcount = new LinkedHashMap <String, Integer> ();
try
{
wordFile = new Scanner(new FileReader("/Applications/text.txt"));
wordFile.useDelimiter(" ");
}
catch (FileNotFoundException e)
{
System.err.println(e);
return;
}
while (wordFile.hasNext())
{
word = wordFile.next( );
word = word.toLowerCase();
if(!excludeList.contains(word)) {
count = wordcount.get(word) + 1;
wordcount.put(word, count);
}
}
创建一个包含需要忽略的单词的列表:
// Words that should be left out of the frequency count.
List<String> ignoreAll= Arrays.asList("and","it", "you");
然后在 while 循环中添加一个条件,跳过出现在该列表中的单词,例如:
// Skip the current word when it is on the ignore list; this check belongs
// inside the loop that reads the words.
if(ignoreAll.contains(word)){
continue;
}
您可以试试下面的代码。
/** Words that are skipped when counting. */
public static HashSet<String> connectingWords;
/** Maps each counted word to the number of times it has been seen. */
public static Map<String,Integer> frequencyMap;

static {
    connectingWords = new HashSet<>();
    connectingWords.add("and");
    connectingWords.add("it");
    connectingWords.add("you");
    frequencyMap = new HashMap<>();
}

/**
 * Reads src/files/temp2.txt line by line, splits every line on '-', and
 * records in {@code frequencyMap} how often each word occurs, skipping the
 * words in {@code connectingWords}. The counts are printed at the end.
 *
 * Matching is case-sensitive. An I/O failure (including a missing file) is
 * reported with a stack trace and whatever was counted so far is printed.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // try-with-resources closes the reader on every path. The previous
    // finally { reader.close(); } did not compile (unhandled IOException)
    // and would have thrown a NullPointerException if the file failed to open.
    try (BufferedReader reader = new BufferedReader(new FileReader("src/files/temp2.txt"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            for (String word : line.split("-")) {
                // Blank lines and doubled separators produce empty tokens; they are not words.
                if (word.isEmpty() || connectingWords.contains(word)) {
                    continue;
                }
                // The first occurrence counts as 1 (it used to be stored as 0).
                frequencyMap.merge(word, 1, Integer::sum);
            }
        }
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so one handler covers both.
        e.printStackTrace();
    }
    System.out.println(frequencyMap.values());
}
最好将连接词存储在 HashSet 中,因为每次对文件中的单词调用 contains 时,它都能提供快速查找。另外,单词及其出现频率可以保存在 Map 中。此外,我假设单词的分隔符是 -;如果是其他分隔符,您可以相应地修改代码。同样,如果您对大小写(case)有特殊要求,也可以修改代码。我已经用内容为 What-the-hell-is-going-on-and-it-is-good 的文件进行了测试,它工作正常。