删除重复的内容 Java
Remove repeated content Java
我收到了这段文字,我需要过滤掉这些重复的行和词。
我不知道有没有比我现在做的更好的方法。
00:00:00,413|03:50:25,600|ISDB|>> FALAM QUE A GENTE COMBINA
00:00:00,413|03:50:25,600|ISDB|PERFEITAMENTE. EU
00:00:01,135|00:00:01,315|ISDB|>> FALAM QUE A GENTE COMBINA
00:00:01,135|00:00:01,315|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:01,315|00:00:02,218|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:01,315|00:00:02,218|ISDB|BOBAS PARA
00:00:02,218|00:00:02,398|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:02,218|00:00:02,398|ISDB|BOBAS PARA AMIGOS
00:00:02,398|00:00:02,759|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:02,398|00:00:02,759|ISDB|BOBAS PARA AMIGOS E AO
00:00:02,759|00:00:03,274|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:02,759|00:00:03,274|ISDB|BOBAS PARA AMIGOS E AO INV?
00:00:03,274|00:00:04,357|ISDB|BOBAS PARA AMIGOS E AO INV?
00:00:03,274|00:00:04,357|ISDB|DISSO TROUXERAM ISSO A?
00:00:04,357|00:00:05,259|ISDB|BOBAS PARA AMIGOS E AO INV?
00:00:04,357|00:00:05,259|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:05,259|00:00:05,414|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:05,414|00:00:05,775|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:05,414|00:00:05,775|ISDB|COLOCARAM AS FOTOS
00:00:05,775|00:00:06,677|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:05,775|00:00:06,677|ISDB|COLOCARAM AS FOTOS COMO
00:00:06,677|00:00:06,858|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:06,677|00:00:06,858|ISDB|COLOCARAM AS FOTOS COMO PAPEL
00:00:06,858|03:50:32,400|ISDB|COLOCARAM AS FOTOS COMO PAPEL DE
00:00:06,858|03:50:32,400|ISDB|PAREDE, PARECE AT?QUE
00:00:07,914|00:00:07,916|ISDB|COLOCARAM AS FOTOS COMO PAPEL DE
00:00:07,914|00:00:07,916|ISDB|PAREDE, PARECE AT?QUE EU
00:00:07,914|00:00:08,997|ISDB|PAREDE, PARECE AT?QUE EU GOSTO
00:00:08,997|00:00:09,178|ISDB|PAREDE, PARECE AT?QUE EU GOSTO
我正在使用该代码将这些行放入 HashSet 中,这样它们就不会重复。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
/**
 * Copies a caption log to a new file, dropping every line whose message text
 * has already been written.
 *
 * <p>Each input line is expected to look like
 * {@code 00:00:01,135|00:00:01,315|ISDB|MESSAGE}: a 31-character
 * {@code start|end|ISDB|} prefix followed by the message. Only the message
 * takes part in the duplicate check, and the first line carrying a given
 * message is the one that is kept.
 */
public class Testecc {

    /** Width of the {@code start|end|ISDB|} prefix that precedes the message. */
    private static final int PREFIX_LENGTH = 31;

    /**
     * Reads {@code C://teste//teste1.txt}, echoes every line to standard output
     * and writes the lines with a not-yet-seen message to
     * {@code C://teste//teste.txt}.
     *
     * @param args unused
     * @throws Exception if the input cannot be read or the output cannot be written
     */
    public static void main(String args[]) throws Exception {
        String filePath = "C://teste//teste1.txt";

        // NOTE(review): FileReader/FileWriter use the JVM's default charset; accented
        // characters can be garbled if the file uses another encoding - confirm.
        // try-with-resources closes both files even when reading or writing fails.
        try (BufferedReader br = new BufferedReader(new FileReader(filePath));
             FileWriter writer = new FileWriter("C://teste//teste.txt")) {
            // Messages already written; Set.add returns false for a repeat.
            Set<String> seen = new HashSet<>();
            String line;
            // Single read loop: every line is examined, including the first ones.
            while ((line = br.readLine()) != null) {
                System.out.println(line);
                if (seen.add(messageOf(line))) {
                    writer.append(line).append("\n");
                }
            }
            writer.flush();
        }
        System.out.println("Pronto!");
    }

    /**
     * Returns the part of {@code line} used for duplicate detection.
     *
     * @param line one line of the log
     * @return the text after the fixed-width prefix, or the whole line when it is
     *     shorter than the prefix (so short or blank lines do not throw)
     */
    static String messageOf(String line) {
        return line.length() >= PREFIX_LENGTH ? line.substring(PREFIX_LENGTH) : line;
    }
}
有了这个,我删除了这样的重复行:
00:00:01,135|00:00:01,315|ISDB|>> FALAM QUE A GENTE COMBINA
00:00:01,135|00:00:01,315|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:01,315|00:00:02,218|ISDB|BOBAS PARA
00:00:02,218|00:00:02,398|ISDB|BOBAS PARA AMIGOS
00:00:02,398|00:00:02,759|ISDB|BOBAS PARA AMIGOS E AO
00:00:02,759|00:00:03,274|ISDB|BOBAS PARA AMIGOS E AO INV�S
00:00:03,274|00:00:04,357|ISDB|DISSO TROUXERAM ISSO A�.
00:00:04,357|00:00:05,259|ISDB|DISSO TROUXERAM ISSO A�. ELES
00:00:05,414|00:00:05,775|ISDB|COLOCARAM AS FOTOS
00:00:05,775|00:00:06,677|ISDB|COLOCARAM AS FOTOS COMO
00:00:06,677|00:00:06,858|ISDB|COLOCARAM AS FOTOS COMO PAPEL
00:00:06,858|03:50:32,400|ISDB|COLOCARAM AS FOTOS COMO PAPEL DE
00:00:06,858|03:50:32,400|ISDB|PAREDE, PARECE AT� QUE
00:00:07,914|00:00:07,916|ISDB|PAREDE, PARECE AT� QUE EU
00:00:07,914|00:00:08,997|ISDB|PAREDE, PARECE AT� QUE EU GOSTO
但我还需要删除重复的单词。
我真的没主意了。
我该怎么做?
您可以将每个日志行中最后一个竖线(|)之后的部分作为键,然后将每一行插入 LinkedHashMap,以删除重复项:
String filePath = "C:/log.txt";
// Maps message text (everything after the last '|') to the full log line.
// put() on an existing key keeps the key's original position but stores the
// newest line, so the last line of each group of duplicates is retained.
Map<String, String> logMap = new LinkedHashMap<>();
// try-with-resources closes the reader even if reading fails.
try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
    String input;
    // readLine() is called exactly once per iteration so no line is skipped.
    while ((input = br.readLine()) != null) {
        // lastIndexOf returns -1 when there is no '|', making the whole line the key.
        String key = input.substring(input.lastIndexOf('|') + 1);
        logMap.put(key, input);
    }
}
// Now print out the map minus duplicates
for (String line : logMap.values()) {
    System.out.println(line);
}
您可以轻松地将过滤后的日志写入另一个文件,而不是打印到控制台。请注意,此方法会保留每组重复项中的最后一行。
使用一个 Map,按特定的键对各行进行分组。键是消息的开头部分,从您感兴趣的单词开始,比如前 5 个字母。然后将这些行添加到 Map 中;如果某一行比之前为该键找到的行更长,则用它替换原来的行。
// For each distinct first word of the message (the text after the last '|'),
// keeps the longest line seen, then prints the kept lines.
try (BufferedReader br = new BufferedReader(new FileReader(filepath))) {
    final Map<String, String> map = new LinkedHashMap<>();
    br.lines().forEach(line -> {
        String message = line.substring(line.lastIndexOf("|") + 1);
        if (message.isEmpty()) {
            return;
        }
        String[] words = message.split(" ");
        if (words.length == 0) {
            // A message made only of spaces splits into an empty array
            // (trailing empty strings are dropped), so there is no key to use.
            return;
        }
        String key = words[0];
        String longest = map.get(key);
        if (longest == null) {
            map.put(key, line);
        } else if (longest.length() < line.length()) {
            // remove + put (rather than a plain put) moves the entry to the
            // end of the LinkedHashMap's iteration order.
            map.remove(key);
            map.put(key, line);
        }
    });
    map.forEach((k, v) -> System.out.println(v));
}
上面的代码会给你下面的输出。
00:00:00,413|03:50:25,600|ISDB|>> FALAM QUE A GENTE COMBINA
00:00:01,135|00:00:01,315|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:02,759|00:00:03,274|ISDB|BOBAS PARA AMIGOS E AO INV?
00:00:04,357|00:00:05,259|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:06,858|03:50:32,400|ISDB|COLOCARAM AS FOTOS COMO PAPEL DE
00:00:07,914|00:00:08,997|ISDB|PAREDE, PARECE AT?QUE EU GOSTO
我收到了这段文字,我需要过滤掉这些重复的行和词。 我不知道有没有比我现在做的更好的方法。
00:00:00,413|03:50:25,600|ISDB|>> FALAM QUE A GENTE COMBINA
00:00:00,413|03:50:25,600|ISDB|PERFEITAMENTE. EU
00:00:01,135|00:00:01,315|ISDB|>> FALAM QUE A GENTE COMBINA
00:00:01,135|00:00:01,315|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:01,315|00:00:02,218|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:01,315|00:00:02,218|ISDB|BOBAS PARA
00:00:02,218|00:00:02,398|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:02,218|00:00:02,398|ISDB|BOBAS PARA AMIGOS
00:00:02,398|00:00:02,759|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:02,398|00:00:02,759|ISDB|BOBAS PARA AMIGOS E AO
00:00:02,759|00:00:03,274|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:02,759|00:00:03,274|ISDB|BOBAS PARA AMIGOS E AO INV?
00:00:03,274|00:00:04,357|ISDB|BOBAS PARA AMIGOS E AO INV?
00:00:03,274|00:00:04,357|ISDB|DISSO TROUXERAM ISSO A?
00:00:04,357|00:00:05,259|ISDB|BOBAS PARA AMIGOS E AO INV?
00:00:04,357|00:00:05,259|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:05,259|00:00:05,414|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:05,414|00:00:05,775|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:05,414|00:00:05,775|ISDB|COLOCARAM AS FOTOS
00:00:05,775|00:00:06,677|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:05,775|00:00:06,677|ISDB|COLOCARAM AS FOTOS COMO
00:00:06,677|00:00:06,858|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:06,677|00:00:06,858|ISDB|COLOCARAM AS FOTOS COMO PAPEL
00:00:06,858|03:50:32,400|ISDB|COLOCARAM AS FOTOS COMO PAPEL DE
00:00:06,858|03:50:32,400|ISDB|PAREDE, PARECE AT?QUE
00:00:07,914|00:00:07,916|ISDB|COLOCARAM AS FOTOS COMO PAPEL DE
00:00:07,914|00:00:07,916|ISDB|PAREDE, PARECE AT?QUE EU
00:00:07,914|00:00:08,997|ISDB|PAREDE, PARECE AT?QUE EU GOSTO
00:00:08,997|00:00:09,178|ISDB|PAREDE, PARECE AT?QUE EU GOSTO
我正在使用该代码将这些行放入 HashSet 中,这样它们就不会重复。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
/**
 * Copies a caption log to a new file, dropping every line whose message text
 * has already been written.
 *
 * <p>Each input line is expected to look like
 * {@code 00:00:01,135|00:00:01,315|ISDB|MESSAGE}: a 31-character
 * {@code start|end|ISDB|} prefix followed by the message. Only the message
 * takes part in the duplicate check, and the first line carrying a given
 * message is the one that is kept.
 */
public class Testecc {

    /** Width of the {@code start|end|ISDB|} prefix that precedes the message. */
    private static final int PREFIX_LENGTH = 31;

    /**
     * Reads {@code C://teste//teste1.txt}, echoes every line to standard output
     * and writes the lines with a not-yet-seen message to
     * {@code C://teste//teste.txt}.
     *
     * @param args unused
     * @throws Exception if the input cannot be read or the output cannot be written
     */
    public static void main(String args[]) throws Exception {
        String filePath = "C://teste//teste1.txt";

        // NOTE(review): FileReader/FileWriter use the JVM's default charset; accented
        // characters can be garbled if the file uses another encoding - confirm.
        // try-with-resources closes both files even when reading or writing fails.
        try (BufferedReader br = new BufferedReader(new FileReader(filePath));
             FileWriter writer = new FileWriter("C://teste//teste.txt")) {
            // Messages already written; Set.add returns false for a repeat.
            Set<String> seen = new HashSet<>();
            String line;
            // Single read loop: every line is examined, including the first ones.
            while ((line = br.readLine()) != null) {
                System.out.println(line);
                if (seen.add(messageOf(line))) {
                    writer.append(line).append("\n");
                }
            }
            writer.flush();
        }
        System.out.println("Pronto!");
    }

    /**
     * Returns the part of {@code line} used for duplicate detection.
     *
     * @param line one line of the log
     * @return the text after the fixed-width prefix, or the whole line when it is
     *     shorter than the prefix (so short or blank lines do not throw)
     */
    static String messageOf(String line) {
        return line.length() >= PREFIX_LENGTH ? line.substring(PREFIX_LENGTH) : line;
    }
}
有了这个,我删除了这样的重复行:
00:00:01,135|00:00:01,315|ISDB|>> FALAM QUE A GENTE COMBINA
00:00:01,135|00:00:01,315|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:01,315|00:00:02,218|ISDB|BOBAS PARA
00:00:02,218|00:00:02,398|ISDB|BOBAS PARA AMIGOS
00:00:02,398|00:00:02,759|ISDB|BOBAS PARA AMIGOS E AO
00:00:02,759|00:00:03,274|ISDB|BOBAS PARA AMIGOS E AO INV�S
00:00:03,274|00:00:04,357|ISDB|DISSO TROUXERAM ISSO A�.
00:00:04,357|00:00:05,259|ISDB|DISSO TROUXERAM ISSO A�. ELES
00:00:05,414|00:00:05,775|ISDB|COLOCARAM AS FOTOS
00:00:05,775|00:00:06,677|ISDB|COLOCARAM AS FOTOS COMO
00:00:06,677|00:00:06,858|ISDB|COLOCARAM AS FOTOS COMO PAPEL
00:00:06,858|03:50:32,400|ISDB|COLOCARAM AS FOTOS COMO PAPEL DE
00:00:06,858|03:50:32,400|ISDB|PAREDE, PARECE AT� QUE
00:00:07,914|00:00:07,916|ISDB|PAREDE, PARECE AT� QUE EU
00:00:07,914|00:00:08,997|ISDB|PAREDE, PARECE AT� QUE EU GOSTO
但我还需要删除重复的单词。
我真的没主意了。
我该怎么做?
您可以将每个日志行中最后一个竖线(|)之后的部分作为键,然后将每一行插入 LinkedHashMap,以删除重复项:
String filePath = "C:/log.txt";
// Maps message text (everything after the last '|') to the full log line.
// put() on an existing key keeps the key's original position but stores the
// newest line, so the last line of each group of duplicates is retained.
Map<String, String> logMap = new LinkedHashMap<>();
// try-with-resources closes the reader even if reading fails.
try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
    String input;
    // readLine() is called exactly once per iteration so no line is skipped.
    while ((input = br.readLine()) != null) {
        // lastIndexOf returns -1 when there is no '|', making the whole line the key.
        String key = input.substring(input.lastIndexOf('|') + 1);
        logMap.put(key, input);
    }
}
// Now print out the map minus duplicates
for (String line : logMap.values()) {
    System.out.println(line);
}
您可以轻松地将过滤后的日志写入另一个文件,而不是打印到控制台。请注意,此方法会保留每组重复项中的最后一行。
使用一个 Map,按特定的键对各行进行分组。键是消息的开头部分,从您感兴趣的单词开始,比如前 5 个字母。然后将这些行添加到 Map 中;如果某一行比之前为该键找到的行更长,则用它替换原来的行。
// For each distinct first word of the message (the text after the last '|'),
// keeps the longest line seen, then prints the kept lines.
try (BufferedReader br = new BufferedReader(new FileReader(filepath))) {
    final Map<String, String> map = new LinkedHashMap<>();
    br.lines().forEach(line -> {
        String message = line.substring(line.lastIndexOf("|") + 1);
        if (message.isEmpty()) {
            return;
        }
        String[] words = message.split(" ");
        if (words.length == 0) {
            // A message made only of spaces splits into an empty array
            // (trailing empty strings are dropped), so there is no key to use.
            return;
        }
        String key = words[0];
        String longest = map.get(key);
        if (longest == null) {
            map.put(key, line);
        } else if (longest.length() < line.length()) {
            // remove + put (rather than a plain put) moves the entry to the
            // end of the LinkedHashMap's iteration order.
            map.remove(key);
            map.put(key, line);
        }
    });
    map.forEach((k, v) -> System.out.println(v));
}
上面的代码会给你下面的输出。
00:00:00,413|03:50:25,600|ISDB|>> FALAM QUE A GENTE COMBINA
00:00:01,135|00:00:01,315|ISDB|PERFEITAMENTE. EU PEDI REVISTAS
00:00:02,759|00:00:03,274|ISDB|BOBAS PARA AMIGOS E AO INV?
00:00:04,357|00:00:05,259|ISDB|DISSO TROUXERAM ISSO A? ELES
00:00:06,858|03:50:32,400|ISDB|COLOCARAM AS FOTOS COMO PAPEL DE
00:00:07,914|00:00:08,997|ISDB|PAREDE, PARECE AT?QUE EU GOSTO