如何比较两个未排序且行略有不同的文本文件
How to compare two text files with unsorted, and slightly different lines
我有 2 个如下文件:
文件1
the quick brown fox jumps
jumps over the very lazy dog
brown fox jumps over the
lorem ipsum dolor
文件2
jumps over the very lazy *chicken*
brown fox jumps over the
the quick brown fox *swims*
an apple a day keeps the doctor away
我需要对两个文件进行 DIFF,并分别提取出每个文件中独有的行(即在另一个文件中不存在的行)。
但问题是:
- 两个文件中的所有行都未排序
- 行可能(或可能不)相同
- 比较行时,只有前四个单词有意义;从第五个单词开始的内容属于 "don't care"(不关心)。在上面的示例中,文件2 中带有 chicken 和 swims 的行被视为在文件1 中 "PRESENT"(已存在)。
因此,根据上述条件,预期输出为:
文件1
lorem ipsum dolor
文件 2
an apple a day keeps the doctor away
有人知道快速有效地进行 DIFF 的方法吗?(最好是最短的解决方案,并且输出易于阅读。)我尝试过用 excel 并排直观地比较两个文件,但是我要对大量(LOT)成对的日志文件执行此操作,全部做完需要很长时间。
如有更好的建议,我们将不胜感激。
谢谢并致以最诚挚的问候。
为什么不为这项工作编写一个小程序,让它在两个平台上都能工作?用一些与平台无关的 C 代码很容易做到:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* One input line together with the leading words that form its comparison key. */
struct Line
{
    char *line;         /* copy of the text as read; this is what main() prints */
    char *tokens;       /* second copy of the text, cut into words in place by strtok() */
    size_t nwords;      /* number of entries stored in words */
    const char **words; /* pointers to the leading words inside tokens */
};
typedef struct Line Line;
/*
 * Return a freshly allocated duplicate of the NUL-terminated string s.
 * The process exits with EXIT_FAILURE if the allocation fails; the caller
 * owns the result and releases it with free().
 */
char *copyString(const char *s)
{
    const size_t bytes = strlen(s) + 1; /* text plus its terminator */
    char *dup = malloc(bytes);

    if (dup == NULL)
    {
        exit(EXIT_FAILURE);
    }
    memcpy(dup, s, bytes);
    return dup;
}
/*
 * qsort()-compatible ordering for Line records.
 *
 * The stored words of both records are compared pairwise with strcmp(); the
 * first differing pair decides the result. When all shared positions are
 * equal, the record holding more words sorts after the shorter one.
 *
 * Returns a negative value, zero or a positive value.
 */
int compareLines(const void *a, const void *b)
{
    const Line *lhs = a;
    const Line *rhs = b;
    const size_t shared = lhs->nwords < rhs->nwords ? lhs->nwords : rhs->nwords;
    size_t idx = 0;

    while (idx < shared)
    {
        const int diff = strcmp(lhs->words[idx], rhs->words[idx]);
        if (diff != 0)
        {
            return diff;
        }
        ++idx;
    }

    /* Every shared position matched; only the word counts can still differ. */
    if (lhs->nwords != rhs->nwords)
    {
        return lhs->nwords > rhs->nwords ? 1 : -1;
    }
    return 0;
}
/*
 * Read every line of f into a newly allocated array of Line records and sort
 * the array with compareLines().
 *
 * For each line, up to wordCount leading words are recorded as the comparison
 * key, while an untouched copy of the text is kept in Line.line.
 *
 * linesptr   receives the array; release it with freeLines(). The stored
 *            pointer is non-NULL even when the file contains no lines.
 * f          stream opened for reading.
 * wordCount  maximum number of leading words recorded per line.
 *
 * Returns the number of records stored. Exits with EXIT_FAILURE when memory
 * cannot be allocated.
 *
 * Note: input is read through a 1024-byte buffer, so a longer line is split
 * into several records.
 */
size_t readFile(Line **linesptr, FILE *f, size_t wordCount)
{
    /* Line terminators count as separators too, so a key word reads the same
       whether or not it happens to be the last word on its line. */
    static const char separators[] = " \t\r\n";
    size_t cap = 256;
    size_t n = 0;
    char buf[1024];

    /* Refuse a word count whose pointer array size would overflow size_t. */
    if (wordCount > (size_t)-1 / sizeof(const char *)) exit(EXIT_FAILURE);

    Line *lines = malloc(cap * sizeof *lines);
    if (!lines) exit(EXIT_FAILURE);

    while (fgets(buf, sizeof buf, f))
    {
        if (n == cap)
        {
            cap *= 2;
            Line *grown = realloc(lines, cap * sizeof *lines);
            if (!grown) exit(EXIT_FAILURE);
            lines = grown;
        }

        Line *cur = &lines[n];
        cur->line = copyString(buf);
        cur->tokens = copyString(buf);

        /* Never request zero bytes: malloc(0) may return NULL without being
           out of memory. */
        cur->words = malloc((wordCount ? wordCount : 1) * sizeof *cur->words);
        if (!cur->words) exit(EXIT_FAILURE);

        size_t c = 0;
        char *word = strtok(cur->tokens, separators);
        while (word && c < wordCount)
        {
            cur->words[c++] = word;
            if (c == wordCount) break;
            word = strtok(NULL, separators);
        }
        cur->nwords = c;

        /* Trim the word array to what was used. A zero-size realloc is
           avoided (it may return NULL), and a failed shrink is harmless. */
        if (c > 0 && c < wordCount)
        {
            const char **fit = realloc(cur->words, c * sizeof *cur->words);
            if (fit) cur->words = fit;
        }
        ++n;
    }

    /* Keep the initial allocation for an empty file so the caller still gets
       a valid, freeable pointer. */
    if (n > 0)
    {
        Line *fit = realloc(lines, n * sizeof *lines);
        if (fit) lines = fit;
        qsort(lines, n, sizeof *lines, compareLines);
    }

    *linesptr = lines;
    return n;
}
/*
 * Release an array of n Line records: for each record its words array, its
 * tokens buffer and its line text, and finally the array itself.
 */
void freeLines(Line *lines, size_t n)
{
    size_t idx = 0;

    while (idx != n)
    {
        Line *cur = &lines[idx++];

        free(cur->words);
        free(cur->tokens);
        free(cur->line);
    }
    free(lines);
}
/* Print one unmatched line, prefixed by the name of the file it came from. */
static void reportUnique(const char *fileName, const char *text)
{
    size_t len = strlen(text);

    printf("%s: %s", fileName, text);
    /* A file's last line may lack its newline; add one so that every report
       occupies its own output line. */
    if (len == 0 || text[len - 1] != '\n') putchar('\n');
}

/* Return the first index at or after pos whose record does not compare equal
   to key. lines[0..count) must be sorted with compareLines(). */
static size_t skipKey(const Line *lines, size_t count, size_t pos, const Line *key)
{
    while (pos < count && compareLines(lines + pos, key) == 0) ++pos;
    return pos;
}

/*
 * Usage: finduniq n file1 file2
 *
 * Prints every line whose first n words do not occur as the first n words of
 * any line in the other file, each prefixed with the name of its file.
 *
 * Returns EXIT_SUCCESS after a completed comparison, EXIT_FAILURE on a usage
 * error or when a file cannot be opened.
 */
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        fprintf(stderr, "Usage: %s [n] [file1] [file2]\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* strtol() instead of atoi(): trailing garbage and non-positive values
       are rejected instead of silently turning into a bogus word count. */
    char *end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested <= 0 ||
        (unsigned long)requested > (size_t)-1 / sizeof(const char *))
    {
        fprintf(stderr, "%s: n must be a positive integer, got '%s'\n",
                argv[0], argv[1]);
        return EXIT_FAILURE;
    }
    size_t nwords = (size_t)requested;

    FILE *f1 = fopen(argv[2], "r");
    if (!f1)
    {
        perror(argv[2]);
        return EXIT_FAILURE;
    }
    FILE *f2 = fopen(argv[3], "r");
    if (!f2)
    {
        perror(argv[3]);
        fclose(f1);
        return EXIT_FAILURE;
    }

    Line *f1lines = NULL;
    size_t nf1lines = readFile(&f1lines, f1, nwords);
    Line *f2lines = NULL;
    size_t nf2lines = readFile(&f2lines, f2, nwords);
    fclose(f1);
    fclose(f2);
    if (!f1lines || !f2lines)
    {
        if (f1lines) freeLines(f1lines, nf1lines);
        if (f2lines) freeLines(f2lines, nf2lines);
        return EXIT_FAILURE;
    }

    /* Both arrays are sorted by key, so one merge pass finds the keys that
       appear on one side only. */
    size_t f1pos = 0;
    size_t f2pos = 0;
    while (f1pos < nf1lines && f2pos < nf2lines)
    {
        const Line *left = f1lines + f1pos;
        int cmp = compareLines(left, f2lines + f2pos);
        if (cmp < 0)
        {
            reportUnique(argv[2], left->line);
            ++f1pos;
        }
        else if (cmp > 0)
        {
            reportUnique(argv[3], f2lines[f2pos].line);
            ++f2pos;
        }
        else
        {
            /* The key exists in both files. Skip every line carrying it on
               both sides; otherwise a key repeated more often in one file
               would be reported as unique to that file. */
            f1pos = skipKey(f1lines, nf1lines, f1pos, left);
            f2pos = skipKey(f2lines, nf2lines, f2pos, left);
        }
    }
    while (f1pos < nf1lines)
    {
        reportUnique(argv[2], f1lines[f1pos++].line);
    }
    while (f2pos < nf2lines)
    {
        reportUnique(argv[3], f2lines[f2pos++].line);
    }

    freeLines(f1lines, nf1lines);
    freeLines(f2lines, nf2lines);
    return EXIT_SUCCESS;
}
如果你使用 gcc,可以用如下命令编译:
gcc -s -g0 -O2 -std=c11 -Wall -Wextra -pedantic -ofinduniq finduniq.c
演示:
$ ./finduniq 4 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test1.txt: lorem ipsum dolor
$ ./finduniq 6 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test2.txt: jumps over the very lazy *chicken*
test1.txt: jumps over the very lazy dog
test1.txt: lorem ipsum dolor
test2.txt: the quick brown fox *swims*
test1.txt: the quick brown fox jumps
$ diff file1 file2 | grep "<\|>" | sed -E 's/^(<|>) //g' | sort | uniq -w5 -u
其中 diff 比较文件 file1 和 file2,grep 和 sed 删除多余的行和符号,然后 sort 对字符串排序,uniq 输出唯一的字符串(-w5 只比较每行的前 5 个字符,尝试解决问题列表中的第 3 点)。
我有 2 个如下文件:
文件1
the quick brown fox jumps
jumps over the very lazy dog
brown fox jumps over the
lorem ipsum dolor
文件2
jumps over the very lazy *chicken*
brown fox jumps over the
the quick brown fox *swims*
an apple a day keeps the doctor away
我需要对两个文件进行 DIFF,并分别提取出每个文件中独有的行(即在另一个文件中不存在的行)。
但问题是:
- 两个文件中的所有行都未排序
- 行可能(或可能不)相同
- 比较行时,只有前四个单词有意义;从第五个单词开始的内容属于 "don't care"(不关心)。在上面的示例中,文件2 中带有 chicken 和 swims 的行被视为在文件1 中 "PRESENT"(已存在)。
因此,根据上述条件,预期输出为:
文件1
lorem ipsum dolor
文件 2
an apple a day keeps the doctor away
有人知道快速有效地进行 DIFF 的方法吗?(最好是最短的解决方案,并且输出易于阅读。)我尝试过用 excel 并排直观地比较两个文件,但是我要对大量(LOT)成对的日志文件执行此操作,全部做完需要很长时间。
如有更好的建议,我们将不胜感激。
谢谢并致以最诚挚的问候。
为什么不为这项工作编写一个小程序,让它在两个平台上都能工作?用一些与平台无关的 C 代码很容易做到:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* One input line together with the leading words that form its comparison key. */
struct Line
{
    char *line;         /* copy of the text as read; this is what main() prints */
    char *tokens;       /* second copy of the text, cut into words in place by strtok() */
    size_t nwords;      /* number of entries stored in words */
    const char **words; /* pointers to the leading words inside tokens */
};
typedef struct Line Line;
/*
 * Return a freshly allocated duplicate of the NUL-terminated string s.
 * The process exits with EXIT_FAILURE if the allocation fails; the caller
 * owns the result and releases it with free().
 */
char *copyString(const char *s)
{
    const size_t bytes = strlen(s) + 1; /* text plus its terminator */
    char *dup = malloc(bytes);

    if (dup == NULL)
    {
        exit(EXIT_FAILURE);
    }
    memcpy(dup, s, bytes);
    return dup;
}
/*
 * qsort()-compatible ordering for Line records.
 *
 * The stored words of both records are compared pairwise with strcmp(); the
 * first differing pair decides the result. When all shared positions are
 * equal, the record holding more words sorts after the shorter one.
 *
 * Returns a negative value, zero or a positive value.
 */
int compareLines(const void *a, const void *b)
{
    const Line *lhs = a;
    const Line *rhs = b;
    const size_t shared = lhs->nwords < rhs->nwords ? lhs->nwords : rhs->nwords;
    size_t idx = 0;

    while (idx < shared)
    {
        const int diff = strcmp(lhs->words[idx], rhs->words[idx]);
        if (diff != 0)
        {
            return diff;
        }
        ++idx;
    }

    /* Every shared position matched; only the word counts can still differ. */
    if (lhs->nwords != rhs->nwords)
    {
        return lhs->nwords > rhs->nwords ? 1 : -1;
    }
    return 0;
}
/*
 * Read every line of f into a newly allocated array of Line records and sort
 * the array with compareLines().
 *
 * For each line, up to wordCount leading words are recorded as the comparison
 * key, while an untouched copy of the text is kept in Line.line.
 *
 * linesptr   receives the array; release it with freeLines(). The stored
 *            pointer is non-NULL even when the file contains no lines.
 * f          stream opened for reading.
 * wordCount  maximum number of leading words recorded per line.
 *
 * Returns the number of records stored. Exits with EXIT_FAILURE when memory
 * cannot be allocated.
 *
 * Note: input is read through a 1024-byte buffer, so a longer line is split
 * into several records.
 */
size_t readFile(Line **linesptr, FILE *f, size_t wordCount)
{
    /* Line terminators count as separators too, so a key word reads the same
       whether or not it happens to be the last word on its line. */
    static const char separators[] = " \t\r\n";
    size_t cap = 256;
    size_t n = 0;
    char buf[1024];

    /* Refuse a word count whose pointer array size would overflow size_t. */
    if (wordCount > (size_t)-1 / sizeof(const char *)) exit(EXIT_FAILURE);

    Line *lines = malloc(cap * sizeof *lines);
    if (!lines) exit(EXIT_FAILURE);

    while (fgets(buf, sizeof buf, f))
    {
        if (n == cap)
        {
            cap *= 2;
            Line *grown = realloc(lines, cap * sizeof *lines);
            if (!grown) exit(EXIT_FAILURE);
            lines = grown;
        }

        Line *cur = &lines[n];
        cur->line = copyString(buf);
        cur->tokens = copyString(buf);

        /* Never request zero bytes: malloc(0) may return NULL without being
           out of memory. */
        cur->words = malloc((wordCount ? wordCount : 1) * sizeof *cur->words);
        if (!cur->words) exit(EXIT_FAILURE);

        size_t c = 0;
        char *word = strtok(cur->tokens, separators);
        while (word && c < wordCount)
        {
            cur->words[c++] = word;
            if (c == wordCount) break;
            word = strtok(NULL, separators);
        }
        cur->nwords = c;

        /* Trim the word array to what was used. A zero-size realloc is
           avoided (it may return NULL), and a failed shrink is harmless. */
        if (c > 0 && c < wordCount)
        {
            const char **fit = realloc(cur->words, c * sizeof *cur->words);
            if (fit) cur->words = fit;
        }
        ++n;
    }

    /* Keep the initial allocation for an empty file so the caller still gets
       a valid, freeable pointer. */
    if (n > 0)
    {
        Line *fit = realloc(lines, n * sizeof *lines);
        if (fit) lines = fit;
        qsort(lines, n, sizeof *lines, compareLines);
    }

    *linesptr = lines;
    return n;
}
/*
 * Release an array of n Line records: for each record its words array, its
 * tokens buffer and its line text, and finally the array itself.
 */
void freeLines(Line *lines, size_t n)
{
    size_t idx = 0;

    while (idx != n)
    {
        Line *cur = &lines[idx++];

        free(cur->words);
        free(cur->tokens);
        free(cur->line);
    }
    free(lines);
}
/* Print one unmatched line, prefixed by the name of the file it came from. */
static void reportUnique(const char *fileName, const char *text)
{
    size_t len = strlen(text);

    printf("%s: %s", fileName, text);
    /* A file's last line may lack its newline; add one so that every report
       occupies its own output line. */
    if (len == 0 || text[len - 1] != '\n') putchar('\n');
}

/* Return the first index at or after pos whose record does not compare equal
   to key. lines[0..count) must be sorted with compareLines(). */
static size_t skipKey(const Line *lines, size_t count, size_t pos, const Line *key)
{
    while (pos < count && compareLines(lines + pos, key) == 0) ++pos;
    return pos;
}

/*
 * Usage: finduniq n file1 file2
 *
 * Prints every line whose first n words do not occur as the first n words of
 * any line in the other file, each prefixed with the name of its file.
 *
 * Returns EXIT_SUCCESS after a completed comparison, EXIT_FAILURE on a usage
 * error or when a file cannot be opened.
 */
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        fprintf(stderr, "Usage: %s [n] [file1] [file2]\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* strtol() instead of atoi(): trailing garbage and non-positive values
       are rejected instead of silently turning into a bogus word count. */
    char *end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested <= 0 ||
        (unsigned long)requested > (size_t)-1 / sizeof(const char *))
    {
        fprintf(stderr, "%s: n must be a positive integer, got '%s'\n",
                argv[0], argv[1]);
        return EXIT_FAILURE;
    }
    size_t nwords = (size_t)requested;

    FILE *f1 = fopen(argv[2], "r");
    if (!f1)
    {
        perror(argv[2]);
        return EXIT_FAILURE;
    }
    FILE *f2 = fopen(argv[3], "r");
    if (!f2)
    {
        perror(argv[3]);
        fclose(f1);
        return EXIT_FAILURE;
    }

    Line *f1lines = NULL;
    size_t nf1lines = readFile(&f1lines, f1, nwords);
    Line *f2lines = NULL;
    size_t nf2lines = readFile(&f2lines, f2, nwords);
    fclose(f1);
    fclose(f2);
    if (!f1lines || !f2lines)
    {
        if (f1lines) freeLines(f1lines, nf1lines);
        if (f2lines) freeLines(f2lines, nf2lines);
        return EXIT_FAILURE;
    }

    /* Both arrays are sorted by key, so one merge pass finds the keys that
       appear on one side only. */
    size_t f1pos = 0;
    size_t f2pos = 0;
    while (f1pos < nf1lines && f2pos < nf2lines)
    {
        const Line *left = f1lines + f1pos;
        int cmp = compareLines(left, f2lines + f2pos);
        if (cmp < 0)
        {
            reportUnique(argv[2], left->line);
            ++f1pos;
        }
        else if (cmp > 0)
        {
            reportUnique(argv[3], f2lines[f2pos].line);
            ++f2pos;
        }
        else
        {
            /* The key exists in both files. Skip every line carrying it on
               both sides; otherwise a key repeated more often in one file
               would be reported as unique to that file. */
            f1pos = skipKey(f1lines, nf1lines, f1pos, left);
            f2pos = skipKey(f2lines, nf2lines, f2pos, left);
        }
    }
    while (f1pos < nf1lines)
    {
        reportUnique(argv[2], f1lines[f1pos++].line);
    }
    while (f2pos < nf2lines)
    {
        reportUnique(argv[3], f2lines[f2pos++].line);
    }

    freeLines(f1lines, nf1lines);
    freeLines(f2lines, nf2lines);
    return EXIT_SUCCESS;
}
如果你使用 gcc,可以用如下命令编译:
gcc -s -g0 -O2 -std=c11 -Wall -Wextra -pedantic -ofinduniq finduniq.c
演示:
$ ./finduniq 4 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test1.txt: lorem ipsum dolor
$ ./finduniq 6 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test2.txt: jumps over the very lazy *chicken*
test1.txt: jumps over the very lazy dog
test1.txt: lorem ipsum dolor
test2.txt: the quick brown fox *swims*
test1.txt: the quick brown fox jumps
$ diff file1 file2 | grep "<\|>" | sed -E 's/^(<|>) //g' | sort | uniq -w5 -u
其中 diff 比较文件 file1 和 file2,grep 和 sed 删除多余的行和符号,然后 sort 对字符串排序,uniq 输出唯一的字符串(-w5 只比较每行的前 5 个字符,尝试解决问题列表中的第 3 点)。