如何比较两个未排序且行略有不同的文本文件

Question

我有 2 个如下文件：

文件1

the quick brown fox jumps
jumps over the very lazy dog
brown fox jumps over the
lorem ipsum dolor

文件2

jumps over the very lazy *chicken*
brown fox jumps over the
the quick brown fox *swims*
an apple a day keeps the doctor away

我需要对这两个文件进行 DIFF，并分别提取出每个文件中独有的行（即在另一个文件中不存在的行）。

但问题是：

两个文件中的所有行都未排序
行可能（或可能不）相同
在比较行时，只有前四个单词有意义；从第五个单词开始都属于 "don't care"（无需关心）。在上面的示例中，文件2 中带有 chicken 和 swims 的行被视为在文件1 中 "PRESENT"（存在）。

因此，根据上述条件，预期输出为：

文件1

lorem ipsum dolor

文件 2

an apple a day keeps the doctor away

有人知道快速有效地进行这种 DIFF 的方法吗？（最好是最简短、输出易于阅读的解决方案）我尝试过用 Excel 并排目视比较两个文件。但是我要对大量（LOT）成对的日志文件执行此操作，全部做完需要很长时间。

如有更好的建议，我们将不胜感激。

谢谢并致以最诚挚的问候。

Answer 1

为什么不为这项工作编写一个小程序，让它在两个平台（Linux 和 Windows）上都能工作？用一些与平台无关的 C 代码很容易做到：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One input line together with the leading words used to compare it. */
typedef struct Line
{
    char *line;          /* heap copy of the text as read; printed verbatim */
    char *tokens;        /* second heap copy that strtok() splits in place */
    size_t nwords;       /* number of valid entries in words */
    const char **words;  /* heap array of pointers into tokens, one per word */
} Line;

/*
 * Returns a freshly malloc'ed duplicate of the NUL-terminated string s.
 * The caller owns the result and releases it with free().
 * If the allocation fails the process exits with EXIT_FAILURE.
 */
char *copyString(const char *s)
{
    const size_t bytes = strlen(s) + 1; /* include the terminating NUL */
    char *dup = malloc(bytes);

    if (dup == NULL)
    {
        exit(EXIT_FAILURE);
    }
    memcpy(dup, s, bytes);
    return dup;
}

/*
 * qsort-compatible comparator for two Line records.
 *
 * Words are compared pairwise with strcmp() over the positions that both
 * lines have; the first differing pair decides the order.  When all shared
 * positions are equal, the line holding more words sorts after the shorter
 * one, and lines with the same number of words compare equal.
 */
int compareLines(const void *a, const void *b)
{
    const Line *lhs = a;
    const Line *rhs = b;
    const size_t shared = (lhs->nwords < rhs->nwords) ? lhs->nwords : rhs->nwords;

    size_t idx = 0;
    while (idx < shared)
    {
        const int diff = strcmp(lhs->words[idx], rhs->words[idx]);
        if (diff != 0)
        {
            return diff;
        }
        ++idx;
    }

    /* Common prefix is identical: only the word counts can still differ. */
    if (lhs->nwords == rhs->nwords)
    {
        return 0;
    }
    return (lhs->nwords > rhs->nwords) ? 1 : -1;
}

/*
 * Reads every line of f into a newly allocated array of Line records, keeps
 * up to wordCount leading words of each line for comparison, and sorts the
 * array with compareLines().
 *
 * linesptr   receives the array; release it with freeLines().
 * f          stream that is read until fgets() stops returning data.
 * wordCount  maximum number of leading words stored per line.
 *
 * Returns the number of records stored in *linesptr (0 for an empty file, in
 * which case *linesptr is still a valid, non-NULL allocation).  Any
 * allocation failure terminates the process with EXIT_FAILURE.
 *
 * Lines longer than the internal buffer are split by fgets() and therefore
 * end up as several records.
 */
size_t readFile(Line **linesptr, FILE *f, size_t wordCount)
{
    /* Line terminators are separators too: otherwise the "\n" (or "\r\n")
     * kept by fgets() sticks to the last word of a line, and that word no
     * longer compares equal to the same word followed by more text. */
    static const char delims[] = " \t\r\n";

    size_t cap = 256;
    size_t n = 0;
    char buf[1024];

    Line *lines = malloc(cap * sizeof *lines);
    if (!lines) exit(EXIT_FAILURE);

    while (fgets(buf, sizeof buf, f))
    {
        if (n == cap)
        {
            cap *= 2;
            lines = realloc(lines, cap * sizeof *lines);
            if (!lines) exit(EXIT_FAILURE);
        }
        lines[n].line = copyString(buf);
        lines[n].tokens = copyString(buf);

        /* Never request zero elements: a zero-sized allocation may
         * legitimately return NULL and would be mistaken for a failure.
         * calloc() also rejects a count whose byte size would overflow. */
        lines[n].words = calloc(wordCount ? wordCount : 1, sizeof *lines[n].words);
        if (!lines[n].words) exit(EXIT_FAILURE);

        size_t c = 0;
        char *word = strtok(lines[n].tokens, delims);
        while (word && c < wordCount)
        {
            lines[n].words[c++] = word;
            if (c < wordCount) word = strtok(NULL, delims);
        }
        lines[n].nwords = c;

        /* Shrink to the words actually found.  Skipped for c == 0 (blank
         * line) because realloc(p, 0) may free p and return NULL. */
        if (c > 0)
        {
            lines[n].words = realloc(lines[n].words, c * sizeof *lines[n].words);
            if (!lines[n].words) exit(EXIT_FAILURE);
        }
        ++n;
    }

    /* Same zero-size concern for an empty file: keep the initial block. */
    if (n > 0)
    {
        lines = realloc(lines, n * sizeof *lines);
        if (!lines) exit(EXIT_FAILURE);
    }
    qsort(lines, n, sizeof *lines, compareLines);
    *linesptr = lines;
    return n;
}

/*
 * Releases an array of n Line records: for every record the words array,
 * the tokens buffer and the line buffer are freed, then the array itself.
 */
void freeLines(Line *lines, size_t n)
{
    Line *cur = lines;
    size_t remaining = n;

    while (remaining > 0)
    {
        free(cur->words);
        free(cur->tokens);
        free(cur->line);
        ++cur;
        --remaining;
    }
    free(lines);
}

/*
 * Usage: <program> n file1 file2
 *
 * Loads both files with readFile(), which returns them sorted by their first
 * n words, and walks the two sorted arrays in parallel.  A line is printed,
 * prefixed with its file name, when no line in the other file has the same
 * first n words.
 *
 * Returns EXIT_SUCCESS after a completed comparison, EXIT_FAILURE when the
 * arguments are invalid or a file cannot be opened or loaded.
 */
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        fprintf(stderr, "Usage: %s [n] [file1] [file2]\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* strtol instead of atoi: trailing garbage is detected, and zero or
     * negative counts are rejected before they are converted to size_t. */
    char *end = NULL;
    long nwords = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || nwords <= 0)
    {
        fprintf(stderr, "%s: n must be a positive integer, got '%s'\n",
                argv[0], argv[1]);
        return EXIT_FAILURE;
    }

    FILE *f1 = fopen(argv[2], "r");
    if (!f1)
    {
        perror(argv[2]);
        return EXIT_FAILURE;
    }
    FILE *f2 = fopen(argv[3], "r");
    if (!f2)
    {
        perror(argv[3]); /* report before fclose() can overwrite errno */
        fclose(f1);
        return EXIT_FAILURE;
    }

    Line *f1lines = NULL;
    size_t nf1lines = readFile(&f1lines, f1, (size_t)nwords);

    Line *f2lines = NULL;
    size_t nf2lines = readFile(&f2lines, f2, (size_t)nwords);

    /* Both files are fully in memory now; close them on every path. */
    fclose(f1);
    fclose(f2);

    if (!f1lines || !f2lines)
    {
        if (f1lines) freeLines(f1lines, nf1lines);
        if (f2lines) freeLines(f2lines, nf2lines);
        return EXIT_FAILURE;
    }

    size_t f1pos = 0;
    size_t f2pos = 0;

    while (f1pos < nf1lines && f2pos < nf2lines)
    {
        int cmp = compareLines(f1lines + f1pos, f2lines + f2pos);
        if (cmp < 0)
        {
            printf("%s: %s", argv[2], f1lines[f1pos++].line);
        }
        else if (cmp > 0)
        {
            printf("%s: %s", argv[3], f2lines[f2pos++].line);
        }
        else
        {
            /* The key exists in both files.  Skip every line carrying it in
             * either file, so that a key repeated in one file is not
             * reported as unique just because the other file has it fewer
             * times. */
            const Line *key = f1lines + f1pos;
            while (f2pos < nf2lines && compareLines(key, f2lines + f2pos) == 0)
            {
                ++f2pos;
            }
            while (f1pos < nf1lines && compareLines(key, f1lines + f1pos) == 0)
            {
                ++f1pos;
            }
        }
    }

    /* Whatever is left in either array has no counterpart in the other. */
    while (f1pos < nf1lines)
    {
        printf("%s: %s", argv[2], f1lines[f1pos++].line);
    }

    while (f2pos < nf2lines)
    {
        printf("%s: %s", argv[3], f2lines[f2pos++].line);
    }

    freeLines(f1lines, nf1lines);
    freeLines(f2lines, nf2lines);

    return EXIT_SUCCESS;
}

如果你使用 gcc，可以用例如下面的命令编译：

gcc -s -g0 -O2 -std=c11 -Wall -Wextra -pedantic -ofinduniq finduniq.c

演示：

$ ./finduniq 4 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test1.txt: lorem ipsum dolor

$ ./finduniq 6 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test2.txt: jumps over the very lazy *chicken*
test1.txt: jumps over the very lazy dog
test1.txt: lorem ipsum dolor
test2.txt: the quick brown fox *swims*
test1.txt: the quick brown fox jumps

Answer 2

 $ diff file1 file2 | grep "<\|>" | sed -E 's/^(<|>) //g' | sort | uniq -w5 -u

diff - 比较文件 file1 和 file2,

grep 和 sed 删除多余的行和符号，然后 sort 字符串，

uniq -u 只输出不重复的行（-w5 表示只比较每行的前 5 个字符，用来尝试处理问题所列条件中的第 3 条）

如何比较两个未排序且行略有不同的文本文件

How to compare two text files with unsorted, and slightly different lines

linux

windows

diff