比较字符串中的单词?

Comparison of words in a string?

是否有一个函数可以用来比较单词位置无关紧要的字符串?我的意思是“Aaron Jack Brussels”与“Brussels Aaron Jack”等相同

没有直接可用的函数,但您可以结合使用 strtok()(或其更实用的可重入版本 strtok_r()),以空格 " " 作为分隔符拆分字符串,然后使用 strcmp() 将每个子字符串与另一个字符串中的所有子字符串逐一比较。

我的示例非常粗糙(它假设两个字符串中的子字符串数量相同,并且只检查第一组中的单词是否出现在第二组中,没有做反向检查),但应该有助于让您走上正确的轨道。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/*
 * Demo: split two hard-coded strings into space-separated words and report,
 * for every word of the first string, whether it also occurs in the second.
 *
 * Returns EXIT_SUCCESS once the comparison has run, EXIT_FAILURE if memory
 * could not be allocated or a string holds fewer words than expected.
 */
int main() {
  char stringA[] = "Aaron Jack Brussels";
  char stringB[] = "Brussels Aaron Jack";
  int numSubStrings = 3;
  int status = EXIT_FAILURE;

  char *saveStrA, *saveStrB;

  char **stringSetA = malloc((size_t)numSubStrings * sizeof(*stringSetA));
  char **stringSetB = malloc((size_t)numSubStrings * sizeof(*stringSetB));
  if (stringSetA == NULL || stringSetB == NULL) {
    fprintf(stderr, "Out of memory\n");
    goto cleanup;
  }

  // extract first token of each to initialize strtok_r
  stringSetA[0] = strtok_r(stringA, " ", &saveStrA);
  stringSetB[0] = strtok_r(stringB, " ", &saveStrB);

  // loop through the strings to extract all other tokens
  for (int i = 1; i < numSubStrings; ++i) {
    stringSetA[i] = strtok_r(NULL, " ", &saveStrA);
    stringSetB[i] = strtok_r(NULL, " ", &saveStrB);
  }

  // strtok_r yields NULL when a string runs out of words; passing NULL to
  // strcmp is undefined behavior, so reject short inputs up front
  for (int i = 0; i < numSubStrings; ++i) {
    if (stringSetA[i] == NULL || stringSetB[i] == NULL) {
      fprintf(stderr, "Expected %d words in each string\n", numSubStrings);
      goto cleanup;
    }
  }

  for (int i = 0; i < numSubStrings; ++i) {
    bool found = false;
    for (int j = 0; j < numSubStrings; ++j) {
      // strcmp == 0 if both strings are equal
      if (strcmp(stringSetA[i], stringSetB[j]) == 0) {
        found = true;
        break;
      }
    }
    if (!found) printf("Both sets of strings aren't equal!\n");
    else printf("Found %s!\n", stringSetA[i]);
  }
  status = EXIT_SUCCESS;

cleanup:
  // free(NULL) is a no-op, so this is safe on every path
  free(stringSetB);
  free(stringSetA);
  return status;
}

没有任何标准函数可以接近您的目标。您需要编写特定的代码。您可以迭代一个字符串并搜索另一个字符串中的每个单词,反之亦然。

这是一个既不修改字符串也不分配任何内存的简单实现:

#include <stdio.h>
#include <string.h>

/*
 * Count the space-separated words of `str` that are exactly equal to the
 * `len` bytes starting at `w`.
 *
 * Runs of spaces are treated as a single separator; leading and trailing
 * spaces are ignored. Neither input is modified.
 */
int countword(const char *w, size_t len, const char *str) {
    size_t i;
    int count = 0;

    for (;;) {
        /* skip the spaces in front of the next word */
        while (*str == ' ')
            str++;
        if (!*str)
            return count;
        /* str[0] is a non-space character here, so a word is >= 1 byte */
        for (i = 1; str[i] && str[i] != ' '; i++)
            continue;
        if (i == len && !memcmp(w, str, len))
            count++;
        str += i;
    }
}

/*
 * Return 1 if `s1` and `s2` contain the same space-separated words with the
 * same multiplicities, in any order; return 0 otherwise.
 *
 * Every word of BOTH strings is checked. Walking only `s1` would miss words
 * that occur in `s2` alone, e.g. "Aaron Jack" vs "Aaron Jack Brussels".
 */
int samewords(const char *s1, const char *s2) {
    const char *inputs[2];
    int side;

    inputs[0] = s1;
    inputs[1] = s2;

    for (side = 0; side < 2; side++) {
        const char *p = inputs[side];

        for (;;) {
            const char *p0;
            size_t len;

            while (*p == ' ')
                p++;
            if (!*p)
                break;
            for (p0 = p++; *p && *p != ' '; p++)
                continue;
            len = (size_t)(p - p0);
            /* the word must occur equally often in both strings */
            if (countword(p0, len, s1) != countword(p0, len, s2))
                return 0;
        }
    }
    return 1;
}

/*
 * Demo driver: runs samewords() over a fixed table of string pairs and
 * prints the line attached to a pair whenever samewords() accepts it.
 * Only the first pair is expected to be accepted.
 */
int main() {
    static const struct {
        const char *first;
        const char *second;
        const char *line;
    } cases[] = {
        { "Aaron  Jack  Brussels", "Brussels Aaron Jack", "OK\n" },
        { "Aaron  Jack  Brussels", "AaronJackBrussels", "Not OK\n" },
        { "Aaron Jack", "Aaron Jack Jack", "Not OK\n" },
        { "Aaron Jack Brussels", "Aaron Jack", "Not OK\n" },
        { "John John Doe", "John Doe Doe", "Not OK\n" },
    };
    size_t k;

    for (k = 0; k < sizeof cases / sizeof cases[0]; k++) {
        if (samewords(cases[k].first, cases[k].second))
            fputs(cases[k].line, stdout);
    }
    return 0;
}

您可以使用 <string.h> 中的 strspn() 和 strcspn() 对其进行扩展,以处理多种分隔符,例如空格、制表符和换行符:

/*
 * Count the words of `str` that are exactly equal to the `len` bytes
 * starting at `w`. Words are separated by any run of space, tab, carriage
 * return or newline characters. Neither input is modified.
 */
int countword(const char *w, size_t len, const char *str) {
    const char *separators = " \t\r\n";
    size_t i;
    int count = 0;

    for (;;) {
        /* skip the separators in front of the next word */
        str += strspn(str, separators);
        if (!*str)
            return count;
        /* length of the word that starts at str */
        i = strcspn(str, separators);
        if (i == len && !memcmp(w, str, len))
            count++;
        str += i;
    }
}

/*
 * Return 1 if `s1` and `s2` contain the same whitespace-separated words with
 * the same multiplicities, in any order; return 0 otherwise.
 *
 * Every word of BOTH strings is checked. Walking only `s1` would miss words
 * that occur in `s2` alone, e.g. "Aaron Jack" vs "Aaron Jack Brussels".
 */
int samewords(const char *s1, const char *s2) {
    const char *separators = " \t\r\n";
    const char *inputs[2];
    int side;

    inputs[0] = s1;
    inputs[1] = s2;

    for (side = 0; side < 2; side++) {
        const char *p = inputs[side];

        for (;;) {
            const char *p0;
            size_t len;

            p += strspn(p, separators);
            if (!*p)
                break;
            p0 = p;
            len = strcspn(p0, separators);
            p += len;
            /* the word must occur equally often in both strings */
            if (countword(p0, len, s1) != countword(p0, len, s2))
                return 0;
        }
    }
    return 1;
}

注意:我用一个更通用的版本更新了答案,该版本可以处理重复的单词,例如 "John John Doe" <-> "John Doe Doe",以前的版本会错误地认为它们是等价的。

您真正要做的是创建两个(数学意义上的)字符串集合,并比较这两个集合。不幸的是,未排序的数组(例如字符串)不利于这种比较。在 GitHub 上寻找实现集合的 C 库可能是一个更好的起点(我认为 https://github.com/barrust/set 会满足您的需求)。

如果你真的需要一个简单的实现,这应该可以避免分配或字符串修改:

#include <stdbool.h>
#include <string.h>

/*
 * Outcome of comparing the words of two strings s1 and s2, as produced by
 * cmp_wordset():
 *   CMP_EQV   - every word of s1 occurs in s2 and every word of s2 in s1
 *   CMP_NEQV  - neither string's words all occur in the other
 *   CMP_SUB   - every word of s1 occurs in s2, but not the reverse
 *   CMP_SUPER - every word of s2 occurs in s1, but not the reverse
 */
enum cmp_result { CMP_EQV, CMP_NEQV, CMP_SUB, CMP_SUPER };

/* Reports whether every word in s1 is also in s2. */
bool is_word_subset(const char *s1, const char *s2);
/* Classifies how the words of s1 relate to the words of s2. */
enum cmp_result cmp_wordset(const char *s1, const char *s2);
/* Returns a pointer to the first space in s, or to its terminating NUL if
 * s contains no space. */
const char *get_word_end(const char *s);

/*
 * Classify how the words of s1 relate to the words of s2.
 *
 * Returns CMP_EQV when each string's words all occur in the other, CMP_SUB
 * when only s1's words all occur in s2, CMP_SUPER when only s2's words all
 * occur in s1, and CMP_NEQV otherwise.
 */
enum cmp_result cmp_wordset(const char *s1, const char *s2)
{
    /* both directions are always evaluated, first s1-in-s2, then s2-in-s1 */
    const bool first_in_second = is_word_subset(s1, s2);
    const bool second_in_first = is_word_subset(s2, s1);

    if (first_in_second) {
        return second_in_first ? CMP_EQV : CMP_SUB;
    }
    return second_in_first ? CMP_SUPER : CMP_NEQV;
}

// Checks if every word in s1 is also in s2.
//
// Words are maximal runs of non-space characters. Runs of spaces, as well as
// leading and trailing spaces, are skipped, so they never produce an empty
// "word" that would have to be matched. A string with no words is a subset
// of any string.
//
// Returns true if each word of s1 occurs at least once in s2, false
// otherwise. Neither string is modified.
bool is_word_subset(const char *s1, const char *s2) {
    static const char sep[] = " ";
    const char *word1 = s1 + strspn(s1, sep);

    while (*word1) {
        const size_t len1 = strcspn(word1, sep);
        const char *word2 = s2 + strspn(s2, sep);
        bool found = false;

        // scan s2 until the current word of s1 is found or s2 is exhausted
        while (*word2 && !found) {
            const size_t len2 = strcspn(word2, sep);

            // memcmp returns 0 when the two equally long words are identical
            found = (len1 == len2 && memcmp(word1, word2, len1) == 0);

            word2 += len2;
            word2 += strspn(word2, sep);
        }

        // a word of s1 that is missing from s2 means s1 is not a subset
        if (!found) {
            return false;
        }

        word1 += len1;
        word1 += strspn(word1, sep);
    }

    return true;
}

// Locates the end of the word that starts at s.
// Returns a pointer to the first space in s; when s holds no space, returns
// a pointer to its terminating NUL instead.
const char *get_word_end(const char *s) {
    const char *space = strchr(s, ' ');

    return space ? space : s + strlen(s);
}