比较字符串中的单词?
Comparison of words in a string?
是否有一个函数可以用来比较单词位置无关紧要的字符串?我的意思是“Aaron Jack Brussels”与“Brussels Aaron Jack”等相同
没有直接可用的函数,但您可以使用 strtok()(或更实用的可重入版本 strtok_r())以空格 " " 作为分隔符拆分字符串,然后用 strcmp() 将一个字符串的每个子字符串与另一个字符串的所有子字符串逐一比较。
我的示例非常粗糙(它假设两个字符串集中的子字符串数量相同,并且只检查一组与另一组),但应该有助于让您走上正确的轨道。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
/*
 * Demo: tokenize two space-separated strings with strtok_r() and report,
 * for every token of the first string, whether it also occurs in the second.
 *
 * This is deliberately rough: it expects numSubStrings tokens in each string
 * and only checks the first set against the second, not the other way round.
 *
 * Returns 0 on success, EXIT_FAILURE if the token tables cannot be allocated.
 */
int main(void) {
    /* strtok_r() writes NUL bytes into its input, so use writable arrays. */
    char stringA[] = "Aaron Jack Brussels";
    char stringB[] = "Brussels Aaron Jack";
    int numSubStrings = 3;
    char *saveStrA, *saveStrB;

    char **stringSetA = malloc(numSubStrings * sizeof(*stringSetA));
    char **stringSetB = malloc(numSubStrings * sizeof(*stringSetB));
    if (stringSetA == NULL || stringSetB == NULL) {
        fprintf(stderr, "Out of memory\n");
        free(stringSetA);
        free(stringSetB);
        return EXIT_FAILURE;
    }

    /* The first call receives the string itself, later calls receive NULL. */
    for (int i = 0; i < numSubStrings; ++i) {
        stringSetA[i] = strtok_r(i == 0 ? stringA : NULL, " ", &saveStrA);
        stringSetB[i] = strtok_r(i == 0 ? stringB : NULL, " ", &saveStrB);
    }

    for (int i = 0; i < numSubStrings; ++i) {
        bool found = false;

        /* A NULL entry means the string had fewer tokens than expected;
         * never hand it to strcmp(), just treat it as "not found". */
        if (stringSetA[i] != NULL) {
            for (int j = 0; j < numSubStrings; ++j) {
                /* strcmp == 0 if both strings are equal */
                if (stringSetB[j] != NULL &&
                    strcmp(stringSetA[i], stringSetB[j]) == 0) {
                    found = true;
                    break;
                }
            }
        }

        if (!found) {
            printf("Both sets of strings aren't equal!\n");
        } else {
            printf("Found %s!\n", stringSetA[i]);
        }
    }

    /* The tables only hold pointers into stringA/stringB, so freeing the
     * tables themselves is all the cleanup required. */
    free(stringSetA);
    free(stringSetB);
    return 0;
}
没有任何标准函数可以接近您的目标。您需要编写特定的代码。您可以迭代一个字符串并搜索另一个字符串中的每个单词,反之亦然。
这是一个既不修改字符串也不分配任何内存的简单实现:
#include <stdio.h>
#include <string.h>
/*
 * Count how many space-separated words of `str` are equal to the word made
 * of the first `len` bytes of `w`.
 *
 * Only the space character separates words; runs of spaces are skipped.
 * Neither string is modified and no memory is allocated.
 */
int countword(const char *w, size_t len, const char *str) {
    int matches = 0;
    const char *cur = str;

    /* position on the first word, if any */
    while (*cur == ' ') {
        cur++;
    }
    while (*cur != '\0') {
        /* cur[0] is known to be a word character, so start scanning at cur + 1 */
        const char *end = cur + 1;
        while (*end != '\0' && *end != ' ') {
            end++;
        }
        if ((size_t)(end - cur) == len && memcmp(w, cur, len) == 0) {
            matches++;
        }
        /* move on to the start of the next word */
        cur = end;
        while (*cur == ' ') {
            cur++;
        }
    }
    return matches;
}
/*
 * Return 1 if `s1` and `s2` consist of the same space-separated words with
 * the same multiplicities, in any order; return 0 otherwise.
 *
 * The check runs in both directions: first every word of s1 is counted in
 * both strings, then every word of s2.  Walking s1 alone would miss words
 * that occur only in s2 (e.g. "Aaron Jack" vs "Aaron Jack Brussels").
 * Neither string is modified and no memory is allocated.
 */
int samewords(const char *s1, const char *s2) {
    const char *from = s1;
    const char *against = s2;

    for (int pass = 0; pass < 2; pass++) {
        const char *p = from;
        for (;;) {
            while (*p == ' ')
                p++;
            if (!*p)
                break;
            /* p0 marks the word start; p ends up one past its last character */
            const char *p0 = p++;
            while (*p && *p != ' ')
                p++;
            size_t len = (size_t)(p - p0);
            if (countword(p0, len, from) != countword(p0, len, against))
                return 0;
        }
        /* second pass: swap roles so words unique to s2 are detected too */
        from = s2;
        against = s1;
    }
    return 1;
}
/*
 * Small driver for samewords(): runs a fixed list of string pairs and prints
 * a line for every pair that samewords() reports as equivalent.  Only the
 * first pair is expected to match ("OK"); any "Not OK" line means a pair
 * was accepted that should have been rejected.
 */
int main() {
    static const struct {
        const char *left;
        const char *right;
        int expect_same;   /* non-zero when the pair is supposed to match */
    } cases[] = {
        { "Aaron Jack Brussels", "Brussels Aaron Jack", 1 },
        { "Aaron Jack Brussels", "AaronJackBrussels",   0 },
        { "Aaron Jack",          "Aaron Jack Jack",     0 },
        { "Aaron Jack Brussels", "Aaron Jack",          0 },
        { "John John Doe",       "John Doe Doe",        0 },
    };

    for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
        if (!samewords(cases[i].left, cases[i].right)) {
            continue;   /* nothing is printed for pairs reported as different */
        }
        if (cases[i].expect_same) {
            printf("OK\n");
        } else {
            printf("Not OK\n");
        }
    }
    return 0;
}
您可以使用 <string.h> 中的 strspn() 和 strcspn() 对其进行扩展,以处理多种分隔符,例如空格、制表符和换行符:
/*
 * Count how many words of `str` are equal to the word made of the first
 * `len` bytes of `w`.
 *
 * Words are separated by any run of space, tab, carriage return or newline.
 * Neither string is modified and no memory is allocated.
 */
int countword(const char *w, size_t len, const char *str) {
    static const char blanks[] = " \t\r\n";
    int matches = 0;

    /* strspn() skips separators, strcspn() measures the word that follows */
    const char *cur = str + strspn(str, blanks);
    while (*cur != '\0') {
        size_t span = strcspn(cur, blanks);
        if (span == len && memcmp(w, cur, len) == 0) {
            matches++;
        }
        cur += span;
        cur += strspn(cur, blanks);
    }
    return matches;
}
/*
 * Return 1 if `s1` and `s2` consist of the same words with the same
 * multiplicities, in any order; return 0 otherwise.  Words are separated by
 * any run of space, tab, carriage return or newline.
 *
 * The check runs in both directions: first every word of s1 is counted in
 * both strings, then every word of s2.  Walking s1 alone would miss words
 * that occur only in s2 (e.g. "Aaron Jack" vs "Aaron Jack Brussels").
 * Neither string is modified and no memory is allocated.
 */
int samewords(const char *s1, const char *s2) {
    const char *separators = " \t\r\n";
    const char *from = s1;
    const char *against = s2;

    for (int pass = 0; pass < 2; pass++) {
        const char *p = from;
        for (;;) {
            p += strspn(p, separators);
            if (!*p)
                break;
            /* p0 marks the word start; p ends up one past its last character */
            const char *p0 = p;
            p += strcspn(p, separators);
            size_t len = (size_t)(p - p0);
            if (countword(p0, len, from) != countword(p0, len, against))
                return 0;
        }
        /* second pass: swap roles so words unique to s2 are detected too */
        from = s2;
        against = s1;
    }
    return 1;
}
注意:我已将答案更新为一个更通用的版本,它可以正确处理重复的单词,例如 "John John Doe" <-> "John Doe Doe";以前的版本会错误地认为这两个字符串是等价的。
您真正要做的是创建两个(数学意义上的)字符串集合,并比较这两个集合。不幸的是,未排序的数组(例如字符串)不利于这种比较。在 github 上寻找实现集合的 C 库可能是一个更好的起点(我认为 https://github.com/barrust/set 会满足您的需求)。
如果你真的需要一个简单的实现,这应该可以避免分配或字符串修改:
#include <stdbool.h>
#include <string.h>
// Result of comparing the word sets of two strings s1 and s2 (see cmp_wordset):
//   CMP_EQV   - each string's words all occur in the other
//   CMP_NEQV  - neither string's words all occur in the other
//   CMP_SUB   - every word of s1 occurs in s2, but not the other way round
//   CMP_SUPER - every word of s2 occurs in s1, but not the other way round
enum cmp_result { CMP_EQV, CMP_NEQV, CMP_SUB, CMP_SUPER };
// true if every word in s1 is also in s2
bool is_word_subset(const char *s1, const char *s2);
// classifies how the word sets of s1 and s2 relate to each other
enum cmp_result cmp_wordset(const char *s1, const char *s2);
// returns a pointer to the first space in s, or to its terminating '\0'
const char *get_word_end(const char *s);
// Classifies the relationship between the words of s1 and the words of s2:
// CMP_EQV when each is a word-subset of the other, CMP_SUB when only s1 is a
// word-subset of s2, CMP_SUPER when only s2 is a word-subset of s1, and
// CMP_NEQV when neither is.
enum cmp_result cmp_wordset(const char *s1, const char *s2)
{
    const bool first_in_second = is_word_subset(s1, s2);
    const bool second_in_first = is_word_subset(s2, s1);

    if ( first_in_second ) {
        return second_in_first ? CMP_EQV : CMP_SUB;
    }
    return second_in_first ? CMP_SUPER : CMP_NEQV;
}
// checks if every word in s1 is also in s2
//
// Words are separated by spaces. Leading, trailing and repeated spaces are
// skipped, so they never produce empty "words"; a string without any word
// (empty or only spaces) is therefore a subset of every string.
// Returns true when each word of s1 occurs at least once in s2.
bool is_word_subset(const char *s1, const char *s2) {
    const char *word1 = s1;

    for (;;) {
        // move to the next word of s1; running out of words means success
        word1 += strspn(word1, " ");
        if ( *word1 == '\0' ) {
            return true;
        }
        size_t len1 = strcspn(word1, " ");

        // look for the current word of s1 among the words of s2
        bool found = false;
        const char *word2 = s2;
        for (;;) {
            word2 += strspn(word2, " ");
            if ( *word2 == '\0' ) {
                break;
            }
            size_t len2 = strcspn(word2, " ");
            // memcmp returns 0 when the two words hold the same bytes
            if ( len1 == len2 && memcmp(word1, word2, len1) == 0 ) {
                found = true;
                break;
            }
            word2 += len2;
        }

        // a word of s1 that is missing from s2 means s1 is not a subset
        if ( !found ) {
            return false;
        }
        word1 += len1;
    }
}
// Returns a pointer to the character that follows the word starting at s:
// the first space found in s, or the terminating '\0' when s contains no
// space (which is the case for the last word of a string).
const char *get_word_end(const char *s) {
    // strcspn counts the leading characters of s that are not a space
    return s + strcspn(s, " ");
}
是否有一个函数可以用来比较单词位置无关紧要的字符串?我的意思是“Aaron Jack Brussels”与“Brussels Aaron Jack”等相同
没有直接可用的函数,但您可以使用 strtok()(或更实用的可重入版本 strtok_r())以空格 " " 作为分隔符拆分字符串,然后用 strcmp() 将一个字符串的每个子字符串与另一个字符串的所有子字符串逐一比较。
我的示例非常粗糙(它假设两个字符串集中的子字符串数量相同,并且只检查一组与另一组),但应该有助于让您走上正确的轨道。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
/*
 * Demo: tokenize two space-separated strings with strtok_r() and report,
 * for every token of the first string, whether it also occurs in the second.
 *
 * This is deliberately rough: it expects numSubStrings tokens in each string
 * and only checks the first set against the second, not the other way round.
 *
 * Returns 0 on success, EXIT_FAILURE if the token tables cannot be allocated.
 */
int main(void) {
    /* strtok_r() writes NUL bytes into its input, so use writable arrays. */
    char stringA[] = "Aaron Jack Brussels";
    char stringB[] = "Brussels Aaron Jack";
    int numSubStrings = 3;
    char *saveStrA, *saveStrB;

    char **stringSetA = malloc(numSubStrings * sizeof(*stringSetA));
    char **stringSetB = malloc(numSubStrings * sizeof(*stringSetB));
    if (stringSetA == NULL || stringSetB == NULL) {
        fprintf(stderr, "Out of memory\n");
        free(stringSetA);
        free(stringSetB);
        return EXIT_FAILURE;
    }

    /* The first call receives the string itself, later calls receive NULL. */
    for (int i = 0; i < numSubStrings; ++i) {
        stringSetA[i] = strtok_r(i == 0 ? stringA : NULL, " ", &saveStrA);
        stringSetB[i] = strtok_r(i == 0 ? stringB : NULL, " ", &saveStrB);
    }

    for (int i = 0; i < numSubStrings; ++i) {
        bool found = false;

        /* A NULL entry means the string had fewer tokens than expected;
         * never hand it to strcmp(), just treat it as "not found". */
        if (stringSetA[i] != NULL) {
            for (int j = 0; j < numSubStrings; ++j) {
                /* strcmp == 0 if both strings are equal */
                if (stringSetB[j] != NULL &&
                    strcmp(stringSetA[i], stringSetB[j]) == 0) {
                    found = true;
                    break;
                }
            }
        }

        if (!found) {
            printf("Both sets of strings aren't equal!\n");
        } else {
            printf("Found %s!\n", stringSetA[i]);
        }
    }

    /* The tables only hold pointers into stringA/stringB, so freeing the
     * tables themselves is all the cleanup required. */
    free(stringSetA);
    free(stringSetB);
    return 0;
}
没有任何标准函数可以接近您的目标。您需要编写特定的代码。您可以迭代一个字符串并搜索另一个字符串中的每个单词,反之亦然。
这是一个既不修改字符串也不分配任何内存的简单实现:
#include <stdio.h>
#include <string.h>
/*
 * Count how many space-separated words of `str` are equal to the word made
 * of the first `len` bytes of `w`.
 *
 * Only the space character separates words; runs of spaces are skipped.
 * Neither string is modified and no memory is allocated.
 */
int countword(const char *w, size_t len, const char *str) {
    int matches = 0;
    const char *cur = str;

    /* position on the first word, if any */
    while (*cur == ' ') {
        cur++;
    }
    while (*cur != '\0') {
        /* cur[0] is known to be a word character, so start scanning at cur + 1 */
        const char *end = cur + 1;
        while (*end != '\0' && *end != ' ') {
            end++;
        }
        if ((size_t)(end - cur) == len && memcmp(w, cur, len) == 0) {
            matches++;
        }
        /* move on to the start of the next word */
        cur = end;
        while (*cur == ' ') {
            cur++;
        }
    }
    return matches;
}
/*
 * Return 1 if `s1` and `s2` consist of the same space-separated words with
 * the same multiplicities, in any order; return 0 otherwise.
 *
 * The check runs in both directions: first every word of s1 is counted in
 * both strings, then every word of s2.  Walking s1 alone would miss words
 * that occur only in s2 (e.g. "Aaron Jack" vs "Aaron Jack Brussels").
 * Neither string is modified and no memory is allocated.
 */
int samewords(const char *s1, const char *s2) {
    const char *from = s1;
    const char *against = s2;

    for (int pass = 0; pass < 2; pass++) {
        const char *p = from;
        for (;;) {
            while (*p == ' ')
                p++;
            if (!*p)
                break;
            /* p0 marks the word start; p ends up one past its last character */
            const char *p0 = p++;
            while (*p && *p != ' ')
                p++;
            size_t len = (size_t)(p - p0);
            if (countword(p0, len, from) != countword(p0, len, against))
                return 0;
        }
        /* second pass: swap roles so words unique to s2 are detected too */
        from = s2;
        against = s1;
    }
    return 1;
}
/*
 * Small driver for samewords(): runs a fixed list of string pairs and prints
 * a line for every pair that samewords() reports as equivalent.  Only the
 * first pair is expected to match ("OK"); any "Not OK" line means a pair
 * was accepted that should have been rejected.
 */
int main() {
    static const struct {
        const char *left;
        const char *right;
        int expect_same;   /* non-zero when the pair is supposed to match */
    } cases[] = {
        { "Aaron Jack Brussels", "Brussels Aaron Jack", 1 },
        { "Aaron Jack Brussels", "AaronJackBrussels",   0 },
        { "Aaron Jack",          "Aaron Jack Jack",     0 },
        { "Aaron Jack Brussels", "Aaron Jack",          0 },
        { "John John Doe",       "John Doe Doe",        0 },
    };

    for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
        if (!samewords(cases[i].left, cases[i].right)) {
            continue;   /* nothing is printed for pairs reported as different */
        }
        if (cases[i].expect_same) {
            printf("OK\n");
        } else {
            printf("Not OK\n");
        }
    }
    return 0;
}
您可以使用 <string.h> 中的 strspn() 和 strcspn() 对其进行扩展,以处理多种分隔符,例如空格、制表符和换行符:
/*
 * Count how many words of `str` are equal to the word made of the first
 * `len` bytes of `w`.
 *
 * Words are separated by any run of space, tab, carriage return or newline.
 * Neither string is modified and no memory is allocated.
 */
int countword(const char *w, size_t len, const char *str) {
    static const char blanks[] = " \t\r\n";
    int matches = 0;

    /* strspn() skips separators, strcspn() measures the word that follows */
    const char *cur = str + strspn(str, blanks);
    while (*cur != '\0') {
        size_t span = strcspn(cur, blanks);
        if (span == len && memcmp(w, cur, len) == 0) {
            matches++;
        }
        cur += span;
        cur += strspn(cur, blanks);
    }
    return matches;
}
/*
 * Return 1 if `s1` and `s2` consist of the same words with the same
 * multiplicities, in any order; return 0 otherwise.  Words are separated by
 * any run of space, tab, carriage return or newline.
 *
 * The check runs in both directions: first every word of s1 is counted in
 * both strings, then every word of s2.  Walking s1 alone would miss words
 * that occur only in s2 (e.g. "Aaron Jack" vs "Aaron Jack Brussels").
 * Neither string is modified and no memory is allocated.
 */
int samewords(const char *s1, const char *s2) {
    const char *separators = " \t\r\n";
    const char *from = s1;
    const char *against = s2;

    for (int pass = 0; pass < 2; pass++) {
        const char *p = from;
        for (;;) {
            p += strspn(p, separators);
            if (!*p)
                break;
            /* p0 marks the word start; p ends up one past its last character */
            const char *p0 = p;
            p += strcspn(p, separators);
            size_t len = (size_t)(p - p0);
            if (countword(p0, len, from) != countword(p0, len, against))
                return 0;
        }
        /* second pass: swap roles so words unique to s2 are detected too */
        from = s2;
        against = s1;
    }
    return 1;
}
注意:我已将答案更新为一个更通用的版本,它可以正确处理重复的单词,例如 "John John Doe" <-> "John Doe Doe";以前的版本会错误地认为这两个字符串是等价的。
您真正要做的是创建两个(数学意义上的)字符串集合,并比较这两个集合。不幸的是,未排序的数组(例如字符串)不利于这种比较。在 github 上寻找实现集合的 C 库可能是一个更好的起点(我认为 https://github.com/barrust/set 会满足您的需求)。
如果你真的需要一个简单的实现,这应该可以避免分配或字符串修改:
#include <stdbool.h>
#include <string.h>
// Result of comparing the word sets of two strings s1 and s2 (see cmp_wordset):
//   CMP_EQV   - each string's words all occur in the other
//   CMP_NEQV  - neither string's words all occur in the other
//   CMP_SUB   - every word of s1 occurs in s2, but not the other way round
//   CMP_SUPER - every word of s2 occurs in s1, but not the other way round
enum cmp_result { CMP_EQV, CMP_NEQV, CMP_SUB, CMP_SUPER };
// true if every word in s1 is also in s2
bool is_word_subset(const char *s1, const char *s2);
// classifies how the word sets of s1 and s2 relate to each other
enum cmp_result cmp_wordset(const char *s1, const char *s2);
// returns a pointer to the first space in s, or to its terminating '\0'
const char *get_word_end(const char *s);
// Classifies the relationship between the words of s1 and the words of s2:
// CMP_EQV when each is a word-subset of the other, CMP_SUB when only s1 is a
// word-subset of s2, CMP_SUPER when only s2 is a word-subset of s1, and
// CMP_NEQV when neither is.
enum cmp_result cmp_wordset(const char *s1, const char *s2)
{
    const bool first_in_second = is_word_subset(s1, s2);
    const bool second_in_first = is_word_subset(s2, s1);

    if ( first_in_second ) {
        return second_in_first ? CMP_EQV : CMP_SUB;
    }
    return second_in_first ? CMP_SUPER : CMP_NEQV;
}
// checks if every word in s1 is also in s2
//
// Words are separated by spaces. Leading, trailing and repeated spaces are
// skipped, so they never produce empty "words"; a string without any word
// (empty or only spaces) is therefore a subset of every string.
// Returns true when each word of s1 occurs at least once in s2.
bool is_word_subset(const char *s1, const char *s2) {
    const char *word1 = s1;

    for (;;) {
        // move to the next word of s1; running out of words means success
        word1 += strspn(word1, " ");
        if ( *word1 == '\0' ) {
            return true;
        }
        size_t len1 = strcspn(word1, " ");

        // look for the current word of s1 among the words of s2
        bool found = false;
        const char *word2 = s2;
        for (;;) {
            word2 += strspn(word2, " ");
            if ( *word2 == '\0' ) {
                break;
            }
            size_t len2 = strcspn(word2, " ");
            // memcmp returns 0 when the two words hold the same bytes
            if ( len1 == len2 && memcmp(word1, word2, len1) == 0 ) {
                found = true;
                break;
            }
            word2 += len2;
        }

        // a word of s1 that is missing from s2 means s1 is not a subset
        if ( !found ) {
            return false;
        }
        word1 += len1;
    }
}
// Returns a pointer to the character that follows the word starting at s:
// the first space found in s, or the terminating '\0' when s contains no
// space (which is the case for the last word of a string).
const char *get_word_end(const char *s) {
    // strcspn counts the leading characters of s that are not a space
    return s + strcspn(s, " ");
}