strstr 的实现失败,最后一个字
Implementation of strstr fails with the last word
我有以下 strstr 的实现
注意:此代码不是我的。
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
char *fast_strstr(const char *haystack, const char *needle)
{
if (!*needle) // Empty needle.
return (char *) haystack;
const char needle_first = *needle;
// Runs strchr() on the first section of the haystack as it has a lower
// algorithmic complexity for discarding the first non-matching characters.
haystack = strchr(haystack, needle_first);
if (!haystack) // First character of needle is not in the haystack.
return NULL;
// First characters of haystack and needle are the same now. Both are
// guaranteed to be at least one character long.
// Now computes the sum of the first needle_len characters of haystack
// minus the sum of characters values of needle.
const char *i_haystack = haystack + 1,
*i_needle = needle + 1;
unsigned int sums_diff = *haystack;
bool identical = true;
while (*i_haystack && *i_needle)
{
sums_diff += *i_haystack;
sums_diff -= *i_needle;
identical &= *i_haystack++ == *i_needle++;
}
// i_haystack now references the (needle_len + 1)-th character.
if (*i_needle) // haystack is smaller than needle.
return NULL;
else if (identical)
return (char *) haystack;
size_t needle_len = i_needle - needle;
size_t needle_len_1 = needle_len - 1;
// Loops for the remaining of the haystack, updating the sum iteratively.
const char *sub_start;
for (sub_start = haystack; *i_haystack; i_haystack++)
{
sums_diff -= *sub_start++;
sums_diff += *i_haystack;
// Since the sum of the characters is already known to be equal at that
// point, it is enough to check just needle_len-1 characters for
// equality.
if (
sums_diff == 0
&& needle_first == *sub_start // Avoids some calls to memcmp.
&& memcmp(sub_start, needle, needle_len_1) == 0
)
return (char *) sub_start;
}
return NULL;
}
int main(void)
{
char s[] = "this is a test";
char s2[] = "test";
if(fast_strstr(s, s2) != NULL)
puts("YES!");
else
puts("NOT!");
return 0;
}
这给出了当前条目的错误输出,其中 NOT! 而不是 YES!。这个问题只出现在最后一个单词上,但奇怪的是它适用于其他字符串,知道为什么会这样吗?
如果第一个 char
针与大海捞针中的 char
相匹配,但其余的不匹配,则代码失败。
尝试 fast_strstr("zhis is a test", "test")
而不是最后一个 return NULL;
,代码需要在 第一个匹配字母之后尝试大海捞针的其余部分 。接下来是递归解决方案,但肯定可以在函数内进行循环。
return fast_strstr(haystack+1, needle); // --> YES!
// return NULL;
某些输入的代码可能很快,但似乎是 O(n*n)
sum_diff
应初始化为 0,因为它们在第一个字符匹配时没有初始差异。
如果你在 GDB 中 运行 这个你会发现 sum_diff = 116
(这是 't'
的 ASCII 值)当它 returns 而不是 0.
unsigned int sums_diff = 0; // *haystack - *needle (which is 0)
这个 bug 会导致它在 needle 的第一个字符出现在字符串中较早的任何 haystack 时失败,并且不完全匹配,因为当您进入 for
循环并依赖 sum_diff
.
我有以下 strstr 的实现
注意:此代码不是我的。
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
char *fast_strstr(const char *haystack, const char *needle)
{
if (!*needle) // Empty needle.
return (char *) haystack;
const char needle_first = *needle;
// Runs strchr() on the first section of the haystack as it has a lower
// algorithmic complexity for discarding the first non-matching characters.
haystack = strchr(haystack, needle_first);
if (!haystack) // First character of needle is not in the haystack.
return NULL;
// First characters of haystack and needle are the same now. Both are
// guaranteed to be at least one character long.
// Now computes the sum of the first needle_len characters of haystack
// minus the sum of characters values of needle.
const char *i_haystack = haystack + 1,
*i_needle = needle + 1;
unsigned int sums_diff = *haystack;
bool identical = true;
while (*i_haystack && *i_needle)
{
sums_diff += *i_haystack;
sums_diff -= *i_needle;
identical &= *i_haystack++ == *i_needle++;
}
// i_haystack now references the (needle_len + 1)-th character.
if (*i_needle) // haystack is smaller than needle.
return NULL;
else if (identical)
return (char *) haystack;
size_t needle_len = i_needle - needle;
size_t needle_len_1 = needle_len - 1;
// Loops for the remaining of the haystack, updating the sum iteratively.
const char *sub_start;
for (sub_start = haystack; *i_haystack; i_haystack++)
{
sums_diff -= *sub_start++;
sums_diff += *i_haystack;
// Since the sum of the characters is already known to be equal at that
// point, it is enough to check just needle_len-1 characters for
// equality.
if (
sums_diff == 0
&& needle_first == *sub_start // Avoids some calls to memcmp.
&& memcmp(sub_start, needle, needle_len_1) == 0
)
return (char *) sub_start;
}
return NULL;
}
int main(void)
{
char s[] = "this is a test";
char s2[] = "test";
if(fast_strstr(s, s2) != NULL)
puts("YES!");
else
puts("NOT!");
return 0;
}
这给出了当前条目的错误输出,其中 NOT! 而不是 YES!。这个问题只出现在最后一个单词上,但奇怪的是它适用于其他字符串,知道为什么会这样吗?
如果第一个 char
针与大海捞针中的 char
相匹配,但其余的不匹配,则代码失败。
尝试 fast_strstr("zhis is a test", "test")
而不是最后一个 return NULL;
,代码需要在 第一个匹配字母之后尝试大海捞针的其余部分 。接下来是递归解决方案,但肯定可以在函数内进行循环。
return fast_strstr(haystack+1, needle); // --> YES!
// return NULL;
某些输入的代码可能很快,但似乎是 O(n*n)
sum_diff
应初始化为 0,因为它们在第一个字符匹配时没有初始差异。
如果你在 GDB 中 运行 这个你会发现 sum_diff = 116
(这是 't'
的 ASCII 值)当它 returns 而不是 0.
unsigned int sums_diff = 0; // *haystack - *needle (which is 0)
这个 bug 会导致它在 needle 的第一个字符出现在字符串中较早的任何 haystack 时失败,并且不完全匹配,因为当您进入 for
循环并依赖 sum_diff
.