C++ 中的 Rabin-Karp 算法
Rabin-Karp algorithm in c++
我正在尝试了解 Rabin-Karp 算法的实现。 d 是输入字母表中的字符数,但如果我替换 0 或任何其他值而不是 20,它不会影响任何东西。为什么会这样?
// Rabin-Karp algorithm in C++
#include <string.h>
#include <iostream>
using namespace std;
#define d 20
void rabinKarp(char pattern[], char text[], int q) {
int m = strlen(pattern);
int n = strlen(text);
int i, j;
int p = 0;
int t = 0;
int h = 1;
for (i = 0; i < m - 1; i++)
h = (h * d) % q;
// Calculate hash value for pattern and text
for (i = 0; i < m; i++) {
p = (d * p + pattern[i]) % q;
t = (d * t + text[i]) % q;
}
// Find the match
for (i = 0; i <= n - m; i++) {
if (p == t) {
for (j = 0; j < m; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == m)
cout << "Pattern is found at position: " << i + 1 << endl;
}
if (i < n - m) {
t = (d * (t - text[i] * h) + text[i + m]) % q;
if (t < 0)
t = (t + q);
}
}
}
int main() {
// char text[] = "ABCCDXAEFGX";
char text[] = "QWERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
char pattern[] = "KLXQW";
int q = 13;
rabinKarp(pattern, text, q);
}
我认为简短的回答是,d
越低,哈希冲突就越多,但无论如何您都要验证匹配,所以它不会影响任何东西。
有点冗长:
首先让我mod确认你的代码有更多的表达变量:
// Rabin-Karp algorithm in C++
#include <string.h>
#include <iostream>
using namespace std;
#define HASH_BASE 0
void rabinKarp(char pattern[], char text[], int inputBase) {
int patternLen = strlen(pattern);
int textLen = strlen(text);
int i, j; //predefined iterators
int patternHash = 0;
int textHash = 0;
int patternLenOut = 1;
for (i = 0; i < patternLen - 1; i++)
patternLenOut = (patternLenOut * HASH_BASE) % inputBase; // hash of pattern len
// Calculate hash value for pattern and text
for (i = 0; i < patternLen; i++) {
patternHash = (HASH_BASE * patternHash + pattern[i]) % inputBase;
textHash = (HASH_BASE * textHash + text[i]) % inputBase;
}
// Find the match
for (i = 0; i <= textLen - patternLen; i++) {
if (patternHash == textHash) {
for (j = 0; j < patternLen; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i + 1 << endl;
}
if (i < textLen - patternLen) {
textHash = (HASH_BASE * (textHash - text[i] * patternLenOut) + text[i + patternLen]) % inputBase;
if (textHash < 0)
textHash = (textHash + inputBase);
}
}
}
int main() {
// char text[] = "ABCCDXAEFGX";
char text[] = "QWEEERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
char pattern[] = "EE";
int q = 13;
rabinKarp(pattern, text, q);
}
攻击它的最简单方法是将 HASH_BASE
(以前的 d
)设置为零,看看我们可以在哪里简化。然后可以将 rabinKarp 函数简化为:
void rabinKarp(char pattern[], char text[], int inputBase) {
int patternLen = strlen(pattern);
int textLen = strlen(text);
int i, j; //predefined iterators
int patternHash = 0;
int textHash = 0;
int patternLenOut = 0;
// Calculate hash value for pattern and text
for (i = 0; i < patternLen; i++) {
patternHash = (pattern[i]) % inputBase;
textHash = (text[i]) % inputBase;
}
// Find the match
for (i = 0; i <= textLen - patternLen; i++) {
if (patternHash == textHash) {
for (j = 0; j < patternLen; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i + 1 << endl;
}
if (i < textLen - patternLen) {
textHash = (text[i + patternLen]) % inputBase;
if (textHash < 0)
textHash = (textHash + inputBase);
}
}
}
现在你会注意到所有的哈希值都是字母 mod 一些数字的总和(在你的例子中是 13,在我的例子中是 2)。这是一个糟糕的散列,意味着很多东西的总和将是相同的数字。但是,在这部分代码中:
if (patternHash == textHash) {
for (j = 0; j < patternLen; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i + 1 << endl;
}
如果哈希值匹配,您将逐个字母地明确检查匹配项。哈希函数越差,误报的频率就越高(这意味着函数的运行时间更长)。还有更多细节,但我相信这可以直接回答您的问题。可能有趣的是记录误报并查看误报率如何随着 d
和 q
的减少而增加。
我正在尝试了解 Rabin-Karp 算法的实现。 d 是输入字母表中的字符数,但如果我替换 0 或任何其他值而不是 20,它不会影响任何东西。为什么会这样?
// Rabin-Karp algorithm in C++
#include <string.h>
#include <iostream>
using namespace std;
#define d 20
void rabinKarp(char pattern[], char text[], int q) {
int m = strlen(pattern);
int n = strlen(text);
int i, j;
int p = 0;
int t = 0;
int h = 1;
for (i = 0; i < m - 1; i++)
h = (h * d) % q;
// Calculate hash value for pattern and text
for (i = 0; i < m; i++) {
p = (d * p + pattern[i]) % q;
t = (d * t + text[i]) % q;
}
// Find the match
for (i = 0; i <= n - m; i++) {
if (p == t) {
for (j = 0; j < m; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == m)
cout << "Pattern is found at position: " << i + 1 << endl;
}
if (i < n - m) {
t = (d * (t - text[i] * h) + text[i + m]) % q;
if (t < 0)
t = (t + q);
}
}
}
int main() {
// char text[] = "ABCCDXAEFGX";
char text[] = "QWERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
char pattern[] = "KLXQW";
int q = 13;
rabinKarp(pattern, text, q);
}
我认为简短的回答是,d
越低,哈希冲突就越多,但无论如何您都要验证匹配,所以它不会影响任何东西。
有点冗长:
首先让我mod确认你的代码有更多的表达变量:
// Rabin-Karp algorithm in C++
#include <string.h>
#include <iostream>
using namespace std;
#define HASH_BASE 0
void rabinKarp(char pattern[], char text[], int inputBase) {
int patternLen = strlen(pattern);
int textLen = strlen(text);
int i, j; //predefined iterators
int patternHash = 0;
int textHash = 0;
int patternLenOut = 1;
for (i = 0; i < patternLen - 1; i++)
patternLenOut = (patternLenOut * HASH_BASE) % inputBase; // hash of pattern len
// Calculate hash value for pattern and text
for (i = 0; i < patternLen; i++) {
patternHash = (HASH_BASE * patternHash + pattern[i]) % inputBase;
textHash = (HASH_BASE * textHash + text[i]) % inputBase;
}
// Find the match
for (i = 0; i <= textLen - patternLen; i++) {
if (patternHash == textHash) {
for (j = 0; j < patternLen; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i + 1 << endl;
}
if (i < textLen - patternLen) {
textHash = (HASH_BASE * (textHash - text[i] * patternLenOut) + text[i + patternLen]) % inputBase;
if (textHash < 0)
textHash = (textHash + inputBase);
}
}
}
int main() {
// char text[] = "ABCCDXAEFGX";
char text[] = "QWEEERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
char pattern[] = "EE";
int q = 13;
rabinKarp(pattern, text, q);
}
攻击它的最简单方法是将 HASH_BASE
(以前的 d
)设置为零,看看我们可以在哪里简化。然后可以将 rabinKarp 函数简化为:
void rabinKarp(char pattern[], char text[], int inputBase) {
int patternLen = strlen(pattern);
int textLen = strlen(text);
int i, j; //predefined iterators
int patternHash = 0;
int textHash = 0;
int patternLenOut = 0;
// Calculate hash value for pattern and text
for (i = 0; i < patternLen; i++) {
patternHash = (pattern[i]) % inputBase;
textHash = (text[i]) % inputBase;
}
// Find the match
for (i = 0; i <= textLen - patternLen; i++) {
if (patternHash == textHash) {
for (j = 0; j < patternLen; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i + 1 << endl;
}
if (i < textLen - patternLen) {
textHash = (text[i + patternLen]) % inputBase;
if (textHash < 0)
textHash = (textHash + inputBase);
}
}
}
现在你会注意到所有的哈希值都是字母 mod 一些数字的总和(在你的例子中是 13,在我的例子中是 2)。这是一个糟糕的散列,意味着很多东西的总和将是相同的数字。但是,在这部分代码中:
if (patternHash == textHash) {
for (j = 0; j < patternLen; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i + 1 << endl;
}
如果哈希值匹配,您将逐个字母地明确检查匹配项。哈希函数越差,误报的频率就越高(这意味着函数的运行时间更长)。还有更多细节,但我相信这可以直接回答您的问题。可能有趣的是记录误报并查看误报率如何随着 d
和 q
的减少而增加。