Horspool 算法

Boyer-Moore 算法的简化版本。每次从模式串末尾开始匹配,匹配失败的话根据转移表确定下一次匹配开始的位置。具体介绍可以看看 Horspool(字符串匹配)算法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
int Horspool(const char* src, const char* pattern) {
size_t n = strlen(src);
size_t m = strlen(pattern);
size_t table[m];
// 预处理转移表
for (size_t i = 0; i < m; i++) table[i] = m;
for (size_t i = 0; i < m-1; i++)
table[pattern[i]-'a'] = m-1-i;

size_t i = m-1;
while (i <= n-1) {
size_t k = 0;
while (k <= m-1 && pattern[m-1-k] == src[i-k]) k++;

if (k == m) return i-m+1;
else i += table[src[i]-'a'];
}
return -1;
}

Boyer-Moore 算法

该算法常用于文本编辑器中的搜索匹配功能,相关介绍以及实现来自这篇博客 从入门到精通之Boyer-Moore字符串搜索算法详解

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#include <stdio.h>
#include <string.h>

#define MAX_CHAR 256
#define SIZE 256
#define MAX(x, y) (x) > (y) ? (x) : (y)

void BoyerMoore(char *pattern, int m, char *text, int n);

int main()
{
char text[256], pattern[256];

while(1)
{
scanf("%s%s", text, pattern);
if(text == 0 || pattern == 0) break;

BoyerMoore(pattern, strlen(pattern), text, strlen(text));
printf("\n");
}

return 0;
}

void print(int *array, int n, char *arrayName)
{
int i;
printf("%s: ", arrayName);
for(i = 0; i < n; i++)
{
printf("%d ", array[i]);
}
printf("\n");
}

void PreBmBc(char *pattern, int m, int bmBc[])
{
int i;

for(i = 0; i < MAX_CHAR; i++)
{
bmBc[i] = m;
}

for(i = 0; i < m - 1; i++)
{
bmBc[pattern[i]] = m - 1 - i;
}

/* printf("bmBc[]: ");
for(i = 0; i < m; i++)
{
printf("%d ", bmBc[pattern[i]]);
}
printf("\n"); */
}

void suffix_old(char *pattern, int m, int suff[])
{
int i, j;

suff[m - 1] = m;

for(i = m - 2; i >= 0; i--)
{
j = i;
while(j >= 0 && pattern[j] == pattern[m - 1 - i + j]) j--;

suff[i] = i - j;
}
}

void suffix(char *pattern, int m, int suff[]) {
int f, g, i;

suff[m - 1] = m;
g = m - 1;
for (i = m - 2; i >= 0; --i) {
if (i > g && suff[i + m - 1 - f] < i - g)
suff[i] = suff[i + m - 1 - f];
else {
if (i < g)
g = i;
f = i;
while (g >= 0 && pattern[g] == pattern[g + m - 1 - f])
--g;
suff[i] = f - g;
}
}

// print(suff, m, "suff[]");
}

void PreBmGs(char *pattern, int m, int bmGs[])
{
int i, j;
int suff[SIZE];

// 计算后缀数组
suffix(pattern, m, suff);

// 先全部赋值为m,包含Case3
for(i = 0; i < m; i++)
{
bmGs[i] = m;
}

// Case2
j = 0;
for(i = m - 1; i >= 0; i--)
{
if(suff[i] == i + 1)
{
for(; j < m - 1 - i; j++)
{
if(bmGs[j] == m)
bmGs[j] = m - 1 - i;
}
}
}

// Case1
for(i = 0; i <= m - 2; i++)
{
bmGs[m - 1 - suff[i]] = m - 1 - i;
}

// print(bmGs, m, "bmGs[]");
}

void BoyerMoore(char *pattern, int m, char *text, int n)
{
int i, j, bmBc[MAX_CHAR], bmGs[SIZE];

// Preprocessing
PreBmBc(pattern, m, bmBc);
PreBmGs(pattern, m, bmGs);

// Searching
j = 0;
while(j <= n - m)
{
for(i = m - 1; i >= 0 && pattern[i] == text[i + j]; i--);
if(i < 0)
{
printf("Find it, the position is %d\n", j);
j += bmGs[0];
return;
}
else
{
j += MAX(bmBc[text[i + j]] - m + 1 + i, bmGs[i]);
}
}

printf("No find.\n");
}

Sunday 算法

一种比 BM 算法更快的查找算法。

Sunday 算法是从前往后匹配,在匹配失败时关注的是文本串中参加匹配的最末位字符的下一位字符。

  • 如果该字符没有在模式串中出现则直接跳过,即移动位数 = 匹配串长度 + 1;
  • 否则,其移动位数 = 模式串中最右端的该字符到末尾的距离 +1。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
int sunday(char *text, char *patt) {
size_t pre[256];
size_t *shift = pre;
size_t i, m = strlen(patt), n = strlen(text);
// init
for (i = 0; i < 256; i++) *(shift+i) = m+1;
for (i = 0; i < m; i++) *(shift + (unsigned char)(*(patt+i))) = m-i;
i = 0;
while (i <= n-m) {
if (memcmp(patt, text+i, m) == 0) return i;
i += shift[text[i+m]];
}
return -1;
}