void PreBmBc(char *pattern, int m, int bmBc[]) { int i; for (i = 0; i < MAX_CHAR; i++) bmBc[i] = m; for (i = 0; i < m - 1; i++) bmBc[pattern[i]] = m - 1 - i; /* printf(""bmBc[]:); for (i = 0; i < m; i++) printf("%d ",bmBc[pattern[i]]); printf("\n"); */ }
計算pattern需要右移的距離,要藉助bmBc數組,那么bmBc的值是不是就是pattern實際要右移的距離呢?No,想想也不是,比如前面舉例說到利用bmBc算法還可能走回頭路,也就是右移的距離是負數,而bmBc的值絕對不可能是負數,所以兩者不相等。那么pattern實際右移的距離怎么算呢?這個就要看text中壞字元的位置了,前面說過壞字元算法是針對text的,還是看圖吧,一目了然。圖中v是text中的壞字元(對應位置i+j),在pattern中對應不匹配的位置為i,那么pattern實際要右移的距離就是:bmBc['v'] - m + 1 + i。
好後綴bmGs[ ]
這裡bmGs[ ]的下標是數字而不是字元了,表示字元在pattern中位置。
如前所述,bmGs數組的計算分三種情況,與前一一對應。假設圖中好後綴長度用數組suff[ ]表示。
Case1:對應好後綴算法case1,如下圖,j是好後綴之前的那個位置。
Case2:對應好後綴算法case2:如下圖所示:
Case3:對應與好後綴算法case3,bmGs[i] = strlen(pattern)= m
這樣就清晰了,代碼編寫也比較簡單:
void PreBmGs(char *pattern, int m, int bmGs[]) { int i, j; int suff[SIZE]; // 計算後綴數組 suffix(pattern, m, suff); // 先全部賦值為m,包含Case3 for (i = 0; i < m; i++) bmGs[i] = m; // Case2 j = 0; for (i = m - 1; i >= 0; i--) { if (i + 1 == suff[i]) { for (;j < m - 1 - i; j++) { if (m == bmGs[j]) bmGs[j] = m - 1 - i; } } } // Case1 for (i = 0; i <= m - 2; i++) bmGs[m - 1 - suff[i]] = m - 1 - i; print(bmGs, m , "bmGs[]"); }
計算suff[ ]
在計算bmGc數組時,為提高效率,先計算輔助數組suff[ ]表示好後綴的長度。
suff數組的定義:m是pattern的長度
a. suffix[m-1] = m;
b. suffix[i] = k
for [ pattern[i-k+1] ....,pattern[i]] == [pattern[m-1-k+1],pattern[m-1]]
void suffix_old(char *pattern, int m, int suff[]) { int i, j; suff[m - 1] = m; for (i = m - 2; i >= 0; i--) { j = i; while (j >= 0 && pattern[j] == pattern[m -1 - i + j]) j--; suff[i] = i - j; } }
void suffix(char *pattern, int m, int suff[]) { int f, g, i; suff[m - 1] = m; g = m - 1; for (i = m - 2; i >= 0; --i) { if (i > g && suff[i + m - 1 - f] < i - g) suff[i] = suff[i + m - 1 - f]; else { if (i < g) g = i; f = i; while (g >= 0 && pattern[g] == pattern[g + m - 1 - f]) --g; suff[i] = f - g; } } //print(suff, m, "suff[]"); }
完整C代碼
#include <stdio.h> #include <string.h> #define MAX_CHAR 256 #define SIZE 256 #define MAX(x, y) (x) > (y) ? (x) : (y) void BoyerMoore(char *pattern, int m, char *text, int n); int main() { char text[256], pattern[256]; while (1) { puts("Please input the text and the pattern:(input Q to quit.)"); gets(text); if (!strcmp(text, "Q") || ! strcmp(text, "q")) break; gets(pattern); //printf("%s\n", text); //printf("%s\n", pattern); BoyerMoore(pattern, strlen(pattern), text, strlen(text)); printf("\n"); } return 0; } void print(int *array, int n, char *arrayName) { int i; printf("%s:", arrayName); for (i = 0; i < n; i++) printf("%d ", array[i]); printf("\n"); } void PreBmBc(char *pattern, int m, int bmBc[]) { int i; for (i = 0; i < MAX_CHAR; i++) bmBc[i] = m; for (i = 0; i < m - 1; i++) bmBc[pattern[i]] = m - 1 - i; /* printf(""bmBc[]:); for (i = 0; i < m; i++) printf("%d ",bmBc[pattern[i]]); printf("\n"); */ } void suffix_old(char *pattern, int m, int suff[]) { int i, j; suff[m - 1] = m; for (i = m - 2; i >= 0; i--) { j = i; while (j >= 0 && pattern[j] == pattern[m -1 - i + j]) j--; suff[i] = i - j; } } void suffix(char *pattern, int m, int suff[]) //改進計算suffix方法 { int f, g, i; suff[m - 1] = m; g = m - 1; for (i = m - 2; i >= 0; --i) { if (i > g && suff[i + m - 1 - f] < i - g) suff[i] = suff[i + m - 1 - f]; else { if (i < g) g = i; f = i; while (g >= 0 && pattern[g] == pattern[g + m - 1 - f]) --g; suff[i] = f - g; } } //print(suff, m, "suff[]"); } void PreBmGs(char *pattern, int m, int bmGs[]) { int i, j; int suff[SIZE]; // 計算後綴數組 suffix(pattern, m, suff); // 先全部賦值為m,包含Case3 for (i = 0; i < m; i++) bmGs[i] = m; // Case2 j = 0; for (i = m - 1; i >= 0; i--) { if (i + 1 == suff[i]) { for (;j < m - 1 - i; j++) { if (m == bmGs[j]) bmGs[j] = m - 1 - i; } } } // Case1 for (i = 0; i <= m - 2; i++) bmGs[m - 1 - suff[i]] = m - 1 - i; print(bmGs, m , "bmGs[]"); } void BoyerMoore(char *pattern, int m, char *text, int n) { int i, j, bmBc[MAX_CHAR], bmGs[SIZE]; // Preprocessing PreBmBc(pattern, m, bmBc); PreBmGs(pattern, m, bmGs); // Searching j = 0; while (j <= n - m) { for (i = m - 1; i >= 0 && pattern[i] == text[i + j]; i--); if (i < 0) { printf("Find it, the position is %d\n", j); j += bmGs[0]; return ; } else j += MAX(bmBc[text[i + j]] - m + 1 + i, bmGs[i]); } printf("No find.\n"); }