基本介紹
- 中文名:KMP算法
- 外文名:The Knuth-Morris-Pratt Algorithm
- 輸入:正文串T[1,n]和模式串W[1,m]
- 輸出:匹配結果match[1,n]
- 發現者:D.E.Knuth等
- 時間複雜度:O(m+n)
基本思想
j | 0 | 1 | 2 | 3 | 4 | 5 |
W[j] | a | b | a | c | a | b |
F(j) | 0 | 0 | 1 | 0 | 1 | 2 |
串匹配算法
int KMP(string W,string T){ int i=1,j=1; while(i<=n){ while (j!=0&&W[j]!=T[i]) j=next[j]; if(j==m) return i-m+1;//匹配成功,返回匹配位置s else{j++; i++;} } return -1;//匹配失敗}
procedureKMP begin i=1 j=1 while i<=n do while j<>0 and W[j]<>T[i] do j=newnext[j] endwhile if j=m return "success" else j++ i++ endif endwhile return "failure"end
next和newnext
void getNext(string W){ for(int i=1; i<m; i++){ int j=i; while(j>0){ j=next[j]; if(W[j]==W[i]){ next[i+1]=j+1; break; } } }}
function NEXT begin next[1]=newnext[1]=0 j=2 while j<=m do i=next[j-1] while i<>0 and W[i]<>W[j-1]) do i=next[i] endwhile next[j]=i+1 j=j+1 endwhile endfunction NEWNEXT begin newnext[1]=0 j=2 while j<=m do i=next(j) if i=0 or W[j]<>W[i+1] newnext[j]=i else newnext[j]=newnext[i] endif j++ endwhile end
void GetNext(charT[],intnext[]){ next[1]=0; j=1;k=0; while(j<=strlen(T)) //此處應包含string頭檔案 if((k==0)||(T[j]==T[k])){ next[j]=k; j++; k++; } else k=next[k];}
BM串匹配
procedure BM begin i=m Repeat j=m k=i while(j>0)and(w[j]=t[k])do j=j-1 k=k-1 endwhile i=i+d[t[i]] Until(j=0)or(i>n) If j=0 return "SUCCESS" else return“FAILURE” endif end
d函式
functiond:integer;beginforx∈∑dod(x)=mforj=m-1downto1doifd(w[j])=md(w[j]):=m-jendforendxi+1=ord(ti+1)dm-1+ord(ti+2)dm-2+…+ord(ti+m)=(xi-ord(ti)dm-1).d+ord(ti+m)
RK串匹配
programRK;begin{計算x,x:=d↑(m-1)modq}x=1fori=1tom-1dox=(32*x)modq{計算模式W的散列函式值}s=0fori=1tomdos=((s*32)+ord(w[i]))modq{計算正文T的第一個長度為m的字元段的散列函式值}t=0fori=1tomdot=(t*32+ord(w[i]))modq{如果正文的第一個長度為m的字元段和模式有相同的散列函式值,則進行匹配檢查.否則,以及在匹配檢查失敗情況下,繼續計算下一個字元段的散列函式值}i=1whilei<=n-mdoifs=t{進行匹配檢查}k=1j=iwhile(t[j]=w[k])and(k<=m)doj=j+1k=k+1endwhileifi<n-m{計算下一字元段的散列函式值}t=((t-x*ord(t[i]))*32+ord(t[i+m]))modqi=i+1endifendifendwhilereturn“FAILURE”end
最佳化
最佳化思路
下標i | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
p(i) | a | b | c | d | a | a | b | c | a | b |
next[i] | -1 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 3 | 1 |
下標i | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
p(i) | a | b | c | d | a | a | b | c | a | b |
next[i] | -1 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 3 | 1 |
最佳化的next[i] | -1 | 0 | 0 | 0 | -1 | 1 | 0 | 0 | 3 | 0 |
void getNext(const char*pattern,int next[]){ next[0]=-1; int k=-1,j=0; while(pattern[j]!='\0'){ while (k!=-1 && pattern[k]!=pattern[j]) k=next[k]; ++j; ++k; if(pattern[k]==pattern[j]) next[j]=next[k]; else next[j]=k; } }
C++ 原始碼
#include<iostream>#include<stdlib.h>#include<vector>using namespace std;inline void NEXT(const string&T, vector<int>&next){//按模式串生成vector,next(T.size()) next[0] = -1; for (int i = 1; i<T.size(); i++){ int j = next[i - 1]; while (j >= 0 && T[i - 1] != T[j]) j = next[j];//遞推計算 if (j >= 0 && T[i - 1] == T[j]) next[i] = j + 1; else next[i] = 0; }}inline string::size_type COUNT_KMP(const string&S, const string&T){ //利用模式串T的next函式求T在主串S中的個數count的KMP算法 //其中T非空, vector<int>next(T.size()); NEXT(T, next); string::size_type index, count = 0; for (index = 0; index<S.size(); ++index){ int pos = 0; string::size_type iter = index; while (pos<T.size() && iter<S.size()){ if (S[iter] == T[pos]){ ++iter; ++pos; } else{ if (pos == 0) ++iter; else pos = next[pos - 1] + 1; } } if (pos == T.size() && (iter - index) == T.size()) ++count; } return count;}int main(int argc, char*argv[]){ string S="abaabcacabaabcacabaabcacabaabcacabaabcac"; string T="ab"; //cin >> S; //cin >> T; string::size_type count = COUNT_KMP(S, T); cout << count << endl; system("PAUSE"); return 0;}
Pascal 原始碼
const MAX_STRLEN=255;var next:array[1..MAX_STRLEN] of longint; str_s,str_t:string; int_i:longint;procedure get_next(t:string); var j,k:longint; begin j:=1; k:=0; while j<length(t) do begin if (k=0) or (t[j]=t[k]) then begin inc(j); inc(k); next[j]:=k; end else k:=next[k]; end; end;function kmp(s,t:string):longint; var i,j,back:longint; begin get_next(t); back:=0; i:=1; j:=1; while (i<=length(s)) and (j<=length(t)) do begin if (j=0) or (s[i]=t[j]) then begin inc(i); inc(j); end else j:=next[j]; if j>length(t) then back:=i-length(t); end; exit(back); end;begin write('S='); readln(str_s); write('T='); readln(str_t); int_i:=kmp(str_s,str_t); if int_i<>0 then writeln('Found ',str_t,' in ',str_s,' at ',int_i,'.') else writeln('Cannot found ',str_t,' in ',str_s,'.')end.