/* TRIPOS Pattern Matching algorithm with extensions */ /* * Original BCPL version by Martin Richards * This is a transliteration of the BCPL code with additions * by Pete Goodeve 87:8:10 * * This version has: * simple repetition ("*") (alternative to "#?") * negation ("~" and "||") * slicing ("^") */ /* * Reference: * * Martin Richards, * "A Compact Function for Regular Expression * Pattern Matching" * [Software--Practice and Experience, Vol 9, 527-534 (1979)] */ /* don't #define DEBUG 1 */ /* ...there are large hunks of optional trace code in this version */ #include /* Code bits returned by Pattern Compiler: */ /* this bit always set unless pattern is bad: */ #define CODE_OK 1 /* the following bits will not be set if pattern is bad: */ /* set if any single-wild-match characters present ("?"): */ #define CODE_ANY 2 /* set if any multiple=-wild-match characters present ("*"): */ #define CODE_ALL 4 /* set if any grouping characters present ("()"): */ #define CODE_GROUP 8 /* set if pattern has alternations ("|"): */ #define CODE_ALT 16 /* set if pattern has repeating segments ("#" or "*"): */ #define CODE_REP 32 /* set if pattern has negation ("~" or "||"): */ #define CODE_NEG 64 /* set if pattern is sliced ("^"): */ #define CODE_SLICE 128 /* set if more than MAXMARK slices or if used within parentheses */ #define CODE_OVERSLICE CODE_SLICE | 256 /* Pattern Control Characters are defined here so you can change them * easily. * You may undefine PAT_ALL or PAT_SLICE to remove the sections of code * that provide these functions; the other definitions may be changed but * not removed. */ #define PAT_QUOTE '\'' #define PAT_LGROUP '(' #define PAT_RGROUP ')' #define PAT_ALT '|' #define PAT_REP '#' #define PAT_ANY '?' #define PAT_ALL '*' #define PAT_NEG '~' /* -- note double PAT_ALT is also always a negation code */ #define PAT_NULL '%' #define PAT_SLICE '^' #define Endstreamch 0xFF #define WORKSIZE 128 /*** Static Variables: ***/ UBYTE *Work = NULL; int Wp = 0, Succflag = FALSE; char *Pat = 0; UBYTE *Aux = 0; UBYTE Ch = 0; int PatP = 0, Patlen = 0, Errflag = FALSE, NegP = 0; UBYTE *NWork = NULL; #ifdef PAT_SLICE -------------------------v #define MAXMARK 4 #define MAXCUT 16 struct MarkSet {UBYTE mk[MAXMARK];}; UBYTE *Svect = NULL; int cutlimit; struct MarkSet MarkP, *MWork; #endif -----------------------------------^ int S, StrLength; #ifdef DEBUG ==============================V int DEBmode = TRUE; /* set this false for no printout when DEBUG defined */ int ii; /* local variable -- defined here for convenience */ #endif ====================================^ /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ /*** The Interpreter: ***/ #ifdef PAT_SLICE --------------------------v SMatch(Pat, Aux, Str, Slice) UBYTE *Aux; char *Pat, *Str, *Slice; #else ------------------------------------ Match(Pat, Aux, Str) UBYTE *Aux; char *Pat, *Str; #endif ------------------------------------^ { UBYTE W[WORKSIZE]; UBYTE NegW[WORKSIZE]; /* for negation */ #ifdef PAT_SLICE --------------------------v struct MarkSet MarkW[WORKSIZE]; /* for slice marking */ #endif ------------------------------------^ register int N, I; register UBYTE P, Q; char K; StrLength = strlen(Str); S = 0; Pat -= 1; /* align into "BCPL" form (byte 0 never accessed) */ Work = W; *Work = 0; /* remote possibility of first Put failing otherwise */ NWork = NegW; #ifdef PAT_SLICE ---------------------------v MWork = MarkW; for (I=0; I < MAXMARK; I++) { MWork->mk[I] = MarkP.mk[I] = 0; } if (Svect = Slice) /* allowed to be NULL */ *Svect = 0; cutlimit = MAXCUT; #endif -------------------------------------^ Wp = 0; Succflag = FALSE; NegP = 0; Put(1); if (!(Aux[0]==0)) Put(Aux[0]); do { for (N=1; N <= Wp; N++) { /* first complete the closure: */ P = Work[N]; NegP = NWork[N] & 1; /* propagate low bit only */ #ifdef PAT_SLICE ---------------------------v MarkP = MWork[N]; #endif -------------------------------------^ K = Pat[P]; Q = Aux[P]; switch(K) { case PAT_REP: Put(P+1); #ifdef PAT_ALL -----------------------------v case PAT_ALL: #endif -------------------------------------^ case PAT_NULL: Put(Q); default: break; case PAT_LGROUP: case PAT_ALT: if (Q != 0) Put(Q); if (Pat[P+1] == '|') { /* Negative alternate */ NegP = 1; P++; } Put(P+1); break; case PAT_NEG: Put(Q); NegP = 1; if (!(NWork[N] & 2)) Put(P+1); break; #ifdef PAT_SLICE ----------------------------v case PAT_SLICE: Putslice(P, S); Put(Q); #endif --------------------------------------^ } } #ifdef DEBUG ====================================V if (DEBmode) { printf("\nSucc=%d Closure Vector (len=%d):\n", Succflag, Wp); for (ii=1; ii<=Wp; ii++) printf("%3d", Work[ii]); putchar('\n'); for (ii=1; ii<=Wp; ii++) printf("%3d", NWork[ii]); puts("\n====="); } #endif ==========================================^ if (S >= StrLength) return (Succflag > 0); if (Wp == 0) return FALSE; Ch = Str[S++]; #ifdef DEBUG ====================================V if (DEBmode) printf("Matching character %c:\n",Ch); #endif ==========================================^ /* now deal with match items: */ N = Wp; Wp = 0; Succflag = FALSE; for (I = 1; I <= N; I++) { P = Work[I]; NegP = NWork[I]; #ifdef PAT_SLICE ------------------------------v MarkP = MWork[I]; #endif ----------------------------------------^ K = Pat[P]; switch(K) { case PAT_NEG: NegP |= 2; Put(P); #ifdef PAT_SLICE ------------------------------v case PAT_SLICE: #endif ----------------------------------------^ case PAT_REP: case PAT_ALT: case PAT_NULL: case PAT_LGROUP: break; /* BCPL was 'loop' */ case PAT_QUOTE: K = Pat[P+1]; default: if (Ch != K) break; /* 'loop' */ case PAT_ANY: /* successful match */ Put(Aux[P]); break; /* 'loop' */ #ifdef PAT_ALL --------------------------------v case PAT_ALL: Put(P); /* point back to self...*/ break; /* 'loop' */ #endif ----------------------------------------^ } } #ifdef DEBUG ======================================V if (DEBmode) { printf("\nSucc=%d Match Vector (len=%d):\n", Succflag, Wp); for (ii=1; ii<=Wp; ii++) printf("%3d", Work[ii]); putchar('\n'); for (ii=1; ii<=Wp; ii++) printf("%3d", NWork[ii]); puts("\n====="); } #endif ============================================^ } while (TRUE); } /*** end Match ***/ Put(N) int N; { UBYTE *W, *Wlim, *NW; #ifdef DEBUG ======================================V if (DEBmode) #ifdef PAT_SLICE ------------------------------v printf("Put %d[%d, %d/%d] ", N, NegP, MarkP.mk[0], MarkP.mk[1]); #else ---------------------------------------- printf("Put %d[%d] ", N, NegP); #endif ----------------------------------------^ #endif ============================================^ if (N == 0) { Succflag |= NegP ? -1 : 1; #ifdef PAT_SLICE ------------------------------v if (S == StrLength) RetSlice(); /* matched -- pass back slice info */ #endif ----------------------------------------^ } else { W = Work; Wlim = W + Wp; NW = NWork; for(;W <= Wlim; W++, NW++) if ((*W == N)&&(*NW == NegP)) return; *W = N; *NW = NegP; Wp = W - Work; #ifdef PAT_SLICE ------------------------------v MWork[Wp] = MarkP; #endif ----------------------------------------^ if (Wp >= WORKSIZE) exit(33); } } #ifdef PAT_SLICE -----------------------------------------------v Putslice(P, S) int P, S; { int i; UBYTE *MW; #ifdef DEBUG ======================================V if (DEBmode) printf("Slice ^%d=%d ", P, S); #endif ============================================^ for (i = 0, MW = (UBYTE *)MWork; imk, *mpp = MarkP.mk, lastcut = 0; if (!Svect) return; /* ignore if NULL */ #ifdef DEBUG ======================================V if (DEBmode) printf("RetSlice.. Svect = %d\n",Svect); #endif ============================================^ for (i=MAXMARK; i && *mwp && cutlimit; i--, cutlimit--) if(*mpp >= lastcut) { /* avoid supefluous cuts */ *Svect++ = *mwp++; lastcut = *Svect++ = *mpp++; } *Svect = 0; } #endif ---------------------------------------------------------^ /*********************************************************************/ /*** The Compiler: ***/ int retcode, grouplevel, slicecount; Rch() { if (PatP >= Patlen) Ch = Endstreamch; else { Ch = Pat[++PatP]; } } Nextitem() { if (Ch == PAT_QUOTE) Rch(); Rch(); } int Prim(NegMark) int NegMark; { int A = PatP; UBYTE Op = Ch; if (NegMark) retcode |= CODE_NEG; Nextitem(); switch(Op) { case PAT_ANY: retcode |= CODE_ANY; return A; case PAT_ALL: retcode |= CODE_ALL; return A; case Endstreamch: case PAT_RGROUP: case PAT_ALT: Errflag = TRUE; default: return A; case PAT_REP: Setexits(Prim(NegMark), A); retcode |= CODE_REP; return A; case PAT_LGROUP: grouplevel++; A = Exp(A, (NegMark ? 1 : FALSE)); grouplevel--; if (Ch != PAT_RGROUP) Errflag = TRUE; retcode |= CODE_GROUP; Nextitem(); return A; case PAT_NEG: if (NegMark) Errflag = TRUE; NegMark = 1; retcode |= CODE_NEG; Join(A, Prim(NegMark)); NegMark = FALSE; return A; #ifdef PAT_SLICE ------------------------------v case PAT_SLICE: retcode |= (++slicecount > MAXMARK || grouplevel ) ? CODE_OVERSLICE : CODE_SLICE; return A; #endif ----------------------------------------^ } } int Exp(AltP, NegMark) int AltP, NegMark; { int Exits = 0, A; do { A = Prim(NegMark); if (Ch == PAT_ALT || Ch == PAT_RGROUP || Ch == Endstreamch) { Exits = Join(Exits, A); if (Ch != PAT_ALT) return Exits; retcode |= CODE_ALT; Aux[AltP] = PatP; AltP = PatP; Nextitem(); if (Ch == PAT_ALT) { /* negation convention */ if (NegMark == 1) Errflag = TRUE; NegMark = -1; /* not an error to meet oneself */ retcode |= CODE_NEG; Nextitem(); } else NegMark = FALSE; } else Setexits(A, PatP); } while (TRUE); } Setexits(List, Val) int List, Val; { int A; while (List != 0) { A = Aux[List]; Aux[List] = Val; List = A; } } int Join(A, B) int A, B; { int T = A; if (A == 0) return B; while (Aux[A] != 0) A = Aux[A]; Aux[A] = B; return T; } int CmplPat(Pattern, CmplPattern) char *Pattern, *CmplPattern; { int I; Pat = Pattern - 1; Aux = CmplPattern; Patlen = strlen(Pattern); /** this is no longer a BSTR!!! **/ PatP = 0; Errflag = FALSE; retcode = CODE_OK; grouplevel = slicecount = 0; for (I = 0; I <= Patlen; I++) Aux[I] = 0; Rch(); Setexits(Exp(0,FALSE), 0); return Errflag ? 0 : retcode; }