/* * Lexical Analysis Functions Takes input stream and returns "tokens". * Implements pre-processor, allowing #include's and #define's tokens are * character strings. */ #include #include #include #include #include "listmac.h" #include "std.h" #include "lex.h" struct StringElt { struct Nod node; char *buf; }; struct Token { struct Nod node; char *buf; short type, can_be_macro; char chr; }; struct Macro { struct Nod node; struct Lst arguments; struct Lst tokens; char *name; short active, nargs; }; struct ActiveMacro { struct Nod node; struct Macro *mac; struct Token *curtok; }; struct LexFile { struct Nod node; FILE *fp; short newln; short nextchar; }; struct IfBlock { struct Nod *node; short skip_else; }; struct Token *CopyToken (); struct Macro *FindMacro (); char *FindString (); #define BUFLEN 100 static char buf[BUFLEN]; static short bufpos; static struct Token rawtok, *lasttok, *retok; static struct LexFile *toplf; static struct Lst macros, activemacs, files, ifblocks; static short files_open = 0, init_done = 0; static char *includestr, *definestr, *ifdefstr, *ifndefstr, *elsestr, *endifstr; static char *libbase = "hd:include/", libnam[80], basnam[80]; #define NHASH 53 struct Lst hashTab[NHASH]; /* * Open a file for lexing - could be an include. * Set newline flag to true. */ short OpenLexFile (name) char *name; { struct LexFile *lf; struct Macro *mac; /* * on first call, init environment vars */ if (!init_done) { NewList (&files); NewList (¯os); NewList (&ifblocks); NewList (&activemacs); InitHash (); includestr = FindString ("include"); definestr = FindString ("define"); ifdefstr = FindString ("ifdef"); ifndefstr = FindString ("ifndef"); elsestr = FindString ("else"); endifstr = FindString ("endif"); init_done = 1; } /* * if there are no files currently open, prepare for a new one */ if (!files_open) { while (mac = (struct Macro *) RemHead (¯os)) FreeMacro (mac); ASSERT (!TEST (HEAD (&ifblocks))); ASSERT (!TEST (HEAD (&activemacs))); retok = NULL; } lf = NEW (struct LexFile); if (!lf) return 0; /* * printf ("opening \"%s\"\n", name); */ lf->fp = fopen (name, "r"); if (!lf->fp) { FREI (lf); return 0; } lf->newln = 1; lf->nextchar = getc (lf->fp); AddHead (&files, lf); toplf = lf; if (feof (lf->fp)) { CloseLexFile (); return 0; } files_open = 1; return 1; } /* * Close top file returning 1 for the last file now closed. */ short CloseLexFile () { if (!files_open) return 1; fclose (toplf->fp); Remove (toplf); FREI (toplf); toplf = HEAD (&files); if (TEST (toplf)) return 0; files_open = 0; toplf = 0; return 1; } /* * free stuff up */ LexCleanup () { struct Macro *mac, *nmac; while (files_open) CloseLexFile (); FreeHash (); for (mac = HEAD (¯os); nmac = NEXT (mac); mac = nmac) FreeMacro (mac); ASSERT (!TEST (HEAD (&activemacs))); /* * initialization can be done again */ init_done = 0; } FreeMacro (mac) struct Macro *mac; { struct Token *tok; struct StringElt *se; while (tok = (struct Token *) RemHead (&mac->tokens)) FREI (tok); while (se = (struct StringElt *) RemHead (&mac->arguments)) FREI (se); FREI (mac); } /* * Get next raw character from file dealing with backslash escape chars. * This is kind of wrong since these really ought to be recognized only * inside strings. Doing it this way deals with backslashes before * newlines. */ short NextC1 (fp) FILE *fp; { short c, k, radix; c = getc (fp); if (c == '\\') { c = getc (fp); if (isdigit (c)) { radix = 10; if (c == '0') radix = 8; k = (c - '0'); while (isdigit (c = getc (fp))) k = k * radix + (c - '0'); ungetc (c, fp); c = k; } else { switch (c) { case '\\': break; case '"': c = '.'; break; case 'n': c = '\n'; break; case 'b': c = '\b'; break; case 't': c = '\t'; break; case '\n': c = getc (fp); } } } return c; } /* * Get next character from Top Lex File, updating nextchar. */ short NextC () { short c; if (!toplf) return EOF; c = toplf->nextchar; toplf->nextchar = NextC1 (toplf->fp); return c; } /* * Move the top lex file to the next newline. */ AdvanceEOL () { if (!toplf) return; while (toplf->nextchar != '\n' && !feof (toplf->fp)) NextC (); } /* * Read a raw set of characters updating newline state. Sets the * fields in rawtok to the type and string pointer, if any. */ RawToken () { short c; short nochar = 1, comment, nl; bufpos = 0; rawtok.can_be_macro = 1; /* * get a non-blank char */ while (nochar) { /* * get a character testing for end-of-file if EOF, just * return that fact right away */ c = NextC (); if (c == EOF) { rawtok.type = RT_EOF; return; } /* * test next char: if it's white space other than newline, * just clear the newline flag and continue looking. */ if (isspace (c) && c != '\n') toplf->newln = 0; else { /* * otherwise, this is potentially interesting, but it * might be the start of a comment - if so, scan til * the end of the comment (no nested comments ala * "C"). If not, exit the loop. */ if (c == '/' && toplf->nextchar == '*') { comment = 1; while (comment) { c = NextC (); if (c == '*' && toplf->nextchar == '/') { c = NextC (); comment = 0; } } } else nochar = 0; } } nl = toplf->newln; toplf->newln = 0; bufpos = 0; /* * otherwise, read out the cases id symbol starting with char */ if (nl && c == '#') rawtok.type = RT_ESC; else if (c == '\'') { rawtok.chr = NextC (); c = NextC (); /* * Skip past weird character constants, like 'FOOL', * making no attempt to deal with them correctly. */ comment = 0; while (c != '\'' && comment < 6) { c = NextC (); comment++; } if (comment >= 6) printf ("error in character constant\n"); rawtok.type = RT_CHRC; } else if (isalpha (c) || c == '_') { buf[bufpos++] = c; while (isalnum (toplf->nextchar) || toplf->nextchar == '_') buf[bufpos++] = NextC (); buf[bufpos] = 0; rawtok.type = RT_ID; rawtok.buf = FindString (buf); } else if (c == '\n') { toplf->newln = 1; rawtok.type = RT_NEWLN; } else if (isdigit (c)) { buf[bufpos++] = c; while (isdigit (toplf->nextchar)) buf[bufpos++] = NextC (); buf[bufpos] = 0; rawtok.buf = FindString (buf); rawtok.type = RT_NUM; } else if (c == '"') { while ((c = NextC ()) != '"') if (bufpos < BUFLEN) buf[bufpos++] = c; if (bufpos + 1 == BUFLEN) printf ("string overflow\n"); buf[bufpos] = 0; rawtok.buf = FindString (buf); rawtok.type = RT_STR; } else { rawtok.chr = c; rawtok.type = RT_CHR; } } /* * Main function -- returns string pointer for next token makes include files * and macros transparent reads from top open file and returns pointer to * string of chars could be a symbol, could be a number, could be a character */ short NextToken (tbuf) char **tbuf; { struct Token *tok; struct Macro *mac; struct ActiveMacro *amac; short i; char c; while () { /* * get raw token from file or active macro */ amac = HEAD (&activemacs); if (retok) { tok = retok; retok = NULL; } else if (TEST (amac)) { tok = amac->curtok; if (!TEST (tok)) { amac->mac->active = 0; for (i = 0; i < amac->mac->nargs; i++) FreeMacro (RemHead (¯os)); Remove (amac); FREI (amac); continue; } amac->curtok = NEXT (amac->curtok); } else { RawToken (); tok = &rawtok; } switch (tok->type) { case RT_EOF: if (CloseLexFile ()) { lasttok = tok; return RT_EOF; } break; case RT_ESC: RawToken (); if (rawtok.type != RT_ID) { printf ("real problem with lexer directive\n"); AdvanceEOL (); break; } if (rawtok.buf == includestr) Include (); else if (rawtok.buf == definestr) Define (); else if (rawtok.buf == ifdefstr) IfDef (1); else if (rawtok.buf == ifndefstr) IfDef (0); else if (rawtok.buf == elsestr) Else (); else if (rawtok.buf == endifstr) EndIf (); else { printf ("unknown directive \"%s\"\n", rawtok.buf); AdvanceEOL (); } break; case RT_ID: /* * test for known macros (if this token can be one) */ if (tok->can_be_macro) { mac = FindMacro (tok->buf); if (mac) { InstantiateMacro (mac); break; } } case RT_STR: case RT_NUM: *tbuf = tok->buf; lasttok = tok; return tok->type; case RT_CHR: case RT_CHRC: *tbuf = &tok->chr; lasttok = tok; return tok->type; } } } /* * The given macro has occured in the text, cause it to come into existence * with argument substitution. */ InstantiateMacro (mac) struct Macro *mac; { struct ActiveMacro *amac; struct StringElt *arg; struct Macro *smac; struct Token *tok; char *buf; short vtok, level, endarglist; if (mac->active) { printf ("recursive macro ignored\n"); return; } /* * read arguments, if any, and make them macros in their own right * arguments are read recursively using NextToken (right? I'm not * sure...) */ arg = HEAD (&mac->arguments); if (TEST (arg)) { /* * get first open paren */ vtok = NextToken (&buf); if (vtok != RT_CHR || *buf != '(') goto argfail; /* * loop through args separated with commas */ endarglist = 0; while (TEST (arg)) { smac = NEW (struct Macro); if (!smac) goto argfail; smac->name = arg->buf; smac->active = 0; smac->nargs = 0; NewList (&smac->arguments); NewList (&smac->tokens); /* * scan out tokens allowing for nested commas */ level = 0; while () { vtok = NextToken (&buf); if (vtok == RT_CHR) { if (*buf == '(' || *buf == '{') level++; if (*buf == '}') level--; if (*buf == ')') { if (level == 0) { endarglist = 1; break; } level--; } if (*buf == ',' && level == 0) break; } tok = CopyToken (lasttok); if (!tok) goto argfail; tok->can_be_macro = 0; AddTail (&smac->tokens, tok); } AddHead (¯os, smac); if (endarglist) break; arg = NEXT (arg); } if (vtok != RT_CHR || *buf != ')') goto argfail; } /* * Macro does not become active in the global list until all * arguments have been parsed and interpreted. */ amac = NEW (struct ActiveMacro); if (!amac) return; /* what the fuck do I do here? */ amac->mac = mac; amac->curtok = HEAD (&mac->tokens); mac->active = 1; AddHead (&activemacs, amac); return; argfail: printf ("bad problem with arguments\n"); } /* * locate a specified macro in the available macros list */ struct Macro * FindMacro (buf) char *buf; { struct Macro *mac; for (mac = HEAD (¯os); TEST (mac); mac = NEXT (mac)) if (mac->name == buf) return mac; return NULL; } /* * Do an "include" directive */ Include () { short pos, ok, global; char c; /* * get filename */ RawToken (); ok = 0; global = 0; if (rawtok.type == RT_STR) { ok = 1; strcpy (basnam, rawtok.buf); } else if (rawtok.type == RT_CHR && rawtok.chr == '<') { ok = 1; pos = 0; while () { c = NextC (); if (c == '>') break; basnam[pos++] = c; } basnam[pos] = 0; global = 1; } AdvanceEOL (); if (!ok) { printf ("no file to include\n"); return; } if (!global) if (OpenLexFile (basnam)) return; strcpy (libnam, libbase); strcat (libnam, basnam); if (OpenLexFile (libnam)) return; printf ("error open include file <%s>\n", libnam); } /* * Do the "define" directive. */ Define () { struct Macro *mac = NULL; struct Token *tok; struct StringElt *se; /* * get identifier to define */ RawToken (); if (rawtok.type != RT_ID) { printf ("error in define - no identifier\n"); goto escape; } mac = NEW (struct Macro); if (!mac) goto escape; mac->name = rawtok.buf; mac->active = 0; mac->nargs = 0; NewList (&mac->arguments); NewList (&mac->tokens); /* * Look for parenthized argument list. */ if (toplf->nextchar == '(') { RawToken (); while () { RawToken (); /* * deal with special case of "#define MACRO()" */ if (!mac->nargs && rawtok.type == RT_CHR && rawtok.chr == ')') break; if (rawtok.type != RT_ID) { printf ("macro argument not an identifier\n"); goto escape; } se = NEW (struct StringElt); if (!se) goto escape; se->buf = rawtok.buf; AddTail (&mac->arguments, se); mac->nargs++; RawToken (); if (rawtok.type != RT_CHR) { printf ("macro arg list delimiter not a character\n"); goto escape; } if (rawtok.chr == ')') break; if (rawtok.chr != ',') { printf ("macro arg list delimiter not ',' or ')'\n"); goto escape; } } } /* * get the sequence of tokens which make up this definition */ while () { RawToken (); if (rawtok.type == RT_NEWLN || rawtok.type == RT_EOF) break; tok = CopyToken (&rawtok); if (!tok) goto escape; tok->can_be_macro = 1; AddTail (&mac->tokens, tok); } AddHead (¯os, mac); return; escape: if (mac) FreeMacro (mac); AdvanceEOL (); } /* * Skip over a defined-out section. Returns 1 if it ends with #else, 0 * if it ends with #endif, -1 for EOF. Keeps track of nested if's * being skipped. */ short SkipRemovedSec () { short level = 0; while () { RawToken (); if (rawtok.type == RT_EOF) return -1; if (rawtok.type == RT_ESC) { RawToken (); if (rawtok.type == RT_ID) { if (rawtok.buf == ifdefstr || rawtok.buf == ifndefstr) level++; if (rawtok.buf == elsestr) if (!level) return 1; if (rawtok.buf == endifstr) if (!level--) return 0; } } } } /* * Does the "ifdef"/"ifndef" - "else" - "endif" directive. Type is 1 * for ifdef and 0 for ifndef. */ IfDef (type) short type; { struct Macro *mac; struct IfBlock *ib; short els; RawToken (); mac = FindMacro (rawtok.buf); /* * If condition is true, add a node to say "skip the next #else * section" if false, skip this section. If section ends with an * else, add a junk node to pop off for consistency at next #endif. */ if ((mac && type) || (!mac && !type)) { ib = NEW (struct IfBlock); if (!ib) return; ib->skip_else = 1; AddHead (&ifblocks, ib); } else { els = SkipRemovedSec (); if (els == 1) { ib = NEW (struct IfBlock); if (!ib) return; ib->skip_else = 0; AddHead (&ifblocks, ib); } } } Else () { struct IfBlock *ib; short els; ib = HEAD (&ifblocks); ASSERT (TEST (ib)); Remove (ib); if (!ib->skip_else) { printf ("strange \"else\" block found\n"); } else { els = SkipRemovedSec (); if (els == 1 && ib->skip_else) { printf ("strange \"else\" block found\n"); } } FREI (ib); } EndIf () { struct IfBlock *ib; ib = HEAD (&ifblocks); ASSERT (TEST (ib)); Remove (ib); FREI (ib); } /* * Backs up one token */ Backspace () { retok = lasttok; } struct Token * CopyToken (tok) struct Token *tok; { struct Token *ntok; ntok = NEW (struct Token); if (ntok) *ntok = *tok; return ntok; } /* HASH TABLE STUFF */ InitHash () { short i; for (i = 0; i < NHASH; i++) NewList (&hashTab[i]); } /* * Simple but (I hope) effective hash function. */ short HashVal (buf) char *buf; { unsigned long hash; hash = 0; while (*buf) hash = hash * 3 + *buf++; return hash % NHASH; } /* * Finds (or creates) a stored string matching the given one return pointer * which acts as unique ident for this string. */ char * FindString (buf) char *buf; { short hash; struct Lst *hlist; struct StringElt *se; char *sto; hash = HashVal (buf); hlist = &hashTab[hash]; for (se = HEAD (hlist); TEST (se); se = NEXT (se)) { if (!strcmp (se->buf, buf)) return se->buf; } se = NEW (struct StringElt); if (!se) return NULL; sto = NEW_N (char, strlen (buf) + 1); if (!sto) { FREI (se); return NULL; } AddTail (hlist, se); strcpy (sto, buf); se->buf = sto; return sto; } FreeHash () { short i; struct StringElt *se, *nse; for (i = 0; i < NHASH; i++) { for (se = HEAD (&hashTab[i]); nse = NEXT (se); se = nse) { FREE_N (se->buf, char, strlen (se->buf) + 1); FREI (se); } } }