simgear/nasal/lex.c

   1 #include "parse.h"
   2
   3 // Static table of recognized lexemes in the language
   4 struct Lexeme {
   5     char* str;
   6     int   tok;
   7 } LEXEMES[] = {
   8     {"and", TOK_AND},
   9     {"or",  TOK_OR},
  10     {"!",   TOK_NOT},
  11     {"(", TOK_LPAR},
  12     {")", TOK_RPAR},
  13     {"[", TOK_LBRA},
  14     {"]", TOK_RBRA},
  15     {"{", TOK_LCURL},
  16     {"}", TOK_RCURL},
  17     {"*", TOK_MUL},
  18     {"+", TOK_PLUS},
  19     {"-", TOK_MINUS},
  20     {"/", TOK_DIV},
  21     {"~", TOK_CAT},
  22     {":", TOK_COLON},
  23     {".", TOK_DOT},
  24     {",", TOK_COMMA},
  25     {";", TOK_SEMI},
  26     {"=", TOK_ASSIGN},
  27     {"<",  TOK_LT},
  28     {"<=", TOK_LTE},
  29     {"==", TOK_EQ},
  30     {"!=", TOK_NEQ},
  31     {">",  TOK_GT},
  32     {">=", TOK_GTE},
  33     {"nil", TOK_NIL},
  34     {"if",    TOK_IF},
  35     {"elsif", TOK_ELSIF},
  36     {"else",  TOK_ELSE},
  37     {"for",     TOK_FOR},
  38     {"foreach", TOK_FOREACH},
  39     {"while",   TOK_WHILE},
  40     {"return",   TOK_RETURN},
  41     {"break",    TOK_BREAK},
  42     {"continue", TOK_CONTINUE},
  43     {"func", TOK_FUNC}
  44 };
  45
  46 // Build a table of where each line ending is
  47 static int* findLines(struct Parser* p)
  48 {
  49     char* buf = p->buf;
  50     int sz = p->len/10 + 16;
  51     int* lines = naParseAlloc(p, (sizeof(int) * sz));
  52     int i, j, n=0;
  53
  54     for(i=0; i<p->len; i++) {
  55         // Not a line ending at all
  56         if(buf[i] != '\n' && buf[i] != '\r')
  57             continue;
  58
  59         // Skip over the \r of a \r\n pair.
  60         if(buf[i] == '\r' && (i+1)<p->len && buf[i+1] == '\n') {
  61             i++;
  62             continue;
  63         }
  64         // Reallocate if necessary
  65         if(n == sz) {
  66             int* nl;
  67             sz *= 2;
  68             nl = naParseAlloc(p, sizeof(int) * sz);
  69             for(j=0; j<n; j++) nl[j] = lines[j];
  70             lines = nl;
  71         }
  72         lines[n++] = i;
  73     }
  74     p->lines = lines;
  75     p->nLines = n;
  76     return lines;
  77 }
  78
  79 // What line number is the index on?
  80 static int getLine(struct Parser* p, int index)
  81 {
  82     int i;
  83     for(i=0; i<p->nLines; i++)
  84         if(p->lines[i] > index)
  85             return (p->firstLine-1) + i+1;
  86     return (p->firstLine-1) + p->nLines+1;
  87 }
  88
  89 static void error(struct Parser* p, char* msg, int index)
  90 {
  91     naParseError(p, msg, getLine(p, index));
  92 }
  93
  94 // End index (the newline character) of the given line
  95 static int lineEnd(struct Parser* p, int line)
  96 {
  97     if(line > p->nLines) return p->len;
  98     return p->lines[line-1];
  99 }
 100
 101 static void newToken(struct Parser* p, int pos, int type,
 102                      char* str, int slen, double num)
 103 {
 104     struct Token* tok;
 105
 106     tok = naParseAlloc(p, sizeof(struct Token));
 107     tok->type = type;
 108     tok->line = getLine(p, pos);
 109     tok->str = str;
 110     tok->strlen = slen;
 111     tok->num = num;
 112     tok->parent = &p->tree;
 113     tok->next = 0;
 114     tok->prev = p->tree.lastChild;
 115     tok->children = 0;
 116     tok->lastChild = 0;
 117
 118     // Context sensitivity hack: a "-" following a binary operator of
 119     // higher precedence (MUL and DIV, basically) must be a unary
 120     // negation.  Needed to get precedence right in the parser for
 121     // expressiong like "a * -2"
 122     if(type == TOK_MINUS && tok->prev)
 123         if(tok->prev->type == TOK_MUL || tok->prev->type == TOK_DIV)
 124             tok->type = type = TOK_NEG;
 125
 126     if(!p->tree.children) p->tree.children = tok;
 127     if(p->tree.lastChild) p->tree.lastChild->next = tok;
 128     p->tree.lastChild = tok;
 129 }
 130
 131 // Parse a hex nibble
 132 static int hexc(char c, struct Parser* p, int index)
 133 {
 134     if(c >= '0' && c <= '9') return c - '0';
 135     if(c >= 'A' && c <= 'F') return c - 'a' + 10;
 136     if(c >= 'a' && c <= 'f') return c - 'a' + 10;
 137     error(p, "bad hex constant", index);
 138     return 0;
 139 }
 140
 141 // Escape and returns a single backslashed expression in a single
 142 // quoted string.  Trivial, just escape \' and leave everything else
 143 // alone.
 144 static void sqEscape(char* buf, int len, int index, struct Parser* p,
 145                      char* cOut, int* eatenOut)
 146 {
 147     if(len < 2) error(p, "unterminated string", index);
 148     if(buf[1] == '\'') {
 149         *cOut = '\'';
 150         *eatenOut = 2;
 151     } else {
 152         *cOut = '\\';
 153         *eatenOut = 1;
 154     }
 155 }
 156
 157 // Ditto, but more complicated for double quotes.
 158 static void dqEscape(char* buf, int len, int index, struct Parser* p,
 159                      char* cOut, int* eatenOut)
 160 {
 161     if(len < 2) error(p, "unterminated string", index);
 162     *eatenOut = 2;
 163     switch(buf[1]) {
 164     case '"': *cOut = '"'; break;
 165     case 'r': *cOut = '\r'; break;
 166     case 'n': *cOut = '\n'; break;
 167     case 't': *cOut = '\t'; break;
 168     case '\\': *cOut = '\\'; break;
 169     case 'x':
 170         if(len < 4) error(p, "unterminated string", index);
 171         *cOut = (char)((hexc(buf[2], p, index)<<4) | hexc(buf[3], p, index));
 172         *eatenOut = 4;
 173     default:
 174         // Unhandled, put the backslash back
 175         *cOut = '\\';
 176         *eatenOut = 1;
 177     }
 178 }
 179
 180 // Read in a string literal
 181 static int lexStringLiteral(struct Parser* p, int index, int singleQuote)
 182 {
 183     int i, j, len, iteration;
 184     char* out = 0;
 185     char* buf = p->buf;
 186     char endMark = singleQuote ? '\'' : '"';
 187
 188     for(iteration = 0; iteration<2; iteration++) {
 189         i = index+1;
 190         j = len = 0;
 191         while(i < p->len) {
 192             char c = buf[i];
 193             int eaten = 1;
 194             if(c == endMark)
 195                 break;
 196             if(c == '\\') {
 197                 if(singleQuote) sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
 198                 else            dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
 199             }
 200             if(iteration == 1) out[j++] = c;
 201             i += eaten;
 202             len++;
 203         }
 204         // Finished stage one -- allocate the buffer for stage two
 205         if(iteration == 0) out = naParseAlloc(p, len);
 206     }
 207     newToken(p, index, TOK_LITERAL, out, len, 0);
 208     return i+1;
 209 }
 210
 211 static int lexNumLiteral(struct Parser* p, int index)
 212 {
 213     int len = p->len, i = index;
 214     unsigned char* buf = p->buf;
 215     double d;
 216
 217     while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
 218     if(i<len && buf[i] == '.') {
 219         i++;
 220         while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
 221     }
 222     if(i<len && (buf[i] == 'e' || buf[i] == 'E')) {
 223         i++;
 224         if(i<len
 225            && (buf[i] == '-' || buf[i] == '+')
 226            && (i+1<len && buf[i+1] >= '0' && buf[i+1] <= '9')) i++;
 227         while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
 228     }
 229     naStr_parsenum(p->buf + index, i - index, &d);
 230     newToken(p, index, TOK_LITERAL, 0, 0, d);
 231     return i;
 232 }
 233
 234 static int trySymbol(struct Parser* p, int start)
 235 {
 236     int i = start;
 237     while((i < p->len) &&
 238           ((p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
 239            (p->buf[i] >= 'a' && p->buf[i] <= 'z') ||
 240            (p->buf[i] >= '0' && p->buf[i] <= '9')))
 241     { i++; }
 242     return i-start;
 243 }
 244
 245 // Returns the length of lexeme l if the buffer prefix matches, or
 246 // else zero.
 247 static int matchLexeme(char* buf, int len, char* l)
 248 {
 249     int i;
 250     for(i=0; i<len; i++) {
 251         if(l[i] == 0)      return i;
 252         if(l[i] != buf[i]) return 0;
 253     }
 254     // Ran out of buffer.  This is still OK if we're also at the end
 255     // of the lexeme.
 256     if(l[i] == 0) return i;
 257     return 0;
 258 }
 259
 260 // This is dumb and algorithmically slow.  It would be much more
 261 // elegant to sort and binary search the lexeme list, but that's a lot
 262 // more code and this really isn't very slow in practice; it checks
 263 // every byte of every lexeme for each input byte.  There are less
 264 // than 100 bytes of lexemes in the grammar.  Returns the number of
 265 // bytes in the lexeme read (or zero if none was recognized)
 266 static int tryLexemes(struct Parser* p, int index, int* lexemeOut)
 267 {
 268     int i, n, best, bestIndex=-1;
 269     char* start = p->buf + index;
 270     int len = p->len - index;
 271
 272     n = sizeof(LEXEMES) / sizeof(struct Lexeme);
 273     best = 0;
 274     for(i=0; i<n; i++) {
 275         int l = matchLexeme(start, len, LEXEMES[i].str);
 276         if(l > best) {
 277             best = l;
 278             bestIndex = i;
 279         }
 280     }
 281     if(best > 0) *lexemeOut = bestIndex;
 282     return best;
 283 }
 284
 285 void naLex(struct Parser* p)
 286 {
 287     int i = 0;
 288     findLines(p);
 289     while(i<p->len) {
 290         char c = p->buf[i];
 291
 292         // Whitespace, comments and string literals have obvious
 293         // markers and can be handled by a switch:
 294         int handled = 1;
 295         switch(c) {
 296         case ' ': case '\t': case '\n': case '\r': case '\f': case '\v':
 297             i++;
 298             break;
 299         case '#':
 300             i = lineEnd(p, getLine(p, i));
 301             break;
 302         case '\'': case '"':
 303             i = lexStringLiteral(p, i, (c=='"' ? 0 : 1));
 304             break;
 305         default:
 306             if(c >= '0' && c <= '9') i = lexNumLiteral(p, i);
 307             else                     handled = 0;
 308         }
 309
 310         // Lexemes and symbols are a little more complicated.  Pick
 311         // the longest one that matches.  Since some lexemes look like
 312         // symbols (e.g. "or") they need a higher precedence, but we
 313         // don't want a lexeme match to clobber the beginning of a
 314         // symbol (e.g. "orchid").  If neither match, we have a bad
 315         // character in the mix.
 316         if(!handled) {
 317             int symlen=0, lexlen=0, lexeme;
 318             lexlen = tryLexemes(p, i, &lexeme);
 319             if((c>='A' && c<='Z') || (c>='a' && c<='z'))
 320                 symlen = trySymbol(p, i);
 321             if(lexlen && lexlen >= symlen) {
 322                 newToken(p, i, LEXEMES[lexeme].tok, 0, 0, 0);
 323                 i += lexlen;
 324             } else if(symlen) {
 325                 newToken(p, i, TOK_SYMBOL, p->buf+i, symlen, 0);
 326                 i += symlen;
 327             } else {
 328                 error(p, "illegal character", i);
 329             }
 330         }
 331     }
 332 }