simgear/nasal/lex.c

   1 #include "parse.h"
   2
   3 // Static table of recognized lexemes in the language
   4 static const struct Lexeme {
   5     char* str;
   6     int   tok;
   7 } LEXEMES[] = {
   8     {"and", TOK_AND},
   9     {"or",  TOK_OR},
  10     {"!",   TOK_NOT},
  11     {"(", TOK_LPAR},
  12     {")", TOK_RPAR},
  13     {"[", TOK_LBRA},
  14     {"]", TOK_RBRA},
  15     {"{", TOK_LCURL},
  16     {"}", TOK_RCURL},
  17     {"*", TOK_MUL},
  18     {"+", TOK_PLUS},
  19     {"-", TOK_MINUS},
  20     {"/", TOK_DIV},
  21     {"~", TOK_CAT},
  22     {":", TOK_COLON},
  23     {".", TOK_DOT},
  24     {",", TOK_COMMA},
  25     {";", TOK_SEMI},
  26     {"=", TOK_ASSIGN},
  27     {"<",  TOK_LT},
  28     {"<=", TOK_LTE},
  29     {"==", TOK_EQ},
  30     {"!=", TOK_NEQ},
  31     {">",  TOK_GT},
  32     {">=", TOK_GTE},
  33     {"nil", TOK_NIL},
  34     {"if",    TOK_IF},
  35     {"elsif", TOK_ELSIF},
  36     {"else",  TOK_ELSE},
  37     {"for",     TOK_FOR},
  38     {"foreach", TOK_FOREACH},
  39     {"while",   TOK_WHILE},
  40     {"return",   TOK_RETURN},
  41     {"break",    TOK_BREAK},
  42     {"continue", TOK_CONTINUE},
  43     {"func", TOK_FUNC},
  44     {"...", TOK_ELLIPSIS},
  45     {"?", TOK_QUESTION},
  46     {"var", TOK_VAR},
  47     {"+=", TOK_PLUSEQ},
  48     {"-=", TOK_MINUSEQ},
  49     {"*=", TOK_MULEQ},
  50     {"/=", TOK_DIVEQ},
  51     {"~=", TOK_CATEQ},
  52     {"forindex", TOK_FORINDEX},
  53 };
  54
  55 // Build a table of where each line ending is
  56 static int* findLines(struct Parser* p)
  57 {
  58     char* buf = p->buf;
  59     int sz = p->len/10 + 16;
  60     int* lines = naParseAlloc(p, (sizeof(int) * sz));
  61     int i, j, n=0;
  62
  63     for(i=0; i<p->len; i++) {
  64         // Not a line ending at all
  65         if(buf[i] != '\n' && buf[i] != '\r')
  66             continue;
  67
  68         // Skip over the \r of a \r\n pair.
  69         if(buf[i] == '\r' && (i+1)<p->len && buf[i+1] == '\n') {
  70             continue;
  71         }
  72         // Reallocate if necessary
  73         if(n == sz) {
  74             int* nl;
  75             sz *= 2;
  76             nl = naParseAlloc(p, sizeof(int) * sz);
  77             for(j=0; j<n; j++) nl[j] = lines[j];
  78             lines = nl;
  79         }
  80         lines[n++] = i;
  81     }
  82     p->lines = lines;
  83     p->nLines = n;
  84     return lines;
  85 }
  86
  87 // What line number is the index on?
  88 static int getLine(struct Parser* p, int index)
  89 {
  90     int i;
  91     for(i=0; i<p->nLines; i++)
  92         if(p->lines[i] > index)
  93             return (p->firstLine-1) + i+1;
  94     return (p->firstLine-1) + p->nLines+1;
  95 }
  96
  97 static void error(struct Parser* p, char* msg, int index)
  98 {
  99     naParseError(p, msg, getLine(p, index));
 100 }
 101
 102 // End index (the newline character) of the given line
 103 static int lineEnd(struct Parser* p, int line)
 104 {
 105     if(line > p->nLines) return p->len;
 106     return p->lines[line-1];
 107 }
 108
 109 static void newToken(struct Parser* p, int pos, int type,
 110                      char* str, int slen, double num)
 111 {
 112     struct Token *tok, *last = p->tree.lastChild;
 113
 114     /* Adjacent string literals get concatenated */
 115     if(type == TOK_LITERAL && str) {
 116         if(last && last->type == TOK_LITERAL) {
 117             int i, len1 = last->strlen;
 118             char* str2 = naParseAlloc(p, len1 + slen);
 119             for(i=0; i<len1; i++) str2[i] = last->str[i];
 120             for(i=0; i<slen; i++) str2[i+len1] = str[i];
 121             last->str = str2;
 122             last->strlen += slen;
 123             return;
 124         }
 125     }
 126
 127     tok = naParseAlloc(p, sizeof(struct Token));
 128     tok->type = type;
 129     tok->line = getLine(p, pos);
 130     tok->str = str;
 131     tok->strlen = slen;
 132     tok->num = num;
 133     tok->next = 0;
 134     tok->prev = last;
 135     tok->children = 0;
 136     tok->lastChild = 0;
 137
 138     // Context sensitivity hack: a "-" following a binary operator of
 139     // equal or higher precedence must be a unary negation.  Needed to
 140     // get precedence right in the parser for expressiong like "a * -2"
 141     if(type == TOK_MINUS && tok->prev) {
 142         int pt = tok->prev->type;
 143         if(pt==TOK_PLUS||pt==TOK_MINUS||pt==TOK_CAT||pt==TOK_MUL||pt==TOK_DIV)
 144             tok->type = type = TOK_NEG;
 145     }
 146
 147     if(!p->tree.children) p->tree.children = tok;
 148     if(p->tree.lastChild) p->tree.lastChild->next = tok;
 149     p->tree.lastChild = tok;
 150 }
 151
 152 static int hex(char c)
 153 {
 154     if(c >= '0' && c <= '9') return c - '0';
 155     if(c >= 'A' && c <= 'F') return c - 'A' + 10;
 156     if(c >= 'a' && c <= 'f') return c - 'a' + 10;
 157     return -1;
 158 }
 159
 160 static int hexc(char c, struct Parser* p, int index)
 161 {
 162     int n = hex(c);
 163     if(n < 0) error(p, "bad hex constant", index);
 164     return n;
 165 }
 166
 167 // Escape and returns a single backslashed expression in a single
 168 // quoted string.  Trivial, just escape \' and leave everything else
 169 // alone.
 170 static void sqEscape(char* buf, int len, int index, struct Parser* p,
 171                      char* cOut, int* eatenOut)
 172 {
 173     if(len < 2) error(p, "unterminated string", index);
 174     if(buf[1] == '\'') {
 175         *cOut = '\'';
 176         *eatenOut = 2;
 177     } else {
 178         *cOut = '\\';
 179         *eatenOut = 1;
 180     }
 181 }
 182
 183 // Ditto, but more complicated for double quotes.
 184 /* FIXME: need to handle \b (8), \f (12), and \uXXXX for JSON compliance */
 185 static void dqEscape(char* buf, int len, int index, struct Parser* p,
 186                      char* cOut, int* eatenOut)
 187 {
 188     if(len < 2) error(p, "unterminated string", index);
 189     *eatenOut = 2;
 190     switch(buf[1]) {
 191     case '"': *cOut = '"'; break;
 192     case 'r': *cOut = '\r'; break;
 193     case 'n': *cOut = '\n'; break;
 194     case 't': *cOut = '\t'; break;
 195     case '\\': *cOut = '\\'; break;
 196     case '`': *cOut = '`'; break;
 197     case 'x':
 198         if(len < 4) error(p, "unterminated string", index);
 199         *cOut = (char)((hexc(buf[2], p, index)<<4) | hexc(buf[3], p, index));
 200         *eatenOut = 4;
 201         break;
 202     default:
 203         // Unhandled, put the backslash back
 204         *cOut = '\\';
 205         *eatenOut = 1;
 206     }
 207 }
 208
 209 static void charLiteral(struct Parser* p, int index, char* s, int len)
 210 {
 211     int n, c;
 212     c = naLexUtf8C(s, len, &n);
 213     if(c < 0 || n != len) error(p, "invalid utf8 character constant", index);
 214     newToken(p, index, TOK_LITERAL, 0, 0, c);
 215 }
 216
 217 // Read in a string literal
 218 static int lexStringLiteral(struct Parser* p, int index, char q)
 219 {
 220     int i, j, len, iteration;
 221     char* out = 0;
 222     char* buf = p->buf;
 223
 224     for(iteration = 0; iteration<2; iteration++) {
 225         i = index+1;
 226         j = len = 0;
 227         while(i < p->len) {
 228             char c = buf[i];
 229             int eaten = 1;
 230             if(c == q) break;
 231             if(c == '\\') {
 232                 if(q == '\'') sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
 233                 else          dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
 234             }
 235             if(iteration == 1) out[j++] = c;
 236             i += eaten;
 237             len++;
 238         }
 239         // Finished stage one -- allocate the buffer for stage two
 240         if(iteration == 0) out = naParseAlloc(p, len);
 241     }
 242     if(q == '`') charLiteral(p, index, out, len);
 243     else         newToken(p, index, TOK_LITERAL, out, len, 0);
 244     return i+1;
 245 }
 246
 247 static int lexHexLiteral(struct Parser* p, int index)
 248 {
 249     int nib, i = index;
 250     double d = 0;
 251     while(i < p->len && (nib = hex(p->buf[i])) >= 0) {
 252         d = d*16 + nib;
 253         i++;
 254     }
 255     newToken(p, index, TOK_LITERAL, 0, 0, d);
 256     return i;
 257 }
 258
 259 #define ISNUM(c) ((c) >= '0' && (c) <= '9')
 260 #define NUMSTART(c) (ISNUM(c) || (c)=='+' || (c) == '-')
 261 static int lexNumLiteral(struct Parser* p, int index)
 262 {
 263     int len = p->len, i = index;
 264     unsigned char* buf = (unsigned char*)p->buf;
 265     double d;
 266
 267     if(buf[0] == '0' && i+1<len && buf[i+1] == 'x')
 268         return lexHexLiteral(p, index+2);
 269
 270     while(i<len && ISNUM(buf[i])) i++;
 271     if(i<len && buf[i] == '.') {
 272         i++;
 273         while(i<len && ISNUM(buf[i])) i++;
 274     }
 275     if(i+1<len && (buf[i] == 'e' || buf[i] == 'E') && NUMSTART(buf[i+1])) {
 276         i++;
 277         if(buf[i] == '-' || buf[i] == '+') i++;
 278         while(i<len && ISNUM(buf[i])) i++;
 279     }
 280     naStr_parsenum(p->buf + index, i - index, &d);
 281     newToken(p, index, TOK_LITERAL, 0, 0, d);
 282     return i;
 283 }
 284
 285 static int trySymbol(struct Parser* p, int start)
 286 {
 287     int i = start;
 288     while((i < p->len) &&
 289           ((p->buf[i] == '_') ||
 290            (p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
 291            (p->buf[i] >= 'a' && p->buf[i] <= 'z') ||
 292            (p->buf[i] >= '0' && p->buf[i] <= '9')))
 293     { i++; }
 294     return i-start;
 295 }
 296
 297 // Returns the length of lexeme l if the buffer prefix matches, or
 298 // else zero.
 299 static int matchLexeme(char* buf, int len, char* l)
 300 {
 301     int i;
 302     for(i=0; i<len; i++) {
 303         if(l[i] == 0)      return i;
 304         if(l[i] != buf[i]) return 0;
 305     }
 306     // Ran out of buffer.  This is still OK if we're also at the end
 307     // of the lexeme.
 308     if(l[i] == 0) return i;
 309     return 0;
 310 }
 311
 312 // This is dumb and algorithmically slow.  It would be much more
 313 // elegant to sort and binary search the lexeme list, but that's a lot
 314 // more code and this really isn't very slow in practice; it checks
 315 // every byte of every lexeme for each input byte.  There are less
 316 // than 100 bytes of lexemes in the grammar.  Returns the number of
 317 // bytes in the lexeme read (or zero if none was recognized)
 318 static int tryLexemes(struct Parser* p, int index, int* lexemeOut)
 319 {
 320     int i, n, best, bestIndex=-1;
 321     char* start = p->buf + index;
 322     int len = p->len - index;
 323
 324     n = sizeof(LEXEMES) / sizeof(struct Lexeme);
 325     best = 0;
 326     for(i=0; i<n; i++) {
 327         int l = matchLexeme(start, len, LEXEMES[i].str);
 328         if(l > best) {
 329             best = l;
 330             bestIndex = i;
 331         }
 332     }
 333     if(best > 0) *lexemeOut = bestIndex;
 334     return best;
 335 }
 336
 337 void naLex(struct Parser* p)
 338 {
 339     int i = 0;
 340     findLines(p);
 341     while(i<p->len) {
 342         char c = p->buf[i];
 343
 344         // Whitespace, comments and string literals have obvious
 345         // markers and can be handled by a switch:
 346         int handled = 1;
 347         switch(c) {
 348         case ' ': case '\t': case '\n': case '\r': case '\f': case '\v':
 349             i++;
 350             break;
 351         case '#':
 352             i = lineEnd(p, getLine(p, i));
 353             break;
 354         case '\'': case '"': case '`':
 355             i = lexStringLiteral(p, i, c);
 356             break;
 357         default:
 358             if(ISNUM(c) || (c == '.' && (i+1)<p->len && ISNUM(p->buf[i+1])))
 359                 i = lexNumLiteral(p, i);
 360             else handled = 0;
 361         }
 362
 363         // Lexemes and symbols are a little more complicated.  Pick
 364         // the longest one that matches.  Since some lexemes look like
 365         // symbols (e.g. "or") they need a higher precedence, but we
 366         // don't want a lexeme match to clobber the beginning of a
 367         // symbol (e.g. "orchid").  If neither match, we have a bad
 368         // character in the mix.
 369         if(!handled) {
 370             int symlen=0, lexlen=0, lexeme=-1;
 371             lexlen = tryLexemes(p, i, &lexeme);
 372             if((c>='A' && c<='Z') || (c>='a' && c<='z') || (c=='_'))
 373                 symlen = trySymbol(p, i);
 374             if(lexlen && lexlen >= symlen) {
 375                 newToken(p, i, LEXEMES[lexeme].tok, 0, 0, 0);
 376                 i += lexlen;
 377             } else if(symlen) {
 378                 newToken(p, i, TOK_SYMBOL, p->buf+i, symlen, 0);
 379                 i += symlen;
 380             } else {
 381                 error(p, "illegal character", i);
 382             }
 383         }
 384     }
 385 }