simgear/nasal/lex.c

   1 #include "parse.h"
   2
   3 // Static table of recognized lexemes in the language
   4 static const struct Lexeme {
   5     char* str;
   6     int   tok;
   7 } LEXEMES[] = {
   8     {"and", TOK_AND},
   9     {"or",  TOK_OR},
  10     {"!",   TOK_NOT},
  11     {"&",   TOK_BIT_AND},
  12     {"|",   TOK_BIT_OR},
  13     {"^",   TOK_BIT_XOR},
  14     {"(", TOK_LPAR},
  15     {")", TOK_RPAR},
  16     {"[", TOK_LBRA},
  17     {"]", TOK_RBRA},
  18     {"{", TOK_LCURL},
  19     {"}", TOK_RCURL},
  20     {"*", TOK_MUL},
  21     {"+", TOK_PLUS},
  22     {"-", TOK_MINUS},
  23     {"/", TOK_DIV},
  24     {"~", TOK_CAT},
  25     {":", TOK_COLON},
  26     {".", TOK_DOT},
  27     {",", TOK_COMMA},
  28     {";", TOK_SEMI},
  29     {"=", TOK_ASSIGN},
  30     {"<",  TOK_LT},
  31     {"<=", TOK_LTE},
  32     {"==", TOK_EQ},
  33     {"!=", TOK_NEQ},
  34     {">",  TOK_GT},
  35     {">=", TOK_GTE},
  36     {"nil", TOK_NIL},
  37     {"if",    TOK_IF},
  38     {"elsif", TOK_ELSIF},
  39     {"else",  TOK_ELSE},
  40     {"for",     TOK_FOR},
  41     {"foreach", TOK_FOREACH},
  42     {"while",   TOK_WHILE},
  43     {"return",   TOK_RETURN},
  44     {"break",    TOK_BREAK},
  45     {"continue", TOK_CONTINUE},
  46     {"func", TOK_FUNC},
  47     {"...", TOK_ELLIPSIS},
  48     {"?", TOK_QUESTION},
  49     {"var", TOK_VAR},
  50     {"+=", TOK_PLUSEQ},
  51     {"-=", TOK_MINUSEQ},
  52     {"*=", TOK_MULEQ},
  53     {"/=", TOK_DIVEQ},
  54     {"~=", TOK_CATEQ},
  55     {"&=", TOK_BIT_ANDEQ},
  56     {"|=", TOK_BIT_OREQ},
  57     {"^=", TOK_BIT_XOREQ},
  58     {"forindex", TOK_FORINDEX},
  59 };
  60
  61 // Build a table of where each line ending is
  62 static int* findLines(struct Parser* p)
  63 {
  64     char* buf = p->buf;
  65     int sz = p->len/10 + 16;
  66     int* lines = naParseAlloc(p, (sizeof(int) * sz));
  67     int i, j, n=0;
  68
  69     for(i=0; i<p->len; i++) {
  70         // Not a line ending at all
  71         if(buf[i] != '\n' && buf[i] != '\r')
  72             continue;
  73
  74         // Skip over the \r of a \r\n pair.
  75         if(buf[i] == '\r' && (i+1)<p->len && buf[i+1] == '\n') {
  76             continue;
  77         }
  78         // Reallocate if necessary
  79         if(n == sz) {
  80             int* nl;
  81             sz *= 2;
  82             nl = naParseAlloc(p, sizeof(int) * sz);
  83             for(j=0; j<n; j++) nl[j] = lines[j];
  84             lines = nl;
  85         }
  86         lines[n++] = i;
  87     }
  88     p->lines = lines;
  89     p->nLines = n;
  90     return lines;
  91 }
  92
  93 // What line number is the index on?
  94 static int getLine(struct Parser* p, int index)
  95 {
  96     int i;
  97     for(i=0; i<p->nLines; i++)
  98         if(p->lines[i] > index)
  99             return (p->firstLine-1) + i+1;
 100     return (p->firstLine-1) + p->nLines+1;
 101 }
 102
 103 static void error(struct Parser* p, char* msg, int index)
 104 {
 105     naParseError(p, msg, getLine(p, index));
 106 }
 107
 108 // End index (the newline character) of the given line
 109 static int lineEnd(struct Parser* p, int line)
 110 {
 111     if(line > p->nLines) return p->len;
 112     return p->lines[line-1];
 113 }
 114
 115 static void newToken(struct Parser* p, int pos, int type,
 116                      char* str, int slen, double num)
 117 {
 118     struct Token *tok, *last = p->tree.lastChild;
 119
 120     /* Adjacent string literals get concatenated */
 121     if(type == TOK_LITERAL && str) {
 122         if(last && last->type == TOK_LITERAL) {
 123             int i, len1 = last->strlen;
 124             char* str2 = naParseAlloc(p, len1 + slen);
 125             for(i=0; i<len1; i++) str2[i] = last->str[i];
 126             for(i=0; i<slen; i++) str2[i+len1] = str[i];
 127             last->str = str2;
 128             last->strlen += slen;
 129             return;
 130         }
 131     }
 132
 133     tok = naParseAlloc(p, sizeof(struct Token));
 134     tok->type = type;
 135     tok->line = getLine(p, pos);
 136     tok->str = str;
 137     tok->strlen = slen;
 138     tok->num = num;
 139     tok->next = 0;
 140     tok->prev = last;
 141     tok->children = 0;
 142     tok->lastChild = 0;
 143     tok->rule = 0;
 144
 145     // Context sensitivity hack: a "-" or "~" following a binary operator of
 146     // equal or higher precedence must be a unary negation.  Needed to
 147     // get precedence right in the parser for expressiong like "a * -2"
 148     if((type == TOK_MINUS || type == TOK_CAT) && tok->prev) {
 149         int pt = tok->prev->type;
 150         if( pt==TOK_PLUS||pt==TOK_MINUS||pt==TOK_CAT||pt==TOK_MUL||pt==TOK_DIV
 151          || pt==TOK_BIT_AND||pt==TOK_BIT_OR||pt==TOK_BIT_XOR )
 152             tok->type = type = (type == TOK_MINUS ? TOK_NEG : TOK_BIT_NEG);
 153     }
 154
 155     if(!p->tree.children) p->tree.children = tok;
 156     if(p->tree.lastChild) p->tree.lastChild->next = tok;
 157     p->tree.lastChild = tok;
 158 }
 159
 160 static int hex(char c)
 161 {
 162     if(c >= '0' && c <= '9') return c - '0';
 163     if(c >= 'A' && c <= 'F') return c - 'A' + 10;
 164     if(c >= 'a' && c <= 'f') return c - 'a' + 10;
 165     return -1;
 166 }
 167
 168 static int hexc(char c, struct Parser* p, int index)
 169 {
 170     int n = hex(c);
 171     if(n < 0) error(p, "bad hex constant", index);
 172     return n;
 173 }
 174
 175 // Escape and returns a single backslashed expression in a single
 176 // quoted string.  Trivial, just escape \' and leave everything else
 177 // alone.
 178 static void sqEscape(char* buf, int len, int index, struct Parser* p,
 179                      char* cOut, int* eatenOut)
 180 {
 181     if(len < 2) error(p, "unterminated string", index);
 182     if(buf[1] == '\'') {
 183         *cOut = '\'';
 184         *eatenOut = 2;
 185     } else {
 186         *cOut = '\\';
 187         *eatenOut = 1;
 188     }
 189 }
 190
 191 // Ditto, but more complicated for double quotes.
 192 /* FIXME: need to handle \b (8), \f (12), and \uXXXX for JSON compliance */
 193 static void dqEscape(char* buf, int len, int index, struct Parser* p,
 194                      char* cOut, int* eatenOut)
 195 {
 196     if(len < 2) error(p, "unterminated string", index);
 197     *eatenOut = 2;
 198     switch(buf[1]) {
 199     case '"': *cOut = '"'; break;
 200     case 'r': *cOut = '\r'; break;
 201     case 'n': *cOut = '\n'; break;
 202     case 't': *cOut = '\t'; break;
 203     case '\\': *cOut = '\\'; break;
 204     case '`': *cOut = '`'; break;
 205     case 'x':
 206         if(len < 4) error(p, "unterminated string", index);
 207         *cOut = (char)((hexc(buf[2], p, index)<<4) | hexc(buf[3], p, index));
 208         *eatenOut = 4;
 209         break;
 210     default:
 211         // Unhandled, put the backslash back
 212         *cOut = '\\';
 213         *eatenOut = 1;
 214     }
 215 }
 216
 217 static void charLiteral(struct Parser* p, int index, char* s, int len)
 218 {
 219     int n, c;
 220     c = naLexUtf8C(s, len, &n);
 221     if(c < 0 || n != len) error(p, "invalid utf8 character constant", index);
 222     newToken(p, index, TOK_LITERAL, 0, 0, c);
 223 }
 224
 225 // Read in a string literal
 226 static int lexStringLiteral(struct Parser* p, int index, char q)
 227 {
 228     int i, j, len, iteration;
 229     char* out = 0;
 230     char* buf = p->buf;
 231
 232     for(iteration = 0; iteration<2; iteration++) {
 233         i = index+1;
 234         j = len = 0;
 235         while(i < p->len) {
 236             char c = buf[i];
 237             int eaten = 1;
 238             if(c == q) break;
 239             if(c == '\\') {
 240                 if(q == '\'') sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
 241                 else          dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
 242             }
 243             if(iteration == 1) out[j++] = c;
 244             i += eaten;
 245             len++;
 246         }
 247         // Finished stage one -- allocate the buffer for stage two
 248         if(iteration == 0) out = naParseAlloc(p, len);
 249     }
 250     if(q == '`') charLiteral(p, index, out, len);
 251     else         newToken(p, index, TOK_LITERAL, out, len, 0);
 252     return i+1;
 253 }
 254
 255 static int lexHexLiteral(struct Parser* p, int index)
 256 {
 257     int nib, i = index;
 258     double d = 0;
 259     while(i < p->len && (nib = hex(p->buf[i])) >= 0) {
 260         d = d*16 + nib;
 261         i++;
 262     }
 263     newToken(p, index, TOK_LITERAL, 0, 0, d);
 264     return i;
 265 }
 266
 267 #define ISNUM(c) ((c) >= '0' && (c) <= '9')
 268 #define ISHEX(c) (ISNUM(c) || ((c)>='a' && (c)<='f') || ((c)>='A' && (c)<='F'))
 269 #define NUMSTART(c) (ISNUM(c) || (c) == '+' || (c) == '-')
 270 static int lexNumLiteral(struct Parser* p, int index)
 271 {
 272     int len = p->len, i = index;
 273     unsigned char* buf = (unsigned char*)p->buf;
 274     double d;
 275
 276     if(buf[i] == '0' && i+2<len && buf[i+1] == 'x' && ISHEX(buf[i+2]))
 277        return lexHexLiteral(p, index+2);
 278
 279     while(i<len && ISNUM(buf[i])) i++;
 280     if(i<len && buf[i] == '.') {
 281         i++;
 282         while(i<len && ISNUM(buf[i])) i++;
 283     }
 284     if(i+1<len && (buf[i] == 'e' || buf[i] == 'E') && NUMSTART(buf[i+1])) {
 285         i++;
 286         if(buf[i] == '-' || buf[i] == '+') i++;
 287         while(i<len && ISNUM(buf[i])) i++;
 288     }
 289     naStr_parsenum(p->buf + index, i - index, &d);
 290     newToken(p, index, TOK_LITERAL, 0, 0, d);
 291     return i;
 292 }
 293
 294 static int trySymbol(struct Parser* p, int start)
 295 {
 296     int i = start;
 297     while((i < p->len) &&
 298           ((p->buf[i] == '_') ||
 299            (p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
 300            (p->buf[i] >= 'a' && p->buf[i] <= 'z') ||
 301            (p->buf[i] >= '0' && p->buf[i] <= '9')))
 302     { i++; }
 303     return i-start;
 304 }
 305
 306 // Returns the length of lexeme l if the buffer prefix matches, or
 307 // else zero.
 308 static int matchLexeme(char* buf, int len, char* l)
 309 {
 310     int i;
 311     for(i=0; i<len; i++) {
 312         if(l[i] == 0)      return i;
 313         if(l[i] != buf[i]) return 0;
 314     }
 315     // Ran out of buffer.  This is still OK if we're also at the end
 316     // of the lexeme.
 317     if(l[i] == 0) return i;
 318     return 0;
 319 }
 320
 321 // This is dumb and algorithmically slow.  It would be much more
 322 // elegant to sort and binary search the lexeme list, but that's a lot
 323 // more code and this really isn't very slow in practice; it checks
 324 // every byte of every lexeme for each input byte.  There are less
 325 // than 100 bytes of lexemes in the grammar.  Returns the number of
 326 // bytes in the lexeme read (or zero if none was recognized)
 327 static int tryLexemes(struct Parser* p, int index, int* lexemeOut)
 328 {
 329     int i, n, best, bestIndex=-1;
 330     char* start = p->buf + index;
 331     int len = p->len - index;
 332
 333     n = sizeof(LEXEMES) / sizeof(struct Lexeme);
 334     best = 0;
 335     for(i=0; i<n; i++) {
 336         int l = matchLexeme(start, len, LEXEMES[i].str);
 337         if(l > best) {
 338             best = l;
 339             bestIndex = i;
 340         }
 341     }
 342     if(best > 0) *lexemeOut = bestIndex;
 343     return best;
 344 }
 345
 346 void naLex(struct Parser* p)
 347 {
 348     int i = 0;
 349     findLines(p);
 350     while(i<p->len) {
 351         char c = p->buf[i];
 352
 353         // Whitespace, comments and string literals have obvious
 354         // markers and can be handled by a switch:
 355         int handled = 1;
 356         switch(c) {
 357         case ' ': case '\t': case '\n': case '\r': case '\f': case '\v':
 358             i++;
 359             break;
 360         case '#':
 361             i = lineEnd(p, getLine(p, i));
 362             break;
 363         case '\'': case '"': case '`':
 364             i = lexStringLiteral(p, i, c);
 365             break;
 366         default:
 367             if(ISNUM(c) || (c == '.' && (i+1)<p->len && ISNUM(p->buf[i+1])))
 368                 i = lexNumLiteral(p, i);
 369             else handled = 0;
 370         }
 371
 372         // Lexemes and symbols are a little more complicated.  Pick
 373         // the longest one that matches.  Since some lexemes look like
 374         // symbols (e.g. "or") they need a higher precedence, but we
 375         // don't want a lexeme match to clobber the beginning of a
 376         // symbol (e.g. "orchid").  If neither match, we have a bad
 377         // character in the mix.
 378         if(!handled) {
 379             int symlen=0, lexlen=0, lexeme=-1;
 380             lexlen = tryLexemes(p, i, &lexeme);
 381             if((c>='A' && c<='Z') || (c>='a' && c<='z') || (c=='_'))
 382                 symlen = trySymbol(p, i);
 383             if(lexlen && lexlen >= symlen) {
 384                 newToken(p, i, LEXEMES[lexeme].tok, 0, 0, 0);
 385                 i += lexlen;
 386             } else if(symlen) {
 387                 newToken(p, i, TOK_SYMBOL, p->buf+i, symlen, 0);
 388                 i += symlen;
 389             } else {
 390                 error(p, "illegal character", i);
 391             }
 392         }
 393     }
 394 }