3 // Static table of recognized lexemes in the language
4 static const struct Lexeme {
41 {"foreach", TOK_FOREACH},
43 {"return", TOK_RETURN},
45 {"continue", TOK_CONTINUE},
47 {"...", TOK_ELLIPSIS},
55 {"&=", TOK_BIT_ANDEQ},
57 {"^=", TOK_BIT_XOREQ},
58 {"forindex", TOK_FORINDEX},
61 // Build a table of where each line ending is
62 static int* findLines(struct Parser* p)
65 int sz = p->len/10 + 16;
66 int* lines = naParseAlloc(p, (sizeof(int) * sz));
69 for(i=0; i<p->len; i++) {
70 // Not a line ending at all
71 if(buf[i] != '\n' && buf[i] != '\r')
74 // Skip over the \r of a \r\n pair.
75 if(buf[i] == '\r' && (i+1)<p->len && buf[i+1] == '\n') {
78 // Reallocate if necessary
82 nl = naParseAlloc(p, sizeof(int) * sz);
83 for(j=0; j<n; j++) nl[j] = lines[j];
93 // What line number is the index on?
94 static int getLine(struct Parser* p, int index)
97 for(i=0; i<p->nLines; i++)
98 if(p->lines[i] > index)
99 return (p->firstLine-1) + i+1;
100 return (p->firstLine-1) + p->nLines+1;
103 static void error(struct Parser* p, char* msg, int index)
105 naParseError(p, msg, getLine(p, index));
108 // End index (the newline character) of the given line
109 static int lineEnd(struct Parser* p, int line)
111 if(line > p->nLines) return p->len;
112 return p->lines[line-1];
115 static void newToken(struct Parser* p, int pos, int type,
116 char* str, int slen, double num)
118 struct Token *tok, *last = p->tree.lastChild;
120 /* Adjacent string literals get concatenated */
121 if(type == TOK_LITERAL && str) {
122 if(last && last->type == TOK_LITERAL) {
123 int i, len1 = last->strlen;
124 char* str2 = naParseAlloc(p, len1 + slen);
125 for(i=0; i<len1; i++) str2[i] = last->str[i];
126 for(i=0; i<slen; i++) str2[i+len1] = str[i];
128 last->strlen += slen;
133 tok = naParseAlloc(p, sizeof(struct Token));
135 tok->line = getLine(p, pos);
145 // Context sensitivity hack: a "-" or "~" following a binary operator of
146 // equal or higher precedence must be a unary negation. Needed to
147 // get precedence right in the parser for expressiong like "a * -2"
148 if((type == TOK_MINUS || type == TOK_CAT) && tok->prev) {
149 int pt = tok->prev->type;
150 if( pt==TOK_PLUS||pt==TOK_MINUS||pt==TOK_CAT||pt==TOK_MUL||pt==TOK_DIV
151 || pt==TOK_BIT_AND||pt==TOK_BIT_OR||pt==TOK_BIT_XOR )
152 tok->type = type = (type == TOK_MINUS ? TOK_NEG : TOK_BIT_NEG);
155 if(!p->tree.children) p->tree.children = tok;
156 if(p->tree.lastChild) p->tree.lastChild->next = tok;
157 p->tree.lastChild = tok;
160 static int hex(char c)
162 if(c >= '0' && c <= '9') return c - '0';
163 if(c >= 'A' && c <= 'F') return c - 'A' + 10;
164 if(c >= 'a' && c <= 'f') return c - 'a' + 10;
168 static int hexc(char c, struct Parser* p, int index)
171 if(n < 0) error(p, "bad hex constant", index);
175 // Escape and returns a single backslashed expression in a single
176 // quoted string. Trivial, just escape \' and leave everything else
178 static void sqEscape(char* buf, int len, int index, struct Parser* p,
179 char* cOut, int* eatenOut)
181 if(len < 2) error(p, "unterminated string", index);
191 // Ditto, but more complicated for double quotes.
192 /* FIXME: need to handle \b (8), \f (12), and \uXXXX for JSON compliance */
193 static void dqEscape(char* buf, int len, int index, struct Parser* p,
194 char* cOut, int* eatenOut)
196 if(len < 2) error(p, "unterminated string", index);
199 case '"': *cOut = '"'; break;
200 case 'r': *cOut = '\r'; break;
201 case 'n': *cOut = '\n'; break;
202 case 't': *cOut = '\t'; break;
203 case '\\': *cOut = '\\'; break;
204 case '`': *cOut = '`'; break;
206 if(len < 4) error(p, "unterminated string", index);
207 *cOut = (char)((hexc(buf[2], p, index)<<4) | hexc(buf[3], p, index));
211 // Unhandled, put the backslash back
217 static void charLiteral(struct Parser* p, int index, char* s, int len)
220 c = naLexUtf8C(s, len, &n);
221 if(c < 0 || n != len) error(p, "invalid utf8 character constant", index);
222 newToken(p, index, TOK_LITERAL, 0, 0, c);
225 // Read in a string literal
226 static int lexStringLiteral(struct Parser* p, int index, char q)
228 int i, j, len, iteration;
232 for(iteration = 0; iteration<2; iteration++) {
240 if(q == '\'') sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
241 else dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
243 if(iteration == 1) out[j++] = c;
247 // Finished stage one -- allocate the buffer for stage two
248 if(iteration == 0) out = naParseAlloc(p, len);
250 if(q == '`') charLiteral(p, index, out, len);
251 else newToken(p, index, TOK_LITERAL, out, len, 0);
255 static int lexIntLiteral(struct Parser* p, int index, int base)
259 while(i < p->len && (nib = hex(p->buf[i])) >= 0 && nib < base) {
263 newToken(p, index, TOK_LITERAL, 0, 0, d);
267 #define ISNUM(c) ((c) >= '0' && (c) <= '9')
268 #define ISHEX(c) (ISNUM(c) || ((c)>='a' && (c)<='f') || ((c)>='A' && (c)<='F'))
269 #define NUMSTART(c) (ISNUM(c) || (c) == '+' || (c) == '-')
270 static int lexNumLiteral(struct Parser* p, int index)
272 int len = p->len, i = index;
273 unsigned char* buf = (unsigned char*)p->buf;
276 if( buf[i] == '0' && i + 2 < len ) {
277 if( buf[i+1] == 'x' && ISHEX(buf[i+2]) )
278 return lexIntLiteral(p, index+2, 16);
279 if( buf[i+1] == 'o' && ISNUM(buf[i+2]) )
280 return lexIntLiteral(p, index+2, 8);
283 while(i<len && ISNUM(buf[i])) i++;
284 if(i<len && buf[i] == '.') {
286 while(i<len && ISNUM(buf[i])) i++;
288 if(i+1<len && (buf[i] == 'e' || buf[i] == 'E') && NUMSTART(buf[i+1])) {
290 if(buf[i] == '-' || buf[i] == '+') i++;
291 while(i<len && ISNUM(buf[i])) i++;
293 naStr_parsenum(p->buf + index, i - index, &d);
294 newToken(p, index, TOK_LITERAL, 0, 0, d);
298 static int trySymbol(struct Parser* p, int start)
301 while((i < p->len) &&
302 ((p->buf[i] == '_') ||
303 (p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
304 (p->buf[i] >= 'a' && p->buf[i] <= 'z') ||
305 (p->buf[i] >= '0' && p->buf[i] <= '9')))
310 // Returns the length of lexeme l if the buffer prefix matches, or
312 static int matchLexeme(char* buf, int len, char* l)
315 for(i=0; i<len; i++) {
316 if(l[i] == 0) return i;
317 if(l[i] != buf[i]) return 0;
319 // Ran out of buffer. This is still OK if we're also at the end
321 if(l[i] == 0) return i;
325 // This is dumb and algorithmically slow. It would be much more
326 // elegant to sort and binary search the lexeme list, but that's a lot
327 // more code and this really isn't very slow in practice; it checks
328 // every byte of every lexeme for each input byte. There are less
329 // than 100 bytes of lexemes in the grammar. Returns the number of
330 // bytes in the lexeme read (or zero if none was recognized)
331 static int tryLexemes(struct Parser* p, int index, int* lexemeOut)
333 int i, n, best, bestIndex=-1;
334 char* start = p->buf + index;
335 int len = p->len - index;
337 n = sizeof(LEXEMES) / sizeof(struct Lexeme);
340 int l = matchLexeme(start, len, LEXEMES[i].str);
346 if(best > 0) *lexemeOut = bestIndex;
350 void naLex(struct Parser* p)
357 // Whitespace, comments and string literals have obvious
358 // markers and can be handled by a switch:
361 case ' ': case '\t': case '\n': case '\r': case '\f': case '\v':
365 i = lineEnd(p, getLine(p, i));
367 case '\'': case '"': case '`':
368 i = lexStringLiteral(p, i, c);
371 if(ISNUM(c) || (c == '.' && (i+1)<p->len && ISNUM(p->buf[i+1])))
372 i = lexNumLiteral(p, i);
376 // Lexemes and symbols are a little more complicated. Pick
377 // the longest one that matches. Since some lexemes look like
378 // symbols (e.g. "or") they need a higher precedence, but we
379 // don't want a lexeme match to clobber the beginning of a
380 // symbol (e.g. "orchid"). If neither match, we have a bad
381 // character in the mix.
383 int symlen=0, lexlen=0, lexeme=-1;
384 lexlen = tryLexemes(p, i, &lexeme);
385 if((c>='A' && c<='Z') || (c>='a' && c<='z') || (c=='_'))
386 symlen = trySymbol(p, i);
387 if(lexlen && lexlen >= symlen) {
388 newToken(p, i, LEXEMES[lexeme].tok, 0, 0, 0);
391 newToken(p, i, TOK_SYMBOL, p->buf+i, symlen, 0);
394 error(p, "illegal character", i);