3 // Static table of recognized lexemes in the language
38 {"foreach", TOK_FOREACH},
40 {"return", TOK_RETURN},
42 {"continue", TOK_CONTINUE},
44 {"...", TOK_ELLIPSIS},
49 // Build a table of where each line ending is
50 static int* findLines(struct Parser* p)
53 int sz = p->len/10 + 16;
54 int* lines = naParseAlloc(p, (sizeof(int) * sz));
57 for(i=0; i<p->len; i++) {
58 // Not a line ending at all
59 if(buf[i] != '\n' && buf[i] != '\r')
62 // Skip over the \r of a \r\n pair.
63 if(buf[i] == '\r' && (i+1)<p->len && buf[i+1] == '\n') {
66 // Reallocate if necessary
70 nl = naParseAlloc(p, sizeof(int) * sz);
71 for(j=0; j<n; j++) nl[j] = lines[j];
81 // What line number is the index on?
82 static int getLine(struct Parser* p, int index)
85 for(i=0; i<p->nLines; i++)
86 if(p->lines[i] > index)
87 return (p->firstLine-1) + i+1;
88 return (p->firstLine-1) + p->nLines+1;
91 static void error(struct Parser* p, char* msg, int index)
93 naParseError(p, msg, getLine(p, index));
96 // End index (the newline character) of the given line
97 static int lineEnd(struct Parser* p, int line)
99 if(line > p->nLines) return p->len;
100 return p->lines[line-1];
103 static void newToken(struct Parser* p, int pos, int type,
104 char* str, int slen, double num)
108 tok = naParseAlloc(p, sizeof(struct Token));
110 tok->line = getLine(p, pos);
114 tok->parent = &p->tree;
116 tok->prev = p->tree.lastChild;
120 // Context sensitivity hack: a "-" following a binary operator of
121 // higher precedence (MUL and DIV, basically) must be a unary
122 // negation. Needed to get precedence right in the parser for
123 // expressiong like "a * -2"
124 if(type == TOK_MINUS && tok->prev)
125 if(tok->prev->type == TOK_MUL || tok->prev->type == TOK_DIV)
126 tok->type = type = TOK_NEG;
128 if(!p->tree.children) p->tree.children = tok;
129 if(p->tree.lastChild) p->tree.lastChild->next = tok;
130 p->tree.lastChild = tok;
133 // Parse a hex nibble
134 static int hexc(char c, struct Parser* p, int index)
136 if(c >= '0' && c <= '9') return c - '0';
137 if(c >= 'A' && c <= 'F') return c - 'A' + 10;
138 if(c >= 'a' && c <= 'f') return c - 'a' + 10;
139 error(p, "bad hex constant", index);
143 // Escape and returns a single backslashed expression in a single
144 // quoted string. Trivial, just escape \' and leave everything else
146 static void sqEscape(char* buf, int len, int index, struct Parser* p,
147 char* cOut, int* eatenOut)
149 if(len < 2) error(p, "unterminated string", index);
159 // Ditto, but more complicated for double quotes.
160 static void dqEscape(char* buf, int len, int index, struct Parser* p,
161 char* cOut, int* eatenOut)
163 if(len < 2) error(p, "unterminated string", index);
166 case '"': *cOut = '"'; break;
167 case 'r': *cOut = '\r'; break;
168 case 'n': *cOut = '\n'; break;
169 case 't': *cOut = '\t'; break;
170 case '\\': *cOut = '\\'; break;
172 if(len < 4) error(p, "unterminated string", index);
173 *cOut = (char)((hexc(buf[2], p, index)<<4) | hexc(buf[3], p, index));
177 // Unhandled, put the backslash back
183 // Read in a string literal
184 static int lexStringLiteral(struct Parser* p, int index, int singleQuote)
186 int i, j, len, iteration;
189 char endMark = singleQuote ? '\'' : '"';
191 for(iteration = 0; iteration<2; iteration++) {
200 if(singleQuote) sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
201 else dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
203 if(iteration == 1) out[j++] = c;
207 // Finished stage one -- allocate the buffer for stage two
208 if(iteration == 0) out = naParseAlloc(p, len);
210 newToken(p, index, TOK_LITERAL, out, len, 0);
214 static int lexNumLiteral(struct Parser* p, int index)
216 int len = p->len, i = index;
217 unsigned char* buf = p->buf;
220 while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
221 if(i<len && buf[i] == '.') {
223 while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
225 if(i<len && (buf[i] == 'e' || buf[i] == 'E')) {
228 && (buf[i] == '-' || buf[i] == '+')
229 && (i+1<len && buf[i+1] >= '0' && buf[i+1] <= '9')) i++;
230 while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
232 naStr_parsenum(p->buf + index, i - index, &d);
233 newToken(p, index, TOK_LITERAL, 0, 0, d);
237 static int trySymbol(struct Parser* p, int start)
240 while((i < p->len) &&
241 ((p->buf[i] == '_') ||
242 (p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
243 (p->buf[i] >= 'a' && p->buf[i] <= 'z') ||
244 (p->buf[i] >= '0' && p->buf[i] <= '9')))
249 // Returns the length of lexeme l if the buffer prefix matches, or
251 static int matchLexeme(char* buf, int len, char* l)
254 for(i=0; i<len; i++) {
255 if(l[i] == 0) return i;
256 if(l[i] != buf[i]) return 0;
258 // Ran out of buffer. This is still OK if we're also at the end
260 if(l[i] == 0) return i;
264 // This is dumb and algorithmically slow. It would be much more
265 // elegant to sort and binary search the lexeme list, but that's a lot
266 // more code and this really isn't very slow in practice; it checks
267 // every byte of every lexeme for each input byte. There are less
268 // than 100 bytes of lexemes in the grammar. Returns the number of
269 // bytes in the lexeme read (or zero if none was recognized)
270 static int tryLexemes(struct Parser* p, int index, int* lexemeOut)
272 int i, n, best, bestIndex=-1;
273 char* start = p->buf + index;
274 int len = p->len - index;
276 n = sizeof(LEXEMES) / sizeof(struct Lexeme);
279 int l = matchLexeme(start, len, LEXEMES[i].str);
285 if(best > 0) *lexemeOut = bestIndex;
289 void naLex(struct Parser* p)
296 // Whitespace, comments and string literals have obvious
297 // markers and can be handled by a switch:
300 case ' ': case '\t': case '\n': case '\r': case '\f': case '\v':
304 i = lineEnd(p, getLine(p, i));
307 i = lexStringLiteral(p, i, (c=='"' ? 0 : 1));
310 if(c >= '0' && c <= '9') i = lexNumLiteral(p, i);
314 // Lexemes and symbols are a little more complicated. Pick
315 // the longest one that matches. Since some lexemes look like
316 // symbols (e.g. "or") they need a higher precedence, but we
317 // don't want a lexeme match to clobber the beginning of a
318 // symbol (e.g. "orchid"). If neither match, we have a bad
319 // character in the mix.
321 int symlen=0, lexlen=0, lexeme;
322 lexlen = tryLexemes(p, i, &lexeme);
323 if((c>='A' && c<='Z') || (c>='a' && c<='z') || (c=='_'))
324 symlen = trySymbol(p, i);
325 if(lexlen && lexlen >= symlen) {
326 newToken(p, i, LEXEMES[lexeme].tok, 0, 0, 0);
329 newToken(p, i, TOK_SYMBOL, p->buf+i, symlen, 0);
332 error(p, "illegal character", i);