#include "parse.h"
// Static table of recognized lexemes in the language
-struct Lexeme {
+static const struct Lexeme {
// Spelling of the lexeme as it is written in source text.
char* str;
// Token type handed to newToken() when this entry is chosen
// (the lexer reads it as LEXEMES[lexeme].tok).
int tok;
} LEXEMES[] = {
{"return", TOK_RETURN},
{"break", TOK_BREAK},
{"continue", TOK_CONTINUE},
- {"func", TOK_FUNC}
+ {"func", TOK_FUNC},
+ {"...", TOK_ELLIPSIS},
+ {"?", TOK_QUESTION},
+ {"var", TOK_VAR},
+ {"+=", TOK_PLUSEQ},
+ {"-=", TOK_MINUSEQ},
+ {"*=", TOK_MULEQ},
+ {"/=", TOK_DIVEQ},
+ {"~=", TOK_CATEQ},
+ {"forindex", TOK_FORINDEX},
};
// Build a table of where each line ending is
// Skip over the \r of a \r\n pair.
if(buf[i] == '\r' && (i+1)<p->len && buf[i+1] == '\n') {
- i++;
continue;
}
// Reallocate if necessary
// Appends a token to the tail of the parser's top-level token list
// (p->tree), or folds it into the previous token when both are
// TOK_LITERAL and the new one carries string data.
//   p    - parser whose token list is extended
//   pos  - index of the token in the source buffer (not referenced by
//          any line visible in this hunk)
//   type - token type; a TOK_MINUS may be rewritten to TOK_NEG below
//   str  - string payload, or 0 for tokens without one
//   slen - length of str
//   num  - numeric payload
static void newToken(struct Parser* p, int pos, int type,
char* str, int slen, double num)
{
- struct Token* tok;
+ struct Token *tok, *last = p->tree.lastChild;
+
+ /* Adjacent string literals get concatenated */
+ if(type == TOK_LITERAL && str) {
// NOTE(review): this test also matches a preceding *numeric* literal
// (those are created with str == 0); confirm that merging a string
// into such a token is intended.
+ if(last && last->type == TOK_LITERAL) {
+ int i, len1 = last->strlen;
// Build the merged text in a freshly allocated buffer: first the
// previous token's bytes, then the new string's bytes.
+ char* str2 = naParseAlloc(p, len1 + slen);
+ for(i=0; i<len1; i++) str2[i] = last->str[i];
+ for(i=0; i<slen; i++) str2[i+len1] = str[i];
+ last->str = str2;
+ last->strlen += slen;
// No new token is created; the previous one now holds both strings.
+ return;
+ }
+ }
tok = naParseAlloc(p, sizeof(struct Token));
tok->type = type;
tok->str = str;
tok->strlen = slen;
tok->num = num;
- tok->parent = &p->tree;
tok->next = 0;
- tok->prev = p->tree.lastChild;
+ tok->prev = last;
tok->children = 0;
tok->lastChild = 0;
// Context sensitivity hack: a "-" following a binary operator of
- // higher precedence (MUL and DIV, basically) must be a unary
- // negation. Needed to get precedence right in the parser for
- // expressiong like "a * -2"
- if(type == TOK_MINUS && tok->prev)
- if(tok->prev->type == TOK_MUL || tok->prev->type == TOK_DIV)
+ // equal or higher precedence must be a unary negation. Needed to
+ // get precedence right in the parser for expressions like "a * -2"
+ if(type == TOK_MINUS && tok->prev) {
+ int pt = tok->prev->type;
+ if(pt==TOK_PLUS||pt==TOK_MINUS||pt==TOK_CAT||pt==TOK_MUL||pt==TOK_DIV)
tok->type = type = TOK_NEG;
+ }
// Link the token at the tail of the list; it also becomes the head
// when the list was empty.
if(!p->tree.children) p->tree.children = tok;
if(p->tree.lastChild) p->tree.lastChild->next = tok;
p->tree.lastChild = tok;
}
-// Parse a hex nibble
-static int hexc(char c, struct Parser* p, int index)
// Returns the value (0-15) of the hex digit c, accepting upper and
// lower case letters, or -1 when c is not a hex digit.
+static int hex(char c)
{
if(c >= '0' && c <= '9') return c - '0';
- if(c >= 'A' && c <= 'F') return c - 'a' + 10;
+ if(c >= 'A' && c <= 'F') return c - 'A' + 10;
if(c >= 'a' && c <= 'f') return c - 'a' + 10;
- error(p, "bad hex constant", index);
- return 0;
+ return -1;
+}
+
// Parse a hex nibble like hex(), but report "bad hex constant" through
// error() when c is not a hex digit; index is passed on to error().
// NOTE(review): if error() returns, the -1 from hex() is handed back
// to the caller -- confirm error() does not return.
+static int hexc(char c, struct Parser* p, int index)
+{
+ int n = hex(c);
+ if(n < 0) error(p, "bad hex constant", index);
+ return n;
}
// Escape and returns a single backslashed expression in a single
}
// Ditto, but more complicated for double quotes.
+/* FIXME: need to handle \b (8), \f (12), and \uXXXX for JSON compliance */
static void dqEscape(char* buf, int len, int index, struct Parser* p,
char* cOut, int* eatenOut)
{
case 'n': *cOut = '\n'; break;
case 't': *cOut = '\t'; break;
case '\\': *cOut = '\\'; break;
+ case '`': *cOut = '`'; break;
case 'x':
if(len < 4) error(p, "unterminated string", index);
*cOut = (char)((hexc(buf[2], p, index)<<4) | hexc(buf[3], p, index));
*eatenOut = 4;
+ break;
default:
// Unhandled, put the backslash back
*cOut = '\\';
}
}
// Emits the token for a backquoted character constant.
//   index - position passed to error() and newToken()
//   s/len - the text between the backquotes, as produced by
//           lexStringLiteral() after escape processing
// The result is a TOK_LITERAL with no string payload whose number is
// the value returned by naLexUtf8C().
+static void charLiteral(struct Parser* p, int index, char* s, int len)
+{
+ int n, c;
// Presumably decodes one UTF-8 sequence and stores the number of bytes
// it used in n -- confirm against naLexUtf8C's definition.
+ c = naLexUtf8C(s, len, &n);
// Reject a negative result, and any constant where n differs from len.
+ if(c < 0 || n != len) error(p, "invalid utf8 character constant", index);
+ newToken(p, index, TOK_LITERAL, 0, 0, c);
+}
+
// Read in a string literal
-static int lexStringLiteral(struct Parser* p, int index, int singleQuote)
+static int lexStringLiteral(struct Parser* p, int index, char q)
{
int i, j, len, iteration;
char* out = 0;
char* buf = p->buf;
- char endMark = singleQuote ? '\'' : '"';
for(iteration = 0; iteration<2; iteration++) {
i = index+1;
while(i < p->len) {
char c = buf[i];
int eaten = 1;
- if(c == endMark)
- break;
+ if(c == q) break;
if(c == '\\') {
- if(singleQuote) sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
- else dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
+ if(q == '\'') sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
+ else dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
}
if(iteration == 1) out[j++] = c;
i += eaten;
// Finished stage one -- allocate the buffer for stage two
if(iteration == 0) out = naParseAlloc(p, len);
}
- newToken(p, index, TOK_LITERAL, out, len, 0);
+ if(q == '`') charLiteral(p, index, out, len);
+ else newToken(p, index, TOK_LITERAL, out, len, 0);
return i+1;
}
// Lexes a run of hex digits starting at index and emits a numeric
// TOK_LITERAL holding their value. lexNumLiteral() calls this with
// index already advanced past the "0x" prefix. Returns the index of
// the first character after the digits.
+static int lexHexLiteral(struct Parser* p, int index)
+{
+ int nib, i = index;
+ double d = 0;
// Accumulate one nibble per digit; stop at the end of the buffer or at
// the first character hex() rejects (negative return).
+ while(i < p->len && (nib = hex(p->buf[i])) >= 0) {
+ d = d*16 + nib;
+ i++;
+ }
+ newToken(p, index, TOK_LITERAL, 0, 0, d);
+ return i;
+}
+
// Character-class helpers for the number lexer. Each macro evaluates
// its argument more than once, so pass only side-effect-free expressions.
// ISNUM: c is a decimal digit.
+#define ISNUM(c) ((c) >= '0' && (c) <= '9')
// ISHEX: c is a decimal digit or a letter a-f / A-F.
+#define ISHEX(c) (ISNUM(c) || ((c)>='a' && (c)<='f') || ((c)>='A' && (c)<='F'))
// NUMSTART: c is a decimal digit or a sign character; used to check the
// character that follows an exponent marker.
+#define NUMSTART(c) (ISNUM(c) || (c) == '+' || (c) == '-')
static int lexNumLiteral(struct Parser* p, int index)
{
int len = p->len, i = index;
- unsigned char* buf = p->buf;
+ unsigned char* buf = (unsigned char*)p->buf;
double d;
- while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
+ if(buf[i] == '0' && i+2<len && buf[i+1] == 'x' && ISHEX(buf[i+2]))
+ return lexHexLiteral(p, index+2);
+
+ while(i<len && ISNUM(buf[i])) i++;
if(i<len && buf[i] == '.') {
i++;
- while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
+ while(i<len && ISNUM(buf[i])) i++;
}
- if(i<len && (buf[i] == 'e' || buf[i] == 'E')) {
+ if(i+1<len && (buf[i] == 'e' || buf[i] == 'E') && NUMSTART(buf[i+1])) {
i++;
- if(i<len
- && (buf[i] == '-' || buf[i] == '+')
- && (i+1<len && buf[i+1] >= '0' && buf[i+1] <= '9')) i++;
- while(i<len && buf[i] >= '0' && buf[i] <= '9') i++;
+ if(buf[i] == '-' || buf[i] == '+') i++;
+ while(i<len && ISNUM(buf[i])) i++;
}
naStr_parsenum(p->buf + index, i - index, &d);
newToken(p, index, TOK_LITERAL, 0, 0, d);
{
int i = start;
while((i < p->len) &&
- ((p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
+ ((p->buf[i] == '_') ||
+ (p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
(p->buf[i] >= 'a' && p->buf[i] <= 'z') ||
(p->buf[i] >= '0' && p->buf[i] <= '9')))
{ i++; }
case '#':
i = lineEnd(p, getLine(p, i));
break;
- case '\'': case '"':
- i = lexStringLiteral(p, i, (c=='"' ? 0 : 1));
+ case '\'': case '"': case '`':
+ i = lexStringLiteral(p, i, c);
break;
default:
- if(c >= '0' && c <= '9') i = lexNumLiteral(p, i);
- else handled = 0;
+ if(ISNUM(c) || (c == '.' && (i+1)<p->len && ISNUM(p->buf[i+1])))
+ i = lexNumLiteral(p, i);
+ else handled = 0;
}
// Lexemes and symbols are a little more complicated. Pick
// symbol (e.g. "orchid"). If neither match, we have a bad
// character in the mix.
if(!handled) {
- int symlen=0, lexlen=0, lexeme;
+ int symlen=0, lexlen=0, lexeme=-1;
lexlen = tryLexemes(p, i, &lexeme);
- if((c>='A' && c<='Z') || (c>='a' && c<='z'))
+ if((c>='A' && c<='Z') || (c>='a' && c<='z') || (c=='_'))
symlen = trySymbol(p, i);
if(lexlen && lexlen >= symlen) {
newToken(p, i, LEXEMES[lexeme].tok, 0, 0, 0);