llex.c (17352B)
1 /* 2 ** $Id: llex.c $ 3 ** Lexical Analyzer 4 ** See Copyright Notice in lua.h 5 */ 6 7 #define llex_c 8 #define LUA_CORE 9 10 #include "lprefix.h" 11 12 13 #include <locale.h> 14 #include <string.h> 15 16 #include "lua.h" 17 18 #include "lctype.h" 19 #include "ldebug.h" 20 #include "ldo.h" 21 #include "lgc.h" 22 #include "llex.h" 23 #include "lobject.h" 24 #include "lparser.h" 25 #include "lstate.h" 26 #include "lstring.h" 27 #include "ltable.h" 28 #include "lzio.h" 29 30 31 32 #define next(ls) (ls->current = zgetc(ls->z)) 33 34 35 /* minimum size for string buffer */ 36 #if !defined(LUA_MINBUFFER) 37 #define LUA_MINBUFFER 32 38 #endif 39 40 41 #define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r') 42 43 44 /* ORDER RESERVED */ 45 static const char *const luaX_tokens [] = { 46 "and", "break", "do", "else", "elseif", 47 "end", "false", "for", "function", "goto", "if", 48 "in", "local", "nil", "not", "or", "repeat", 49 "return", "then", "true", "until", "while", 50 "//", "..", "...", "==", ">=", "<=", "~=", 51 "<<", ">>", "::", "<eof>", 52 "<number>", "<integer>", "<name>", "<string>" 53 }; 54 55 56 #define save_and_next(ls) (save(ls, ls->current), next(ls)) 57 58 59 static l_noret lexerror (LexState *ls, const char *msg, int token); 60 61 62 static void save (LexState *ls, int c) { 63 Mbuffer *b = ls->buff; 64 if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) { 65 size_t newsize; 66 if (luaZ_sizebuffer(b) >= MAX_SIZE/2) 67 lexerror(ls, "lexical element too long", 0); 68 newsize = luaZ_sizebuffer(b) * 2; 69 luaZ_resizebuffer(ls->L, b, newsize); 70 } 71 b->buffer[luaZ_bufflen(b)++] = cast_char(c); 72 } 73 74 75 void luaX_init (lua_State *L) { 76 int i; 77 TString *e = luaS_newliteral(L, LUA_ENV); /* create env name */ 78 luaC_fix(L, obj2gco(e)); /* never collect this name */ 79 for (i=0; i<NUM_RESERVED; i++) { 80 TString *ts = luaS_new(L, luaX_tokens[i]); 81 luaC_fix(L, obj2gco(ts)); /* reserved words are never collected */ 82 ts->extra = cast_byte(i+1); /* reserved word */ 83 } 84 } 85 86 87 const char *luaX_token2str (LexState *ls, int token) { 88 if (token < FIRST_RESERVED) { /* single-byte symbols? */ 89 if (lisprint(token)) 90 return luaO_pushfstring(ls->L, "'%c'", token); 91 else /* control character */ 92 return luaO_pushfstring(ls->L, "'<\\%d>'", token); 93 } 94 else { 95 const char *s = luaX_tokens[token - FIRST_RESERVED]; 96 if (token < TK_EOS) /* fixed format (symbols and reserved words)? */ 97 return luaO_pushfstring(ls->L, "'%s'", s); 98 else /* names, strings, and numerals */ 99 return s; 100 } 101 } 102 103 104 static const char *txtToken (LexState *ls, int token) { 105 switch (token) { 106 case TK_NAME: case TK_STRING: 107 case TK_FLT: case TK_INT: 108 save(ls, '\0'); 109 return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff)); 110 default: 111 return luaX_token2str(ls, token); 112 } 113 } 114 115 116 static l_noret lexerror (LexState *ls, const char *msg, int token) { 117 msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber); 118 if (token) 119 luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token)); 120 luaD_throw(ls->L, LUA_ERRSYNTAX); 121 } 122 123 124 l_noret luaX_syntaxerror (LexState *ls, const char *msg) { 125 lexerror(ls, msg, ls->t.token); 126 } 127 128 129 /* 130 ** Anchors a string in scanner's table so that it will not be collected 131 ** until the end of the compilation; by that time it should be anchored 132 ** somewhere. It also internalizes long strings, ensuring there is only 133 ** one copy of each unique string. 134 */ 135 static TString *anchorstr (LexState *ls, TString *ts) { 136 lua_State *L = ls->L; 137 TValue oldts; 138 int tag = luaH_getstr(ls->h, ts, &oldts); 139 if (!tagisempty(tag)) /* string already present? */ 140 return tsvalue(&oldts); /* use stored value */ 141 else { /* create a new entry */ 142 TValue *stv = s2v(L->top.p++); /* reserve stack space for string */ 143 setsvalue(L, stv, ts); /* push (anchor) the string on the stack */ 144 luaH_set(L, ls->h, stv, stv); /* t[string] = string */ 145 /* table is not a metatable, so it does not need to invalidate cache */ 146 luaC_checkGC(L); 147 L->top.p--; /* remove string from stack */ 148 return ts; 149 } 150 } 151 152 153 /* 154 ** Creates a new string and anchors it in scanner's table. 155 */ 156 TString *luaX_newstring (LexState *ls, const char *str, size_t l) { 157 return anchorstr(ls, luaS_newlstr(ls->L, str, l)); 158 } 159 160 161 /* 162 ** increment line number and skips newline sequence (any of 163 ** \n, \r, \n\r, or \r\n) 164 */ 165 static void inclinenumber (LexState *ls) { 166 int old = ls->current; 167 lua_assert(currIsNewline(ls)); 168 next(ls); /* skip '\n' or '\r' */ 169 if (currIsNewline(ls) && ls->current != old) 170 next(ls); /* skip '\n\r' or '\r\n' */ 171 if (++ls->linenumber >= INT_MAX) 172 lexerror(ls, "chunk has too many lines", 0); 173 } 174 175 176 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, 177 int firstchar) { 178 ls->t.token = 0; 179 ls->L = L; 180 ls->current = firstchar; 181 ls->lookahead.token = TK_EOS; /* no look-ahead token */ 182 ls->z = z; 183 ls->fs = NULL; 184 ls->linenumber = 1; 185 ls->lastline = 1; 186 ls->source = source; 187 ls->envn = luaS_newliteral(L, LUA_ENV); /* get env name */ 188 luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */ 189 } 190 191 192 193 /* 194 ** ======================================================= 195 ** LEXICAL ANALYZER 196 ** ======================================================= 197 */ 198 199 200 static int check_next1 (LexState *ls, int c) { 201 if (ls->current == c) { 202 next(ls); 203 return 1; 204 } 205 else return 0; 206 } 207 208 209 /* 210 ** Check whether current char is in set 'set' (with two chars) and 211 ** saves it 212 */ 213 static int check_next2 (LexState *ls, const char *set) { 214 lua_assert(set[2] == '\0'); 215 if (ls->current == set[0] || ls->current == set[1]) { 216 save_and_next(ls); 217 return 1; 218 } 219 else return 0; 220 } 221 222 223 /* LUA_NUMBER */ 224 /* 225 ** This function is quite liberal in what it accepts, as 'luaO_str2num' 226 ** will reject ill-formed numerals. Roughly, it accepts the following 227 ** pattern: 228 ** 229 ** %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))* 230 ** 231 ** The only tricky part is to accept [+-] only after a valid exponent 232 ** mark, to avoid reading '3-4' or '0xe+1' as a single number. 233 ** 234 ** The caller might have already read an initial dot. 235 */ 236 static int read_numeral (LexState *ls, SemInfo *seminfo) { 237 TValue obj; 238 const char *expo = "Ee"; 239 int first = ls->current; 240 lua_assert(lisdigit(ls->current)); 241 save_and_next(ls); 242 if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */ 243 expo = "Pp"; 244 for (;;) { 245 if (check_next2(ls, expo)) /* exponent mark? */ 246 check_next2(ls, "-+"); /* optional exponent sign */ 247 else if (lisxdigit(ls->current) || ls->current == '.') /* '%x|%.' */ 248 save_and_next(ls); 249 else break; 250 } 251 if (lislalpha(ls->current)) /* is numeral touching a letter? */ 252 save_and_next(ls); /* force an error */ 253 save(ls, '\0'); 254 if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */ 255 lexerror(ls, "malformed number", TK_FLT); 256 if (ttisinteger(&obj)) { 257 seminfo->i = ivalue(&obj); 258 return TK_INT; 259 } 260 else { 261 lua_assert(ttisfloat(&obj)); 262 seminfo->r = fltvalue(&obj); 263 return TK_FLT; 264 } 265 } 266 267 268 /* 269 ** read a sequence '[=*[' or ']=*]', leaving the last bracket. If 270 ** sequence is well formed, return its number of '='s + 2; otherwise, 271 ** return 1 if it is a single bracket (no '='s and no 2nd bracket); 272 ** otherwise (an unfinished '[==...') return 0. 273 */ 274 static size_t skip_sep (LexState *ls) { 275 size_t count = 0; 276 int s = ls->current; 277 lua_assert(s == '[' || s == ']'); 278 save_and_next(ls); 279 while (ls->current == '=') { 280 save_and_next(ls); 281 count++; 282 } 283 return (ls->current == s) ? count + 2 284 : (count == 0) ? 1 285 : 0; 286 } 287 288 289 static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) { 290 int line = ls->linenumber; /* initial line (for error message) */ 291 save_and_next(ls); /* skip 2nd '[' */ 292 if (currIsNewline(ls)) /* string starts with a newline? */ 293 inclinenumber(ls); /* skip it */ 294 for (;;) { 295 switch (ls->current) { 296 case EOZ: { /* error */ 297 const char *what = (seminfo ? "string" : "comment"); 298 const char *msg = luaO_pushfstring(ls->L, 299 "unfinished long %s (starting at line %d)", what, line); 300 lexerror(ls, msg, TK_EOS); 301 break; /* to avoid warnings */ 302 } 303 case ']': { 304 if (skip_sep(ls) == sep) { 305 save_and_next(ls); /* skip 2nd ']' */ 306 goto endloop; 307 } 308 break; 309 } 310 case '\n': case '\r': { 311 save(ls, '\n'); 312 inclinenumber(ls); 313 if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */ 314 break; 315 } 316 default: { 317 if (seminfo) save_and_next(ls); 318 else next(ls); 319 } 320 } 321 } endloop: 322 if (seminfo) 323 seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep, 324 luaZ_bufflen(ls->buff) - 2 * sep); 325 } 326 327 328 static void esccheck (LexState *ls, int c, const char *msg) { 329 if (!c) { 330 if (ls->current != EOZ) 331 save_and_next(ls); /* add current to buffer for error message */ 332 lexerror(ls, msg, TK_STRING); 333 } 334 } 335 336 337 static int gethexa (LexState *ls) { 338 save_and_next(ls); 339 esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected"); 340 return luaO_hexavalue(ls->current); 341 } 342 343 344 static int readhexaesc (LexState *ls) { 345 int r = gethexa(ls); 346 r = (r << 4) + gethexa(ls); 347 luaZ_buffremove(ls->buff, 2); /* remove saved chars from buffer */ 348 return r; 349 } 350 351 352 /* 353 ** When reading a UTF-8 escape sequence, save everything to the buffer 354 ** for error reporting in case of errors; 'i' counts the number of 355 ** saved characters, so that they can be removed if case of success. 356 */ 357 static unsigned long readutf8esc (LexState *ls) { 358 unsigned long r; 359 int i = 4; /* number of chars to be removed: start with #"\u{X" */ 360 save_and_next(ls); /* skip 'u' */ 361 esccheck(ls, ls->current == '{', "missing '{'"); 362 r = cast_ulong(gethexa(ls)); /* must have at least one digit */ 363 while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) { 364 i++; 365 esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large"); 366 r = (r << 4) + luaO_hexavalue(ls->current); 367 } 368 esccheck(ls, ls->current == '}', "missing '}'"); 369 next(ls); /* skip '}' */ 370 luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */ 371 return r; 372 } 373 374 375 static void utf8esc (LexState *ls) { 376 char buff[UTF8BUFFSZ]; 377 int n = luaO_utf8esc(buff, readutf8esc(ls)); 378 for (; n > 0; n--) /* add 'buff' to string */ 379 save(ls, buff[UTF8BUFFSZ - n]); 380 } 381 382 383 static int readdecesc (LexState *ls) { 384 int i; 385 int r = 0; /* result accumulator */ 386 for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */ 387 r = 10*r + ls->current - '0'; 388 save_and_next(ls); 389 } 390 esccheck(ls, r <= UCHAR_MAX, "decimal escape too large"); 391 luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ 392 return r; 393 } 394 395 396 static void read_string (LexState *ls, int del, SemInfo *seminfo) { 397 save_and_next(ls); /* keep delimiter (for error messages) */ 398 while (ls->current != del) { 399 switch (ls->current) { 400 case EOZ: 401 lexerror(ls, "unfinished string", TK_EOS); 402 break; /* to avoid warnings */ 403 case '\n': 404 case '\r': 405 lexerror(ls, "unfinished string", TK_STRING); 406 break; /* to avoid warnings */ 407 case '\\': { /* escape sequences */ 408 int c; /* final character to be saved */ 409 save_and_next(ls); /* keep '\\' for error messages */ 410 switch (ls->current) { 411 case 'a': c = '\a'; goto read_save; 412 case 'b': c = '\b'; goto read_save; 413 case 'f': c = '\f'; goto read_save; 414 case 'n': c = '\n'; goto read_save; 415 case 'r': c = '\r'; goto read_save; 416 case 't': c = '\t'; goto read_save; 417 case 'v': c = '\v'; goto read_save; 418 case 'x': c = readhexaesc(ls); goto read_save; 419 case 'u': utf8esc(ls); goto no_save; 420 case '\n': case '\r': 421 inclinenumber(ls); c = '\n'; goto only_save; 422 case '\\': case '\"': case '\'': 423 c = ls->current; goto read_save; 424 case EOZ: goto no_save; /* will raise an error next loop */ 425 case 'z': { /* zap following span of spaces */ 426 luaZ_buffremove(ls->buff, 1); /* remove '\\' */ 427 next(ls); /* skip the 'z' */ 428 while (lisspace(ls->current)) { 429 if (currIsNewline(ls)) inclinenumber(ls); 430 else next(ls); 431 } 432 goto no_save; 433 } 434 default: { 435 esccheck(ls, lisdigit(ls->current), "invalid escape sequence"); 436 c = readdecesc(ls); /* digital escape '\ddd' */ 437 goto only_save; 438 } 439 } 440 read_save: 441 next(ls); 442 /* go through */ 443 only_save: 444 luaZ_buffremove(ls->buff, 1); /* remove '\\' */ 445 save(ls, c); 446 /* go through */ 447 no_save: break; 448 } 449 default: 450 save_and_next(ls); 451 } 452 } 453 save_and_next(ls); /* skip delimiter */ 454 seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1, 455 luaZ_bufflen(ls->buff) - 2); 456 } 457 458 459 static int llex (LexState *ls, SemInfo *seminfo) { 460 luaZ_resetbuffer(ls->buff); 461 for (;;) { 462 switch (ls->current) { 463 case '\n': case '\r': { /* line breaks */ 464 inclinenumber(ls); 465 break; 466 } 467 case ' ': case '\f': case '\t': case '\v': { /* spaces */ 468 next(ls); 469 break; 470 } 471 case '-': { /* '-' or '--' (comment) */ 472 next(ls); 473 if (ls->current != '-') return '-'; 474 /* else is a comment */ 475 next(ls); 476 if (ls->current == '[') { /* long comment? */ 477 size_t sep = skip_sep(ls); 478 luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */ 479 if (sep >= 2) { 480 read_long_string(ls, NULL, sep); /* skip long comment */ 481 luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ 482 break; 483 } 484 } 485 /* else short comment */ 486 while (!currIsNewline(ls) && ls->current != EOZ) 487 next(ls); /* skip until end of line (or end of file) */ 488 break; 489 } 490 case '[': { /* long string or simply '[' */ 491 size_t sep = skip_sep(ls); 492 if (sep >= 2) { 493 read_long_string(ls, seminfo, sep); 494 return TK_STRING; 495 } 496 else if (sep == 0) /* '[=...' missing second bracket? */ 497 lexerror(ls, "invalid long string delimiter", TK_STRING); 498 return '['; 499 } 500 case '=': { 501 next(ls); 502 if (check_next1(ls, '=')) return TK_EQ; /* '==' */ 503 else return '='; 504 } 505 case '<': { 506 next(ls); 507 if (check_next1(ls, '=')) return TK_LE; /* '<=' */ 508 else if (check_next1(ls, '<')) return TK_SHL; /* '<<' */ 509 else return '<'; 510 } 511 case '>': { 512 next(ls); 513 if (check_next1(ls, '=')) return TK_GE; /* '>=' */ 514 else if (check_next1(ls, '>')) return TK_SHR; /* '>>' */ 515 else return '>'; 516 } 517 case '/': { 518 next(ls); 519 if (check_next1(ls, '/')) return TK_IDIV; /* '//' */ 520 else return '/'; 521 } 522 case '~': { 523 next(ls); 524 if (check_next1(ls, '=')) return TK_NE; /* '~=' */ 525 else return '~'; 526 } 527 case ':': { 528 next(ls); 529 if (check_next1(ls, ':')) return TK_DBCOLON; /* '::' */ 530 else return ':'; 531 } 532 case '"': case '\'': { /* short literal strings */ 533 read_string(ls, ls->current, seminfo); 534 return TK_STRING; 535 } 536 case '.': { /* '.', '..', '...', or number */ 537 save_and_next(ls); 538 if (check_next1(ls, '.')) { 539 if (check_next1(ls, '.')) 540 return TK_DOTS; /* '...' */ 541 else return TK_CONCAT; /* '..' */ 542 } 543 else if (!lisdigit(ls->current)) return '.'; 544 else return read_numeral(ls, seminfo); 545 } 546 case '0': case '1': case '2': case '3': case '4': 547 case '5': case '6': case '7': case '8': case '9': { 548 return read_numeral(ls, seminfo); 549 } 550 case EOZ: { 551 return TK_EOS; 552 } 553 default: { 554 if (lislalpha(ls->current)) { /* identifier or reserved word? */ 555 TString *ts; 556 do { 557 save_and_next(ls); 558 } while (lislalnum(ls->current)); 559 /* find or create string */ 560 ts = luaS_newlstr(ls->L, luaZ_buffer(ls->buff), 561 luaZ_bufflen(ls->buff)); 562 if (isreserved(ts)) /* reserved word? */ 563 return ts->extra - 1 + FIRST_RESERVED; 564 else { 565 seminfo->ts = anchorstr(ls, ts); 566 return TK_NAME; 567 } 568 } 569 else { /* single-char tokens ('+', '*', '%', '{', '}', ...) */ 570 int c = ls->current; 571 next(ls); 572 return c; 573 } 574 } 575 } 576 } 577 } 578 579 580 void luaX_next (LexState *ls) { 581 ls->lastline = ls->linenumber; 582 if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */ 583 ls->t = ls->lookahead; /* use this one */ 584 ls->lookahead.token = TK_EOS; /* and discharge it */ 585 } 586 else 587 ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */ 588 } 589 590 591 int luaX_lookahead (LexState *ls) { 592 lua_assert(ls->lookahead.token == TK_EOS); 593 ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); 594 return ls->lookahead.token; 595 } 596