lua

A copy of the Lua development repository
Log | Files | Refs | README

llex.c (17352B)


      1 /*
      2 ** $Id: llex.c $
      3 ** Lexical Analyzer
      4 ** See Copyright Notice in lua.h
      5 */
      6 
      7 #define llex_c
      8 #define LUA_CORE
      9 
     10 #include "lprefix.h"
     11 
     12 
     13 #include <locale.h>
     14 #include <string.h>
     15 
     16 #include "lua.h"
     17 
     18 #include "lctype.h"
     19 #include "ldebug.h"
     20 #include "ldo.h"
     21 #include "lgc.h"
     22 #include "llex.h"
     23 #include "lobject.h"
     24 #include "lparser.h"
     25 #include "lstate.h"
     26 #include "lstring.h"
     27 #include "ltable.h"
     28 #include "lzio.h"
     29 
     30 
     31 
/* advance the input: store the next character of stream 'z' in 'current' */
#define next(ls)	(ls->current = zgetc(ls->z))


/* minimum size for string buffer */
#if !defined(LUA_MINBUFFER)
#define LUA_MINBUFFER   32
#endif


/* true when the current character is a line break ('\n' or '\r') */
#define currIsNewline(ls)	(ls->current == '\n' || ls->current == '\r')
     42 
     43 
     44 /* ORDER RESERVED */
     45 static const char *const luaX_tokens [] = {
     46     "and", "break", "do", "else", "elseif",
     47     "end", "false", "for", "function", "goto", "if",
     48     "in", "local", "nil", "not", "or", "repeat",
     49     "return", "then", "true", "until", "while",
     50     "//", "..", "...", "==", ">=", "<=", "~=",
     51     "<<", ">>", "::", "<eof>",
     52     "<number>", "<integer>", "<name>", "<string>"
     53 };
     54 
     55 
/* append the current character to the token buffer, then advance the input */
#define save_and_next(ls) (save(ls, ls->current), next(ls))


/* forward declaration: 'save' (defined before 'lexerror') calls it */
static l_noret lexerror (LexState *ls, const char *msg, int token);
     60 
     61 
     62 static void save (LexState *ls, int c) {
     63   Mbuffer *b = ls->buff;
     64   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
     65     size_t newsize;
     66     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
     67       lexerror(ls, "lexical element too long", 0);
     68     newsize = luaZ_sizebuffer(b) * 2;
     69     luaZ_resizebuffer(ls->L, b, newsize);
     70   }
     71   b->buffer[luaZ_bufflen(b)++] = cast_char(c);
     72 }
     73 
     74 
     75 void luaX_init (lua_State *L) {
     76   int i;
     77   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
     78   luaC_fix(L, obj2gco(e));  /* never collect this name */
     79   for (i=0; i<NUM_RESERVED; i++) {
     80     TString *ts = luaS_new(L, luaX_tokens[i]);
     81     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
     82     ts->extra = cast_byte(i+1);  /* reserved word */
     83   }
     84 }
     85 
     86 
     87 const char *luaX_token2str (LexState *ls, int token) {
     88   if (token < FIRST_RESERVED) {  /* single-byte symbols? */
     89     if (lisprint(token))
     90       return luaO_pushfstring(ls->L, "'%c'", token);
     91     else  /* control character */
     92       return luaO_pushfstring(ls->L, "'<\\%d>'", token);
     93   }
     94   else {
     95     const char *s = luaX_tokens[token - FIRST_RESERVED];
     96     if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
     97       return luaO_pushfstring(ls->L, "'%s'", s);
     98     else  /* names, strings, and numerals */
     99       return s;
    100   }
    101 }
    102 
    103 
    104 static const char *txtToken (LexState *ls, int token) {
    105   switch (token) {
    106     case TK_NAME: case TK_STRING:
    107     case TK_FLT: case TK_INT:
    108       save(ls, '\0');
    109       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
    110     default:
    111       return luaX_token2str(ls, token);
    112   }
    113 }
    114 
    115 
    116 static l_noret lexerror (LexState *ls, const char *msg, int token) {
    117   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
    118   if (token)
    119     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
    120   luaD_throw(ls->L, LUA_ERRSYNTAX);
    121 }
    122 
    123 
    124 l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
    125   lexerror(ls, msg, ls->t.token);
    126 }
    127 
    128 
    129 /*
    130 ** Anchors a string in scanner's table so that it will not be collected
    131 ** until the end of the compilation; by that time it should be anchored
    132 ** somewhere. It also internalizes long strings, ensuring there is only
    133 ** one copy of each unique string.
    134 */
    135 static TString *anchorstr (LexState *ls, TString *ts) {
    136   lua_State *L = ls->L;
    137   TValue oldts;
    138   int tag = luaH_getstr(ls->h, ts, &oldts);
    139   if (!tagisempty(tag))  /* string already present? */
    140     return tsvalue(&oldts);  /* use stored value */
    141   else {  /* create a new entry */
    142     TValue *stv = s2v(L->top.p++);  /* reserve stack space for string */
    143     setsvalue(L, stv, ts);  /* push (anchor) the string on the stack */
    144     luaH_set(L, ls->h, stv, stv);  /* t[string] = string */
    145     /* table is not a metatable, so it does not need to invalidate cache */
    146     luaC_checkGC(L);
    147     L->top.p--;  /* remove string from stack */
    148     return ts;
    149   }
    150 }
    151 
    152 
    153 /*
    154 ** Creates a new string and anchors it in scanner's table.
    155 */
    156 TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
    157   return anchorstr(ls, luaS_newlstr(ls->L, str, l));
    158 }
    159 
    160 
    161 /*
    162 ** increment line number and skips newline sequence (any of
    163 ** \n, \r, \n\r, or \r\n)
    164 */
    165 static void inclinenumber (LexState *ls) {
    166   int old = ls->current;
    167   lua_assert(currIsNewline(ls));
    168   next(ls);  /* skip '\n' or '\r' */
    169   if (currIsNewline(ls) && ls->current != old)
    170     next(ls);  /* skip '\n\r' or '\r\n' */
    171   if (++ls->linenumber >= INT_MAX)
    172     lexerror(ls, "chunk has too many lines", 0);
    173 }
    174 
    175 
    176 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
    177                     int firstchar) {
    178   ls->t.token = 0;
    179   ls->L = L;
    180   ls->current = firstchar;
    181   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
    182   ls->z = z;
    183   ls->fs = NULL;
    184   ls->linenumber = 1;
    185   ls->lastline = 1;
    186   ls->source = source;
    187   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
    188   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
    189 }
    190 
    191 
    192 
    193 /*
    194 ** =======================================================
    195 ** LEXICAL ANALYZER
    196 ** =======================================================
    197 */
    198 
    199 
    200 static int check_next1 (LexState *ls, int c) {
    201   if (ls->current == c) {
    202     next(ls);
    203     return 1;
    204   }
    205   else return 0;
    206 }
    207 
    208 
    209 /*
    210 ** Check whether current char is in set 'set' (with two chars) and
    211 ** saves it
    212 */
    213 static int check_next2 (LexState *ls, const char *set) {
    214   lua_assert(set[2] == '\0');
    215   if (ls->current == set[0] || ls->current == set[1]) {
    216     save_and_next(ls);
    217     return 1;
    218   }
    219   else return 0;
    220 }
    221 
    222 
    223 /* LUA_NUMBER */
    224 /*
    225 ** This function is quite liberal in what it accepts, as 'luaO_str2num'
    226 ** will reject ill-formed numerals. Roughly, it accepts the following
    227 ** pattern:
    228 **
    229 **   %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
    230 **
    231 ** The only tricky part is to accept [+-] only after a valid exponent
    232 ** mark, to avoid reading '3-4' or '0xe+1' as a single number.
    233 **
    234 ** The caller might have already read an initial dot.
    235 */
    236 static int read_numeral (LexState *ls, SemInfo *seminfo) {
    237   TValue obj;
    238   const char *expo = "Ee";
    239   int first = ls->current;
    240   lua_assert(lisdigit(ls->current));
    241   save_and_next(ls);
    242   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
    243     expo = "Pp";
    244   for (;;) {
    245     if (check_next2(ls, expo))  /* exponent mark? */
    246       check_next2(ls, "-+");  /* optional exponent sign */
    247     else if (lisxdigit(ls->current) || ls->current == '.')  /* '%x|%.' */
    248       save_and_next(ls);
    249     else break;
    250   }
    251   if (lislalpha(ls->current))  /* is numeral touching a letter? */
    252     save_and_next(ls);  /* force an error */
    253   save(ls, '\0');
    254   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
    255     lexerror(ls, "malformed number", TK_FLT);
    256   if (ttisinteger(&obj)) {
    257     seminfo->i = ivalue(&obj);
    258     return TK_INT;
    259   }
    260   else {
    261     lua_assert(ttisfloat(&obj));
    262     seminfo->r = fltvalue(&obj);
    263     return TK_FLT;
    264   }
    265 }
    266 
    267 
    268 /*
    269 ** read a sequence '[=*[' or ']=*]', leaving the last bracket. If
    270 ** sequence is well formed, return its number of '='s + 2; otherwise,
    271 ** return 1 if it is a single bracket (no '='s and no 2nd bracket);
    272 ** otherwise (an unfinished '[==...') return 0.
    273 */
    274 static size_t skip_sep (LexState *ls) {
    275   size_t count = 0;
    276   int s = ls->current;
    277   lua_assert(s == '[' || s == ']');
    278   save_and_next(ls);
    279   while (ls->current == '=') {
    280     save_and_next(ls);
    281     count++;
    282   }
    283   return (ls->current == s) ? count + 2
    284          : (count == 0) ? 1
    285          : 0;
    286 }
    287 
    288 
    289 static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
    290   int line = ls->linenumber;  /* initial line (for error message) */
    291   save_and_next(ls);  /* skip 2nd '[' */
    292   if (currIsNewline(ls))  /* string starts with a newline? */
    293     inclinenumber(ls);  /* skip it */
    294   for (;;) {
    295     switch (ls->current) {
    296       case EOZ: {  /* error */
    297         const char *what = (seminfo ? "string" : "comment");
    298         const char *msg = luaO_pushfstring(ls->L,
    299                      "unfinished long %s (starting at line %d)", what, line);
    300         lexerror(ls, msg, TK_EOS);
    301         break;  /* to avoid warnings */
    302       }
    303       case ']': {
    304         if (skip_sep(ls) == sep) {
    305           save_and_next(ls);  /* skip 2nd ']' */
    306           goto endloop;
    307         }
    308         break;
    309       }
    310       case '\n': case '\r': {
    311         save(ls, '\n');
    312         inclinenumber(ls);
    313         if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
    314         break;
    315       }
    316       default: {
    317         if (seminfo) save_and_next(ls);
    318         else next(ls);
    319       }
    320     }
    321   } endloop:
    322   if (seminfo)
    323     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
    324                                      luaZ_bufflen(ls->buff) - 2 * sep);
    325 }
    326 
    327 
    328 static void esccheck (LexState *ls, int c, const char *msg) {
    329   if (!c) {
    330     if (ls->current != EOZ)
    331       save_and_next(ls);  /* add current to buffer for error message */
    332     lexerror(ls, msg, TK_STRING);
    333   }
    334 }
    335 
    336 
    337 static int gethexa (LexState *ls) {
    338   save_and_next(ls);
    339   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
    340   return luaO_hexavalue(ls->current);
    341 }
    342 
    343 
    344 static int readhexaesc (LexState *ls) {
    345   int r = gethexa(ls);
    346   r = (r << 4) + gethexa(ls);
    347   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
    348   return r;
    349 }
    350 
    351 
    352 /*
    353 ** When reading a UTF-8 escape sequence, save everything to the buffer
    354 ** for error reporting in case of errors; 'i' counts the number of
    355 ** saved characters, so that they can be removed if case of success.
    356 */
    357 static unsigned long readutf8esc (LexState *ls) {
    358   unsigned long r;
    359   int i = 4;  /* number of chars to be removed: start with #"\u{X" */
    360   save_and_next(ls);  /* skip 'u' */
    361   esccheck(ls, ls->current == '{', "missing '{'");
    362   r = cast_ulong(gethexa(ls));  /* must have at least one digit */
    363   while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
    364     i++;
    365     esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
    366     r = (r << 4) + luaO_hexavalue(ls->current);
    367   }
    368   esccheck(ls, ls->current == '}', "missing '}'");
    369   next(ls);  /* skip '}' */
    370   luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
    371   return r;
    372 }
    373 
    374 
    375 static void utf8esc (LexState *ls) {
    376   char buff[UTF8BUFFSZ];
    377   int n = luaO_utf8esc(buff, readutf8esc(ls));
    378   for (; n > 0; n--)  /* add 'buff' to string */
    379     save(ls, buff[UTF8BUFFSZ - n]);
    380 }
    381 
    382 
    383 static int readdecesc (LexState *ls) {
    384   int i;
    385   int r = 0;  /* result accumulator */
    386   for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
    387     r = 10*r + ls->current - '0';
    388     save_and_next(ls);
    389   }
    390   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
    391   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
    392   return r;
    393 }
    394 
    395 
/*
** Reads a short literal string delimited by 'del' (the current char on
** entry) and stores its content, without the delimiters, in
** 'seminfo->ts'. Escape sequences are decoded as they are read; the
** raw text of an escape is kept in the buffer only while it is being
** read (so that error messages can show it) and is then replaced by
** the decoded character. Raises an error on end of input or on an
** unescaped line break inside the string.
*/
static void read_string (LexState *ls, int del, SemInfo *seminfo) {
  save_and_next(ls);  /* keep delimiter (for error messages) */
  while (ls->current != del) {
    switch (ls->current) {
      case EOZ:
        lexerror(ls, "unfinished string", TK_EOS);
        break;  /* to avoid warnings */
      case '\n':
      case '\r':
        lexerror(ls, "unfinished string", TK_STRING);
        break;  /* to avoid warnings */
      case '\\': {  /* escape sequences */
        int c;  /* final character to be saved */
        save_and_next(ls);  /* keep '\\' for error messages */
        /* each case leaves through one of the three labels below */
        switch (ls->current) {
          case 'a': c = '\a'; goto read_save;
          case 'b': c = '\b'; goto read_save;
          case 'f': c = '\f'; goto read_save;
          case 'n': c = '\n'; goto read_save;
          case 'r': c = '\r'; goto read_save;
          case 't': c = '\t'; goto read_save;
          case 'v': c = '\v'; goto read_save;
          case 'x': c = readhexaesc(ls); goto read_save;
          case 'u': utf8esc(ls);  goto no_save;
          case '\n': case '\r':  /* escaped line break: saved as '\n' */
            inclinenumber(ls); c = '\n'; goto only_save;
          case '\\': case '\"': case '\'':
            c = ls->current; goto read_save;
          case EOZ: goto no_save;  /* will raise an error next loop */
          case 'z': {  /* zap following span of spaces */
            luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
            next(ls);  /* skip the 'z' */
            while (lisspace(ls->current)) {
              if (currIsNewline(ls)) inclinenumber(ls);
              else next(ls);
            }
            goto no_save;
          }
          default: {
            esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
            c = readdecesc(ls);  /* digital escape '\ddd' */
            goto only_save;
          }
        }
       /* last char of the escape is still the current char: skip it */
       read_save:
         next(ls);
         /* go through */
       /* input already advanced: swap the saved '\\' for the result */
       only_save:
         luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
         save(ls, c);
         /* go through */
       /* nothing (else) to be saved for this escape */
       no_save: break;
      }
      default:
        save_and_next(ls);
    }
  }
  save_and_next(ls);  /* skip delimiter */
  /* buffer holds both delimiters: skip the first, drop the last */
  seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
                                   luaZ_bufflen(ls->buff) - 2);
}
    457 
    458 
/*
** Main scanner function: reads and returns the next token from the
** input. Line breaks, spaces, and comments are skipped. Single-char
** tokens are returned as the character itself; other tokens are
** returned as TK_* codes. For TK_STRING and TK_NAME the string goes
** to 'seminfo->ts'; for numerals, 'read_numeral' fills 'seminfo'.
** Returns TK_EOS at the end of the input.
*/
static int llex (LexState *ls, SemInfo *seminfo) {
  luaZ_resetbuffer(ls->buff);  /* each token starts with an empty buffer */
  for (;;) {
    /* a 'break' out of this switch loops again: nothing was produced */
    switch (ls->current) {
      case '\n': case '\r': {  /* line breaks */
        inclinenumber(ls);
        break;
      }
      case ' ': case '\f': case '\t': case '\v': {  /* spaces */
        next(ls);
        break;
      }
      case '-': {  /* '-' or '--' (comment) */
        next(ls);
        if (ls->current != '-') return '-';
        /* else is a comment */
        next(ls);
        if (ls->current == '[') {  /* long comment? */
          size_t sep = skip_sep(ls);
          luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
          if (sep >= 2) {
            read_long_string(ls, NULL, sep);  /* skip long comment */
            luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
            break;
          }
        }
        /* else short comment */
        while (!currIsNewline(ls) && ls->current != EOZ)
          next(ls);  /* skip until end of line (or end of file) */
        break;
      }
      case '[': {  /* long string or simply '[' */
        size_t sep = skip_sep(ls);
        if (sep >= 2) {
          read_long_string(ls, seminfo, sep);
          return TK_STRING;
        }
        else if (sep == 0)  /* '[=...' missing second bracket? */
          lexerror(ls, "invalid long string delimiter", TK_STRING);
        return '[';  /* here 'sep' is 1: a single bracket */
      }
      case '=': {
        next(ls);
        if (check_next1(ls, '=')) return TK_EQ;  /* '==' */
        else return '=';
      }
      case '<': {
        next(ls);
        if (check_next1(ls, '=')) return TK_LE;  /* '<=' */
        else if (check_next1(ls, '<')) return TK_SHL;  /* '<<' */
        else return '<';
      }
      case '>': {
        next(ls);
        if (check_next1(ls, '=')) return TK_GE;  /* '>=' */
        else if (check_next1(ls, '>')) return TK_SHR;  /* '>>' */
        else return '>';
      }
      case '/': {
        next(ls);
        if (check_next1(ls, '/')) return TK_IDIV;  /* '//' */
        else return '/';
      }
      case '~': {
        next(ls);
        if (check_next1(ls, '=')) return TK_NE;  /* '~=' */
        else return '~';
      }
      case ':': {
        next(ls);
        if (check_next1(ls, ':')) return TK_DBCOLON;  /* '::' */
        else return ':';
      }
      case '"': case '\'': {  /* short literal strings */
        read_string(ls, ls->current, seminfo);
        return TK_STRING;
      }
      case '.': {  /* '.', '..', '...', or number */
        save_and_next(ls);  /* saved: it may be the start of a numeral */
        if (check_next1(ls, '.')) {
          if (check_next1(ls, '.'))
            return TK_DOTS;   /* '...' */
          else return TK_CONCAT;   /* '..' */
        }
        else if (!lisdigit(ls->current)) return '.';
        else return read_numeral(ls, seminfo);
      }
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9': {
        return read_numeral(ls, seminfo);
      }
      case EOZ: {
        return TK_EOS;
      }
      default: {
        if (lislalpha(ls->current)) {  /* identifier or reserved word? */
          TString *ts;
          do {
            save_and_next(ls);
          } while (lislalnum(ls->current));
          /* find or create string */
          ts = luaS_newlstr(ls->L, luaZ_buffer(ls->buff),
                                   luaZ_bufflen(ls->buff));
          if (isreserved(ts))   /* reserved word? */
            /* 'extra' holds the word's index + 1 (see 'luaX_init') */
            return ts->extra - 1 + FIRST_RESERVED;
          else {
            seminfo->ts = anchorstr(ls, ts);
            return TK_NAME;
          }
        }
        else {  /* single-char tokens ('+', '*', '%', '{', '}', ...) */
          int c = ls->current;
          next(ls);
          return c;
        }
      }
    }
  }
}
    578 
    579 
    580 void luaX_next (LexState *ls) {
    581   ls->lastline = ls->linenumber;
    582   if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
    583     ls->t = ls->lookahead;  /* use this one */
    584     ls->lookahead.token = TK_EOS;  /* and discharge it */
    585   }
    586   else
    587     ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
    588 }
    589 
    590 
    591 int luaX_lookahead (LexState *ls) {
    592   lua_assert(ls->lookahead.token == TK_EOS);
    593   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
    594   return ls->lookahead.token;
    595 }
    596