lex.c (12642B)
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <string.h> 4 #include "cpp.h" 5 6 /* 7 * lexical FSM encoding 8 * when in state state, and one of the characters 9 * in ch arrives, enter nextstate. 10 * States >= S_SELF are either final, or at least require special action. 11 * In 'fsm' there is a line for each state X charset X nextstate. 12 * List chars that overwrite previous entries later (e.g. C_ALPH 13 * can be overridden by '_' by a later entry; and C_XX is the 14 * the universal set, and should always be first. 15 * States above S_SELF are represented in the big table as negative values. 16 * S_SELF and S_SELFB encode the resulting token type in the upper bits. 17 * These actions differ in that S_SELF doesn't have a lookahead char, 18 * S_SELFB does. 19 * 20 * The encoding is blown out into a big table for time-efficiency. 21 * Entries have 22 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits. 23 */ 24 25 #define MAXSTATE 32 26 #define ACT(tok,act) ((tok<<7)+act) 27 #define QBSBIT 0100 28 #define GETACT(st) (st>>7)&0x1ff 29 30 /* character classes */ 31 #define C_WS 1 32 #define C_ALPH 2 33 #define C_NUM 3 34 #define C_EOF 4 35 #define C_XX 5 36 37 enum state { 38 START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4, 39 CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1, 40 CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1, 41 S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR, 42 S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME 43 }; 44 45 int tottok; 46 int tokkind[256]; 47 struct fsm { 48 int state; /* if in this state */ 49 uchar ch[4]; /* and see one of these characters */ 50 int nextstate; /* enter this state if +ve */ 51 }; 52 53 /*const*/ struct fsm fsm[] = { 54 /* start state */ 55 START, { C_XX }, ACT(UNCLASS,S_SELF), 56 START, { ' ', '\t', '\v' }, WS1, 57 START, { C_NUM }, NUM1, 58 START, { '.' }, NUM3, 59 START, { C_ALPH }, ID1, 60 START, { 'L' }, ST1, 61 START, { '"' }, ST2, 62 START, { '\'' }, CC1, 63 START, { '/' }, COM1, 64 START, { EOFC }, S_EOF, 65 START, { '\n' }, S_NL, 66 START, { '-' }, MINUS1, 67 START, { '+' }, PLUS1, 68 START, { '<' }, LT1, 69 START, { '>' }, GT1, 70 START, { '=' }, ASG1, 71 START, { '!' }, NOT1, 72 START, { '&' }, AND1, 73 START, { '|' }, OR1, 74 START, { '#' }, SHARP1, 75 START, { '%' }, PCT1, 76 START, { '[' }, ACT(SBRA,S_SELF), 77 START, { ']' }, ACT(SKET,S_SELF), 78 START, { '(' }, ACT(LP,S_SELF), 79 START, { ')' }, ACT(RP,S_SELF), 80 START, { '*' }, STAR1, 81 START, { ',' }, ACT(COMMA,S_SELF), 82 START, { '?' }, ACT(QUEST,S_SELF), 83 START, { ':' }, ACT(COLON,S_SELF), 84 START, { ';' }, ACT(SEMIC,S_SELF), 85 START, { '{' }, ACT(CBRA,S_SELF), 86 START, { '}' }, ACT(CKET,S_SELF), 87 START, { '~' }, ACT(TILDE,S_SELF), 88 START, { '^' }, CIRC1, 89 90 /* saw a digit */ 91 NUM1, { C_XX }, ACT(NUMBER,S_SELFB), 92 NUM1, { C_NUM, C_ALPH, '.' }, NUM1, 93 NUM1, { 'E', 'e' }, NUM2, 94 NUM1, { '_' }, ACT(NUMBER,S_SELFB), 95 96 /* saw possible start of exponent, digits-e */ 97 NUM2, { C_XX }, ACT(NUMBER,S_SELFB), 98 NUM2, { '+', '-' }, NUM1, 99 NUM2, { C_NUM, C_ALPH }, NUM1, 100 NUM2, { '_' }, ACT(NUMBER,S_SELFB), 101 102 /* saw a '.', which could be a number or an operator */ 103 NUM3, { C_XX }, ACT(DOT,S_SELFB), 104 NUM3, { '.' }, DOTS1, 105 NUM3, { C_NUM }, NUM1, 106 107 DOTS1, { C_XX }, ACT(UNCLASS, S_SELFB), 108 DOTS1, { C_NUM }, NUM1, 109 DOTS1, { '.' }, ACT(ELLIPS, S_SELF), 110 111 /* saw a letter or _ */ 112 ID1, { C_XX }, ACT(NAME,S_NAME), 113 ID1, { C_ALPH, C_NUM }, ID1, 114 115 /* saw L (start of wide string?) */ 116 ST1, { C_XX }, ACT(NAME,S_NAME), 117 ST1, { C_ALPH, C_NUM }, ID1, 118 ST1, { '"' }, ST2, 119 ST1, { '\'' }, CC1, 120 121 /* saw " beginning string */ 122 ST2, { C_XX }, ST2, 123 ST2, { '"' }, ACT(STRING, S_SELF), 124 ST2, { '\\' }, ST3, 125 ST2, { '\n' }, S_STNL, 126 ST2, { EOFC }, S_EOFSTR, 127 128 /* saw \ in string */ 129 ST3, { C_XX }, ST2, 130 ST3, { '\n' }, S_STNL, 131 ST3, { EOFC }, S_EOFSTR, 132 133 /* saw ' beginning character const */ 134 CC1, { C_XX }, CC1, 135 CC1, { '\'' }, ACT(CCON, S_SELF), 136 CC1, { '\\' }, CC2, 137 CC1, { '\n' }, S_STNL, 138 CC1, { EOFC }, S_EOFSTR, 139 140 /* saw \ in ccon */ 141 CC2, { C_XX }, CC1, 142 CC2, { '\n' }, S_STNL, 143 CC2, { EOFC }, S_EOFSTR, 144 145 /* saw /, perhaps start of comment */ 146 COM1, { C_XX }, ACT(SLASH, S_SELFB), 147 COM1, { '=' }, ACT(ASSLASH, S_SELF), 148 COM1, { '*' }, COM2, 149 COM1, { '/' }, COM4, 150 151 /* saw / then *, start of comment */ 152 COM2, { C_XX }, COM2, 153 COM2, { '\n' }, S_COMNL, 154 COM2, { '*' }, COM3, 155 COM2, { EOFC }, S_EOFCOM, 156 157 /* saw the * possibly ending a comment */ 158 COM3, { C_XX }, COM2, 159 COM3, { '\n' }, S_COMNL, 160 COM3, { '*' }, COM3, 161 COM3, { '/' }, S_COMMENT, 162 163 /* // comment */ 164 COM4, { C_XX }, COM4, 165 COM4, { '\n' }, S_NL, 166 COM4, { EOFC }, S_EOFCOM, 167 168 /* saw white space, eat it up */ 169 WS1, { C_XX }, S_WS, 170 WS1, { ' ', '\t', '\v' }, WS1, 171 172 /* saw -, check --, -=, -> */ 173 MINUS1, { C_XX }, ACT(MINUS, S_SELFB), 174 MINUS1, { '-' }, ACT(MMINUS, S_SELF), 175 MINUS1, { '=' }, ACT(ASMINUS,S_SELF), 176 MINUS1, { '>' }, ACT(ARROW,S_SELF), 177 178 /* saw +, check ++, += */ 179 PLUS1, { C_XX }, ACT(PLUS, S_SELFB), 180 PLUS1, { '+' }, ACT(PPLUS, S_SELF), 181 PLUS1, { '=' }, ACT(ASPLUS, S_SELF), 182 183 /* saw <, check <<, <<=, <= */ 184 LT1, { C_XX }, ACT(LT, S_SELFB), 185 LT1, { '<' }, LT2, 186 LT1, { '=' }, ACT(LEQ, S_SELF), 187 LT2, { C_XX }, ACT(LSH, S_SELFB), 188 LT2, { '=' }, ACT(ASLSH, S_SELF), 189 190 /* saw >, check >>, >>=, >= */ 191 GT1, { C_XX }, ACT(GT, S_SELFB), 192 GT1, { '>' }, GT2, 193 GT1, { '=' }, ACT(GEQ, S_SELF), 194 GT2, { C_XX }, ACT(RSH, S_SELFB), 195 GT2, { '=' }, ACT(ASRSH, S_SELF), 196 197 /* = */ 198 ASG1, { C_XX }, ACT(ASGN, S_SELFB), 199 ASG1, { '=' }, ACT(EQ, S_SELF), 200 201 /* ! */ 202 NOT1, { C_XX }, ACT(NOT, S_SELFB), 203 NOT1, { '=' }, ACT(NEQ, S_SELF), 204 205 /* & */ 206 AND1, { C_XX }, ACT(AND, S_SELFB), 207 AND1, { '&' }, ACT(LAND, S_SELF), 208 AND1, { '=' }, ACT(ASAND, S_SELF), 209 210 /* | */ 211 OR1, { C_XX }, ACT(OR, S_SELFB), 212 OR1, { '|' }, ACT(LOR, S_SELF), 213 OR1, { '=' }, ACT(ASOR, S_SELF), 214 215 /* # */ 216 SHARP1, { C_XX }, ACT(SHARP, S_SELFB), 217 SHARP1, { '#' }, ACT(DSHARP, S_SELF), 218 219 /* % */ 220 PCT1, { C_XX }, ACT(PCT, S_SELFB), 221 PCT1, { '=' }, ACT(ASPCT, S_SELF), 222 223 /* * */ 224 STAR1, { C_XX }, ACT(STAR, S_SELFB), 225 STAR1, { '=' }, ACT(ASSTAR, S_SELF), 226 227 /* ^ */ 228 CIRC1, { C_XX }, ACT(CIRC, S_SELFB), 229 CIRC1, { '=' }, ACT(ASCIRC, S_SELF), 230 231 -1 232 }; 233 234 /* first index is char, second is state */ 235 /* increase #states to power of 2 to encourage use of shift */ 236 short bigfsm[256][MAXSTATE]; 237 238 void 239 expandlex(void) 240 { 241 /*const*/ struct fsm *fp; 242 int i, j, nstate; 243 244 for (fp = fsm; fp->state>=0; fp++) { 245 for (i=0; fp->ch[i]; i++) { 246 nstate = fp->nextstate; 247 if (nstate >= S_SELF) 248 nstate = ~nstate; 249 switch (fp->ch[i]) { 250 251 case C_XX: /* random characters */ 252 for (j=0; j<256; j++) 253 bigfsm[j][fp->state] = nstate; 254 continue; 255 case C_ALPH: 256 for (j=0; j<=256; j++) 257 if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z' 258 || j=='_') 259 bigfsm[j][fp->state] = nstate; 260 continue; 261 case C_NUM: 262 for (j='0'; j<='9'; j++) 263 bigfsm[j][fp->state] = nstate; 264 continue; 265 default: 266 bigfsm[fp->ch[i]][fp->state] = nstate; 267 } 268 } 269 } 270 /* install special cases for ? (trigraphs), \ (splicing), runes, and EOB */ 271 for (i=0; i<MAXSTATE; i++) { 272 for (j=0; j<0xFF; j++) 273 if (j=='?' || j=='\\') { 274 if (bigfsm[j][i]>0) 275 bigfsm[j][i] = ~bigfsm[j][i]; 276 bigfsm[j][i] &= ~QBSBIT; 277 } 278 bigfsm[EOB][i] = ~S_EOB; 279 if (bigfsm[EOFC][i]>=0) 280 bigfsm[EOFC][i] = ~S_EOF; 281 } 282 } 283 284 void 285 fixlex(void) 286 { 287 /* do C++ comments? */ 288 if (Cplusplus==0) 289 bigfsm['/'][COM1] = bigfsm['x'][COM1]; 290 } 291 292 /* 293 * fill in a row of tokens from input, terminated by NL or END 294 * First token is put at trp->lp. 295 * Reset is non-zero when the input buffer can be "rewound." 296 * The value is a flag indicating that possible macros have 297 * been seen in the row. 298 */ 299 int 300 gettokens(Tokenrow *trp, int reset) 301 { 302 register int c, state, oldstate; 303 register uchar *ip; 304 register Token *tp, *maxp; 305 int runelen; 306 Source *s = cursource; 307 int nmac = 0; 308 extern char outbuf[]; 309 310 tp = trp->lp; 311 ip = s->inp; 312 if (reset) { 313 s->lineinc = 0; 314 if (ip>=s->inl) { /* nothing in buffer */ 315 s->inl = s->inb; 316 fillbuf(s); 317 ip = s->inp = s->inb; 318 } else if (ip >= s->inb+(3*INS/4)) { 319 memmove(s->inb, ip, 4+s->inl-ip); 320 s->inl = s->inb+(s->inl-ip); 321 ip = s->inp = s->inb; 322 } 323 } 324 maxp = &trp->bp[trp->max]; 325 runelen = 1; 326 for (;;) { 327 continue2: 328 if (tp>=maxp) { 329 trp->lp = tp; 330 tp = growtokenrow(trp); 331 maxp = &trp->bp[trp->max]; 332 } 333 tp->type = UNCLASS; 334 tp->hideset = 0; 335 tp->t = ip; 336 tp->wslen = 0; 337 tp->flag = 0; 338 state = START; 339 for (;;) { 340 oldstate = state; 341 c = *ip; 342 if ((state = bigfsm[c][state]) >= 0) { 343 ip += runelen; 344 runelen = 1; 345 continue; 346 } 347 state = ~state; 348 reswitch: 349 switch (state&0177) { 350 case S_SELF: 351 ip += runelen; 352 runelen = 1; 353 case S_SELFB: 354 tp->type = GETACT(state); 355 tp->len = ip - tp->t; 356 tp++; 357 goto continue2; 358 359 case S_NAME: /* like S_SELFB but with nmac check */ 360 tp->type = NAME; 361 tp->len = ip - tp->t; 362 nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0); 363 tp++; 364 goto continue2; 365 366 case S_WS: 367 tp->wslen = ip - tp->t; 368 tp->t = ip; 369 state = START; 370 continue; 371 372 default: 373 if ((state&QBSBIT)==0) { 374 ip += runelen; 375 runelen = 1; 376 continue; 377 } 378 state &= ~QBSBIT; 379 s->inp = ip; 380 if (c=='?') { /* check trigraph */ 381 if (trigraph(s)) { 382 state = oldstate; 383 continue; 384 } 385 goto reswitch; 386 } 387 if (c=='\\') { /* line-folding */ 388 if (foldline(s)) { 389 s->lineinc++; 390 state = oldstate; 391 continue; 392 } 393 goto reswitch; 394 } 395 error(WARNING, "Lexical botch in cpp"); 396 ip += runelen; 397 runelen = 1; 398 continue; 399 400 case S_EOB: 401 s->inp = ip; 402 fillbuf(cursource); 403 state = oldstate; 404 continue; 405 406 case S_EOF: 407 tp->type = END; 408 tp->len = 0; 409 s->inp = ip; 410 if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1) 411 error(WARNING,"No newline at end of file"); 412 trp->lp = tp+1; 413 return nmac; 414 415 case S_STNL: 416 error(ERROR, "Unterminated string or char const"); 417 case S_NL: 418 tp->t = ip; 419 tp->type = NL; 420 tp->len = 1; 421 tp->wslen = 0; 422 s->lineinc++; 423 s->inp = ip+1; 424 trp->lp = tp+1; 425 return nmac; 426 427 case S_EOFSTR: 428 error(FATAL, "EOF in string or char constant"); 429 break; 430 431 case S_COMNL: 432 s->lineinc++; 433 state = COM2; 434 ip += runelen; 435 runelen = 1; 436 if (ip >= s->inb+(7*INS/8)) { /* very long comment */ 437 memmove(tp->t, ip, 4+s->inl-ip); 438 s->inl -= ip-tp->t; 439 ip = tp->t+1; 440 } 441 continue; 442 443 case S_EOFCOM: 444 error(WARNING, "EOF inside comment"); 445 --ip; 446 case S_COMMENT: 447 ++ip; 448 tp->t = ip; 449 tp->t[-1] = ' '; 450 tp->wslen = 1; 451 state = START; 452 continue; 453 } 454 break; 455 } 456 ip += runelen; 457 runelen = 1; 458 tp->len = ip - tp->t; 459 tp++; 460 } 461 } 462 463 /* have seen ?; handle the trigraph it starts (if any) else 0 */ 464 int 465 trigraph(Source *s) 466 { 467 int c; 468 469 while (s->inp+2 >= s->inl && fillbuf(s)!=EOF) 470 ; 471 if (s->inp[1]!='?') 472 return 0; 473 c = 0; 474 switch(s->inp[2]) { 475 case '=': 476 c = '#'; break; 477 case '(': 478 c = '['; break; 479 case '/': 480 c = '\\'; break; 481 case ')': 482 c = ']'; break; 483 case '\'': 484 c = '^'; break; 485 case '<': 486 c = '{'; break; 487 case '!': 488 c = '|'; break; 489 case '>': 490 c = '}'; break; 491 case '-': 492 c = '~'; break; 493 } 494 if (c) { 495 *s->inp = c; 496 memmove(s->inp+1, s->inp+3, s->inl-s->inp+2); 497 s->inl -= 2; 498 } 499 return c; 500 } 501 502 int 503 foldline(Source *s) 504 { 505 while (s->inp+1 >= s->inl && fillbuf(s)!=EOF) 506 ; 507 if (s->inp[1] == '\n') { 508 memmove(s->inp, s->inp+2, s->inl-s->inp+3); 509 s->inl -= 2; 510 return 1; 511 } 512 return 0; 513 } 514 515 int 516 fillbuf(Source *s) 517 { 518 int n, nr; 519 520 nr = INS/8; 521 if ((char *)s->inl+nr > (char *)s->inb+INS) 522 error(FATAL, "Input buffer overflow"); 523 if (s->fd<0 || (n=read(s->fd, (char *)s->inl, INS/8)) <= 0) 524 n = 0; 525 if ((*s->inp&0xff) == EOB) /* sentinel character appears in input */ 526 *s->inp = EOFC; 527 s->inl += n; 528 s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB; 529 if (n==0) { 530 s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC; 531 return EOF; 532 } 533 return 0; 534 } 535 536 /* 537 * Push down to new source of characters. 538 * If fd>0 and str==NULL, then from a file `name'; 539 * if fd==-1 and str, then from the string. 540 */ 541 Source * 542 setsource(char *name, int fd, char *str) 543 { 544 Source *s = new(Source); 545 int len; 546 547 s->line = 1; 548 s->lineinc = 0; 549 s->fd = fd; 550 s->filename = name; 551 s->next = cursource; 552 s->ifdepth = 0; 553 cursource = s; 554 /* slop at right for EOB */ 555 if (str) { 556 len = strlen(str); 557 s->inb = domalloc(len+4); 558 s->inp = s->inb; 559 strncpy((char *)s->inp, str, len); 560 } else { 561 s->inb = domalloc(INS+4); 562 s->inp = s->inb; 563 len = 0; 564 } 565 s->inl = s->inp+len; 566 s->inl[0] = s->inl[1] = EOB; 567 return s; 568 } 569 570 void 571 unsetsource(void) 572 { 573 Source *s = cursource; 574 575 if (s->fd>=0) { 576 close(s->fd); 577 dofree(s->inb); 578 } 579 cursource = s->next; 580 dofree(s); 581 }