Quake-III-Arena

Quake III Arena GPL Source Release
Log | Files | Refs

lex.c (12642B)


      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <string.h>
      4 #include "cpp.h"
      5 
      6 /*
      7  * lexical FSM encoding
      8  *   when in state state, and one of the characters
      9  *   in ch arrives, enter nextstate.
     10  *   States >= S_SELF are either final, or at least require special action.
     11  *   In 'fsm' there is a line for each state X charset X nextstate.
 *   List chars that overwrite previous entries later (e.g. C_ALPH
 *   can be overridden by '_' by a later entry); C_XX is the
 *   universal set, and should always be first.
     15  *   States above S_SELF are represented in the big table as negative values.
     16  *   S_SELF and S_SELFB encode the resulting token type in the upper bits.
     17  *   These actions differ in that S_SELF doesn't have a lookahead char,
     18  *   S_SELFB does.
     19  *
     20  *   The encoding is blown out into a big table for time-efficiency.
     21  *   Entries have
     22  *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
     23  */
     24 
     25 #define	MAXSTATE 32
     26 #define	ACT(tok,act)	((tok<<7)+act)
     27 #define	QBSBIT	0100
     28 #define	GETACT(st)	(st>>7)&0x1ff
     29 
     30 /* character classes */
     31 #define	C_WS	1
     32 #define	C_ALPH	2
     33 #define	C_NUM	3
     34 #define	C_EOF	4
     35 #define	C_XX	5
     36 
     37 enum state {
     38 	START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
     39 	CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
     40 	CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
     41 	S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
     42 	S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
     43 };
     44 
     45 int	tottok;
     46 int	tokkind[256];
     47 struct	fsm {
     48 	int	state;		/* if in this state */
     49 	uchar	ch[4];		/* and see one of these characters */
     50 	int	nextstate;	/* enter this state if +ve */
     51 };
     52 
     53 /*const*/ struct fsm fsm[] = {
     54 	/* start state */
     55 	START,	{ C_XX },	ACT(UNCLASS,S_SELF),
     56 	START,	{ ' ', '\t', '\v' },	WS1,
     57 	START,	{ C_NUM },	NUM1,
     58 	START,	{ '.' },	NUM3,
     59 	START,	{ C_ALPH },	ID1,
     60 	START,	{ 'L' },	ST1,
     61 	START,	{ '"' },	ST2,
     62 	START,	{ '\'' },	CC1,
     63 	START,	{ '/' },	COM1,
     64 	START,	{ EOFC },	S_EOF,
     65 	START,	{ '\n' },	S_NL,
     66 	START,	{ '-' },	MINUS1,
     67 	START,	{ '+' },	PLUS1,
     68 	START,	{ '<' },	LT1,
     69 	START,	{ '>' },	GT1,
     70 	START,	{ '=' },	ASG1,
     71 	START,	{ '!' },	NOT1,
     72 	START,	{ '&' },	AND1,
     73 	START,	{ '|' },	OR1,
     74 	START,	{ '#' },	SHARP1,
     75 	START,	{ '%' },	PCT1,
     76 	START,	{ '[' },	ACT(SBRA,S_SELF),
     77 	START,	{ ']' },	ACT(SKET,S_SELF),
     78 	START,	{ '(' },	ACT(LP,S_SELF),
     79 	START,	{ ')' },	ACT(RP,S_SELF),
     80 	START,	{ '*' },	STAR1,
     81 	START,	{ ',' },	ACT(COMMA,S_SELF),
     82 	START,	{ '?' },	ACT(QUEST,S_SELF),
     83 	START,	{ ':' },	ACT(COLON,S_SELF),
     84 	START,	{ ';' },	ACT(SEMIC,S_SELF),
     85 	START,	{ '{' },	ACT(CBRA,S_SELF),
     86 	START,	{ '}' },	ACT(CKET,S_SELF),
     87 	START,	{ '~' },	ACT(TILDE,S_SELF),
     88 	START,	{ '^' },	CIRC1,
     89 
     90 	/* saw a digit */
     91 	NUM1,	{ C_XX },	ACT(NUMBER,S_SELFB),
     92 	NUM1,	{ C_NUM, C_ALPH, '.' },	NUM1,
     93 	NUM1,	{ 'E', 'e' },	NUM2,
     94 	NUM1,	{ '_' },	ACT(NUMBER,S_SELFB),
     95 
     96 	/* saw possible start of exponent, digits-e */
     97 	NUM2,	{ C_XX },	ACT(NUMBER,S_SELFB),
     98 	NUM2,	{ '+', '-' },	NUM1,
     99 	NUM2,	{ C_NUM, C_ALPH },	NUM1,
    100 	NUM2,	{ '_' },	ACT(NUMBER,S_SELFB),
    101 
    102 	/* saw a '.', which could be a number or an operator */
    103 	NUM3,	{ C_XX },	ACT(DOT,S_SELFB),
    104 	NUM3,	{ '.' },	DOTS1,
    105 	NUM3,	{ C_NUM },	NUM1,
    106 
    107 	DOTS1,	{ C_XX },	ACT(UNCLASS, S_SELFB),
    108 	DOTS1,	{ C_NUM },	NUM1,
    109 	DOTS1,	{ '.' },	ACT(ELLIPS, S_SELF),
    110 
    111 	/* saw a letter or _ */
    112 	ID1,	{ C_XX },	ACT(NAME,S_NAME),
    113 	ID1,	{ C_ALPH, C_NUM },	ID1,
    114 
    115 	/* saw L (start of wide string?) */
    116 	ST1,	{ C_XX },	ACT(NAME,S_NAME),
    117 	ST1,	{ C_ALPH, C_NUM },	ID1,
    118 	ST1,	{ '"' },	ST2,
    119 	ST1,	{ '\'' },	CC1,
    120 
    121 	/* saw " beginning string */
    122 	ST2,	{ C_XX },	ST2,
    123 	ST2,	{ '"' },	ACT(STRING, S_SELF),
    124 	ST2,	{ '\\' },	ST3,
    125 	ST2,	{ '\n' },	S_STNL,
    126 	ST2,	{ EOFC },	S_EOFSTR,
    127 
    128 	/* saw \ in string */
    129 	ST3,	{ C_XX },	ST2,
    130 	ST3,	{ '\n' },	S_STNL,
    131 	ST3,	{ EOFC },	S_EOFSTR,
    132 
    133 	/* saw ' beginning character const */
    134 	CC1,	{ C_XX },	CC1,
    135 	CC1,	{ '\'' },	ACT(CCON, S_SELF),
    136 	CC1,	{ '\\' },	CC2,
    137 	CC1,	{ '\n' },	S_STNL,
    138 	CC1,	{ EOFC },	S_EOFSTR,
    139 
    140 	/* saw \ in ccon */
    141 	CC2,	{ C_XX },	CC1,
    142 	CC2,	{ '\n' },	S_STNL,
    143 	CC2,	{ EOFC },	S_EOFSTR,
    144 
    145 	/* saw /, perhaps start of comment */
    146 	COM1,	{ C_XX },	ACT(SLASH, S_SELFB),
    147 	COM1,	{ '=' },	ACT(ASSLASH, S_SELF),
    148 	COM1,	{ '*' },	COM2,
    149 	COM1,	{ '/' },	COM4,
    150 
    151 	/* saw / then *, start of comment */
    152 	COM2,	{ C_XX },	COM2,
    153 	COM2,	{ '\n' },	S_COMNL,
    154 	COM2,	{ '*' },	COM3,
    155 	COM2,	{ EOFC },	S_EOFCOM,
    156 
    157 	/* saw the * possibly ending a comment */
    158 	COM3,	{ C_XX },	COM2,
    159 	COM3,	{ '\n' },	S_COMNL,
    160 	COM3,	{ '*' },	COM3,
    161 	COM3,	{ '/' },	S_COMMENT,
    162 
    163 	/* // comment */
    164 	COM4,	{ C_XX },	COM4,
    165 	COM4,	{ '\n' },	S_NL,
    166 	COM4,	{ EOFC },	S_EOFCOM,
    167 
    168 	/* saw white space, eat it up */
    169 	WS1,	{ C_XX },	S_WS,
    170 	WS1,	{ ' ', '\t', '\v' },	WS1,
    171 
    172 	/* saw -, check --, -=, -> */
    173 	MINUS1,	{ C_XX },	ACT(MINUS, S_SELFB),
    174 	MINUS1,	{ '-' },	ACT(MMINUS, S_SELF),
    175 	MINUS1,	{ '=' },	ACT(ASMINUS,S_SELF),
    176 	MINUS1,	{ '>' },	ACT(ARROW,S_SELF),
    177 
    178 	/* saw +, check ++, += */
    179 	PLUS1,	{ C_XX },	ACT(PLUS, S_SELFB),
    180 	PLUS1,	{ '+' },	ACT(PPLUS, S_SELF),
    181 	PLUS1,	{ '=' },	ACT(ASPLUS, S_SELF),
    182 
    183 	/* saw <, check <<, <<=, <= */
    184 	LT1,	{ C_XX },	ACT(LT, S_SELFB),
    185 	LT1,	{ '<' },	LT2,
    186 	LT1,	{ '=' },	ACT(LEQ, S_SELF),
    187 	LT2,	{ C_XX },	ACT(LSH, S_SELFB),
    188 	LT2,	{ '=' },	ACT(ASLSH, S_SELF),
    189 
    190 	/* saw >, check >>, >>=, >= */
    191 	GT1,	{ C_XX },	ACT(GT, S_SELFB),
    192 	GT1,	{ '>' },	GT2,
    193 	GT1,	{ '=' },	ACT(GEQ, S_SELF),
    194 	GT2,	{ C_XX },	ACT(RSH, S_SELFB),
    195 	GT2,	{ '=' },	ACT(ASRSH, S_SELF),
    196 
    197 	/* = */
    198 	ASG1,	{ C_XX },	ACT(ASGN, S_SELFB),
    199 	ASG1,	{ '=' },	ACT(EQ, S_SELF),
    200 
    201 	/* ! */
    202 	NOT1,	{ C_XX },	ACT(NOT, S_SELFB),
    203 	NOT1,	{ '=' },	ACT(NEQ, S_SELF),
    204 
    205 	/* & */
    206 	AND1,	{ C_XX },	ACT(AND, S_SELFB),
    207 	AND1,	{ '&' },	ACT(LAND, S_SELF),
    208 	AND1,	{ '=' },	ACT(ASAND, S_SELF),
    209 
    210 	/* | */
    211 	OR1,	{ C_XX },	ACT(OR, S_SELFB),
    212 	OR1,	{ '|' },	ACT(LOR, S_SELF),
    213 	OR1,	{ '=' },	ACT(ASOR, S_SELF),
    214 
    215 	/* # */
    216 	SHARP1,	{ C_XX },	ACT(SHARP, S_SELFB),
    217 	SHARP1,	{ '#' },	ACT(DSHARP, S_SELF),
    218 
    219 	/* % */
    220 	PCT1,	{ C_XX },	ACT(PCT, S_SELFB),
    221 	PCT1,	{ '=' },	ACT(ASPCT, S_SELF),
    222 
    223 	/* * */
    224 	STAR1,	{ C_XX },	ACT(STAR, S_SELFB),
    225 	STAR1,	{ '=' },	ACT(ASSTAR, S_SELF),
    226 
    227 	/* ^ */
    228 	CIRC1,	{ C_XX },	ACT(CIRC, S_SELFB),
    229 	CIRC1,	{ '=' },	ACT(ASCIRC, S_SELF),
    230 
    231 	-1
    232 };
    233 
    234 /* first index is char, second is state */
    235 /* increase #states to power of 2 to encourage use of shift */
    236 short	bigfsm[256][MAXSTATE];
    237 
    238 void
    239 expandlex(void)
    240 {
    241 	/*const*/ struct fsm *fp;
    242 	int i, j, nstate;
    243 
    244 	for (fp = fsm; fp->state>=0; fp++) {
    245 		for (i=0; fp->ch[i]; i++) {
    246 			nstate = fp->nextstate;
    247 			if (nstate >= S_SELF)
    248 				nstate = ~nstate;
    249 			switch (fp->ch[i]) {
    250 
    251 			case C_XX:		/* random characters */
    252 				for (j=0; j<256; j++)
    253 					bigfsm[j][fp->state] = nstate;
    254 				continue;
    255 			case C_ALPH:
    256 				for (j=0; j<=256; j++)
    257 					if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z'
    258 					  || j=='_')
    259 						bigfsm[j][fp->state] = nstate;
    260 				continue;
    261 			case C_NUM:
    262 				for (j='0'; j<='9'; j++)
    263 					bigfsm[j][fp->state] = nstate;
    264 				continue;
    265 			default:
    266 				bigfsm[fp->ch[i]][fp->state] = nstate;
    267 			}
    268 		}
    269 	}
    270 	/* install special cases for ? (trigraphs),  \ (splicing), runes, and EOB */
    271 	for (i=0; i<MAXSTATE; i++) {
    272 		for (j=0; j<0xFF; j++)
    273 			if (j=='?' || j=='\\') {
    274 				if (bigfsm[j][i]>0)
    275 					bigfsm[j][i] = ~bigfsm[j][i];
    276 				bigfsm[j][i] &= ~QBSBIT;
    277 			}
    278 		bigfsm[EOB][i] = ~S_EOB;
    279 		if (bigfsm[EOFC][i]>=0)
    280 			bigfsm[EOFC][i] = ~S_EOF;
    281 	}
    282 }
    283 
    284 void
    285 fixlex(void)
    286 {
    287 	/* do C++ comments? */
    288 	if (Cplusplus==0)
    289 		bigfsm['/'][COM1] = bigfsm['x'][COM1];
    290 }
    291 
    292 /*
    293  * fill in a row of tokens from input, terminated by NL or END
    294  * First token is put at trp->lp.
    295  * Reset is non-zero when the input buffer can be "rewound."
    296  * The value is a flag indicating that possible macros have
    297  * been seen in the row.
    298  */
    299 int
    300 gettokens(Tokenrow *trp, int reset)
    301 {
    302 	register int c, state, oldstate;
    303 	register uchar *ip;
    304 	register Token *tp, *maxp;
    305 	int runelen;
    306 	Source *s = cursource;
    307 	int nmac = 0;
    308 	extern char outbuf[];
    309 
    310 	tp = trp->lp;
    311 	ip = s->inp;
    312 	if (reset) {
    313 		s->lineinc = 0;
    314 		if (ip>=s->inl) {		/* nothing in buffer */
    315 			s->inl = s->inb;
    316 			fillbuf(s);
    317 			ip = s->inp = s->inb;
    318 		} else if (ip >= s->inb+(3*INS/4)) {
    319 			memmove(s->inb, ip, 4+s->inl-ip);
    320 			s->inl = s->inb+(s->inl-ip);
    321 			ip = s->inp = s->inb;
    322 		}
    323 	}
    324 	maxp = &trp->bp[trp->max];
    325 	runelen = 1;
    326 	for (;;) {
    327 	   continue2:
    328 		if (tp>=maxp) {
    329 			trp->lp = tp;
    330 			tp = growtokenrow(trp);
    331 			maxp = &trp->bp[trp->max];
    332 		}
    333 		tp->type = UNCLASS;
    334 		tp->hideset = 0;
    335 		tp->t = ip;
    336 		tp->wslen = 0;
    337 		tp->flag = 0;
    338 		state = START;
    339 		for (;;) {
    340 			oldstate = state;
    341 			c = *ip;
    342 			if ((state = bigfsm[c][state]) >= 0) {
    343 				ip += runelen;
    344 				runelen = 1;
    345 				continue;
    346 			}
    347 			state = ~state;
    348 		reswitch:
    349 			switch (state&0177) {
    350 			case S_SELF:
    351 				ip += runelen;
    352 				runelen = 1;
    353 			case S_SELFB:
    354 				tp->type = GETACT(state);
    355 				tp->len = ip - tp->t;
    356 				tp++;
    357 				goto continue2;
    358 
    359 			case S_NAME:	/* like S_SELFB but with nmac check */
    360 				tp->type = NAME;
    361 				tp->len = ip - tp->t;
    362 				nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0);
    363 				tp++;
    364 				goto continue2;
    365 
    366 			case S_WS:
    367 				tp->wslen = ip - tp->t;
    368 				tp->t = ip;
    369 				state = START;
    370 				continue;
    371 
    372 			default:
    373 				if ((state&QBSBIT)==0) {
    374 					ip += runelen;
    375 					runelen = 1;
    376 					continue;
    377 				}
    378 				state &= ~QBSBIT;
    379 				s->inp = ip;
    380 				if (c=='?') { 	/* check trigraph */
    381 					if (trigraph(s)) {
    382 						state = oldstate;
    383 						continue;
    384 					}
    385 					goto reswitch;
    386 				}
    387 				if (c=='\\') { /* line-folding */
    388 					if (foldline(s)) {
    389 						s->lineinc++;
    390 						state = oldstate;
    391 						continue;
    392 					}
    393 					goto reswitch;
    394 				}
    395 				error(WARNING, "Lexical botch in cpp");
    396 				ip += runelen;
    397 				runelen = 1;
    398 				continue;
    399 
    400 			case S_EOB:
    401 				s->inp = ip;
    402 				fillbuf(cursource);
    403 				state = oldstate;
    404 				continue;
    405 
    406 			case S_EOF:
    407 				tp->type = END;
    408 				tp->len = 0;
    409 				s->inp = ip;
    410 				if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1)
    411 					error(WARNING,"No newline at end of file");
    412 				trp->lp = tp+1;
    413 				return nmac;
    414 
    415 			case S_STNL:
    416 				error(ERROR, "Unterminated string or char const");
    417 			case S_NL:
    418 				tp->t = ip;
    419 				tp->type = NL;
    420 				tp->len = 1;
    421 				tp->wslen = 0;
    422 				s->lineinc++;
    423 				s->inp = ip+1;
    424 				trp->lp = tp+1;
    425 				return nmac;
    426 
    427 			case S_EOFSTR:
    428 				error(FATAL, "EOF in string or char constant");
    429 				break;
    430 
    431 			case S_COMNL:
    432 				s->lineinc++;
    433 				state = COM2;
    434 				ip += runelen;
    435 				runelen = 1;
    436 				if (ip >= s->inb+(7*INS/8)) { /* very long comment */
    437 					memmove(tp->t, ip, 4+s->inl-ip);
    438 					s->inl -= ip-tp->t;
    439 					ip = tp->t+1;
    440 				}
    441 				continue;
    442 
    443 			case S_EOFCOM:
    444 				error(WARNING, "EOF inside comment");
    445 				--ip;
    446 			case S_COMMENT:
    447 				++ip;
    448 				tp->t = ip;
    449 				tp->t[-1] = ' ';
    450 				tp->wslen = 1;
    451 				state = START;
    452 				continue;
    453 			}
    454 			break;
    455 		}
    456 		ip += runelen;
    457 		runelen = 1;
    458 		tp->len = ip - tp->t;
    459 		tp++;
    460 	}
    461 }
    462 
    463 /* have seen ?; handle the trigraph it starts (if any) else 0 */
    464 int
    465 trigraph(Source *s)
    466 {
    467 	int c;
    468 
    469 	while (s->inp+2 >= s->inl && fillbuf(s)!=EOF)
    470 		;
    471 	if (s->inp[1]!='?')
    472 		return 0;
    473 	c = 0;
    474 	switch(s->inp[2]) {
    475 	case '=':
    476 		c = '#'; break;
    477 	case '(':
    478 		c = '['; break;
    479 	case '/':
    480 		c = '\\'; break;
    481 	case ')':
    482 		c = ']'; break;
    483 	case '\'':
    484 		c = '^'; break;
    485 	case '<':
    486 		c = '{'; break;
    487 	case '!':
    488 		c = '|'; break;
    489 	case '>':
    490 		c = '}'; break;
    491 	case '-':
    492 		c = '~'; break;
    493 	}
    494 	if (c) {
    495 		*s->inp = c;
    496 		memmove(s->inp+1, s->inp+3, s->inl-s->inp+2);
    497 		s->inl -= 2;
    498 	}
    499 	return c;
    500 }
    501 
    502 int
    503 foldline(Source *s)
    504 {
    505 	while (s->inp+1 >= s->inl && fillbuf(s)!=EOF)
    506 		;
    507 	if (s->inp[1] == '\n') {
    508 		memmove(s->inp, s->inp+2, s->inl-s->inp+3);
    509 		s->inl -= 2;
    510 		return 1;
    511 	}
    512 	return 0;
    513 }
    514 
    515 int
    516 fillbuf(Source *s)
    517 {
    518 	int n, nr;
    519 
    520 	nr = INS/8;
    521 	if ((char *)s->inl+nr > (char *)s->inb+INS)
    522 		error(FATAL, "Input buffer overflow");
    523 	if (s->fd<0 || (n=read(s->fd, (char *)s->inl, INS/8)) <= 0)
    524 		n = 0;
    525 	if ((*s->inp&0xff) == EOB) /* sentinel character appears in input */
    526 		*s->inp = EOFC;
    527 	s->inl += n;
    528 	s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB;
    529 	if (n==0) {
    530 		s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC;
    531 		return EOF;
    532 	}
    533 	return 0;
    534 }
    535 
    536 /*
    537  * Push down to new source of characters.
    538  * If fd>0 and str==NULL, then from a file `name';
    539  * if fd==-1 and str, then from the string.
    540  */
    541 Source *
    542 setsource(char *name, int fd, char *str)
    543 {
    544 	Source *s = new(Source);
    545 	int len;
    546 
    547 	s->line = 1;
    548 	s->lineinc = 0;
    549 	s->fd = fd;
    550 	s->filename = name;
    551 	s->next = cursource;
    552 	s->ifdepth = 0;
    553 	cursource = s;
    554 	/* slop at right for EOB */
    555 	if (str) {
    556 		len = strlen(str);
    557 		s->inb = domalloc(len+4);
    558 		s->inp = s->inb;
    559 		strncpy((char *)s->inp, str, len);
    560 	} else {
    561 		s->inb = domalloc(INS+4);
    562 		s->inp = s->inb;
    563 		len = 0;
    564 	}
    565 	s->inl = s->inp+len;
    566 	s->inl[0] = s->inl[1] = EOB;
    567 	return s;
    568 }
    569 
    570 void
    571 unsetsource(void)
    572 {
    573 	Source *s = cursource;
    574 
    575 	if (s->fd>=0) {
    576 		close(s->fd);
    577 		dofree(s->inb);
    578 	}
    579 	cursource = s->next;
    580 	dofree(s);
    581 }