a2223c7302
doesn't depend on the 'comp' set. ok espie@ deraadt@
1021 lines
25 KiB
C
1021 lines
25 KiB
C
/*
|
|
* (c) Thomas Pornin 1999 - 2002
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 4. The name of the authors may not be used to endorse or promote
|
|
* products derived from this software without specific prior written
|
|
* permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
|
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
|
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
|
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
|
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
*/
|
|
|
|
#include "tune.h"
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <stddef.h>
|
|
#include <limits.h>
|
|
#include "ucppi.h"
|
|
#include "mem.h"
|
|
#ifdef UCPP_MMAP
|
|
#include <unistd.h>
|
|
#include <sys/types.h>
|
|
#include <sys/mman.h>
|
|
#endif
|
|
|
|
/*
|
|
* Character classes for description of the automaton.
|
|
* The characters used for representing classes should not appear
|
|
* explicitly in an automaton rule.
|
|
*/
|
|
#define SPC ' ' /* whitespace characters */
|
|
#define ALP 'Z' /* A-Z, a-z, _ */
|
|
#define NUM '9' /* 0-9 */
|
|
#define ANY 'Y' /* any character */
|
|
#define VCH 'F' /* void character (for end of input) */
|
|
|
|
/*
 * An automaton action is a state or token number in the low eight bits,
 * optionally combined with modifier flags:
 *   STO  the string read so far is a complete token
 *   FRZ  the current character must be kept and read again
 *   PUT  the current character must be appended to the token string
 * X(v) sets flag X on v, ttX(v) tests it (non-zero when set), and
 * noMOD(v) strips every flag.
 */
#define MOD_MK		0xFF
#define noMOD(x)	((x) & MOD_MK)
#define STO(x)		((x) | 0x100)
#define ttSTO(x)	((x) & 0x100)
#define FRZ(x)		((x) | 0x200)
#define ttFRZ(x)	((x) & 0x200)
#define PUT(x)		((x) | 0x400)
#define ttPUT(x)	((x) & 0x400)
|
|
|
|
/*
 * Automaton states. The order is important: everything before MSTATE
 * is a real state (a valid first index into cppm[][] and cppm_vch[]),
 * the comment states must stay contiguous for CMT(), and everything
 * after MSTATE is a pseudo-state that read_token() resolves by hand.
 */
enum {
	/* real states */
	S_START,
	S_SPACE,
	S_BANG,
	S_STRING, S_STRING2,
	S_COLON,
	S_SHARP,
	S_PCT, S_PCT2, S_PCT3,
	S_AMPER,
	S_CHAR, S_CHAR2,
	S_STAR,
	S_PLUS,
	S_MINUS,
	S_DOT, S_DOT2,
	S_SLASH,
	S_NUMBER, S_NUMBER2,
	S_LT, S_LT2,
	S_EQ,
	S_GT, S_GT2,
	S_CIRC,
	S_PIPE,
	S_BACKSLASH,
	S_COMMENT, S_COMMENT2, S_COMMENT3, S_COMMENT4, S_COMMENT5,
	S_NAME, S_NAME_BS,
	S_LCHAR,

	/* number of real states */
	MSTATE,

	/* pseudo-states */
	S_ILL,
	S_DDOT,
	S_DDSHARP,
	S_BS,
	S_ROGUE_BS,
	S_BEHEAD,
	S_DECAY,
	S_TRUNC,
	S_TRUNCC,
	S_OUCH
};
|
|
|
|
#define CMT(x) ((x) >= S_COMMENT && (x) <= S_COMMENT5)
|
|
|
|
#define CMCR 2
|
|
|
|
/*
|
|
* This is the description of the automaton. It is not used "as is"
|
|
* but copied at execution time into a table.
|
|
*
|
|
* To my utmost displeasure, there are a few hacks in read_token()
|
|
* (which uses the transformed automaton) about the special handling
|
|
* of slashes, sharps, and the letter L.
|
|
*/
|
|
static struct machine_state {
|
|
int state;
|
|
unsigned char input[CMCR];
|
|
int new_state;
|
|
} cppms[] = {
|
|
/* S_START is the generic beginning state */
|
|
{ S_START, { ANY }, S_ILL },
|
|
#ifdef SEMPER_FIDELIS
|
|
{ S_START, { SPC }, PUT(S_SPACE) },
|
|
#else
|
|
{ S_START, { SPC }, S_SPACE },
|
|
#endif
|
|
{ S_START, { '\n' }, STO(NEWLINE) },
|
|
{ S_START, { '!' }, S_BANG },
|
|
{ S_START, { '"' }, PUT(S_STRING) },
|
|
{ S_START, { '#' }, S_SHARP },
|
|
{ S_START, { '%' }, S_PCT },
|
|
{ S_START, { '&' }, S_AMPER },
|
|
{ S_START, { '\'' }, PUT(S_CHAR) },
|
|
{ S_START, { '(' }, STO(LPAR) },
|
|
{ S_START, { ')' }, STO(RPAR) },
|
|
{ S_START, { '*' }, S_STAR },
|
|
{ S_START, { '+' }, S_PLUS },
|
|
{ S_START, { ',' }, STO(COMMA) },
|
|
{ S_START, { '-' }, S_MINUS },
|
|
{ S_START, { '.' }, PUT(S_DOT) },
|
|
#ifdef SEMPER_FIDELIS
|
|
{ S_START, { '/' }, PUT(S_SLASH) },
|
|
#else
|
|
{ S_START, { '/' }, S_SLASH },
|
|
#endif
|
|
{ S_START, { NUM }, PUT(S_NUMBER) },
|
|
{ S_START, { ':' }, S_COLON },
|
|
{ S_START, { ';' }, STO(SEMIC) },
|
|
{ S_START, { '<' }, S_LT },
|
|
{ S_START, { '=' }, S_EQ },
|
|
{ S_START, { '>' }, S_GT },
|
|
{ S_START, { '?' }, STO(QUEST) },
|
|
{ S_START, { ALP }, PUT(S_NAME) },
|
|
{ S_START, { 'L' }, PUT(S_LCHAR) },
|
|
{ S_START, { '[' }, STO(LBRK) },
|
|
{ S_START, { ']' }, STO(RBRK) },
|
|
{ S_START, { '^' }, S_CIRC },
|
|
{ S_START, { '{' }, STO(LBRA) },
|
|
{ S_START, { '|' }, S_PIPE },
|
|
{ S_START, { '}' }, STO(RBRA) },
|
|
{ S_START, { '~' }, STO(NOT) },
|
|
{ S_START, { '\\' }, S_BACKSLASH },
|
|
|
|
/* after a space */
|
|
{ S_SPACE, { ANY }, FRZ(STO(NONE)) },
|
|
#ifdef SEMPER_FIDELIS
|
|
{ S_SPACE, { SPC }, PUT(S_SPACE) },
|
|
#else
|
|
{ S_SPACE, { SPC }, S_SPACE },
|
|
#endif
|
|
|
|
/* after a ! */
|
|
{ S_BANG, { ANY }, FRZ(STO(LNOT)) },
|
|
{ S_BANG, { '=' }, STO(NEQ) },
|
|
|
|
/* after a " */
|
|
{ S_STRING, { ANY }, PUT(S_STRING) },
|
|
{ S_STRING, { VCH }, FRZ(S_TRUNC) },
|
|
{ S_STRING, { '\n' }, FRZ(S_BEHEAD) },
|
|
{ S_STRING, { '\\' }, PUT(S_STRING2) },
|
|
{ S_STRING, { '"' }, PUT(STO(STRING)) },
|
|
|
|
{ S_STRING2, { ANY }, PUT(S_STRING) },
|
|
{ S_STRING2, { VCH }, FRZ(S_TRUNC) },
|
|
|
|
/* after a # */
|
|
{ S_SHARP, { ANY }, FRZ(STO(SHARP)) },
|
|
{ S_SHARP, { '#' }, STO(DSHARP) },
|
|
|
|
/* after a : */
|
|
{ S_COLON, { ANY }, FRZ(STO(COLON)) },
|
|
{ S_COLON, { '>' }, STO(DIG_RBRK) },
|
|
|
|
/* after a % */
|
|
{ S_PCT, { ANY }, FRZ(STO(PCT)) },
|
|
{ S_PCT, { '=' }, STO(ASPCT) },
|
|
{ S_PCT, { '>' }, STO(DIG_RBRA) },
|
|
{ S_PCT, { ':' }, S_PCT2 },
|
|
|
|
/* after a %: */
|
|
{ S_PCT2, { ANY }, FRZ(STO(DIG_SHARP)) },
|
|
{ S_PCT2, { '%' }, S_PCT3 },
|
|
|
|
/* after a %:% */
|
|
{ S_PCT3, { ANY }, FRZ(S_DDSHARP) },
|
|
{ S_PCT3, { ':' }, STO(DIG_DSHARP) },
|
|
|
|
/* after a & */
|
|
{ S_AMPER, { ANY }, FRZ(STO(AND)) },
|
|
{ S_AMPER, { '=' }, STO(ASAND) },
|
|
{ S_AMPER, { '&' }, STO(LAND) },
|
|
|
|
/* after a ' */
|
|
{ S_CHAR, { ANY }, PUT(S_CHAR) },
|
|
{ S_CHAR, { VCH }, FRZ(S_TRUNC) },
|
|
{ S_CHAR, { '\'' }, PUT(STO(CHAR)) },
|
|
{ S_CHAR, { '\\' }, PUT(S_CHAR2) },
|
|
|
|
/* after a \ in a character constant
|
|
useful only for '\'' */
|
|
{ S_CHAR2, { ANY }, PUT(S_CHAR) },
|
|
{ S_CHAR2, { VCH }, FRZ(S_TRUNC) },
|
|
|
|
/* after a * */
|
|
{ S_STAR, { ANY }, FRZ(STO(STAR)) },
|
|
{ S_STAR, { '=' }, STO(ASSTAR) },
|
|
|
|
/* after a + */
|
|
{ S_PLUS, { ANY }, FRZ(STO(PLUS)) },
|
|
{ S_PLUS, { '+' }, STO(PPLUS) },
|
|
{ S_PLUS, { '=' }, STO(ASPLUS) },
|
|
|
|
/* after a - */
|
|
{ S_MINUS, { ANY }, FRZ(STO(MINUS)) },
|
|
{ S_MINUS, { '-' }, STO(MMINUS) },
|
|
{ S_MINUS, { '=' }, STO(ASMINUS) },
|
|
{ S_MINUS, { '>' }, STO(ARROW) },
|
|
|
|
/* after a . */
|
|
{ S_DOT, { ANY }, FRZ(STO(DOT)) },
|
|
{ S_DOT, { NUM }, PUT(S_NUMBER) },
|
|
{ S_DOT, { '.' }, S_DOT2 },
|
|
|
|
/* after .. */
|
|
{ S_DOT2, { ANY }, FRZ(S_DDOT) },
|
|
{ S_DOT2, { '.' }, STO(MDOTS) },
|
|
|
|
/* after a / */
|
|
{ S_SLASH, { ANY }, FRZ(STO(SLASH)) },
|
|
{ S_SLASH, { '=' }, STO(ASSLASH) },
|
|
#ifdef SEMPER_FIDELIS
|
|
{ S_SLASH, { '*' }, PUT(S_COMMENT) },
|
|
{ S_SLASH, { '/' }, PUT(S_COMMENT5) },
|
|
#else
|
|
{ S_SLASH, { '*' }, S_COMMENT },
|
|
{ S_SLASH, { '/' }, S_COMMENT5 },
|
|
#endif
|
|
/*
|
|
* There is a little hack in read_token() to disable
|
|
* this last rule, if C++ (C99) comments are not enabled.
|
|
*/
|
|
|
|
/* after a number */
|
|
{ S_NUMBER, { ANY }, FRZ(STO(NUMBER)) },
|
|
{ S_NUMBER, { ALP, NUM }, PUT(S_NUMBER) },
|
|
{ S_NUMBER, { '.' }, PUT(S_NUMBER) },
|
|
{ S_NUMBER, { 'E', 'e' }, PUT(S_NUMBER2) },
|
|
{ S_NUMBER, { 'P', 'p' }, PUT(S_NUMBER2) },
|
|
|
|
{ S_NUMBER2, { ANY }, FRZ(STO(NUMBER)) },
|
|
{ S_NUMBER2, { ALP, NUM }, PUT(S_NUMBER) },
|
|
{ S_NUMBER2, { '+', '-' }, PUT(S_NUMBER) },
|
|
|
|
/* after a < */
|
|
{ S_LT, { ANY }, FRZ(STO(LT)) },
|
|
{ S_LT, { '=' }, STO(LEQ) },
|
|
{ S_LT, { '<' }, S_LT2 },
|
|
{ S_LT, { ':' }, STO(DIG_LBRK) },
|
|
{ S_LT, { '%' }, STO(DIG_LBRA) },
|
|
|
|
{ S_LT2, { ANY }, FRZ(STO(LSH)) },
|
|
{ S_LT2, { '=' }, STO(ASLSH) },
|
|
|
|
/* after a > */
|
|
{ S_GT, { ANY }, FRZ(STO(GT)) },
|
|
{ S_GT, { '=' }, STO(GEQ) },
|
|
{ S_GT, { '>' }, S_GT2 },
|
|
|
|
{ S_GT2, { ANY }, FRZ(STO(RSH)) },
|
|
{ S_GT2, { '=' }, STO(ASRSH) },
|
|
|
|
/* after a = */
|
|
{ S_EQ, { ANY }, FRZ(STO(ASGN)) },
|
|
{ S_EQ, { '=' }, STO(SAME) },
|
|
#ifdef CAST_OP
|
|
{ S_EQ, { '>' }, STO(CAST) },
|
|
#endif
|
|
|
|
/* after a \ */
|
|
{ S_BACKSLASH, { ANY }, FRZ(S_BS) },
|
|
{ S_BACKSLASH, { 'U', 'u' }, FRZ(S_NAME_BS) },
|
|
|
|
/* after a letter */
|
|
{ S_NAME, { ANY }, FRZ(STO(NAME)) },
|
|
{ S_NAME, { ALP, NUM }, PUT(S_NAME) },
|
|
{ S_NAME, { '\\' }, S_NAME_BS },
|
|
|
|
/* after a \ in an identifier */
|
|
{ S_NAME_BS, { ANY }, FRZ(S_ROGUE_BS) },
|
|
{ S_NAME_BS, { 'u', 'U' }, PUT(S_NAME) },
|
|
|
|
/* after a L */
|
|
{ S_LCHAR, { ANY }, FRZ(S_NAME) },
|
|
{ S_LCHAR, { '"' }, PUT(S_STRING) },
|
|
{ S_LCHAR, { '\'' }, PUT(S_CHAR) },
|
|
|
|
/* after a ^ */
|
|
{ S_CIRC, { ANY }, FRZ(STO(CIRC)) },
|
|
{ S_CIRC, { '=' }, STO(ASCIRC) },
|
|
|
|
/* after a | */
|
|
{ S_PIPE, { ANY }, FRZ(STO(OR)) },
|
|
{ S_PIPE, { '=' }, STO(ASOR) },
|
|
{ S_PIPE, { '|' }, STO(LOR) },
|
|
|
|
/* after a / and * */
|
|
#ifdef SEMPER_FIDELIS
|
|
{ S_COMMENT, { ANY }, PUT(S_COMMENT) },
|
|
{ S_COMMENT, { VCH }, FRZ(S_TRUNCC) },
|
|
{ S_COMMENT, { '*' }, PUT(S_COMMENT2) },
|
|
|
|
{ S_COMMENT2, { ANY }, FRZ(S_COMMENT) },
|
|
{ S_COMMENT2, { VCH }, FRZ(S_TRUNCC) },
|
|
{ S_COMMENT2, { '*' }, PUT(S_COMMENT2) },
|
|
{ S_COMMENT2, { '/' }, STO(PUT(COMMENT)) },
|
|
|
|
{ S_COMMENT5, { ANY }, PUT(S_COMMENT5) },
|
|
{ S_COMMENT5, { VCH }, FRZ(S_DECAY) },
|
|
{ S_COMMENT5, { '\n' }, FRZ(STO(COMMENT)) },
|
|
#else
|
|
{ S_COMMENT, { ANY }, S_COMMENT },
|
|
{ S_COMMENT, { VCH }, FRZ(S_TRUNCC) },
|
|
{ S_COMMENT, { '*' }, S_COMMENT2 },
|
|
|
|
{ S_COMMENT2, { ANY }, FRZ(S_COMMENT) },
|
|
{ S_COMMENT2, { VCH }, FRZ(S_TRUNCC) },
|
|
{ S_COMMENT2, { '*' }, S_COMMENT2 },
|
|
{ S_COMMENT2, { '/' }, STO(COMMENT) },
|
|
|
|
{ S_COMMENT5, { ANY }, S_COMMENT5 },
|
|
{ S_COMMENT5, { VCH }, FRZ(S_DECAY) },
|
|
{ S_COMMENT5, { '\n' }, FRZ(STO(COMMENT)) },
|
|
#endif
|
|
|
|
/* dummy end of machine description */
|
|
{ 0, { 0 }, 0 }
|
|
};
|
|
|
|
/*
|
|
* cppm is the table used to store the automaton: if we are in state s
|
|
* and we read character c, we apply the action cppm[s][c] (jumping to
|
|
* another state, or emitting a token).
|
|
* cppm_vch is the table for the special virtual character "end of input"
|
|
*/
|
|
static int cppm[MSTATE][MAX_CHAR_VAL];
|
|
static int cppm_vch[MSTATE];
|
|
|
|
/*
|
|
* init_cppm() fills cppm[][] with the information stored in cppms[].
|
|
* It must be called before beginning the lexing process.
|
|
*/
|
|
void init_cppm(void)
|
|
{
|
|
int i, j, k, c;
|
|
static unsigned char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
|
static unsigned char lower[] = "abcdefghijklmnopqrstuvwxyz";
|
|
unsigned char *cp;
|
|
|
|
for (i = 0; i < MSTATE; i ++) {
|
|
for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[i][j] = S_OUCH;
|
|
cppm_vch[i] = S_OUCH;
|
|
}
|
|
for (i = 0; cppms[i].input[0]; i ++) for (k = 0; k < CMCR; k ++) {
|
|
int s = cppms[i].state;
|
|
int ns = cppms[i].new_state;
|
|
|
|
switch (c = cppms[i].input[k]) {
|
|
case 0:
|
|
break;
|
|
case SPC:
|
|
/* see space_char() also */
|
|
cppm[s][' '] = ns;
|
|
cppm[s]['\t'] = ns;
|
|
cppm[s]['\v'] = ns;
|
|
cppm[s]['\f'] = ns;
|
|
#ifdef UNBREAKABLE_SPACE
|
|
if (MAX_CHAR_VAL > UNBREAKABLE_SPACE)
|
|
cppm[s][UNBREAKABLE_SPACE] = ns;
|
|
#endif
|
|
break;
|
|
case ALP:
|
|
for (cp = upper; *cp; cp ++) cppm[s][(int)*cp] = ns;
|
|
for (cp = lower; *cp; cp ++) cppm[s][(int)*cp] = ns;
|
|
cppm[s]['_'] = ns;
|
|
break;
|
|
case NUM:
|
|
for (j = '0'; j <= '9'; j ++) cppm[s][j] = ns;
|
|
break;
|
|
case ANY:
|
|
for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[s][j] = ns;
|
|
cppm_vch[s] = ns;
|
|
break;
|
|
case VCH:
|
|
cppm_vch[s] = ns;
|
|
break;
|
|
default:
|
|
cppm[s][c] = ns;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Make some character as equivalent to a letter for identifiers.
|
|
*/
|
|
void set_identifier_char(int c)
|
|
{
|
|
cppm[S_START][c] = PUT(S_NAME);
|
|
cppm[S_NAME][c] = PUT(S_NAME);
|
|
}
|
|
|
|
/*
|
|
* Remove the "identifier" status from a character.
|
|
*/
|
|
void unset_identifier_char(int c)
|
|
{
|
|
cppm[S_START][c] = S_ILL;
|
|
cppm[S_NAME][c] = FRZ(STO(NAME));
|
|
}
|
|
|
|
/*
 * Tell whether c is a whitespace character other than newline: space,
 * horizontal tab, vertical tab, form feed and, when configured,
 * UNBREAKABLE_SPACE. This is the same set as the SPC class handled in
 * init_cppm(). Returns 1 for such a character, 0 otherwise.
 */
int space_char(int c)
{
	int blank = (c == ' ') || (c == '\t') || (c == '\v') || (c == '\f');

#ifdef UNBREAKABLE_SPACE
	if (c == UNBREAKABLE_SPACE) blank = 1;
#endif
	return blank ? 1 : 0;
}
|
|
|
|
#ifndef NO_UCPP_BUF
|
|
/*
|
|
* our output buffer is full, flush it
|
|
*/
|
|
void flush_output(struct lexer_state *ls)
|
|
{
|
|
size_t x = ls->sbuf, y = 0, z;
|
|
|
|
if (ls->sbuf == 0) return;
|
|
do {
|
|
z = fwrite(ls->output_buf + y, 1, x, ls->output);
|
|
x -= z;
|
|
y += z;
|
|
} while (z && x > 0);
|
|
if (!y) {
|
|
error(ls->line, "could not flush output (disk full ?)");
|
|
die();
|
|
}
|
|
ls->sbuf = 0;
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Output one character; flush the buffer if needed.
|
|
* This function should not be called, except by put_char().
|
|
*/
|
|
static inline void write_char(struct lexer_state *ls, unsigned char c)
|
|
{
|
|
#ifndef NO_UCPP_BUF
|
|
ls->output_buf[ls->sbuf ++] = c;
|
|
if (ls->sbuf == OUTPUT_BUF_MEMG) flush_output(ls);
|
|
#else
|
|
if (putc((int)c, ls->output) == EOF) {
|
|
error(ls->line, "output write error (disk full ?)");
|
|
die();
|
|
}
|
|
#endif
|
|
if (c == '\n') {
|
|
ls->oline ++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* schedule a character for output
|
|
*/
|
|
void put_char(struct lexer_state *ls, unsigned char c)
|
|
{
|
|
if (ls->flags & KEEP_OUTPUT) write_char(ls, c);
|
|
}
|
|
|
|
/*
|
|
* get next raw input character
|
|
*/
|
|
static inline int read_char(struct lexer_state *ls)
|
|
{
|
|
unsigned char c;
|
|
|
|
if (!ls->input) {
|
|
return ((ls->pbuf ++) < ls->ebuf) ?
|
|
ls->input_string[ls->pbuf - 1] : -1;
|
|
}
|
|
while (1) {
|
|
#ifndef NO_UCPP_BUF
|
|
if (ls->pbuf == ls->ebuf) {
|
|
#ifdef UCPP_MMAP
|
|
if (ls->from_mmap) {
|
|
munmap((void *)ls->input_buf, ls->ebuf);
|
|
ls->from_mmap = 0;
|
|
ls->input_buf = ls->input_buf_sav;
|
|
}
|
|
#endif
|
|
ls->ebuf = fread(ls->input_buf, 1,
|
|
INPUT_BUF_MEMG, ls->input);
|
|
ls->pbuf = 0;
|
|
}
|
|
if (ls->ebuf == 0) return -1;
|
|
c = ls->input_buf[ls->pbuf ++];
|
|
#else
|
|
int x = getc(ls->input);
|
|
|
|
if (x == EOF) return -1;
|
|
c = x;
|
|
#endif
|
|
if (ls->flags & COPY_LINE) {
|
|
if (c == '\n') {
|
|
ls->copy_line[ls->cli] = 0;
|
|
ls->cli = 0;
|
|
} else if (ls->cli < (COPY_LINE_LENGTH - 1)) {
|
|
ls->copy_line[ls->cli ++] = c;
|
|
}
|
|
}
|
|
if (ls->macfile && c == '\n') {
|
|
ls->macfile = 0;
|
|
continue;
|
|
}
|
|
ls->macfile = 0;
|
|
if (c == '\r') {
|
|
/*
|
|
* We found a '\r'; we handle it as a newline
|
|
* and ignore the next newline. This should work
|
|
* with all combinations of Msdos, MacIntosh and
|
|
* Unix files on these three platforms. On other
|
|
* platforms, native file formats are always
|
|
* supported.
|
|
*/
|
|
ls->macfile = 1;
|
|
c = '\n';
|
|
}
|
|
break;
|
|
}
|
|
return c;
|
|
}
|
|
|
|
/*
|
|
* next_fifo_char(), char_lka1() and char_lka2() give a two character
|
|
* look-ahead on the input stream; this is needed for trigraphs
|
|
*/
|
|
static inline int next_fifo_char(struct lexer_state *ls)
|
|
{
|
|
int c;
|
|
|
|
if (ls->nlka != 0) {
|
|
c = ls->lka[0];
|
|
ls->lka[0] = ls->lka[1];
|
|
ls->nlka --;
|
|
} else c = read_char(ls);
|
|
return c;
|
|
}
|
|
|
|
static inline int char_lka1(struct lexer_state *ls)
|
|
{
|
|
if (ls->nlka == 0) {
|
|
ls->lka[0] = read_char(ls);
|
|
ls->nlka ++;
|
|
}
|
|
return ls->lka[0];
|
|
}
|
|
|
|
static inline int char_lka2(struct lexer_state *ls)
|
|
{
|
|
#ifdef AUDIT
|
|
if (ls->nlka == 0) ouch("always in motion future is");
|
|
#endif
|
|
if (ls->nlka == 1) {
|
|
ls->lka[1] = read_char(ls);
|
|
ls->nlka ++;
|
|
}
|
|
return ls->lka[1];
|
|
}
|
|
|
|
/*
 * Trigraph table: the sequence "?" "?" followed by `old' stands for
 * the single character `new'.
 */
static struct trigraph {
	int old;
	int new;
} trig[9] = {
	{ '=',  '#'  },
	{ '/',  '\\' },
	{ '\'', '^'  },
	{ '(',  '['  },
	{ ')',  ']'  },
	{ '!',  '|'  },
	{ '<',  '{'  },
	{ '>',  '}'  },
	{ '-',  '~'  }
};
|
|
|
|
/*
|
|
* Returns the next character, after treatment of trigraphs and terminating
|
|
* backslashes. Return value is -1 if there is no more input.
|
|
*/
|
|
static inline int next_char(struct lexer_state *ls)
|
|
{
|
|
int c;
|
|
|
|
if (!ls->discard) return ls->last;
|
|
ls->discard = 0;
|
|
do {
|
|
c = next_fifo_char(ls);
|
|
/* check trigraphs */
|
|
if (c == '?' && char_lka1(ls) == '?'
|
|
&& (ls->flags & HANDLE_TRIGRAPHS)) {
|
|
int i, d;
|
|
|
|
d = char_lka2(ls);
|
|
for (i = 0; i < 9; i ++) if (d == trig[i].old) {
|
|
if (ls->flags & WARN_TRIGRAPHS) {
|
|
ls->count_trigraphs ++;
|
|
}
|
|
if (ls->flags & WARN_TRIGRAPHS_MORE) {
|
|
warning(ls->line, "trigraph ?""?%c "
|
|
"encountered", d);
|
|
}
|
|
next_fifo_char(ls);
|
|
next_fifo_char(ls);
|
|
c = trig[i].new;
|
|
break;
|
|
}
|
|
}
|
|
if (c == '\\' && char_lka1(ls) == '\n') {
|
|
ls->line ++;
|
|
next_fifo_char(ls);
|
|
} else if (c == '\r' && char_lka1(ls) == '\n') {
|
|
ls->line ++;
|
|
next_fifo_char(ls);
|
|
c = '\n';
|
|
return c;
|
|
} else {
|
|
ls->last = c;
|
|
return c;
|
|
}
|
|
} while (1);
|
|
}
|
|
|
|
/*
 * Exported wrapper for next_char(), to be called from outside
 * (used by #error, #include directives). Returns what next_char()
 * returns.
 */
int grap_char(struct lexer_state *ls)
{
	int c = next_char(ls);

	return c;
}
|
|
|
|
/*
|
|
* Discard the current character, so that the next call to next_char()
|
|
* will step into the input stream.
|
|
*/
|
|
void discard_char(struct lexer_state *ls)
|
|
{
|
|
#ifdef AUDIT
|
|
if (ls->discard) ouch("overcollecting garbage");
|
|
#endif
|
|
ls->discard = 1;
|
|
ls->utf8 = 0;
|
|
if (ls->last == '\n') ls->line ++;
|
|
}
|
|
|
|
/*
 * Convert an UTF-8 encoded character to a Universal Character Name
 * using \u (or \U when appropriate).
 *
 * utf8 holds the bytes of the encoded character packed big-endian
 * (first byte in the highest used position, last byte in the low eight
 * bits). The result is written to buf as a NUL-terminated string:
 *   - the character itself for values below 128,
 *   - \uxxxx for values up to 0xffff,
 *   - \U00xxxxxx above that,
 * with lower case hexadecimal digits. buf must have room for 11 bytes.
 * Returns the length of the string, terminator excluded (1, 6 or 10).
 */
static int utf8_to_string(unsigned char buf[], unsigned long utf8)
{
	static const char hex[] = "0123456789abcdef";
	unsigned long val;
	int n = 0, shift;

	if (utf8 & 0x80UL) {
		unsigned long x1, x2, x3, x4;

		/*
		 * x4 is always a continuation byte (6 payload bits).
		 * x3 and x2 are either continuation bytes or the lead
		 * byte of a 2- or 3-byte sequence; a lead byte still has
		 * bit 0x40 set after masking with 0x7f and keeps only 5
		 * or 4 payload bits. x1 can only be the lead byte of a
		 * 4-byte sequence (3 payload bits).
		 */
		x1 = (utf8 >> 24) & 0x07UL;
		x2 = (utf8 >> 16) & 0x7fUL;
		x3 = (utf8 >> 8) & 0x7fUL;
		x4 = utf8 & 0x3fUL;
		if (x2 & 0x40UL) x2 &= 0x0fUL;
		if (x3 & 0x40UL) x3 &= 0x1fUL;
		/* 3 + 6 + 6 + 6 bits: the lead bits sit above bit 17 */
		val = x4 | (x3 << 6) | (x2 << 12) | (x1 << 18);
	} else val = utf8;

	if (val < 128) {
		buf[0] = (unsigned char)val;
		buf[1] = 0;
		return 1;
	}

	buf[n ++] = '\\';
	if (val <= 0xffffUL) {
		/* short form: four hexadecimal digits */
		buf[n ++] = 'u';
		shift = 12;
	} else {
		/* long form: eight digits, the first two always zero */
		buf[n ++] = 'U';
		buf[n ++] = '0';
		buf[n ++] = '0';
		shift = 20;
	}
	for (; shift >= 0; shift -= 4)
		buf[n ++] = hex[(size_t)((val >> shift) & 0xfUL)];
	buf[n] = 0;
	return n;
}
|
|
|
|
/*
|
|
* Scan the identifier and put it in canonical form:
|
|
* -- tranform \U0000xxxx into \uxxxx
|
|
* -- inside \u and \U, make letters low case
|
|
* -- report (some) incorrect use of UCN
|
|
*/
|
|
static void canonize_id(struct lexer_state *ls, char *id)
|
|
{
|
|
char *c, *d;
|
|
|
|
for (c = d = id; *c;) {
|
|
if (*c == '\\') {
|
|
int i;
|
|
|
|
if (!*(c + 1)) goto canon_error;
|
|
if (*(c + 1) == 'U') {
|
|
for (i = 0; i < 8 && *(c + i + 2); i ++);
|
|
if (i != 8) goto canon_error;
|
|
*(d ++) = '\\';
|
|
c += 2;
|
|
for (i = 0; i < 4 && *(c + i) == '0'; i ++);
|
|
if (i == 4) {
|
|
*(d ++) = 'u';
|
|
c += 4;
|
|
} else {
|
|
*(d ++) = 'U';
|
|
i = 8;
|
|
}
|
|
for (; i > 0; i --) {
|
|
switch (*c) {
|
|
case 'A': *(d ++) = 'a'; break;
|
|
case 'B': *(d ++) = 'b'; break;
|
|
case 'C': *(d ++) = 'c'; break;
|
|
case 'D': *(d ++) = 'd'; break;
|
|
case 'E': *(d ++) = 'e'; break;
|
|
case 'F': *(d ++) = 'f'; break;
|
|
default: *(d ++) = *c; break;
|
|
}
|
|
c ++;
|
|
}
|
|
} else if (*(c + 1) == 'u') {
|
|
for (i = 0; i < 4 && *(c + i + 2); i ++);
|
|
if (i != 4) goto canon_error;
|
|
*(d ++) = '\\';
|
|
*(d ++) = 'u';
|
|
c += 2;
|
|
for (; i > 0; i --) {
|
|
switch (*c) {
|
|
case 'A': *(d ++) = 'a'; break;
|
|
case 'B': *(d ++) = 'b'; break;
|
|
case 'C': *(d ++) = 'c'; break;
|
|
case 'D': *(d ++) = 'd'; break;
|
|
case 'E': *(d ++) = 'e'; break;
|
|
case 'F': *(d ++) = 'f'; break;
|
|
default: *(d ++) = *c; break;
|
|
}
|
|
c ++;
|
|
}
|
|
} else goto canon_error;
|
|
continue;
|
|
}
|
|
*(d ++) = *(c ++);
|
|
}
|
|
*d = 0;
|
|
return;
|
|
|
|
canon_error:
|
|
for (; *c; *(d ++) = *(c ++));
|
|
if (ls->flags & WARN_STANDARD) {
|
|
warning(ls->line, "malformed identifier with UCN: '%s'", id);
|
|
}
|
|
*d = 0;
|
|
}
|
|
|
|
/*
|
|
* Run the automaton, in order to get the next token.
|
|
* This function should not be called, except by next_token()
|
|
*
|
|
* return value: 1 on error, 2 on end-of-file, 0 otherwise.
|
|
*/
|
|
static inline int read_token(struct lexer_state *ls)
|
|
{
|
|
int cstat = S_START, nstat;
|
|
size_t ltok = 0;
|
|
int c, outc = 0, ucn_in_id = 0;
|
|
int shift_state;
|
|
unsigned long utf8;
|
|
long l = ls->line;
|
|
|
|
ls->ctok->line = l;
|
|
if (ls->pending_token) {
|
|
if ((ls->ctok->type = ls->pending_token) == BUNCH) {
|
|
ls->ctok->name[0] = '\\';
|
|
ls->ctok->name[1] = 0;
|
|
}
|
|
ls->pending_token = 0;
|
|
return 0;
|
|
}
|
|
if (ls->flags & UTF8_SOURCE) {
|
|
utf8 = ls->utf8;
|
|
shift_state = 0;
|
|
}
|
|
if (!(ls->flags & LEXER) && (ls->flags & KEEP_OUTPUT))
|
|
for (; ls->line > ls->oline;) put_char(ls, '\n');
|
|
do {
|
|
c = next_char(ls);
|
|
if (c < 0) {
|
|
if ((ls->flags & UTF8_SOURCE) && shift_state) {
|
|
if (ls->flags & WARN_STANDARD)
|
|
warning(ls->line, "truncated UTF-8 "
|
|
"character");
|
|
shift_state = 0;
|
|
utf8 = 0;
|
|
}
|
|
if (cstat == S_START) return 2;
|
|
nstat = cppm_vch[cstat];
|
|
} else {
|
|
if (ls->flags & UTF8_SOURCE) {
|
|
if (shift_state) {
|
|
if ((c & 0xc0) != 0x80) {
|
|
if (ls->flags & WARN_STANDARD)
|
|
warning(ls->line,
|
|
"truncated "
|
|
"UTF-8 "
|
|
"character");
|
|
shift_state = 0;
|
|
utf8 = 0;
|
|
c = '_';
|
|
} else {
|
|
utf8 = (utf8 << 8) | c;
|
|
if (-- shift_state) {
|
|
ls->discard = 1;
|
|
continue;
|
|
}
|
|
c = '_';
|
|
}
|
|
} else if ((c & 0xc0) == 0xc0) {
|
|
if ((c & 0x30) == 0x30) {
|
|
shift_state = 3;
|
|
} else if (c & 0x20) {
|
|
shift_state = 2;
|
|
} else {
|
|
shift_state = 1;
|
|
}
|
|
utf8 = c;
|
|
ls->discard = 1;
|
|
continue;
|
|
} else utf8 = 0;
|
|
}
|
|
nstat = cppm[cstat][c < MAX_CHAR_VAL ? c : 0];
|
|
}
|
|
#ifdef AUDIT
|
|
if (nstat == S_OUCH) {
|
|
ouch("bad move...");
|
|
}
|
|
#endif
|
|
/*
|
|
* disable C++-like comments
|
|
*/
|
|
if (nstat == S_COMMENT5 && !(ls->flags & CPLUSPLUS_COMMENTS))
|
|
nstat = FRZ(STO(SLASH));
|
|
|
|
if (noMOD(nstat) >= MSTATE && !ttSTO(nstat))
|
|
switch (noMOD(nstat)) {
|
|
case S_ILL:
|
|
if (ls->flags & CCHARSET) {
|
|
error(ls->line, "illegal character '%c'", c);
|
|
return 1;
|
|
}
|
|
nstat = PUT(STO(BUNCH));
|
|
break;
|
|
case S_BS:
|
|
ls->ctok->name[0] = '\\';
|
|
ltok ++;
|
|
nstat = FRZ(STO(BUNCH));
|
|
if (!(ls->flags & LEXER)) put_char(ls, '\\');
|
|
break;
|
|
case S_ROGUE_BS:
|
|
ls->pending_token = BUNCH;
|
|
nstat = FRZ(STO(NAME));
|
|
break;
|
|
case S_DDOT:
|
|
ls->pending_token = DOT;
|
|
nstat = FRZ(STO(DOT));
|
|
break;
|
|
case S_DDSHARP:
|
|
ls->pending_token = PCT;
|
|
nstat = FRZ(STO(DIG_SHARP));
|
|
break;
|
|
case S_BEHEAD:
|
|
error(l, "unfinished string at end of line");
|
|
return 1;
|
|
case S_DECAY:
|
|
warning(l, "unterminated // comment");
|
|
nstat = FRZ(STO(COMMENT));
|
|
break;
|
|
case S_TRUNC:
|
|
error(l, "truncated token");
|
|
return 1;
|
|
case S_TRUNCC:
|
|
error(l, "truncated comment");
|
|
return 1;
|
|
#ifdef AUDIT
|
|
case S_OUCH:
|
|
ouch("machine went out of control");
|
|
break;
|
|
#endif
|
|
}
|
|
if (!ttFRZ(nstat)) {
|
|
discard_char(ls);
|
|
if (!(ls->flags & LEXER) && ls->condcomp) {
|
|
int z = ttSTO(nstat) ? S_ILL : noMOD(nstat);
|
|
|
|
if (cstat == S_NAME || z == S_NAME
|
|
|| ((CMT(cstat) || CMT(z))
|
|
&& (ls->flags & DISCARD_COMMENTS))) {
|
|
outc = 0;
|
|
} else if (z == S_LCHAR || z == S_SLASH
|
|
|| (z == S_SHARP && ls->ltwnl)
|
|
|| (z == S_PCT && ls->ltwnl)
|
|
|| (z == S_BACKSLASH)) {
|
|
outc = c;
|
|
} else if (z == S_PCT2 && ls->ltwnl) {
|
|
outc = -1;
|
|
} else if (z == S_PCT3 && ls->ltwnl) {
|
|
/* we have %:% but this still might
|
|
not be a %:%: */
|
|
outc = -2;
|
|
} else {
|
|
if (outc < 0) {
|
|
put_char(ls, '%');
|
|
put_char(ls, ':');
|
|
if (outc == -2)
|
|
put_char(ls, '%');
|
|
outc = 0;
|
|
} else if (outc) {
|
|
put_char(ls, outc);
|
|
outc = 0;
|
|
}
|
|
put_char(ls, c);
|
|
}
|
|
}
|
|
} else if (outc == '/' && !(ls->flags & LEXER)
|
|
&& ls->condcomp) {
|
|
/* this is a hack: we need to dump a pending slash */
|
|
put_char(ls, outc);
|
|
outc = 0;
|
|
}
|
|
if (ttPUT(nstat)) {
|
|
if (cstat == S_NAME_BS) {
|
|
ucn_in_id = 1;
|
|
wan(ls->ctok->name, ltok, '\\', ls->tknl);
|
|
}
|
|
if ((ls->flags & UTF8_SOURCE) && utf8) {
|
|
unsigned char buf[11];
|
|
int i, j;
|
|
|
|
for (i = 0, j = utf8_to_string(buf, utf8);
|
|
i < j; i ++)
|
|
wan(ls->ctok->name, ltok, buf[i],
|
|
ls->tknl);
|
|
/* if (j > 1) ucn_in_id = 1; */
|
|
} else wan(ls->ctok->name, ltok,
|
|
(unsigned char)c, ls->tknl);
|
|
}
|
|
if (ttSTO(nstat)) {
|
|
if (S_TOKEN(noMOD(nstat))) {
|
|
wan(ls->ctok->name, ltok,
|
|
(unsigned char)0, ls->tknl);
|
|
}
|
|
ls->ctok->type = noMOD(nstat);
|
|
break;
|
|
}
|
|
cstat = noMOD(nstat);
|
|
} while (1);
|
|
if (!(ls->flags & LEXER) && (ls->flags & DISCARD_COMMENTS)
|
|
&& ls->ctok->type == COMMENT) put_char(ls, ' ');
|
|
if (ucn_in_id && ls->ctok->type == NAME)
|
|
canonize_id(ls, ls->ctok->name);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* fills ls->ctok with the next token
|
|
*/
|
|
int next_token(struct lexer_state *ls)
|
|
{
|
|
if (ls->flags & READ_AGAIN) {
|
|
ls->flags &= ~READ_AGAIN;
|
|
if (!(ls->flags & LEXER)) {
|
|
char *c = S_TOKEN(ls->ctok->type) ?
|
|
ls->ctok->name : token_name(ls->ctok);
|
|
if (ls->ctok->type == OPT_NONE) {
|
|
ls->ctok->type = NONE;
|
|
#ifdef SEMPER_FIDELIS
|
|
ls->ctok->name[0] = ' ';
|
|
ls->ctok->name[1] = 0;
|
|
#endif
|
|
put_char(ls, ' ');
|
|
} else if (ls->ctok->type != NAME &&
|
|
!(ls->ltwnl && (ls->ctok->type == SHARP
|
|
|| ls->ctok->type == DIG_SHARP)))
|
|
for (; *c; c ++) put_char(ls, *c);
|
|
}
|
|
return 0;
|
|
}
|
|
return read_token(ls);
|
|
}
|