You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2415 lines
74 KiB

/+
* Copyright (c) 1999-2006 by Digital Mars
* All Rights Reserved
* written by Walter Bright www.digitalmars.com
* License for redistribution is by either the Artistic License in artistic.txt, or the GNU General Public License in gnu.txt.
* See the included readme.txt for details.
* D Language conversion by: J Duncan
+/
/**
* d language lexer
*/
module dparser.Lexer;
import dparser.Root;
import dparser.Tokens;
import dparser.Token;
import dparser.Keyword;
import dparser.Types;
import dparser.Module;
import dparser.Identifier;
import dparser.unialpha;
import dparser.OutBuffer;
//private import std.ctype;
//private import std.string;
//import dwf.core.debugapi;
// Module-level error indicator that inreal() resets and then compares with ERANGE.
// NOTE(review): this is an ordinary D variable declared in this module, not an
// extern(C) binding, so the C conversion routines presumably never write to it -
// confirm whether the range check in inreal() can ever fire.
int errno = 0;
//#if _WIN32 && __DMC__
// from \dm\src\include\setlocal.h
//extern "C" char * __cdecl __locale_decpoint;
// Decimal-point string that inreal() temporarily forces to "." around its
// string-to-float conversions and then restores.
// NOTE(review): declared as a plain module variable here (the extern "C"
// declaration above is commented out) - verify it still affects the C runtime.
char* __locale_decpoint;
//#endif
//const uint LS = 0x2028; // UTF line separator
//const uint PS = 0x2029; // UTF paragraph separator
//extern int isUniAlpha(unsigned u);
//extern int HtmlNamedEntity(unsigned char *p, int length);
/**
* Lexer object
*/
class Lexer
{
// Table shared by all lexers: identifier text -> its unique Identifier (filled by idPool and scan).
static Identifier[char[]] stringtable;
// Scratch buffer shared by all lexers; allocated on first construction and
// reset (offset = 0) by each literal scanner before use.
static OutBuffer stringbuffer;
// NOTE(review): only referenced from commented-out lines in nextToken in the visible code.
static Token * freelist;
Token token; // current token
Module mod; // current module
Loc loc; // for error messages
ubyte *base; // pointer to start of buffer
ubyte *end; // past end of buffer
ubyte *p; // current character
int doDocComment; // collect doc comment information
int anyToken; // !=0 means seen at least one token
int commentToken; // !=0 means comments are TOKcomment's
/**
 * Set up a lexer over a source buffer.
 *
 * Params:
 *  mod           = module being lexed (also stored in the initial Loc)
 *  base          = start of the source buffer
 *  begoffset     = offset from base at which lexing starts
 *  endoffset     = offset from base of the byte past the end of the source
 *  doDocComment  = non-zero to collect documentation comments
 *  commentToken  = non-zero to return comments as TOKcomment tokens
 *
 * If the text starts with "#!", that first line is skipped and the line
 * counter starts at 2.
 */
this(Module mod, ubyte* base, uint begoffset, uint endoffset, int doDocComment, int commentToken)
{
    // The scratch buffer is static; create it the first time any lexer is built.
    if (stringbuffer is null) {
        stringbuffer = new OutBuffer;
    }

    loc = Loc(mod, 1);
    this.base = base;
    this.end = base + endoffset;
    this.p = base + begoffset;
    this.mod = mod;
    this.doDocComment = doDocComment;
    this.commentToken = commentToken;

    // Nothing more to do unless the source opens with a "#!" line.
    if (p[0] != '#' || p[1] != '!') {
        return;
    }

    p += 2;
    for (;;) {
        ubyte ch = *p;

        if (ch == '\n') {
            p++;                    // consume the line terminator
            break;
        }
        if (ch == '\r') {
            p++;
            if (*p == '\n') {       // CR LF counts as one terminator
                p++;
            }
            break;
        }
        if (ch == 0 || ch == 0x1A) {
            break;                  // end of input: leave p on the terminator
        }
        if (ch & 0x80) {
            uint u = decodeUTF();
            if (u == PS || u == LS) {
                break;              // Unicode line/paragraph separator ends the line
            }
        }
        p++;
    }
    loc.linnum = 2;
}
/**
 * Return the unique Identifier for the given text, creating and registering
 * a new TOKidentifier entry in stringtable when the text has not been seen.
 */
static Identifier idPool(in char[] str) {
    Identifier* slot = str in stringtable;
    if (slot != null) {
        return(*slot);
    }

    Identifier fresh = new Identifier(str, TOK.TOKidentifier);
    stringtable[str] = fresh;
    return(fresh);
}
/**
 * One-time lexer initialisation; runs the three setup steps in this order:
 * character map, keyword tokens/identifiers, standard lexer tokens.
 */
static void initKeywords() {
    cmtable_init();                         // character classification map
    dparser.Keyword.initKeywords();         // keyword tokens and their identifiers
    dparser.Token.createLexerTokens();      // remaining standard lexer tokens
}
/**
 * Join two documentation comments.
 *
 * Returns: c2 when c1 is empty, c1 when c2 is empty, otherwise c1 and c2
 * separated by a single newline.
 */
static char[] combineComments(char[] c1, char[] c2) {
    if (!c1.length) {
        return(c2);
    }
    if (!c2.length) {
        return(c1);
    }
    return(c1 ~ "\n" ~ c2);
}
/**
 * Decode the UTF-8 sequence whose lead byte is at p.
 *
 * On success p is advanced to the LAST byte of the sequence (callers do their
 * own final p++), and the decoded code point is returned.
 *
 * On a malformed sequence (bad lead byte, missing or invalid continuation
 * byte, overlong form, surrogate, or value above 0x10FFFF) an error is
 * reported, p is left unchanged, and U+FFFD is returned so that the caller
 * still makes progress with its own p++.
 *
 * Precondition: *p has its high bit set.
 */
uint decodeUTF() {
    const uint REPLACEMENT = 0xFFFD;

    ubyte* s = p;
    ubyte lead = *s;

    assert(lead & 0x80);
    if (!(lead & 0x80)) {
        return(lead);               // plain ASCII (release builds only)
    }

    uint len;       // total number of bytes in the sequence
    uint u;         // code point accumulated so far
    uint smallest;  // smallest code point that legitimately needs len bytes

    if ((lead & 0xE0) == 0xC0) {
        len = 2;
        u = lead & 0x1F;
        smallest = 0x80;
    }
    else if ((lead & 0xF0) == 0xE0) {
        len = 3;
        u = lead & 0x0F;
        smallest = 0x800;
    }
    else if ((lead & 0xF8) == 0xF0) {
        len = 4;
        u = lead & 0x07;
        smallest = 0x10000;
    }
    else {
        // Stray continuation byte or a lead byte that UTF-8 no longer allows.
        error("invalid UTF-8 sequence");
        return(REPLACEMENT);
    }

    for (uint i = 1; i < len; i++) {
        // Check the bound first so nothing past the buffer is ever read.
        if (s + i >= end || (s[i] & 0xC0) != 0x80) {
            error("invalid UTF-8 sequence");
            return(REPLACEMENT);
        }
        u = (u << 6) | (s[i] & 0x3F);
    }

    if (u < smallest || u > 0x10FFFF || (u >= 0xD800 && u <= 0xDFFF)) {
        error("invalid UTF-8 sequence");
        return(REPLACEMENT);
    }

    p = s + (len - 1);              // leave p on the last byte of the sequence
    return(u);
}
/**
 * Report an error at the lexer's current location (loc).
 *
 * The message is printed only when a module is attached and errors are not
 * gagged; in that case an Exception is thrown once global.errors has already
 * reached global.max_errors. Otherwise global.errors is incremented.
 */
void error(...) {
    bit silent = (mod is null) || global.gag;

    if (!silent) {
        writefln(formatLoc(loc, _arguments, _argptr));
        // Stop the blizzard of cascading messages once the limit is reached.
        if (global.errors >= global.max_errors) {
            throw new Exception("too many errors");
        }
    }
    global.errors++;
}
/**
 * Report an error at an explicitly supplied location.
 *
 * Params:
 *  loc = location to print in front of the message (shadows the member loc)
 *
 * The message is printed only when a module is attached and errors are not
 * gagged. Once global.errors has reached global.max_errors an Exception is
 * thrown - the same configurable limit that error() uses, instead of a
 * separate hard-coded count. Otherwise global.errors is incremented.
 */
void errorLoc(Loc loc, ...) {
    if ((mod !is null) && !global.gag) {
        writefln(formatLoc(loc, _arguments, _argptr));
        if (global.errors >= global.max_errors) { // moderate blizzard of cascading messages
            throw new Exception("too many errors");
        }
    }
    global.errors++;
}
/**
 * Advance to the next token and return its kind.
 *
 * If a token has already been scanned ahead (token.next is set, see peek) it
 * is copied over the current token; otherwise a new token is scanned.
 */
TOK nextToken() {
    Token* ahead = token.next;

    if (ahead is null) {
        scan(&token);
    }
    else {
        // Bitwise copy, including the look-ahead token's own next link.
        memcpy(&token, ahead, Token.sizeof);
    }
    return(token.value);
}
/**
 * Return the token that follows ct without consuming it.
 *
 * A token already linked through ct.next is reused; otherwise a new token is
 * allocated, scanned, and linked to ct so later calls (and nextToken) find it.
 */
Token* peek(inout Token ct) {
    if (ct.next) {
        return(ct.next);
    }

    Token* ahead = new Token;
    scan(ahead);
    ahead.next = null;
    ct.next = ahead;
    return(ahead);
}
// Turn next token in buffer into a token.
//
// Scans forward from p, skipping white space (and comments, unless
// commentToken is set), and fills in *t: t.ptr is set to the first byte of the
// token, t.value to its kind, and the value fields (identifier, ustring,
// uns64value, ...) where the token carries one. loc.linnum is bumped for every
// line ending passed. The outer while/switch loops until a `return` delivers a
// token; `continue` restarts the scan after skipped text.
void scan(Token* t) {
// debug writefln("scan token");
// Line number on entry; compared with a comment's starting line when calling getDocComment.
uint lastLine = loc.linnum;
uint linnum;
t.blockComment = null;
t.lineComment = null;
while (true) {
t.ptr = p;
// debug writefln( " p = %d, *p = ", cast(uint)p, cast(char)*p );
switch (*p) {
case 0:
case 0x1a:
t.value = TOK.TOKeof; // end of file
// debug writefln( " EOF" );
return;
case ' ':
case '\t':
case '\v':
case '\f':
p++;
// debug writefln( " whitespace" );
continue; // skip white space
case '\r':
// debug writefln( " cr" );
p++;
if (*p != '\n') { // if CR stands by itself
loc.linnum++;
}
continue; // skip white space
case '\n':
// debug writefln( " nl" );
p++;
loc.linnum++;
continue; // skip white space
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
t.value = number(t);
return;
/*
* #if CSTRINGS
* case '\'':
* t.value = charConstant(t, 0);
* return;
*
* case '"':
* t.value = stringConstant(t,0);
* return;
*
* case 'l':
* case 'L':
* if( p[1] == '\'')
* {
* p++;
* t.value = charConstant(t, 1);
* return;
* }
* else if( p[1] == '"')
* {
* p++;
* t.value = stringConstant(t, 1);
* return;
* }
* #else
*/
case '\'':
// debug writefln( " char" );
t.value = charConstant(t, 0);
return;
case 'r':
// debug writefln( " wysiwyg" );
// r"..." is a wysiwyg string; any other 'r' starts an identifier.
if (p[1] != '"') {
goto case_identifier;
}
p++;
// deliberate fall through: p now sits on the opening quote
case '`':
t.value = wysiwygStringConstant(t, *p);
return;
case 'x':
// debug writefln( " hex string" );
// x"..." is a hex string; any other 'x' starts an identifier.
if (p[1] != '"') {
goto case_identifier;
}
p++;
t.value = hexStringConstant(t);
return;
case '"':
// debug writefln( " string" );
t.value = escapeStringConstant(t, 0);
// debug writefln( t.ustring );
return;
case '\\': // escaped string literal
// debug writefln( " escaped string literal" );
// One or more consecutive \escape sequences form a single string token.
uint c;
stringbuffer.offset = 0;
do {
p++;
c = escapeSequence();
stringbuffer.write(c);
} while (*p == '\\');
// t.len = stringbuffer.offset;
// stringbuffer.write(cast(byte)0);
// NOTE(review): unlike escapeStringConstant this stores toString without .dup;
// if toString returns a view of the shared buffer the text may be overwritten
// by the next literal - confirm against OutBuffer.
t.ustring = stringbuffer.toString;
// memcpy( t.ustring.ptr, stringbuffer.data, stringbuffer.offset );
t.postfix = 0;
t.value = TOK.TOKstring;
return;
case 'l':
case 'L':
// #endif
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q': /*case 'r':*/
case 's':
case 't':
case 'u':
case 'v':
case 'w': /*case 'x':*/
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_':
case_identifier:
{
// debug writefln( " identifier" );
// Consume identifier characters; bytes with the high bit set are decoded and
// accepted when isUniAlpha says so.
ubyte c;
do {
c = *++p;
} while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF())));
// sv = stringtable.update((char *)t.ptr, p - t.ptr);
// Copy the identifier text out of the source buffer, then intern it.
char[] tmp;
tmp.length = p - t.ptr;
memcpy(tmp.ptr, t.ptr, p - t.ptr);
Identifier id;
Identifier * pid = tmp in stringtable;
if (pid) {
id = *pid;
}
if (id is null) {
id = new Identifier(tmp, TOK.TOKidentifier);
stringtable[tmp] = id;
}
t.identifier = id;
// Keywords are stored in the same table with their own token value.
t.value = cast(TOK)id.value;
anyToken = 1;
// if special identifier token
if (*t.ptr == '_') {
static char date[11 + 1];
static char time[8 + 1];
static char timestamp[24 + 1];
if (!date[0]) { // lazy evaluation
//!!
/+
* time_t t;
* char *p;
* .time(&t);
* p = ctime(&t);
* assert(p);
* sprintf(date.ptr, "%.6s %.4s", p + 4, p + 20);
* sprintf(time.ptr, "%.8s", p + 11);
* sprintf(timestamp.ptr, "%.24s", p);
+/
}
if (mod && id is Id.FILE) {
t.value = TOK.TOKstring;
if (loc.filename.length) {
t.ustring = loc.filename;
}
else {
t.ustring = mod.identifier.toChars();
}
goto Llen;
}
else if (mod && id == Id.LINE) {
t.value = TOK.TOKint64v;
t.uns64value = loc.linnum;
}
// NOTE(review): for DATE, TIME and TIMESTAMP the token becomes a string but
// t.ustring is not assigned here (the assignments and the code that would fill
// the static buffers are commented out).
else if (id == Id.DATE) {
t.value = TOK.TOKstring;
//! t.ustring = date;
goto Llen;
}
else if (id == Id.TIME) {
t.value = TOK.TOKstring;
//! t.ustring = time;
goto Llen;
}
else if (id == Id.TIMESTAMP) {
t.value = TOK.TOKstring;
//! t.ustring = timestamp;
Llen:
t.postfix = 0;
// t.len = strlen((char *)t.ustring);
}
}
//printf("t.value = %d\n",t.value);
return;
}
// comments
case '/':
p++;
switch (*p) {
case '=':
p++;
t.value = TOK.TOKdivass;
return;
case '*': // '/*'
p++;
linnum = loc.linnum;
// Outer loop: repeat until a '/' is found that is preceded by '*'.
// Inner loop: advance to the next '/' while counting line endings.
while (true) {
while (true) {
ubyte c = *p;
switch (c) {
case '/':
break;
case '\n':
loc.linnum++;
p++;
continue;
case '\r':
p++;
if (*p != '\n') {
loc.linnum++;
}
continue;
case 0:
case 0x1A:
error("unterminated /* */ comment");
p = end;
t.value = TOK.TOKeof;
return;
default:
if (c & 0x80) {
uint u = decodeUTF();
if (u == PS || u == LS) {
loc.linnum++;
}
}
p++;
continue;
}
break;
}
p++;
// The '*' before this '/' must not be the one that opened the comment ("/*/").
if (p[-2] == '*' && p - 3 != t.ptr) {
break;
}
}
if (commentToken) {
t.value = TOK.TOKcomment;
return;
}
// if /** but not /**/
else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr) {
getDocComment(t, lastLine == linnum); //! ?
}
continue;
case '/': // do // style comments
linnum = loc.linnum;
// Advance to the end of the line; p is left ON the terminator here and stepped
// past it after the loop.
while (1) {
ubyte c = *++p;
switch (c) {
case '\n':
break;
case '\r':
if (p[1] == '\n') {
p++;
}
break;
case 0:
case 0x1a:
if (commentToken) {
p = end;
t.value = TOK.TOKcomment;
return;
}
if (doDocComment && t.ptr[2] == '/') {
getDocComment(t, lastLine == linnum);
}
p = end;
t.value = TOK.TOKeof;
return;
default:
if (c & 0x80) {
uint u = decodeUTF();
if (u == PS || u == LS) {
break;
}
}
continue;
}
break;
}
if (commentToken) {
p++;
loc.linnum++;
t.value = TOK.TOKcomment;
return;
}
if (doDocComment && t.ptr[2] == '/') {
getDocComment(t, lastLine == linnum);
}
p++;
loc.linnum++;
continue;
case '+':
{
// Nesting comment: `nest` counts the currently open "/+" levels.
int nest;
linnum = loc.linnum;
p++;
nest = 1;
while (1) {
ubyte c = *p;
switch (c) {
case '/':
p++;
if (*p == '+') {
p++;
nest++;
}
continue;
case '+':
p++;
if (*p == '/') {
p++;
if (--nest == 0) {
break;
}
}
continue;
case '\r':
p++;
if (*p != '\n') {
loc.linnum++;
}
continue;
case '\n':
loc.linnum++;
p++;
continue;
case 0:
case 0x1A:
error("unterminated /+ +/ comment");
p = end;
t.value = TOK.TOKeof;
return;
default:
if (c & 0x80) {
uint u = decodeUTF();
if (u == PS || u == LS) {
loc.linnum++;
}
}
p++;
continue;
}
break;
}
if (commentToken) {
t.value = TOK.TOKcomment;
return;
}
if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr) {
// if /++ but not /++/
getDocComment(t, lastLine == linnum);
}
continue;
}
default:
break;
}
t.value = TOK.TOKdiv;
return;
case '.':
p++;
if (isdigit(*p)) {
// ".5" style literal: back up so inreal sees the leading dot
p--;
t.value = inreal(t);
}
else if (p[0] == '.') {
if (p[1] == '.') {
p += 2;
t.value = TOK.TOKdotdotdot;
}
else {
p++;
t.value = TOK.TOKslice;
}
}
else {
t.value = TOK.TOKdot;
}
return;
case '&':
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKandass;
}
else if (*p == '&') {
p++;
t.value = TOK.TOKandand;
}
else {
t.value = TOK.TOKand;
}
return;
// |, ||, |=
case '|':
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKorass;
}
else if (*p == '|') {
p++;
t.value = TOK.TOKoror;
}
else {
t.value = TOK.TOKor;
}
return;
case '-':
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKminass;
}
else if (*p == '-') {
p++;
t.value = TOK.TOKminusminus;
}
else {
t.value = TOK.TOKmin;
}
return;
// +, +=, ++
case '+':
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKaddass; // +=
}
else if (*p == '+') {
p++;
t.value = TOK.TOKplusplus; // ++
}
else {
t.value = TOK.TOKadd; // +
}
return;
// <, <=, <<=, <<, <>=, <>
case '<':
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKle; // <=
}
else if (*p == '<') {
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKshlass; // <<=
}
else {
t.value = TOK.TOKshl; // <<
}
}
else if (*p == '>') {
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKleg; // <>=
}
else {
t.value = TOK.TOKlg; // <>
}
}
else {
t.value = TOK.TOKlt; // <
}
return;
// >, >>, >>>, >=, >>=, >>>=
case '>':
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKge; // >=
}
else if (*p == '>') {
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKshrass; // >>=
}
else if (*p == '>') {
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKushrass; // >>>=
}
else {
t.value = TOK.TOKushr; // >>>
}
}
else {
t.value = TOK.TOKshr; // >>
}
}
else {
t.value = TOK.TOKgt; // >
}
return;
case '!':
p++;
if (*p == '=') {
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKnotidentity; // !==
}
else {
t.value = TOK.TOKnotequal; // !=
}
}
else if (*p == '<') {
p++;
if (*p == '>') {
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKunord; // !<>=
}
else {
t.value = TOK.TOKue; // !<>
}
}
else if (*p == '=') {
p++;
t.value = TOK.TOKug; // !<=
}
else {
t.value = TOK.TOKuge; // !<
}
}
else if (*p == '>') {
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKul; // !>=
}
else {
t.value = TOK.TOKule; // !>
}
}
else {
t.value = TOK.TOKnot; // !
}
return;
case '=':
p++;
if (*p == '=') {
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKidentity; // ===
}
else {
t.value = TOK.TOKequal; // ==
}
}
else {
t.value = TOK.TOKassign; // =
}
return;
case '~':
p++;
if (*p == '=') {
p++;
t.value = TOK.TOKcatass; // ~=
}
else {
t.value = TOK.TOKtilde; // ~
}
return;
// SINGLE
case '(': p++; t.value = TOK.TOKlparen; return;
case ')': p++; t.value = TOK.TOKrparen; return;
case '[': p++; t.value = TOK.TOKlbracket; return;
case ']': p++; t.value = TOK.TOKrbracket; return;
case '{': p++; t.value = TOK.TOKlcurly; return;
case '}': p++; t.value = TOK.TOKrcurly; return;
case '?': p++; t.value = TOK.TOKquestion; return;
case ',': p++; t.value = TOK.TOKcomma; return;
case ';': p++; t.value = TOK.TOKsemicolon; return;
case ':': p++; t.value = TOK.TOKcolon; return;
case '$': p++; t.value = TOK.TOKdollar; return;
// DOUBLE
case '*': p++; if (*p == '=') {
p++; t.value = TOK.TOKmulass;
}
else {
t.value = TOK.TOKmul;
} return;
case '%': p++; if (*p == '=') {
p++; t.value = TOK.TOKmodass;
}
else {
t.value = TOK.TOKmod;
} return;
case '^': p++; if (*p == '=') {
p++; t.value = TOK.TOKxorass;
}
else {
t.value = TOK.TOKxor;
} return;
// removed 148 case '~': p++; if( *p == '=' ) { p++; t.value = TOK.TOKcatass; } else t.value = TOK.TOKtilde; return;
case '#':
// '#' introduces a directive handled by Pragma(); scanning then resumes.
p++;
Pragma();
continue;
default:
{
// Anything else: a non-ASCII identifier start, a Unicode line separator, or an
// unsupported character that is reported and skipped.
debug writefln(" default char");
ubyte c = *p;
if (c & 0x80) {
uint u = decodeUTF();
// Check for start of unicode identifier
if (isUniAlpha(u)) {
goto case_identifier;
}
if (u == PS || u == LS) {
loc.linnum++;
p++;
continue;
}
}
if (isprint(c)) {
error("unsupported char '%s'", cast(char)c);
}
else {
error("unsupported char 0x%02x", cast(ubyte)c);
}
p++;
continue;
}
}
}
}
/**
 * Parse one escape sequence. On entry p points at the character that follows
 * the backslash; on return p has been advanced past whatever was consumed.
 *
 * Handles the simple escapes (\' \" \? \\ \a \b \f \n \r \t \v), hex escapes
 * (\x with 2 digits, \u with 4, \U with 8), octal escapes of up to three
 * digits, and the start of a named entity (\&name;).
 *
 * Returns: the value of the escape. At end of input a backslash is returned.
 * For an unrecognised escape an error is reported and the offending character
 * is returned without being consumed.
 */
uint escapeSequence() {
    uint c;
    int n;
    int ndigits;

    c = *p;
    switch (c) {
    case '\'':
    case '"':
    case '?':
    case '\\':
    Lconsume:
        p++;
        break;

    case 'a': c = 7; goto Lconsume;
    case 'b': c = 8; goto Lconsume;
    case 'f': c = 12; goto Lconsume;
    case 'n': c = 10; goto Lconsume;
    case 'r': c = 13; goto Lconsume;
    case 't': c = 9; goto Lconsume;
    case 'v': c = 11; goto Lconsume;

    case 'u':
        ndigits = 4;
        goto Lhex;
    case 'U':
        ndigits = 8;
        goto Lhex;
    case 'x':
        ndigits = 2;
    Lhex:
        p++;
        c = *p;
        if (ishex(c)) {
            uint v;

            n = 0;
            v = 0;
            while (1) {
                if (isdigit(c)) {
                    c -= '0';
                }
                else if (islower(c)) {
                    c -= 'a' - 10;
                }
                else {
                    c -= 'A' - 10;
                }
                v = v * 16 + c;
                c = *++p;
                if (++n == ndigits) {
                    break;
                }
                if (!ishex(c)) {
                    error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
                    break;
                }
            }
            //! if( ndigits != 2 && !utf_isValidDchar(v))
            //! error("invalid UTF character \\U%08x", v);
            c = v;
        }
        else {
            // Pass a character, not the uint, so %s prints the offending
            // character rather than its numeric code.
            error("undefined escape hex sequence \\%s\n", cast(dchar)c);
        }
        break;

    case '&': // named character entity
        // NOTE(review): the entity lookup is not implemented. The loop only
        // walks over the name; c keeps the value '&' and p is left on the
        // terminating ';' (or on the first invalid character).
        for (ubyte *idstart = ++p; 1; p++) {
            switch (*p) {
            case ';':
                //!!!
                // c = HtmlNamedEntity(idstart, p - idstart);
                // if( c == ~0 )
                // {
                //     error("unnamed character entity &%.*s;", p - idstart, idstart);
                //     c = ' ';
                // }
                // p++;
                break;
            default:
                if (isalpha(*p) || (p != idstart + 1 && isdigit(*p))) {
                    continue;
                }
                error("unterminated named entity");
                break;
            }
            break;
        }
        break;

    case 0:
    case 0x1a: // end of file
        c = '\\';
        break;

    default:
        if (isoctal(c)) {
            ubyte v;    // byte-sized on purpose: the result wraps at 8 bits

            n = 0;
            do {
                v = v * 8 + (c - '0');
                c = *++p;
            } while (++n < 3 && isoctal(c));
            c = v;
        }
        else {
            // Same as above: format the character itself, not its code.
            error("undefined escape sequence \\%s\n", cast(dchar)c);
        }
        break;
    }
    return(c);
}
/**************************************
 * Lex a wysiwyg string (r"..." or `...`), in which backslashes are ordinary
 * characters. On entry p points at the opening delimiter.
 *
 * Params:
 *  t  = token receiving the text (ustring) and postfix
 *  tc = the delimiter character that terminates the string
 *
 * Line endings (CR, CR LF, LF) are stored as a single '\n'. Non-ASCII
 * sequences are copied byte for byte. The text is stored as a private copy
 * without a trailing NUL, the same way escapeStringConstant stores it.
 *
 * Returns: TOKstring, also for an unterminated string (after reporting it).
 */
TOK wysiwygStringConstant(Token *t, int tc) {
    uint c;
    Loc start = loc;

    p++;
    stringbuffer.offset = 0;
    while (1) {
        c = *p++;
        switch (c) {
        case '\n':
            loc.linnum++;
            break;

        case '\r':
            if (*p == '\n') {
                continue; // ignore
            }
            c = '\n'; // treat EndOfLine as \n character
            loc.linnum++;
            break;

        case 0:
        case 0x1a:
            // Step back onto the terminator so the next scan sees end of file
            // instead of reading past the end of the buffer.
            p--;
            error("unterminated string constant starting at %s", start.toChars());
            t.ustring = "";
            t.postfix = 0;
            return(TOK.TOKstring);

        case '"':
        case '`':
            if (c == tc) {
                // Copy out of the shared scratch buffer: it is reused by the
                // next literal.
                t.ustring = stringbuffer.toString().dup;
                stringPostfix(t);
                return(TOK.TOKstring);
            }
            break;

        default:
            if (c & 0x80) {
                ubyte* seq = p - 1;     // lead byte of the sequence

                p--;
                uint u = decodeUTF();
                p++;
                if (u == PS || u == LS) {
                    loc.linnum++;
                }
                // Keep the source bytes exactly as written.
                for (; seq < p; seq++) {
                    stringbuffer.write(cast(char)*seq);
                }
                continue;
            }
            break;
        }
        // One byte per character; writing the uint itself would not be one byte.
        stringbuffer.write(cast(char)c);
    }
}
/**************************************
* Lex hex strings:
* x"0A ae 34FE BD"
*/
/**
 * Lex a hex string. On entry p points at the opening quote (the 'x' has
 * already been skipped by the caller).
 *
 * Each pair of hex digits produces one byte of the result; white space and
 * line endings between digits are ignored. An odd number of digits and any
 * non-hex character are reported as errors. The text is stored as a private
 * copy of the scratch buffer, the same way escapeStringConstant stores it.
 *
 * Returns: TOKstring, also for an unterminated string (after reporting it).
 */
TOK hexStringConstant(Token *t) {
    uint c;
    Loc start = loc;
    uint n = 0;     // number of hex digits seen so far
    uint v;         // byte being assembled from two digits

    p++;
    stringbuffer.offset = 0;
    while (1) {
        c = *p++;
        switch (c) {
        case ' ':
        case '\t':
        case '\v':
        case '\f':
            continue; // skip white space

        case '\r':
            if (*p == '\n') {
                continue; // ignore
            }
            // Treat isolated '\r' as if it were a '\n'
        case '\n':
            loc.linnum++;
            continue;

        case 0:
        case 0x1a:
            // Step back onto the terminator so the next scan sees end of file
            // instead of reading past the end of the buffer.
            p--;
            error("unterminated string constant starting at %s", start.toChars());
            t.ustring = "";
            t.postfix = 0;
            return(TOK.TOKstring);

        case '"':
            if (n & 1) {
                error("odd number (%d) of hex characters in hex string", n);
                stringbuffer.write(cast(char)v);
            }
            // Copy out of the shared scratch buffer: it is reused by the next
            // literal.
            t.ustring = stringbuffer.toString().dup;
            stringPostfix(t);
            return(TOK.TOKstring);

        default:
            if (c >= '0' && c <= '9') {
                c -= '0';
            }
            else if (c >= 'a' && c <= 'f') {
                c -= 'a' - 10;
            }
            else if (c >= 'A' && c <= 'F') {
                c -= 'A' - 10;
            }
            else if (c & 0x80) {
                p--;
                uint u = decodeUTF();
                p++;
                if (u == PS || u == LS) {
                    loc.linnum++;
                }
                else {
                    error("non-hex character \\u%x", u);
                }
            }
            else {
                // Format the character itself; a uint with %s prints a number.
                error("non-hex character '%s'", cast(dchar)c);
            }
            if (n & 1) {
                v = (v << 4) | c;
                // Exactly one byte per digit pair.
                stringbuffer.write(cast(char)v);
            }
            else {
                v = c;
            }
            n++;
            break;
        }
    }
}
/**************************************
 * Lex a double-quoted string with escape sequences. On entry p points at the
 * opening quote.
 *
 * Params:
 *  t    = token receiving the text (ustring) and postfix
 *  wide = not used by this implementation
 *
 * Line endings (CR, CR LF, LF, and the Unicode line/paragraph separators) are
 * stored as '\n'. Code points produced by \u, \U and \& escapes, and decoded
 * non-ASCII source characters, are stored UTF-8 encoded. All other escapes
 * (including \x and octal) are stored as a single byte.
 *
 * Returns: TOKstring, also for an unterminated string (after reporting it).
 */
TOK escapeStringConstant(Token *t, int wide) {
    uint c;
    Loc start = loc;

    // Append code point u to the scratch buffer as UTF-8, one byte at a time.
    void putUTF8(uint u) {
        if (u < 0x80) {
            stringbuffer.write(cast(char)u);
        }
        else if (u < 0x800) {
            stringbuffer.write(cast(char)(0xC0 | (u >> 6)));
            stringbuffer.write(cast(char)(0x80 | (u & 0x3F)));
        }
        else if (u < 0x10000) {
            stringbuffer.write(cast(char)(0xE0 | (u >> 12)));
            stringbuffer.write(cast(char)(0x80 | ((u >> 6) & 0x3F)));
            stringbuffer.write(cast(char)(0x80 | (u & 0x3F)));
        }
        else {
            stringbuffer.write(cast(char)(0xF0 | ((u >> 18) & 0x07)));
            stringbuffer.write(cast(char)(0x80 | ((u >> 12) & 0x3F)));
            stringbuffer.write(cast(char)(0x80 | ((u >> 6) & 0x3F)));
            stringbuffer.write(cast(char)(0x80 | (u & 0x3F)));
        }
    }

    p++;
    stringbuffer.offset = 0;
    // debug writefln( "escape string constant: %s", std.string.toString( cast(char*)p ) );
    while (1) {
        c = *p++;
        switch (c) {
        case '\\':
            switch (*p) {
            case 'u':
            case 'U':
            case '&':
                // These escapes denote a code point, which may need several bytes.
                c = escapeSequence();
                putUTF8(c);
                continue;
            default:
                // Everything else denotes a single byte, written below.
                c = escapeSequence();
                break;
            }
            break;

        case '\n':
            loc.linnum++;
            break;

        case '\r':
            if (*p == '\n') {
                continue; // ignore
            }
            c = '\n'; // treat EndOfLine as \n character
            loc.linnum++;
            break;

        case '"':
            // Copy out of the shared scratch buffer: it is reused by the next
            // literal.
            t.ustring = stringbuffer.toString().dup;
            stringPostfix(t);
            return(TOK.TOKstring);

        case 0:
        case 0x1a:
            p--;    // stay on the terminator so the next scan sees end of file
            error("unterminated string constant starting at %s", start.toChars());
            t.ustring = "";
            t.postfix = 0;
            return(TOK.TOKstring);

        default:
            if (c & 0x80) {
                p--;
                c = decodeUTF();
                if (c == LS || c == PS) {
                    c = '\n';
                    loc.linnum++;
                }
                p++;
                // Re-encode instead of truncating the code point to one byte.
                putUTF8(c);
                continue;
            }
            break;
        }
        stringbuffer.write(cast(char)c);
    }
}
/**************************************
 * Lex a character literal. On entry p points at the opening quote.
 *
 * Params:
 *  t    = token whose uns64value receives the character value
 *  wide = not used by this implementation
 *
 * Returns: TOKcharv by default; TOKwcharv for \u escapes and for decoded
 * non-ASCII characters below 0xd800 or in 0xe000 .. 0xfffd; TOKdcharv for
 * \U and \& escapes and for any other decoded non-ASCII character. The same
 * kind is returned after an "unterminated character constant" error.
 */
TOK charConstant(Token *t, int wide) {
    TOK kind = TOK.TOKcharv;
    int unterminated = 0;

    //printf("Lexer.charConstant\n");
    p++;
    uint ch = *p++;

    if (ch == '\\') {
        // The character after the backslash selects the literal's width.
        ubyte selector = *p;

        t.uns64value = escapeSequence();
        if (selector == 'u') {
            kind = TOK.TOKwcharv;
        }
        else if (selector == 'U' || selector == '&') {
            kind = TOK.TOKdcharv;
        }
    }
    else if (ch == '\n') {
        loc.linnum++;
        unterminated = 1;
    }
    else if (ch == '\r' || ch == 0 || ch == 0x1a || ch == '\'') {
        unterminated = 1;
    }
    else {
        if (ch & 0x80) {
            p--;
            ch = decodeUTF();
            p++;
            if (ch == LS || ch == PS) {
                // Unicode line break inside the quotes: same as '\n'.
                loc.linnum++;
                unterminated = 1;
            }
            else if (ch < 0xd800 || (ch >= 0xe000 && ch < 0xfffe)) {
                kind = TOK.TOKwcharv;
            }
            else {
                kind = TOK.TOKdcharv;
            }
        }
        if (!unterminated) {
            t.uns64value = ch;
        }
    }

    // Either the body was invalid or the closing quote is missing.
    if (unterminated || *p != '\'') {
        error("unterminated character constant");
        return(kind);
    }
    p++;
    return(kind);
}
/**
 * Read the optional postfix of a string literal.
 *
 * If the character at p is 'c', 'w' or 'd' it is stored in t.postfix and
 * consumed; otherwise t.postfix is set to 0 and p is left alone.
 */
void stringPostfix(Token *t) {
    ubyte suffix = *p;

    if (suffix == 'c' || suffix == 'w' || suffix == 'd') {
        t.postfix = suffix;
        p++;
    }
    else {
        t.postfix = 0;
    }
}
/***************************************
* Read \u or \U unicode sequence
* Input:
* u 'u' or 'U'
*/
/*
* uint Wchar(uint u)
* {
* uint value;
* uint n;
* ubyte c;
* uint nchars;
*
* nchars = (u == 'U') ? 8 : 4;
* value = 0;
* for (n = 0; 1; n++)
* {
* ++p;
* if( n == nchars)
* break;
* c = *p;
* if( !ishex(c))
* {
* error("\\%s sequence must be followed by %d hex characters", u, nchars);
* break;
* }
* if( isdigit(c))
* c -= '0';
* else if( islower(c))
* c -= 'a' - 10;
* else
* c -= 'A' - 10;
* value <<= 4;
* value |= c;
* }
* return value;
* }
*/
/**************************************
* Read in a number.
* If it's an integer, store it in tok.TKutok.Vlong.
* integers can be decimal, octal or hex
* Handle the suffixes U, UL, LU, L, etc.
* If it's double, store it in tok.TKutok.Vdouble.
* Returns:
* TKnum
* TKdouble,...
*/
// Lex a numeric literal starting at p.
//
// Phase 1 runs a state machine over the source, copying the digits (without
// embedded '_') into stringbuffer and deciding the radix from the prefix. As
// soon as the text turns out to be a floating literal, p is rewound to the
// start and inreal() takes over.
// Phase 2 converts the collected text to an integer, phase 3 consumes the
// u/U/l/L suffixes, and phase 4 picks the token kind from the suffix flags and
// the magnitude of the value. The value is stored in t.uns64value.
TOK number(Token *t) {
//debug writefln("Lexer.number()");
// We use a state machine to collect numbers
enum STATE {
STATE_initial,
STATE_0,
STATE_decimal,
STATE_octal,
STATE_octale,
STATE_hex,
STATE_binary,
STATE_hex0,
STATE_binary0,
STATE_hexh,
STATE_error
}
enum FLAGS {
FLAGS_decimal = 1, // decimal
FLAGS_unsigned = 2, // u or U suffix
FLAGS_long = 4, // l or L suffix
}
FLAGS flags = FLAGS.FLAGS_decimal;
int i;
TOK result;
int base;
stringbuffer.offset = 0;
// stringbuffer.data = null;
STATE state = STATE.STATE_initial;
// Remembered so the literal can be rescanned from its first character as a real.
ubyte * start = p;
// NOTE(review): this nested helper is not called anywhere in the function;
// the _Real label below does the same thing.
TOK _isreal() {
p = start;
return(inreal(t));
}
// Phase 1: each iteration examines *p; unless a branch uses `continue` or
// `goto`, the character is appended to stringbuffer and p advances (bottom of loop).
while (true) {
char c = cast(char)*p;
switch (state) {
case STATE.STATE_initial: // opening state
if (c == '0') {
state = STATE.STATE_0;
}
else {
state = STATE.STATE_decimal;
}
break;
case STATE.STATE_0:
// A leading 0 means the literal is not treated as decimal when typing it.
flags = cast(FLAGS)(flags & ~FLAGS.FLAGS_decimal);
switch (c) {
// #if ZEROH
// case 'H': // 0h
// case 'h':
// goto hexh;
// #endif
case 'X':
case 'x':
state = STATE.STATE_hex0;
break;
case '.':
if (p[1] == '.') { // .. is a separate token
goto done;
}
// deliberate fall through: "0." starts a real
case 'i':
case 'f':
case 'F':
goto _Real;
// #if ZEROH
// case 'E':
// case 'e':
// goto case_hex;
// #endif
case 'B':
case 'b':
state = STATE.STATE_binary0;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
state = STATE.STATE_octal;
break;
// #if ZEROH
// case '8': case '9': case 'A':
// case 'C': case 'D': case 'F':
// case 'a': case 'c': case 'd': case 'f':
// case_hex:
// state = STATE.STATE_hexh;
// break;
// #endif
case '_':
state = STATE.STATE_octal;
p++;
continue;
default:
goto done;
}
break;
case STATE.STATE_decimal: // reading decimal number
// if its not a digit - decimal complete or not a decimal
if (!isdigit(c)) {
// debug writefln( "\tnon-digit( %s )", c );
// #if ZEROH
// if( ishex(c) || c == 'H' || c == 'h' )
// goto hexh;
// #endif
// ignore embedded _
if (c == '_') {
p++;
continue;
}
// check decimal point - make real
if (c == '.' && p[1] != '.') {
goto _Real;
}
// check for imaginary/float suffix or exponent marker - make real
if (c == 'i' || c == 'f' || c == 'F' || c == 'e' || c == 'E') {
_Real: // It's a real number. Back up and rescan as a real
p = start;
return(inreal(t));
}
goto done;
}
break;
case STATE.STATE_hex0: // reading hex number
case STATE.STATE_hex:
if (!ishex(c)) {
if (c == '_') { // ignore embedded _
p++;
continue;
}
if (c == '.' && p[1] != '.') {
goto _Real;
}
if (c == 'P' || c == 'p' || c == 'i') {
goto _Real;
}
// Still in hex0 means no digit followed the 0x prefix.
if (state == STATE.STATE_hex0) {
error("Hex digit expected, not '%s'", c);
}
goto done;
}
state = STATE.STATE_hex;
break;
// #if ZEROH
// hexh:
// state = STATE.STATE_hexh;
//
// case STATE.STATE_hexh: // parse numbers like 0FFh
// if( !ishex(c))
// {
// if( c == 'H' || c == 'h')
// {
// p++;
// base = 16;
// goto done;
// }
// else
// {
// // Check for something like 1E3 or 0E24
// if( memchr(stringbuffer.data.ptr, 'E', stringbuffer.offset) || memchr( stringbuffer.data.ptr, 'e', stringbuffer.offset))
// goto _Real;
// error("Hex digit expected, not '%s'", c);
// goto done;
// }
// }
// break;
// #endif
case STATE.STATE_octal: // reading octal number
case STATE.STATE_octale: // reading octal number with non-octal digits
if (!isoctal(c)) {
// #if ZEROH
// if( ishex(c) || c == 'H' || c == 'h' )
// goto hexh;
// #endif
if (c == '_') { // ignore embedded _
p++;
continue;
}
if (c == '.' && p[1] != '.') {
goto _Real;
}
if (c == 'i') {
goto _Real;
}
// An 8 or 9 is remembered (not reported yet) because the text may still turn
// out to be a real; the error is issued after the loop.
if (isdigit(c)) {
state = STATE.STATE_octale;
}
else {
goto done;
}
}
break;
case STATE.STATE_binary0: // starting binary number
case STATE.STATE_binary: // reading binary number
if (c != '0' && c != '1') {
// #if ZEROH
// if( ishex(c) || c == 'H' || c == 'h' )
// goto hexh;
// #endif
if (c == '_') { // ignore embedded _
p++;
continue;
}
if (state == STATE.STATE_binary0) {
error("binary digit expected");
state = STATE.STATE_error;
break;
}
else {
goto done;
}
}
state = STATE.STATE_binary;
break;
case STATE.STATE_error: // for error recovery
if (!isdigit(c)) { // scan until non-digit
goto done;
}
break;
default:
assert(0);
}
stringbuffer.write(cast(char)c);
p++;
}
done:
stringbuffer.write(cast(char)0); // terminate string
// debug writefln( "\tdigit complete( %s )", stringbuffer.toString );
if (state == STATE.STATE_octale) {
error("Octal digit expected");
}
// Phase 2: convert the collected text.
uinteger_t n; // unsigned >=64 bit integer type
// offset == 2 means one digit plus the terminator: take the fast path.
if (stringbuffer.offset == 2 && (state == STATE.STATE_decimal || state == STATE.STATE_0)) {
n = stringbuffer.data[0] - '0';
}
else {
// Convert string to integer
// NOTE: this local p walks the collected text and shadows the member p
// only inside this else block.
char* p = cast(char*)stringbuffer.data.ptr;
int r = 10;
int d;
if (*p == '0') {
if (p[1] == 'x' || p[1] == 'X') {
// "0x#"
p += 2;
r = 16;
}
else if (p[1] == 'b' || p[1] == 'B') {
// "0b#" - binary
p += 2;
r = 2;
}
else if (isdigit(p[1])) {
p += 1;
r = 8;
}
}
n = 0;
while (true) {
if (*p >= '0' && *p <= '9') {
d = *p - '0';
}
else if (*p >= 'a' && *p <= 'z') {
d = *p - 'a' + 10;
}
else if (*p >= 'A' && *p <= 'Z') {
d = *p - 'A' + 10;
}
else {
break;
}
if (d >= r) {
break;
}
// Wrap-around check: the new value must not be smaller than the old one.
if (n * r + d < n) {
error("integer overflow");
break;
}
n = n * r + d;
p++;
}
// if n needs more than 64 bits
if (n.sizeof > 8 && n > 0xffffffffffffffffL) {
error("integer overflow");
}
}
// Phase 3 (p is the source pointer again from here on).
// Parse trailing 'u', 'U', 'l' or 'L' in any combination
while (true) {
ubyte f;
switch (*p) {
case 'U':
case 'u':
f = FLAGS.FLAGS_unsigned;
goto L1;
case 'L':
case 'l':
f = FLAGS.FLAGS_long;
L1:
p++;
// The same suffix given twice is an error.
if (flags & f) {
error("unrecognized token");
}
flags = cast(FLAGS)(flags | f);
continue;
default:
break;
}
break;
}
// Phase 4: choose the token kind from the suffix flags and the value's magnitude.
switch (flags) {
case 0:
/* Octal or Hexadecimal constant.
* First that fits: int, uint, long, ulong
*/
if (n & 0x8000000000000000L) {
result = TOK.TOKuns64v;
}
else if (n & 0xffffffff00000000L) {
result = TOK.TOKint64v;
}
else if (n & 0x80000000) {
result = TOK.TOKuns32v;
}
else {
result = TOK.TOKint32v;
}
break;
case FLAGS.FLAGS_decimal:
/* First that fits: int, long, long long
*/
if (n & 0x8000000000000000L) {
error("signed integer overflow");
result = TOK.TOKuns64v;
}
else if (n & 0xffffffff80000000L) {
result = TOK.TOKint64v;
}
else {
result = TOK.TOKint32v;
}
break;
case FLAGS.FLAGS_unsigned:
case FLAGS.FLAGS_decimal | FLAGS.FLAGS_unsigned:
/* First that fits: uint, ulong
*/
if (n & 0xffffffff00000000L) {
result = TOK.TOKuns64v;
}
else {
result = TOK.TOKuns32v;
}
break;
case FLAGS.FLAGS_decimal | FLAGS.FLAGS_long:
if (n & 0x8000000000000000L) {
error("signed integer overflow");
result = TOK.TOKuns64v;
}
else {
result = TOK.TOKint64v;
}
break;
case FLAGS.FLAGS_long:
if (n & 0x8000000000000000L) {
result = TOK.TOKuns64v;
}
else {
result = TOK.TOKint64v;
}
break;
case FLAGS.FLAGS_unsigned | FLAGS.FLAGS_long:
case FLAGS.FLAGS_decimal | FLAGS.FLAGS_unsigned | FLAGS.FLAGS_long:
result = TOK.TOKuns64v;
break;
default:
debug writefln("%x", flags);
assert(0);
}
t.uns64value = n;
return(result);
}
/**************************************
* Read in characters, converting them to real.
* Bugs:
* Exponent overflow not detected.
* Too much requested precision is not detected.
*/
// Lex a floating-point literal starting at p and store its value in
// t.float80value.
//
// A small state machine (dblstate) copies the literal's characters into
// stringbuffer, dropping embedded '_'. The collected text is then converted
// with strtold. A trailing f/F or l/L selects the 32- or 80-bit token kind
// (64-bit by default) and a following i/I turns it into the matching
// imaginary kind.
TOK inreal(Token *t) {
int dblstate;
uint c;
char hex; // is this a hexadecimal-floating-constant?
TOK result;
//printf("Lexer.inreal()\n");
stringbuffer.offset = 0;
dblstate = 0;
hex = 0;
// Outer loop: fetch one character. Inner loop: run the state machine on it;
// `continue` inside re-examines the SAME character in the next state, `break`
// accepts it (it is then appended to stringbuffer).
Lnext:
while (1) {
// Get next char from input
c = *p++;
//printf("dblstate = %d, c = '%s'\n", dblstate, c);
while (1) {
switch (dblstate) {
case 0: // opening state
if (c == '0') {
dblstate = 9;
}
else if (c == '.') {
dblstate = 3;
}
else {
dblstate = 1;
}
break;
case 9:
// character right after a leading 0: an x/X makes this a hex float
dblstate = 1;
if (c == 'X' || c == 'x') {
hex++;
break;
}
// deliberate fall through
case 1: // digits to left of .
case 3: // digits to right of .
case 7: // continuing exponent digits
if (!isdigit(c) && !(hex && isxdigit(c))) {
if (c == '_') {
goto Lnext; // ignore embedded '_'
}
// End of this run of digits: move to the following state (2, 4 or 8)
// and look at the same character again.
dblstate++;
continue;
}
break;
case 2: // no more digits to left of .
if (c == '.') {
dblstate++;
break;
}
// deliberate fall through
case 4: // no more digits to right of .
if ((c == 'E' || c == 'e') || hex && (c == 'P' || c == 'p')) {
dblstate = 5;
hex = 0; // exponent is always decimal
break;
}
if (hex) {
error("binary-exponent-part required");
}
goto done;
case 5: // looking immediately to right of E
dblstate++;
if (c == '-' || c == '+') {
break;
}
// deliberate fall through
case 6: // 1st exponent digit expected
if (!isdigit(c)) {
error("exponent expected");
}
dblstate++;
break;
case 8: // past end of exponent digits
goto done;
}
break;
}
// NOTE(review): c is a uint here, whereas number() writes cast(char)c -
// confirm that OutBuffer.write stores a single byte for this argument type.
stringbuffer.write(c);
}
done:
// The character that ended the literal was already consumed; give it back.
p--;
stringbuffer.write(cast(byte)0);
// #if _WIN32 && __DMC__
char *save = __locale_decpoint;
__locale_decpoint = ".";
// #endif
t.float80value = strtold(cast(char *)stringbuffer.data.ptr, null);
// errno is cleared after the strtold call, so only the narrower conversion
// below can be seen by the ERANGE test at the end.
// NOTE(review): this file declares its own module-level `int errno`; verify
// that the C conversion routines actually update the variable tested here.
errno = 0;
switch (*p) {
case 'F':
case 'f':
// result discarded: the call is made for its range check only
strtof(cast(char *)stringbuffer.data.ptr, null);
result = TOK.TOKfloat32v;
p++;
break;
default:
// result discarded: the call is made for its range check only
strtod(cast(char *)stringbuffer.data.ptr, null);
result = TOK.TOKfloat64v;
break;
case 'L':
case 'l':
result = TOK.TOKfloat80v;
p++;
break;
}
if (*p == 'i' || *p == 'I') {
p++;
switch (result) {
case TOK.TOKfloat32v:
result = TOK.TOKimaginary32v;
break;
case TOK.TOKfloat64v:
result = TOK.TOKimaginary64v;
break;
case TOK.TOKfloat80v:
result = TOK.TOKimaginary80v;
break;
}
}
// #if _WIN32 && __DMC__
__locale_decpoint = save;
// #endif
if (errno == ERANGE) {
error("number is not representable");
}
return(result);
}
/*********************************************
* Do pragma.
* Currently, the only pragma supported is:
* #line linnum [filespec]
*/
/**
 * Handle the text after a '#'. The only form accepted is
 *
 *     #line linnum [filespec]
 *
 * where filespec is either a double-quoted file name or __FILE__. On success
 * the lexer's line number becomes linnum - 1 (the line ending that follows is
 * still to be counted by the caller) and, when a non-empty filespec was
 * given, the lexer's file name is replaced. Anything else is reported through
 * errorLoc at the location of the directive.
 */
void Pragma() {
    Token tok;
    int linnum;
    char[] filespec;
    Loc loc = this.loc;     // where the directive started, for the error message

    scan(&tok);
    if (tok.value != TOK.TOKidentifier || tok.identifier != Id.line) {
        goto Lerr;
    }

    scan(&tok);
    if (tok.value == TOK.TOKint32v || tok.value == TOK.TOKint64v) {
        linnum = tok.uns64value - 1;
    }
    else {
        goto Lerr;
    }

    while (1) {
        switch (*p) {
        case 0:
        case 0x1a:
        case '\n':
        Lnewline:
            // End of the directive: p is left on the terminator.
            this.loc.linnum = linnum;
            if (filespec.length) {
                this.loc.filename = filespec;
            }
            return;

        case '\r':
            p++;
            if (*p != '\n') {
                p--;
                goto Lnewline;
            }
            continue;

        case ' ':
        case '\t':
        case '\v':
        case '\f':
            p++;
            continue; // skip white space

        case '_':
            if (mod && memcmp(p, cast(char*)"__FILE__", 8) == 0) {
                p += 8;
                // Same choice the scanner makes for a __FILE__ token: the
                // current file name, or the module identifier when none is set.
                if (loc.filename.length) {
                    filespec = loc.filename;
                }
                else {
                    filespec = mod.identifier.toChars();
                }
                continue;
            }
            // Any other text starting with '_' is not a valid filespec.
            // Looping again here would never advance p.
            goto Lerr;

        case '"':
            if (filespec) {
                goto Lerr;
            }
            stringbuffer.offset = 0;
            p++;
            while (1) {
                uint c;

                c = *p;
                switch (c) {
                case '\n':
                case '\r':
                case 0:
                case 0x1a:
                    goto Lerr;

                case '"':
                    // NOTE(review): the terminating 0 written here becomes part
                    // of the duplicated text if toString covers the whole
                    // buffer - confirm that is intended for file names.
                    stringbuffer.write(cast(byte)0);
                    // filespec = mem.strdup((char *)stringbuffer.data);
                    filespec = stringbuffer.toString.dup;
                    p++;
                    break;

                default:
                    if (c & 0x80) {
                        uint u = decodeUTF();
                        if (u == PS || u == LS) {
                            goto Lerr;
                        }
                    }
                    stringbuffer.write(c);
                    p++;
                    continue;
                }
                break;
            }
            continue;

        default:
            if (*p & 0x80) {
                uint u = decodeUTF();
                if (u == PS || u == LS) {
                    goto Lnewline;
                }
            }
            goto Lerr;
        }
    }

Lerr:
    errorLoc(loc, "#line integer [\"filespec\"]\\n expected");
}
/***************************************************
* Parse doc comment embedded between t.ptr and p.
* Remove trailing blanks and tabs from lines.
* Replace all newlines with \n.
* Remove leading comment character from each line.
* Decide if it's a lineComment or a blockComment.
* Append to previous one for this token.
*/
void getDocComment(Token *t, uint lineComment) {
    // Extract the text of the doc comment lying between t.ptr and p, clean
    // it up, and attach it to *t:
    //   - the opening (and, for block comments, closing) run of comment
    //     characters is dropped,
    //   - a leading comment character on each continuation line is dropped,
    //   - blanks and tabs before a line break are dropped,
    //   - every line break (CR, CR LF, LF, LS, PS) becomes a single '\n',
    //   - the result always ends in a line break.
    // The text is stored in t.lineComment when lineComment is non-zero and a
    // token has already been seen, otherwise in t.blockComment; it is
    // combined with any comment already stored there.
    auto OutBuffer text = new OutBuffer;
    ubyte marker = t.ptr[2];      // the comment character: '*', '+' or '/'
    ubyte *cur = t.ptr + 3;       // first byte after the three-byte opener
    ubyte *stop = p;              // one past the last byte to examine
    int atLineStart = 0;

    // Block comments end in a two-byte closer that is not part of the text.
    if (marker == '*' || marker == '+') {
        stop -= 2;
    }

    // Step over the opening row of ****'s, ++++'s or ////'s.
    while (cur < stop && *cur == marker) {
        cur++;
    }

    // Pull the end back over a closing row of ****'s or ++++'s.
    if (marker != '/') {
        while (cur < stop && stop[-1] == marker) {
            stop--;
        }
    }

    for (; cur < stop; cur++) {
        ubyte ch = *cur;
        int dropChar = 0;   // set when ch is a leading comment character
        int lineBreak = 0;  // set when ch stands for a line break

        if (ch == '*' || ch == '+') {
            if (atLineStart && ch == marker) {
                atLineStart = 0;
                dropChar = 1;
            }
        }
        else if (ch == ' ' || ch == '\t') {
            // blanks are copied through and leave atLineStart untouched
        }
        else if (ch == '\r') {
            if (cur[1] == '\n') {
                continue; // the following '\n' will produce the line break
            }
            lineBreak = 1;
        }
        else if (ch == '\n') {
            lineBreak = 1;
        }
        else if (ch == 226 && cur[1] == 128 && (cur[2] == 168 || cur[2] == 169)) {
            // UTF-8 encoding of LS or PS: consume all three bytes
            cur += 2;
            lineBreak = 1;
        }
        else {
            atLineStart = 0;
        }

        if (lineBreak) {
            ch = '\n'; // replace all newlines with \n
            atLineStart = 1;
        }

        // Both a stripped comment character and a line break discard the
        // blanks and tabs written immediately before them.
        if (dropChar || lineBreak) {
            for (; text.offset; text.offset--) {
                if (text.data[text.offset - 1] != ' ' && text.data[text.offset - 1] != '\t') {
                    break;
                }
            }
        }

        if (dropChar) {
            continue;
        }
        text.write(ch);
    }

    // Always end with a newline
    if (text.offset == 0 || text.data[text.offset - 1] != '\n') {
        text.writenl();
    }

    // It's a line comment if the start of the doc comment comes
    // after other non-whitespace on the same line.
    int toLine = (lineComment && anyToken);
    char[] fresh = text.toString().dup;
    char[] previous = toLine ? t.lineComment : t.blockComment;

    // Combine with previous doc comment, if any
    char[] merged = previous.length ? combineComments(previous, fresh) : fresh;

    if (toLine) {
        t.lineComment = merged;
    }
    else {
        t.blockComment = merged;
    }
}
}
// Character classification map: one entry per byte value, each entry a
// bitwise OR of the CM* flags below. Filled in by cmtable_init().
static ubyte[256] cmtable;

// Flag bits stored in cmtable entries.
const int CMoctal  = 1 << 0; // octal digit
const int CMhex    = 1 << 1; // hexadecimal digit
const int CMidchar = 1 << 2; // alphanumeric or '_'
ubyte isoctal(ubyte c) {
    // Non-zero when the cmtable entry for c has the CMoctal bit set.
    ubyte flags = cmtable[c];
    return(flags & CMoctal);
}
ubyte ishex(ubyte c) {
    // Non-zero when the cmtable entry for c has the CMhex bit set.
    ubyte flags = cmtable[c];
    return(flags & CMhex);
}
ubyte isidchar(ubyte c) {
    // Non-zero when the cmtable entry for c has the CMidchar bit set.
    ubyte flags = cmtable[c];
    return(flags & CMidchar);
}
static void cmtable_init() {
    // Populate cmtable: for every byte value, OR in the CM* flags that
    // apply to it. Bits already present in an entry are left alone.
    uint c = 0;
    while (c < cmtable.length) {
        ubyte mask = 0;

        if (c >= '0' && c <= '7') {
            mask |= CMoctal;
        }
        if (isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
            mask |= CMhex;
        }
        if (isalnum(c) || c == '_') {
            mask |= CMidchar;
        }

        cmtable[c] |= mask;
        c++;
    }
}
/+
* struct StringValue
* {
* union
* {
* int intvalue;
* void *ptrvalue;
* dchar *string;
* }
*
* char[] lstring;
* }
* #define CASE_BASIC_TYPES
* case TOKwchar: case TOKdchar:
* case TOKbit: case TOKbool: case TOKchar:
* case TOKint8: case TOKuns8:
* case TOKint16: case TOKuns16:
* case TOKint32: case TOKuns32:
* case TOKint64: case TOKuns64:
* case TOKfloat32: case TOKfloat64: case TOKfloat80:
* case TOKimaginary32: case TOKimaginary64: case TOKimaginary80:
* case TOKcomplex32: case TOKcomplex64: case TOKcomplex80:
* case TOKvoid:
*
* #define CASE_BASIC_TYPES_X(t) \
* case TOKvoid: t = Type::tvoid; goto LabelX; \
* case TOKint8: t = Type::tint8; goto LabelX; \
* case TOKuns8: t = Type::tuns8; goto LabelX; \
* case TOKint16: t = Type::tint16; goto LabelX; \
* case TOKuns16: t = Type::tuns16; goto LabelX; \
* case TOKint32: t = Type::tint32; goto LabelX; \
* case TOKuns32: t = Type::tuns32; goto LabelX; \
* case TOKint64: t = Type::tint64; goto LabelX; \
* case TOKuns64: t = Type::tuns64; goto LabelX; \
* case TOKfloat32: t = Type::tfloat32; goto LabelX; \
* case TOKfloat64: t = Type::tfloat64; goto LabelX; \
* case TOKfloat80: t = Type::tfloat80; goto LabelX; \
* case TOKimaginary32: t = Type::timaginary32; goto LabelX; \
* case TOKimaginary64: t = Type::timaginary64; goto LabelX; \
* case TOKimaginary80: t = Type::timaginary80; goto LabelX; \
* case TOKcomplex32: t = Type::tcomplex32; goto LabelX; \
* case TOKcomplex64: t = Type::tcomplex64; goto LabelX; \
* case TOKcomplex80: t = Type::tcomplex80; goto LabelX; \
* case TOKbit: t = Type::tbit; goto LabelX; \
* case TOKchar: t = Type::tchar; goto LabelX; \
* case TOKwchar: t = Type::twchar; goto LabelX; \
* case TOKdchar: t = Type::tdchar; goto LabelX; \
* LabelX
+/