You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdepim/libksieve/parser/lexer.cpp

667 lines
17 KiB

/* -*- c++ -*-
parser/lexer.cpp
This file is part of KSieve,
the KDE internet mail/usenet news message filtering library.
Copyright (c) 2002-2003 Marc Mutz <mutz@kde.org>
KSieve is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License, version 2, as
published by the Free Software Foundation.
KSieve is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
In addition, as a special exception, the copyright holders give
permission to link the code of this program with any edition of
the TQt library by Trolltech AS, Norway (or with modified versions
of TQt that use the same license as TQt), and distribute linked
combinations including the two. You must obey the GNU General
Public License in all respects for all of the code used other than
TQt. If you modify this file, you may extend this exception to
your version of the file, but you are not obligated to do so. If
you do not wish to do so, delete this exception statement from
your version.
*/
#include <config.h>
#include <ksieve/lexer.h>
#include <impl/lexer.h>
#include <impl/utf8validator.h>
#include <ksieve/error.h>
#include <tqstring.h>
#include <tqstringlist.h>
#include <tqtextcodec.h>
#include <memory> // std::auto_ptr
#include <assert.h>
#include <ctype.h> // isdigit
// STR_DIM(x) yields sizeof(x) - 1; applied to a string literal such as
// "text:" this is the number of characters without the terminating NUL.
// Any definition that is already active is dropped first so that this one
// is the definition in effect for the rest of the file.
#ifdef STR_DIM
# undef STR_DIM
#endif
#define STR_DIM(x) (sizeof(x) - 1)
namespace KSieve {

  //
  //
  // Lexer Bridge implementation
  //
  // Every public Lexer member simply forwards to the private
  // implementation object held in 'i'.
  //
  //

  // Creates the implementation object for the input range
  // [scursor, send) with the given option flags.
  Lexer::Lexer( const char * scursor, const char * send, int options )
    : i( new Impl( scursor, send, options ) )
  {
  }

  // Destroys the implementation object and clears the pointer.
  Lexer::~Lexer() {
    delete i;
    i = 0;
  }

  // Forwards to Impl::ignoreComments().
  bool Lexer::ignoreComments() const {
    assert( i );
    return i->ignoreComments();
  }

  // Forwards to Impl::error().
  const Error & Lexer::error() const {
    assert( i );
    return i->error();
  }

  // Forwards to Impl::atEnd().
  bool Lexer::atEnd() const {
    assert( i );
    return i->atEnd();
  }

  // Forwards to Impl::column().
  int Lexer::column() const {
    assert( i );
    return i->column();
  }

  // Forwards to Impl::line().
  int Lexer::line() const {
    assert( i );
    return i->line();
  }

  // Forwards to Impl::save().
  void Lexer::save() {
    assert( i );
    i->save();
  }

  // Forwards to Impl::restore().
  void Lexer::restore() {
    assert( i );
    i->restore();
  }

  // Forwards to Impl::nextToken(); the token text is stored in 'result'.
  Lexer::Token Lexer::nextToken( TQString & result ) {
    assert( i );
    return i->nextToken( result );
  }

} // namespace KSieve
// Character classes for the 7-bit ASCII range, stored as 128-bit bitmaps:
// the flag for character ch lives in byte ch/8, bit (0x80 >> ch%8), i.e.
// the most significant bit of each byte belongs to the lowest character.

// iText: none except a-zA-Z0-9_
static const unsigned char iTextMap[16] = {
  0x00, 0x00, 0x00, 0x00, // CTLs:        none
  0x00, 0x00, 0xFF, 0xC0, // SP ... '?':  0-9
  0x7F, 0xFF, 0xFF, 0xE1, // '@' ... '_': A-Z_
  0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL: a-z
};

// delim: SP, HT, CR, LF, {}[]();,#/
// ### exclude '['? Why would one want to write identifier["foo"]?
//
// The last byte of the two lower rows is 0x14, not 0x16: 0x16 would
// additionally flag '^' (and '~') as delimiters, although illegalMap below
// classifies both as illegal characters.
static const unsigned char delimMap[16] = {
  0x00, 0x64, 0x00, 0x00, // CTLs:        CR, HT, LF
  0x90, 0xC9, 0x00, 0x10, // SP ... '?':  SP, #(),/;
  0x00, 0x00, 0x00, 0x14, // '@' ... '_': []
  0x00, 0x00, 0x00, 0x14  // '`' ... DEL: {}
};

// illegal: All except iText, delim, "*:
static const unsigned char illegalMap[16] = {
  0xFF, 0x9B, 0xFF, 0xFF,
  0x4F, 0x16, 0x00, 0x0F,
  0x80, 0x00, 0x00, 0x0A,
  0x80, 0x00, 0x00, 0x0A
};

// Returns whether the bit for 'ch' is set in the 128-bit bitmap 'map'.
// 'ch' must be below 128; callers guard the range before calling.
static inline bool isOfSet( const unsigned char map[16], unsigned char ch ) {
  assert( ch < 128 );
  return ( map[ ch/8 ] & ( 0x80 >> ( ch%8 ) ) ) != 0;
}

// True for characters that may appear in an identifier (a-zA-Z0-9_).
// The range test keeps 8-bit characters away from the table lookup.
static inline bool isIText( unsigned char ch ) {
  return ch <= 'z' && isOfSet( iTextMap, ch );
}

// True for characters that terminate an identifier or number
// (white space, brackets, separators and comment starters).
static inline bool isDelim( unsigned char ch ) {
  return ch <= '}' && isOfSet( delimMap, ch );
}

// True for characters that are illegal outside of strings and comments;
// everything from '~' upwards (including all 8-bit characters) is illegal
// without consulting the table.
static inline bool isIllegal( unsigned char ch ) {
  return ch >= '~' || isOfSet( illegalMap, ch );
}
// True when the high bit of the byte is set, i.e. when the character lies
// outside the 7-bit ASCII range (a negative value for a signed char).
static inline bool is8Bit( signed char ch ) {
  return ( ch & 0x80 ) != 0;
}
// Returns 's' without its trailing line terminator: a final "\r\n" is
// removed as a pair, otherwise a single final "\n"; a string without either
// is returned unchanged.
static TQString removeCRLF( const TQString & s ) {
  int chop = 0; // number of characters to drop from the end
  if ( s.endsWith( "\r\n" ) )
    chop = 2;
  else if ( s.endsWith( "\n" ) )
    chop = 1;
  return s.left( s.length() - chop );
}
// Undoes dot-stuffing: a line starting with two dots loses the first one;
// any other line is returned as is.
static TQString removeDotStuff( const TQString & s ) {
  if ( s.startsWith( ".." ) )
    return s.mid( 1 );
  return s;
}
namespace KSieve {
//
//
// Lexer Implementation
//
//
// Sets up lexing of the range [scursor, send) with the given option flags.
// If one of the two pointers is null, the other one is used for both the
// start and the end, which must then leave the lexer at its end.
Lexer::Impl::Impl( const char * scursor, const char * send, int options )
  : mState( scursor ? scursor : send ),
    mEnd( send ? send : scursor ),
    mIgnoreComments( options & IgnoreComments ),
    mIgnoreLF( options & IgnoreLineFeeds )
{
  // with a null argument there is nothing to lex:
  assert( ( scursor && send ) || atEnd() );
}
// Scans the next token and stores its text (if any) in 'result', which is
// cleared first.
//
// Leading white space is skipped (including comments when ignoreComments()
// is set). If that crossed one or more line boundaries and line feeds are
// not ignored, LineFeeds is returned with 'result' set to the number of
// lines crossed. None is returned when skipping white space failed, when
// the input is exhausted afterwards, or when the next character cannot
// start any token. In all other cases the token kind is chosen from the
// first character; the return value of the sub-parser is not looked at
// here, failures are recorded by the sub-parsers through makeError().
//
// Must not be called when atEnd().
Lexer::Token Lexer::Impl::nextToken( TQString & result ) {
  assert( !atEnd() );
  result = TQString();
  //clearErrors();

  const int oldLine = line();

  const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS() ;

  if ( !ignoreLineFeeds() && oldLine != line() ) {
    result.setNum( line() - oldLine ); // return number of linefeeds encountered
    return LineFeeds;
  }

  if ( !eatingWSSucceeded )
    return None;

  if ( atEnd() )
    return None;

  switch ( *mState.cursor ) {
  case '#': // HashComment
    assert( !ignoreComments() );
    ++mState.cursor;
    if ( !atEnd() )
      parseHashComment( result, true );
    return HashComment;
  case '/': // BracketComment
    assert( !ignoreComments() );
    ++mState.cursor; // eat slash
    if ( atEnd() || *mState.cursor != '*' ) {
      makeError( Error::SlashWithoutAsterisk );
      return BracketComment;
    }
    ++mState.cursor; // eat asterisk
    if ( atEnd() ) {
      makeError( Error::UnfinishedBracketComment );
      return BracketComment;
    }
    parseBracketComment( result, true );
    return BracketComment;
  case ':': // Tag
    ++mState.cursor;
    if ( atEnd() ) {
      makeError( Error::UnexpectedCharacter, line(), column() - 1 );
      return Tag;
    }
    if ( !isIText( *mState.cursor ) ) {
      makeIllegalCharError( *mState.cursor );
      return Tag;
    }
    parseTag( result );
    return Tag;
  case '"': // QuotedString
    ++mState.cursor;
    parseQuotedString( result );
    return QuotedString;
  case '{':
  case '}':
  case '[':
  case ']':
  case '(':
  case ')':
  case ';':
  case ',': // Special
    result = *mState.cursor++;
    return Special;
  case '0':
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9': // Number
    parseNumber( result );
    return Number;
  case 't': // maybe MultiLineString, else Identifier
    // Only compare when five characters are really left: the input is
    // bounded by an end pointer, not necessarily by a NUL, so an unguarded
    // compare could read beyond the buffer and a spurious match would move
    // the cursor past the end.
    if ( charsLeft() >= static_cast<int>( STR_DIM("text:") ) &&
         _strnicmp( mState.cursor, "text:", STR_DIM("text:") ) == 0 ) {
      // MultiLineString
      mState.cursor += STR_DIM("text:");
      parseMultiLine( result );
      // ### FIXME: There can be a hash-comment between "text:"
      // and CRLF! That should be preserved somehow...
      return MultiLineString;
    }
    // else fall through:
  default: // Identifier (first must not be 0-9, and can't (caught by Number above))
    if ( !isIText( *mState.cursor ) ) {
      makeError( Error::IllegalCharacter );
      return None;
    }
    parseIdentifier( result );
    return Identifier;
  }
}
// Skips spaces, tabs and line breaks (comments are NOT skipped).
// Returns false only when a line break turned out to be malformed (see
// eatCRLF()); returns true when a non-white-space character or the end of
// the input has been reached.
bool Lexer::Impl::eatWS() {
  while ( !atEnd() ) {
    const char ch = *mState.cursor;
    if ( ch == ' ' || ch == '\t' ) {
      ++mState.cursor;
    } else if ( ch == '\r' || ch == '\n' ) {
      if ( !eatCRLF() )
        return false;
    } else {
      // first character that is not white space
      return true;
    }
  }
  // at end:
  return true;
}
// Consumes one line break at the cursor, which must be on a CR or LF.
// Both CRLF and a lone LF are accepted; a CR that is not followed by LF
// sets Error::CRWithoutLF and yields false. On success newLine() is called
// with the cursor on the LF.
bool Lexer::Impl::eatCRLF() {
  assert( !atEnd() );
  assert( *mState.cursor == '\n' || *mState.cursor == '\r' );

  if ( *mState.cursor == '\r' ) {
    ++mState.cursor; // step over the CR
    if ( atEnd() || *mState.cursor != '\n' ) {
      // CR w/o LF -> error
      makeError( Error::CRWithoutLF );
      return false;
    }
  }
  // cursor is on the LF of a good CRLF, or on a lone LF
  newLine();
  return true;
}
// Parses the remainder of a hash comment; the cursor must be right behind
// the '#'.
//
//   hash-comment := "#" *CHAR-NOT-CRLF CRLF
//
// The comment text (without '#' and without the line break) is validated as
// UTF-8 and, if 'reallySave' is set, appended to 'result'. The terminating
// line break is consumed; a comment that runs up to the end of the input is
// accepted as well. Returns false (with an error set) on a malformed line
// break or invalid UTF-8.
//
// Comments of every length take the same path. There is deliberately no
// shortcut for short comments: returning early for a one-character comment
// would drop its text and leave the line break unconsumed.
bool Lexer::Impl::parseHashComment( TQString & result, bool reallySave ) {
  // check that the caller plays by the rules:
  assert( *(mState.cursor-1) == '#' );

  const char * const commentStart = mState.cursor;

  // find next CRLF:
  while ( !atEnd() && *mState.cursor != '\n' && *mState.cursor != '\r' )
    ++mState.cursor;

  // measure before the line break is consumed:
  const int commentLength = mState.cursor - commentStart;

  if ( !atEnd() && !eatCRLF() )
    return false;

  if ( commentLength > 0 ) {
    if ( !isValidUtf8( commentStart, commentLength ) ) {
      makeError( Error::InvalidUTF8 );
      return false;
    }
    if ( reallySave )
      result += TQString::fromUtf8( commentStart, commentLength );
  }
  return true;
}
// Parses the remainder of a bracket comment; the cursor must be right
// behind the opening "/*".
//
//   bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
//
// The text between the delimiters is validated as UTF-8 and, if
// 'reallySave' is set, appended to 'result' with all CR characters removed.
// On success the cursor is left behind the closing "*/". A comment that is
// not closed before the end of the input yields
// Error::UnfinishedBracketComment, reported at the position of the "/*".
bool Lexer::Impl::parseBracketComment( TQString & result, bool reallySave ) {
  // check that caller plays by the rules:
  assert( *(mState.cursor-2) == '/' );
  assert( *(mState.cursor-1) == '*' );

  const char * const commentStart = mState.cursor;
  const int commentCol = column() - 2;
  const int commentLine = line();

  // find the next asterisk that is directly followed by a slash:
  for ( ;; ) {
    if ( !skipTo( '*' ) ) {
      if ( !error() )
        makeError( Error::UnfinishedBracketComment, commentLine, commentCol );
      return false;
    }
    if ( atEnd() )
      break;
    ++mState.cursor; // step over the asterisk
    // The asterisk may have been the very last character; test for the end
    // BEFORE looking at the character behind it, so that nothing beyond
    // the input buffer is ever read.
    if ( atEnd() || *mState.cursor == '/' )
      break;
  }

  if ( atEnd() ) {
    makeError( Error::UnfinishedBracketComment, commentLine, commentCol );
    return false;
  }

  assert( *mState.cursor == '/' );

  // cursor is on the closing slash; leave out the asterisk in front of it:
  const int commentLength = mState.cursor - commentStart - 1;
  if ( commentLength > 0 ) {
    if ( !isValidUtf8( commentStart, commentLength ) ) {
      makeError( Error::InvalidUTF8 );
      return false;
    }
    if ( reallySave ) {
      TQString tmp = TQString::fromUtf8( commentStart, commentLength );
      result += tmp.remove( '\r' ); // get rid of CR in CRLF pairs
    }
  }

  ++mState.cursor; // eat '/'
  return true;
}
// Parses a comment starting at the cursor.
//
//   comment := hash-comment / bracket-comment
//
// Returns false without setting an error when the cursor is not on a
// comment starter at all. A '/' that is not followed by '*' sets
// Error::IllegalCharacter. Otherwise the result of the specific comment
// parser is passed on.
bool Lexer::Impl::parseComment( TQString & result, bool reallySave ) {
  const char lead = *mState.cursor;

  if ( lead == '#' ) {
    ++mState.cursor; // eat '#'
    return parseHashComment( result, reallySave );
  }

  if ( lead != '/' )
    return false; // don't set an error here - there was no comment

  if ( charsLeft() >= 2 && mState.cursor[1] == '*' ) {
    mState.cursor += 2; // eat "/*"
    return parseBracketComment( result, reallySave );
  }

  makeError( Error::IllegalCharacter );
  return false;
}
bool Lexer::Impl::eatCWS() {
// white-space := 1*(SP / CRLF / HTAB / comment )
while ( !atEnd() ) {
switch( *mState.cursor ) {
case ' ':
case '\t': // SP / HTAB
++mState.cursor;
break;;
case '\n':
case '\r': // CRLF
if ( !eatCRLF() )
return false;
break;
case '#':
case '/': // comments
{
TQString dummy;
if ( !parseComment( dummy ) )
return false;
}
break;
default:
return true;
}
}
return true;
}
// Parses an identifier starting at the cursor and appends it to 'result'.
//
//   identifier := (ALPHA / "_") *(ALPHA / DIGIT / "_")
//
// A leading digit sets Error::NoLeadingDigits. The identifier must be
// followed by a delimiter or the end of the input; otherwise an
// illegal/unexpected character error is set for the offending character
// (the identifier text has already been appended to 'result' by then).
bool Lexer::Impl::parseIdentifier( TQString & result ) {
  assert( isIText( *mState.cursor ) );

  const char * const start = mState.cursor;

  // first char: no digits allowed
  if ( isdigit( *mState.cursor ) ) {
    makeError( Error::NoLeadingDigits );
    return false;
  }

  // rest of identifier chars ( now digits are allowed ):
  ++mState.cursor;
  while ( !atEnd() && isIText( *mState.cursor ) )
    ++mState.cursor;

  const int length = mState.cursor - start;

  // Can use the fast tqfromLatin1 here, since identifiers are always
  // in the us-ascii subset:
  result += TQString::tqfromLatin1( start, length );

  const bool properlyTerminated = atEnd() || isDelim( *mState.cursor );
  if ( !properlyTerminated ) {
    makeIllegalCharError( *mState.cursor );
    return false;
  }
  return true;
}
// Parses the identifier part of a tag; the cursor must be right behind the
// ':' and on an identifier character.
//
//   tag := ":" identifier
//
// The identifier (without the colon) is appended to 'result'.
bool Lexer::Impl::parseTag( TQString & result ) {
  // check that the caller plays by the rules:
  assert( *(mState.cursor-1) == ':' );
  assert( !atEnd() );
  assert( isIText( *mState.cursor ) );

  const bool ok = parseIdentifier( result );
  return ok;
}
// Parses a number starting at the cursor and appends its characters
// (digits plus optional quantifier, as written) to 'result'.
//
//   number      := 1*DIGIT [TQUANTIFIER]
//   TQUANTIFIER := "K" / "M" / "G"    (accepted in either case here)
//
// The number must be followed by a delimiter or the end of the input;
// anything else sets an illegal/unexpected character error.
bool Lexer::Impl::parseNumber( TQString & result ) {
  assert( isdigit( *mState.cursor ) );

  // the digits:
  while ( !atEnd() && isdigit( *mState.cursor ) )
    result += *mState.cursor++;

  if ( atEnd() || isDelim( *mState.cursor ) )
    return true;

  // neither end nor delimiter: only a quantifier is acceptable now
  const char q = *mState.cursor;
  const bool isQuantifier = q == 'G' || q == 'g'
                         || q == 'M' || q == 'm'
                         || q == 'K' || q == 'k';
  if ( !isQuantifier ) {
    makeIllegalCharError();
    return false;
  }
  result += *mState.cursor++;

  // quantifier found. Check for delimiter:
  if ( !atEnd() && !isDelim( *mState.cursor ) ) {
    makeIllegalCharError();
    return false;
  }
  return true;
}
// Parses a multi-line string; the cursor must be right behind "text:".
//
//   multi-line          := "text:" *(SP / HTAB) (hash-comment / CRLF)
//                          *(multi-line-literal / multi-line-dotstuff)
//                          "." CRLF
//   multi-line-literal  := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
//   multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
//   ;; A line containing only "." ends the multi-line.
//   ;; Remove a leading '.' if followed by another '.'.
//
// On success 'result' is set to the collected lines joined by "\n", with
// dot-stuffing undone and without the terminating "." line.
// Errors: Error::NonCWSAfterTextColon for other characters on the "text:"
// line, Error::InvalidUTF8 for a line that is not valid UTF-8 and
// Error::PrematureEndOfMultiLine (reported at the position of "text:") when
// the input ends before the terminating "." line.
//
// Whether the terminator was seen is tracked explicitly and decided on the
// raw line, before dot-stuffing is undone. A dot-stuffed ".." line stands
// for a literal "." line and is not the terminator; looking at the stored
// (un-stuffed) text instead would confuse the two at the end of the input.
bool Lexer::Impl::parseMultiLine( TQString & result ) {
  assert( _strnicmp( mState.cursor - 5, "text:", STR_DIM("text:") ) == 0 );

  const int mlBeginLine = line();
  const int mlBeginCol = column() - 5;

  // Consume the rest of the "text:" line: blanks, then either a hash
  // comment or the line break.
  bool headerDone = false;
  while ( !headerDone && !atEnd() ) {
    switch ( *mState.cursor ) {
    case ' ':
    case '\t':
      ++mState.cursor;
      break;
    case '#':
      {
        ++mState.cursor;
        TQString dummy;
        if ( !parseHashComment( dummy ) )
          return false;
        headerDone = true;
      }
      break;
    case '\n':
    case '\r':
      if ( !eatCRLF() )
        return false;
      headerDone = true;
      break;
    default:
      makeError( Error::NonCWSAfterTextColon );
      return false;
    }
  }

  if ( atEnd() ) {
    makeError( Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol );
    return false;
  }

  // Now, collect the single lines until one with only a single dot is found:
  TQStringList lines;
  bool terminated = false;
  while ( !terminated && !atEnd() ) {
    const char * const oldBeginOfLine = beginOfLine();
    if ( !skipToCRLF() )
      return false;
    const int lineLength = mState.cursor - oldBeginOfLine;
    if ( lineLength > 0 ) {
      if ( !isValidUtf8( oldBeginOfLine, lineLength ) ) {
        makeError( Error::InvalidUTF8 );
        return false;
      }
      const TQString line = removeCRLF( TQString::fromUtf8( oldBeginOfLine, lineLength ) );
      if ( line == "." )
        terminated = true; // the lone dot itself is not part of the text
      else
        lines.push_back( removeDotStuff( line ) );
    } else {
      lines.push_back( TQString() );
    }
  }

  if ( !terminated ) {
    makeError( Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol );
    return false;
  }

  result = lines.join("\n");
  return true;
}
// Parses the remainder of a quoted string; the cursor must be right behind
// the opening double quote.
//
//   quoted-string := DTQUOTE *CHAR DTQUOTE
//
// The decoded contents are appended to 'result':
//  - a line break inside the string is stored as a single '\n',
//  - a backslash makes the following character be taken literally,
//  - runs of 8-bit characters are validated and decoded as UTF-8.
// On success the cursor is left behind the closing quote. Errors:
// Error::InvalidUTF8 (reported at the start of the offending 8-bit run) and
// Error::PrematureEndOfQuotedString (reported at the opening quote) when the
// input ends before the closing quote.
bool Lexer::Impl::parseQuotedString( TQString & result ) {
  // check that caller plays by the rules:
  assert( *(mState.cursor-1) == '"' );

  const int qsBeginCol = column() - 1;
  const int qsBeginLine = line();

  const TQTextCodec * const codec = TQTextCodec::codecForMib( 106 ); // UTF-8
  assert( codec );
  const std::auto_ptr<TQTextDecoder> dec( codec->makeDecoder() );
  assert( dec.get() );

  while ( !atEnd() ) {
    const char current = *mState.cursor;

    if ( current == '"' ) {
      // closing quote
      ++mState.cursor;
      return true;
    }

    if ( current == '\r' || current == '\n' ) {
      if ( !eatCRLF() )
        return false;
      result += '\n';
      continue;
    }

    if ( current == '\\' ) {
      ++mState.cursor; // eat the backslash
      if ( atEnd() )
        break; // nothing left to escape -> premature end
      // else: the escaped character is handled like any other below
    }

    if ( !is8Bit( *mState.cursor ) ) {
      result += *mState.cursor++;
      continue;
    }

    // probably UTF-8: take the whole run of 8-bit characters at once
    const char * const runBegin = mState.cursor;
    skipTo8BitEnd();
    const int runLength = mState.cursor - runBegin;
    assert( runLength > 0 );
    if ( !isValidUtf8( runBegin, runLength ) ) {
      assert( column() >= runLength );
      makeError( Error::InvalidUTF8, line(), column() - runLength );
      return false;
    }
    result += dec->toUnicode( runBegin, runLength );
  }

  makeError( Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol );
  return false;
}
void Lexer::Impl::makeIllegalCharError( char ch ) {
makeError( isIllegal( ch ) ? Error::IllegalCharacter : Error::UnexpectedCharacter );
}
} // namespace KSieve