|
|
|
// ---------------------------------------------------------------------------
// Build-time configuration
// ---------------------------------------------------------------------------

// The PCRE2-specific handling in this file is compiled only when
// REGEXP_IS_PCRE2 is defined.  It is explicitly undefined here; the macro
// name must match the one tested by the #ifdef blocks in this file.
#undef REGEXP_IS_PCRE2

// When defined, ASCII code points bypass the equivalents table lookup.
#define OPTIMIZE_ASCII_LOOKUP

#ifdef REGEXP_IS_PCRE2
#pragma message "############ Assuming regular expressions are PCRE2 ############"
#endif

#ifdef OPTIMIZE_ASCII_LOOKUP
#pragma message "############ ASCII characters will be processed separately ############"
#endif
|
|
|
|
|
|
|
|
#include "tequivchars.h"
|
|
|
|
|
|
|
|
struct defaultCollation {
|
|
|
|
TQChar character;
|
|
|
|
TQChar collatesTo;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Table of character equivalents; its rows come from the included mapping
// file.  replaceChars() binary-searches this table on the 'character'
// field, so the rows must be in ascending order of that field.
static const defaultCollation EquivalentsTable[] = {
#include "tequivchars-mapping.h"
};
|
|
|
|
// Number of rows in EquivalentsTable (total size divided by the size of one row).
uint EquivTableROWS = sizeof( EquivalentsTable ) / sizeof( *EquivalentsTable );
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Returns a copy of @p inputString in which each character has been replaced
 * by its equivalent:
 *  - with OPTIMIZE_ASCII_LOOKUP defined, ASCII 'A'..'Z' become 'a'..'z' and
 *    every other ASCII character is kept as is;
 *  - any other letter or number is looked up in EquivalentsTable and replaced
 *    by its 'collatesTo' entry when found;
 *  - everything else is copied unchanged.
 *
 * @param inputString  text (or regular expression) to convert
 * @param isRegex      when true, regular expression syntax is left untouched:
 *                     backslash escapes, the characters delimiting a character
 *                     class, POSIX bracket expressions [:___:] and, if
 *                     REGEXP_IS_PCRE2 is defined, \Q___\E literals, \c{___}
 *                     brace expressions, (*___) directives and (?<___> group
 *                     names.
 * @return the converted string; it has the same length as @p inputString.
 */
const TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex )
{
  int inStrLen = inputString.length();
  TQString outString = TQString::fromLatin1( "" );
  outString.reserve( inStrLen );
  const TQChar *char16 = inputString.unicode();

  bool backSlashed = false;         // \_
  bool startedCharClass = false;    // Previous character was starting '[' of character class
  bool inCharacterClass = false;    // [___]
  bool inPosixBracketExpr = false;  // [:___:]
#ifdef REGEXP_IS_PCRE2
  bool quoteLiteral = false;        // \Q___\E
  bool inBraceExpr = false;         // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'
  bool inDirective = false;         // (*___)
  bool inGroupName = false;         // (?<___>
#endif // REGEXP_IS_PCRE2
  TQChar currChar = 0;
  TQChar prevChar = 0;
  TQChar nextChar = 0;

  // The increment clause stores currChar into the output on every pass,
  // including passes that are cut short by 'continue'.  A 'continue' thus
  // means "keep this character as it currently is".
  for ( int i = 0 ; i < inStrLen ; outString[i] = currChar, i++ ) {

    prevChar = currChar;
    currChar = char16[i].unicode();

    if ( isRegex ) {

      /*
         Look for regex characters and character sequences
         that should never be converted to an equivalent.
      */

      if ( i < ( inStrLen - 1 ) )
        nextChar = char16[i+1].unicode();
      else
        nextChar = 0;

      if ( currChar == '\\' ) {
        // A backslash opens an escape unless it is itself escaped ("\\"),
        // in which case it closes the escape (handled just below) so that
        // the character following "\\" is converted normally.
        bool startsEscape = ! backSlashed;
#ifdef REGEXP_IS_PCRE2
        // Inside \Q___\E a backslash is literal and cannot be escaped, so
        // every backslash may be the start of the terminating \E.
        if ( quoteLiteral ) startsEscape = true;
#endif // REGEXP_IS_PCRE2
        if ( startsEscape ) {
          backSlashed = true;
          continue;
        }
      }

      // Don't convert backSlashed characters
      if ( backSlashed ) {
#ifdef REGEXP_IS_PCRE2
        switch (currChar) {
          case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
          case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
          case 'x' : // Entering hexadecimal code specification \x{___} ?
          case 'o' : // Entering octal code specification \o{___} ?
          case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
          case 'P' : // Entering (negated) Unicode property specification \P{} ?
          case 'p' : // Entering Unicode property specification \p{} ?
          case 'g' : // Entering a named backreference \g{___} ?
            if ( nextChar == '{' ) inBraceExpr = true;
            break;
        }
#endif // REGEXP_IS_PCRE2
        backSlashed = false;
        continue;
      }

#ifdef REGEXP_IS_PCRE2
      if ( quoteLiteral )
        continue;

      if ( inBraceExpr ) {
        // Is it time to leave brace expression {___} ?
        // (the closing '}' itself is handled as an ordinary character)
        if ( nextChar == '}' ) inBraceExpr = false;
        continue;
      }
#endif // REGEXP_IS_PCRE2

      if ( startedCharClass ) {
        // Negated character class: keep the flag set so that a ']' or ':'
        // right after the '^' is still treated as a class member.
        if ( currChar == '^' )
          continue;

        startedCharClass = false;

        // ']' is part of the character class here, not its closure, and
        // ':' is part of the class, not the start of a bracket expression.
        if ( currChar == ']' || currChar == ':' )
          continue;
      } // startedCharClass

      if ( inCharacterClass ) {

        if ( inPosixBracketExpr ) {
          // Is it time to leave POSIX bracket expression [:___:] ?
          if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false;
          continue;
        } // inPosixBracketExpr

        if ( prevChar == '[' && currChar == ':' ) {
          // Enter POSIX bracket expression [:___:]
          inPosixBracketExpr = true;
          continue;
        }

        if ( currChar == ']' ) {
          // Leaving character class [___]
          inCharacterClass = false;
          continue;
        }

      } // inCharacterClass

      else { // ! inCharacterClass

        if ( currChar == '[' ) {
          // Entering a character class [___]
          startedCharClass = true;
          inCharacterClass = true;
          continue;
        }

#ifdef REGEXP_IS_PCRE2
        if ( currChar == '*' ) {
          // Entering a PCRE2 directive (*___) ?
          if ( prevChar == '(' ) inDirective = true;
          continue;
        }

        if ( currChar == '?' ) {
          // Entering PCRE2 group name (?<___>) ?
          if ( prevChar == '(' && nextChar == '<' ) inGroupName = true;
          continue;
        }

        if ( inDirective ) {
          // Is it time to leave PCRE2 directive (*___) ?
          if ( currChar == ')' ) inDirective = false;
          continue;
        }

        if ( inGroupName ) {
          // Is it time to leave PCRE2 group name (?<___>) ?
          if ( currChar == '>' ) inGroupName = false;
          continue;
        }
#endif // REGEXP_IS_PCRE2

      } // ! inCharacterClass

      /*
         If we have reached here, this regex character is a
         candidate for potential conversion to an equivalent.
      */

    } // isRegex

    //-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '";

#ifdef OPTIMIZE_ASCII_LOOKUP
    // We can process ASCII quickly without using lookup table
    unsigned short codepoint = currChar.unicode();
    if ( codepoint < 128 ) {
      if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII
        currChar = TQChar( codepoint + 32 );  // to corresponding lower case

      // All other ASCII characters are equivalent to themselves
      //-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl;
      continue;
    }
#endif

    // Only letters and numbers are in the table
    if ( ! currChar.isLetterOrNumber() )
      continue;

    // Use a simple binary search to look up an equivalent character
    int low = 0;
    int high = EquivTableROWS - 1;
    while ( low <= high ) {
      int mid = low + ( high - low ) / 2;
      if ( currChar == EquivalentsTable[mid].character ) {
        // Found equivalent character, use it instead
        currChar = EquivalentsTable[mid].collatesTo;
        break;
      }
      if ( EquivalentsTable[mid].character < currChar )
        low = mid + 1;
      else
        high = mid - 1;
    }
    //-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl;

  }

  return outString;
}
|