// You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
// tdelibs/tdecore/tequivchars.cpp
//
// 223 lines
// 6.6 KiB

// Build-time switches for TEquivChars::replaceChars().
//
// REGEXP_IS_PCRE2: when defined, replaceChars() additionally recognises
// PCRE2-only syntax (\Q...\E, \p{...}, (*...), (?<name>...)) as text that
// must not be converted.  It is explicitly switched off here; change the
// #undef to a #define to enable it.  The macro name must match the one
// tested by every #ifdef below and inside replaceChars().
#undef REGEXP_IS_PCRE2
// OPTIMIZE_ASCII_LOOKUP: when defined, ASCII characters are handled inline
// without consulting the lookup table.
#define OPTIMIZE_ASCII_LOOKUP
#ifdef REGEXP_IS_PCRE2
#pragma message "############ Assuming regular expressions are PCRE2 ############"
#endif
#ifdef OPTIMIZE_ASCII_LOOKUP
#pragma message "############ ASCII characters will be processed separately ############"
#endif
#include "tequivchars.h"
// One row of the equivalence table: a character and the character that
// replaceChars() substitutes for it.  The member order matters because the
// rows are brace-initialised from the included mapping header.
struct defaultCollation {
// Lookup key; replaceChars() binary-searches the table on this member.
TQChar character;
// Replacement written to the output when `character` is matched.
TQChar collatesTo;
};
// Table of character equivalents; its rows are supplied by the included
// tequivchars-mapping.h.  replaceChars() binary-searches this table on
// `character`, so the rows are presumably sorted ascending by that member --
// verify against the mapping header.
static const defaultCollation EquivalentsTable[] = {
#include "tequivchars-mapping.h"
};
// Number of rows in EquivalentsTable (total array size divided by the size of one row).
uint EquivTableROWS = sizeof EquivalentsTable / sizeof EquivalentsTable[0];
/**
 * Builds a copy of inputString in which every convertible character has been
 * replaced by its equivalent character.
 *
 * With OPTIMIZE_ASCII_LOOKUP defined, upper case ASCII letters are mapped to
 * lower case and all other ASCII characters are kept unchanged.  Any other
 * letter or number is looked up in EquivalentsTable and replaced by its
 * collatesTo value when a row is found; characters without a row are kept.
 *
 * @param inputString  The text, or regular expression, to convert.
 * @param isRegex      When true, inputString is scanned as a regular
 *                     expression and characters that belong to regex syntax
 *                     (escape sequences, the special characters that open or
 *                     close a character class, POSIX bracket expressions and,
 *                     when REGEXP_IS_PCRE2 is defined, several PCRE2
 *                     constructs) are copied through without conversion.
 * @return The converted string; exactly one output character is written for
 *         each input character.
 */
const TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex )
{
    int inStrLen = inputString.length();
    TQString outString = TQString::fromLatin1( "" );
    outString.reserve( inStrLen );
    const TQChar *char16 = inputString.unicode();

    bool backSlashed = false;        // \_
    bool startedCharClass = false;   // Previous character was starting '[' of character class
    bool inCharacterClass = false;   // [___]
    bool inPosixBracketExpr = false; // [:___:]
#ifdef REGEXP_IS_PCRE2
    bool quoteLiteral = false;       // \Q___\E
    bool inBraceExpr = false;        // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'
    bool inDirective = false;        // (*___)
    bool inGroupName = false;        // (?<___>
#endif // REGEXP_IS_PCRE2

    TQChar currChar = 0;
    TQChar prevChar = 0;
    TQChar nextChar = 0;

    // The increment clause stores currChar into the output, so every
    // 'continue' below means "emit currChar as it currently stands and move
    // on to the next input character".
    for ( int i = 0 ; i < inStrLen ; outString[i] = currChar, i++ ) {
        prevChar = currChar;
        currChar = char16[i].unicode();

        if ( isRegex ) {
            /*
                Look for regex characters and character sequences
                that should never be converted to an equivalent.
            */
            if ( i < ( inStrLen - 1 ) )
                nextChar = char16[i+1].unicode();
            else
                nextChar = 0;

            // Don't convert backSlashed characters.  This is tested BEFORE
            // looking for a new backslash so that the second backslash of an
            // escaped backslash ("\\\\" in the pattern) ends the escape
            // instead of starting another one.
            if ( backSlashed ) {
#ifdef REGEXP_IS_PCRE2
                switch (currChar) {
                    case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
                    case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
                    case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
                    case 'P' : // Entering (negated) Unicode property specification \P{} ?
                    case 'p' : // Entering Unicode property specification \p{} ?
                    case 'g' : // Entering a named backreference \g{___} ?
                    case 'x' : // Entering a hexadecimal character code \x{___} ?
                    case 'o' : // Entering an octal character code \o{___} ?
                        if ( nextChar == '{' ) inBraceExpr = true;
                        break;
                }
#endif // REGEXP_IS_PCRE2
                backSlashed = false;
                continue;
            }

            if ( currChar == '\\' ) {
                backSlashed = true;
                // An escape sequence is an ordinary member of a character
                // class, so the "just after '['" special cases no longer
                // apply to whatever follows it.
                startedCharClass = false;
                continue;
            }

#ifdef REGEXP_IS_PCRE2
            if ( quoteLiteral )
                continue;

            if ( inBraceExpr ) {
                // Is it time to leave brace expression {___} ?
                if ( nextChar == '}' ) inBraceExpr = false;
                continue;
            }
#endif // REGEXP_IS_PCRE2

            if ( startedCharClass ) {
                switch (currChar) {
                    case '^' : // Negated character class, proceed to next character
                        continue; // Bypass converting this special character
                    case ']' : // Treat as part of character class, not as a closure
                    case ':' : // Treat as part of character class, not as start of bracket expression
                        startedCharClass = false;
                        continue; // Bypass converting these special characters
                }
                startedCharClass = false;
            } // startedCharClass

            if ( inCharacterClass ) {
                if ( inPosixBracketExpr ) {
                    // Leave POSIX bracket expression [:___:] only once its
                    // own closing ']' has been consumed; otherwise that ']'
                    // would be mistaken for the end of the character class.
                    if ( prevChar == ':' && currChar == ']' ) inPosixBracketExpr = false;
                    continue;
                } // inPosixBracketExpr

                if ( prevChar == '[' && currChar == ':' ) {
                    // Enter POSIX bracket expression [:___:]
                    inPosixBracketExpr = true;
                    continue;
                }
                if ( currChar == ']' ) {
                    // Leaving character class [___]
                    inCharacterClass = false;
                    continue;
                }
            } // inCharacterClass
            else { // ! inCharacterClass
                switch (currChar) {
                    case '[' :
                        // Entering a character class [___]
                        startedCharClass = true;
                        inCharacterClass = true;
                        continue;
#ifdef REGEXP_IS_PCRE2
                    case '*' :
                        if ( prevChar != '(' ) continue;
                        // Entering a PCRE2 directive (*___)
                        inDirective = true;
                        continue;
                    case '?' :
                        if ( prevChar != '(' ) continue;
                        if ( nextChar != '<' ) continue;
                        // Entering PCRE2 group name (?<___>)
                        // NOTE(review): "(?<=" and "(?<!" also reach this
                        // point -- confirm how lookbehinds should be handled.
                        inGroupName = true;
                        continue;
#endif // REGEXP_IS_PCRE2
                }
#ifdef REGEXP_IS_PCRE2
                if ( inDirective ) {
                    // Is it time to leave PCRE2 directive (*___) ?
                    if ( currChar == ')' ) inDirective = false;
                    continue;
                }
                if ( inGroupName ) {
                    // Is it time to leave PCRE2 group name (?<___>) ?
                    if ( currChar == '>' ) inGroupName = false;
                    continue;
                }
#endif // REGEXP_IS_PCRE2
            } // ! inCharacterClass

            /*
                If we have reached here, this regex character is a
                candidate for potential conversion to an equivalent.
            */
        } // isRegex

        //-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '";

#ifdef OPTIMIZE_ASCII_LOOKUP
        // We can process ASCII quickly without using lookup table
        unsigned short codepoint = currChar.unicode();
        if ( codepoint < 128 ) {
            if ( codepoint > 64 && codepoint < 91 )   // convert upper case ASCII
                currChar = TQChar( codepoint + 32 );  // to corresponding lower case
            // All other ASCII characters are equivalent to themselves
            //-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl;
            continue;
        }
#endif

        // Only letters and numbers are in the table
        if ( ! currChar.isLetterOrNumber() )
            continue;

        // Use a simple binary search to look up an equivalent character.
        // A character with no row in the table is left unchanged.
        int low = 0;
        int high = EquivTableROWS - 1;
        while (low <= high) {
            int mid = low + (high - low) / 2;
            if ( currChar == EquivalentsTable[mid].character ) {
                // Found equivalent character, use it instead
                currChar = EquivalentsTable[mid].collatesTo;
                break;
            }
            if ( EquivalentsTable[mid].character < currChar )
                low = mid + 1;
            else
                high = mid - 1;
        }
        //-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl;
    }

    return outString;
}