#undef REGEX_IS_PCRE2 #define OPTIMIZE_ASCII_LOOKUP #ifdef REGEXP_IS_PCRE2 #pragma message "############ Assuming regular expressions are PCRE2 ############" #endif #ifdef OPTIMIZE_ASCII_LOOKUP #pragma message "############ ASCII characters will be processed separately ############" #endif #include "tequivchars.h" struct defaultCollation { TQChar character; TQChar collatesTo; }; static const defaultCollation EquivalentsTable[] = { #include "tequivchars-mapping.h" }; uint EquivTableROWS = sizeof(EquivalentsTable)/sizeof(EquivalentsTable[0]); const TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex ) { int inStrLen = inputString.length(); TQString outString = TQString::fromLatin1( "" ); outString.reserve( inStrLen ); const TQChar *char16 = inputString.unicode(); bool backSlashed = false; // \_ bool startedCharClass = false; // Previous character was starting '[' of character class bool inCharacterClass = false; // [___] bool inPosixBracketExpr = false; // [:___:] #ifdef REGEXP_IS_PCRE2 bool quoteLiteral = false; // \Q___\E bool inBraceExpr = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g' bool inDirective = false; // (*___) bool inGroupName = false; // (?<___> #endif // REGEXP_IS_PCRE2 TQChar currChar = 0; TQChar prevChar = 0; TQChar nextChar = 0; for ( int i = 0 ; i < inStrLen ; outString[i] = currChar, i++ ) { prevChar = currChar; currChar = char16[i].unicode(); if ( isRegex ) { /* Look for regex characters and character sequences that should never be converted to an equivalent. */ if ( i < ( inStrLen - 1 ) ) nextChar = char16[i+1].unicode(); else nextChar = 0; if ( currChar == '\\' ) { backSlashed = true; continue; } // Don't convert backSlashed characters if ( backSlashed ) { #ifdef REGEXP_IS_PCRE2 switch (currChar) { case 'Q' : quoteLiteral = true; break; // Entering literal \Q___\E case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E case 'N' : // Entering Unicode codepoint specification \N{U+___} ? case 'P' : // Entering (negated) Unicode property specification \p{} ? case 'p' : // Entering Unicode property specification \p{} ? case 'g' : // Entering a named backreference \g{___} ? if ( nextChar == '{' ) inBraceExpr = true; break; } #endif // REGEXP_IS_PCRE2 backSlashed = false; continue; } #ifdef REGEXP_IS_PCRE2 if ( quoteLiteral ) continue; if ( inBraceExpr ) { // Is it time to leave brace expression {___} ? if ( nextChar == '}' ) inBraceExpr = true; continue; } #endif // REGEXP_IS_PCRE2 if ( startedCharClass ) { switch (currChar) { case '^' : // Negated character class, proceed to next character continue; // Bypass converting this special character case ']' : // Treat as part of character class, not as a closure case ':' : // Treat as part of character class, not as start of bracket expression startedCharClass = false; continue; // Bypass converting these special characters } startedCharClass = false; } // startedCharClass if ( inCharacterClass ) { if ( inPosixBracketExpr ) { // Is it time to leave POSIX bracket expression [:___:] ? if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false; continue; } // inPosixBracketExpr else { // ! inPosixBracketExpr if ( prevChar == '[' && currChar == ':' ) { // Enter POSIX bracket expression [:___:] inPosixBracketExpr = true; continue; } if ( currChar == ']' ) { // Leaving character class [___] inCharacterClass = false; continue; } } // ! inPosixBracketExpr } // inCharacterClass else { // ! inCharacterClass switch (currChar) { case '[' : // Entering a character class [___] startedCharClass = true; inCharacterClass = true; continue; break; #ifdef REGEXP_IS_PCRE2 case '*' : if ( prevChar != '(' ) continue; // Entering a PCRE2 directive (*___) inDirective = true; continue; break; case '?' : if ( prevChar != '(' ) continue; if ( nextChar != '<' ) continue; // Entering PCRE2 group name (?<___>) inGroupName = true; continue; break; #endif // REGEXP_IS_PCRE2 } #ifdef REGEXP_IS_PCRE2 if ( inDirective ) { // Is it time to leave PCRE2 directive (*___) ? if (currChar == ')' ) inDirective = false; continue; } if ( inGroupName ) { // Is it time to leave PCRE2 group name (?<___>) ? if (currChar == '>' ) inGroupName = false; continue; } #endif // REGEXP_IS_PCRE2 } // ! inCharacterClass /* If we have reached here, this regex character is a candidate for potential conversion to an equivalent. */ } // isRegex //-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '"; #ifdef OPTIMIZE_ASCII_LOOKUP // We can process ASCII quickly without using lookup table unsigned short codepoint = currChar.unicode(); if ( codepoint < 128 ) { if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII currChar = TQChar(codepoint + 32 ); // to corresponding lower case // All other ASCII characters are equivalent to themselves //-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl; continue; } #endif // Only letters and numbers are in the table if ( ! currChar.isLetterOrNumber() ) continue; // Use a simple binary search to look up an equivalent character int low = 0; int high = EquivTableROWS - 1; while (low <= high) { int mid = low + (high - low) / 2; if ( currChar == EquivalentsTable[mid].character ) { // Found equivalent character, use it instead currChar = EquivalentsTable[mid].collatesTo; break; } if ( EquivalentsTable[mid].character < currChar ) low = mid + 1; else high = mid - 1; } //-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl; } return outString; }