// tdelibs/tdecore/tequivchars.cpp

// Build-time switches for this file.
//
// REGEXP_IS_PCRE2       - when defined, replaceChars() additionally skips the
//                         PCRE2-only constructs guarded by this macro below.
//                         It is explicitly undefined here; change the #undef
//                         to a #define to enable that code.
// OPTIMIZE_ASCII_LOOKUP - when defined, replaceChars() handles code points
//                         below 128 directly instead of searching the table.
//
// The name must match the #ifdef tests below exactly (REGEXP_, not REGEX_),
// otherwise the #undef has no effect on them.
#undef REGEXP_IS_PCRE2
#define OPTIMIZE_ASCII_LOOKUP

#ifdef REGEXP_IS_PCRE2
#pragma message "############ Assuming regular expressions are PCRE2 ############"
#endif

#ifdef OPTIMIZE_ASCII_LOOKUP
#pragma message "############ ASCII characters will be processed separately ############"
#endif

#include "tequivchars.h"

// One row of the equivalence table used by TEquivChars::replaceChars():
// a character and the character that replaces it in the output.
// The member order matters because the table rows are brace-initialised.
struct defaultCollation {
  TQChar character;   // Lookup key compared against the input character
  TQChar collatesTo;  // Replacement used when `character` matches
};

// Equivalence table; its rows are supplied by the included mapping header.
// NOTE(review): replaceChars() binary-searches this table on `character`,
// so the rows are presumably sorted in ascending order of `character` --
// confirm against tequivchars-mapping.h.
static const defaultCollation EquivalentsTable[] = {
#include "tequivchars-mapping.h"
};
// Number of rows in EquivalentsTable (array size divided by the size of one row).
uint EquivTableROWS = sizeof(EquivalentsTable)/sizeof(EquivalentsTable[0]);


/*
   Returns a copy of inputString in which each character has been replaced
   by its equivalent, where one exists. The result always has the same
   length as the input: every input position produces exactly one output
   character.

   Conversion of a single character:
     - With OPTIMIZE_ASCII_LOOKUP, code points below 128 are handled
       directly: 'A'..'Z' become 'a'..'z', everything else is unchanged.
     - Otherwise, characters that are letters or numbers are looked up in
       EquivalentsTable with a binary search; characters that are not
       found, and all other characters, are left unchanged.

   inputString  the text (or regular expression) to convert
   isRegex      when true, characters that form regular expression syntax
                (backslash escapes, character class delimiters, POSIX
                bracket expressions and, with REGEXP_IS_PCRE2, several
                PCRE2-only constructs) are copied through unconverted.
*/
const TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex )
{
  int inStrLen = inputString.length();
  TQString outString = TQString::fromLatin1( "" );
  outString.reserve( inStrLen );
  const TQChar *char16 = inputString.unicode();

  bool backSlashed        = false; // \_
  bool startedCharClass   = false; // Previous character was starting '[' of character class
  bool inCharacterClass   = false; // [___]
  bool inPosixBracketExpr = false; // [:___:]
#ifdef REGEXP_IS_PCRE2
  bool quoteLiteral       = false; // \Q___\E
  bool inBraceExpr        = false; // \c{___} where 'c' is one of: 'N' 'P' 'p' 'g'
  bool inDirective        = false; // (*___)
  bool inGroupName        = false; // (?<___>
#endif // REGEXP_IS_PCRE2
  TQChar currChar  = 0;
  TQChar prevChar  = 0;
  TQChar nextChar  = 0;

  /*
     The write-back to outString lives in the loop's increment clause on
     purpose: it also runs after every 'continue', so a 'continue' below
     means "emit currChar as it is now" rather than "drop this character".
  */
  for ( int i = 0 ; i < inStrLen ; outString[i] = currChar, i++  ) {

    prevChar = currChar; // Note: this is the previous *output* character
    currChar = char16[i].unicode();

    if ( isRegex ) {

      /*
         Look for regex characters and character sequences
         that should never be converted to an equivalent.
      */

      if ( i < ( inStrLen - 1 ) )
        nextChar = char16[i+1].unicode();
      else
        nextChar = 0;

      // A backslash starts an escape only when it is not itself escaped:
      // in "\\x" the second backslash is the escaped character, and 'x'
      // is an ordinary character that remains a candidate for conversion.
      bool startsEscape = ( currChar == '\\' ) && !backSlashed;
#ifdef REGEXP_IS_PCRE2
      // Inside \Q___\E backslashes are literal, so any of them may be
      // the one that begins the closing "\E".
      if ( quoteLiteral && currChar == '\\' ) startsEscape = true;
#endif // REGEXP_IS_PCRE2
      if ( startsEscape ) {
        backSlashed = true;
        continue;
      }

      // Don't convert backSlashed characters
      if ( backSlashed ) {
#ifdef REGEXP_IS_PCRE2
        switch (currChar) {
          case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
          case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
          case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
          case 'P' : // Entering (negated) Unicode property specification \p{} ?
          case 'p' : // Entering Unicode property specification \p{} ?
          case 'g' : // Entering a named backreference \g{___} ?
            if ( nextChar == '{' ) inBraceExpr = true;
            break;
        }
#endif // REGEXP_IS_PCRE2
        backSlashed = false;
        continue;
      }

#ifdef REGEXP_IS_PCRE2
      if ( quoteLiteral )
        continue;

      if ( inBraceExpr ) {
        // Is it time to leave brace expression {___} ?
        // The closing '}' is the next character; it needs no protection
        // of its own, so the state can be cleared one character early.
        if ( nextChar == '}' ) inBraceExpr = false;
        continue;
      }
#endif // REGEXP_IS_PCRE2

      if ( startedCharClass ) {
        switch (currChar) {
          case '^' : // Negated character class, proceed to next character
            // startedCharClass stays set so that a ']' or ':' right
            // after "[^" is still treated as a class member.
            continue; // Bypass converting this special character
          case ']' : // Treat as part of character class, not as a closure
          case ':' : // Treat as part of character class, not as start of bracket expression
            startedCharClass = false;
            continue;  // Bypass converting these special characters
        }
        startedCharClass = false;
      } // startedCharClass

      if ( inCharacterClass ) {

        if ( inPosixBracketExpr ) {
          // Is it time to leave POSIX bracket expression [:___:] ?
          if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false;
          continue;
        } // inPosixBracketExpr

        else { // ! inPosixBracketExpr

          if ( prevChar == '[' && currChar == ':' ) {
            // Enter POSIX bracket expression [:___:]
            inPosixBracketExpr = true;
            continue;
          }

          if ( currChar == ']' ) {
            // Leaving character class [___]
            inCharacterClass = false;
            continue;
          }

          // Any other member of the character class falls through
          // and may be converted like an ordinary character.

        } // ! inPosixBracketExpr

      } // inCharacterClass

      else { // ! inCharacterClass

        switch (currChar) {

          case '[' :
            // Entering a character class [___]
            startedCharClass = true;
            inCharacterClass = true;
            continue;
#ifdef REGEXP_IS_PCRE2
          case '*' :
            if ( prevChar != '(' ) continue;
            // Entering a PCRE2 directive (*___)
            inDirective = true;
            continue;

          case '?' :
            if ( prevChar != '(' ) continue;
            if ( nextChar != '<' ) continue;
            // Entering PCRE2 group name (?<___>)
            inGroupName = true;
            continue;
#endif // REGEXP_IS_PCRE2
        }
#ifdef REGEXP_IS_PCRE2
        if ( inDirective ) {
          // Is it time to leave PCRE2 directive (*___) ?
          if (currChar == ')' ) inDirective = false;
          continue;
        }

        if ( inGroupName ) {
          // Is it time to leave PCRE2 group name (?<___>) ?
          if (currChar == '>' ) inGroupName = false;
          continue;
        }
#endif // REGEXP_IS_PCRE2
      } // ! inCharacterClass

      /*
         If we have reached here, this regex character is a
         candidate for potential conversion to an equivalent.
      */

    } // isRegex

    //-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '";

#ifdef OPTIMIZE_ASCII_LOOKUP
    // We can process ASCII quickly without using lookup table
    unsigned short codepoint = currChar.unicode();
    if ( codepoint < 128 ) {
      if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII ('A'..'Z')
        currChar = TQChar(codepoint + 32 ); // to corresponding lower case

      // All other ASCII characters are equivalent to themselves
      //-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl;
      continue;
    }
#endif

    // Only letters and numbers are in the table
    if ( ! currChar.isLetterOrNumber() )
      continue;
    // Use a simple binary search to look up an equivalent character.
    // If no row matches, currChar is left unchanged.
    int low  =  0;
    int high =  EquivTableROWS - 1;
    while (low <= high) {
      int mid = low + (high - low) / 2; // written this way to avoid overflow of low + high
      if ( currChar == EquivalentsTable[mid].character ) {
        // Found equivalent character, use it instead
        currChar = EquivalentsTable[mid].collatesTo;
        break;
      }
      if ( EquivalentsTable[mid].character < currChar )
        low = mid + 1;
      else
        high = mid - 1;
    }
    //-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl;

  }

  return outString;
}