// tdelibs/tdecore/tequivchars.cpp

// Build-time switches for this translation unit.
//
// REGEXP_IS_PCRE2: when defined, replaceChars() also recognizes PCRE2-only
// regex constructs. It is explicitly switched off here. The macro name must
// match the one tested by the #ifdef blocks in this file, otherwise the
// #undef has no effect on them.
#undef REGEXP_IS_PCRE2
// OPTIMIZE_ASCII_LOOKUP: when defined, replaceChars() handles code points
// below 128 directly instead of searching the equivalence table.
#define OPTIMIZE_ASCII_LOOKUP

#ifdef REGEXP_IS_PCRE2
#pragma message "############ Assuming regular expressions are PCRE2 ############"
#endif

#ifdef OPTIMIZE_ASCII_LOOKUP
#pragma message "############ ASCII characters will be processed separately ############"
#endif

#include "tequivchars.h"

// One row of the equivalence table used by TEquivChars::replaceChars().
// Member order matters: the rows are brace-initialized from the included
// mapping header.
struct defaultCollation {
  TQChar character;   // Character to look up (search key in replaceChars)
  TQChar collatesTo;  // Character written to the output in its place
};

// Equivalence table; its rows come from tequivchars-mapping.h.
// replaceChars() binary-searches it on 'character', so the rows are
// presumably sorted ascending by that member -- verify against the header.
static const defaultCollation EquivalentsTable[] =
{
#include "tequivchars-mapping.h"
};

// Number of rows in EquivalentsTable.
uint EquivTableROWS = sizeof EquivalentsTable / sizeof *EquivalentsTable;


/*
   Returns a copy of inputString in which each character is replaced by its
   equivalent character, if it has one.

   Conversion rules for a single character:
     - With OPTIMIZE_ASCII_LOOKUP defined, code points below 128 never consult
       the table: 'A'..'Z' become 'a'..'z', every other ASCII character is
       kept as it is.
     - Any other character is converted only if isLetterOrNumber() is true
       and it is found in EquivalentsTable; otherwise it is copied unchanged.

   inputString  Text (or regular expression) to convert.
   isRegex      When true, characters that belong to regex syntax are copied
                unchanged: a backslash and the character following it, the
                '[' opening a character class, a '^', ']' or ':' directly
                after that '[', everything inside a POSIX bracket expression
                [:___:], and the ']' closing a character class. With
                REGEXP_IS_PCRE2 defined this also covers \Q___\E literals,
                brace expressions \c{___}, directives (*___) and group
                names (?<___>.

   Exactly one output character is written for every input character.
*/
const TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex )
{
  const int inStrLen = inputString.length();
  TQString outString = TQString::fromLatin1( "" );
  outString.reserve( inStrLen );
  const TQChar *char16 = inputString.unicode();

  bool backSlashed        = false; // \_
  bool startedCharClass   = false; // Previous character was starting '[' of character class
  bool inCharacterClass   = false; // [___]
  bool inPosixBracketExpr = false; // [:___:]
#ifdef REGEXP_IS_PCRE2
  bool quoteLiteral       = false; // \Q___\E
  bool inBraceExpr        = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'
  bool inDirective        = false; // (*___)
  bool inGroupName        = false; // (?<___>
#endif // REGEXP_IS_PCRE2
  TQChar currChar  = 0;
  TQChar prevChar  = 0;
  TQChar nextChar  = 0;

  /*
     The store into outString lives in the loop's increment expression on
     purpose: every 'continue' below means "keep currChar as it is now",
     and the increment expression still runs after a 'continue'.
  */
  for ( int i = 0 ; i < inStrLen ; outString[i] = currChar, i++  ) {

    // Note: prevChar is the previous character as it was written to the
    // output (i.e. after any conversion), not the raw input character.
    prevChar = currChar;
    currChar = char16[i].unicode();

    if ( isRegex ) {

      /*
         Look for regex characters and character sequences
         that should never be converted to an equivalent.
      */

      // One character of look-ahead; 0 when currChar is the last character
      if ( i < ( inStrLen - 1 ) )
        nextChar = char16[i+1].unicode();
      else
        nextChar = 0;

      if ( currChar == '\\' ) {
        backSlashed = true;
        continue;
      }

      // Don't convert backSlashed characters
      if ( backSlashed ) {
#ifdef REGEXP_IS_PCRE2
        switch (currChar) {
          case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
          case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
          case 'x' : // Entering hexadecimal character code \x{___} ?
          case 'o' : // Entering octal character code \o{___} ?
          case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
          case 'P' : // Entering (negated) Unicode property specification \p{} ?
          case 'p' : // Entering Unicode property specification \p{} ?
          case 'g' : // Entering a named backreference \g{___} ?
            if ( nextChar == '{' ) inBraceExpr = true;
            break;
        }
#endif // REGEXP_IS_PCRE2
        backSlashed = false;
        continue;
      }

#ifdef REGEXP_IS_PCRE2
      if ( quoteLiteral )
        continue;

      if ( inBraceExpr ) {
        // Is it time to leave brace expression {___} ?
        // The closing '}' itself is handled as an ordinary character
        // on the next iteration.
        if ( nextChar == '}' ) inBraceExpr = false;
        continue;
      }
#endif // REGEXP_IS_PCRE2

      if ( startedCharClass ) {
        switch (currChar) {
          case '^' : // Negated character class, proceed to next character
            // startedCharClass stays set, so the character after '^'
            // is still treated as the first one of the class
            continue; // Bypass converting this special character
          case ']' : // Treat as part of character class, not as a closure
          case ':' : // Treat as part of character class, not as start of bracket expression
            startedCharClass = false;
            continue;  // Bypass converting these special characters
        }
        startedCharClass = false;
      } // startedCharClass

      if ( inCharacterClass ) {

        if ( inPosixBracketExpr ) {
          // Is it time to leave POSIX bracket expression [:___:] ?
          if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false;
          continue;
        } // inPosixBracketExpr

        else { // ! inPosixBracketExpr

          if ( prevChar == '[' && currChar == ':' ) {
            // Enter POSIX bracket expression [:___:]
            inPosixBracketExpr = true;
            continue;
          }

          // NOTE(review): the ']' that terminates a POSIX bracket expression
          // also arrives here and ends the character class one ']' early.
          // That ']' is copied unchanged either way.
          if ( currChar == ']' ) {
            // Leaving character class [___]
            inCharacterClass = false;
            continue;
          }

        } // ! inPosixBracketExpr

      } // inCharacterClass

      else { // ! inCharacterClass

        switch (currChar) {

          case '[' :
            // Entering a character class [___]
            startedCharClass = true;
            inCharacterClass = true;
            continue;
#ifdef REGEXP_IS_PCRE2
          case '*' :
            if ( prevChar != '(' ) continue;
            // Entering a PCRE2 directive (*___)
            inDirective = true;
            continue;

          case '?' :
            if ( prevChar != '(' ) continue;
            if ( nextChar != '<' ) continue;
            // Entering PCRE2 group name (?<___>)
            inGroupName = true;
            continue;
#endif // REGEXP_IS_PCRE2
        }
#ifdef REGEXP_IS_PCRE2
        if ( inDirective ) {
          // Is it time to leave PCRE2 directive (*___) ?
          if (currChar == ')' ) inDirective = false;
          continue;
        }

        if ( inGroupName ) {
          // Is it time to leave PCRE2 group name (?<___>) ?
          if (currChar == '>' ) inGroupName = false;
          continue;
        }
#endif // REGEXP_IS_PCRE2
      } // ! inCharacterClass

      /*
         If we have reached here, this regex character is a
         candidate for potential conversion to an equivalent.
      */

    } // isRegex

    //-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '";

#ifdef OPTIMIZE_ASCII_LOOKUP
    // We can process ASCII quickly without using lookup table
    unsigned short codepoint = currChar.unicode();
    if ( codepoint < 128 ) {
      if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII
        currChar = TQChar(codepoint + 32 ); // to corresponding lower case

      // All other ASCII characters are equivalent to themselves
      //-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl;
      continue;
    }
#endif

    // Only letters and numbers are in the table
    if ( ! currChar.isLetterOrNumber() )
      continue;

    // Use a simple binary search to look up an equivalent character.
    // Convert the row count to int before subtracting so that an empty
    // table yields high == -1 instead of wrapping around as unsigned.
    int low  =  0;
    int high =  static_cast<int>( EquivTableROWS ) - 1;
    while (low <= high) {
      int mid = low + (high - low) / 2;
      if ( currChar == EquivalentsTable[mid].character ) {
        // Found equivalent character, use it instead
        currChar = EquivalentsTable[mid].collatesTo;
        break;
      }
      if ( EquivalentsTable[mid].character < currChar )
        low = mid + 1;
      else
        high = mid - 1;
    }
    // Not found: currChar is left unchanged
    //-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl;

  }

  return outString;
}
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`#undef REGEX_IS_PCRE2`
			`#define OPTIMIZE_ASCII_LOOKUP`

			`#ifdef REGEXP_IS_PCRE2`
			`#pragma message "############ Assuming regular expressions are PCRE2 ############"`
			`#endif`

			`#ifdef OPTIMIZE_ASCII_LOOKUP`
			`#pragma message "############ ASCII characters will be processed separately ############"`
			`#endif`

			`#include "tequivchars.h"`

Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`struct defaultCollation {`
			`TQChar character;`
			`TQChar collatesTo;`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`};`

Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`static const defaultCollation EquivalentsTable[] = {`
			`#include "tequivchars-mapping.h"`
			`};`
			`uint EquivTableROWS = sizeof(EquivalentsTable)/sizeof(EquivalentsTable[0]);`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago

Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`const TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex )`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`{`
			`int inStrLen = inputString.length();`
			`TQString outString = TQString::fromLatin1( "" );`
			`outString.reserve( inStrLen );`
			`const TQChar *char16 = inputString.unicode();`

			`bool backSlashed = false; // \_`
			`bool startedCharClass = false; // Previous character was starting '[' of character class`
			`bool inCharacterClass = false; // [___]`
			`bool inPosixBracketExpr = false; // [:___:]`
			`#ifdef REGEXP_IS_PCRE2`
			`bool quoteLiteral = false; // \Q___\E`
			`bool inBraceExpr = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'`
			`bool inDirective = false; // (*___)`
			`bool inGroupName = false; // (?<___>`
			`#endif // REGEXP_IS_PCRE2`
Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`TQChar currChar = 0;`
			`TQChar prevChar = 0;`
			`TQChar nextChar = 0;`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago
Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`for ( int i = 0 ; i < inStrLen ; outString[i] = currChar, i++ ) {`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago
			`prevChar = currChar;`
			`currChar = char16[i].unicode();`

			`if ( isRegex ) {`

			`/*`
			`Look for regex characters and character sequences`
			`that should never be converted to an equivalent.`
			`*/`

			`if ( i < ( inStrLen - 1 ) )`
			`nextChar = char16[i+1].unicode();`
			`else`
			`nextChar = 0;`

			`if ( currChar == '\\' ) {`
			`backSlashed = true;`
			`continue;`
			`}`

			`// Don't convert backSlashed characters`
			`if ( backSlashed ) {`
			`#ifdef REGEXP_IS_PCRE2`
			`switch (currChar) {`
			`case 'Q' : quoteLiteral = true; break; // Entering literal \Q___\E`
			`case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E`
			`case 'N' : // Entering Unicode codepoint specification \N{U+___} ?`
			`case 'P' : // Entering (negated) Unicode property specification \p{} ?`
			`case 'p' : // Entering Unicode property specification \p{} ?`
			`case 'g' : // Entering a named backreference \g{___} ?`
			`if ( nextChar == '{' ) inBraceExpr = true;`
			`break;`
			`}`
			`#endif // REGEXP_IS_PCRE2`
			`backSlashed = false;`
			`continue;`
			`}`

			`#ifdef REGEXP_IS_PCRE2`
			`if ( quoteLiteral )`
			`continue;`

			`if ( inBraceExpr ) {`
			`// Is it time to leave brace expression {___} ?`
			`if ( nextChar == '}' ) inBraceExpr = true;`
			`continue;`
			`}`
			`#endif // REGEXP_IS_PCRE2`

			`if ( startedCharClass ) {`
			`switch (currChar) {`
			`case '^' : // Negated character class, proceed to next character`
			`continue; // Bypass converting this special character`
			`case ']' : // Treat as part of character class, not as a closure`
			`case ':' : // Treat as part of character class, not as start of bracket expression`
			`startedCharClass = false;`
			`continue; // Bypass converting these special characters`
			`}`
			`startedCharClass = false;`
			`} // startedCharClass`

			`if ( inCharacterClass ) {`

			`if ( inPosixBracketExpr ) {`
			`// Is it time to leave POSIX bracket expression [:___:] ?`
			`if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false;`
			`continue;`
			`} // inPosixBracketExpr`

			`else { // ! inPosixBracketExpr`

			`if ( prevChar == '[' && currChar == ':' ) {`
			`// Enter POSIX bracket expression [:___:]`
			`inPosixBracketExpr = true;`
			`continue;`
			`}`

			`if ( currChar == ']' ) {`
			`// Leaving character class [___]`
			`inCharacterClass = false;`
			`continue;`
			`}`

			`} // ! inPosixBracketExpr`

			`} // inCharacterClass`

			`else { // ! inCharacterClass`

			`switch (currChar) {`

			`case '[' :`
			`// Entering a character class [___]`
			`startedCharClass = true;`
			`inCharacterClass = true;`
			`continue;`
			`break;`
			`#ifdef REGEXP_IS_PCRE2`
			`case '*' :`
			`if ( prevChar != '(' ) continue;`
			`// Entering a PCRE2 directive (*___)`
			`inDirective = true;`
			`continue;`
			`break;`

			`case '?' :`
			`if ( prevChar != '(' ) continue;`
			`if ( nextChar != '<' ) continue;`
			`// Entering PCRE2 group name (?<___>)`
			`inGroupName = true;`
			`continue;`
			`break;`
			`#endif // REGEXP_IS_PCRE2`
			`}`
			`#ifdef REGEXP_IS_PCRE2`
			`if ( inDirective ) {`
			`// Is it time to leave PCRE2 directive (*___) ?`
			`if (currChar == ')' ) inDirective = false;`
			`continue;`
			`}`

			`if ( inGroupName ) {`
			`// Is it time to leave PCRE2 group name (?<___>) ?`
			`if (currChar == '>' ) inGroupName = false;`
			`continue;`
			`}`
			`#endif // REGEXP_IS_PCRE2`
			`} // ! inCharacterClass`

			`/*`
			`If we have reached here, this regex character is a`
			`candidate for potential conversion to an equivalent.`
			`*/`

			`} // isRegex`

			`//-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '";`

			`#ifdef OPTIMIZE_ASCII_LOOKUP`
			`// We can process ASCII quickly without using lookup table`
			`unsigned short codepoint = currChar.unicode();`
			`if ( codepoint < 128 ) {`
			`if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII`
			`currChar = TQChar(codepoint + 32 ); // to corresponding lower case`
Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`// All other ASCII characters are equivalent to themselves`
			`//-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl;`
			`continue;`
			`}`
			`#endif`

Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`// Only letters and numbers are in the table`
			`if ( ! currChar.isLetterOrNumber() )`
			`continue;`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`// Use a simple binary search to look up an equivalent character`
			`int low = 0;`
Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`int high = EquivTableROWS - 1;`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`while (low <= high) {`
			`int mid = low + (high - low) / 2;`
Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`if ( currChar == EquivalentsTable[mid].character ) {`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`// Found equivalent character, use it instead`
Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`currChar = EquivalentsTable[mid].collatesTo;`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`break;`
			`}`
Modifications made in response to June 2023 feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`if ( EquivalentsTable[mid].character < currChar )`
This new branch is a follow-up to the latest and final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt tdecore/README.tdestringmatcher tdecore/tdeglobal.cpp tdecore/tdeglobal.h tdecore/tdestringmatcher.cpp tdecore/tdestringmatcher.h tdeio/tdeio/tdefileitem.cpp tdeio/tdeio/tdefileitem.h It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h tdecore/tequivchars.cpp tdecore/tequivchars.h Signed-off-by: Vincent Reher <tde@4reher.org> 1 year ago			`low = mid + 1;`
			`else`
			`high = mid - 1;`
			`}`
			`//-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl;`

			`}`

			`return outString;`
			`}`