tellico/src/translators/btparse/string_util.c

/* ------------------------------------------------------------------------
@NAME       : string_util.c
@DESCRIPTION: Various string-processing utility functions:
                bt_purify_string()
                bt_change_case()

              and their helpers:
                foreign_letter()
                purify_special_char()
@GLOBALS    :
@CALLS      :
@CALLERS    :
@CREATED    : 1997/10/19, Greg Ward
@MODIFIED   : 1997/11/25, GPW: renamed to from purify.c to string_util.c
                               added bt_change_case() and friends
@VERSION    : $Id: string_util.c,v 1.10 1999/10/28 22:50:28 greg Rel $
-------------------------------------------------------------------------- */

#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <assert.h>
#include "error.h"
#include "btparse.h"
#include "bt_debug.h"


/*
 * These definitions should be fixed to be consistent with HTML
 * entities, just for fun.  And perhaps I should add entries for
 * accented letters (at least those supported by TeX and HTML).
 */
typedef enum
{
   L_OTHER,                             /* not a "foreign" letter */
   L_OSLASH_L,                          /* Eastern European {\o} */
   L_OSLASH_U,
   L_LSLASH_L,                          /* {\l} */
   L_LSLASH_U,
   L_OELIG_L,                           /* Latin {\oe} ligature */
   L_OELIG_U,
   L_AELIG_L,                           /* {\ae} ligature */
   L_AELIG_U,
   L_SSHARP_L,                          /* German "sharp s" {\ss} */
   L_SSHARP_U,
   L_ACIRCLE_L,                         /* Nordic {\aa} */
   L_ACIRCLE_U,
   L_INODOT_L,                          /* undotted i: {\i} */
   L_JNODOT_L                           /* {\j} */
} bt_letter;


static const char * uc_version[] =
{
   NULL,                                /* L_OTHER */
   "\\O",                               /* L_OSLASH_L */
   "\\O",                               /* L_OSLASH_U */
   "\\L",                               /* L_LSLASH_L */
   "\\L",                               /* L_LSLASH_U */
   "\\OE",                              /* L_OELIG_L */
   "\\OE",                              /* L_OELIG_U */
   "\\AE",                              /* L_AELIG_L */
   "\\AE",                              /* L_AELIG_U */
   "SS",                                /* L_SSHARP_L -- for LaTeX 2.09 */
   "\\SS",                              /* L_SSHARP_U */
   "\\AA",                              /* L_ACIRCLE_L */
   "\\AA",                              /* L_ACIRCLE_U */
   "I",                                 /* L_INODOT_L */
   "J"                                  /* L_JNODOT_L */
};

static const char * lc_version[] =
{
   NULL,                                /* L_OTHER */
   "\\o",                               /* L_OSLASH_L */
   "\\o",                               /* L_OSLASH_U */
   "\\l",                               /* L_LSLASH_L */
   "\\l",                               /* L_LSLASH_U */
   "\\oe",                              /* L_OELIG_L */
   "\\oe",                              /* L_OELIG_U */
   "\\ae",                              /* L_AELIG_L */
   "\\ae",                              /* L_AELIG_U */
   "\\ss",                              /* L_SSHARP_L */
   "\\ss",                              /* L_SSHARP_U */
   "\\aa",                              /* L_ACIRCLE_L */
   "\\aa",                              /* L_ACIRCLE_U */
   "\\i",                               /* L_INODOT_L */
   "\\j"                                /* L_JNODOT_L */
};


/* ------------------------------------------------------------------------
@NAME       : foreign_letter()
@INPUT      : str
              start
              stop
@OUTPUT     : letter
@RETURNS    : TRUE if the string delimited by start and stop is a foreign
              letter control sequence
@DESCRIPTION: Determines if a character sequence is one of (La)TeX's
              "foreign letter" control sequences (l, o, ae, oe, aa, ss, plus
              uppercase versions).  If `letter' is non-NULL, returns which
              letter was found in it (as a bt_letter value).
@CALLS      :
@CALLERS    : purify_special_char()
@CREATED    : 1997/10/19, GPW
@MODIFIED   :
-------------------------------------------------------------------------- */
static boolean
foreign_letter (char *str, int start, int stop, bt_letter * letter)
{
   char      c1, c2;
   bt_letter dummy;


   /*
    * This is written for speed, not flexibility -- adding new foreign
    * letters would be trying and vexatious.
    *
    * N.B. my gold standard list of foreign letters is Kopka and Daly's
    * *A Guide to LaTeX 2e*, section 2.5.6.
    */

   if (letter == NULL)                  /* so we can assign to *letter */
      letter = &dummy;                  /* without compunctions */
   *letter = L_OTHER;                   /* assume not a "foreign" letter */

   c1 = str[start+0];                   /* only two characters that we're */
   c2 = str[start+1];                   /* interested in */

   switch (stop - start)
   {
      case 1:                           /* one-character control sequences */
         switch (c1)                    /* (\o and \l) */
         {
            case 'o':
               *letter = L_OSLASH_L; return TRUE;
            case 'O':
               *letter = L_OSLASH_U; return TRUE;
            case 'l':
               *letter = L_LSLASH_L; return TRUE;
            case 'L':
               *letter = L_LSLASH_L; return TRUE;
            case 'i':
               *letter = L_INODOT_L; return TRUE;
            case 'j':
               *letter = L_JNODOT_L; return TRUE;
            default:
               return FALSE;
         }
         break;
      case 2:                           /* two character control sequences */
         switch (c1)                    /* (\oe, \ae, \aa, and \ss) */
         {
            case 'o':
               if (c2 == 'e') { *letter = L_OELIG_L; return TRUE; }
            case 'O':
               if (c2 == 'E') { *letter = L_OELIG_U; return TRUE; }

            /* BibTeX 0.99 does not handle \aa and \AA -- but I do!*/
            case 'a':
               if (c2 == 'e')
                  { *letter = L_AELIG_L; return TRUE; }
               else if (c2 == 'a')
                  { *letter = L_ACIRCLE_L; return TRUE; }
               else
                  return FALSE;
            case 'A':
               if (c2 == 'E')
                  { *letter = L_AELIG_U; return TRUE; }
               else if (c2 == 'A')
                  { *letter = L_ACIRCLE_U; return TRUE; }
               else
                  return FALSE;

            /* uppercase sharp-s -- new with LaTeX 2e (so far all I do
             * is recognize it as a "foreign" letter)
             */
            case 's':
               if (c2 == 's')
                  { *letter = L_SSHARP_L; return TRUE; }
               else
                  return FALSE;
            case 'S':
               if (c2 == 'S')
                  { *letter = L_SSHARP_U; return TRUE; }
               else
                  return FALSE;
         }
         break;
      default:
         return FALSE;
   } /* switch on length of control sequence */

   internal_error ("foreign_letter(): should never reach end of function");
   return FALSE;                        /* to keep gcc -Wall happy */

} /* foreign_letter */


/* ------------------------------------------------------------------------
@NAME       : purify_special_char()
@INPUT      : *src, *dst - pointers into the input and output strings
@OUTPUT     : *src       - updated to point to the closing brace of the
                           special char
              *dst       - updated to point to the next available spot
                           for copying text to
@RETURNS    :
@DESCRIPTION: "Purifies" a BibTeX special character.  On input, *src should
              point to the opening brace of a special character (ie. the
              brace must be at depth 0 of the whole string, and the
              character immediately following it must be a backslash).
              *dst should point to the next spot to copy into the output
              (purified) string.  purify_special_char() will skip over the
              opening brace and backslash; if the control sequence is one
              of LaTeX's foreign letter sequences (as determined by
              foreign_letter()), then it is simply copied to *dst.
              Otherwise the control sequence is skipped.  In either case,
              text after the control sequence is either copied (alphabetic
              characters) or skipped (anything else, including hyphens,
              ties, and digits).
@CALLS      : foreign_letter()
@CALLERS    : bt_purify_string()
@CREATED    : 1997/10/19, GPW
@MODIFIED   :
-------------------------------------------------------------------------- */
static void
purify_special_char (char *str, int * src, int * dst)
{
   int    depth;
   int    peek;

   assert (str[*src] == '{' && str[*src + 1] == '\\');
   depth = 1;

   *src += 2;                           /* jump to start of control sequence */
   peek = *src;                         /* scan to end of control sequence */
   while (isalpha (str[peek]))
      peek++;
   if (peek == *src)                    /* in case of single-char, non-alpha */
      peek++;                           /* control sequence (eg. {\'e}) */

   if (foreign_letter (str, *src, peek, NULL))
   {
      assert (peek - *src == 1 || peek - *src == 2);
      str[(*dst)++] = str[(*src)++];    /* copy first char */
      if (*src < peek)                  /* copy second char, downcasing */
         str[(*dst)++] = tolower (str[(*src)++]);
   }
   else                                 /* not a foreign letter -- skip */
   {                                    /* the control sequence entirely */
      *src = peek;
   }

   while (str[*src])
   {
      switch (str[*src])
      {
         case '{':
            depth++;
            (*src)++;
            break;
         case '}':
            depth--;
            if (depth == 0) return;     /* done with special char */
            (*src)++;
            break;
         default:
            if (isalpha (str[*src]))    /* copy alphabetic chars */
               str[(*dst)++] = str[(*src)++];
            else                        /* skip everything else */
               (*src)++;
      }
   }

   /*
    * If we get here, we have unbalanced braces -- the '}' case should
    * always hit a depth == 0 point if braces are balanced.  No warning,
    * though, because a) BibTeX doesn't warn about purifying unbalanced
    * strings, and b) we (should have) already warned about it in the
    * lexer.
    */

} /* purify_special_char() */


/* ------------------------------------------------------------------------
@NAME       : bt_purify_string()
@INOUT      : instr
@INPUT      : options
@OUTPUT     :
@RETURNS    : instr   - same as input string, but modified in place
@DESCRIPTION: "Purifies" a BibTeX string.  This consists of copying
              alphanumeric characters, converting hyphens and ties to
              space, copying spaces, and skipping everything else.  (Well,
              almost -- special characters are handled specially, of
              course.  Basically, accented letters have the control
              sequence skipped, while foreign letters have the control
              sequence preserved in a reasonable manner.  See
              purify_special_char() for details.)
@CALLS      : purify_special_char()
@CALLERS    :
@CREATED    : 1997/10/19, GPW
@MODIFIED   :
-------------------------------------------------------------------------- */
void
bt_purify_string (char * string, ushort options)
{
   int    src,                          /* both indeces into string */
          dst;
   int    depth;                        /* brace depth in string */
   unsigned orig_len;

   /*
    * Since purification always copies or deletes chars, outstr will
    * be no longer than string -- so nothing fancy is required to put
    * an upper bound on its eventual size.
    */

   depth = 0;
   src = 0;
   dst = 0;
   orig_len = strlen (string);

   DBG_ACTION (1, printf ("bt_purify_string(): input = %p (%s)\n",
                          string, string));

   while (string[src] != (char) 0)
   {
      DBG_ACTION (2, printf ("  next: >%c<: ", string[src]));
      switch (string[src])
      {
         case '~':                      /* "separator" characters -- */
         case '-':                      /* replaced with space */
         case ' ':                      /* and copy an actual space */
            string[dst++] = ' ';
            src++;
            DBG_ACTION (2, printf ("replacing with space"));
            break;
         case '{':
            if (depth == 0 && string[src+1] == '\\')
            {
               DBG_ACTION (2, printf ("special char found"));
               purify_special_char (string, &src, &dst);
            }
            else
            {
               DBG_ACTION (2, printf ("ordinary open brace"));
               src++;
            }
            depth++;
            break;
         case '}':
            DBG_ACTION (2, printf ("close brace"));
            depth--;
            src++;
            break;
         default:
            if (isalnum (string[src]))         /* any alphanumeric char -- */
            {
               DBG_ACTION (2, printf ("alphanumeric -- copying"));
               string[dst++] = string[src++]; /* copy it */
            }
            else                        /* anything else -- skip it */
            {
               DBG_ACTION (2, printf ("non-separator, non-brace, non-alpha"));
               src++;
            }
      } /* switch string[src] */

      DBG_ACTION (2, printf ("\n"));

   } /* while string[src] */

   DBG_ACTION (1, printf ("bt_purify_string(): depth on exit: %d\n", depth));

   string[dst] = (char) 0;
   assert (strlen (string) <= orig_len);
} /* bt_purify_string() */


/* ======================================================================
 * Case-transformation stuff
 */


/* ------------------------------------------------------------------------
@NAME       : convert_special_char()
@INPUT      : transform
@INOUT      : string
              src
              dst
              start_sentence
              after_colon
@RETURNS    :
@DESCRIPTION: Does case conversion on a special character.
@GLOBALS    :
@CALLS      :
@CALLERS    :
@CREATED    : 1997/11/25, GPW
@MODIFIED   :
-------------------------------------------------------------------------- */
static void
convert_special_char (char transform,
                      char * string,
                      int * src,
                      int * dst,
                      boolean * start_sentence,
                      boolean * after_colon)
{
   int       depth;
   boolean   done_special;
   int       cs_end;
   int       cs_len;                    /* counting the backslash */
   bt_letter letter;
   const char *    repl;
   int       repl_len;

#ifndef ALLOW_WARNINGS
   repl = NULL;                         /* silence "might be used" */
                                        /* uninitialized" warning */
#endif

   /* First, copy just the opening brace */
   string[(*dst)++] = string[(*src)++];

   /*
    * Now loop over characters inside the braces -- stop when we reach
    * the matching close brace, or when the string ends.
    */
   depth = 1;                           /* because we're in a special char */
   done_special = FALSE;

   while (string[*src] != 0 && !done_special)
   {
      switch (string[*src])
      {
         case '\\':                     /* a control sequence */
         {
            cs_end = *src+1;            /* scan over chars of c.s. */
            while (isalpha (string[cs_end]))
               cs_end++;

            /*
             * OK, now *src points to the backslash (so src+*1 points to
             * first char. of control sequence), and cs_end points to
             * character immediately following end of control sequence.
             * Thus we analyze [*src+1..cs_end] to determine if the control
             * sequence is a foreign letter, and use (cs_end - (*src+1) + 1)
             * = (cs_end - *src) as the length of the control sequence.
             */

            cs_len = cs_end - *src;     /* length of cs, counting backslash */

            if (foreign_letter (string, *src+1, cs_end, &letter))
            {
               if (letter == L_OTHER)
                  internal_error ("impossible foreign letter");

               switch (transform)
               {
                  case 'u':
                     repl = uc_version[(int) letter];
                     break;
                  case 'l':
                     repl = lc_version[(int) letter];
                     break;
                  case 't':
                     if (*start_sentence || *after_colon)
                     {
                        repl = uc_version[(int) letter];
                        *start_sentence = *after_colon = FALSE;
                     }
                     else
                     {
                        repl = lc_version[(int) letter];
                     }
                     break;
                  default:
                     internal_error ("impossible case transform \"%c\"",
                                     transform);
               }

               repl_len = strlen (repl);
               if (repl_len > cs_len)
                  internal_error
                     ("replacement text longer than original cs");

               strncpy (string + *dst, repl, repl_len);
               *src = cs_end;
               *dst += repl_len;
            } /* control sequence is a foreign letter */
            else
            {
               /* not a foreign letter -- just copy the control seq. as is */


               strncpy (string + *dst, string + *src, cs_end - *src);
               *src += cs_len;
               assert (*src == cs_end);
               *dst += cs_len;
            } /* control sequence not a foreign letter */

            break;
         } /* case: '\\' */

         case '{':
         {
            string[(*dst)++] = string[(*src)++];
            depth++;
            break;
         }

         case '}':
         {
            string[(*dst)++] = string[(*src)++];
            depth--;
            if (depth == 0)
               done_special = TRUE;
            break;
         }

         default:                       /* any other character */
         {
            switch (transform)
            {
               /*
                * Inside special chars, lowercase and title caps are same.
                * (At least, that's bibtex's convention.  I might change this
                * at some point to be a bit smarter.)
                */
               case 'l':
               case 't':
                  string[(*dst)++] = tolower (string[(*src)++]);
                  break;
               case 'u':
                  string[(*dst)++] = toupper (string[(*src)++]);
                  break;
               default:
                  internal_error ("impossible case transform \"%c\"",
                                  transform);
            }
         } /* default char */

      } /* switch: current char */

   } /* while: string or special char not done */

} /* convert_special_char() */


/* ------------------------------------------------------------------------
@NAME       : bt_change_case()
@INPUT      :
@OUTPUT     :
@RETURNS    :
@DESCRIPTION: Converts a string (in-place) to either uppercase, lowercase,
              or "title capitalization">
@GLOBALS    :
@CALLS      :
@CALLERS    :
@CREATED    : 1997/11/25, GPW
@MODIFIED   :
-------------------------------------------------------------------------- */
void
bt_change_case (char   transform,
                char * string,
                ushort options)
{
   int    len;
   int    depth;
   int    src, dst;                     /* indeces into string */
   boolean start_sentence;
   boolean after_colon;

   src = dst = 0;
   len = strlen (string);
   depth = 0;

   start_sentence = TRUE;
   after_colon = FALSE;

   while (string[src] != 0)
   {
      switch (string[src])
      {
         case '{':

            /*
             * At start of special character?  The entire special char.
             * will be handled here, as follows:
             *   - text at any brace-depth within the s.c. is case-mangled;
             *     punctuation (sentence endings, colons) are ignored
             *   - control sequences are left alone, unless they are
             *     one of the "foreign letter" control sequences, in
             *     which case they're converted to the appropriate string
             *     according to the uc_version or lc_version tables.
             */
            if (depth == 0 && string[src+1] == '\\')
            {
               convert_special_char (transform, string, &src, &dst,
                                     &start_sentence, &after_colon);
            }

            /*
             * Otherwise, it's just something in braces.  This is probably
             * a proper noun or something encased in braces to protect it
             * from case-mangling, so we do not case-mangle it.  However,
             * we *do* switch out of start_sentence or after_colon mode if
             * we happen to be there (otherwise we'll do the wrong thing
             * once we're out of the braces).
             */
            else
            {
               string[dst++] = string[src++];
               start_sentence = after_colon = FALSE;
               depth++;
            }
            break;

         case '}':
            string[dst++] = string[src++];
            depth--;
            break;

         /*
          * Sentence-ending punctuation and colons are handled separately
          * to allow for exact mimicing of BibTeX's behaviour.  I happen
          * to think that this behaviour (capitalize first word of sentences
          * in a title) is better than BibTeX's, but I want to keep my
          * options open for a future goal of perfect compatability.
          */
         case '.':
         case '?':
         case '!':
            start_sentence = TRUE;
            string[dst++] = string[src++];
            break;

         case ':':
            after_colon = TRUE;
            string[dst++] = string[src++];
            break;

         default:
            if (isspace (string[src]))
            {
               string[dst++] = string[src++];
            }
            else
            {
               if (depth == 0)
               {
                  switch (transform)
                  {
                     case 'u':
                        string[dst++] = toupper (string[src++]);
                        break;
                     case 'l':
                        string[dst++] = tolower (string[src++]);
                        break;
                     case 't':
                        if (start_sentence || after_colon)
                        {
                           /*
                            * XXX BibTeX only preserves case of character
                            * immediately after a colon; I do two things
                            * differently: first, I pay attention to sentence
                            * punctuation, and second I force uppercase
                            * at start of sentence or after a colon.
                            */
                           string[dst++] = toupper (string[src++]);
                           start_sentence = after_colon = FALSE;
                        }
                        else
                        {
                           string[dst++] = tolower (string[src++]);
                        }
                        break;
                     default:
                        internal_error ("impossible case transform \"%c\"",
                                        transform);
                  }
               } /* depth == 0 */
               else
               {
                  string[dst++] = string[src++];
               }
            } /* not blank */
      } /* switch on current character */

   } /* while not at end of string */

} /* bt_change_case */