/* ------------------------------------------------------------------------ @NAME : names.c @DESCRIPTION: Functions for dealing with BibTeX names and lists of names: bt_split_list bt_split_name @GLOBALS : @CALLS : @CREATED : 1997/05/05, Greg Ward (as string_util.c) @MODIFIED : 1997/05/14-05/16, GW: added all the code to split individual names, renamed file to names.c @VERSION : $Id: names.c,v 1.23 1999/11/29 01:13:10 greg Rel $ @COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved. This file is part of the btparse library. This library is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -------------------------------------------------------------------------- */ /*#include "bt_config.h"*/ #include #include #include #include #include "btparse.h" #include "prototypes.h" #include "error.h" #include "my_alloca.h" /*#include "my_dmalloc.h"*/ #include "bt_debug.h" #define MAX_COMMAS 2 #define update_depth(s,offs,depth) \ switch (s[offs]) \ { \ case '{': depth++; break; \ case '}': depth--; break; \ } /* * `name_loc' specifies where a name is found -- used for generating * useful warning messages. `line' and `name_num' are both 1-based. */ typedef struct { char * filename; int line; int name_num; } name_loc; GEN_PRIVATE_ERRFUNC (name_warning, (name_loc * loc, const char * fmt, ...), BTERR_CONTENT, loc->filename, loc->line, "name", loc->name_num, fmt) /* ------------------------------------------------------------------------ @NAME : bt_split_list() @INPUT : string - string to split up; whitespace must be collapsed eg. by bt_postprocess_string() delim - delimiter to use; must be lowercase and should be free of whitespace (code requires that delimiters in string be surrounded by whitespace) filename - source of string (for warning messages) line - 1-based line number into file (for warning messages) description - what substrings are (eg. "name") (for warning messages); if NULL will use "substring" @OUTPUT : substrings (*substrings is allocated by bt_split_list() for you) @RETURNS : number of substrings found @DESCRIPTION: Splits a string using a fixed delimiter, in the BibTeX way: * delimiters at beginning or end of string are ignored * delimiters in string must be surrounded by whitespace * case insensitive * delimiters at non-zero brace depth are ignored The list of substrings is returned as *substrings, which is an array of pointers into a duplicate of string. This duplicate copy has been scribbled on such that there is a nul byte at the end of every substring. You should call bt_free_list() to free both the duplicate copy of string and *substrings itself. Do *not* walk over the array free()'ing the substrings yourself, as this is invalid -- they were not malloc()'d! @GLOBALS : @CALLS : @CALLERS : anyone (exported by library) @CREATED : 1997/05/05, GPW @MODIFIED : -------------------------------------------------------------------------- */ bt_stringlist * bt_split_list (char * string, char * delim, char * filename, int line, char * description) { int depth; /* brace depth */ int i, j; /* offset into string and delim */ int inword; /* flag telling if prev. char == ws */ int string_len; int delim_len; int maxdiv; /* upper limit on no. of divisions */ int maxoffs; /* max offset of delim in string */ int numdiv; /* number of divisions */ int * start; /* start of each division */ int * stop; /* stop of each division */ bt_stringlist * list; /* structure to return */ if (string == NULL) return NULL; if (description == NULL) description = "substring"; string_len = strlen (string); delim_len = strlen (delim); maxdiv = (string_len / delim_len) + 1; maxoffs = string_len - delim_len + 1; /* * This is a bit of a band-aid solution to the "split empty string" * bug (formerly hit the internal_error() at the end of hte function). * Still need a general "detect and fix unpreprocessed string" -- * admittedly a different bug/misfeature. */ if (string_len == 0) return NULL; start = (int *) alloca (maxdiv * sizeof (int)); stop = (int *) alloca (maxdiv * sizeof (int)); list = (bt_stringlist *) malloc (sizeof (bt_stringlist)); depth = 0; i = j = 0; inword = 1; /* so leading delim ignored */ numdiv = 0; start[0] = 0; /* first substring @ start of string */ while (i < maxoffs) { /* does current char. in string match current char. in delim? */ if (depth == 0 && !inword && tolower (string[i]) == delim[j]) { j++; i++; /* have we found an entire delim, followed by a space? */ if (j == delim_len && string[i] == ' ') { stop[numdiv] = i - delim_len - 1; start[++numdiv] = ++i; j = 0; #if DEBUG printf ("found complete delim; i == %d, numdiv == %d: " "stop[%d] == %d, start[%d] == %d\n", i, numdiv, numdiv-1, stop[numdiv-1], numdiv, start[numdiv]); #endif } } /* no match between string and delim, at non-zero depth, or in a word */ else { update_depth (string, i, depth); inword = (i < string_len) && (string[i] != ' '); i++; j = 0; } } stop[numdiv] = string_len; /* last substring ends just past eos */ list->num_items = numdiv+1; /* * OK, now we know how many divisions there are and where they are -- * so let's split that string up for real! * * list->items will be an array of pointers into a duplicate of * `string'; we duplicate `string' so we can safely scribble on it and * free() it later (in bt_free_list()). */ list->items = (char **) malloc (list->num_items * sizeof (char *)); list->string = strdup (string); for (i = 0; i < list->num_items; i++) { /* * Possible cases: * - stop < start is for empty elements, e.g. "and and" seen in * input. (`start' for empty element will be the 'a' of the * second 'and', and its stop will be the ' ' *before* the * second 'and'.) * - stop > start is for anything else between two and's (the usual) * - stop == start should never happen if the loop above is correct */ if (stop[i] > start[i]) /* the usual case */ { list->string[stop[i]] = 0; list->items[i] = list->string+start[i]; } else if (stop[i] < start[i]) /* empty element */ { list->items[i] = NULL; general_error (BTERR_CONTENT, filename, line, description, i+1, "empty %s", description); } else /* should not happen! */ { internal_error ("stop == start for substring %d", i); } } return list; /* return num_substrings; */ } /* bt_split_list () */ /* ------------------------------------------------------------------------ @NAME : bt_free_list() @INPUT : list @OUTPUT : @RETURNS : @DESCRIPTION: Frees the list of strings created by bt_split_list(). @GLOBALS : @CALLS : @CALLERS : anyone (exported by library) @CREATED : 1997/05/06, GPW @MODIFIED : -------------------------------------------------------------------------- */ void bt_free_list (bt_stringlist *list) { if (list && list->string) free (list->string); if (list && list->items) free (list->items); if (list) free (list); } /* ---------------------------------------------------------------------- * Stuff for splitting up a single name */ /* ------------------------------------------------------------------------ @NAME : find_commas @INPUT : name - string to search for commas max_commas - maximum number of commas to allow (if more than this number are seen, a warning is printed and the excess commas are removed) @OUTPUT : @RETURNS : number of commas found @DESCRIPTION: Counts and records positions of commas at brace-depth 0. Modifies string in-place to remove whitespace around commas, excess commas, and any trailing commas; warns on excess or trailing commas. Excess commas are removed by replacing them with space and calling bt_postprocess_string() to collapse whitespace a second time; trailing commas are simply replaced with (char) 0 to truncate the string. Assumes whitespace has been collapsed (ie. no space at beginning or end of string, and all internal strings of whitespace reduced to exactly one space). @GLOBALS : @CALLS : name_warning() (if too many commas, or commas at end) @CALLERS : bt_split_name() @CREATED : 1997/05/14, Greg Ward @MODIFIED : -------------------------------------------------------------------------- */ static int find_commas (name_loc * loc, char *name, int max_commas) { int i, j; int depth; int num_commas; int len; boolean at_comma; boolean warned; i = j = 0; depth = 0; num_commas = 0; len = strlen (name); warned = 0; /* First pass to check for and blank out excess commas */ for (i = 0; i < len; i++) { if (depth == 0 && name[i] == ',') { num_commas++; if (num_commas > max_commas) { if (! warned) { name_warning (loc, "too many commas in name (removing extras)"); warned = TRUE; } name[i] = ' '; } } } /* * If we blanked out a comma, better re-collapse whitespace. (This is * a bit of a cop-out -- I could probably adjust i and j appropriately * in the above loop to do the collapsing for me, but my brain * hurt when I tried to think it through. Some other time, perhaps. */ if (warned) bt_postprocess_string (name, BTO_COLLAPSE); /* Now the real comma-finding loop (only if necessary) */ if (num_commas == 0) return 0; num_commas = 0; i = 0; while (i < len) { at_comma = (depth == 0 && name[i] == ','); if (at_comma) { while (j > 0 && name[j-1] == ' ') j--; num_commas++; } update_depth (name, i, depth); if (i != j) name[j] = name[i]; i++; j++; if (at_comma) { while (i < len && name[i] == ' ') i++; } } /* while i */ if (i != j) name[j] = (char) 0; j--; if (name[j] == ',') { name_warning (loc, "comma(s) at end of name (removing)"); while (name[j] == ',') { name[j--] = (char) 0; num_commas--; } } return num_commas; } /* find_commas() */ /* ------------------------------------------------------------------------ @NAME : find_tokens @INPUT : name - string to tokenize (should be a private copy that we're free to clobber and mangle) @OUTPUT : comma_token- number of token immediately preceding each comma (caller must allocate with at least one element per comma in `name') @RETURNS : newly-allocated bt_stringlist structure @DESCRIPTION: Finds tokens in a string; delimiter is space or comma at brace-depth zero. Assumes whitespace has been collapsed and find_commas has been run on the string to remove whitespace around commas and any trailing commas. The bt_stringlist structure returned can (and should) be freed with bt_free_list(). @GLOBALS : @CALLS : @CALLERS : bt_split_name() @CREATED : 1997/05/14, Greg Ward @MODIFIED : -------------------------------------------------------------------------- */ static bt_stringlist * find_tokens (char * name, int * comma_token) { int i; /* index into name */ int num_tok; int in_boundary; /* previous char was ' ' or ',' */ int cur_comma; /* index into comma_token */ int len; int depth; bt_stringlist * tokens; i = 0; in_boundary = 1; /* so first char will start a token */ cur_comma = 0; len = strlen (name); depth = 0; tokens = (bt_stringlist *) malloc (sizeof (bt_stringlist)); /* tokens->string = name ? strdup (name) : NULL; */ tokens->string = name; num_tok = 0; tokens->items = NULL; if (len == 0) /* empty string? */ return tokens; /* return empty token list */ tokens->items = (char **) malloc (sizeof (char *) * len); while (i < len) { if (depth == 0 && in_boundary) /* at start of a new token */ { tokens->items[num_tok++] = name+i; } if (depth == 0 && (name[i] == ' ' || name[i] == ',')) { /* if we're at a comma, record the token preceding the comma */ if (name[i] == ',') { comma_token[cur_comma++] = num_tok-1; } /* * if already in a boundary zone, we have an empty token * (caused by multiple consecutive commas) */ if (in_boundary) { tokens->items[num_tok-1] = NULL; } num_tok--; /* in any case, mark the end of one token and prepare for the * start of the next */ name[i] = (char) 0; in_boundary = 1; } else { in_boundary = 0; /* inside a token */ } update_depth (name, i, depth); i++; } /* while i */ tokens->num_items = num_tok; return tokens; } /* find_tokens() */ /* ------------------------------------------------------------------------ @NAME : find_lc_tokens() @INPUT : tokens @OUTPUT : first_lc last_lc @RETURNS : @DESCRIPTION: Finds the first contiguous string of lowercase tokens in `name'. The string must already be tokenized by find_tokens(), and the input args num_tok, tok_start, and tok_stop are the return value and the two same-named output arguments from find_tokens(). @GLOBALS : @CALLS : @CALLERS : bt_split_name() @CREATED : 1997/05/14, Greg Ward @MODIFIED : -------------------------------------------------------------------------- */ static void find_lc_tokens (bt_stringlist * tokens, int * first_lc, int * last_lc) { int i; /* iterate over token list this time */ int in_lc_sequence; /* in contig. sequence of lc tokens? */ *first_lc = *last_lc = -1; /* haven't found either yet */ in_lc_sequence = 0; i = 0; while (i < tokens->num_items) { if (*first_lc == -1 && islower (tokens->items[i][0])) { *first_lc = i; i++; while (i < tokens->num_items && islower (tokens->items[i][0])) i++; *last_lc = i-1; } else { i++; } } } /* find_lc_tokens() */ /* ------------------------------------------------------------------------ @NAME : resolve_token_range() @INPUT : tokens - structure containing the token list tok_range - two-element array with start and stop token number @OUTPUT : *part - set to point to first token in range, or NULL if empty range *num_tok - number of tokens in the range @RETURNS : @DESCRIPTION: Given a list of tokens and a range of token numbers (as a two-element array, tok_range), computes the number of tokens in the range. If this is >= 0, sets *part to point to the first token in the range; otherwise, sets *part to NULL. @CALLERS : @CREATED : May 1997, GPW @MODIFIED : -------------------------------------------------------------------------- */ static void resolve_token_range (bt_stringlist *tokens, int * tok_range, char *** part, int * num_tok) { *num_tok = (tok_range[1] - tok_range[0]) + 1; if (*num_tok <= 0) { *num_tok = 0; *part = NULL; } else { *part = tokens->items + tok_range[0]; } } /* resolve_token_range() */ /* ------------------------------------------------------------------------ @NAME : split_simple_name() @INPUT : name first_lc last_lc @OUTPUT : name @RETURNS : @DESCRIPTION: Splits up a name (represented as a string divided into non-overlapping, whitespace-separated tokens) according to the BibTeX rules for names without commas. Specifically: * tokens up to (but not including) the first lowercase token, or the last token of the string if there are no lowercase tokens, become the `first' part * the earliest contiguous sequence of lowercase tokens, up to (but not including) the last token of the string, becomes the `von' part * the tokens following the `von' part, or the last single token if there is no `von' part, become the `last' part * there is no `jr' part @GLOBALS : @CALLS : name_warning() (if last lc token taken as lastname) resolve_token_range() @CALLERS : bt_split_name() @CREATED : 1997/05/15, Greg Ward @MODIFIED : -------------------------------------------------------------------------- */ static void split_simple_name (name_loc * loc, bt_name * name, int first_lc, int last_lc) { int first_t[2], von_t[2], last_t[2]; int end; end = name->tokens->num_items-1; /* token number of last token */ if (first_lc > -1) /* any lowercase tokens at all? */ { first_t[0] = 0; /* first name goes from beginning */ first_t[1] = first_lc-1; /* to just before first lc token */ if (last_lc == end) /* sequence of lowercase tokens */ { /* goes all the way to end of string */ last_lc--; /* -- roll it back by one so we */ /* still have a lastname */ #ifdef WARN_LC_LASTNAME /* * disable this warning for now because "others" is used fairly * often as a name in BibTeX databases -- oops! */ name_warning (loc, "no capitalized token at end of name; " "using \"%s\" as lastname", name->tokens->items[end]); #else # ifndef ALLOW_WARNINGS loc = NULL; /* avoid "unused parameter" warning */ # endif #endif } von_t[0] = first_lc; /* `von' part covers sequence of */ von_t[1] = last_lc; /* lowercase tokens */ last_t[0] = last_lc+1; /* lastname from after `von' to end */ last_t[1] = end; /* of string */ } else /* no lowercase tokens */ { von_t[0] = 0; /* empty `von' part */ von_t[1] = -1; first_t[0] = 0; /* `first' goes from first to second */ first_t[1] = end-1; /* last token */ last_t[0] = last_t[1] = end; /* and `last' is just the last token */ } resolve_token_range (name->tokens, first_t, name->parts+BTN_FIRST, name->part_len+BTN_FIRST); resolve_token_range (name->tokens, von_t, name->parts+BTN_VON, name->part_len+BTN_VON); resolve_token_range (name->tokens, last_t, name->parts+BTN_LAST, name->part_len+BTN_LAST); name->parts[BTN_JR] = NULL; /* no jr part possible */ name->part_len[BTN_JR] = 0; } /* split_simple_name() */ /* ------------------------------------------------------------------------ @NAME : split_general_name() @INPUT : name num_commas comma_token first_lc last_lc @OUTPUT : name @RETURNS : @DESCRIPTION: Splits a name according to the BibTeX rules for names with 1 or 2 commas (> 2 commas is handled elsewhere, namely by bt_split_name() calling find_commas() with max_commas == 2). Specifically: * an initial string of lowercase tokens, up to (but not including) the token before the first comma, becomes the `von' part * tokens from immediately after the `von' part, or from the beginning of the string if no `von', up to the first comma become the `last' part if one comma: * all tokens following the sole comma become the `first' part if two commas: * tokens between the two commas become the `jr' part * all tokens following the second comma become the `first' part @GLOBALS : @CALLS : name_warning() (if last lc token taken as lastname) resolve_token_range() @CALLERS : bt_split_name() @CREATED : 1997/05/15, Greg Ward @MODIFIED : -------------------------------------------------------------------------- */ static void split_general_name (name_loc * loc, bt_name * name, int num_commas, int * comma_token, int first_lc, int last_lc) { int first_t[2], von_t[2], last_t[2], jr_t[2]; int end; end = name->tokens->num_items-1; /* last token number */ if (first_lc == 0) /* we have an initial string of */ { /* lowercase tokens */ if (last_lc == comma_token[0]) /* lc string ends at first comma */ { name_warning (loc, "no capitalized tokens before first comma"); last_lc--; } von_t[0] = first_lc; /* `von' covers the sequence of */ von_t[1] = last_lc; /* lowercase tokens */ } else /* no lowercase tokens at start */ { von_t[0] = 0; /* empty `von' part */ von_t[1] = -1; } last_t[0] = von_t[1] + 1; /* start right after end of `von' */ last_t[1] = comma_token[0]; /* and end at first comma */ if (num_commas == 1) { first_t[0] = comma_token[0]+1; /* start right after comma */ first_t[1] = end; /* stop at end of string */ jr_t[0] = 0; /* empty `jr' part */ jr_t[1] = -1; } else /* more than 1 comma */ { jr_t[0] = comma_token[0]+1; /* start after first comma */ jr_t[1] = comma_token[1]; /* stop at second comma */ first_t[0] = comma_token[1]+1; /* start after second comma */ first_t[1] = end; /* and go to end */ } resolve_token_range (name->tokens, first_t, name->parts+BTN_FIRST, name->part_len+BTN_FIRST); resolve_token_range (name->tokens, von_t, name->parts+BTN_VON, name->part_len+BTN_VON); resolve_token_range (name->tokens, last_t, name->parts+BTN_LAST, name->part_len+BTN_LAST); resolve_token_range (name->tokens, jr_t, name->parts+BTN_JR, name->part_len+BTN_JR); } /* split_general_name() */ /* ------------------------------------------------------------------------ @NAME : bt_split_name() @INPUT : name filename line name_num @OUTPUT : @RETURNS : newly-allocated bt_name structure containing the four parts as token-lists @DESCRIPTION: Splits a name according to the BibTeX rules. There are actually two sets of rules: one for names with no commas, and one for names with 1 or 2 commas. (If a name has more than 2 commas, the extras are removed and it's treated as though it had just the first 2.) See split_simple_name() for the no-comma rules, and split_general_name() for the 1-or-2-commas rules. The bt_name structure returned can (and should) be freed with bt_free_name() when you no longer need it. @GLOBALS : @CALLS : @CALLERS : anyone (exported by library) @CREATED : 1997/05/14, Greg Ward @MODIFIED : @COMMENTS : The name-splitting code all implicitly assumes that the string being split has been post-processed to collapse whitespace in the BibTeX way. This means that it tends to dump core on such things as leading whitespace, or more than one space in a row inside the string. This could probably be alleviated with a call to bt_postprocess_string(), possibly preceded by a check for any of those occurences. Before doing that, though, I want to examine the code carefully to determine just what assumptions it makes -- so I can check/correct for all of them. -------------------------------------------------------------------------- */ bt_name * bt_split_name (char * name, char * filename, int line, int name_num) { name_loc loc; bt_stringlist * tokens; int comma_token[MAX_COMMAS]; int len; int num_commas; int first_lc, last_lc; bt_name * split_name; int i; DBG_ACTION (1, printf ("bt_split_name(): name=%p (%s)\n", name, name)) split_name = (bt_name *) malloc (sizeof (bt_name)); if (name == NULL) { len = 0; } else { name = strdup (name); /* private copy that we may clobber */ len = strlen (name); } DBG_ACTION (1, printf ("bt_split_name(): split_name=%p\n", split_name)) if (len == 0) /* non-existent or empty string? */ { split_name->tokens = NULL; for (i = 0; i < BT_MAX_NAMEPARTS; i++) { split_name->parts[i] = NULL; split_name->part_len[i] = 0; } return split_name; } loc.filename = filename; /* so called functions can generate */ loc.line = line; /* decent warning messages */ loc.name_num = name_num; num_commas = find_commas (&loc, name, MAX_COMMAS); assert (num_commas <= MAX_COMMAS); DBG_ACTION (1, printf ("found %d commas: ", num_commas)) tokens = find_tokens (name, comma_token); #if DEBUG printf ("found %d tokens:\n", tokens->num_items); for (i = 0; i < tokens->num_items; i++) { printf (" %d: ", i); if (tokens->items[i]) /* non-empty token? */ { printf (">%s<\n", tokens->items[i]); } else { printf ("(empty)\n"); } } #endif #if DEBUG printf ("comma tokens: "); for (i = 0; i < num_commas; i++) printf ("%d ", comma_token[i]); printf ("\n"); #endif find_lc_tokens (tokens, &first_lc, &last_lc); #if DEBUG printf ("(first,last) lc tokens = (%d,%d)\n", first_lc, last_lc); #endif if (strlen (name) == 0) /* name now empty? */ { split_name->tokens = NULL; for (i = 0; i < BT_MAX_NAMEPARTS; i++) { split_name->parts[i] = NULL; split_name->part_len[i] = 0; } } else { split_name->tokens = tokens; if (num_commas == 0) /* no commas -- "simple" format */ { split_simple_name (&loc, split_name, first_lc, last_lc); } else { split_general_name (&loc, split_name, num_commas, comma_token, first_lc, last_lc); } } #if DEBUG printf ("bt_split_name(): returning structure %p\n", split_name); #endif return split_name; } /* bt_split_name() */ /* ------------------------------------------------------------------------ @NAME : bt_free_name() @INPUT : name @OUTPUT : @RETURNS : @DESCRIPTION: Frees up any memory allocated for a bt_name structure (namely, the `tokens' field [a bt_stringlist structure, this freed with bt_free_list()] and the structure itself.) @CALLS : bt_free_list() @CALLERS : anyone (exported) @CREATED : 1997/11/14, GPW @MODIFIED : -------------------------------------------------------------------------- */ void bt_free_name (bt_name * name) { DBG_ACTION (2, printf ("bt_free_name(): freeing name %p " "(%d tokens, string=%p (%s), last[0]=%s)\n", name, name->tokens->num_items, name->tokens->string, name->tokens->string, name->parts[BTN_LAST][0])); bt_free_list (name->tokens); free (name); DBG_ACTION (2, printf ("bt_free_name(): done, everything freed\n")); }