You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
940 lines
25 KiB
940 lines
25 KiB
/* ------------------------------------------------------------------------
@NAME       : lex_auxiliary.c
@INPUT      :
@OUTPUT     :
@RETURNS    :
@DESCRIPTION: The code and global variables here have three main purposes:
                - maintain the lexical buffer (zztoktext, which
                  traditionally with PCCTS is a static array; I have
                  changed things so that it's dynamically allocated and
                  resized on overflow)
                - keep track of lexical state that's not handled by PCCTS
                  code (like "where are we in terms of BibTeX entries?" or
                  "what are the delimiters for the current entry/string?")
                - everything called from lexical actions is here, to keep
                  the grammar file itself neat and clean
@GLOBALS    :
@CALLS      :
@CALLERS    :
@CREATED    : Greg Ward, 1996/07/25-28
@MODIFIED   : Jan 1997
              Jun 1997
@VERSION    : $Id: lex_auxiliary.c,v 1.31 1999/11/29 01:13:10 greg Rel $
@COPYRIGHT  : Copyright (c) 1996-99 by Gregory P. Ward.  All rights reserved.

              This file is part of the btparse library.  This library is
              free software; you can redistribute it and/or modify it under
              the terms of the GNU General Public License as
              published by the Free Software Foundation; either version 2
              of the License, or (at your option) any later version.
-------------------------------------------------------------------------- */
|
|
|
|
/*#include "bt_config.h"*/
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include <stdarg.h>
|
|
#include <assert.h>
|
|
#include "lex_auxiliary.h"
|
|
#include "stdpccts.h"
|
|
#include "error.h"
|
|
#include "prototypes.h"
|
|
/*#include "my_dmalloc.h"*/
|
|
|
|
#define DUPE_TEXT 0
|
|
|
|
extern char * InputFilename; /* from input.c */
|
|
|
|
GEN_PRIVATE_ERRFUNC (lexical_warning, (const char * fmt, ...),
|
|
BTERR_LEXWARN, InputFilename, zzline, NULL, -1, fmt)
|
|
GEN_PRIVATE_ERRFUNC (lexical_error, (const char * fmt, ...),
|
|
BTERR_LEXERR, InputFilename, zzline, NULL, -1, fmt)
|
|
|
|
|
|
|
|
/* ----------------------------------------------------------------------
 * Global variables
 */

/*
 * The lexical buffer.  It is referenced from outside this file, so it
 * cannot be static.  NULL means "not allocated" (see alloc_lex_buffer()
 * and free_lex_buffer()).
 */
char *zztoktext = NULL;
|
|
|
|
/*
|
|
* Now, the lexical state -- first, stuff that arises from scanning
|
|
* at top-level and the beginnings of entries;
|
|
* EntryState:
|
|
* toplevel when we start scanning a file, or when we are in in_entry
|
|
* mode and see '}' or ')'
|
|
* after_at when we are in toplevel mode and see an '@'
|
|
* after_type when we are in after_at mode and see a name (!= 'comment')
|
|
* in_comment when we are in after_at mode and see a name (== 'comment')
|
|
* in_entry when we are in after_type mode and see '{' or '('
|
|
* EntryOpener:
|
|
* the character ('(' or '{') which opened the entry currently being
|
|
* scanned (we use this to make sure that the entry opener and closer
|
|
* match; if not, we issue a warning)
|
|
* EntryMetatype: (NB. typedef for bt_metatype is in btparse.h)
|
|
* classifies entries according to the syntax we will use to parse them;
|
|
* also winds up (after being changed to a bt_nodetype value) in the
|
|
* node that roots the entry AST:
|
|
* comment - anything between () or {}
|
|
* preamble - a single compound value
|
|
* string - a list of "name = compound_value" assignments; no key
|
|
* alias - a single "name = compound_value" assignment (where
|
|
* the compound value in this case is presumably a
|
|
* name, rather than a string -- this is not syntactically
|
|
* checked though)
|
|
* modify,
|
|
* entry - a key followed by a list of "name = compound_value"
|
|
* assignments
|
|
* JunkCount:
|
|
* the number of non-whitespace, non-'@' characters seen at toplevel
|
|
* between two entries (used to print out a warning when we hit
|
|
* the beginning of entry, to help people catch "old style" implicit
|
|
* comments
|
|
*/
|
|
static enum { toplevel, after_at, after_type, in_comment, in_entry }
|
|
EntryState;
|
|
static char EntryOpener; /* '(' or '{' */
|
|
static bt_metatype
|
|
EntryMetatype;
|
|
static int JunkCount; /* non-whitespace chars at toplevel */
|
|
|
|
/*
 * String state -- maintained and used by the functions called from
 * actions in the string lexer.
 *
 * StringOpener:
 *    similar to EntryOpener, but stronger than merely warning of token
 *    mismatch -- this determines which character can actually end the
 *    string
 * BraceDepth:
 *    brace depth within a string; the current string can only end when
 *    this is zero
 * ParenDepth:
 *    parenthesis depth within a string; needed for @comment entries
 *    that are paren-delimited (because the comment in that case is
 *    a paren-delimited string)
 * StringStart:
 *    line on which the current string started; if an apparent runaway is
 *    detected, this is used to report where the runaway started
 * ApparentRunaway:
 *    set once we have detected (and warned) that the current string
 *    appears to be a runaway, so that we warn only once per string
 * QuoteWarned:
 *    set once we have warned about seeing a '"' in a string, because they
 *    tend to come in pairs and one warning per string is enough
 *
 * (See bibtex.g for an explanation of the runaway string detection
 * heuristic.)
 */
static char StringOpener = '\0';        /* '{' or '"' */
static int  StringStart = -1;           /* start line of current string */
static int  BraceDepth;                 /* depth of brace-nesting */
static int  ParenDepth;                 /* depth of parenthesis-nesting */
static int  ApparentRunaway;            /* current string looks like runaway */
static int  QuoteWarned;                /* already warned about " in string? */
|
|
|
|
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* Miscellaneous functions:
|
|
* lex_info() (handy for debugging)
|
|
* zzcr_attr() (called from PCCTS-generated code)
|
|
*/
|
|
|
|
void lex_info (void)
|
|
{
|
|
printf ("LA(1) = \"%s\" token %d, %s\n", LATEXT(1), LA(1), zztokens[LA(1)]);
|
|
#ifdef LL_K
|
|
printf ("LA(2) = \"%s\" token %d, %s\n", LATEXT(2), LA(2), zztokens[LA(2)]);
|
|
#endif
|
|
}
|
|
|
|
|
|
/*
 * zzcr_attr()
 *
 * Fills in the attribute `a' for a token of type `tok' whose text is `txt'
 * (called from PCCTS-generated code).
 *
 * For STRING tokens the surrounding delimiters ({...} or "...") are
 * stripped in place: the closing delimiter is overwritten with NUL and the
 * stored text pointer starts one character past the opening delimiter.
 * A STRING must therefore be at least two characters long; this is
 * asserted, and if assertions are compiled out a too-short text is left
 * untouched rather than indexing txt[-1].
 *
 * The attribute records the text (a strdup()'d copy if DUPE_TEXT is
 * non-zero, otherwise the pointer itself), the token number, the current
 * line (zzline) and the starting column (zzbegcol).
 */
void zzcr_attr (Attrib *a, int tok, char *txt)
{
   if (tok == STRING)
   {
      size_t len = strlen (txt);

      /* both delimiters must be present (also keeps len-1 from wrapping) */
      assert (len >= 2
              && ((txt[0] == '{' && txt[len-1] == '}')
                  || (txt[0] == '"' && txt[len-1] == '"')));

      if (len >= 2)
      {
         txt[len-1] = '\0';             /* remove closing quote from string */
         txt++;                         /* so we'll skip the opening quote */
      }
   }

#if DUPE_TEXT
   a->text = strdup (txt);
#else
   a->text = txt;
#endif
   a->token = tok;
   a->line = zzline;
   a->offset = zzbegcol;
#if DEBUG > 1
   dprintf ("zzcr_attr: input txt = %p (%s)\n", txt, txt);
   dprintf (" dupe txt = %p (%s)\n", a->text, a->text);
#endif
}
|
|
|
|
|
|
#if DUPE_TEXT
/*
 * zzd_attr()
 *
 * Releases the text owned by an attribute.  Only compiled when DUPE_TEXT
 * is non-zero, i.e. when zzcr_attr() stores a strdup()'d copy of the token
 * text.
 */
void zzd_attr (Attrib *attr)
{
   char *owned_text = attr->text;

   free (owned_text);
}
#endif
|
|
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* Lexical buffer functions:
|
|
* alloc_lex_buffer()
|
|
* realloc_lex_buffer()
|
|
* free_lex_buffer()
|
|
* lexer_overflow()
|
|
* zzcopy() (only if ZZCOPY_FUNCTION is defined and true)
|
|
*/
|
|
|
|
|
|
/*
|
|
* alloc_lex_buffer()
|
|
*
|
|
* allocates the lexical buffer with `size' characters. Clears the buffer,
|
|
* points zzlextext at it, and sets zzbufsize to `size'.
|
|
*
|
|
* Does nothing if the buffer is already allocated.
|
|
*
|
|
* globals: zztoktext, zzlextext, zzbufsize
|
|
* callers: bt_parse_entry() (in input.c)
|
|
*/
|
|
void alloc_lex_buffer (int size)
|
|
{
|
|
if (zztoktext == NULL)
|
|
{
|
|
zztoktext = (char *) malloc (size * sizeof (char));
|
|
memset (zztoktext, 0, size);
|
|
zzlextext = zztoktext;
|
|
zzbufsize = size;
|
|
}
|
|
} /* alloc_lex_buffer() */
|
|
|
|
|
|
/*
|
|
* realloc_lex_buffer()
|
|
*
|
|
* Reallocates the lexical buffer -- size is increased by `size_increment'
|
|
* characters (which could be negative). Updates all globals that point
|
|
* to or into the buffer (zzlextext, zzbegexpr, zzendexpr), as well as
|
|
* zztoktext (the buffer itself) zzbufsize (the buffer size).
|
|
*
|
|
* This is only meant to be called (ultimately) from zzgettok(), part of
|
|
* the DLG code. (In fact, zzgettok() invokes the ZZCOPY() macro, which
|
|
* calls lexer_overflow() on buffer overflow, which calls
|
|
* realloc_lex_buffer(). Whatever.) The `lastpos' and `nextpos' arguments
|
|
* correspond, respectively, to a local variable in zzgettok() and a static
|
|
* global in dlgauto.h (hence really in scan.c). They both point into
|
|
* the lexical buffer, so have to be passed by reference here so that
|
|
* we can update them to point into the newly-reallocated buffer.
|
|
*
|
|
* globals: zztottext, zzbufsize, zzlextext, zzbegexpr, zzendexpr
|
|
* callers: lexer_overflow()
|
|
*/
|
|
static void
|
|
realloc_lex_buffer (int size_increment,
|
|
unsigned char ** lastpos,
|
|
unsigned char ** nextpos)
|
|
{
|
|
int beg, end, next;
|
|
|
|
if (zztoktext == NULL)
|
|
internal_error ("attempt to reallocate unallocated lexical buffer");
|
|
|
|
zztoktext = (char *) realloc (zztoktext, zzbufsize+size_increment);
|
|
memset (zztoktext+zzbufsize, 0, size_increment);
|
|
zzbufsize += size_increment;
|
|
|
|
beg = zzbegexpr - zzlextext;
|
|
end = zzendexpr - zzlextext;
|
|
next = *nextpos - zzlextext;
|
|
zzlextext = zztoktext;
|
|
|
|
if (lastpos != NULL)
|
|
*lastpos = zzlextext+zzbufsize-1;
|
|
zzbegexpr = zzlextext + beg;
|
|
zzendexpr = zzlextext + end;
|
|
*nextpos = zzlextext + next;
|
|
|
|
} /* realloc_lex_buffer() */
|
|
|
|
|
|
/*
|
|
* free_lex_buffer()
|
|
*
|
|
* Frees the lexical buffer allocated by alloc_lex_buffer().
|
|
*/
|
|
void free_lex_buffer (void)
|
|
{
|
|
if (zztoktext == NULL)
|
|
internal_error ("attempt to free unallocated (or already freed) "
|
|
"lexical buffer");
|
|
|
|
free (zztoktext);
|
|
zztoktext = NULL;
|
|
} /* free_lex_buffer() */
|
|
|
|
|
|
/*
|
|
* lexer_overflow()
|
|
*
|
|
* Prints a warning and calls realloc_lex_buffer() to increase the size
|
|
* of the lexical buffer by ZZLEXBUFSIZE (a constant -- hence the buffer
|
|
* size increases linearly, not exponentially).
|
|
*
|
|
* Also prints a couple of lines of useful debugging stuff if DEBUG is true.
|
|
*/
|
|
void lexer_overflow (unsigned char **lastpos, unsigned char **nextpos)
|
|
{
|
|
#if DEBUG
|
|
char head[16], tail[16];
|
|
|
|
printf ("zzcopy: overflow detected\n");
|
|
printf (" zzbegcol=%d, zzendcol=%d, zzline=%d\n",
|
|
zzbegcol, zzendcol, zzline);
|
|
strncpy (head, zzlextext, 15); head[15] = 0;
|
|
strncpy (tail, zzlextext+ZZLEXBUFSIZE-15, 15); tail[15] = 0;
|
|
printf (" zzlextext=>%s...%s< (last char=%d (%c))\n",
|
|
head, tail,
|
|
zzlextext[ZZLEXBUFSIZE-1], zzlextext[ZZLEXBUFSIZE-1]);
|
|
printf (" zzchar = %d (%c), zzbegexpr=zzlextext+%d\n",
|
|
zzchar, zzchar, zzbegexpr-zzlextext);
|
|
#endif
|
|
|
|
notify ("lexical buffer overflowed (reallocating to %d bytes)",
|
|
zzbufsize+ZZLEXBUFSIZE);
|
|
realloc_lex_buffer (ZZLEXBUFSIZE, lastpos, nextpos);
|
|
|
|
} /* lexer_overflow () */
|
|
|
|
|
|
#if ZZCOPY_FUNCTION
/*
 * zzcopy()
 *
 * Function form of the ZZCOPY macro (in lex_auxiliary.h), provided for
 * easier debugging.  If the write position has reached `*lastpos', the
 * buffer is grown via lexer_overflow(); then the current character
 * (zzchar) is stored at `*nextpos' and `*nextpos' is advanced by one.
 *
 * `ovf_flag' is accepted but not used here.
 */
void zzcopy (char **nextpos, char **lastpos, int *ovf_flag)
{
   if (!(*nextpos < *lastpos))
      lexer_overflow (lastpos, nextpos);

   *(*nextpos)++ = zzchar;
}
#endif
|
|
|
|
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* Report/maintain lexical state
|
|
* report_state() (only meaningful if DEBUG)
|
|
* initialize_lexer_state()
|
|
*
|
|
* Note that the lexical action functions, below, also fiddle with
|
|
* the lexical state variables an awful lot.
|
|
*/
|
|
|
|
#if DEBUG
/* printable names, indexed by EntryState value */
char *state_names[] =
   { "toplevel", "after_at", "after_type", "in_comment", "in_entry" };
/* printable names for entry metatypes */
char *metatype_names[] =
   { "unknown", "comment", "preamble", "string", "alias", "modify", "entry" };

/*
 * report_state()
 *
 * Debugging trace: prints `where' (a label supplied by the caller)
 * together with the current lexer text, line, start column, lookahead
 * token and the name of the current EntryState.
 */
static void
report_state (char *where)
{
   const char *state = state_names[EntryState];

   printf ("%s: lextext=%s (line %d, offset %d), token=%d, "
           "EntryState=%s\n",
           where, zzlextext, zzline, zzbegcol, NLA, state);
}
#else
/* without DEBUG, calls to report_state() expand to nothing */
# define report_state(where)
#endif
|
|
|
|
void initialize_lexer_state (void)
|
|
{
|
|
zzmode (START);
|
|
EntryState = toplevel;
|
|
EntryOpener = (char) 0;
|
|
EntryMetatype = BTE_UNKNOWN;
|
|
JunkCount = 0;
|
|
}
|
|
|
|
|
|
bt_metatype entry_metatype (void)
|
|
{
|
|
return EntryMetatype;
|
|
}
|
|
|
|
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* Lexical actions (START and LEX_ENTRY modes)
|
|
*/
|
|
|
|
/*
|
|
* newline ()
|
|
*
|
|
* Does everything needed to handle newline outside of a quoted string:
|
|
* increments line counter and skips the newline.
|
|
*/
|
|
void newline (void)
|
|
{
|
|
zzline++;
|
|
zzskip();
|
|
}
|
|
|
|
|
|
void comment (void)
|
|
{
|
|
zzline++;
|
|
zzskip();
|
|
}
|
|
|
|
|
|
void at_sign (void)
|
|
{
|
|
if (EntryState == toplevel)
|
|
{
|
|
EntryState = after_at;
|
|
zzmode (LEX_ENTRY);
|
|
if (JunkCount > 0)
|
|
{
|
|
lexical_warning ("%d characters of junk seen at toplevel", JunkCount);
|
|
JunkCount = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* internal_error ("lexer recognized \"@\" at other than top-level"); */
|
|
lexical_warning ("\"@\" in strange place -- should get syntax error");
|
|
}
|
|
report_state ("at_sign");
|
|
}
|
|
|
|
|
|
void toplevel_junk (void)
|
|
{
|
|
JunkCount += strlen (zzlextext);
|
|
zzskip ();
|
|
}
|
|
|
|
|
|
void name (void)
|
|
{
|
|
report_state ("name (pre)");
|
|
|
|
switch (EntryState)
|
|
{
|
|
case toplevel:
|
|
{
|
|
internal_error ("junk at toplevel (\"%s\")", zzlextext);
|
|
break;
|
|
}
|
|
case after_at:
|
|
{
|
|
char * etype = zzlextext;
|
|
EntryState = after_type;
|
|
|
|
if (strcasecmp (etype, "comment") == 0)
|
|
{
|
|
EntryMetatype = BTE_COMMENT;
|
|
EntryState = in_comment;
|
|
}
|
|
|
|
else if (strcasecmp (etype, "preamble") == 0)
|
|
EntryMetatype = BTE_PREAMBLE;
|
|
|
|
else if (strcasecmp (etype, "string") == 0)
|
|
EntryMetatype = BTE_MACRODEF;
|
|
/*
|
|
else if (strcasecmp (etype, "alias") == 0)
|
|
EntryMetatype = BTE_ALIAS;
|
|
|
|
else if (strcasecmp (etype, "modify") == 0)
|
|
EntryMetatype = BTE_MODIFY;
|
|
*/
|
|
else
|
|
EntryMetatype = BTE_REGULAR;
|
|
|
|
break;
|
|
}
|
|
case after_type:
|
|
case in_comment:
|
|
case in_entry:
|
|
break; /* do nothing */
|
|
}
|
|
|
|
report_state ("name (post)");
|
|
|
|
}
|
|
|
|
|
|
void lbrace (void)
|
|
{
|
|
/*
|
|
* Currently takes a restrictive view of "when an lbrace is an entry
|
|
* opener" -- ie. *only* after '@name' (as determined by EntryState),
|
|
* where name is not 'comment'. This means that lbrace usually
|
|
* determines a string (in particular, when it's seen at toplevel --
|
|
* which will happen under certain error situations), which in turn
|
|
* means that some unexpected things can become strings (like whole
|
|
* entries).
|
|
*/
|
|
|
|
if (EntryState == in_entry || EntryState == in_comment)
|
|
{
|
|
start_string ('{');
|
|
}
|
|
else if (EntryState == after_type)
|
|
{
|
|
EntryState = in_entry;
|
|
EntryOpener = '{';
|
|
NLA = ENTRY_OPEN;
|
|
}
|
|
else
|
|
{
|
|
lexical_warning ("\"{\" in strange place -- should get a syntax error");
|
|
}
|
|
|
|
report_state ("lbrace");
|
|
}
|
|
|
|
|
|
void rbrace (void)
|
|
{
|
|
if (EntryState == in_entry)
|
|
{
|
|
if (EntryOpener == '(')
|
|
lexical_warning ("entry started with \"(\", but ends with \"}\"");
|
|
NLA = ENTRY_CLOSE;
|
|
initialize_lexer_state ();
|
|
}
|
|
else
|
|
{
|
|
lexical_warning ("\"}\" in strange place -- should get a syntax error");
|
|
}
|
|
report_state ("rbrace");
|
|
}
|
|
|
|
|
|
void lparen (void)
|
|
{
|
|
if (EntryState == in_comment)
|
|
{
|
|
start_string ('(');
|
|
}
|
|
else if (EntryState == after_type)
|
|
{
|
|
EntryState = in_entry;
|
|
EntryOpener = '(';
|
|
}
|
|
else
|
|
{
|
|
lexical_warning ("\"(\" in strange place -- should get a syntax error");
|
|
}
|
|
report_state ("lparen");
|
|
}
|
|
|
|
|
|
void rparen (void)
|
|
{
|
|
if (EntryState == in_entry)
|
|
{
|
|
if (EntryOpener == '{')
|
|
lexical_warning ("entry started with \"{\", but ends with \")\"");
|
|
initialize_lexer_state ();
|
|
}
|
|
else
|
|
{
|
|
lexical_warning ("\")\" in strange place -- should get a syntax error");
|
|
}
|
|
report_state ("rparen");
|
|
}
|
|
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* Stuff for processing strings.
|
|
*/
|
|
|
|
|
|
/*
|
|
* start_string ()
|
|
*
|
|
* Called when we see a '{' or '"' in the field data. Records which quote
|
|
* character was used, and calls open_brace() to increment the depth
|
|
* counter if it was a '{'. Switches to LEX_STRING mode, and tells the
|
|
* lexer to continue slurping characters into the same buffer.
|
|
*/
|
|
void start_string (char start_char)
|
|
{
|
|
StringOpener = start_char;
|
|
BraceDepth = 0;
|
|
ParenDepth = 0;
|
|
StringStart = zzline;
|
|
ApparentRunaway = 0;
|
|
QuoteWarned = 0;
|
|
if (start_char == '{')
|
|
open_brace ();
|
|
if (start_char == '(')
|
|
ParenDepth++;
|
|
if (start_char == '"' && EntryState == in_comment)
|
|
{
|
|
lexical_error ("comment entries must be delimited by either braces or parentheses");
|
|
EntryState = toplevel;
|
|
zzmode (START);
|
|
return;
|
|
}
|
|
|
|
#ifdef USER_ZZMODE_STACK
|
|
if (zzauto != LEX_ENTRY || EntryState != in_entry)
|
|
#else
|
|
if (EntryState != in_entry && EntryState != in_comment)
|
|
#endif
|
|
{
|
|
lexical_warning ("start of string seen at weird place");
|
|
}
|
|
|
|
zzmore ();
|
|
zzmode (LEX_STRING);
|
|
}
|
|
|
|
|
|
/*
|
|
* end_string ()
|
|
*
|
|
* Called when we see either a '"' (at depth 0) or '}' (if it brings us
|
|
* down to depth 0) in a quoted string. Just makes sure that braces are
|
|
* balanced, and then goes back to the LEX_FIELD mode.
|
|
*/
|
|
void end_string (char end_char)
|
|
{
|
|
char match;
|
|
|
|
#ifndef ALLOW_WARNINGS
|
|
match = (char) 0; /* silence "might be used" */
|
|
/* uninitialized" warning */
|
|
#endif
|
|
|
|
switch (end_char)
|
|
{
|
|
case '}': match = '{'; break;
|
|
case ')': match = '('; break;
|
|
case '"': match = '"'; break;
|
|
default:
|
|
internal_error ("end_string(): invalid end_char \"%c\"", end_char);
|
|
}
|
|
|
|
assert (StringOpener == match);
|
|
|
|
/*
|
|
* If we're at non-zero BraceDepth, that probably means mismatched braces
|
|
* somewhere -- complain about it and reset BraceDepth to minimize future
|
|
* confusion.
|
|
*/
|
|
|
|
if (BraceDepth > 0)
|
|
{
|
|
lexical_error ("unbalanced braces: too many {'s");
|
|
BraceDepth = 0;
|
|
}
|
|
|
|
StringOpener = (char) 0;
|
|
StringStart = -1;
|
|
NLA = STRING;
|
|
|
|
if (EntryState == in_comment)
|
|
{
|
|
int len = strlen (zzlextext);
|
|
|
|
/*
|
|
* ARG! no, this is wrong -- what if unbalanced braces in the string
|
|
* and we try to output put it later?
|
|
*
|
|
* ARG! again, this is no more wrong than when we strip quotes in
|
|
* post_parse.c, and blithely assume that we can put them back on
|
|
* later for output in BibTeX syntax. Hmmm.
|
|
*
|
|
* Actually, it looks like this isn't a problem after all: you
|
|
* can't have unbalanced braces in a BibTeX string (at least
|
|
* not as parsed by btparse).
|
|
*/
|
|
|
|
if (zzlextext[0] == '(') /* convert to standard quote delims */
|
|
{
|
|
zzlextext[ 0] = '{';
|
|
zzlextext[len-1] = '}';
|
|
}
|
|
|
|
EntryState = toplevel;
|
|
zzmode (START);
|
|
}
|
|
else
|
|
{
|
|
zzmode (LEX_ENTRY);
|
|
}
|
|
|
|
report_state ("string");
|
|
}
|
|
|
|
|
|
/*
|
|
* open_brace ()
|
|
*
|
|
* Called when we see a '{', either to start a string (in which case
|
|
* it's called from start_string()) or inside a string (called directly
|
|
* from the lexer).
|
|
*/
|
|
void open_brace (void)
|
|
{
|
|
BraceDepth++;
|
|
zzmore ();
|
|
report_state ("open_brace");
|
|
}
|
|
|
|
|
|
/*
|
|
* close_brace ()
|
|
*
|
|
* Called when we see a '}' inside a string. Decrements the depth counter
|
|
* and checks to see if we are down to depth 0, in which case the string is
|
|
* ended and the current lookahead token is set to STRING. Otherwise,
|
|
* just tells the lexer to keep slurping characters into the buffer.
|
|
*/
|
|
void close_brace (void)
|
|
{
|
|
BraceDepth--;
|
|
if (StringOpener == '{' && BraceDepth == 0)
|
|
{
|
|
end_string ('}');
|
|
}
|
|
|
|
/*
|
|
* This could happen if some bonehead puts an unmatched right-brace
|
|
* in a quote-delimited string (eg. "Hello}"). To attempt to recover,
|
|
* we reset the depth to zero and continue slurping into the string.
|
|
*/
|
|
else if (BraceDepth < 0)
|
|
{
|
|
lexical_error ("unbalanced braces: too many }'s");
|
|
BraceDepth = 0;
|
|
zzmore ();
|
|
}
|
|
|
|
/* Otherwise, it's just any old right brace in a string -- keep eating */
|
|
else
|
|
{
|
|
zzmore ();
|
|
}
|
|
report_state ("close_brace");
|
|
}
|
|
|
|
|
|
void lparen_in_string (void)
|
|
{
|
|
ParenDepth++;
|
|
zzmore ();
|
|
}
|
|
|
|
|
|
void rparen_in_string (void)
|
|
{
|
|
ParenDepth--;
|
|
if (StringOpener == '(' && ParenDepth == 0)
|
|
{
|
|
end_string (')');
|
|
}
|
|
else
|
|
{
|
|
zzmore ();
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* quote_in_string ()
|
|
*
|
|
* Called when we see '"' in a string. Ends the string if the quote is at
|
|
* depth 0 and the string was started with a quote, otherwise instructs the
|
|
* lexer to continue munching happily along. (Also prints a warning,
|
|
* assuming that input is destined for processing by TeX and you really
|
|
* want either `` or '' rather than ".)
|
|
*/
|
|
void quote_in_string (void)
|
|
{
|
|
if (StringOpener == '"' && BraceDepth == 0)
|
|
{
|
|
end_string ('"');
|
|
}
|
|
else
|
|
{
|
|
boolean at_top = FALSE;;
|
|
|
|
/*
|
|
* Note -- this warning assumes that strings are destined
|
|
* to be processed by TeX, so it should be optional. Hmmm.
|
|
*/
|
|
|
|
if (StringOpener == '"' || StringOpener == '(')
|
|
at_top = (BraceDepth == 0);
|
|
else if (StringOpener == '{')
|
|
at_top = (BraceDepth == 1);
|
|
else
|
|
internal_error ("Illegal string opener \"%c\"", StringOpener);
|
|
|
|
if (!QuoteWarned && at_top)
|
|
{
|
|
lexical_warning ("found \" at brace-depth zero in string "
|
|
"(TeX accents in BibTeX should be inside braces)");
|
|
QuoteWarned = 1;
|
|
}
|
|
zzmore ();
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* check_runaway_string ()
|
|
*
|
|
* Called from the lexer whenever we see a newline in a string. See
|
|
* bibtex.g for a detailed explanation; basically, this function
|
|
* looks for an entry start ("@name{") or new field ("name=") immediately
|
|
* after a newline (with possible whitespace). This is a heuristic
|
|
* check for runaway strings, under the assumption that text that looks
|
|
* like a new entry or new field won't actually occur inside a string
|
|
* very often.
|
|
*/
|
|
void check_runaway_string (void)
|
|
{
|
|
int len;
|
|
int i;
|
|
|
|
/*
|
|
* could these be made significantly more efficient by a 256-element
|
|
* lookup table instead of calling strchr()?
|
|
*/
|
|
static const char *alpha_chars = "abcdefghijklmnopqrstuvwxyz";
|
|
static const char *name_chars = "abcdefghijklmnopqrstuvwxyz0123456789:+/'.-";
|
|
|
|
/*
|
|
* on entry: zzlextext contains the whole string, starting with {
|
|
* and with newlines/tabs converted to space; zzbegexpr points to
|
|
* a chunk of the string starting with newline (newlines and
|
|
* tabs have not yet been converted)
|
|
*/
|
|
|
|
#if DEBUG > 1
|
|
printf ("check_runaway_string(): zzline=%d\n", zzline);
|
|
printf ("zzlextext=>%s<\nzzbegexpr=>%s<\n",
|
|
zzlextext, zzbegexpr);
|
|
#endif
|
|
|
|
|
|
/*
|
|
* increment zzline to take the leading newline into account -- but
|
|
* first a sanity check to be sure that newline is there!
|
|
*/
|
|
|
|
if (zzbegexpr[0] != '\n')
|
|
{
|
|
lexical_warning ("huh? something's wrong (buffer overflow?) near "
|
|
"offset %d (line %d)", zzendcol, zzline);
|
|
/* internal_error ("zzbegexpr (line %d, offset %d-%d, "
|
|
"text >%s<, expr >%s<)"
|
|
"should start with a newline",
|
|
zzline, zzbegcol, zzendcol, zzlextext, zzbegexpr);
|
|
*/
|
|
}
|
|
else
|
|
{
|
|
zzline++;
|
|
}
|
|
|
|
/* standardize whitespace (convert all to space) */
|
|
|
|
len = strlen (zzbegexpr);
|
|
for (i = 0; i < len; i++)
|
|
{
|
|
if (isspace (zzbegexpr[i]))
|
|
zzbegexpr[i] = ' ';
|
|
}
|
|
|
|
|
|
if (!ApparentRunaway) /* haven't already warned about it */
|
|
{
|
|
enum { none, entry, field, giveup } guess;
|
|
|
|
i = 1;
|
|
guess = none;
|
|
while (i < len && zzbegexpr[i] == ' ') i++;
|
|
|
|
if (zzbegexpr[i] == '@')
|
|
{
|
|
i++;
|
|
while (i < len && zzbegexpr[i] == ' ') i++;
|
|
guess = entry;
|
|
}
|
|
|
|
if (strchr (alpha_chars, tolower (zzbegexpr[i])) != NULL)
|
|
{
|
|
while (i < len && strchr (name_chars, tolower (zzbegexpr[i])) != NULL)
|
|
i++;
|
|
while (i < len && zzbegexpr[i] == ' ') i++;
|
|
if (i == len)
|
|
{
|
|
guess = giveup;
|
|
}
|
|
else
|
|
{
|
|
if (guess == entry)
|
|
{
|
|
if (zzbegexpr[i] != '{' && zzbegexpr[i] != '(')
|
|
guess = giveup;
|
|
}
|
|
else /* assume it's a field */
|
|
{
|
|
if (zzbegexpr[i] == '=')
|
|
guess = field;
|
|
else
|
|
guess = giveup;
|
|
}
|
|
}
|
|
}
|
|
else /* no name seen after WS or @ */
|
|
{
|
|
guess = giveup;
|
|
}
|
|
|
|
if (guess == none)
|
|
internal_error ("gee, I should have made a guess by now");
|
|
|
|
if (guess != giveup)
|
|
{
|
|
lexical_warning ("possible runaway string started at line %d",
|
|
StringStart);
|
|
ApparentRunaway = 1;
|
|
}
|
|
}
|
|
|
|
zzmore();
|
|
}
|
|
|