/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* enchant * Copyright (C) 2003 Dom Lachowicz * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * * In addition, as a special exception, Dom Lachowicz * gives permission to link the code of this program with * non-LGPL Spelling Provider libraries (eg: a MSFT Office * spell checker backend) and distribute linked combinations including * the two. You must obey the GNU Lesser General Public License in all * respects for all of the code used other than said providers. If you modify * this file, you may extend this exception to your version of the * file, but you are not obligated to do so. If you do not wish to * do so, delete this exception statement from your version. */ /* * Copyright 1987, 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All modifications to the source code must be clearly marked as * such. Binary redistributions based on modified source code * must be clearly marked as modified versions in the documentation * and/or other materials provided with the distribution. * 4. All advertising materials mentioning features or use of this software * must display the following acknowledgment: * This product includes software developed by Geoff Kuenning and * other unpaid contributors. * 5. The name of Geoff Kuenning may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Table-driven version of good.c. * * Geoff Kuenning, July 1987 */ /* * $Log$ * Revision 1.1 2004/01/31 16:44:12 zrusin * ISpell plugin. * * Revision 1.4 2003/08/14 17:51:29 dom * update license - exception clause should be Lesser GPL * * Revision 1.3 2003/07/28 20:40:28 dom * fix up the license clause, further win32-registry proof some directory getting functions * * Revision 1.2 2003/07/16 22:52:56 dom * LGPL + exception license * * Revision 1.1 2003/07/15 01:15:09 dom * ispell enchant backend * * Revision 1.2 2003/01/29 05:50:12 hippietrail * * Fixed my mess in EncodingManager. * Changed many C casts to C++ casts. * * Revision 1.1 2003/01/24 05:52:36 hippietrail * * Refactored ispell code. Old ispell global variables had been put into * an allocated structure, a pointer to which was passed to many functions. * I have now made all such functions and variables private members of the * ISpellChecker class. It was C OO, now it's C++ OO. * * I've fixed the makefiles and tested compilation but am unable to test * operation. Please back out my changes if they cause problems which * are not obvious or easy to fix. * * Revision 1.6 2003/01/06 18:48:42 dom * ispell cleanup, start of using new 'add' save features * * Revision 1.5 2002/09/19 05:31:20 hippietrail * * More Ispell cleanup. Conditional globals and DEREF macros are removed. * K&R function declarations removed, converted to Doxygen style comments * where possible. No code has been changed (I hope). Compiles for me but * unable to test. * * Revision 1.4 2002/09/17 03:03:31 hippietrail * * After seeking permission on the developer list I've reformatted all the * spelling source which seemed to have parts which used 2, 3, 4, and 8 * spaces for tabs. It should all look good with our standard 4-space * tabs now. * I've concentrated just on indentation in the actual code. More prettying * could be done. * * NO code changes were made * * * Revision 1.3 2002/09/13 17:20:14 mpritchett * Fix more warnings for Linux build * * Revision 1.2 2001/05/12 16:05:42 thomasf * Big pseudo changes to ispell to make it pass around a structure rather * than rely on all sorts of gloabals willy nilly here and there. Also * fixed our spelling class to work with accepting suggestions once more. * This code is dirty, gross and ugly (not to mention still not supporting * multiple hash sized just yet) but it works on my machine and will no * doubt break other machines. * * Revision 1.1 2001/04/15 16:01:24 tomas_f * moving to spell/xp * * Revision 1.7 1999/10/20 06:03:56 sterwill * Changed C++-style comments to C-style comments in C code. * * Revision 1.6 1999/10/20 03:19:35 paul * Hacked ispell code to ignore any characters that don't fit in the lookup tables loaded from the dictionary. It ain't pretty, but at least we don't crash there any more. * * Revision 1.5 1999/04/13 17:12:51 jeff * Applied "Darren O. Benham" spell check changes. * Fixed crash on Win32 with the new code. * * Revision 1.4 1998/12/29 14:55:33 eric * * I've doctored the ispell code pretty extensively here. It is now * warning-free on Win32. It also *works* on Win32 now, since I * replaced all the I/O calls with ANSI standard ones. * * Revision 1.4 1998/12/29 14:55:33 eric * * I've doctored the ispell code pretty extensively here. It is now * warning-free on Win32. It also *works* on Win32 now, since I * replaced all the I/O calls with ANSI standard ones. * * Revision 1.3 1998/12/28 23:11:30 eric * * modified spell code and integration to build on Windows. * This is still a hack. * * Actually, it doesn't yet WORK on Windows. It just builds. * SpellCheckInit is failing for some reason. * * Revision 1.2 1998/12/28 22:16:22 eric * * These changes begin to incorporate the spell checker into AbiWord. Most * of this is a hack. * * 1. added other/spell to the -I list in config/abi_defs * 2. replaced other/spell/Makefile with one which is more like * our build system. * 3. added other/spell to other/Makefile so that the build will now * dive down and build the spell check library. * 4. added the AbiSpell library to the Makefiles in wp/main * 5. added a call to SpellCheckInit in wp/main/unix/UnixMain.cpp. * This call is a HACK and should be replaced with something * proper later. * 6. added code to fv_View.cpp as follows: * whenever you double-click on a word, the spell checker * verifies that word and prints its status to stdout. * * Caveats: * 1. This will break the Windows build. I'm going to work on fixing it * now. * 2. This only works if your dictionary is in /usr/lib/ispell/american.hash. * The dictionary location is currently hard-coded. This will be * fixed as well. * * Anyway, such as it is, it works. * * Revision 1.1 1998/12/28 18:04:43 davet * Spell checker code stripped from ispell. At this point, there are * two external routines... the Init routine, and a check-a-word routine * which returns a boolean value, and takes a 16 bit char string. * The code resembles the ispell code as much as possible still. * * Revision 1.32 1994/11/02 06:56:16 geoff * Remove the anyword feature, which I've decided is a bad idea. * * Revision 1.31 1994/10/25 05:46:25 geoff * Add support for the FF_ANYWORD (affix applies to all words, even if * flag bit isn't set) flag option. * * Revision 1.30 1994/05/24 06:23:08 geoff * Don't create a hit if "allhits" is clear and capitalization * mismatches. This cures a bug where a word could be in the dictionary * and yet not found. * * Revision 1.29 1994/05/17 06:44:21 geoff * Add support for controlled compound formation and the COMPOUNDONLY * option to affix flags. * * Revision 1.28 1994/01/25 07:12:13 geoff * Get rid of all old RCS log lines in preparation for the 3.1 release. * */ #include #include #include #include "ispell_checker.h" /*! * Check possible affixes * * \param word Word to be checked * \param ucword Upper-case-only copy of word * \param len The length of word/ucword * \param ignoreflagbits Ignore whether affix is legal * \param allhits Keep going after first hit * \param pfxopts Options to apply to prefixes * \param sfxopts Options to apply to suffixes */ void ISpellChecker::chk_aff (ichar_t *word, ichar_t *ucword, int len, int ignoreflagbits, int allhits, int pfxopts, int sfxopts) { ichar_t * cp; /* Pointer to char to index on */ struct flagptr * ind; /* Flag index table to test */ pfx_list_chk (word, ucword, len, pfxopts, sfxopts, &m_pflagindex[0], ignoreflagbits, allhits); cp = ucword; /* HACK: bail on unrecognized chars */ if (*cp >= (SET_SIZE + MAXSTRINGCHARS)) return; ind = &m_pflagindex[*cp++]; while (ind->numents == 0 && ind->pu.fp != NULL) { if (*cp == 0) return; if (ind->pu.fp[0].numents) { pfx_list_chk (word, ucword, len, pfxopts, sfxopts, &ind->pu.fp[0], ignoreflagbits, allhits); if (m_numhits && !allhits && /* !cflag && */ !ignoreflagbits) return; } /* HACK: bail on unrecognized chars */ if (*cp >= (SET_SIZE + MAXSTRINGCHARS)) return; ind = &ind->pu.fp[*cp++]; } pfx_list_chk (word, ucword, len, pfxopts, sfxopts, ind, ignoreflagbits, allhits); if (m_numhits && !allhits && /* !cflag &&*/ !ignoreflagbits) return; chk_suf (word, ucword, len, sfxopts, static_cast(NULL), ignoreflagbits, allhits); } /*! * Check some prefix flags * * \param word Word to be checked * \param ucword Upper-case-only word * \param len The length of ucword * \param optflags Options to apply * \param sfxopts Options to apply to suffixes * \param ind Flag index table * \param ignoreflagbits Ignore whether affix is legal * \param allhits Keep going after first hit * */ void ISpellChecker::pfx_list_chk (ichar_t *word, ichar_t *ucword, int len, int optflags, int sfxopts, struct flagptr * ind, int ignoreflagbits, int allhits) { int cond; /* Condition number */ ichar_t * cp; /* Pointer into end of ucword */ struct dent * dent; /* Dictionary entry we found */ int entcount; /* Number of entries to process */ struct flagent * flent; /* Current table entry */ int preadd; /* Length added to tword2 as prefix */ int tlen; /* Length of tword */ ichar_t tword[INPUTWORDLEN + 4 * MAXAFFIXLEN + 4]; /* Tmp cpy */ ichar_t tword2[sizeof tword]; /* 2nd copy for ins_root_cap */ for (flent = ind->pu.ent, entcount = ind->numents; entcount > 0; flent++, entcount--) { /* * If this is a compound-only affix, ignore it unless we're * looking for that specific thing. */ if ((flent->flagflags & FF_COMPOUNDONLY) != 0 && (optflags & FF_COMPOUNDONLY) == 0) continue; /* * See if the prefix matches. */ tlen = len - flent->affl; if (tlen > 0 && (flent->affl == 0 || icharncmp (flent->affix, ucword, flent->affl) == 0) && tlen + flent->stripl >= flent->numconds) { /* * The prefix matches. Remove it, replace it by the "strip" * string (if any), and check the original conditions. */ if (flent->stripl) icharcpy (tword, flent->strip); icharcpy (tword + flent->stripl, ucword + flent->affl); cp = tword; for (cond = 0; cond < flent->numconds; cond++) { if ((flent->conds[*cp++] & (1 << cond)) == 0) break; } if (cond >= flent->numconds) { /* * The conditions match. See if the word is in the * dictionary. */ tlen += flent->stripl; if (ignoreflagbits) { if ((dent = ispell_lookup (tword, 1)) != NULL) { cp = tword2; if (flent->affl) { icharcpy (cp, flent->affix); cp += flent->affl; *cp++ = '+'; } preadd = cp - tword2; icharcpy (cp, tword); cp += tlen; if (flent->stripl) { *cp++ = '-'; icharcpy (cp, flent->strip); } } } else if ((dent = ispell_lookup (tword, 1)) != NULL && TSTMASKBIT (dent->mask, flent->flagbit)) { if (m_numhits < MAX_HITS) { m_hits[m_numhits].dictent = dent; m_hits[m_numhits].prefix = flent; m_hits[m_numhits].suffix = NULL; m_numhits++; } if (!allhits) { #ifndef NO_CAPITALIZATION_SUPPORT if (cap_ok (word, &m_hits[0], len)) return; m_numhits = 0; #else /* NO_CAPITALIZATION_SUPPORT */ return; #endif /* NO_CAPITALIZATION_SUPPORT */ } } /* * Handle cross-products. */ if (flent->flagflags & FF_CROSSPRODUCT) chk_suf (word, tword, tlen, sfxopts | FF_CROSSPRODUCT, flent, ignoreflagbits, allhits); } } } } /*! * Check possible suffixes * * \param word Word to be checked * \param ucword Upper-case-only word * \param len The length of ucword * \param optflags Affix option flags * \param pfxent Prefix flag entry if cross-prod * \param ignoreflagbits Ignore whether affix is legal * \param allhits Keep going after first hit */ void ISpellChecker::chk_suf (ichar_t *word, ichar_t *ucword, int len, int optflags, struct flagent *pfxent, int ignoreflagbits, int allhits) { ichar_t * cp; /* Pointer to char to index on */ struct flagptr * ind; /* Flag index table to test */ suf_list_chk (word, ucword, len, &m_sflagindex[0], optflags, pfxent, ignoreflagbits, allhits); cp = ucword + len - 1; /* HACK: bail on unrecognized chars */ if (*cp >= (SET_SIZE + MAXSTRINGCHARS)) return; ind = &m_sflagindex[*cp]; while (ind->numents == 0 && ind->pu.fp != NULL) { if (cp == ucword) return; if (ind->pu.fp[0].numents) { suf_list_chk (word, ucword, len, &ind->pu.fp[0], optflags, pfxent, ignoreflagbits, allhits); if (m_numhits != 0 && !allhits && /* !cflag && */ !ignoreflagbits) return; } /* HACK: bail on unrecognized chars */ if (*(cp-1) >= (SET_SIZE + MAXSTRINGCHARS)) return; ind = &ind->pu.fp[*--cp]; } suf_list_chk (word, ucword, len, ind, optflags, pfxent, ignoreflagbits, allhits); } /*! * \param word Word to be checked * \param ucword Upper-case-only word * \param len The length of ucword * \param ind Flag index table * \param optflags Affix option flags * \param pfxent Prefix flag entry if crossonly * \param ignoreflagbits Ignore whether affix is legal * \pram allhits Keep going after first hit */ void ISpellChecker::suf_list_chk (ichar_t *word, ichar_t *ucword, int len, struct flagptr *ind, int optflags, struct flagent *pfxent, int ignoreflagbits, int allhits) { ichar_t * cp; /* Pointer into end of ucword */ int cond; /* Condition number */ struct dent * dent; /* Dictionary entry we found */ int entcount; /* Number of entries to process */ struct flagent * flent; /* Current table entry */ int preadd; /* Length added to tword2 as prefix */ int tlen; /* Length of tword */ ichar_t tword[INPUTWORDLEN + 4 * MAXAFFIXLEN + 4]; /* Tmp cpy */ ichar_t tword2[sizeof tword]; /* 2nd copy for ins_root_cap */ icharcpy (tword, ucword); for (flent = ind->pu.ent, entcount = ind->numents; entcount > 0; flent++, entcount--) { if ((optflags & FF_CROSSPRODUCT) != 0 && (flent->flagflags & FF_CROSSPRODUCT) == 0) continue; /* * If this is a compound-only affix, ignore it unless we're * looking for that specific thing. */ if ((flent->flagflags & FF_COMPOUNDONLY) != 0 && (optflags & FF_COMPOUNDONLY) == 0) continue; /* * See if the suffix matches. */ tlen = len - flent->affl; if (tlen > 0 && (flent->affl == 0 || icharcmp (flent->affix, ucword + tlen) == 0) && tlen + flent->stripl >= flent->numconds) { /* * The suffix matches. Remove it, replace it by the "strip" * string (if any), and check the original conditions. */ icharcpy (tword, ucword); cp = tword + tlen; if (flent->stripl) { icharcpy (cp, flent->strip); tlen += flent->stripl; cp = tword + tlen; } else *cp = '\0'; for (cond = flent->numconds; --cond >= 0; ) { if ((flent->conds[*--cp] & (1 << cond)) == 0) break; } if (cond < 0) { /* * The conditions match. See if the word is in the * dictionary. */ if (ignoreflagbits) { if ((dent = ispell_lookup (tword, 1)) != NULL) { cp = tword2; if ((optflags & FF_CROSSPRODUCT) && pfxent->affl != 0) { icharcpy (cp, pfxent->affix); cp += pfxent->affl; *cp++ = '+'; } preadd = cp - tword2; icharcpy (cp, tword); cp += tlen; if ((optflags & FF_CROSSPRODUCT) && pfxent->stripl != 0) { *cp++ = '-'; icharcpy (cp, pfxent->strip); cp += pfxent->stripl; } if (flent->stripl) { *cp++ = '-'; icharcpy (cp, flent->strip); cp += flent->stripl; } if (flent->affl) { *cp++ = '+'; icharcpy (cp, flent->affix); cp += flent->affl; } } } else if ((dent = ispell_lookup (tword, 1)) != NULL && TSTMASKBIT (dent->mask, flent->flagbit) && ((optflags & FF_CROSSPRODUCT) == 0 || TSTMASKBIT (dent->mask, pfxent->flagbit))) { if (m_numhits < MAX_HITS) { m_hits[m_numhits].dictent = dent; m_hits[m_numhits].prefix = pfxent; m_hits[m_numhits].suffix = flent; m_numhits++; } if (!allhits) { #ifndef NO_CAPITALIZATION_SUPPORT if (cap_ok (word, &m_hits[0], len)) return; m_numhits = 0; #else /* NO_CAPITALIZATION_SUPPORT */ return; #endif /* NO_CAPITALIZATION_SUPPORT */ } } } } } } /*! * Expand a dictionary prefix entry * * \param croot Char version of rootword * \param rootword Root word to expand * \param mask Mask bits to expand on * \param option Option, see expandmode * \param extra Extra info to add to line * * \return */ int ISpellChecker::expand_pre (char *croot, ichar_t *rootword, MASKTYPE mask[], int option, char *extra) { int entcount; /* No. of entries to process */ int explength; /* Length of expansions */ struct flagent * flent; /* Current table entry */ for (flent = m_pflaglist, entcount = m_numpflags, explength = 0; entcount > 0; flent++, entcount--) { if (TSTMASKBIT (mask, flent->flagbit)) explength += pr_pre_expansion (croot, rootword, flent, mask, option, extra); } return explength; } /*! * Print a prefix expansion * * \param croot Char version of rootword * \param rootword Root word to expand * \param flent Current table entry * \param mask Mask bits to expand on * \param option Option, see expandmode * \param extra Extra info to add to line * * \return */ int ISpellChecker::pr_pre_expansion ( char *croot, ichar_t *rootword, struct flagent *flent, MASKTYPE mask[], int option, char *extra) { int cond; /* Current condition number */ ichar_t * nextc; /* Next case choice */ int tlen; /* Length of tword */ ichar_t tword[INPUTWORDLEN + MAXAFFIXLEN]; /* Temp */ tlen = icharlen (rootword); if (flent->numconds > tlen) return 0; tlen -= flent->stripl; if (tlen <= 0) return 0; tlen += flent->affl; for (cond = 0, nextc = rootword; cond < flent->numconds; cond++) { if ((flent->conds[mytoupper (*nextc++)] & (1 << cond)) == 0) return 0; } /* * The conditions are satisfied. Copy the word, add the prefix, * and make it the proper case. This code is carefully written * to match that ins_cap and cap_ok. Note that the affix, as * inserted, is uppercase. * * There is a tricky bit here: if the root is capitalized, we * want a capitalized result. If the root is followcase, however, * we want to duplicate the case of the first remaining letter * of the root. In other words, "Loved/U" should generate "Unloved", * but "LOved/U" should generate "UNLOved" and "lOved/U" should * produce "unlOved". */ if (flent->affl) { icharcpy (tword, flent->affix); nextc = tword + flent->affl; } icharcpy (nextc, rootword + flent->stripl); if (myupper (rootword[0])) { /* We must distinguish followcase from capitalized and all-upper */ for (nextc = rootword + 1; *nextc; nextc++) { if (!myupper (*nextc)) break; } if (*nextc) { /* It's a followcase or capitalized word. Figure out which. */ for ( ; *nextc; nextc++) { if (myupper (*nextc)) break; } if (*nextc) { /* It's followcase. */ if (!myupper (tword[flent->affl])) forcelc (tword, flent->affl); } else { /* It's capitalized */ forcelc (tword + 1, tlen - 1); } } } else { /* Followcase or all-lower, we don't care which */ if (!myupper (*nextc)) forcelc (tword, flent->affl); } if (option == 3) printf ("\n%s", croot); if (option != 4) printf (" %s%s", ichartosstr (tword, 1), extra); if (flent->flagflags & FF_CROSSPRODUCT) return tlen + expand_suf (croot, tword, mask, FF_CROSSPRODUCT, option, extra); else return tlen; } /*! * Expand a dictionary suffix entry * * \param croot Char version of rootword * \param rootword Root word to expand * \param mask Mask bits to expand on * \param optflags Affix option flags * \param option Option, see expandmode * \param extra Extra info to add to line * * \return */ int ISpellChecker::expand_suf (char *croot, ichar_t *rootword, MASKTYPE mask[], int optflags, int option, char *extra) { int entcount; /* No. of entries to process */ int explength; /* Length of expansions */ struct flagent * flent; /* Current table entry */ for (flent = m_sflaglist, entcount = m_numsflags, explength = 0; entcount > 0; flent++, entcount--) { if (TSTMASKBIT (mask, flent->flagbit)) { if ((optflags & FF_CROSSPRODUCT) == 0 || (flent->flagflags & FF_CROSSPRODUCT)) explength += pr_suf_expansion (croot, rootword, flent, option, extra); } } return explength; } /*! * Print a suffix expansion * * \param croot Char version of rootword * \param rootword Root word to expand * \param flent Current table entry * \param option Option, see expandmode * \param extra Extra info to add to line * * \return */ int ISpellChecker::pr_suf_expansion (char *croot, ichar_t *rootword, struct flagent *flent, int option, char *extra) { int cond; /* Current condition number */ ichar_t * nextc; /* Next case choice */ int tlen; /* Length of tword */ ichar_t tword[INPUTWORDLEN + MAXAFFIXLEN]; /* Temp */ tlen = icharlen (rootword); cond = flent->numconds; if (cond > tlen) return 0; if (tlen - flent->stripl <= 0) return 0; for (nextc = rootword + tlen; --cond >= 0; ) { if ((flent->conds[mytoupper (*--nextc)] & (1 << cond)) == 0) return 0; } /* * The conditions are satisfied. Copy the word, add the suffix, * and make it match the case of the last remaining character of the * root. Again, this code carefully matches ins_cap and cap_ok. */ icharcpy (tword, rootword); nextc = tword + tlen - flent->stripl; if (flent->affl) { icharcpy (nextc, flent->affix); if (!myupper (nextc[-1])) forcelc (nextc, flent->affl); } else *nextc = 0; if (option == 3) printf ("\n%s", croot); if (option != 4) printf (" %s%s", ichartosstr (tword, 1), extra); return tlen + flent->affl - flent->stripl; } /*! * \param dst Destination to modify * \param len Length to copy */ void ISpellChecker::forcelc (ichar_t *dst, int len) /* Force to lowercase */ { for ( ; --len >= 0; dst++) *dst = mytolower (*dst); }