You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
115 lines
3.8 KiB
115 lines
3.8 KiB
//
|
|
// WordDBCompress.h
|
|
//
|
|
// WordDBCompress: Implements specific compression scheme for
|
|
// Berkeley DB pages containing WordReferences objects.
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 1999-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: WordDBCompress.h,v 1.6 2004/05/28 13:15:26 lha Exp $
|
|
//
|
|
|
|
#ifndef _WordDBCompress_h_
|
|
#define _WordDBCompress_h_
|
|
|
|
// ***********************************************
|
|
// *************** WordDBCompress*****************
|
|
// ***********************************************
|
|
// Starting point for compression.
|
|
//
|
|
//
|
|
// Compression HOW IT WORKS:
|
|
//
|
|
// ** General outline:
|
|
//
|
|
// BerkeleyDB pages are stored in a memory pool. When the memory pool
|
|
// is full, least recently used pages are swapped to disk. Page
|
|
// compression occurs at page in/out level. The
|
|
// WordDBCompress_compress_c functions are C callbacks that are called
|
|
// by the page compression code in BerkeleyDB. The C callbacks then
|
|
// call the WordDBCompress compress/uncompress methods. The
|
|
// WordDBCompress creates a WordDBPage which does the actual
|
|
// compress/uncompress job.
|
|
//
|
|
// The WordDBPage compression/uncompression methods store/retrieve data
|
|
// from a bitstream. BitStream is a simple bitstream, and Compressor is
|
|
// a bitstream with added compression capabilities.
|
|
//
|
|
|
|
// Compression algorithm.
|
|
//
|
|
// Most DB pages are full of really redundant data. Mifluz choice of using
|
|
// one db entry per word makes the DB pages even more redundant.
|
|
// But this choice also makes the pages have a very simple structure.
|
|
//
|
|
// Here is a real world example of what a page can look like:
|
|
// (key structure: word + 4 numerical fields)
|
|
//
|
|
// "trois" 1 4482 1 10b
|
|
// "trois" 1 4482 1 142
|
|
// "trois" 1 4484 1 40
|
|
// "trois" 1 449f 1 11e
|
|
// "trois" 1 4545 1 11
|
|
// "trois" 1 45d3 1 545
|
|
// "trois" 1 45e0 1 7e5
|
|
// "trois" 1 45e2 1 830
|
|
// "trois" 1 45e8 1 545
|
|
// "trois" 1 45fe 1 ec
|
|
// "trois" 1 4616 1 395
|
|
// "trois" 1 461a 1 1eb
|
|
// "trois" 1 4631 1 49
|
|
// "trois" 1 4634 1 48
|
|
// .... etc ....
|
|
//
|
|
// To compress we chose to only code differences between successive entries.
|
|
//
|
|
// Differences in words are coded by 2 numbers and some letters:
|
|
// - the position within the word of the first letter that changes
|
|
// - the size of the new suffix
|
|
// - the letters in the new suffix
|
|
//
|
|
// Only differences in successive numerical entries are stored.
|
|
//
|
|
// A flag is stored for each entry indicating which fields have changed.
|
|
//
|
|
// All this gives us a few numerical arrays which are themselves compressed
|
|
// and sent to the bitstream.
|
|
//
|
|
//
|
|
class WordDBCompress
|
|
{
|
|
public:
|
|
WordDBCompress();
|
|
WordDBCompress(int, int);
|
|
|
|
int Compress(const u_int8_t* inbuff, int inbuff_length, u_int8_t** outbuffp, int* outbuff_lengthp);
|
|
int Uncompress(const u_int8_t* inbuff, int inbuff_length, u_int8_t* outbuff, int outbuff_length);
|
|
|
|
//
|
|
// Return a new DB_CMPR_INFO initialized with characteristics of the
|
|
// current object and suitable as WordDB::CmprInfo argument.
|
|
//
|
|
DB_CMPR_INFO *CmprInfo();
|
|
|
|
private:
|
|
DB_CMPR_INFO *cmprInfo;
|
|
|
|
//ZLIB WordDBCompression Flags
|
|
int use_zlib;
|
|
int zlib_level;
|
|
|
|
// DEBUGING / BENCHMARKING
|
|
int debug;
|
|
// 0 : no debug no check
|
|
// 1 : TestCompress before each compression (but no debug within Compress Uncompress)
|
|
// 2 : use_tags (BitStream) within TestCompress -> Compress Uncompress
|
|
// 3 : verbose
|
|
int TestCompress(const u_int8_t* pagebuff, int pagebuffsize);
|
|
};
|
|
|
|
#endif
|