/* ===================================================================== // // Filename : hash.h // Author : kem // Description : hashtable template // // Modification History: // 6/18/97 bean Fixed [] operator, added documentation, changed // from linear probing to double hashing because // clustering was slowing things down in Sundance // 11/21/97 bean Add iterator code to solve inefficiency in // scanning code that was using the [] operator. // Also redocumented everything. // // ************************************************************************** */ // ------------------------------------------------------------------- /* HASHTABLE CLASS This class provides a data structure for fast lookup and storage. Standard functions are: hashtable(unsigned int maxsz = 0); - Creates a hashtable of the size passed in maxsz/ T& add(const char* key, T& item); - Adds a item to the table using the char[] as a key. int member(const char* key); - Is 'key' in the table? void remove(const char* key); - Delete item indexed by key from the table. unsigned int count(); - How many items are in the table? T& get(const char* key); - Returns a reference to the item indexed by key. PLEASE SEE THE FUNCTION DEFINITIONS BELOW FOR MORE INFORMATION ON THESE AND OTHER FUNCTIONS AVAILABLE INCLUDING ITERATORS AND SPECIAL OPERATORS. Interface Note -- this hash table keeps actual objects around -- so the objects must have copy constructors that actually copy the objects -- not just pointers to them -- note that you can have a hashtable of pointers to objects however -- and the compiler provides the straightforward copy constructor. for example... hashtable hsp; {you must allocate the strings yourself} hashtable hs; {the hashtable copies the strings} EXAMPLE CODE : (for a hashtable of string objects) #include "sunstr.h" #include "hash.h" #include int main(void) { sunstr AString; hashtable MyHashTable; // defaults to 100 buckets while (cin >> AString) { // read words from stdin MyHashTable.add(AString,AString); // This add is interesting -- note that we're storing // a string object in the hash table (that's the // second parameter), but we're also storing it under // the string's value, i.e. "abc". The string object // gets converted to a char* automatically by C++ // because the compiler knows that the first parameter // to add() must be a char* and there is a (char*) // typecasting operator defined in the string class. } // do a lookup on a word if (MyHashTable.member("abc")) cout << "Found 'abc' in the table." << endl; // do something to one of the items MyHashTable.get("abc").toUpper(); // Now the item "abc" has been changed to "ABC" // although it's still _keyed_ by "abc" // Find out how many items are in the table cout << MyHashTable.count() << " items in the table." << endl; // Get the 10th item... if (MyHashTable.count() >= 10) cout << MyHashTable[10] << " is the 10th item." << endl; // Iterate through all the items MyHashTable.ResetIterator(); for (unsigned int i=0; i < MyHashTable.count(); i++){ // these two lines get the address of the reference, // allowing us to use a pointer which is good if // you want to do more than one thing with the // item... string* SPtr; SPtr = &MyHashTable.Iterate(); // this line use the returned reference // MyHashTable.Iterate().toUpper(); // NOTE, however, that every call to Iterate() moves // the scan by one item, so you only want to call // Iterate() once for each pass through this loop. // The second iterate example above is just for // illustrative purposes -- in practice you would // do one or the other...not both. } // If you are storing actual objects, rather than pointers // to objects, in the table, then when the table is destroyed, // all memory will be released. If you are storing pointers // then you need to iterate through the table doing "delete" // on each valid item. In this case, we're storing actual // strings and the hashtable will be destroyed automatically // when the scope of main() is left, so no cleanup is needed. } */ #ifndef __HASH_H #define __HASH_H #include #include // for memset() #include #include "sunstr.h" template class hashtable { public: // ------------------------------------------------------------------- virtual T& add(const char* key, T& item) // ------------------------------------------------------------------- /* This function add the item T into the hash table indexed by the string (char*) passed in key. If key already exists, add will simply return the previous member */ { if (member(key)) return (*table[hash(key)]); ensure_we_have_enough_space(); unsigned int ind = hash(key, 1); table[ind] = new T(item); /* copy it */ assert(table[ind]); keys[ind] = new char[strlen(key)+1]; /* alloc mem for key */ assert(keys[ind]); strcpy(keys[ind], key); /* copy key */ num_elems++; return (*table[ind]); }; // ------------------------------------------------------------------- virtual int member(const char* key) // ------------------------------------------------------------------- /* This function returns 1 if the item indexed by key is in the hash table, 0 otherwise. NOTE: This function should really be declared as a const function, but it can't be without resorting to using the mutuable keyword so that the hash() function can do some manipulation without being considered non-const. We originally were using the newer C++ mutuable keyword standard, but the optimized compilers on the UltraSparcs weren't up-to-date enough. Perhaps it's time we either forget about the machine-specific compilers or get newer versions. bean 9/20/99 */ { return (table[hash(key)]!=0); }; // ------------------------------------------------------------------- virtual T& get(const char* key) // ------------------------------------------------------------------- /* This function returns a reference to the item in the table indexed by the key. */ { unsigned int ind = hash(key); assert(table[ind]); return (*table[ind]); }; // ------------------------------------------------------------------- virtual void remove(const char* key) // ------------------------------------------------------------------- /* This function removes an item from the table given the key. */ { unsigned int ind = hash(key); if (table[ind]!=0) { delete table[ind]; delete keys[ind]; table[ind]=0; keys[ind]=(char*)1; /* flag for deletion */ num_elems--; } }; // ------------------------------------------------------------------- virtual unsigned int count() const {return (num_elems);}; // ------------------------------------------------------------------- // ------------------------------------------------------------------- // ITERATOR CODE // // The following three functions are special iterators, which make // it easy and efficient to scan the contents of the hashtable // sequentially. To use, be sure to execute ResetIterator() first! // Then, use either Iterate() or Iterate(char**) to extract a // reference to the next item in the hashtable. Two versions of // Iterate() are available because some code needs to get at not // only the item itself but its key as well -- that's the advantage // of Iterate(char**) which sets the key value in the char** passed in. // MEMORY CAUTION: // If you use the Iterate(char**), remember that the hash table code // uses that char* to create a copy of the key value each time the // iterator is called. The code is smart enough to clean up the // existing memory of the char* before it copies in a new value, // but after the last iteration, it will be left up to you to delete // the final copy in memory! // Example code: // // hashtable.ResetIterator(); // *don't* forget this! // for (unsigned int i=0; i < hashtable.count(); i++) { // item = hashtable.Iterate(); // do something with item // } // or // // char* KeyValue = 0; // hashtable.ResetIterator(); // *don't* forget this! // for (unsigned int i=0; i < hashtable.count(); i++) { // item = hashtable.Iterate(&KeyValue); // do something with item and KeyValue // } // if (KeyValue) delete KeyValue; // clean up memory // // Of course, any changes made to the hash table during an iteration // will cause false results! So, be sure to only use the iterators // when no changes will be made to the table. // ------------------------------------------------------------------- virtual void ResetIterator() { // ------------------------------------------------------------------- /* This function resets the counters used by the iterators */ tableind = 0; // raw index into the table's buckets lastind = -1; // index to an item in the table } // ------------------------------------------------------------------- virtual T& Iterate( void ){ // ------------------------------------------------------------------- /* This function, when called repeatedly, returns references to succcessive items in the hashtable. If it gets called more times than there are items in the table, it will wrap around to the beginning of the table... */ // increment the lastind we're looking for lastind++; // if we run off the end, wrap around again... if (lastind == (int)num_elems) ResetIterator(); while (!table[tableind]) tableind++; return (*table[tableind++]); } // ------------------------------------------------------------------- virtual T& Iterate( char** PutKeyHere ){ // ------------------------------------------------------------------- /* This function, when called repeatedly, returns references to succcessive items in the hashtable. It also sets the PutKeyHere parameter to the key of the time being returned. Please remember to initialize PutKeyHere to null the first time. Also, keep in mind to initialize *PutKeyHere to 0, NOT **PutKeyHere. Finally, in your calling code, remember to delete the pointer to the copy of the key after the last iteration! If it gets called more times than there are items in the table, it will wrap around to the beginning of the table... */ // increment the lastind we're looking for lastind++; // if we run off the end, wrap around again... if (lastind == (int)num_elems) ResetIterator(); while (!table[tableind]) tableind++; if (PutKeyHere) { // user wants the key value sent back // 1. delete space for PutKeyHere if (*PutKeyHere) delete (*PutKeyHere); // 2. allocate new space *PutKeyHere = new char[strlen(keys[tableind])+1]; // 3. copy over the string strcpy(*PutKeyHere, keys[tableind]); } return (*table[tableind++]); } // ------------------------------------------------------------------- virtual T& operator[](unsigned int ind) const // ------------------------------------------------------------------- /* This function retrieves the i'th element in the table when measured sequentially from the front of the table. If you're planning on scanning through the table from beginning to end, DO NOT USE this function -- use the iterators instead...this function *always* starts at the beginning of the table, so it gets very inefficient if you're doing a scan through the table. */ /* See weird code documentation note below subj: LocalThis */ { // do a check on ind -- this also means that we don't have to // worry about falling off the end of the table... assert(ind < num_elems); hashtable * const LocalThis = const_cast(this); LocalThis->lastind = -1; LocalThis->tableind = 0; for (; lastind < (int)ind; LocalThis->tableind++) if (table[tableind]) LocalThis->lastind++; LocalThis->tableind--; //return (keys[tableind]); why did this work ever? bean 4/5/98 return (*table[tableind]); }; // ------------------------------------------------------------------- virtual char* getkey(unsigned int ind) const // ------------------------------------------------------------------- /* This function retrieves the key used to access the ind'th item in the table. This function always scans from the beginning of the table up to the ind'th item, so if you're doing a sequential scan through the table, use the iterators -- they're much more efficient. */ { assert(ind < num_elems); hashtable * const LocalThis = const_cast(this); LocalThis->lastind = -1; LocalThis->tableind = 0; for (;lastind < (int)ind; LocalThis->tableind++) if (table[tableind]) LocalThis->lastind++; LocalThis->tableind--; return (keys[tableind]); }; // ------------------------------------------------------------------- virtual void reset() // ------------------------------------------------------------------- /* This function clears and resets the hash table to a set size defined in the constructor. To force your own initial table size, use reset_and_resize(unsigned int). */ { reset_and_resize(maxsize); } // ------------------------------------------------------------------- virtual void reset_and_resize(unsigned int maxsz) // ------------------------------------------------------------------- /* This function clears and resets the hash table to a set size */ { clear(); lastind = -1; tableind = 0; if (maxsz<1) maxsize = 100; else maxsize = maxsz; table = new T*[maxsize]; keys = new char*[maxsize]; assert(table); assert(keys); memset(table, 0, sizeof(T*)*maxsize); memset(keys, 0, sizeof(char*)*maxsize); num_elems=0; }; // ------------------------------------------------------------------- hashtable(unsigned int maxsz = 0) // ------------------------------------------------------------------- // constructor - default size set to 100 { lastind = -1; tableind = 0; if (maxsz==0) maxsize = 100; else maxsize = maxsz; table = new T*[maxsize]; keys = new char*[maxsize]; assert(table); assert(keys); memset(table, 0, sizeof(T*)*maxsize); memset(keys, 0, sizeof(char*)*maxsize); num_elems=0; }; // ------------------------------------------------------------------- hashtable(const hashtable& ht) // ------------------------------------------------------------------- /* Constructs the hash table from another one */ { lastind = -1; tableind = 0; maxsize = ht.maxsize; num_elems=0; table = new T*[maxsize]; keys = new char*[maxsize]; assert(table); assert(keys); memset(table, 0, sizeof(T*)*maxsize); memset(keys, 0, sizeof(char*)*maxsize); for (unsigned int i=0;i= 2) ensure_we_have_enough_space(1); } // Some debugging messages... //cerr << "(" << num_collisions << "," //<< num_elems << "," << maxsize << ") "; return (result); }; // ------------------------------------------------------------------- void ensure_we_have_enough_space(int forced = 0) // ------------------------------------------------------------------- { // forced = TRUE if we want the table grown regardless of its "fullness" // Increase the size of the table if we're more than 10% full // or if the forced flag is true. if ((num_elems >= ((maxsize * 0.1)-1)) || forced) { int oldmax = maxsize; T **oldtable = table; char **oldkeys = keys; maxsize = maxsize * 2; table = new T*[maxsize]; keys = new char*[maxsize]; assert(table); assert(keys); memset(table, 0, sizeof(T*)*maxsize); memset(keys, 0, sizeof(char*)*maxsize); for (int i=0;i> MyHashTable; -- do some processing with MyHashTable -- */ // ------------------------------------------------------------------- template class hashio : public hashtable { // ------------------------------------------------------------------- friend ostream& operator<<(ostream& os, hashio& ht) // ------------------------------------------------------------------- { /* Rewritted to take advantage of the iterator code, bean 8/2/99 */ ht.ResetIterator(); char* Key = 0; for (unsigned int i=0;i>(istream& is, hashio& ht) // ------------------------------------------------------------------- { sunstr key; T item; while (is >> key) { is >> item; ht.add(key,item); } return (is); }; public: hashio(unsigned int maxsz = 0) : hashtable(maxsz){}; hashio(const hashio& ht) : hashtable(ht){}; }; // Long Key hash table -- optimized for really long string keys. template class hashLK : public hashtable { // ---------------- hash function ------------------------------------ virtual unsigned int hash(const char* key, int inserting = 0) // ------------------------------------------------------------------- { assert(key); int result = 0; // the hash value from hash1() fn int skip = 0; // the value from hash2() fn int looping = 0; // are we in an infinite loop? int Length = strlen(key); unsigned int i; unsigned int EveryNChar = 1; if (Length > 100) EveryNChar = 5; else if (Length > 60) EveryNChar = 4; else if (Length > 30) EveryNChar = 3; // initial hash function here for (i=0; i < (unsigned int)Length; i += EveryNChar) result = (64 * result + key[i]); result = result % hashtable::maxsize; // second hash function here skip = 64 - (result % 64); /* the following will stop as soon as we find an empty slot, or a slot with the same key -- or if we find a key that says its been deleted (==1) and we're in insertion mode. */ // look for an empty bucket while (hashtable::keys[result]!=0){ // if this bucket holds a deleted value and we're inserting, use it! if ((hashtable::keys[result]==(char*)1) && inserting) return (result); // if this bucket holds some value and it's == to our key, reuse it! if ((hashtable::keys[result]!=(char*)1) && (!strcmp(hashtable::keys[result], key))) return (result); // this next line implements simple linear probing which is causing // clustering, and that's slowing things down to a crawl // result = (result + 1) % maxsize; // let's try double hashing // HEY! What about setting the skip value here...that way the double // hashing function doesn't get called except when needed. //skip = doublehash( result ); result = (result + skip) % hashtable::maxsize; if (result < skip) looping++; // we looped to the start of the table /* The trouble with double hashing is that we run the risk of getting in an infinite loop due to an even valued skip size and table size. Without having a prime number generator, we can test for looping by checking to see if we've gone from the end of the table back to the front more than once. If so, then we better grow the table! */ if (looping >= 2) hashtable::ensure_we_have_enough_space(1); } // Some debugging messages... //cerr << "(" << num_collisions << "," //<< num_elems << "," << maxsize << ") "; return (result); }; }; /* ostream& operator<<(ostream&, const hashio&); istream& operator>>(istream&, hashio&); */ #endif /*#ifndef __HASH_H*/