AbstractHashTable.h

00001 /*  _________________________________________________________________________
00002  *
00003  *  UTILIB: A utility library for developing portable C++ codes.
00004  *  Copyright (c) 2001, Sandia National Laboratories.
00005  *  This software is distributed under the GNU Lesser General Public License.
00006  *  For more information, see the README file in the top UTILIB directory.
00007  *  _________________________________________________________________________
00008  */
00009 
00010 //
00011 // AbstractHashTable.h
00012 //
00025 #ifndef __AbstractHashTable_h
00026 #define __AbstractHashTable_h
00027 
00028 #ifdef __GNUC__
00029 #pragma interface
00030 #endif
00031 
00032 #ifdef NON_ANSI
00033 #include <iostream.h>
00034 #else
00035 #include <iostream>
00036 using namespace std;
00037 #endif
00038 
00039 #ifndef ANSI_HDRS
00040 #include <string.h>
00041 #include <stdio.h>
00042 #else
00043 #include <cstring>
00044 #include <cstdio>
00045 #endif
00046 
00047 #include "_math.h"
00048 #include "LinkedList.h"
00049 #include "BasicArray.h"
00050 #include "hash_fn.h"
00051 #include "compare.h"
00052 
00053 //
00054 // Externs of a list of primes that are good for hash tables
00055 //
00056 extern int              utilib_num_primes;
00057 extern unsigned long    utilib_prime_list[];
00058 
00059 
00060 template <class T, class KEY>
00061 class AbstractHashTable
00062 { 
00063 public:
00065   typedef size_t size_type;
00067   typedef ListItem<T*>* hash_element;
00068   
00069 public:
00070 
00072   explicit AbstractHashTable(const char* nameBuff = "Unnamed");
00074   explicit AbstractHashTable(size_type initial_size, 
00075                                         const char* nameBuff = "Unnamed");
00077   virtual ~AbstractHashTable()
00078                 {
00079                 clear();
00080                 ListItem<T*>::delete_unused();
00081                 }
00082 
00084   bool      empty() const {return (data.len() == 0);}
00086   operator bool()   const {return (data.len() != 0);}
00088   int      size()  const {return data.len();}
00089 
00095   virtual T* find(KEY& key) const;
00096 
00098   virtual T* first() const
00099                 {
00100                 if (data.len() == 0) return ((T*)0);
00101                 return data.head()->data(); 
00102                 }
00103 
00105   virtual void add(KEY& key)
00106                 {insert(&key);}
00111   virtual void add(KEY& key, T*& item)
00112                 {item = insert(&key);}
00118   virtual void remove(KEY& key, bool& status)
00119                 {
00120                 T* tmp = find(key);
00121                 if (tmp) extract(tmp,status);
00122                 else     status = false;
00123                 }
00129   virtual void remove(T*& item, bool& status)
00130                 {
00131                 if (!item)
00132                    status = false;
00133                 else {
00134                    extract(item,status);
00135                    item = NULL;
00136                    }
00137                 }
00143   virtual void remove(T*& item, KEY& key, bool& status)
00144                 {
00145                 if (!item)
00146                    status = false;
00147                 else {
00148                    key = item->key();
00149                    extract(item,status);
00150                    item = NULL;
00151                    }
00152                 }
00153 
00155   virtual T* next(T* item);
00156 
00158   int      write(ostream& os) const;
00160   int      read(istream& is);
00162   void clear(); 
00163 
00165   void     statistics(ostream& os);
00166 
00168   bool& ignore_duplicates()
00169                 {return ignore_duplicates_flag;}
00170 
00172   bool using_prime_ht;
00174   void set_hash_fn(size_type (*hashfn)(const KEY&,size_type))
00175                 {curr_hashfn = hashfn;}
00177   size_type (*curr_hashfn)(const KEY&,size_type);
00178 
00179  protected:
00180 
00182   virtual size_type hash(KEY& key, size_type table_size) const
00183         {
00184         if (curr_hashfn)
00185            return (*curr_hashfn)(key,table_size);
00186         else
00187            return hash_fn(key,table_size);
00188         }
00190   virtual T* insert(KEY* key);
00192   void    insert(T* item, const bool resize_OK=true);
00193 
00199   virtual void    extract(T* item, bool& status);
00201   void resize(const int newsize=-1);
00202 
00204   LinkedList<T*> data;
00206   BasicArray<ListItem<T*>*> table;
00208   BasicArray<int> bucket_size;
00209 
00211   double max_load;
00213   bool ignore_duplicates_flag;
00214         
00215 };
00216 
00217 
00218 
00219 template <class T, class KEY>
00220 inline ostream& operator<<(ostream& output, const AbstractHashTable<T,KEY>& 
00221                                         table)
00222 { table.write(output); return(output); }
00223  
00224 
00225 
00226 template <class T, class KEY>
00227 inline istream& operator>>(istream& input, AbstractHashTable<T,KEY>& table)
00228 { table.read(input); return(input); }
00229 
00230 
00231 
00232 template <class T, class KEY>
00233 AbstractHashTable<T,KEY>::AbstractHashTable(const char* /*nameBuff*/):
00234   using_prime_ht(true),
00235   curr_hashfn(0),
00236   table(utilib_prime_list[0]),
00237   bucket_size(utilib_prime_list[0]),
00238   max_load(3.0),
00239   ignore_duplicates_flag(true)
00240 { bucket_size=0; table = (ListItem<T*>*)NULL; }
00241 
00242 
00243 
00244 template <class T, class KEY>
00245 AbstractHashTable<T,KEY>::AbstractHashTable(size_type initial_size, 
00246                                                 const char* /*nameBuff*/) :
00247   using_prime_ht(true),
00248   curr_hashfn(0),
00249   table(initial_size),
00250   bucket_size(initial_size),
00251   max_load(3.0),
00252   ignore_duplicates_flag(true)
00253 { bucket_size=0; table = (ListItem<T*>*)NULL; }
00254 
00255 
00256 
00257 template <class T, class KEY>
00258 T* AbstractHashTable<T,KEY>::find(KEY& key) const
00259 {
00260 size_type index = hash(key, table.size());
00261 if (bucket_size[index] == 0)
00262    return ((T*)NULL);
00263 
00264 hash_element curr = table[index];
00265 
00266 int i=0;
00267 for (; curr && (i<bucket_size[index]); i++) {
00268   if (curr->data()->compare(key) == 0)
00269      break;
00270   curr = data.next(curr);
00271   }
00272 
00273 if ((curr == NULL) || (i == bucket_size[index]))
00274    return NULL;
00275 return curr->data();
00276 }
00277 
00278 
00279 
00280 //
00281 // This code is virtually identical to 'find', only we return the 
00282 // hash table item for the 'next' element in the 'data' linked list.
00283 //
00284 template <class T, class KEY>
00285 T* AbstractHashTable<T,KEY>::next(T* item)
00286 {
00287 size_type index = hash(item->key(),table.size());
00288 if (bucket_size[index] == 0)
00289    return ((T*)NULL);
00290 
00291 hash_element curr = table[index];
00292 
00293 int i=0;
00294 for (; i<bucket_size[index]; i++) {
00295   if (item->compare(curr->data()->key()) == 0)
00296      break;
00297   curr = data.next(curr);
00298   }
00299 
00300 curr = data.next(curr);
00301 if (curr == NULL)
00302    return NULL;
00303 return curr->data();
00304 }
00305 
00306 
00307 
00308 template <class T, class KEY>
00309 T* AbstractHashTable<T,KEY>::insert(KEY* key)
00310 {
00311 T* item = new T(key);
00312 if (!item)
00313    ErrAbort("AbstractHashTable<T,KEY>::insert - memory error.");
00314 insert(item);
00315 return item;
00316 }
00317 
00318 
00319 template <class T, class KEY>
00320 void AbstractHashTable<T,KEY>::insert(T* item, const bool resize_OK)
00321 {
00322 bool resize_flag=false;
00323 size_type index;
00324 if ( !(ignore_duplicates_flag && (find(item->key()) != NULL)) ) {
00325    index = hash(item->key(),table.size());
00326    table[index] = data.insert(item, table[index]);
00327    bucket_size[index]++;
00328    }
00329 if (resize_OK == false) return;
00330 if ( (resize_flag == true) || 
00331      (max_load < (((double)data.size())/((double)table.size()))) )
00332    resize();
00333 }
00334 
00335 
00336 
00337 template <class T, class KEY>
00338 void AbstractHashTable<T,KEY>::extract(T* item, bool& status)
00339 {
00340 hash_element curr;
00341 status = false;
00342 
00343 if ((curr = data.find(item)) != NULL) {
00344    size_type index = hash(item->key(),table.size());
00345    if (curr == table[index]) {
00346       if (bucket_size[index] > 1)
00347          table[index] = data.next(table[index]);
00348       else
00349          table[index] = NULL;
00350       }
00351    data.remove(curr);
00352    bucket_size[index]--;
00353    status = true;
00354    delete item;
00355    }
00356 }
00357 
00358 
00359 
00360 template <class T, class KEY>
00361 void AbstractHashTable<T,KEY>::statistics(ostream& os)
00362 {
00363 os << "Hash table size is " << table.size() << endl;
00364 os << "Total number of objects is " << data.size() << endl;
00365 os << "Table load is " << (((double)data.size())/table.size()) << endl;
00366 os << "Largest bucket contains " << max(bucket_size) << " objects" << endl;
00367 os << "Largest bucket is #" << argmax(bucket_size) << endl;
00368 os << "Hash table histogram:" << endl;
00369 char str[128];
00370 sprintf(str,"%10s %10s %10s %10s", "size", "buckets", "objects", "sum-pct");
00371 os << str << endl;
00372 
00373 size_type buckets = table.len();
00374 int tnbuckets=0;
00375 for (int j=0; j<16; j++) {
00376   int nbuckets=0;
00377   int nobjs=0;
00378   for (size_type i=0; i < buckets; i++) {
00379     if (bucket_size[i] == j) {
00380        nbuckets++;
00381        nobjs += j;
00382        }
00383   }
00384   tnbuckets+=nobjs;
00385   sprintf(str,"%10d %10d %10d %10d", j, nbuckets, nobjs, (int)((100.0*tnbuckets)/data.len()) );
00386   os << str << endl;
00387   }
00388 {
00389 int nbuckets=0;
00390 int nobjs=0;
00391 for (size_type i=0; i < buckets; i++) {
00392   if (bucket_size[i] >= 16) {
00393      nbuckets++;
00394      nobjs += bucket_size[i];
00395      }
00396   }
00397 tnbuckets+=nobjs;
00398 sprintf(str,"%10s %10d %10d %10d", ">15", nbuckets, nobjs, (int)((100.0*tnbuckets)/data.len()) );
00399 os << str << endl;
00400 }
00401 }
00402 
00403 
00404 
00405 template <class T, class KEY>
00406 void AbstractHashTable<T,KEY>::clear()
00407 {
00408 bucket_size = 0;
00409 while (data) {
00410   T* tmp = data.head()->data();
00411   delete tmp;
00412   data.remove(data.head());
00413   }
00414 table = (ListItem<T*>*)NULL;
00415 }
00416 
00417 
00418 template <class T, class KEY>
00419 void AbstractHashTable<T,KEY>::resize(const int newsize)
00420 {
00421 int tmp = newsize;
00422 bool flag=true;
00423 
00424 while (flag) {
00425    if (tmp == -1) {
00426       if (using_prime_ht) {
00427          int i=0;
00428          while (i < utilib_num_primes) {
00429            if (utilib_prime_list[i] > table.size())
00430               break;
00431            i++;
00432            }
00433          if (utilib_prime_list[i] <= table.size())
00434             ErrAbort("AbstractHashTable<T,KEY>::resize -- Cannot automatically resize the hashtable any larger.");
00435          tmp = utilib_prime_list[i];
00436          }
00437       else {
00438          tmp = table.size()*2;
00439          }
00440       }
00441       
00442    table.resize(tmp);
00443    table = (ListItem<T*>*)NULL;
00444    bucket_size.resize(tmp);
00445    bucket_size = 0;
00446    
00447    int ndata = data.size();
00448    for (int i=0; i<ndata; i++) {
00449      T* tmp_item=NULL;
00450      data.remove(tmp_item);
00451      if (data.size() != (ndata-1))
00452         ErrAbort("Bug here - First.");
00453      insert(tmp_item,false);
00454      if (data.size() != ndata)
00455         ErrAbort("Bug here - Second.");
00456      }
00457 
00458    //
00459    // We might need to loop until the table is big enough, so
00460    // we set the flag variable for now...
00461    //
00462    flag = false;
00463    }
00464 }
00465 
00466 
00467 template <class T, class KEY>
00468 int AbstractHashTable<T,KEY>::write(ostream& os) const
00469 {
00470 os << data.size() << endl;
00471 
00472 hash_element curr = data.head();
00473 while (curr) {
00474   curr->data()->write(os);
00475   os << endl;
00476   curr = data.next(curr);
00477   }
00478   
00479 return OK;
00480 }
00481 
00482 
00483 
00484 //
00485 // BUGGY!!!
00486 //
00487 template <class T, class KEY>
00488 int AbstractHashTable<T,KEY>::read(istream& is)
00489 {
00490 clear();
00491 size_type Size;
00492 is >> Size;
00493 for (size_type i=0; i<Size; i++) {
00494   T* item;
00495   item->read(is);
00496   insert(item);
00497   }
00498 
00499 return OK;
00500 }
00501 
00502 
00503 #endif