dfa_min_hash.h

00001 #ifndef ASTL_DFA_MIN_HASH_H
00002 #define ASTL_DFA_MIN_HASH_H
00003 
00004 // Descrition:  
00005 //   ASTL 2.0 Determinisic Finite Automaton Class Template DFA_min
00006 //   Dynamic minimal acyclic DFA implementing a perfect hash function
00007 //   from words (sequences of chars) to integers and from integers to
00008 //   words.
00009 //   This class has a limited interface: words can be added with the
00010 //   insert method but no direct state/transition creations are
00011 //   supported. Moreover, this container does not support tags.
00012 //   Sorry, words removal is not implemented yet!
00013 
00014 #include <astl.h>
00015 #include <hash.h>
00016 #include <match.h>
00017 #include <iostream>
00018 #include <set>
00019 #include <vector>
00020 #include <functional>
00021 #include <string>
00022 #include <cstring>    // memcpy
00023 #include <bitset>
00024 #include <iterator>
00025 #include <utility>    // pair<>
00026 
00027 using namespace std;
00028 
00029 namespace astl {
00030 
00053 template <typename CharTraits = plain>
00054 class DFA_min_hash : public DFA_min<CharTraits, hash_tag>
00055 {
00056 public:
00057   typedef DFA_min<CharTraits, hash_tag> super;
00058   typedef DFA_min_hash                  self;
00059   typedef typename super::state_type    state_type;
00060   typedef typename super::char_type     char_type;
00061   typedef typename super::char_traits   char_traits;
00062 
00068   template <typename InputI>
00069   int hash_value(InputI first, InputI last) const {
00070     return astl::hash_value(forwardc(*this), first, last);
00071   }
00072   
00077   int hash_value(const basic_string<char_type> &s) const {
00078     return hash_value(s.begin(), s.end());
00079   }
00080 
00085   template <typename OutputI>
00086   int value_hash(int w, OutputI out) const {
00087     return value_hash(forwardc(*this), w, out);
00088   }
00089 
00093   basic_string<char_type> value_hash(int w) const {
00094     basic_string<char_type> r;
00095     r.reserve(height(super::initial()));
00096     return value_hash(w, back_inserter(r));
00097   }
00098 
00105   template <class ForwardI>
00106   bool insert(ForwardI, ForwardI);
00107   
00113   bool insert(const basic_string<char_type> &s) {
00114     return insert(s.begin(), s.end());
00115   }
00116 };
00117 
00118   template <class CharTraits> 
00119   template <class ForwardI>
00120   bool DFA_min_hash<CharTraits>::insert(ForwardI first, ForwardI last)
00121   {
00122     {
00123       cursor<DFA_min_hash<CharTraits> > c(*this);
00124       if (match(c, first, last)) return false;  // word exists ?
00125     }
00126     if (!super::ready) super::prepare();  // build the structure if necessary
00127     stack_cursor<forward_cursor<self> > c(forwardc(*this));
00128     bool duplicating;
00129 
00130     for(duplicating = false; first != last; ++first) {
00131       if (!duplicating)
00132         pool(height(c.src())).erase(c.src());
00133 
00134       if (!c.find(*first)) {
00135         set_trans(c.src(), *first, super::new_state());
00136         duplicating = true;
00137         c.find(*first);
00138       }
00139       else 
00140         if (super::Q[c.aim()]->degree_in() > 1) {
00141           change_trans(c.src(), *first, duplicate_state(c.aim())); 
00142           duplicating = true;
00143           c.find(*first);
00144         }
00145       c.forward();
00146     }
00147 
00148     if (!duplicating) 
00149       pool(height(c.src())).erase(c.src());
00150 
00151     if (!final(c.src())) {
00152       tag(c.src()).wc() += 1;
00153       final(c.src()) = true;
00154     }
00155 
00156     unsigned int h = height(c.src());
00157 
00158     for(; c.backward(); ) {
00159       // try to insert the current state in its height pool
00160       // if an equivalent state is found its ID is returned, otherwise
00161       // it is inserted 
00162       pair<typename super::container::iterator, bool> i = 
00163         pool(height(c.aim())).insert(c.aim());
00164       if (i.second == false) {  // found an equivalent state ?
00165         state_type tmp = c.aim(); 
00166         // redirect the transition towards equivalent state
00167         change_trans(c.src(), c.letter(), *i.first);
00168         tag(*i.first).wc() = tag(tmp).wc();
00169         del_state(tmp);
00170       }
00171       tag(c.src()).wc() += 1; // update hashing
00172       h = max(height(c.src()), h+1); //Precond: states height initialized with 0
00173       height(c.src(), h);
00174     }
00175     pool(height(super::initial())).insert(super::initial());
00176     ++super::wc_;
00177     return true;
00178   }
00179 
00180 // This specialization of the ordering of states takes advantage of hash tags:
00181 template <typename Transition>
00182 inline
00183 bool operator<(const state<Transition, hash_tag> &x, const state<Transition, hash_tag> &y) {
00184   if (x.tag().wc() < y.tag().wc()) return true;
00185   if (x.tag().wc() == y.tag().wc()) {
00186     if (x.final() < y.final()) return true;
00187     if (x.final() == y.final()) {
00188       if (x.size() < y.size()) return true;
00189       else
00190         if (x.size() == y.size()) { 
00191           typename state<Transition, hash_tag>::const_iterator i = x.begin(), j = x.end(), k = y.begin();
00192           while (i != j)
00193             if (*i < *k) return true;
00194             else if (*k < *i) return false;
00195             else { ++i; ++k; }
00196           return false; // equivalent, not lower than
00197         }
00198     }
00199   }
00200   return false;
00201 }
00202 
00203 // statistics:
00204 
00205 // word count 2000000 (unsorted)
00206 // states 1735092
00207 // transitions 3470388
00208 // ../bin/dfaminhash: 144.54 s CPU time
00209 
00210 // word count 2000000 (sorted)
00211 // states 1735092
00212 // transitions 3470388
00213 // ../bin/dfaminhash: 73.46 s CPU time
00214 
00215 // word count 5000000 (sorted)
00216 // states 3365677
00217 // transitions 7328087
00218 // ../bin/dfaminhash: 202.44 s CPU time
00219 
00220 // word count 8000000 (sorted)
00221 // states 4648267
00222 // transitions 10575345
00223 // ../bin/dfaminhash: 334.99 s CPU time
00224 
00225 } // namespace astl
00226 
00227 #endif
00228 
00229   

Generated on Sun Mar 8 02:41:33 2009 for ASTL by  doxygen 1.5.7.1