dfa_compact.h

00001 /*
00002  * ASTL - the Automaton Standard Template Library.
00003  * C++ generic components for Finite State Automata handling.
00004  * Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr).
00005  * 
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  * 
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  * 
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 #ifndef ASTL_DFA_COMPACT_H
00023 #define ASTL_DFA_COMPACT_H
00024 
00025 // Descrition:    
00026 //   ASTL 2.0 Determinisic Finite Automaton Class Template DFA_compact
00027 //   Compact Representation with a single transition array 
00028 //  
00029 
00030 
00031 
00032 #ifndef _MSC_VER
00033 #include <unistd.h>   // close(int)
00034 #include <sys/mman.h>  // mmap
00035 #include <fcntl.h>     // read
00036 #include <errno.h>
00037 #endif
00038 
00039 #include <vector>
00040 #include <utility>     // pair<>
00041 #include <functional>  // identity<>
00042 #include <iostream>
00043 
00044 using namespace std;
00045 
00046 namespace astl {
00047 
00048 // ** WARNING: to be able to use tags and the binary save/load
00049 // functionality (along side memory mapping), the tag type MUST BE A POD TYPE **
00050 
00051 // Implementation notes
00052 // This representation makes use of 4 arrays:
00053 // 1. vector v1 for transitions labels (value_type == DFA::Alphabet)
00054 // 2. vector v2 for transitions aims (value_type == unsigned long)
00055 // 3. a bit vector for final states flags
00056 // 4. vector of tags
00057 
00058 // A matrix cell is made of two cells from vectors v1 and v2.
00059 // if (v1[n] == 0) then there is no transition is this cell
00060 //   then if (v2[n] != 0) then v2[n] is the tag index for the state in cell n+1
00061 //   else the cell is free
00062 // else v1[n] is the transition label and v2[n] is the transition target
00063 
00064 template <class DFA, class tag_mapping = identity<typename DFA::tag_type> >
00065 class DFA_compact : public DFA_concept
00066 {
00067 public:
00068   typedef typename DFA::char_traits         char_traits;
00069   typedef typename DFA::char_type           char_type;
00070   typedef unsigned long                     state_type;
00071   typedef typename tag_mapping::result_type tag_type;
00072  
00073 private:
00074   typedef mapable_vector<int>::size_type size_type;
00075   typedef mapable_vector<bool> set_F;
00076   typedef DFA_compact self;
00077 
00078 public:
00079   class const_iterator : public iterator<forward_iterator_tag, state_type>
00080   {
00081     friend class DFA_compact<DFA, tag_mapping>;
00082   private:
00083     state_type state;
00084     const mapable_vector<char_type> *ml;
00085     const mapable_vector<state_type> *ma;
00086 
00087     const_iterator(state_type x, const mapable_vector<char_type> *a, 
00088                    const mapable_vector<state_type> *s) 
00089       : state(x), ml(a), ma(s) { }    
00090 
00091   public:
00092 
00093     const_iterator() : ml(NULL), ma(NULL) { }
00094     state_type operator * () const { return (state); }
00095     const_iterator& operator ++ ()
00096     {
00097       for (; state < ml->size(); ++state) {
00098         if ((*ml)[state] == (char_type) 0 && (*ma)[state] != 0) {
00099           // this is a cell holding a tag index
00100           ++state; // there is a state after a tag
00101           break;
00102         }
00103       }
00104       return (*this);
00105     }
00106     const_iterator operator ++ (int)
00107     {
00108       const_iterator tmp = *this;
00109       ++(*this);
00110       return (tmp);
00111     }
00112     const_iterator& operator = (const const_iterator &x) 
00113     {
00114       state = x.state;
00115       ml = x.ml;
00116       ma = x.ma;
00117       return (*this);
00118     }
00119     bool operator == (const const_iterator &x) const {
00120       return (state == x.state);
00121     }
00122     bool operator != (const const_iterator &x) const {
00123       return (!(*this == x));
00124     }
00125   };
00126 
00127   class edges_type
00128   {
00129   private:
00130     state_type                s;
00131     const mapable_vector<char_type>   *ml;
00132     const mapable_vector<state_type> *ma;
00133 
00134   public:
00135     edges_type(state_type st, const mapable_vector<char_type> &l, 
00136                const mapable_vector<state_type> &a) 
00137       : s(st), ml(&l), ma(&a) { }
00138 
00139     typedef char_type key_type;
00140     typedef pair<const key_type, state_type> value_type;
00141     typedef less<char_type> key_compare;
00142 
00143     typedef value_type* pointer;
00144     typedef value_type& reference;
00145     typedef const value_type& const_reference;
00146     typedef state_type size_type;
00147     typedef typename DFA::edges_type::difference_type difference_type;
00148     
00149     class value_compare : public binary_function<value_type, value_type, bool>
00150     {
00151       friend class edges_type;
00152       
00153     protected:
00154       key_compare comp;
00155       value_compare(key_compare c) : comp(c) { }
00156       
00157     public:
00158       bool operator () (const value_type& x, const value_type& y) {
00159         return (comp(x.first, y.first));
00160       }
00161     };
00162 
00163         class const_iterator : std::iterator<bidirectional_iterator_tag, value_type>
00164     {
00165     public:
00166       const_iterator(const char_type *l, 
00167                      const state_type *a, size_type pos_ = 0)
00168         : ml(l), ma(a), pos(pos_) { }
00169 
00170       typedef pair<char_type, state_type> value_type;
00171 
00172       const_iterator() { }
00173       const_iterator(const const_iterator& i) 
00174         : ml(i.ml), ma(i.ma), pos(i.pos) 
00175       { }
00176 
00177       bool operator == (const const_iterator &x) const {
00178         return ml == x.ml;
00179       }
00180 
00181       value_type operator * () const {
00182         return typename const_iterator::value_type(*ml, *ma);
00183       }
00184 
00185       const_iterator& operator ++ ()
00186       {
00187         for (++pos, ++ml, ++ma; pos < self::char_traits::size; 
00188              ++pos, ++ml, ++ma)
00189           if (*ml != (char_type) 0 &&
00190               (size_t) self::char_traits::to_int_type(*ml) == pos)
00191             break;
00192         return (*this);
00193       }
00194       const_iterator operator ++ (int)
00195       {
00196         const_iterator tmp = (*this);
00197         ++(*this);
00198         return tmp;
00199       }
00200       const_iterator& operator -- ()
00201       {
00202         for (--pos, --ml, --ma; pos != 0; --pos, --ml, --ma)
00203           if (self::char_traits::to_int_type(*ml) == pos) 
00204             break;
00205         return (*this);
00206       }
00207       const_iterator operator -- (int)
00208       {
00209         const_iterator tmp = (*this);
00210         --(*this);
00211         return tmp;
00212       }
00213 
00214     protected:
00215       const char_type  *ml;
00216       const state_type *ma;
00217       size_type        pos;
00218     };
00219 
00220     // Back to edges_type class
00221 
00222     typedef const_iterator iterator;
00223 
00224     edges_type() : s((state_type) 0), ml(NULL), ma(NULL) { }
00225 
00226     key_compare key_comp() const
00227     { 
00228       key_compare tmp;
00229       return tmp; 
00230     }
00231 
00232     value_compare value_comp() const { return (value_compare(key_comp())); }
00233 
00234     state_type operator [] (const key_type& k) const
00235     {
00236       state_type q = s + self::char_traits::to_int_type(k);
00237       if ((*ml)[q] == k) return ((*ma)[q]);
00238       return (state_type) 0;
00239     }
00240 
00241     const_iterator begin() const {
00242       const_iterator r(ml->begin() + s, ma->begin() + s, 0);
00243       return ++r;
00244     }
00245 
00246     const_iterator end() const {
00247       return const_iterator(ml->begin() + s + self::char_traits::size, 
00248                             ma->begin() + s + self::char_traits::size, 
00249                             self::char_traits::size);
00250     }
00251 
00252     size_type size() const
00253     {
00254       typename self::char_traits::int_type offset;
00255       typename self::char_traits::int_type result = 0;
00256       
00257       for (offset = 0; offset < self::char_traits::size; ++offset)
00258         if ((*ml)[s + offset] != (char_type) 0 
00259             && self::char_traits::to_int_type((*ml)[s + offset]) == offset)
00260           ++result;
00261     
00262       return result;
00263     }
00264 
00265     bool empty() const { return size() == 0; }
00266 
00267     const_iterator find(const key_type &k) const
00268     {
00269       long mapped = self::char_traits::to_int_type(k);
00270       state_type result = s + mapped;
00271       if ((*ml)[result] == k)
00272         return const_iterator(ml->begin() + result, ma->begin() + result, mapped);
00273       return end();
00274     }
00275     
00276     size_type count(const key_type& k) const {
00277       return (*ml)[s + self::char_traits::to_int_type(k)] == k ? 1 : 0;
00278     }
00279 
00280     iterator lower_bound(const key_type &k) const { return find(k); }
00281 
00282     iterator upper_bound(const key_type &k) const { 
00283       iterator tmp = find(k);
00284       return tmp == end() ? tmp : ++tmp;
00285     }
00286 
00287     pair<iterator, iterator> equal_range(const key_type &k) const
00288     {
00289       return make_pair(lower_bound(k), upper_bound(k));
00290     }
00291   };
00292 
00293   // Back to DFA_compact class
00294 protected:
00295   mapable_vector<char_type>  letters;
00296   mapable_vector<state_type> aims;
00297   mapable_vector<tag_type>   tags;
00298   state_type I;
00299   unsigned long _state_count;
00300   unsigned long _trans_count;
00301 
00302 #ifndef _MSC_VER
00303   int fd;        // Memory mapping
00304 #endif
00305   set_F F;
00306 
00307   bool free_cell(state_type s) const { 
00308     return (letters[s] == (char_type) 0 && aims[s] == (state_type) 0);
00309   }
00310 
00311   bool tagged_cell(state_type s) const {
00312     return (letters[s] == (char_type) 0 && aims[s] != (state_type) 0);
00313   }
00314 
00315 public:
00316   static const state_type null_state = 0;
00317 
00318   ostream& write(ostream &out) const // binary
00319   { 
00320     out.write((const char *) &I, sizeof(I));
00321     out.write((const char *) &_state_count, sizeof(_state_count));
00322     out.write((const char *) &_trans_count, sizeof(_trans_count));
00323     letters.write(out);
00324     aims.write(out);
00325     tags.write(out);
00326     F.write(out);
00327     return out;
00328   }
00329 
00330   istream& read(istream &in) // binary
00331   {
00332     in.read((char *)&I, sizeof(I));
00333     in.read((char *)&_state_count, sizeof(_state_count));
00334     in.read((char *)&_trans_count, sizeof(_trans_count));
00335     letters.read(in);
00336     aims.read(in);
00337     tags.read(in);
00338     F.read(in);
00339     return in;
00340   }
00341 
00342 #ifndef _MSC_VER
00343   // Memory mapping
00344   bool mmap(const string &path)
00345   {
00346     fd = open(path.c_str(), O_RDONLY);
00347     if (fd == -1) return false;
00348     return mmap(fd);
00349   }
00350 
00351   bool mmap(int f)
00352   {
00353     ::read(f, (void*) &I, sizeof(I));
00354     ::read(f, (void*) &_state_count, sizeof(_state_count));
00355     ::read(f, (void*) &_trans_count, sizeof(_trans_count));
00356     if (!letters.mmap(f) || !aims.mmap(f) || !tags.mmap(f) || !F.mmap(f))
00357       return false;
00358     return true;
00359   }
00360 
00361   void munmap()
00362   {
00363     letters.munmap();
00364     aims.munmap();
00365     tags.munmap();
00366     F.munmap();
00367   }
00368 #endif
00369 
00370   state_type initial() const { 
00371     return I; 
00372   }
00373 
00374   set_F::const_reference final(state_type s) const {
00375     return F[s];
00376   }
00377 
00378   state_type delta1(state_type s, const char_type &l) const
00379   {
00380     state_type q = s + self::char_traits::to_int_type(l);
00381     if (letters[q] == l) return aims[q];
00382     return null_state;
00383   }
00384 
00385   unsigned long state_count() const { 
00386     return _state_count; 
00387   }
00388 
00389   unsigned long trans_count() const { 
00390     return _trans_count; 
00391   }
00392   
00393   const tag_type& tag(state_type s) const { 
00394     return tags[aims[s - 1]]; 
00395   }
00396 
00397   edges_type delta2(state_type s) const { 
00398     return edges_type(s, letters, aims); 
00399   }
00400 
00401   const_iterator begin() const
00402   {
00403     if (state_count() > 0)
00404       return const_iterator((state_type) 1, &letters, &aims);
00405     else
00406       return end();
00407   }
00408 
00409   const_iterator end() const {
00410     return const_iterator(letters.size(), &letters, &aims);
00411   }
00412 
00413   float density() const
00414   {
00415     unsigned long occupied_cells = 0;
00416     state_type i, fin = letters.size();
00417     
00418     for (i = 0; i != fin; ++i)
00419       if (!free_cell(i))
00420         ++occupied_cells;
00421 
00422     return ((float) occupied_cells / (float) fin);
00423   }
00424 
00425   void stats(ostream &out = cout) const
00426   {
00427     out << "Matrix size       : " << letters.size() << " cells (" 
00428         << aims.size() * sizeof(state_type) + letters.size() 
00429       * sizeof(char_type) + tags.size() * sizeof(tag_type) + 
00430       F.size() * sizeof(bool) << " bytes)" << endl;
00431     out << "Matrix density    : " << density() << endl;
00432     out << "States            : " << state_count() 
00433         << " (" << aims.size() * sizeof(state_type)
00434         << " bytes)" << endl;
00435     out << "Transitions       : " << trans_count() << " (" 
00436         << letters.size() * sizeof(char_type) 
00437         << " bytes)" << endl;
00438     out << "Tags              : " << tags.size() << " (" 
00439         << tags.size() * sizeof(tag_type)
00440         << " bytes)" << endl;
00441     out << "Final states bool : " << F.size() << " (" 
00442         << F.size() * sizeof(bool) 
00443         << " bytes)" << endl;
00444     out << "Space waste ratio : " << (1. - density()) * 100. << "%" << endl;
00445   }
00446 
00447   ~DFA_compact() {
00448 #ifndef _MSC_VER
00449     letters.munmap();
00450     aims.munmap();
00451     tags.munmap();
00452     F.munmap();
00453     if (fd > 0) close(fd);
00454 #endif
00455   }
00456 
00457   // TEMPORARY CODE:
00458   DFA_compact()
00459 #ifndef _MSC_VER
00460     : fd(0)
00461 #endif
00462   { 
00463     I = null_state;
00464   }
00465 
00466   DFA_compact(istream &in) 
00467 #ifndef _MSC_VER
00468     : fd(0)
00469 #endif
00470   {
00471     I = null_state;
00472     read(in);
00473   }
00474   
00475   DFA_compact(const DFA &dfa, long opt_value = 2) 
00476     : letters((unsigned int) (dfa.state_count() * 2 + dfa.trans_count())), 
00477       aims((unsigned int) (dfa.state_count() * 2 + dfa.trans_count())), 
00478       tags((unsigned int) 1), I(null_state), _state_count(0), _trans_count(0),
00479 #ifndef _MSC_VER
00480       fd(0), 
00481 #endif
00482       F((unsigned int) dfa.state_count())
00483   {
00484     if (dfa.state_count() > 0) {
00485       tag_mapping map_tag;
00486       typename DFA::const_iterator i, end_i = dfa.end();
00487       vector<state_type> old2new(dfa.state_count());//indexed by DFA::state_type
00488       state_type pos, solid_first_free = 0UL, first_free = 0UL;
00489       typename DFA::edges_type::const_iterator trans, edg_end;
00490       long count = 0;  // For time optimization of compacting process
00491 
00492       for (i = dfa.begin(); i != end_i; ++i) {
00493         ++count;
00494         ++_state_count;
00495 
00496         if (count == opt_value) {  // discard solid_first_free ?
00497           for(++solid_first_free; solid_first_free < letters.size() && 
00498                 !free_cell(solid_first_free); ++solid_first_free);
00499           count = 0;
00500         }
00501 
00502         while (solid_first_free + self::char_traits::size 
00503                >= letters.size()) {  // Matrix is too small
00504           // Doubling the size
00505           letters.resize(2 * letters.size(), (char_type) 0);
00506           aims.resize(2 * aims.size(), (state_type) 0);
00507         }
00508             
00509         first_free = solid_first_free;
00510 
00511         while (1) {
00512           pos = first_free + 1;  // Because we put the tag at first_free
00513           typename DFA::edges_type edg = dfa.delta2(*i);
00514           edg_end = edg.end();
00515 
00516           // Trying to put every transition by beginning at pos (first_free)
00517           for (trans = edg.begin(); trans != edg_end; ++trans)
00518             if (!free_cell(pos + self::char_traits::to_int_type((*trans).first)))
00519               break;
00520               
00521           if (trans == edg_end) {  // Ok, insert state
00522             // Storing new "name" of the state
00523             if (old2new.size() <= (unsigned long) *i)
00524               old2new.resize((unsigned long) (*i) * 2, null_state);
00525             old2new[(unsigned long) *i] = pos;
00526 
00527             // Putting the tag if not already in vector tags
00528             tag_type t_tmp = map_tag(dfa.tag(*i));
00529             typename mapable_vector<tag_type>::iterator position 
00530               = find(tags.begin() + 1, tags.end(), t_tmp);
00531             aims[pos - 1] = (state_type) (position - tags.begin()); 
00532             if (position == tags.end()) 
00533               tags.push_back(t_tmp);
00534                   
00535             // Put the state in F if needed
00536             if (dfa.final(*i)) {
00537               if (F.size() <= pos) F.resize(pos * 2, false);
00538               F[pos] = true;
00539             }
00540                   
00541             // Adding transitions
00542             unsigned long mapped_letter;
00543             for (trans = edg.begin(); trans != edg_end; ++trans) {
00544               mapped_letter = 
00545                 self::char_traits::to_int_type((*trans).first);
00546               letters[pos + mapped_letter] = (*trans).first;
00547               aims[pos + mapped_letter] = 
00548                 (state_type) (*trans).second;
00549               ++_trans_count;
00550             }
00551             if (first_free == solid_first_free)
00552               count = opt_value - 1; // Force solid_first_free updating
00553             break;    // move on to the next state
00554           }
00555                 
00556           // first_free is updated
00557           for(++first_free; first_free < letters.size() && 
00558                 !free_cell(first_free); ++first_free);
00559                 
00560           while (first_free + self::char_traits::size >= 
00561                  letters.size()) {  // Matrix is too small
00562             // Doubling the size
00563             letters.resize(2 * letters.size(), (char_type) 0);
00564             aims.resize(2 * aims.size(), (state_type) 0);
00565           }
00566         }
00567       }
00568         
00569       // Setting edges aim to states new "names"
00570       state_type last_one = 0;
00571       for (pos = 0; pos != letters.size(); ++pos)
00572         if (letters[pos] != (char_type) 0)
00573           aims[pos] = old2new[aims[pos]];
00574         else
00575           if (aims[pos] != 0) // letter == 0, aim != 0 => aim is a tag index
00576             last_one = pos + 1 + self::char_traits::size;
00577 
00578       if (dfa.initial() != dfa.null_state)
00579         I = old2new[(unsigned long) dfa.initial()];
00580       letters.resize(last_one + 1, (char_type) 0);
00581       aims.resize(last_one + 1, (state_type) 0);
00582       F.resize(last_one + 1, false);
00583     }
00584   }
00585 };
00586 
00587 template <typename DFA, typename TagMapping>
00588 const typename DFA_compact<DFA, TagMapping>::state_type 
00589 DFA_compact<DFA, TagMapping>::null_state;
00590 
00591 } // namespace astl
00592 
00593 #endif   // DFA_COMPACT_H
00594 
00595 
00596 
00597 
00598 

Generated on Sun Mar 8 02:41:32 2009 for ASTL by  doxygen 1.5.7.1