fa_compress.h

00001 /*
00002  * ASTL - the Automaton Standard Template Library.
00003  * C++ generic components for Finite State Automata handling.
00004  * Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr).
00005  * 
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  * 
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  * 
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 #ifndef LILIAN_FA_COMPRESS_H
00023 #define LILIAN_FA_COMPRESS_H
00024 
00025 
00026 #include <concept.h>
00027 #include <alphabet.h>
00028 #include <tag.h>
00029 #include <tools.h>
00030 
00031 #ifndef _MSC_VER
00032 #include <unistd.h>   // close(int)
00033 #include <sys/mman.h>  // mmap
00034 #include <fcntl.h>     // read
00035 #include <errno.h>
00036 #endif
00037 
00038 #include <vector>
00039 #include <utility>     // pair<>
00040 #include <functional>  // identity<>
00041 #include <algorithm>   // rotate<>()
00042 
00043 using namespace std;
00044 
00045 // Notes: transition access time is linear
00046 //        can be binary read/written from/to disk and memory mapped
00047 //        provided that *** the tag type is a POD type ***
00048 //        *** do not use the letter '\0' ***
00049 
00050 namespace astl {
00051 
00052 template <typename CharTraits = plain, typename TagType = empty_tag>
00053 class FA_compress : public FA_concept
00054 {
00055 public:
00056   typedef CharTraits       char_traits;
00057   typedef typename CharTraits::char_type char_type;
00058   typedef unsigned int     state_type;
00059   typedef TagType          tag_type;
00060   typedef FA_concept       concept;
00061 
00062 #ifndef _MSC_VER
00063 protected:
00064 #else
00065 public:
00066 #endif
00067   typedef mapable_vector<bool> set_F;
00068   typedef FA_compress self;
00069 
00070   mapable_vector<TagType>    tags;
00071   mapable_vector<char_type>  letters;
00072   mapable_vector<state_type> aims_tags;
00073 
00074 public:
00075   class const_iterator : public iterator<forward_iterator_tag, state_type>
00076   {
00077   protected:
00078     state_type state;
00079     const mapable_vector<char_type> *ml;
00080 
00081   public:
00082     const_iterator() {}
00083     const_iterator(state_type x, const mapable_vector<char_type> &a)
00084       : state(x), ml(&a) 
00085     { }
00086 
00087     state_type operator*() const { 
00088       return state; 
00089     }
00090 
00091     const_iterator& operator++() {
00092       while ((*ml)[++state] != (char_type) 0); // sentry at the end anyway
00093       return *this;
00094     }
00095 
00096     const_iterator operator ++ (int) {
00097       const_iterator tmp = *this;
00098       ++(*this);
00099       return tmp;
00100     }
00101 
00102     bool operator==(const const_iterator &x) const {
00103       return (state == x.state);
00104     }
00105     bool operator != (const const_iterator &x) const {
00106       return (!(*this == x));
00107     }
00108   };
00109 
00110   class edges_type
00111   {
00112   private:
00113     //    const mapable_vector<char_type> *a;
00114     //    const mapable_vector<state_type> *p;
00115     state_type q;
00116     const FA_compress *fa;
00117 
00118   public:
00119     edges_type()
00120     { }
00121 
00122     edges_type(state_type s, const FA_compress &f)
00123       //               const mapable_vector<char_type> &i,
00124       //               const mapable_vector<state_type> &j)
00125       //      : a(&i), p(&j), q(s)
00126       : q(s + 1), fa(&f)
00127     { }
00128 
00129     typedef char_type key_type;
00130     typedef pair<const key_type, state_type> value_type;
00131     typedef less<char_type> key_compare;
00132 
00133     typedef value_type* pointer;
00134     typedef value_type& reference;
00135     typedef const value_type& const_reference;
00136     typedef state_type size_type;
00137     typedef int difference_type;
00138     
00139     struct value_compare 
00140       : public binary_function<value_type, value_type, bool>
00141     {
00142       key_compare comp;
00143       value_compare(key_compare c) : comp(c) { }
00144       bool operator ()(const value_type& x, const value_type& y) {
00145         return comp(x.first, y.first);
00146       }
00147     };
00148 
00149     class const_iterator 
00150       : iterator<bidirectional_iterator_tag, value_type>
00151     {
00152     protected:
00153       typename mapable_vector<char_type>::const_iterator a;
00154       typename mapable_vector<state_type>::const_iterator q;
00155 
00156     public:
00157       typedef pair<char_type, state_type> value_type;
00158 
00159       const_iterator() 
00160       { }
00161 
00162       const_iterator(typename mapable_vector<char_type>::const_iterator i,
00163                      typename mapable_vector<state_type>::const_iterator j)
00164         : a(i), q(j)
00165       { }
00166 
00167       // end of range:
00168       const_iterator(typename mapable_vector<char_type>::const_iterator i)
00169         : a(i)
00170       { }
00171         
00172       bool operator==(const const_iterator &x) const {
00173         return (*a == (char_type) 0 && *x.a == (char_type) 0) // end
00174                                                               // of range
00175           || (a == x.a && q == x.q);
00176       }
00177 
00178       pair<char_type, state_type> operator *() const {
00179         return make_pair(*a, *q);
00180       }
00181 
00182       const_iterator& operator ++() {
00183         ++a; ++q;
00184         return *this;
00185       }
00186 
00187       const_iterator operator ++(int) {
00188         const_iterator tmp = (*this);
00189         ++(*this);
00190         return tmp;
00191       }
00192 
00193       const_iterator& operator --() {
00194         --a; --q;
00195         return (*this);
00196       }
00197       const_iterator operator --(int) {
00198         const_iterator tmp = (*this);
00199         --(*this);
00200         return tmp;
00201       }
00202     };
00203 
00204     // Back to edges_type class
00205 
00206     key_compare key_comp() const { 
00207       key_compare tmp;
00208       return (tmp); 
00209     }
00210 
00211     value_compare value_comp() const { 
00212       return value_compare(key_comp()); 
00213     } 
00214 
00215     /*
00216     state_type operator[](key_type k) const
00217     {
00218       typename mapable_vector<char_type>::const_iterator i;
00219       for(i = a->begin() + q + 1;
00220           *i != (char_type) 0 && *i != k; ++i);
00221       return *i == (char_type) 0 ? null_state : (*p)[i - a->begin()];
00222     }
00223     */
00224 
00225     const_iterator begin() const {
00226       return const_iterator(fa->letters.begin() + q, fa->aims_tags.begin() + q);
00227       //      return const_iterator(a->begin() + q + 1, p->begin() + q + 1);
00228     }
00229 
00230     const_iterator end() const {
00231       return const_iterator(fa->letters.end() - 1);
00232       //      return const_iterator(a->end() - 1);
00233     }
00234 
00235     /*
00236     size_type size() const {
00237       return find(a->begin() + q + 1, a->end(), 
00238                   (char_type) 0) - (a->begin() + q + 1);
00239     }
00240     */
00241 
00242     bool empty() const {
00243       return fa->letters[q] == (char_type) 0;
00244       //      return (*a)[q + 1] == (char_type) 0;
00245     }
00246 
00247     /*
00248     const_iterator find(const key_type &k) const
00249     {
00250       typename mapable_vector<char_type>::const_iterator i; 
00251       for(i = a->begin() + q + 1;
00252           *i != (char_type) 0 && *i != k; ++i);
00253       return *i == (char_type) 0 ? 
00254         end() : const_iterator(i, p->begin() + (i - a->begin()));
00255     }
00256 
00257     size_type count(const key_type& k) const
00258     {
00259       const_iterator i = find(k);
00260       size_type c = 0;
00261       for(; i->first == k; ++i, ++c);
00262       return c;
00263     }
00264     */
00265 
00266     // assume transitions are sorted:
00267     const_iterator lower_bound(const key_type &k) const {
00268       const_iterator j;
00269       for(j = begin(); j != end() && (*j).first < k; ++j);
00270       return j;
00271     }
00272 
00273     // assume transitions are sorted:
00274     const_iterator upper_bound(const key_type &k) const {
00275       const_iterator j;
00276       for(j = begin(); j != end() && !(k < (*j).first); ++j);
00277       return j;
00278     }
00279 
00280     pair<const_iterator, const_iterator> equal_range(const key_type &k) const
00281     {
00282       return make_pair(lower_bound(k), upper_bound(k));
00283     }
00284   };
00285 
00286   // Back to FA class
00287 protected:
00288   state_type initial_;
00289   set<state_type> I;
00290   unsigned int _state_count;
00291   unsigned int _trans_count;
00292 
00293   int fd;        // Memory mapping under UNIX
00294   set_F F;
00295 
00296 public:
00297 #ifndef _MSC_VER
00298   static const state_type null_state = 0;
00299 #else
00300   static const state_type null_state;
00301 #endif
00302 
00303   bool write(const string &path) {
00304     ofstream out(path.c_str(), ios::out | ios::binary);
00305     if (out && write(out)) {
00306       out.close();
00307       return true;
00308     }
00309     return false;
00310   }
00311 
00312   bool write(ostream &out) const // binary
00313   { 
00314     out.write((const char *) &initial_, sizeof(initial_));
00315     out.write((const char *) &_state_count, sizeof(_state_count));
00316     out.write((const char *) &_trans_count, sizeof(_trans_count));
00317     letters.write(out);
00318     aims_tags.write(out);
00319     tags.write(out);
00320     F.write(out);
00321     return (bool) out;
00322   }
00323 
00324   bool read(const string &path) {
00325     ifstream in(path.c_str(), ios::in | ios::binary);
00326     if (in && read(in)) {
00327       in.close();
00328       return true;
00329     }
00330     return false;
00331   }
00332 
00333   bool read(istream &in) // binary
00334   {
00335     in.read((char *)&initial_, sizeof(initial_));
00336     in.read((char *)&_state_count, sizeof(_state_count));
00337     in.read((char *)&_trans_count, sizeof(_trans_count));
00338     I.clear();
00339     I.insert(initial_);
00340     return letters.read(in) && aims_tags.read(in) && 
00341       tags.read(in) && F.read(in);
00342   }
00343 
00344 #ifndef _MSC_VER
00345   bool mmap(const string &path)  // memory mapping
00346   {
00347     fd = open(path.c_str(), O_RDONLY);
00348     if (fd == -1) return false;
00349     return mmap(fd);
00350   }
00351 
00352   bool mmap(int f)
00353   {
00354     ::read(f, (void*) &initial_, sizeof(initial_));
00355     ::read(f, (void*) &_state_count, sizeof(_state_count));
00356     ::read(f, (void*) &_trans_count, sizeof(_trans_count));
00357     I.clear();
00358     I.insert(initial_);
00359     return letters.mmap(f) && aims_tags.mmap(f) 
00360       && tags.mmap(f) && F.mmap(f);
00361   }
00362 
00363   void munmap()
00364   {
00365     initial_ = null_state;
00366     I.clear();
00367     letters.munmap();
00368     aims_tags.munmap();
00369     tags.munmap();
00370     F.munmap();
00371   }
00372 #endif
00373 
00374   state_type initial() const { 
00375     return initial_; 
00376   }
00377 
00378   const set<state_type>& initials() const {
00379     return I;
00380   }
00381 
00382   set_F::const_reference final(state_type s) const {
00383     return F[s];
00384   }
00385 
00386   state_type delta1(state_type s, char_type l) const
00387   {
00388     typename mapable_vector<char_type>::const_iterator a 
00389       = letters.begin() + s + 1;
00390     for(; *a != (char_type) 0 && *a != l; ++a);
00391     return *a == (char_type) 0 ? 
00392       null_state : aims_tags[a - letters.begin()];
00393   }
00394 
00395   template <typename OutputIterator>
00396   OutputIterator delta1(state_type q, char_type c, OutputIterator x) const
00397   {
00398     typename mapable_vector<char_type>::const_iterator a 
00399       = letters.begin() + q + 1;
00400     for(; *a != (char_type) 0 && *a != c; ++a);
00401     for(; *a != (char_type) 0 && *a == c; ++a, ++x)
00402       *x = aims_tags[a - letters.begin()];
00403     return x;
00404   }
00405 
00406   unsigned int state_count() const { 
00407     return _state_count; 
00408   }
00409 
00410   unsigned int trans_count() const { 
00411     return _trans_count; 
00412   }
00413   
00414   const tag_type& tag(state_type s) const { 
00415     return tags[aims_tags[s]]; 
00416   }
00417 
00418   // beware if tags are compressed:
00419   tag_type& tag(state_type s) { 
00420     return tags[aims_tags[s]]; 
00421   }
00422 
00423   edges_type delta2(state_type s) const { 
00424     return edges_type(s, *this);
00425     //    return edges_type(s, letters, aims_tags); 
00426   }
00427 
00428   const_iterator begin() const
00429   {
00430     return state_count() > 0 ?
00431       const_iterator((state_type) 1, letters) : end();
00432   }
00433 
00434   const_iterator end() const {
00435     return const_iterator(letters.size() - 1, letters);
00436   }
00437 
00438 //   void del_trans(state_type q, char_type c) {
00439 //      mapable_vector<char_type>::iterator last
00440 //        = letters.begin() + q + 1;
00441 //      for(; *last != (char_type) 0 && *last != c; ++last); // find
00442 //      mapable_vector<char_type>::iterator first = last, middle = last;
00443 //      for(; *last != (char_type) 0; ++last) // count
00444 //        if (*last == c) middle = last + 1;
00445 //      std::rotate(first, middle, last + 1);
00446 //      std::rotate(aims_tags.begin() + (first - letters.begin()),
00447 //               aims_tags.begin() + (middle - letters.begin()),
00448 //               aims_tags.begin() + (last - letters.begin()) + 1);
00449 //   }
00450 
00451   vector<char_type> del_trans(state_type q, state_type p) {
00452      typename mapable_vector<char_type>::iterator last
00453        = letters.begin() + q + 1;
00454      typename mapable_vector<char_type>::iterator first = last;
00455      for(; *last != (char_type) 0; ++last);
00456      vector<char_type> v;
00457      v.reserve(128);
00458      for(++last; *first != '\0';) {
00459        if (aims_tags[first - letters.begin()] == p) {
00460          v.push_back(*first);
00461          std::rotate(aims_tags.begin() + (first - letters.begin()), 
00462                      aims_tags.begin() + (first - letters.begin()) + 1, 
00463                      aims_tags.begin() + (last - letters.begin()) - 1);
00464          // the tag position is pointed to by last
00465          std::rotate(first, first + 1, last);
00466        }
00467        else ++first;
00468      }
00469      return v;
00470   }
00471 
00472   ~FA_compress() {
00473 #ifndef _MSC_VER
00474     letters.munmap();
00475     aims_tags.munmap();
00476     tags.munmap();
00477     F.munmap();
00478     if (fd > 0) close(fd);
00479 #endif
00480   }
00481   
00482   FA_compress()
00483     : initial_(null_state), fd(0) { 
00484     I.insert(initial_);
00485   }
00486   
00487   FA_compress(istream &in) 
00488     : initial_(null_state), fd(0) {
00489     read(in);
00490   }
00491   
00492   void clear() {
00493     letters.clear();
00494     aims_tags.clear();
00495     tags.clear();
00496     F.clear();
00497     fd = 0;
00498     initial_ = null_state;
00499     I.clear();
00500     I.insert(initial_);
00501   }
00502 
00503   template <typename BFirstCursor>
00504   void compress(BFirstCursor i, bool tag_compression = true) {
00505     compress(i, BFirstCursor(), tag_compression);
00506   }
00507 
00508   template <typename BFirstCursor>
00509   void compress(BFirstCursor i, BFirstCursor j, bool tag_compression = true) {
00510     clear();
00511     if (i == j) return;
00512     tags.reserve(8192);
00513     letters.reserve(8192);
00514     letters.push_back((char_type) 0);
00515     aims_tags.reserve(8192);
00516     aims_tags.push_back(0);
00517     F.reserve(8192);
00518     F.push_back(false);
00519 #ifndef _MSC_VER
00520     typedef typename BFirstCursor::state_type bstate_type;
00521 #else
00522     typedef BFirstCursor::state_type bstate_type;
00523 #endif
00524     map<bstate_type, safe<state_type> > old2new;
00525     vector<bstate_type> new2old;
00526     new2old.push_back(bstate_type());
00527     bstate_type ii = i.src();
00528 #ifndef _MSC_VER
00529     map<bstate_type, pair<typename BFirstCursor::tag_type, bool> > sinks;
00530 #else
00531     map<bstate_type, pair<BFirstCursor::tag_type, bool> > sinks;
00532 #endif
00533 
00534     while (i != j) {
00535       safe<state_type> &q = old2new[i.src()];
00536       if (q == null_state) {
00537         sinks.erase(i.src());
00538         q = letters.size();
00539         letters.push_back((char_type) 0);
00540         new2old.push_back(bstate_type());
00541         F.push_back(i.src_final());
00542         typename mapable_vector<tag_type>::iterator k = tags.end();
00543         tag_type tmp_tag;
00544         tmp_tag = i.src_tag();
00545         if (tag_compression) 
00546           k = find(tags.begin(), tags.end(), tmp_tag);
00547 
00548         if (k == tags.end()) {
00549           tags.push_back(tmp_tag);
00550           k = tags.end() - 1;
00551         }
00552 
00553         aims_tags.push_back(k - tags.begin());
00554         
00555         set<pair<char_type, bstate_type> > tmp;
00556         do {
00557           tmp.insert(make_pair(i.letter(), i.aim()));
00558           safe<state_type> &p = old2new[i.aim()]; 
00559           if (p == null_state)
00560             // may be a transition-less state:
00561             sinks.insert(make_pair(i.aim(), make_pair(i.aim_tag(), 
00562                                                       i.aim_final())));
00563         }
00564         while (i.next());
00565         
00566         for(typename set<pair<char_type, bstate_type> >::const_iterator jj
00567               = tmp.begin(); jj != tmp.end(); ++jj) {
00568           letters.push_back(jj->first);
00569           new2old.push_back(jj->second);
00570           F.push_back(false);
00571           aims_tags.push_back(0);
00572         }
00573       }
00574       else // q != 0, dejavu
00575         while (i.next());
00576     }
00577 
00578     for(unsigned int k = 1; k < new2old.size(); ++k)
00579       if (letters[k] != (char_type) 0) {
00580         safe<state_type> &p = old2new[new2old[k]];
00581         if (p == null_state) {
00582           // trans pointing to a transition-less state, add it
00583           p = letters.size();
00584           letters.push_back((char_type) 0);
00585           new2old.push_back(bstate_type());
00586 #ifndef _MSC_VER
00587           pair<typename BFirstCursor::tag_type, bool> pp = sinks[new2old[k]];
00588 #else
00589           pair<BFirstCursor::tag_type, bool> pp = sinks[new2old[k]];
00590 #endif
00591           F.push_back(pp.second);
00592           typename mapable_vector<tag_type>::iterator k = tags.end();
00593           tag_type tmp;
00594           tmp = pp.first;
00595           if (tag_compression) 
00596             k = find(tags.begin(), tags.end(), tmp);
00597           if (k == tags.end()) {
00598             tags.push_back(tmp);
00599             k = tags.end() - 1;
00600           }
00601           aims_tags.push_back(k - tags.begin());
00602         }
00603         aims_tags[k] = p;
00604       }
00605         
00606     letters.push_back((char_type) 0);
00607     I.clear();
00608 //      for(typename set<typename ForwardCursor::state_type>::const_iterator i 
00609 //        = init.begin(); i != init.end(); ++i)
00610 //        I.insert(old2new[*i]);
00611 //      if (I.size() == 1) 
00612 //        initial_ = *I.begin();
00613 //      else
00614 //        initial_ = null_state;
00615     initial_ = old2new[ii];
00616     I.insert(initial_);
00617     _state_count = old2new.size();
00618     _trans_count = letters.size() - _state_count - 2;
00619   }
00620 };
00621 
00622 #ifndef _MSC_VER
00623 template <typename T, typename U>
00624 const typename FA_compress<T, U>::state_type 
00625 FA_compress<T, U>::null_state;
00626 #else
00627 template <typename T, typename U>
00628 const typename FA_compress<T, U>::state_type 
00629 FA_compress<T, U>::null_state = 0;
00630 #endif
00631 
00632 } // namespace astl
00633 
00634 #endif   // ASTL_DFA_COMPACT_H
00635 

Generated on Sun Mar 8 02:41:34 2009 for ASTL by  doxygen 1.5.7.1