capture.h

00001 /*
00002  * ASTL - the Automaton Standard Template Library.
00003  * C++ generic components for Finite State Automata handling.
00004  * Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr).
00005  * 
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  * 
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  * 
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 #ifndef ASTL_CAPTURE_H
00023 #define ASTL_CAPTURE_H
00024 
00025 //
00026 // ** STILL UNDER CONSTRUCTION **
00027 // USE AT YOUR OWN RISKS (THAT IS, DON'T USE THIS :-)
00028 //
00029 
00030 #include <astl.h>
00031 #include <lazy.h>
00032 #include <vector>
00033 #include <iostream>
00034 #include <set>
00035 #include <iterator>
00036 #include <cctype>  
00037 #include <bitset>
00038 #include <cstring>
00039 #include <sstream>
00040 
00041 
00042 
00043 #include <cassert>
00044 
00045 #include <tr1/unordered_map>
00046 
00047 
00048 // TODO/FIXME
00049 // - final transition to unique final state that must finish the processing
00050 // - determinization of sets of capture along the path. The method src_tag()
00051 //   will return something useful for the upper layer to handle the offsets
00052 //   of captures (an adapter capture_cursor could encapsulate either a 
00053 //   regexp_capture_cursor or a lazy_cursor<regexp_capture_cursor> ?)
00054 // - horspool optimization mixed with captures (?)
00055 // - implement tag processing for plus
00056 // - replug processing of the initial state for non-left-anchored expressions
00057 // - implement intersection of configuration
00058 // - implement a transducer-like DFA_matrix with pairs <letter, action> on 
00059 //   transitions (use a second matrix ?)
00060 // - put configurations in nodes so that the state is just a set of
00061 //   node* (not thread safe)
00062 
00063 // REFACTORING/GENERALIZATION
00064 // - externalize tokenizer
00065 // - externalize parser
00066 // - externalize the node processing so that it can process a tree directly
00067 // - templatize nodes so that any alphabet can be used
00068 // => make node an abstract type and use visitors
00069 
00070 // - make the parser be able to parse a regexp on any alphabet type (?)
00071 // - merge regexp_cursor and capture_cursor since we need a capture for general
00072 //   case when the expression is not left-anchored (need to know where the
00073 //   match begins). Must not pay the price of the captures if none are present
00074 //   in the expression.
00075 
00076 // - can we manage greediness with this kind of processing ?
00077 // - can we manage {n,m} repetitions with this kind of processing ?
00078 // - can we manage approximative regexp matching ?
00079 // - can we replace the neighbor cursor with this ? (more efficient ?)
00080 // - debug trace at each point where there is a tag in the regexp ?
00081 // => generalize to markers in expression corresponding to an action to be taken
00082 //    (say, increment repetition counter, choose between greedy/non greedy,
00083 //    update a edit cost or whatever is needed). Trouble: deteminization.
00084 
00085 // OPTIMIZATIONS
00086 // - generalize boyer-moore-horspool to a set of words (?) (Tom|Sawyer|Huckleberry|Finn)
00087 // - generalize boyer-moore-horspool to sets of characters (?) ([Tt]wain)
00088 // - look for an optimization for the case ^[^ ]*?Twain (xpressive is 3 times faster)
00089 
00090 using namespace std;
00091 
00092 namespace astl {
00093 
00094   // regular expressions grammar:
00095   // exp    -> term exp2
00096   // exp2   -> + term exp2 | epsilon
00097   // term   -> form repeat term2
00098   // term2  -> . form repeat term2 | epsilon
00099   // repeat -> * repeat | ? repeat | + repeat | {n,m} repeat | epsilon
00100   // form   -> ( exp ) | letter 
00101   //
00102   // + is union, . is concatenation, epsilon is the empty word
00103   // a letter is a set of chars
00104 
00105   class regexp_capture_cursor : public cursor_concept
00106   {
00107   public:
00108     typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON, 
00109                    LETTER, OPEN, CLOSE, QUESTION, PLUS,
00110                    ANY, OPENCAPTURE, CLOSECAPTURE, FINAL } token_type;
00111   protected:  
00112     class node;
00113 
00114   public:
00115     class capture
00116     {
00117     public:
00118       capture(const node *n)
00119         : n(n)
00120       { }
00121       
00122       int        id()   const { return n->n; }
00123       token_type type() const { return n->type; }
00124 
00125       bool operator<(const capture &c) const {
00126         return id() < c.id();
00127       }
00128 
00129     protected:
00130       const node *n;
00131     };
00132 
00133     class configuration;
00134     friend ostream& operator<<(ostream&, const configuration&);
00135 
00136     class configuration : public map<capture, long>
00137     {
00138     public:
00139       bool operator<(const configuration &y) const {
00140 
00141         cout << "comparing " << *this << " and " << y << " == ";
00142 
00143         if (&y == this || empty()) 
00144           {
00145             cout << "false" << endl;
00146             return false;
00147           }
00148         // kind of lexicographical_compare
00149         if (size() < y.size()) 
00150           {
00151             cout << "false" << endl;
00152             return false;
00153           }
00154         if (size() > y.size()) 
00155           {
00156             cout << "true" << endl;
00157             return true;
00158           }
00159         const_iterator i, j;
00160         for(i = begin(), j = y.begin(); 
00161             i != end() && i->first.id() == j->first.id() && i->first.type() == j->first.type() && i->second == j->second; 
00162             ++i, ++j);
00163         if (i == end()) // equal ?
00164           {
00165             cout << "false" << endl;
00166             return false;
00167           }
00168         if (i->first.type() == CLOSECAPTURE) {
00169           if (j->first.type() == OPENCAPTURE) 
00170             {
00171             cout << "true" << endl;
00172               return true;
00173             }
00174         }
00175         else 
00176           {
00177             cout << "false" << endl;
00178             return false;
00179           }
00180         if (i->first.id() > j->first.id())
00181           {
00182             cout << "true" << endl;
00183             return true;
00184           }
00185         if (i->first.id() < j->first.id())
00186           {
00187             cout << "false" << endl;
00188             return false;
00189           }
00190         if (i->first.type() == OPENCAPTURE) // OPEN ?
00191           {
00192             cout << (i->second < j->second ? "true" : "false") << endl;
00193             return i->second < j->second;
00194           }
00195         cout << (j->second < i->second ? "true" : "false") << endl;
00196         return j->second < i->second;
00197       }
00198 
00199       configuration intersection(const configuration &x) const {
00200         configuration r;
00201         configuration::const_iterator i = begin(), j = x.begin();
00202         while(i != end() && j != x.end()) {
00203           if (i->first < j->first) ++i;
00204           else if (j->first < i->first) ++j;
00205           else if (i->second == j->second || 
00206                    (i->first.type() == OPENCAPTURE && i->second < j->second) ||
00207                    (i->first.type() != OPENCAPTURE && j->second < i->second)) {
00208             r.insert(*i);
00209             ++i;
00210             ++j;
00211           }
00212           else { 
00213             r.insert(*j);
00214             ++i;
00215             ++j;
00216           }
00217         }
00218         return r;
00219       }
00220     };
00221 
00222   protected:
00223     typedef node* position;
00224     typedef map<position, configuration> S;
00225 
00226     struct node {
00227       bitset<256> letter;
00228       token_type type;
00229       short n, m; // repeat operator
00230       bool nullable;
00231       S firstpos, lastpos, nextpos;
00232       configuration emptymatch;
00233       configuration conf;
00234     
00235       node(token_type t, short n_ = 0, short m_ = 0)
00236         : type(t), n(n_), m(m_), nullable(false) { 
00237         switch (type) {
00238         case FINAL :
00239           letter.set(0);
00240           break;
00241         case ANY :
00242           type = LETTER;
00243           letter.set();
00244           break;
00245         case OPENCAPTURE :
00246         case CLOSECAPTURE :
00247           nullable = true;
00248           break;
00249         default :;
00250         }
00251       }
00252 
00253       node(char t)
00254         : type(LETTER), nullable(false) { 
00255         letter.set((unsigned char) t);
00256       }
00257 
00258       node(const bitset<256> &t)
00259         : letter(t), type(LETTER), nullable(false)
00260       { }
00261 
00262       bool final() const {
00263         return type == FINAL;
00264       }
00265 
00266       char to_char() const {   // does not work with ranges
00267         size_t i;
00268         for(i = 0; i != letter.size() && letter.test(i) == false; ++i);
00269         return (char) (-128 + i);
00270       }
00271 
00272       string pretty() const {
00273         ostringstream r;
00274         r << "[ " << *this;
00275         for(S::const_iterator i = nextpos.begin(); i != nextpos.end(); ++i) {
00276           r << ", {";
00277           for(configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
00278             r << ' ' << j->first.id();
00279           r << " }";
00280         }
00281         r << " ]";
00282         return r.str();
00283       }
00284     };
00285 
00286   protected:
00287     S q;  // current state
00288     smart_ptr<vector<node> > e;  // node "allocator"
00289     const char *errmsg;
00290     smart_ptr<S> I;
00291     unsigned char capture_count_;
00292     configuration intersection;
00293 
00294   public:
00295     smart_ptr<horspool_finder> horspool;
00296 
00297     friend ostream& operator<<(ostream&, const regexp_capture_cursor&);
00298   protected:
00299 
00300     node* exp(vector<node>::iterator &first, vector<node>::iterator last);
00301     node* form(vector<node>::iterator &first, vector<node>::iterator last);
00302     node* term(vector<node>::iterator &first, vector<node>::iterator last);
00303     node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00304     node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00305     node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00306     void preprocess_repeats();
00307     void build_char_sets(map<string, bitset<256> >&);
00308     template <typename OutputIterator>
00309     int tokenize(const char *first, const char *last, OutputIterator x);
00310 
00311   public:
00312     friend ostream& operator<<(ostream &, const node&);
00313     ptrdiff_t errpos;
00314 
00315     typedef regexp_capture_cursor    self;
00316     typedef S                state_type;
00317     typedef set<node*>       ordered_state_type;
00318     typedef plain            char_traits;
00319     typedef plain::char_type char_type;
00320     typedef configuration    tag_type;
00321 
00322     regexp_capture_cursor(const string &expression = "", bool horspool_optimized = false)
00323       : errmsg(NULL), capture_count_(0), errpos(-1)
00324     {
00325       if (!expression.empty()) {
00326         e->reserve(128);
00327         errpos = tokenize(expression.c_str(), 
00328                           expression.c_str() + expression.size(), 
00329                           back_inserter(*e));
00330         if (errpos == -1) {
00331           preprocess_repeats();
00332           vector<node>::iterator first = e->begin();
00333           node *root = exp(first, e->end());
00334 
00335           // managed non-anchored alternatives
00336           if (root != NULL) {
00337             q = root->firstpos;
00338             for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00339               // if not beginning of line (anchor ^)
00340               if ((*i).first->letter.count() == 256 || (*i).first->letter.test(0) == false)
00341                 I->insert(*i);
00342           }
00343           else errpos = expression.size() - errpos;
00344         }
00345       }
00346     }
00347 
00348     ptrdiff_t error() const { // possible error position (-1 if expression is ok)
00349       return errpos;
00350     }
00351     
00352     const char* error_message() const { return errmsg; }
00353 
00354     self& operator=(const state_type &s) {
00355       q = s;
00356       return *this;
00357     }
00358 
00359     const state_type& src() const { return q; }
00360 
00361     bool exists(char a) const {
00362       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00363         if ((*i).first->letter.test((unsigned char) a) == true)
00364           return true;
00365       return false;
00366     }
00367 
00368     int capture_count() const { return capture_count_; }
00369 
00370     bool forward(char letter) {
00371       state_type p = *I; // for non-anchored expressions
00372       intersection.clear(); // useful ?
00373       cout << "forward(" << (int) (unsigned char) letter << ")" << endl;
00374       for(S::const_iterator i = q.begin(); i != q.end(); ++i) { // for each (position, configuration)
00375         cout << i->first << ':' << i->first->pretty() << ":=" << i->second << endl;
00376         // compute tag
00377         if (letter == '\0') { 
00378           if (i->first->letter.test((unsigned char) '\0') == true) {
00379             if (i->second < intersection)
00380               intersection = i->second;
00381           }
00382         }
00383         else if (i->first->letter.test((unsigned char) letter) == true) {
00384           if (i == q.begin())
00385             intersection = i->second;
00386           else 
00387             intersection = intersection.intersection(i->second);
00388         }
00389       }
00390        cout << "intersection " << intersection << endl;
00391       for(S::const_iterator i = q.begin(); i != q.end(); ++i) { // for each (position, configuration)
00392         if (i->first->letter.test((unsigned char) letter) == true) {
00393           state_type::const_iterator first = i->first->nextpos.begin();
00394           const state_type::const_iterator last = i->first->nextpos.end();
00395           for(; first != last; ++first) { // for each target (position, configuration)
00396             configuration tmp;// = i->second;
00397             for(configuration::const_iterator cc = i->second.begin(); cc != i->second.end(); ++cc)
00398               if (intersection.find(cc->first) == intersection.end())
00399                 tmp.insert(make_pair(cc->first, cc->second - 1)); // only those that are not in the intersection
00400             configuration::const_iterator 
00401               cfirst = first->second.begin(),
00402               clast = first->second.end();
00403             for(; cfirst != clast; ++cfirst)
00404               tmp[cfirst->first] = 0; //offset; // keep the last one always
00405             configuration &conf = p[first->first];
00406             if (tmp < conf) conf = tmp; // dedup of configurations (only one per position)
00407           }
00408         }
00409       }
00410       q.swap(p);   // optimized equivalent to q = p;
00411       return !q.empty();
00412     }
00413     
00414     bool src_final() const {
00415       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00416         if (i->first->final()) return true;
00417       return false;
00418     }
00419      
00420     bool sink() const { return q.empty(); }
00421  
00422     const tag_type& src_tag() const { return intersection; }
00423 
00424     void clear() {
00425       q.clear();
00426       e->clear();
00427       I->clear();
00428     }
00429 
00430     //     void determinize(DFA_matrix<plain> &dfa) const {
00431     //       typedef DFA_matrix<plain> dfa_type;
00432     //       map<ordered_state_type, dfa_type::state_type> old2new;
00433     //       dfa_type::state_type i = dfa.new_state();
00434     //       dfa.initial(i);
00435     //       determinize_(dfa, old2new, old2new.insert(make_pair(ordered_state_type(q.begin(), q.end()), i)).first);
00436     //     }
00437 
00438     //     void determinize_(DFA_matrix<plain> &dfa, 
00439     //                map<ordered_state_type, DFA_matrix<plain>::state_type> &old2new,
00440     //                map<ordered_state_type, DFA_matrix<plain>::state_type>::iterator p) const {
00441     //       for(int i = -128; i < 128; ++i) {
00442     //  ordered_state_type aim(I->begin(), I->end());    
00443     //  for(ordered_state_type::iterator s = p->first.begin(); s != p->first.end(); ++s) {
00444     //    if ((*s)->letter.test((unsigned char) i)) {
00445     //      aim.insert((*s)->nextpos.begin(), (*s)->nextpos.end());
00446     //    }
00447     //  }
00448     //  if (!aim.empty()) {
00449     //    pair<map<ordered_state_type, DFA_matrix<plain>::state_type>::iterator, bool> j = 
00450     //      old2new.insert(make_pair(aim, DFA_matrix<plain>::null_state));
00451     //    if (j.second == false) {
00452     //      dfa.set_trans(p->second, (char) (unsigned char) i, j.first->second);
00453     //    }
00454     //    else {
00455     //      j.first->second = dfa.new_state();
00456     //      dfa.set_trans(p->second, (char) (unsigned char) i, j.first->second);
00457     //      for(ordered_state_type::const_iterator ii = aim.begin(); ii != aim.end(); ++ii)
00458     //        if ((*ii)->final()) {
00459     //          dfa.final(j.first->second) = true;
00460     //          break;
00461     //        }
00462     //      aim.clear();
00463     //      determinize_(dfa, old2new, j.first);
00464     //    }
00465     //  }
00466     //       }
00467     //     }
00468 
00469   protected:
00470     ptrdiff_t token_error(ptrdiff_t pos, const char *message) {
00471       errmsg = message;
00472       errpos = pos;
00473       return pos;
00474     }
00475 
00476     node* syntax_error(ptrdiff_t pos, const char *message = NULL) {
00477       errmsg = message;
00478       errpos = pos;
00479       return NULL;
00480     }
00481 
00482     int next_char(bitset<256>& letter, const char *begin, const char*& first, const char *last, 
00483                   bool& literal, const map<string, bitset<256> >& char_sets, bool icase);
00484 
00485   };
00486 
00487   inline
00488   ostream& operator<<(ostream &out, const regexp_capture_cursor::configuration &c) {
00489     for(regexp_capture_cursor::configuration::const_iterator i = c.begin();
00490         i != c.end(); ++i)
00491       out << i->first.id() << ':' << i->second << ' ';
00492     return out;
00493   }
00494 
00495   inline
00496   void regexp_capture_cursor::preprocess_repeats() {
00497     vector<node> &v = *e;
00498     for(vector<node>::size_type p = 0; p < v.size();) {
00499       if (v[p].type != REPEAT) ++p;
00500       else {
00501         // look for what is repeated: letter, (exp), ?, +, *, ANY 
00502         vector<node>::size_type first = p - 1, last = p;
00503         for(; first > 0; --first) {
00504           if (v[first].type == LETTER || v[first].type == CLOSE || v[first].type == ANY)
00505             break;
00506         }
00507         if (v[first].type == CLOSE) {
00508           int count = 1;
00509           while(count != 0 && --first > 0)
00510             if (v[first].type == CLOSE) ++count;
00511             else if (v[first].type == OPEN) --count;
00512         }
00513         // the range [first, last) contains the pattern to be repeated
00514         // enclose it in parenthesis because of such things: a{0,2}{3}
00515         // which would otherwise be rewritten: a?a?{3} but shoud be written: (a?a?){3}
00516         // limiting to 128
00517         short n = (v[p].n > 128 ? 128 : v[p].n), 
00518           m = (v[p].m > 128 ? 128 : v[p].m);
00519         v.insert(v.begin() + first, node(OPEN));
00520         ++p;
00521         ++last;
00522         ++first;
00523         // remove repeat node
00524         if (n == 0)
00525           v[p] = node(QUESTION);
00526         else
00527           v.erase(v.begin() + p);
00528         vector<node> tmp;
00529         if (m == -1)
00530           tmp.reserve((last + 1 - first) * n + 1);
00531         else
00532           tmp.reserve((last + 1 - first) * m + m - n + 1);
00533 
00534         while(n > 0 || m > 0) {
00535           --n; 
00536           --m;
00537           if (n <= 0) { // end of mandatory repeats ?
00538             if (m < n) {  // case {n,}
00539               tmp.push_back(node(PLUS));
00540               continue; // end of processing
00541             }
00542             if (m == 0)
00543               continue; // end of processing
00544           }
00545           tmp.push_back(node(CONCAT));
00546           for(vector<node>::size_type tmp_p = first; tmp_p < last; ++tmp_p)
00547             tmp.push_back(v[tmp_p]);
00548           if (n <= 0)
00549             tmp.push_back(node(QUESTION));
00550         }
00551         tmp.push_back(node(CLOSE));
00552         v.insert(v.begin() + p, tmp.begin(), tmp.end());
00553         p += tmp.size();
00554       }
00555     }
00556   }
00557 
00558   inline
00559   ostream& operator<<(ostream &out, const regexp_capture_cursor::node &x) 
00560   {
00561     switch(x.type) {
00562     case regexp_capture_cursor::LETTER :
00563       if (x.letter.count() == 256)
00564         out << ".";
00565       else {
00566         if (x.letter.count() > 1)
00567           out << '[';
00568         int lower = 0;
00569         // rebuild approximatively the character set
00570         for(int i = 32; i < 257; ++i) {
00571           if (i < 256 && x.letter.test(i) == true) {
00572             if (lower == 0)
00573               lower = i;
00574           }
00575           else {
00576             if (lower > 0) {
00577               if (i - lower == 1) {
00578                 out << (unsigned char) lower;
00579               }
00580               else if (i - lower == 2) {
00581                 out << (unsigned char) lower << (unsigned char) (lower + 1);
00582               }
00583               else out << (unsigned char) lower << '-' 
00584                        << (unsigned char) (i - 1);
00585               lower = 0;
00586             }
00587           }
00588         }
00589         if (x.letter.count() > 1)
00590           out << ']';
00591       }
00592       break;
00593         
00594     case regexp_capture_cursor::OR : 
00595       out << "|"; 
00596       break;
00597         
00598     case regexp_capture_cursor::FINAL : 
00599       out << "[EoE]";
00600       break;
00601         
00602     case regexp_capture_cursor::OPEN : 
00603       out << "("; 
00604       break;
00605         
00606     case regexp_capture_cursor::CLOSE : 
00607       out << ")"; 
00608       break;
00609         
00610     case regexp_capture_cursor::CONCAT : 
00611       out << "."; 
00612       break;
00613         
00614     case regexp_capture_cursor::STAR : 
00615       out << "*"; 
00616       break;
00617         
00618     case regexp_capture_cursor::QUESTION : 
00619       out << "?"; 
00620       break;
00621         
00622     case regexp_capture_cursor::PLUS : 
00623       out << "+"; 
00624       break;
00625         
00626     case regexp_capture_cursor::REPEAT : 
00627       out << "{" << x.n << "," << x.m << "}";
00628       break;
00629         
00630     case regexp_capture_cursor::OPENCAPTURE :
00631       out << "_(" << x.n << "_";
00632       break;
00633 
00634     case regexp_capture_cursor::CLOSECAPTURE :
00635       out << "_)" << x.n << "_";
00636       break;
00637 
00638     default :
00639       break;
00640     } 
00641     return out;
00642   }
00643 
00644   inline
00645   void regexp_capture_cursor::build_char_sets(map<string, bitset<256> >& char_sets) {
00646     bitset<256> alpha, alnum, ascii, blank, cntrl, digit, word;
00647     bitset<256> graph, lower, print, punct, space, upper, xdigit;
00648     for(int c = 0; c < 256; ++c) {
00649       alnum.set(c, isalnum(c) != 0);
00650       alpha.set(c, isalpha(c) != 0);
00651       ascii.set(c, isascii(c) != 0);
00652       cntrl.set(c, iscntrl(c) != 0);
00653       digit.set(c, isdigit(c) != 0);
00654       graph.set(c, isgraph(c) != 0);
00655       lower.set(c, islower(c) != 0);
00656       print.set(c, isprint(c) != 0);
00657       punct.set(c, ispunct(c) != 0);
00658       space.set(c, isspace(c) != 0);
00659       upper.set(c, isupper(c) != 0);
00660       xdigit.set(c, isxdigit(c) != 0);
00661     }
00662     char_sets["[:alnum:]"] = alnum;
00663     char_sets["[:^alnum:]"] = ~ char_sets["[:alnum:]"];
00664     char_sets["[:alpha:]"] = alpha;
00665     char_sets["[:^alpha:]"] = ~ char_sets["[:alpha:]"];
00666     char_sets["[:ascii:]"] = ascii;
00667     char_sets["[:^ascii:]"] = ~ char_sets["[:ascii:]"];
00668     blank.set((unsigned char) ' ');
00669     blank.set((unsigned char) '\t');
00670     char_sets["[:blank:]"] = blank;
00671     char_sets["[:^blank:]"] = ~ char_sets["[:blank:"];
00672     char_sets["[:cntrl:]"] = cntrl;
00673     char_sets["[:^cntrl:]"] = ~ char_sets["[:cntrl:]"];
00674     char_sets["[:digit:]"] = digit;
00675     char_sets["[:^digit:]"] = ~ char_sets["[:digit:]"];
00676     char_sets["[:graph:]"] = graph;
00677     char_sets["[:^graph:]"] = ~ char_sets["[:graph:]"];
00678     char_sets["[:lower:]"] = lower;
00679     char_sets["[:^lower:]"] = ~ char_sets["[:lower:]"];
00680     char_sets["[:print:]"] = print;
00681     char_sets["[:^print:]"] = ~ char_sets["[:print:]"];
00682     char_sets["[:punct:]"] = punct;
00683     char_sets["[:^punct:]"] = ~ char_sets["[:punct:]"];
00684     char_sets["[:space:]"] = space;
00685     char_sets["[:^space:]"] = ~ char_sets["[:space:]"];
00686     char_sets["[:upper:]"] = upper;
00687     char_sets["[:^upper:]"] = ~ char_sets["[:upper:]"];
00688     char_sets["[:xdigit:]"] = xdigit;
00689     char_sets["[:^xdigit:]"] = ~ char_sets["[:xdigit:]"];
00690     alnum.set((unsigned char) '_');
00691     char_sets["[:word:]"] = alnum;
00692     char_sets["[:^word:]"] = ~ char_sets["[:word:]"];
00693     char_sets["\\d"] = digit;
00694     char_sets["\\D"] = ~ char_sets["\\d"];
00695     char_sets["\\s"] = space;
00696     char_sets["\\S"] = ~ char_sets["\\s"];
00697     char_sets["\\w"] = alnum;
00698     char_sets["\\W"] = ~alnum;
00699     char_sets["\\t"] = bitset<256>().set((unsigned char) '\t');
00700     char_sets["\\n"] = bitset<256>().set((unsigned char) '\n');
00701     char_sets["\\r"] = bitset<256>().set((unsigned char) '\r');
00702     char_sets["\\f"] = bitset<256>().set((unsigned char) '\f');
00703     char_sets["\\a"] = bitset<256>().set((unsigned char) '\a');
00704     char_sets["\\e"] = bitset<256>().set((unsigned char) '\x1B');
00705     char_sets["\\v"] = bitset<256>().set((unsigned char) '\v');
00706     char_sets["\\V"] = ~ char_sets["\\v"];
00707     char_sets["\\h"] = bitset<256>().set((unsigned char) ' ');
00708     char_sets["\\H"] = ~ char_sets["\\h"];
00709   }
00710 
00711   // read char at position first and advance first
00712   // may return an empty char when encountering \E
00713   inline
00714   int regexp_capture_cursor::next_char(bitset<256>& letter, const char *begin, const char*& first, 
00715                                        const char *last, bool& literal, 
00716                                        const map<string, bitset<256> >& char_sets, bool icase) {
00717     letter.reset();
00718 
00719     // anchores
00720     if ((*first == '$' || *first == '^') && !literal) {
00721       letter.set((unsigned char) 0); // beginning/end of line is '\0'
00722       ++first;
00723       return -1;
00724     }
00725 
00726     if (*first == '\\') {
00727       if (++first == last) 
00728         return token_error(first - begin, "unterminated escaped sequence");
00729 
00730       // quote sequence \Q ... \E
00731       if (*first == 'Q') {
00732         if (literal) {
00733           letter.set((unsigned char) '\\');
00734           return -1;
00735         }
00736         else {
00737           literal = true;
00738           if (++first == last) 
00739             return token_error(first - begin, "unterminated quote sequence");
00740           return next_char(letter, begin, first, last, literal, char_sets, icase);
00741         }
00742       }
00743       if (*first == 'E') {
00744         if (!literal) {
00745           letter.set((unsigned char) 'E');
00746           if (icase)
00747             letter.set((unsigned char) 'e');
00748         }
00749         else literal = false;
00750         ++first;
00751         return -1;
00752       }
00753 
00754       if (literal) {
00755         letter.set((unsigned char) '\\');
00756         return -1;
00757       }
00758       
00759       if (*first == 'x') { // hexadecimal character \x01 through \xFF
00760         if (++first == last)
00761           return token_error(first - begin, "\\x used with no following hex digits");
00762         int n = 0;
00763         if (*first >= '0' && *first <= '9') {
00764           n = 16 * (*first - '0'); 
00765         }
00766         else 
00767           if (*first >= 'A' && *first <= 'F') {
00768             n = 16 * (10 + *first - 'A'); 
00769           }
00770           else 
00771             if (*first >= 'a' && *first <= 'f') {
00772               n = 16 * (10 + *first - 'a'); 
00773             }
00774             else return token_error(first - begin, "\\x used with no following hex digits");
00775         
00776         if (++first == last)
00777           return token_error(first - begin, "\\x used with only one following hex digit");
00778         
00779         if (*first >= '0' && *first <= '9') {
00780           n += *first - '0'; 
00781         }
00782         else 
00783           if (*first >= 'A' && *first <= 'F') {
00784             n += 10 + (*first - 'A'); 
00785           }
00786           else
00787             if (*first >= 'a' && *first <= 'f') {
00788               n += 10 + (*first - 'a'); 
00789             }
00790             else return token_error(first - begin, "\\x used with only one following hex digit");
00791         letter.set(n);
00792         ++first;
00793         return -1;
00794       }
00795       
00796       if (*first == 'c') { // control character \cA through \cZ maps to \x01 \x1A
00797         if (++first == last || *first < 'A' || *first > 'Z')
00798           return token_error(first - begin, "\\c used with no following upper-case letter");
00799         letter.set(*first - 'A' + 1);
00800         ++first;
00801         return -1;
00802       }
00803 
00804       // look for \s, \S, \w, \W, \d, \D, \n, \t, \v, \V, \r, \a, \e, \f, \h, \H
00805       std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string("\\") + *first);
00806       if (c != char_sets.end()) {
00807         letter = c->second;
00808         ++first;
00809         return -1;
00810       }
00811 
00812       if (*first == '.') {
00813         letter.set((unsigned char) '.');
00814         ++first;
00815         return -1;
00816       }
00817     }
00818 
00819     if (*first == '.') {
00820       if (literal)
00821         letter.set((unsigned char) '.');
00822       else
00823         letter.set();
00824     }
00825     else
00826       if (icase && isalpha(*first)) {
00827         letter.set((unsigned char) tolower((unsigned char) *first));
00828         letter.set((unsigned char) toupper((unsigned char) *first));
00829       }
00830       else letter.set((unsigned char) *first);
00831     ++first;
00832     return -1;
00833   }
00834 
00835   // Return error position or -1 if ok
00836   template <typename OutputIterator>
00837   int regexp_capture_cursor::tokenize(const char *first, const char *last, OutputIterator x)
00838   {
00839     static map<string, bitset<256> > char_sets;
00840     if (char_sets.empty())
00841       build_char_sets(char_sets);
00842 
00843     *x++ = node(OPEN);
00844     node previous = node(OPEN);
00845   
00846     bool literal = false;
00847     std::vector<char> parenthesis; // contains '(' or 'i' for (?i:
00848     bool icase = false;
00849     std::vector<int> open_captures;
00850     int current_open_capture = 0;
00851 
00852     for(const char *begin = first; first != last; ++first, ++x) {
00853       if (literal && *first != '\\') {
00854         if (previous.type != OPEN && previous.type != OR) 
00855           *x++ = node(CONCAT);
00856         if (icase && isalpha(*first)) {
00857           bitset<256> c;
00858           c.set((unsigned char) tolower(*first));
00859           c.set((unsigned char) toupper(*first));
00860           *x = previous = node(c);
00861         }
00862         else
00863           *x = previous = node(*first);
00864         continue;
00865       }
00866 
00867       switch (*first) {
00868       case '|' :
00869         if (first == begin)
00870           return token_error(0, "missing left part of alternative");
00871         *x = previous = node(OR);
00872         break;
00873 
00874       case '(' : 
00875         if (previous.type != OPEN && previous.type != OR) 
00876           *x++ = node(CONCAT);
00877         if (first + 2 < last && first[1] == '?' && first[2] == ':') { // non capturing parenthesis (?:
00878           *x = previous = node(OPEN);
00879           first += 2;
00880           parenthesis.push_back('(');
00881         }
00882         else {
00883           if (first + 3 < last && first[1] == '?' && 
00884               first[2] == 'i' && first[3] == ':') { // case insensitive (?i:
00885             first += 3;
00886             if (icase == false) {
00887               parenthesis.push_back('i');
00888               icase = true;
00889             }
00890             else parenthesis.push_back('(');
00891           }
00892           else { // capture
00893             ++capture_count_;
00894             *x++ = node(OPEN);
00895             *x++ = node(OPENCAPTURE, current_open_capture);
00896             open_captures.push_back(current_open_capture);
00897             current_open_capture += 2;
00898             *x++ = node(CONCAT);
00899             parenthesis.push_back('c');
00900           }
00901           *x = previous = node(OPEN);
00902         }
00903         break;
00904 
00905       case ')' : 
00906         if (parenthesis.empty())
00907           return token_error(first - begin, "unbalanced parenthesis");
00908         if (parenthesis.back() == 'i')
00909           icase = false;
00910         else 
00911           if (parenthesis.back() == 'c') { // end of capture ?
00912             ++capture_count_;
00913             *x++ = node(CLOSE);
00914             *x++ = node(CONCAT);
00915             *x++ = node(CLOSECAPTURE, open_captures.back() + 1);
00916             open_captures.pop_back();
00917           }
00918         *x = previous = node(CLOSE);
00919         parenthesis.pop_back();
00920         break;
00921 
00922       case '*' :
00923         if (first == begin)
00924           return token_error(0, "missing expression before operator *");
00925         *x = previous = node(STAR);
00926         break;
00927 
00928       case '?' :
00929         if (first == begin)
00930           return token_error(0, "missing expression before operator ?");
00931         *x = previous = node(QUESTION);
00932         break;
00933 
00934       case '+' :
00935         if (first == begin)
00936           return token_error(0, "missing expression before operator +");
00937         *x = previous = node(PLUS);
00938         break;
00939 
00940       case '{' : // {n,m}
00941         if (first == begin)
00942           return token_error(0, "missing expression before repeat operator");
00943         if (++first == last) 
00944           return token_error(first - begin, "unterminated expression");
00945         {
00946           int n = 0;
00947           const char *tmp = first;
00948           while (isdigit(*first)) {
00949             n = n * 10 + *first - '0';
00950             if (++first == last) 
00951               return token_error(first - begin, "unterminated repeat operator");
00952           }
00953           int m = n;
00954           if (*first == ',') {
00955             if (++first == last) 
00956               return token_error(first - begin, "unterminated repeat operator");
00957             tmp = first;
00958             for(m = 0; isdigit(*first);) {
00959               m = m * 10 + *first - '0';
00960               if (++first == last) 
00961                 return token_error(first - begin, "unterminated repeat operator");
00962             }
00963             if (tmp == first) m = -1; // comma but no upper bound: {n,}
00964           }
00965           if (*first != '}') 
00966             return token_error(first - begin, "invalid repeat operator");
00967 
00968           // errors: {0} {0,0} {,0}
00969           if (n > 32768 || m > 65535 || (n > m && m > -1) || (n == m && n == 0)) 
00970             return token_error(first - begin, "wrong range for repeat operator");
00971 
00972           // special cases: {0,1} {,1} {0,} {1,} {1,1}
00973           if (n == 0 && m == 1)
00974             *x = node(QUESTION);
00975           else if (n == 0 && m == -1)
00976             *x = node(STAR);
00977           else if (n == 1 && m == -1)
00978             *x = node(PLUS);
00979           else if (!(n == 1 &&  m == 1))
00980             *x = node(REPEAT, n, m);
00981         }
00982         break;
00983 
00984       case '[' :   // characters set
00985         {
00986           if (previous.type != OPEN && previous.type != OPENCAPTURE && previous.type != OR) 
00987             *x++ = node(CONCAT);
00988           bitset<256> a;
00989           bool v = true;
00990           if (++first == last) 
00991             return token_error(first - begin, "unterminated expression");
00992           if (*first == '^') {  // a negation ?
00993             a.set();
00994             a.set(0, false);
00995             v = false;
00996             if (++first == last) 
00997               return token_error(first - begin, "unterminated expression");
00998           }
00999           if (*first == '-') {  // "escaped" hyphen at the beginning ?
01000             a.set((unsigned char) '-', v);
01001             ++first;
01002           }
01003           else
01004             if (*first == ']') {  // "escaped" closing square bracket at the beginning ?
01005               a.set((unsigned char) ']', v);
01006               ++first;
01007             }
01008           for(; first != last && *first != ']'; ++first) {
01009             if (*first == '[') {
01010               if (++first == last) break;
01011               if (*first != ':') {
01012                 a.set((unsigned char) '[', v);
01013               }
01014               else { // a predefined set as [:digits:]
01015                 const char *start = first - 1;
01016                 first = std::find(++first, last, ']');
01017                 if (first == last) break;
01018                 std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string(start, first + 1));
01019                 if (c == char_sets.end())
01020                   return token_error(start - begin, "unknown character set");
01021                 if (v == true)
01022                   a |= c->second;
01023                 else a &= ~c->second;
01024                 continue;
01025               }
01026             }
01027             // TODO: next_char manages ranges and return a character set (pass a boolean for 
01028             // saying this is a character set context and '-' as a special meaning, good for
01029             // . also, see below the ugly hack)
01030             bitset<256> tmp; // will possibly be used as a range lower bound
01031             if (*first != '-') {  // not a potential character range ?
01032               switch (*first) {
01033               case '^' :
01034               case '$' :
01035                 tmp.set((unsigned char) *first++);
01036                 break;
01037               default :
01038                 {
01039                   int r = next_char(tmp, begin, first, last, literal, char_sets, icase);
01040                   if (r != -1)
01041                     return r;
01042                   // ugly hack for dot
01043                   if (tmp.count() == 256) {
01044                     tmp.reset();
01045                     tmp.set((unsigned char) '.'); // . is a literal in a character set
01046                   }
01047                 }
01048               }
01049               if (v == true)
01050                 a |= tmp;
01051               else a &= ~ tmp; // negative definition
01052               if (*first != '-') {
01053                 --first;
01054                 continue;
01055               }
01056             }
01057             // now *first == '-'
01058             if (++first == last) 
01059               return token_error(first - begin, "unterminated expression");
01060             if (*first == ']') {  // unterminated character range ?
01061               a.set((unsigned char) '-', v);
01062               break; // end of character set
01063             }
01064             else { // character range [a-z]
01065               if (tmp.count() != 1) { // a bound must not be a character set
01066                 return token_error(first - 2 - begin, "wrong character range: the lower bound cannot be a character set");
01067               }
01068               bitset<256> upper_bound;
01069               switch (*first) {
01070               case '^' :
01071               case '$' :
01072                 upper_bound.set((unsigned char) *first);
01073                 break;
01074               default :
01075                 {
01076                   int r = next_char(upper_bound, begin, first, last, literal, char_sets, icase);
01077                   --first;
01078                   if (r != -1)
01079                     return token_error(first - begin, "wrong character range");
01080                   // ugly hack for dot
01081                   if (upper_bound.count() == 256) {
01082                     upper_bound.reset();
01083                     upper_bound.set((unsigned char) '.'); // . is a literal in a character set
01084                   }
01085                 }
01086               }
01087               if (upper_bound.count() != 1) // a bound must not be a character set
01088                 return token_error(first - begin, "wrong character range: the upper bound cannot be a character set");
01089 
01090               unsigned int int_lower_bound = 0, int_upper_bound = 0;
01091               while (tmp.test(int_lower_bound) == false) 
01092                 ++int_lower_bound;
01093               while (upper_bound.test(int_upper_bound) == false)
01094                 ++int_upper_bound;
01095 
01096               // guess what, lower bound should be lower or equal to upper bound
01097               if (int_upper_bound < int_lower_bound)
01098                 return token_error(first - begin, "wrong character range: lower bound cannot be greater than upper bound");
01099               else
01100                 for(unsigned int i = int_lower_bound; i <= int_upper_bound; ++i)
01101                   a.set(i, v);
01102             }
01103           }
01104           // first is supposed to point to ']'
01105           if (first == last) 
01106             return token_error(first - begin, "unterminated expression");
01107           *x = previous = node(a);
01108           break;
01109         }
01110 
01111       default :
01112         bitset<256> next_letter;
01113         int r = next_char(next_letter, begin, first, last, literal, char_sets, icase);
01114         if (r != -1)
01115           return r;
01116         if (next_letter.any()) {
01117           if (previous.type != OPEN && previous.type != OR) 
01118             *x++ = node(CONCAT);
01119           if (next_letter.count() == 256) // catch-all dot 
01120 #ifndef UTF8
01121             *x = previous = node(ANY);
01122 #else
01123           {
01124             *x++ = node(OPEN);
01125             next_letter.reset();
01126             for(int i = 1; i < 0x7F; ++i)
01127               next_letter.set(i);
01128             *x++ = node(next_letter);
01129             *x++ = node(OR);
01130             next_letter.reset();
01131             for(int i = 0; i < 32; ++i)
01132               next_letter.set(i | 192);
01133             *x++ = node(next_letter);
01134             *x++ = node(CONCAT);
01135             *x++ = node(ANY);
01136             *x = previous = node(CLOSE);
01137           }
01138 #endif
01139           else
01140             *x = previous = node(next_letter);
01141         }
01142         --first;
01143         break;
01144       }
01145     }
01146     *x++ = node(CLOSE);
01147     *x++ = node(CONCAT);
01148     *x++ = node(FINAL);
01149     *x++ = node(CONCAT);
01150     *x++ = node(FINAL);
01151     return -1;
01152   }
01153 
01154   // exp -> term exp2    
01155   inline
01156   regexp_capture_cursor::node*
01157   regexp_capture_cursor::exp(vector<regexp_capture_cursor::node>::iterator &first, 
01158                              vector<regexp_capture_cursor::node>::iterator last)
01159   {
01160     if (first == last)
01161       return syntax_error(last - first, "empty expression");
01162     node *t = term(first, last);
01163     assert(t != 0);
01164     if (t == 0) return 0;
01165     return exp2(first, last, t);
01166   }
01167 
01168   // exp2 -> + term exp2 | epsilon
01169   inline
01170   regexp_capture_cursor::node*
01171   regexp_capture_cursor::exp2(vector<regexp_capture_cursor::node>::iterator &first, 
01172                               vector<regexp_capture_cursor::node>::iterator last, node *left)
01173   {
01174     assert(left != 0);
01175     if (first != last && first->type == OR) {
01176       node &root = *first;
01177       node *right = term(++first, last);
01178       if (right == 0) return 0;
01179       root.nullable = right->nullable || left->nullable;
01180       // firstpos(here) = firstpos(left) U firstpos(right): 
01181       root.firstpos = left->firstpos;
01182       root.firstpos.insert(right->firstpos.begin(), right->firstpos.end()); 
01183       root.lastpos = left->lastpos;
01184       root.lastpos.insert(right->lastpos.begin(), right->lastpos.end());
01185       if (left->nullable)
01186         root.emptymatch = left->emptymatch;
01187       else
01188         root.emptymatch = right->emptymatch;
01189       return exp2(first, last, &root);
01190     }
01191     return left;
01192   }
01193 
01194   // term -> form repeat term2
01195   inline
01196   regexp_capture_cursor::node*
01197   regexp_capture_cursor::term(vector<regexp_capture_cursor::node>::iterator &first, 
01198                               vector<regexp_capture_cursor::node>::iterator last)
01199   {
01200     if (first == last) 
01201       return syntax_error(last - first, "unterminated expression");
01202     node *f = form(first, last);
01203     assert(f != 0);
01204     if (f == 0) return 0;
01205     node *s = repeat(first, last, f);  // return f if no repeat token is found
01206     return term2(first, last, s);
01207   }
01208 
01209   // repeat -> * repeat | ? repeat | + repeat | epsilon
01210   inline
01211   regexp_capture_cursor::node*
01212   regexp_capture_cursor::repeat(vector<regexp_capture_cursor::node>::iterator &first, 
01213                                 vector<regexp_capture_cursor::node>::iterator last, node *left)
01214   {
01215     assert(left != 0);
01216     if (first != last)
01217       switch (first->type) {
01218       case STAR :
01219       case PLUS :
01220         {
01221           node &root = *first;
01222           root.nullable = first->type == STAR;
01223           root.firstpos = left->firstpos;
01224           root.lastpos = left->lastpos;
01225           if (left->nullable)
01226             root.emptymatch = left->emptymatch;
01227           // nextpos(lastpos(here)) += firstpos(here):
01228           for(S::const_iterator i = root.lastpos.begin(); i != root.lastpos.end(); ++i)
01229             for(S::const_iterator j = root.firstpos.begin(); j != root.firstpos.end(); ++j) {
01230               configuration tmp = i->second;
01231               tmp.insert(j->second.begin(), j->second.end());
01232               (*i).first->nextpos.insert(make_pair(j->first, tmp));
01233             }
01234           return repeat(++first, last, &root);
01235         }
01236       case QUESTION : 
01237         {
01238           node &root = *first;
01239           root.nullable = true;
01240           root.firstpos = left->firstpos;
01241           root.lastpos = left->lastpos;
01242           if (left->nullable)
01243             root.emptymatch = left->emptymatch;
01244           return repeat(++first, last, &root);
01245         }
01246       default : 
01247         break;
01248       }
01249     return left;
01250   }
01251     
01252 
01253   // term2 -> . form repeat term2 | epsilon
01254   inline
01255   regexp_capture_cursor::node*
01256   regexp_capture_cursor::term2(vector<regexp_capture_cursor::node>::iterator &first, 
01257                                vector<regexp_capture_cursor::node>::iterator last, node *left)
01258   {
01259     if (first != last && first->type == CONCAT) {
01260       node &root = *first;
01261       node *f = form(++first, last);
01262       assert(f != 0);
01263       if (f == 0) return 0; 
01264       node *right = repeat(first, last, f);
01265       root.nullable = left->nullable && right->nullable;
01266 
01267       root.firstpos = left->firstpos;
01268       if (left->nullable) {
01269         for(S::const_iterator s = right->firstpos.begin(); s != right->firstpos.end(); ++s) {
01270           configuration tmp = s->second;
01271           tmp.insert(left->emptymatch.begin(), left->emptymatch.end());
01272           root.firstpos.insert(make_pair(s->first, tmp));
01273         }
01274       }
01275 
01276       root.lastpos = right->lastpos;    
01277       if (right->nullable) {
01278         for(S::const_iterator s = left->lastpos.begin(); s != left->lastpos.end(); ++s) {
01279           configuration tmp = s->second;
01280           tmp.insert(right->emptymatch.begin(), right->emptymatch.end());
01281           root.lastpos.insert(make_pair(s->first, tmp));
01282         }
01283       }
01284 
01285       root.emptymatch = left->emptymatch;
01286       root.emptymatch.insert(right->emptymatch.begin(), right->emptymatch.end());
01287 
01288       // nextpos(lastpos(left)) += firstpos(right):
01289       for(S::const_iterator i = left->lastpos.begin(); i != left->lastpos.end(); ++i)
01290         for(S::const_iterator j = right->firstpos.begin(); j != right->firstpos.end(); ++j) {
01291           configuration tmp = j->second;
01292           tmp.insert(i->second.begin(), i->second.end());
01293           i->first->nextpos.insert(make_pair(j->first, tmp));     
01294         }
01295       return term2(first, last, &root);
01296     }
01297     return left;
01298   }
01299 
01300 
01301   // form -> ( exp ) | letter
01302   inline
01303   regexp_capture_cursor::node*
01304   regexp_capture_cursor::form(vector<regexp_capture_cursor::node>::iterator &first, 
01305                               vector<regexp_capture_cursor::node>::iterator last)
01306   {
01307     if (first == last) 
01308       return syntax_error(last - first, "unterminated expression");
01309     if (first->type == OPEN) {
01310       node *e = exp(++first, last);
01311       assert(e != 0);
01312       if (e == 0) return 0;
01313       if (first->type == CLOSE) ++first;
01314       else return syntax_error(last - first, "unbalanced parenthesis");
01315       return e;
01316     }
01317     if (first->type == OPENCAPTURE || first->type == CLOSECAPTURE) {
01318       first->emptymatch[&*first] = 0;
01319       ++first;
01320       return &(first[-1]);
01321     }
01322     first->nullable = false;
01323     first->firstpos.insert(make_pair(&*first, configuration()));
01324     first->lastpos.insert(make_pair(&*first, configuration()));
01325     ++first;
01326     return &(first[-1]);
01327   }
01328 
01329   inline
01330   ostream& operator<<(ostream& out, const regexp_capture_cursor& c) {
01331     for(vector<regexp_capture_cursor::node>::const_iterator n = c.e->begin(); n != c.e->end(); ++n) {
01332       if (c.q.find(const_cast<astl::regexp_capture_cursor::node*>(&*n)) != c.q.end())
01333         out << "\033[1m" << *n << "\033[0m";
01334       else
01335         out << *n;
01336     }
01337     return out;
01338   }
01339 
01340   //   inline
01341   //   regexp_capture_cursor regexpc(const string &exp) {
01342   //     return regexp_capture_cursor(exp.c_str());
01343   //   }
01344 
01345   //   inline
01346   //   regexp_capture_cursor regexpc() {
01347   //     return regexp_capture_cursor();
01348   //   }
01349 
01350   template <typename Tag, typename SigmaOut>
01351   class DFT_matrix_mini : public DFA_matrix_mini<plain, Tag>
01352   {
01353   public:
01354     typedef DFA_matrix_mini<plain, Tag> super;
01355     typedef typename super::state_type state_type;
01356     typedef typename super::char_type char_type;
01357     typedef SigmaOut output_type;
01358 
01359     state_type new_state() {
01360       state_type q = super::new_state();
01361       // FIXME: optimize
01362       output_slots.resize(q * 256 + 256, false);
01363       return q;
01364     }
01365 
01366     output_type* output(state_type q, char_type c) {
01367       return output_slots[pack(q, c)] == true ? &outputs[pack(q, c)] : NULL;
01368     }
01369 
01370     void set_output(state_type q, char_type c, const output_type &o) {
01371       unsigned int packed = pack(q, c);
01372       output_slots[packed] = true;
01373       outputs[packed] = o;
01374     }
01375 
01376   protected:
01377     tr1::unordered_map<unsigned int, SigmaOut> outputs;
01378     vector<bool> output_slots;
01379 
01380     unsigned int pack(state_type q, char_type c) const {
01381       return (((unsigned int) q) << 8) | (unsigned char) c;
01382     }
01383   };
01384 
01385   template <>
01386   class lazy_cursor<regexp_capture_cursor, DFT_matrix_mini> : public cursor_concept
01387   {
01388   public:
01389     typedef regexp_capture_cursor Cursor;
01390 #ifdef USE_HASH_TABLE
01391     typedef DFT_matrix_mini<const regexp_capture_cursor::ordered_state_type*,
01392                             regexp_capture_cursor::tag_type> DFA_type;
01393 #else
01394     typedef DFT_matrix_mini<const regexp_capture_cursor::state_type*,
01395                             regexp_capture_cursor::tag_type> DFA_type;
01396 #endif
01397 
01398   protected:
01399 #ifdef USE_HASH_TABLE
01400     typedef std::map<regexp_capture_cursor::ordered_state_type, DFA_type::state_type> mapper;
01401 #else
01402     typedef std::map<regexp_capture_cursor::state_type, DFA_type::state_type> mapper;
01403 #endif
01404 
01405     typedef unsigned long offset_type;
01406 
01407     Cursor               c;
01408     smart_ptr<DFA_type>  dfa;
01409     smart_ptr<mapper>    mapping;  
01410     DFA_type::state_type my_sink;
01411     DFA_type::state_type q;
01412 
01413   public:
01414     typedef lazy_cursor           self;
01415     typedef DFA_type::state_type  state_type;
01416     typedef Cursor::char_type     char_type;
01417     typedef Cursor::char_traits   char_traits;
01418     typedef Cursor::configuration configuration;
01419     typedef configuration         tag_type;
01420 
01421   protected:
01422     const configuration *conf;
01423 
01424   public:
01425     lazy_cursor(const Cursor &x)
01426       : c(x), my_sink(dfa->new_state()), q(DFA_type::null_state), conf(NULL)
01427     { 
01428       if (!x.sink()) {
01429         q = dfa->new_state();
01430 #ifdef USE_HASH_TABLE
01431         dfa->tag(q) = &(mapping->insert(make_pair(regexp_capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), q)).first->first);
01432 #else
01433         dfa->tag(q) = &(mapping->insert(make_pair(c.src(), q)).first->first);
01434 #endif
01435         dfa->final(q) = c.src_final();
01436         dfa->initial(q);
01437       }
01438     }
01439 
01440     int capture_count() const { return c.capture_count(); }
01441 
01442     const Cursor& adaptee() {
01443       if (q != DFA_type::null_state)
01444         c = regexp_capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01445       return c;
01446     }
01447     
01448     const DFA_type& cache() const {
01449       return *dfa;
01450     }
01451     
01452     state_type src() const {
01453       return q;
01454     }
01455     
01456     self& operator=(state_type p) {
01457       q = p;
01458       return *this;
01459     }
01460     
01461     bool src_final() const {
01462       return dfa->final(q);
01463     }
01464     
01465     // return relative offsets
01466     const tag_type& src_tag() const {
01467       static const configuration empty;
01468       if (conf != NULL)
01469         return *conf;
01470       return empty;
01471     }
01472     
01473     bool sink() const {
01474       return q == DFA_type::null_state;
01475     }
01476     
01477     state_type sink_state() const {
01478       return DFA_type::null_state;
01479     }
01480 
01481     bool exists(const char_type &a) const {
01482       state_type aim = dfa->delta1(q, a);
01483       if (aim == DFA_type::null_state) {
01484         Cursor tmp = c;
01485 #ifdef USE_HASH_TABLE
01486         tmp = regexp_capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01487 #else
01488         tmp = *dfa->tag(q);
01489 #endif
01490         return tmp.exists(a);
01491       }
01492       return aim != my_sink;
01493     }
01494 
01495     bool forward(const char_type &a) {
01496       state_type tmp = dfa->delta1(q, a);
01497       if (tmp == my_sink) {
01498         q = DFA_type::null_state; // we already know there is no transition with 'a'
01499         return false;
01500       }
01501       if (tmp == DFA_type::null_state) {    // transition not in the cache ?
01502         c = regexp_capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01503         if (c.forward(a)) {   // any transition labelled with 'a' ?
01504 #ifdef USE_HASH_TABLE
01505           std::pair<mapper::iterator, bool> p = 
01506             mapping->insert(std::make_pair(regexp_capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), DFA_type::null_state));
01507 #else
01508           std::pair<mapper::iterator, bool> p = 
01509             mapping->insert(std::make_pair(c.src(), DFA_type::null_state));
01510 #endif
01511           if (p.second == true) { // target state is not dejavu ?
01512             tmp = dfa->new_state();
01513             dfa->final(tmp) = c.src_final();
01514             dfa->tag(tmp) = &(p.first->first);
01515             p.first->second = tmp;
01516           }
01517           else tmp = p.first->second;
01518           dfa->set_trans(q, a, tmp);
01519           if (!c.src_tag().empty()) {
01520             // cout << "dfa->set_output(" << q << ", " << a << ", " << c.src_tag() << ")" << endl;
01521             dfa->set_output(q, a, c.src_tag());
01522           }
01523         }
01524         else {
01525           dfa->set_trans(q, a, my_sink);
01526           q = DFA_type::null_state;
01527           return false;
01528         }
01529       }
01530 
01531 //       conf = dfa->output(q, a);
01532 //       if (conf != NULL) {
01533 //      for(regexp_capture_cursor::configuration::const_iterator ic = conf->begin(); ic != conf->end(); ++ic) {
01534 //        cout << "captures[" << ic->first.id() << "] displacement " << ic->second << endl;
01535 //        //      (*captures)[ic->first.id()] = offset + ic->second;
01536 //      }
01537 //       }
01538 //       //      ++offset;
01539 
01540       conf = dfa->output(q, a); // how to avoid this if we don't need
01541                                 // capture and don't need to pay the price
01542       q = tmp;
01543       return true;
01544     }
01545   };
01546 
01547 #if 0
01548   // match algorithm specializations for regexp
01549 
01550   inline
01551   bool match(lazy_cursor<regexp_capture_cursor>& c, const char *w)
01552   {
01553     if (!c.forward('\0')) // ^ anchoring
01554       return false;
01555     for(; *w != '\0' && c.forward(*w); ++w)
01556       if (c.src_final()) return true;
01557     return *w == '\0' && c.forward('\0') && c.src_final(); // $ anchoring
01558   }
01559 
01560   template <typename InputI>
01561   inline
01562   bool match(lazy_cursor<regexp_capture_cursor>& c, InputI first, InputI last)
01563   {
01564     if (!c.forward('\0'))  // ^ anchoring
01565       return false;
01566     for(; first != last && c.forward(*first); ++first)
01567       if (c.src_final()) return true;
01568     return first == last && c.forward('\0') && c.src_final(); // $ anchoring
01569   }
01570 
01571   template <typename ForwardI>
01572   inline
01573   ForwardI first_match(lazy_cursor<regexp_capture_cursor> &c, ForwardI first, ForwardI last)
01574   {
01575     if (!c.forward('\0')) return first;
01576     const ForwardI start = first;
01577     for(; first != last && c.forward(*first); ++first)
01578       if (c.src_final()) return ++first;
01579     return first == last && c.forward('\0') && c.src_final() ? first : start;
01580   }
01581 
01582   inline
01583   const char*
01584   first_match(lazy_cursor<regexp_capture_cursor> &c, const char *text)
01585   {
01586     if (!c.forward('\0')) return text;
01587     const char *start = text;
01588     for(; *text != '\0' && c.forward(*text); ++text)
01589       if (c.src_final()) return ++text;
01590     return *text == '\0' && c.forward('\0') && c.src_final() ? text : start;
01591   }
01592 
01593 #endif
01594 
01595   template <typename ForwardI>
01596   inline 
01597   bool astl_capture(lazy_cursor<regexp_capture_cursor, DFT_matrix_mini> &c, ForwardI first, ForwardI last, vector<unsigned long> &offsets) {
01598     for(; first != last && c.forward(*first); ++first);
01599     if (first == last && c.forward('\0') && c.src_final()) {
01600       offsets = c.src_tag();
01601       return true;
01602     }
01603     return false;
01604   }
01605 
01606 #if 0
01607   template <typename ForwardI>
01608   inline
01609   ForwardI first_match(regexp_capture_cursor &c, ForwardI first, ForwardI last)
01610   {
01611     //    if (!c.forward('\0')) return first;
01612     const ForwardI start = first;
01613     for(regexp_capture_cursor::state_type::const_iterator i = c.src().begin(); i != c.src().end(); ++i) {
01614       cout << "< " << i->first << ':' << *i->first << ", { ";
01615       for(regexp_capture_cursor::configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
01616         cout << (int) *j << " ";
01617       cout << "} > ";
01618     }
01619     cout << endl;
01620     for(; first != last && c.forward(*first); ++first) {
01621       for(regexp_capture_cursor::state_type::const_iterator i = c.src().begin(); i != c.src().end(); ++i) {
01622         cout << "< " << i->first << ':' << *i->first << ", { ";
01623         for(regexp_capture_cursor::configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
01624           cout << (int) *j << " ";
01625         cout << "} > ";
01626       }
01627       cout << endl;
01628       if (c.src_final()) return ++first;
01629     }
01630     return first == last && c.forward('\0') && c.src_final() ? first : start;
01631   }
01632 
01633 #endif
01634 
01635 
01636 }
01637 
01638 #endif

Generated on Sun Mar 8 02:41:30 2009 for ASTL by  doxygen 1.5.7.1