experiment.h

00001 /*
00002  * ASTL - the Automaton Standard Template Library.
00003  * C++ generic components for Finite State Automata handling.
00004  * Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr).
00005  * 
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  * 
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  * 
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 #ifndef ASTL_EXPERIMENT_H
00023 #define ASTL_EXPERIMENT_H
00024 
00025 //
00026 // ** STILL UNDER CONSTRUCTION **
00027 // USE AT YOUR OWN RISKS (THAT IS, DON'T USE THIS :-)
00028 //
00029 
00030 #include "astl.h"
00031 #include "lazy.h"
00032 #include <vector>
00033 #include <iostream>
00034 #include <set>
00035 #include <iterator>
00036 #include <cctype>  
00037 #include <bitset>
00038 #include <cstring>
00039 #include <sstream>
00040 
00041 
00042 
00043 #include <cassert>
00044 
00045 #include <tr1/unordered_map>
00046 
00047 
00048 // TODO/FIXME
00049 // - determinization of sets of capture along the path. The method src_tag()
00050 //   will return something useful for the upper layer to handle the offsets
00051 //   of captures (an adapter capture_cursor could encapsulate either a 
00052 //   regexp_capture_cursor or a lazy_cursor<regexp_capture_cursor> ?)
00053 // - horspool optimization mixed with captures (?)
00054 // - better implementation for the DFT_matrix_mini ?
00055 
00056 // REFACTORING/GENERALIZATION
00057 // - externalize tokenizer
00058 // - externalize parser
00059 // - externalize the node processing so that it can process a tree directly
00060 // - templatize nodes so that any alphabet can be used
00061 // => make node an abstract type and use visitors
00062 
00063 // - make the parser be able to parse a regexp on any alphabet type (?)
00064 // - merge regexp_cursor and capture_cursor since we need a capture for general
00065 //   case when the expression is not left-anchored (need to know where the
00066 //   match begins). Must not pay the price of the captures if none are present
00067 //   in the expression.
00068 
00069 // - can we manage greediness with this kind of processing ?
00070 // - can we manage {n,m} repetitions with this kind of processing ?
00071 // - can we manage approximative regexp matching ?
00072 // - can we replace the neighbor cursor with this ? (more efficient ?)
00073 // - debug trace at each point where there is a tag in the regexp ?
00074 // => generalize to markers in expression corresponding to an action to be taken
00075 //    (say, increment repetition counter, choose between greedy/non greedy,
00076 //    update a edit cost or whatever is needed). Trouble: geneiric determinization.
00077 
00078 // OPTIMIZATIONS
00079 // - generalize boyer-moore-horspool to a set of words (?) (Tom|Sawyer|Huckleberry|Finn)
00080 // - generalize boyer-moore-horspool to sets of characters (?) ([Tt]wain)
00081 // - look for an optimization for the case ^[^ ]*?Twain (xpressive is 3 times faster)
00082 
00083 #ifdef USE_HASH_TABLE
00084 #undef USE_HASH_TABLE
00085 #endif
00086 
00087 using namespace std;
00088 
00089 namespace astl {
00090 
00091   // regular expressions grammar:
00092   // exp    -> term exp2
00093   // exp2   -> + term exp2 | epsilon
00094   // term   -> form repeat term2
00095   // term2  -> . form repeat term2 | epsilon
00096   // repeat -> * repeat | ? repeat | + repeat | {n,m} repeat | epsilon
00097   // form   -> ( exp ) | letter 
00098   //
00099   // + is union, . is concatenation, epsilon is the empty word
00100   // a letter is a set of chars
00101 
00102   class non_deterministic_regexp_capture_cursor : public cursor_concept
00103   {
00104   public:
00105     typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON, 
00106                    LETTER, OPEN, CLOSE, QUESTION, PLUS,
00107                    ANY, OPENCAPTURE, CLOSECAPTURE, FINAL, GRANDFINAL } token_type;
00108   protected:  
00109     class node;
00110 
00111   public:
00112     class capture
00113     {
00114     public:
00115       capture(const node *n)
00116         : n(n)
00117       { }
00118       
00119       int        id()    const { return n->n; }
00120       int        index() const { return n->m; }
00121       token_type type()  const { return n->type; }
00122 
00123       bool operator<(const capture &c) const {
00124         return id() < c.id();
00125       }
00126 
00127       bool operator==(const capture &c) const {
00128         return n == c.n;
00129       }
00130 
00131     protected:
00132       const node *n;
00133     };
00134 
00135     class configuration;
00136     friend ostream& operator<<(ostream&, const configuration&);
00137 
00138     class configuration : public map<capture, long>
00139     {
00140     public:
00141       configuration& operator--() {
00142         for(iterator i = begin(); i != end(); ++i)
00143           --(i->second);
00144         return *this;
00145       }
00146 
00147 
00148       bool operator<(const configuration &y) const {
00149 
00150         cout << "comparing " << *this << " and " << y << " == ";
00151 
00152         if (&y == this || empty()) 
00153           {
00154             cout << "false" << endl;
00155             return false;
00156           }
00157 
00158         const_iterator i, j;
00159         for(i = begin(), j = y.begin(); 
00160             i != end() && j != y.end() && i->first.id() == j->first.id() && i->second == j->second; 
00161             ++i, ++j);
00162         if (i == end() && j == y.end())
00163           {
00164             cout << "false" << endl;
00165             return false;
00166           }
00167 
00168         // check validity of this:
00169         if (i == end())
00170           {
00171             cout << "false" << endl;
00172             return false;
00173           }
00174 
00175         // check validity of this:
00176         if (j == y.end())
00177           {
00178             cout << "true" << endl;
00179             return true;
00180           }
00181 
00182         // check validity of this:
00183         if (i->first.id() != j->first.id())
00184           {
00185             cout << (i->first.id() < j->first.id() ? "true" : "false") << endl;
00186             return i->first.id() < j->first.id();
00187           }
00188 
00189         // left-most longest: 
00190         // * left-most OPENCAPTURE are preferred,
00191         // * right-most CLOSECAPTURE are preferred 
00192         // * empty match is better than no match
00193         if (i->first.type() == OPENCAPTURE) // OPEN ?
00194           {
00195             cout << (i->second < j->second ? "true" : "false") << endl;
00196             return i->second < j->second;
00197           }
00198         cout << (j->second < i->second ? "true" : "false") << endl;
00199         return j->second < i->second;
00200       }
00201 
00202 
00203       configuration intersection(const configuration &x) const {
00204         cout << "intersection(" << *this << ", " << x << ") == ";
00205         if (empty()) {
00206           cout << *this << endl;
00207           return *this;
00208         }
00209         if (x.empty()) {
00210           cout << x << endl;
00211           return x;
00212         }
00213         configuration r;
00214         configuration::const_iterator i = begin(), j = x.begin();
00215         while (i != end() && j != x.end()) {
00216           if (i->first.id() < j->first.id()) {
00217             ++i;
00218           }
00219           else if (j->first.id() < i->first.id()) {
00220             ++j;
00221           }
00222           else {
00223             if (i->second == j->second) {
00224               r.insert(*j);
00225             }
00226             ++j;
00227             ++i;
00228           }
00229         }
00230         cout << r << endl;
00231         return r;
00232       }
00233 
00234     };
00235 
00236   protected:
00237     typedef node* position;
00238     typedef map<position, configuration> S;
00239 
00240     struct node {
00241       bitset<256> letter;
00242       token_type type;
00243       short n, m; // repeat operator
00244       bool nullable;
00245       S firstpos, lastpos, nextpos;
00246       configuration emptymatch;
00247       // configuration conf;
00248     
00249       node(token_type t, short n_ = 0, short m_ = 0)
00250         : type(t), n(n_), m(m_), nullable(false) { 
00251         switch (type) {
00252         case FINAL :
00253           letter.set(0);
00254           break;
00255         case ANY :
00256           type = LETTER;
00257           letter.set();
00258 
00259 
00260 
00261           // letter.set(0, false); // don't match '\0'
00262 
00263 
00264 
00265           break;
00266         case OPENCAPTURE :
00267         case CLOSECAPTURE :
00268           nullable = true;
00269           break;
00270         default :;
00271         }
00272       }
00273 
00274       node(char t)
00275         : type(LETTER), nullable(false) { 
00276         letter.set((unsigned char) t);
00277       }
00278 
00279       node(const bitset<256> &t)
00280         : letter(t), type(LETTER), nullable(false)
00281       { }
00282 
00283       bool final() const {
00284         return type == FINAL;
00285       }
00286 
00287       char to_char() const {   // does not work with ranges
00288         size_t i;
00289         for(i = 0; i != letter.size() && letter.test(i) == false; ++i);
00290         return (char) (-128 + i);
00291       }
00292 
00293       string pretty() const {
00294         ostringstream r;
00295         r << "[ " << *this;
00296         for(S::const_iterator i = nextpos.begin(); i != nextpos.end(); ++i) {
00297           r << ", {";
00298           for(configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
00299             r << ' ' << j->first.id();
00300           r << " }";
00301         }
00302         r << " ]";
00303         return r.str();
00304       }
00305     };
00306 
00307   protected:
00308     S q;  // current state
00309     smart_ptr<vector<node> > e;  // node "allocator"
00310     const char *errmsg;
00311     smart_ptr<S> I;
00312     unsigned char capture_count_;
00313     configuration intersection;
00314 
00315   public:
00316     smart_ptr<horspool_finder> horspool;
00317 
00318   protected:
00319     friend ostream& operator<<(ostream&, const non_deterministic_regexp_capture_cursor&);
00320 
00321     node* exp(vector<node>::iterator &first, vector<node>::iterator last);
00322     node* form(vector<node>::iterator &first, vector<node>::iterator last);
00323     node* term(vector<node>::iterator &first, vector<node>::iterator last);
00324     node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00325     node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00326     node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00327     void preprocess_repeats();
00328     void build_char_sets(map<string, bitset<256> >&);
00329     template <typename OutputIterator>
00330     int tokenize(const char *first, const char *last, OutputIterator x);
00331 
00332   public:
00333     friend ostream& operator<<(ostream &, const node&);
00334     ptrdiff_t errpos;
00335 
00336     typedef non_deterministic_regexp_capture_cursor    self;
00337     typedef S                state_type;
00338     typedef set<node*>       ordered_state_type;
00339     typedef plain            char_traits;
00340     typedef char_traits::char_type char_type;
00341     typedef configuration    tag_type;
00342 
00343     non_deterministic_regexp_capture_cursor(const string &expression = "", bool horspool_optimized = false)
00344       : errmsg(NULL), capture_count_(0), errpos(-1)
00345     {
00346       if (!expression.empty()) {
00347 
00348         // bourinnage a l'ancienne:
00349         const string enhanced = string(".*(") + expression + ')';
00350 
00351         e->reserve(128);
00352 //      errpos = tokenize(expression.c_str(), 
00353 //                        expression.c_str() + expression.size(), 
00354 //                        back_inserter(*e));
00355         errpos = tokenize(enhanced.c_str(), 
00356                           enhanced.c_str() + enhanced.size(), 
00357                           back_inserter(*e));
00358         if (errpos == -1) {
00359           preprocess_repeats();
00360           vector<node>::iterator first = e->begin();
00361           node *root = exp(first, e->end());
00362 
00363           // managed non-anchored alternatives
00364           if (root != NULL) {
00365             q = root->firstpos;
00366 #if 0
00367             for(S::iterator i = q.begin(); i != q.end(); ++i)
00368               // if not beginning of line (anchor ^)
00369               if ((*i).first->letter.count() == 256 || (*i).first->letter.test(0) == false)
00370                 I->insert(make_pair(i->first, i->second));
00371 #endif
00372           }
00373           else errpos = expression.size() - errpos;
00374         }
00375       }
00376     }
00377 
00378     ptrdiff_t error() const { // possible error position (-1 if expression is ok)
00379       return errpos;
00380     }
00381     
00382     const char* error_message() const { return errmsg; }
00383 
00384     self& operator=(const state_type &s) {
00385       q = s;
00386       return *this;
00387     }
00388 
00389     const state_type& src() const { return q; }
00390 
00391     bool exists(char a) const {
00392       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00393         if ((*i).first->letter.test((unsigned char) a) == true)
00394           return true;
00395       return false;
00396     }
00397 
00398     int capture_count() const { return capture_count_; }
00399 
00400     bool forward(char letter) {
00401       state_type p; // = *I; // for non-anchored expressions
00402       cout << string(80, '_') << endl << "forward(" << (int) (unsigned char) letter << ")\tI == ";
00403       for(S::const_iterator i = I->begin(); i != I->end(); ++i) {
00404         cout << '\t' << i->first << ':' << i->first->pretty() << ":=" << i->second;
00405       }
00406       cout << endl;
00407       // q.insert(I->begin(), I->end());
00408       for(S::const_iterator i = q.begin(); i != q.end(); ++i) { // for each (position, configuration)
00409         cout <<  "\033[1m" << "position: " << i->first << ':' << i->first->pretty() 
00410              << ":=" << i->second << "\033[0m" << endl;
00411         if (i->first->letter.test((unsigned char) letter) == true) {
00412           state_type::const_iterator target = i->first->nextpos.begin();
00413           const state_type::const_iterator last = i->first->nextpos.end();
00414           for(; target != last; ++target) { // for each target (position, configuration)
00415             configuration tmp = i->second;
00416             --tmp; // shift configuration offsets
00417             // union of configurations:
00418             configuration::const_iterator cfirst = target->second.begin(), clast = target->second.end();
00419             for(; cfirst != clast; ++cfirst) {
00420 //            configuration::const_iterator ci = tmp.find(cfirst->first);
00421 //            if (ci == tmp.end() || cfirst->first.type() == CLOSECAPTURE)
00422                 tmp[cfirst->first] = 0; // update offset
00423             }
00424             configuration &conf = p[target->first];
00425             cout << "configurations dedup (one/position): ";
00426             if (tmp < conf) conf = tmp; // dedup of configurations (only one per position)
00427           }
00428         }
00429       }
00430       q.swap(p);
00431       //      q.insert(I->begin(), I->end());
00432       return !q.empty();
00433     }
00434 
00435     bool src_final() const {
00436       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00437         if (i->first->final()) return true;
00438       return false;
00439     }
00440      
00441   bool sink() const { return q.empty(); }
00442  
00443   const tag_type& src_tag() const { 
00444     // return "minimal" configuration
00445     S::const_iterator m = q.end();
00446     for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00447       if (m == q.end() || i->second < m->second)
00448         m = i;
00449     return m->second;
00450   }
00451 
00452     void clear() {
00453       q.clear();
00454       e->clear();
00455       I->clear();
00456     }
00457 
00458 //     void determinize(DFA_matrix<plain> &dfa) const {
00459 //       typedef DFA_matrix<plain> dfa_type;
00460 //       map<ordered_state_type, dfa_type::state_type> old2new;
00461 //       dfa_type::state_type i = dfa.new_state();
00462 //       dfa.initial(i);
00463 //       determinize_(dfa, old2new, old2new.insert(make_pair(ordered_state_type(q.begin(), q.end()), i)).first);
00464 //     }
00465 
00466 //     void determinize_(DFA_matrix<plain> &dfa, 
00467 //                    map<ordered_state_type, DFA_matrix<plain>::state_type> &old2new,
00468 //                    map<ordered_state_type, DFA_matrix<plain>::state_type>::iterator p) const {
00469 //       for(int i = -128; i < 128; ++i) {
00470 //      ordered_state_type aim(I->begin(), I->end());    
00471 //      for(ordered_state_type::iterator s = p->first.begin(); s != p->first.end(); ++s) {
00472 //        if ((*s)->letter.test((unsigned char) i)) {
00473 //          aim.insert((*s)->nextpos.begin(), (*s)->nextpos.end());
00474 //        }
00475 //      }
00476 //      if (!aim.empty()) {
00477 //        pair<map<ordered_state_type, DFA_matrix<plain>::state_type>::iterator, bool> j = 
00478 //          old2new.insert(make_pair(aim, DFA_matrix<plain>::null_state));
00479 //        if (j.second == false) {
00480 //          dfa.set_trans(p->second, (char) (unsigned char) i, j.first->second);
00481 //        }
00482 //        else {
00483 //          j.first->second = dfa.new_state();
00484 //          dfa.set_trans(p->second, (char) (unsigned char) i, j.first->second);
00485 //          for(ordered_state_type::const_iterator ii = aim.begin(); ii != aim.end(); ++ii)
00486 //            if ((*ii)->final()) {
00487 //              dfa.final(j.first->second) = true;
00488 //              break;
00489 //            }
00490 //          aim.clear();
00491 //          determinize_(dfa, old2new, j.first);
00492 //        }
00493 //      }
00494 //       }
00495 //     }
00496 
00497   protected:
00498     ptrdiff_t token_error(ptrdiff_t pos, const char *message) {
00499       errmsg = message;
00500       errpos = pos;
00501       return pos;
00502     }
00503 
00504     node* syntax_error(ptrdiff_t pos, const char *message = NULL) {
00505       errmsg = message;
00506       errpos = pos;
00507       return NULL;
00508     }
00509 
00510     int next_char(bitset<256>& letter, const char *begin, const char*& first, const char *last, 
00511                   bool& literal, const map<string, bitset<256> >& char_sets, bool icase);
00512 
00513   };
00514 
00515   inline
00516   ostream& operator<<(ostream &out, const non_deterministic_regexp_capture_cursor::configuration &c) {
00517     for(non_deterministic_regexp_capture_cursor::configuration::const_iterator i = c.begin();
00518         i != c.end(); ++i)
00519       out << i->first.id() << ':' << i->second << ' ';
00520     return out;
00521   }
00522 
00523   inline
00524   void non_deterministic_regexp_capture_cursor::preprocess_repeats() {
00525     vector<node> &v = *e;
00526     for(vector<node>::size_type p = 0; p < v.size();) {
00527       if (v[p].type != REPEAT) ++p;
00528       else {
00529         // look for what is repeated: letter, (exp), ?, +, *, ANY 
00530         vector<node>::size_type first = p - 1, last = p;
00531         for(; first > 0; --first) {
00532           if (v[first].type == LETTER || v[first].type == CLOSE || v[first].type == ANY)
00533             break;
00534         }
00535         if (v[first].type == CLOSE) {
00536           int count = 1;
00537           while(count != 0 && --first > 0)
00538             if (v[first].type == CLOSE) ++count;
00539             else if (v[first].type == OPEN) --count;
00540         }
00541         // the range [first, last) contains the pattern to be repeated
00542         // enclose it in parenthesis because of such things: a{0,2}{3}
00543         // which would otherwise be rewritten: a?a?{3} but shoud be written: (a?a?){3}
00544         // limiting to 128
00545         short n = (v[p].n > 128 ? 128 : v[p].n), 
00546           m = (v[p].m > 128 ? 128 : v[p].m);
00547         v.insert(v.begin() + first, node(OPEN));
00548         ++p;
00549         ++last;
00550         ++first;
00551         // remove repeat node
00552         if (n == 0)
00553           v[p] = node(QUESTION);
00554         else
00555           v.erase(v.begin() + p);
00556         vector<node> tmp;
00557         if (m == -1)
00558           tmp.reserve((last + 1 - first) * n + 1);
00559         else
00560           tmp.reserve((last + 1 - first) * m + m - n + 1);
00561 
00562         while(n > 0 || m > 0) {
00563           --n; 
00564           --m;
00565           if (n <= 0) { // end of mandatory repeats ?
00566             if (m < n) {  // case {n,}
00567               tmp.push_back(node(PLUS));
00568               continue; // end of processing
00569             }
00570             if (m == 0)
00571               continue; // end of processing
00572           }
00573           tmp.push_back(node(CONCAT));
00574           for(vector<node>::size_type tmp_p = first; tmp_p < last; ++tmp_p)
00575             tmp.push_back(v[tmp_p]);
00576           if (n <= 0)
00577             tmp.push_back(node(QUESTION));
00578         }
00579         tmp.push_back(node(CLOSE));
00580         v.insert(v.begin() + p, tmp.begin(), tmp.end());
00581         p += tmp.size();
00582       }
00583     }
00584   }
00585 
00586   inline
00587   ostream& operator<<(ostream &out, const non_deterministic_regexp_capture_cursor::node &x) 
00588   {
00589     switch(x.type) {
00590     case non_deterministic_regexp_capture_cursor::LETTER :
00591 //      if (x.letter.count() == 256)
00592 
00593 
00594 
00595 
00596       if (x.letter.count() == 256)
00597 
00598 
00599 
00600 
00601         out << ".";
00602       else {
00603         if (x.letter.count() > 1)
00604           out << '[';
00605         int lower = 0;
00606         // rebuild approximatively the character set
00607         for(int i = 32; i < 257; ++i) {
00608           if (i < 256 && x.letter.test(i) == true) {
00609             if (lower == 0)
00610               lower = i;
00611           }
00612           else {
00613             if (lower > 0) {
00614               if (i - lower == 1) {
00615                 out << (unsigned char) lower;
00616               }
00617               else if (i - lower == 2) {
00618                 out << (unsigned char) lower << (unsigned char) (lower + 1);
00619               }
00620               else out << (unsigned char) lower << '-' 
00621                         << (unsigned char) (i - 1);
00622               lower = 0;
00623             }
00624           }
00625         }
00626         if (x.letter.count() > 1)
00627           out << ']';
00628       }
00629       break;
00630         
00631     case non_deterministic_regexp_capture_cursor::OR : 
00632       out << "|"; 
00633       break;
00634         
00635     case non_deterministic_regexp_capture_cursor::FINAL : 
00636       out << "[EoE]";
00637       break;
00638 
00639     case non_deterministic_regexp_capture_cursor::GRANDFINAL : 
00640       out << "[GF]";
00641       break;
00642         
00643     case non_deterministic_regexp_capture_cursor::OPEN : 
00644       out << "("; 
00645       break;
00646         
00647     case non_deterministic_regexp_capture_cursor::CLOSE : 
00648       out << ")"; 
00649       break;
00650         
00651     case non_deterministic_regexp_capture_cursor::CONCAT : 
00652       out << " "; 
00653       break;
00654         
00655     case non_deterministic_regexp_capture_cursor::STAR : 
00656       out << "*"; 
00657       break;
00658         
00659     case non_deterministic_regexp_capture_cursor::QUESTION : 
00660       out << "?"; 
00661       break;
00662         
00663     case non_deterministic_regexp_capture_cursor::PLUS : 
00664       out << "+"; 
00665       break;
00666         
00667     case non_deterministic_regexp_capture_cursor::REPEAT : 
00668       out << "{" << x.n << "," << x.m << "}";
00669       break;
00670         
00671     case non_deterministic_regexp_capture_cursor::OPENCAPTURE :
00672       out << "_(" << x.n << "_";
00673       break;
00674 
00675     case non_deterministic_regexp_capture_cursor::CLOSECAPTURE :
00676       out << "_)" << x.n << "_";
00677       break;
00678 
00679     default :
00680       break;
00681     } 
00682     return out;
00683   }
00684 
00685   inline
00686   ostream& operator<<(ostream& out, const non_deterministic_regexp_capture_cursor& c) {
00687     for(vector<non_deterministic_regexp_capture_cursor::node>::const_iterator n = c.e->begin(); n != c.e->end(); ++n)
00688       if (c.q.find(const_cast<astl::non_deterministic_regexp_capture_cursor::node*>(&*n)) != c.q.end())
00689         out << "\033[1m" << *n << "\033[0m";
00690       else
00691         out << *n;
00692     return out;
00693   }
00694 
00695   inline
00696   void non_deterministic_regexp_capture_cursor::build_char_sets(map<string, bitset<256> >& char_sets) {
00697     bitset<256> alpha, alnum, ascii, blank, cntrl, digit, word;
00698     bitset<256> graph, lower, print, punct, space, upper, xdigit;
00699     for(int c = 0; c < 256; ++c) {
00700       alnum.set(c, isalnum(c) != 0);
00701       alpha.set(c, isalpha(c) != 0);
00702       ascii.set(c, isascii(c) != 0);
00703       cntrl.set(c, iscntrl(c) != 0);
00704       digit.set(c, isdigit(c) != 0);
00705       graph.set(c, isgraph(c) != 0);
00706       lower.set(c, islower(c) != 0);
00707       print.set(c, isprint(c) != 0);
00708       punct.set(c, ispunct(c) != 0);
00709       space.set(c, isspace(c) != 0);
00710       upper.set(c, isupper(c) != 0);
00711       xdigit.set(c, isxdigit(c) != 0);
00712     }
00713     char_sets["[:alnum:]"] = alnum;
00714     char_sets["[:^alnum:]"] = ~ char_sets["[:alnum:]"];
00715     char_sets["[:alpha:]"] = alpha;
00716     char_sets["[:^alpha:]"] = ~ char_sets["[:alpha:]"];
00717     char_sets["[:ascii:]"] = ascii;
00718     char_sets["[:^ascii:]"] = ~ char_sets["[:ascii:]"];
00719     blank.set((unsigned char) ' ');
00720     blank.set((unsigned char) '\t');
00721     char_sets["[:blank:]"] = blank;
00722     char_sets["[:^blank:]"] = ~ char_sets["[:blank:"];
00723     char_sets["[:cntrl:]"] = cntrl;
00724     char_sets["[:^cntrl:]"] = ~ char_sets["[:cntrl:]"];
00725     char_sets["[:digit:]"] = digit;
00726     char_sets["[:^digit:]"] = ~ char_sets["[:digit:]"];
00727     char_sets["[:graph:]"] = graph;
00728     char_sets["[:^graph:]"] = ~ char_sets["[:graph:]"];
00729     char_sets["[:lower:]"] = lower;
00730     char_sets["[:^lower:]"] = ~ char_sets["[:lower:]"];
00731     char_sets["[:print:]"] = print;
00732     char_sets["[:^print:]"] = ~ char_sets["[:print:]"];
00733     char_sets["[:punct:]"] = punct;
00734     char_sets["[:^punct:]"] = ~ char_sets["[:punct:]"];
00735     char_sets["[:space:]"] = space;
00736     char_sets["[:^space:]"] = ~ char_sets["[:space:]"];
00737     char_sets["[:upper:]"] = upper;
00738     char_sets["[:^upper:]"] = ~ char_sets["[:upper:]"];
00739     char_sets["[:xdigit:]"] = xdigit;
00740     char_sets["[:^xdigit:]"] = ~ char_sets["[:xdigit:]"];
00741     alnum.set((unsigned char) '_');
00742     char_sets["[:word:]"] = alnum;
00743     char_sets["[:^word:]"] = ~ char_sets["[:word:]"];
00744     char_sets["\\d"] = digit;
00745     char_sets["\\D"] = ~ char_sets["\\d"];
00746     char_sets["\\s"] = space;
00747     char_sets["\\S"] = ~ char_sets["\\s"];
00748     char_sets["\\w"] = alnum;
00749     char_sets["\\W"] = ~alnum;
00750     char_sets["\\t"] = bitset<256>().set((unsigned char) '\t');
00751     char_sets["\\n"] = bitset<256>().set((unsigned char) '\n');
00752     char_sets["\\r"] = bitset<256>().set((unsigned char) '\r');
00753     char_sets["\\f"] = bitset<256>().set((unsigned char) '\f');
00754     char_sets["\\a"] = bitset<256>().set((unsigned char) '\a');
00755     char_sets["\\e"] = bitset<256>().set((unsigned char) '\x1B');
00756     char_sets["\\v"] = bitset<256>().set((unsigned char) '\v');
00757     char_sets["\\V"] = ~ char_sets["\\v"];
00758     char_sets["\\h"] = bitset<256>().set((unsigned char) ' ');
00759     char_sets["\\H"] = ~ char_sets["\\h"];
00760   }
00761 
00762   // read char at position first and advance first
00763   // may return an empty char when encountering \E
00764   inline
00765   int non_deterministic_regexp_capture_cursor::next_char(bitset<256>& letter, const char *begin, const char*& first, 
00766                                        const char *last, bool& literal, 
00767                                        const map<string, bitset<256> >& char_sets, bool icase) {
00768     letter.reset();
00769 
00770     // anchores
00771     if ((*first == '$' || *first == '^') && !literal) {
00772       letter.set((unsigned char) 0); // beginning/end of line is '\0'
00773       ++first;
00774       return -1;
00775     }
00776 
00777     if (*first == '\\') {
00778       if (++first == last) 
00779         return token_error(first - begin, "unterminated escaped sequence");
00780 
00781       // quote sequence \Q ... \E
00782       if (*first == 'Q') {
00783         if (literal) {
00784           letter.set((unsigned char) '\\');
00785           return -1;
00786         }
00787         else {
00788           literal = true;
00789           if (++first == last) 
00790             return token_error(first - begin, "unterminated quote sequence");
00791           return next_char(letter, begin, first, last, literal, char_sets, icase);
00792         }
00793       }
00794       if (*first == 'E') {
00795         if (!literal) {
00796           letter.set((unsigned char) 'E');
00797           if (icase)
00798             letter.set((unsigned char) 'e');
00799         }
00800         else literal = false;
00801         ++first;
00802         return -1;
00803       }
00804 
00805       if (literal) {
00806         letter.set((unsigned char) '\\');
00807         return -1;
00808       }
00809       
00810       if (*first == 'x') { // hexadecimal character \x01 through \xFF
00811         if (++first == last)
00812           return token_error(first - begin, "\\x used with no following hex digits");
00813         int n = 0;
00814         if (*first >= '0' && *first <= '9') {
00815           n = 16 * (*first - '0'); 
00816         }
00817         else 
00818           if (*first >= 'A' && *first <= 'F') {
00819             n = 16 * (10 + *first - 'A'); 
00820           }
00821           else 
00822             if (*first >= 'a' && *first <= 'f') {
00823               n = 16 * (10 + *first - 'a'); 
00824             }
00825             else return token_error(first - begin, "\\x used with no following hex digits");
00826         
00827         if (++first == last)
00828           return token_error(first - begin, "\\x used with only one following hex digit");
00829         
00830         if (*first >= '0' && *first <= '9') {
00831           n += *first - '0'; 
00832         }
00833         else 
00834           if (*first >= 'A' && *first <= 'F') {
00835             n += 10 + (*first - 'A'); 
00836           }
00837           else
00838             if (*first >= 'a' && *first <= 'f') {
00839               n += 10 + (*first - 'a'); 
00840             }
00841             else return token_error(first - begin, "\\x used with only one following hex digit");
00842         letter.set(n);
00843         ++first;
00844         return -1;
00845       }
00846       
00847       if (*first == 'c') { // control character \cA through \cZ maps to \x01 \x1A
00848         if (++first == last || *first < 'A' || *first > 'Z')
00849           return token_error(first - begin, "\\c used with no following upper-case letter");
00850         letter.set(*first - 'A' + 1);
00851         ++first;
00852         return -1;
00853       }
00854 
00855       // look for \s, \S, \w, \W, \d, \D, \n, \t, \v, \V, \r, \a, \e, \f, \h, \H
00856       std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string("\\") + *first);
00857       if (c != char_sets.end()) {
00858         letter = c->second;
00859         ++first;
00860         return -1;
00861       }
00862 
00863       if (*first == '.') {
00864         letter.set((unsigned char) '.');
00865         ++first;
00866         return -1;
00867       }
00868     }
00869 
00870     if (*first == '.') {
00871       if (literal)
00872         letter.set((unsigned char) '.');
00873       else {
00874         letter.set();
00875 
00876 
00877 
00878 
00879 
00880         // letter.set(0, false);
00881 
00882 
00883 
00884 
00885 
00886       }
00887     }
00888     else
00889       if (icase && isalpha(*first)) {
00890         letter.set((unsigned char) tolower((unsigned char) *first));
00891         letter.set((unsigned char) toupper((unsigned char) *first));
00892       }
00893       else letter.set((unsigned char) *first);
00894     ++first;
00895     return -1;
00896   }
00897 
00898   // Return error position or -1 if ok
00899   template <typename OutputIterator>
00900   int non_deterministic_regexp_capture_cursor::tokenize(const char *first, const char *last, OutputIterator x)
00901   {
00902     static map<string, bitset<256> > char_sets;
00903     if (char_sets.empty())
00904       build_char_sets(char_sets);
00905 
00906     *x++ = node(OPEN);
00907     node previous = node(OPEN);
00908   
00909     bool literal = false;
00910     std::vector<char> parenthesis; // contains '(' or 'i' for (?i:
00911     bool icase = false;
00912     std::vector<int> open_captures;
00913     int current_open_capture = 0;
00914 
00915     for(const char *begin = first; first != last; ++first, ++x) {
00916       if (literal && *first != '\\') {
00917         if (previous.type != OPEN && previous.type != OR) 
00918           *x++ = node(CONCAT);
00919         if (icase && isalpha(*first)) {
00920           bitset<256> c;
00921           c.set((unsigned char) tolower(*first));
00922           c.set((unsigned char) toupper(*first));
00923           *x = previous = node(c);
00924         }
00925         else
00926           *x = previous = node(*first);
00927         continue;
00928       }
00929 
00930       switch (*first) {
00931       case '|' :
00932         if (first == begin)
00933           return token_error(0, "missing left part of alternative");
00934         *x = previous = node(OR);
00935         break;
00936 
00937       case '(' : 
00938         if (previous.type != OPEN && previous.type != OR) 
00939           *x++ = node(CONCAT);
00940         if (first + 2 < last && first[1] == '?' && first[2] == ':') { // non capturing parenthesis (?:
00941           first += 2;
00942           parenthesis.push_back('(');
00943         }
00944         else {
00945           if (first + 3 < last && first[1] == '?' && 
00946               first[2] == 'i' && first[3] == ':') { // case insensitive (?i:
00947             first += 3;
00948             if (icase == false) {
00949               parenthesis.push_back('i');
00950               icase = true;
00951             }
00952             else parenthesis.push_back('(');
00953           }
00954           else { // capture
00955             ++capture_count_;
00956             *x++ = node(OPEN);
00957             *x++ = node(OPENCAPTURE, current_open_capture, current_open_capture / 2);
00958             open_captures.push_back(current_open_capture);
00959             current_open_capture += 2;
00960             *x++ = node(CONCAT);
00961             parenthesis.push_back('c');
00962           }
00963         }
00964         *x = previous = node(OPEN);
00965         break;
00966 
00967       case ')' : 
00968         if (parenthesis.empty())
00969           return token_error(first - begin, "unbalanced parenthesis");
00970         if (parenthesis.back() == 'i')
00971           icase = false;
00972         else 
00973           if (parenthesis.back() == 'c') { // end of capture ?
00974             ++capture_count_;
00975             *x++ = node(CLOSE);
00976             *x++ = node(CONCAT);
00977             *x++ = node(CLOSECAPTURE, open_captures.back() + 1, open_captures.back() / 2);
00978             open_captures.pop_back();
00979           }
00980         *x = previous = node(CLOSE);
00981         parenthesis.pop_back();
00982         break;
00983 
00984       case '*' :
00985         if (first == begin)
00986           return token_error(0, "missing expression before operator *");
00987         *x = previous = node(STAR);
00988         break;
00989 
00990       case '?' :
00991         if (first == begin)
00992           return token_error(0, "missing expression before operator ?");
00993         *x = previous = node(QUESTION);
00994         break;
00995 
00996       case '+' :
00997         if (first == begin)
00998           return token_error(0, "missing expression before operator +");
00999         *x = previous = node(PLUS);
01000         break;
01001 
01002       case '{' : // {n,m}
01003         if (first == begin)
01004           return token_error(0, "missing expression before repeat operator");
01005         if (++first == last) 
01006           return token_error(first - begin, "unterminated expression");
01007         {
01008           int n = 0;
01009           const char *tmp = first;
01010           while (isdigit(*first)) {
01011             n = n * 10 + *first - '0';
01012             if (++first == last) 
01013               return token_error(first - begin, "unterminated repeat operator");
01014           }
01015           int m = n;
01016           if (*first == ',') {
01017             if (++first == last) 
01018               return token_error(first - begin, "unterminated repeat operator");
01019             tmp = first;
01020             for(m = 0; isdigit(*first);) {
01021               m = m * 10 + *first - '0';
01022               if (++first == last) 
01023                 return token_error(first - begin, "unterminated repeat operator");
01024             }
01025             if (tmp == first) m = -1; // comma but no upper bound: {n,}
01026           }
01027           if (*first != '}') 
01028             return token_error(first - begin, "invalid repeat operator");
01029 
01030           // errors: {0} {0,0} {,0}
01031           if (n > 32768 || m > 65535 || (n > m && m > -1) || (n == m && n == 0)) 
01032             return token_error(first - begin, "wrong range for repeat operator");
01033 
01034           // special cases: {0,1} {,1} {0,} {1,} {1,1}
01035           if (n == 0 && m == 1)
01036             *x = node(QUESTION);
01037           else if (n == 0 && m == -1)
01038             *x = node(STAR);
01039           else if (n == 1 && m == -1)
01040             *x = node(PLUS);
01041           else if (!(n == 1 &&  m == 1))
01042             *x = node(REPEAT, n, m);
01043         }
01044         break;
01045 
01046       case '[' :   // characters set
01047         {
01048           if (previous.type != OPEN && previous.type != OPENCAPTURE && previous.type != OR) 
01049             *x++ = node(CONCAT);
01050           bitset<256> a;
01051           bool v = true;
01052           if (++first == last) 
01053             return token_error(first - begin, "unterminated expression");
01054           if (*first == '^') {  // a negation ?
01055             a.set();
01056             a.set(0, false);
01057             v = false;
01058             if (++first == last) 
01059               return token_error(first - begin, "unterminated expression");
01060           }
01061           if (*first == '-') {  // "escaped" hyphen at the beginning ?
01062             a.set((unsigned char) '-', v);
01063             ++first;
01064           }
01065           else
01066             if (*first == ']') {  // "escaped" closing square bracket at the beginning ?
01067               a.set((unsigned char) ']', v);
01068               ++first;
01069             }
01070           for(; first != last && *first != ']'; ++first) {
01071             if (*first == '[') {
01072               if (++first == last) break;
01073               if (*first != ':') {
01074                 a.set((unsigned char) '[', v);
01075               }
01076               else { // a predefined set as [:digits:]
01077                 const char *start = first - 1;
01078                 first = std::find(++first, last, ']');
01079                 if (first == last) break;
01080                 std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string(start, first + 1));
01081                 if (c == char_sets.end())
01082                   return token_error(start - begin, "unknown character set");
01083                 if (v == true)
01084                   a |= c->second;
01085                 else a &= ~c->second;
01086                 continue;
01087               }
01088             }
01089             // TODO: next_char manages ranges and return a character set (pass a boolean for 
01090             // saying this is a character set context and '-' as a special meaning, good for
01091             // . also, see below the ugly hack)
01092             bitset<256> tmp; // will possibly be used as a range lower bound
01093             if (*first != '-') {  // not a potential character range ?
01094               switch (*first) {
01095               case '^' :
01096               case '$' :
01097                 tmp.set((unsigned char) *first++);
01098                 break;
01099               default :
01100                 {
01101                   int r = next_char(tmp, begin, first, last, literal, char_sets, icase);
01102                   if (r != -1)
01103                     return r;
01104                   // ugly hack for dot
01105                   if (tmp.count() == 256) {
01106                     tmp.reset();
01107                     tmp.set((unsigned char) '.'); // . is a literal in a character set
01108                   }
01109                 }
01110               }
01111               if (v == true)
01112                 a |= tmp;
01113               else a &= ~ tmp; // negative definition
01114               if (*first != '-') {
01115                 --first;
01116                 continue;
01117               }
01118             }
01119             // now *first == '-'
01120             if (++first == last) 
01121               return token_error(first - begin, "unterminated expression");
01122             if (*first == ']') {  // unterminated character range ?
01123               a.set((unsigned char) '-', v);
01124               break; // end of character set
01125             }
01126             else { // character range [a-z]
01127               if (tmp.count() != 1) { // a bound must not be a character set
01128                 return token_error(first - 2 - begin, "wrong character range: the lower bound cannot be a character set");
01129               }
01130               bitset<256> upper_bound;
01131               switch (*first) {
01132               case '^' :
01133               case '$' :
01134                 upper_bound.set((unsigned char) *first);
01135                 break;
01136               default :
01137                 {
01138                   int r = next_char(upper_bound, begin, first, last, literal, char_sets, icase);
01139                   --first;
01140                   if (r != -1)
01141                     return token_error(first - begin, "wrong character range");
01142                   // ugly hack for dot
01143                   if (upper_bound.count() == 256) {
01144                     upper_bound.reset();
01145                     upper_bound.set((unsigned char) '.'); // . is a literal in a character set
01146                   }
01147                 }
01148               }
01149               if (upper_bound.count() != 1) // a bound must not be a character set
01150                 return token_error(first - begin, "wrong character range: the upper bound cannot be a character set");
01151 
01152               unsigned int int_lower_bound = 0, int_upper_bound = 0;
01153               while (tmp.test(int_lower_bound) == false) 
01154                 ++int_lower_bound;
01155               while (upper_bound.test(int_upper_bound) == false)
01156                 ++int_upper_bound;
01157 
01158               // guess what, lower bound should be lower or equal to upper bound
01159               if (int_upper_bound < int_lower_bound)
01160                 return token_error(first - begin, "wrong character range: lower bound cannot be greater than upper bound");
01161               else
01162                 for(unsigned int i = int_lower_bound; i <= int_upper_bound; ++i)
01163                   a.set(i, v);
01164             }
01165           }
01166           // first is supposed to point to ']'
01167           if (first == last) 
01168             return token_error(first - begin, "unterminated expression");
01169           *x = previous = node(a);
01170           break;
01171         }
01172 
01173       default :
01174         bitset<256> next_letter;
01175         int r = next_char(next_letter, begin, first, last, literal, char_sets, icase);
01176         if (r != -1)
01177           return r;
01178         if (next_letter.any()) {
01179           if (previous.type != OPEN && previous.type != OR) 
01180             *x++ = node(CONCAT);
01181 
01182 
01183 
01184           if (next_letter.count() == 256) // catch-all dot 
01185 
01186 
01187 
01188 #ifndef UTF8
01189             *x = previous = node(ANY);
01190 #else
01191           {
01192             *x++ = node(OPEN);
01193             next_letter.reset();
01194             for(int i = 1; i < 0x7F; ++i)
01195               next_letter.set(i);
01196             *x++ = node(next_letter);
01197             *x++ = node(OR);
01198             next_letter.reset();
01199             for(int i = 0; i < 32; ++i)
01200               next_letter.set(i | 192);
01201             *x++ = node(next_letter);
01202             *x++ = node(CONCAT);
01203             *x++ = node(ANY);
01204             *x = previous = node(CLOSE);
01205           }
01206 #endif
01207           else
01208             *x = previous = node(next_letter);
01209         }
01210         --first;
01211         break;
01212       }
01213     }
01214     *x++ = node(CLOSE);
01215     *x++ = node(CONCAT);
01216     *x++ = node(FINAL);
01217     *x++ = node(CONCAT);
01218     *x++ = node(GRANDFINAL);
01219     return -1;
01220   }
01221 
01222   // exp -> term exp2    
01223   inline
01224   non_deterministic_regexp_capture_cursor::node*
01225   non_deterministic_regexp_capture_cursor::exp(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first, 
01226                      vector<non_deterministic_regexp_capture_cursor::node>::iterator last)
01227   {
01228     if (first == last)
01229       return syntax_error(last - first, "empty expression");
01230     node *t = term(first, last);
01231     assert(t != 0);
01232     if (t == 0) return 0;
01233     return exp2(first, last, t);
01234   }
01235 
01236   // exp2 -> + term exp2 | epsilon
01237   inline
01238   non_deterministic_regexp_capture_cursor::node*
01239   non_deterministic_regexp_capture_cursor::exp2(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first, 
01240                       vector<non_deterministic_regexp_capture_cursor::node>::iterator last, node *left)
01241   {
01242     assert(left != 0);
01243     if (first != last && first->type == OR) {
01244       node &root = *first;
01245       node *right = term(++first, last);
01246       if (right == 0) return 0;
01247       root.nullable = right->nullable || left->nullable;
01248       // firstpos(here) = firstpos(left) U firstpos(right): 
01249       root.firstpos = left->firstpos;
01250       root.firstpos.insert(right->firstpos.begin(), right->firstpos.end()); 
01251       root.lastpos = left->lastpos;
01252       root.lastpos.insert(right->lastpos.begin(), right->lastpos.end());
01253       if (left->nullable)
01254         root.emptymatch = left->emptymatch;
01255       else
01256         root.emptymatch = right->emptymatch;
01257       return exp2(first, last, &root);
01258     }
01259     return left;
01260   }
01261 
01262   // term -> form repeat term2
01263   inline
01264   non_deterministic_regexp_capture_cursor::node*
01265   non_deterministic_regexp_capture_cursor::term(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first, 
01266                       vector<non_deterministic_regexp_capture_cursor::node>::iterator last)
01267   {
01268     if (first == last) 
01269       return syntax_error(last - first, "unterminated expression");
01270     node *f = form(first, last);
01271     assert(f != 0);
01272     if (f == 0) return 0;
01273     node *s = repeat(first, last, f);  // return f if no repeat token is found
01274     return term2(first, last, s);
01275   }
01276 
01277   // repeat -> * repeat | ? repeat | + repeat | epsilon
01278   inline
01279   non_deterministic_regexp_capture_cursor::node*
01280   non_deterministic_regexp_capture_cursor::repeat(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first, 
01281                         vector<non_deterministic_regexp_capture_cursor::node>::iterator last, node *left)
01282   {
01283     assert(left != 0);
01284     if (first != last)
01285       switch (first->type) {
01286       case STAR :
01287       case PLUS :
01288         {
01289           node &root = *first;
01290           root.nullable = first->type == STAR;
01291           root.firstpos = left->firstpos;
01292           root.lastpos = left->lastpos;
01293 //        if (first->type == STAR)
01294             if (left->nullable)
01295               root.emptymatch = left->emptymatch;
01296 
01297           // nextpos(lastpos(here)) += firstpos(here):
01298           for(S::const_iterator i = root.lastpos.begin(); i != root.lastpos.end(); ++i)
01299             for(S::const_iterator j = root.firstpos.begin(); j != root.firstpos.end(); ++j) {
01300               configuration tmp = i->second;
01301               tmp.insert(j->second.begin(), j->second.end());
01302               (*i).first->nextpos.insert(make_pair(j->first, tmp));
01303             }
01304           return repeat(++first, last, &root);
01305         }
01306       case QUESTION : 
01307         {
01308           node &root = *first;
01309           root.nullable = true;
01310           root.firstpos = left->firstpos;
01311           root.lastpos = left->lastpos;
01312           if (left->nullable)
01313             root.emptymatch = left->emptymatch;
01314           return repeat(++first, last, &root);
01315         }
01316       default : 
01317         break;
01318       }
01319     return left;
01320   }
01321     
01322 
01323   // term2 -> . form repeat term2 | epsilon
01324   inline
01325   non_deterministic_regexp_capture_cursor::node*
01326   non_deterministic_regexp_capture_cursor::term2(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first, 
01327                        vector<non_deterministic_regexp_capture_cursor::node>::iterator last, node *left)
01328   {
01329     if (first != last && first->type == CONCAT) {
01330       node &root = *first;
01331       node *f = form(++first, last);
01332       assert(f != 0);
01333       if (f == 0) return 0; 
01334       node *right = repeat(first, last, f);
01335       root.nullable = left->nullable && right->nullable;
01336 
01337       root.firstpos = left->firstpos;
01338       if (left->nullable) {
01339         for(S::const_iterator s = right->firstpos.begin(); s != right->firstpos.end(); ++s) {
01340           configuration tmp = s->second;
01341           tmp.insert(left->emptymatch.begin(), left->emptymatch.end());
01342           root.firstpos.insert(make_pair(s->first, tmp));
01343         }
01344       }
01345 
01346       root.lastpos = right->lastpos;    
01347       if (right->nullable) {
01348         for(S::const_iterator s = left->lastpos.begin(); s != left->lastpos.end(); ++s) {
01349           configuration tmp = s->second;
01350           tmp.insert(right->emptymatch.begin(), right->emptymatch.end());
01351           root.lastpos.insert(make_pair(s->first, tmp));
01352         }
01353       }
01354 
01355       root.emptymatch = left->emptymatch;
01356       root.emptymatch.insert(right->emptymatch.begin(), right->emptymatch.end());
01357 
01358       // nextpos(lastpos(left)) += firstpos(right):
01359       for(S::const_iterator i = left->lastpos.begin(); i != left->lastpos.end(); ++i)
01360         for(S::const_iterator j = right->firstpos.begin(); j != right->firstpos.end(); ++j) {
01361           configuration tmp = j->second;
01362           tmp.insert(i->second.begin(), i->second.end());
01363           i->first->nextpos.insert(make_pair(j->first, tmp));     
01364         }
01365       return term2(first, last, &root);
01366     }
01367     return left;
01368   }
01369 
01370 
01371   // form -> ( exp ) | letter
01372   inline
01373   non_deterministic_regexp_capture_cursor::node*
01374   non_deterministic_regexp_capture_cursor::form(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first, 
01375                       vector<non_deterministic_regexp_capture_cursor::node>::iterator last)
01376   {
01377     if (first == last) 
01378       return syntax_error(last - first, "unterminated expression");
01379     if (first->type == OPEN) {
01380       node *e = exp(++first, last);
01381       assert(e != 0);
01382       if (e == 0) return 0;
01383       if (first->type == CLOSE) ++first;
01384       else return syntax_error(last - first, "unbalanced parenthesis");
01385       return e;
01386     }
01387     if (first->type == OPENCAPTURE || first->type == CLOSECAPTURE) {
01388       first->emptymatch[&*first] = 0;
01389       ++first;
01390       return &(first[-1]);
01391     }
01392     first->nullable = false;
01393     first->firstpos.insert(make_pair(&*first, configuration()));
01394     first->lastpos.insert(make_pair(&*first, configuration()));
01395     ++first;
01396     return &(first[-1]);
01397   }
01398 
01399 //   inline
01400 //   non_deterministic_regexp_capture_cursor regexpc(const string &exp) {
01401 //     return non_deterministic_regexp_capture_cursor(exp.c_str());
01402 //   }
01403 
01404 //   inline
01405 //   non_deterministic_regexp_capture_cursor regexpc() {
01406 //     return non_deterministic_regexp_capture_cursor();
01407 //   }
01408 
01409 
01410 class capture_cursor : public non_deterministic_regexp_capture_cursor
01411 {
01412 public:
01413   typedef capture_cursor                          self;
01414   typedef non_deterministic_regexp_capture_cursor super;
01415   typedef configuration                           tag_type;
01416 
01417   capture_cursor(const string &exp, bool use_horspool)
01418     : super(exp, use_horspool)
01419   { }
01420 
01421   bool forward(char_type c) {
01422     const bool r = super::forward(c);
01423     intersection.clear();
01424 
01425     if (q.size() == 1) {
01426       // FIXME:optimize set_difference in this case
01427       intersection.swap(q.begin()->second);
01428       cout << "INTERSECTION " << "\033[47;31m" << intersection << "\033[0m" << endl;
01429       return r;
01430     }
01431     else {
01432       //      state_type::const_iterator best = q.end();
01433       for(state_type::const_iterator i = q.begin(); i != q.end(); ++i) {
01434 //      if (i->first->type == GRANDFINAL && 
01435 //          (best == q.end() || i->second < best->second))
01436 //        best = i;
01437 //      else
01438           if (i == q.begin())
01439             intersection = i->second;
01440           else
01441             intersection = intersection.intersection(i->second);
01442       }
01443 //       if (c == '\0' && best != q.end()) {
01444 //      intersection = best->second; // '\0' + GRANDFINAL == finalizing
01445 //      cout << "INTERSECTION " << "\033[47;31m" << intersection << "\033[0m" << endl;
01446 //      return r;
01447 //       }
01448     }
01449     cout << "INTERSECTION " << "\033[47;31m" << intersection << "\033[0m" << endl;
01450     for(state_type::iterator i = q.begin(); i != q.end(); ++i) {
01451       configuration diff;
01452       set_difference(i->second.begin(), i->second.end(), intersection.begin(), intersection.end(), inserter(diff, diff.end()));
01453       i->second = diff;
01454       cout <<  "\033[1m" << "remain: " << i->first << ':' << i->first->pretty() 
01455            << ":=" << i->second << "\033[0m" << endl;
01456     }
01457     return r;
01458   }
01459 
01460   void finalize() {
01461     super::forward('\0');
01462     state_type::const_iterator best = q.end();
01463     for(state_type::const_iterator i = q.begin(); i != q.end(); ++i) {
01464       if (i->first->type == GRANDFINAL && 
01465           (best == q.end() || i->second < best->second))
01466         best = i;
01467     }
01468     intersection = best->second;
01469   }
01470 
01471   self& operator=(const state_type &s) {
01472     super::operator=(s);
01473     return *this;
01474   }
01475 
01476   const tag_type& src_tag() const {
01477     return intersection;
01478   }
01479 
01480 protected:
01481   configuration intersection;
01482 };
01483 
01484 
01485   template <>
01486   class lazy_cursor<capture_cursor, DFT_matrix_mini> : public cursor_concept
01487   {
01488   public:
01489     typedef capture_cursor Cursor;
01490 #ifdef USE_HASH_TABLE
01491     typedef DFT_matrix_mini<const capture_cursor::ordered_state_type*,
01492                             capture_cursor::tag_type> DFA_type;
01493 #else
01494     typedef DFT_matrix_mini<const capture_cursor::state_type*,
01495                             capture_cursor::tag_type> DFA_type;
01496 #endif
01497 
01498   protected:
01499 #ifdef USE_HASH_TABLE
01500     typedef std::map<capture_cursor::ordered_state_type, DFA_type::state_type> mapper;
01501 #else
01502     typedef std::map<capture_cursor::state_type, DFA_type::state_type> mapper;
01503 #endif
01504 
01505     typedef unsigned long offset_type;
01506 
01507     Cursor               c;
01508     smart_ptr<DFA_type>  dfa;
01509     smart_ptr<mapper>    mapping;  
01510     DFA_type::state_type my_sink;
01511     DFA_type::state_type q;
01512 
01513   public:
01514     typedef lazy_cursor           self;
01515     typedef DFA_type::state_type  state_type;
01516     typedef Cursor::char_type     char_type;
01517     typedef Cursor::char_traits   char_traits;
01518     typedef Cursor::configuration configuration;
01519     typedef configuration         tag_type;
01520 
01521   protected:
01522     const configuration *conf;
01523 
01524   public:
01525     lazy_cursor(const Cursor &x)
01526       : c(x), my_sink(dfa->new_state()), q(DFA_type::null_state), conf(NULL)
01527     { 
01528       if (!x.sink()) {
01529         q = dfa->new_state();
01530 #ifdef USE_HASH_TABLE
01531         dfa->tag(q) = &(mapping->insert(make_pair(capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), q)).first->first);
01532 #else
01533         dfa->tag(q) = &(mapping->insert(make_pair(c.src(), q)).first->first);
01534 #endif
01535         dfa->final(q) = c.src_final();
01536         dfa->initial(q);
01537       }
01538     }
01539 
01540     int capture_count() const { return c.capture_count(); }
01541 
01542     const Cursor& adaptee() {
01543       if (q != DFA_type::null_state)
01544         c = capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01545       return c;
01546     }
01547     
01548     const DFA_type& cache() const {
01549       return *dfa;
01550     }
01551     
01552     state_type src() const {
01553       return q;
01554     }
01555     
01556     self& operator=(state_type p) {
01557       q = p;
01558       return *this;
01559     }
01560     
01561     bool src_final() const {
01562       return dfa->final(q);
01563     }
01564     
01565     // return relative offsets
01566     const tag_type& src_tag() const {
01567       static const configuration empty;
01568       if (conf != NULL)
01569         return *conf;
01570       return empty;
01571     }
01572     
01573     bool sink() const {
01574       return q == DFA_type::null_state;
01575     }
01576     
01577     state_type sink_state() const {
01578       return DFA_type::null_state;
01579     }
01580 
01581     bool exists(const char_type &a) const {
01582       state_type aim = dfa->delta1(q, a);
01583       if (aim == DFA_type::null_state) {
01584         Cursor tmp = c;
01585 #ifdef USE_HASH_TABLE
01586         tmp = capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01587 #else
01588         tmp = *dfa->tag(q);
01589 #endif
01590         return tmp.exists(a);
01591       }
01592       return aim != my_sink;
01593     }
01594 
01595     bool forward(const char_type &a) {
01596       state_type tmp = dfa->delta1(q, a);
01597       if (tmp == my_sink) {
01598         q = DFA_type::null_state; // we already know there is no transition with 'a'
01599         return false;
01600       }
01601       if (tmp == DFA_type::null_state) {    // transition not in the cache ?
01602         c = capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01603         if (c.forward(a)) {   // any transition labelled with 'a' ?
01604 #ifdef USE_HASH_TABLE
01605           std::pair<mapper::iterator, bool> p = 
01606             mapping->insert(std::make_pair(capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), DFA_type::null_state));
01607 #else
01608           std::pair<mapper::iterator, bool> p = 
01609             mapping->insert(std::make_pair(c.src(), DFA_type::null_state));
01610 #endif
01611           if (p.second == true) { // target state is not dejavu ?
01612             tmp = dfa->new_state();
01613             dfa->final(tmp) = c.src_final();
01614             dfa->tag(tmp) = &(p.first->first);
01615             p.first->second = tmp;
01616           }
01617           else tmp = p.first->second;
01618           dfa->set_trans(q, a, tmp);
01619           if (!c.src_tag().empty()) {
01620             // cout << "dfa->set_output(" << q << ", " << a << ", " << c.src_tag() << ")" << endl;
01621             dfa->set_output(q, a, c.src_tag());
01622           }
01623         }
01624         else {
01625           dfa->set_trans(q, a, my_sink);
01626           q = DFA_type::null_state;
01627           return false;
01628         }
01629       }
01630 
01631 //       conf = dfa->output(q, a);
01632 //       if (conf != NULL) {
01633 //      for(non_deterministic_regexp_capture_cursor::configuration::const_iterator ic = conf->begin(); ic != conf->end(); ++ic) {
01634 //        cout << "captures[" << ic->first.id() << "] displacement " << ic->second << endl;
01635 //        //      (*captures)[ic->first.id()] = offset + ic->second;
01636 //      }
01637 //       }
01638 //       //      ++offset;
01639 
01640       conf = dfa->output(q, a); // how to avoid this if we don't need
01641                                 // capture and don't need to pay the price
01642       q = tmp;
01643       return true;
01644     }
01645   };
01646 
01647 #if 0
01648   template <>
01649   class lazy_cursor<non_deterministic_regexp_capture_cursor, DFT_matrix_mini> : public cursor_concept
01650   {
01651   public:
01652     typedef non_deterministic_regexp_capture_cursor Cursor;
01653 #ifdef USE_HASH_TABLE
01654     typedef DFT_matrix_mini<const non_deterministic_regexp_capture_cursor::ordered_state_type*,
01655                             non_deterministic_regexp_capture_cursor::tag_type> DFA_type;
01656 #else
01657     typedef DFT_matrix_mini<const non_deterministic_regexp_capture_cursor::state_type*,
01658                             non_deterministic_regexp_capture_cursor::tag_type> DFA_type;
01659 #endif
01660 
01661   protected:
01662 #ifdef USE_HASH_TABLE
01663     typedef std::map<non_deterministic_regexp_capture_cursor::ordered_state_type, DFA_type::state_type> mapper;
01664 #else
01665     typedef std::map<non_deterministic_regexp_capture_cursor::state_type, DFA_type::state_type> mapper;
01666 #endif
01667 
01668     typedef unsigned long offset_type;
01669 
01670     Cursor               c;
01671     smart_ptr<DFA_type>  dfa;
01672     smart_ptr<mapper>    mapping;  
01673     smart_ptr<vector<offset_type> > captures;
01674     DFA_type::state_type my_sink;
01675     DFA_type::state_type q;
01676     offset_type          offset;
01677 
01678   public:
01679     typedef lazy_cursor          self;
01680     typedef DFA_type::state_type state_type;
01681     typedef vector<offset_type>  tag_type;
01682     typedef Cursor::char_type    char_type;
01683     typedef Cursor::char_traits  char_traits;
01684 
01685     lazy_cursor(const Cursor &x)
01686       : c(x), my_sink(dfa->new_state()), q(DFA_type::null_state), offset(0)
01687     { 
01688       if (!x.sink()) {
01689         q = dfa->new_state();
01690 #ifdef USE_HASH_TABLE
01691         dfa->tag(q) = &(mapping->insert(make_pair(non_deterministic_regexp_capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), q)).first->first);
01692 #else
01693         dfa->tag(q) = &(mapping->insert(make_pair(c.src(), q)).first->first);
01694 #endif
01695         dfa->final(q) = c.src_final();
01696         dfa->initial(q);
01697         captures->resize(x.capture_count(), 0); 
01698       }
01699     }
01700 
01701     const Cursor& adaptee() {
01702       if (q != DFA_type::null_state)
01703         c = *dfa->tag(q);
01704       return c;
01705     }
01706     
01707     const DFA_type& cache() const {
01708       return *dfa;
01709     }
01710     
01711     state_type src() const {
01712       return q;
01713     }
01714     
01715     self& operator=(state_type p) {
01716       q = p;
01717       offset = 0;
01718       fill(captures->begin(), captures->end(), 0);
01719       return *this;
01720     }
01721     
01722     bool src_final() const {
01723       return dfa->final(q);
01724     }
01725     
01726     const tag_type& src_tag() const {
01727       return *captures;
01728     }
01729     
01730     bool sink() const {
01731       return q == DFA_type::null_state;
01732     }
01733     
01734     state_type sink_state() const {
01735       return DFA_type::null_state;
01736     }
01737 
01738     //   bool exists(const char_type &a) const {
01739     //     state_type aim = dfa->delta1(q, a);
01740     //     if (aim == my_sink) return false;
01741     //     Cursor tmp = c;
01742     //     tmp = dfa->tag(q);
01743     //     return tmp.exists(a);
01744     //   }
01745 
01746     bool forward(const char_type &a) {
01747       state_type tmp = dfa->delta1(q, a);
01748       if (tmp == my_sink) {
01749         q = DFA_type::null_state; // we already know there is no transition with 'a'
01750         return false;
01751       }
01752       if (tmp == DFA_type::null_state) {    // transition not in the cache ?
01753         c = *dfa->tag(q);
01754         if (c.forward(a)) {   // any transition labelled with 'a' ?
01755 #ifdef USE_HASH_TABLE
01756           std::pair<mapper::iterator, bool> p = 
01757             mapping->insert(std::make_pair(non_deterministic_regexp_capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), DFA_type::null_state));
01758 #else
01759           std::pair<mapper::iterator, bool> p = 
01760             mapping->insert(std::make_pair(c.src(), DFA_type::null_state));
01761 #endif
01762           if (p.second == true) { // target state is not dejavu ?
01763             tmp = dfa->new_state();
01764             dfa->final(tmp) = c.src_final();
01765             dfa->tag(tmp) = &(p.first->first);
01766             p.first->second = tmp;
01767           }
01768           else tmp = p.first->second;
01769           dfa->set_trans(q, a, tmp);
01770           if (!c.src_tag().empty()) {
01771             // cout << "dfa->set_output(" << q << ", " << a << ", " << c.src_tag() << ")" << endl;
01772             dfa->set_output(q, a, c.src_tag());
01773           }
01774         }
01775         else {
01776           dfa->set_trans(q, a, my_sink);
01777           q = DFA_type::null_state;
01778           return false;
01779         }
01780       }
01781       non_deterministic_regexp_capture_cursor::configuration *conf = dfa->output(q, a);
01782       if (conf != NULL) {
01783         for(non_deterministic_regexp_capture_cursor::configuration::const_iterator ic = conf->begin(); ic != conf->end(); ++ic) {
01784           // cout << "captures[" << ic->first.id() << "] = offset + displacement = " << offset << " + " << ic->second << " = " << offset + ic->second << endl;
01785           (*captures)[ic->first.id()] = offset + ic->second;
01786         }
01787       }
01788       q = tmp;
01789       ++offset;
01790       return true;
01791     }
01792   };
01793 
01794   // match algorithm specializations for regexp
01795 
01796   inline
01797   bool match(lazy_cursor<non_deterministic_regexp_capture_cursor>& c, const char *w)
01798   {
01799     if (!c.forward('\0')) // ^ anchoring
01800       return false;
01801     for(; *w != '\0' && c.forward(*w); ++w)
01802       if (c.src_final()) return true;
01803     return *w == '\0' && c.forward('\0') && c.src_final(); // $ anchoring
01804   }
01805 
01806   template <typename InputI>
01807   inline
01808   bool match(lazy_cursor<non_deterministic_regexp_capture_cursor>& c, InputI first, InputI last)
01809   {
01810     if (!c.forward('\0'))  // ^ anchoring
01811       return false;
01812     for(; first != last && c.forward(*first); ++first)
01813       if (c.src_final()) return true;
01814     return first == last && c.forward('\0') && c.src_final(); // $ anchoring
01815   }
01816 
01817   template <typename ForwardI>
01818   inline
01819   ForwardI first_match(lazy_cursor<non_deterministic_regexp_capture_cursor> &c, ForwardI first, ForwardI last)
01820   {
01821     if (!c.forward('\0')) return first;
01822     const ForwardI start = first;
01823     for(; first != last && c.forward(*first); ++first)
01824       if (c.src_final()) return ++first;
01825     return first == last && c.forward('\0') && c.src_final() ? first : start;
01826   }
01827 
01828   inline
01829   const char*
01830   first_match(lazy_cursor<non_deterministic_regexp_capture_cursor> &c, const char *text)
01831   {
01832     if (!c.forward('\0')) return text;
01833     const char *start = text;
01834     for(; *text != '\0' && c.forward(*text); ++text)
01835       if (c.src_final()) return ++text;
01836     return *text == '\0' && c.forward('\0') && c.src_final() ? text : start;
01837   }
01838 
01839   template <typename ForwardI>
01840   inline 
01841   bool capture(lazy_cursor<non_deterministic_regexp_capture_cursor, DFT_matrix_mini> &c, ForwardI first, ForwardI last, vector<unsigned long> &offsets) {
01842     for(; first != last && c.forward(*first); ++first)
01843     if (first == last && c.forward('\0') && c.src_final()) {
01844       offsets = c.src_tag();
01845       return true;
01846     }
01847     return false;
01848   }
01849 
01850 //   inline
01851 //   ostream& operator<<(ostream& out, const non_deterministic_regexp_capture_cursor& c) {
01852 //     for(vector<non_deterministic_regexp_capture_cursor::node>::const_iterator n = c.e->begin(); n != c.e->end(); ++n) {
01853 //       if (c.q.find(const_cast<astl::non_deterministic_regexp_capture_cursor::node*>(&*n)) != c.q.end())
01854 //      out << "\033[1m" << *n << "\033[0m";
01855 //       else
01856 //      out << *n;
01857 //     }
01858 //     return out;
01859 //   }
01860 
01861 
01862   template <typename ForwardI>
01863   inline
01864   ForwardI first_match(non_deterministic_regexp_capture_cursor &c, ForwardI first, ForwardI last)
01865   {
01866     //    if (!c.forward('\0')) return first;
01867     const ForwardI start = first;
01868     for(non_deterministic_regexp_capture_cursor::state_type::const_iterator i = c.src().begin(); i != c.src().end(); ++i) {
01869       cout << "< " << i->first << ':' << *i->first << ", { ";
01870       for(non_deterministic_regexp_capture_cursor::configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
01871         cout << (int) *j << " ";
01872       cout << "} > ";
01873     }
01874     cout << endl;
01875     for(; first != last && c.forward(*first); ++first) {
01876       for(non_deterministic_regexp_capture_cursor::state_type::const_iterator i = c.src().begin(); i != c.src().end(); ++i) {
01877         cout << "< " << i->first << ':' << *i->first << ", { ";
01878         for(non_deterministic_regexp_capture_cursor::configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
01879           cout << (int) *j << " ";
01880         cout << "} > ";
01881       }
01882       cout << endl;
01883       if (c.src_final()) return ++first;
01884     }
01885     return first == last && c.forward('\0') && c.src_final() ? first : start;
01886   }
01887 #endif
01888 
01889 }
01890 
01891 #endif

Generated on Sun Mar 8 02:41:34 2009 for ASTL by  doxygen 1.5.7.1