regexp.h

00001 /*
00002  * ASTL - the Automaton Standard Template Library.
00003  * C++ generic components for Finite State Automata handling.
00004  * Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr).
00005  * 
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  * 
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  * 
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 #ifndef ASTL_REGEXP_H
00023 #define ASTL_REGEXP_H
00024 
00025 #include <astl.h>
00026 #include <lazy.h>
00027 #include <vector>
00028 #include <iostream>
00029 #include <set>
00030 #include <iterator>
00031 #include <cctype>  
00032 #include <bitset>
00033 #include <cstring>
00034 #include <sstream>
00035 
00036 #define USE_HASH_TABLE
00037 
00038 // #define UTF8
00039 
00040 // guess what? crappy C++ compiler from sparc/solaris can't compile hash sets...
00041 #if defined(USE_HASH_TABLE) && defined(__sparc)
00042 #undef USE_HASH_TABLE
00043 #endif
00044 
00045 #if defined(USE_HASH_TABLE)
00046 // Use g++'s tr1::unordered_set if possible as it looks more efficient than boost's
00047 #if defined(__GNUG__) && (__GNUC__ > 3)
00048 #define USE_TR1_UNORDERED_SET
00049 #include <tr1/unordered_set>
00050 #else
00051 #define USE_BOOST_UNORDERED_SET
00052 #include <boost/unordered_set.hpp>
00053 #endif
00054 
00055 #endif // USE_HASH_TABLE
00056 
00057 using namespace std;
00058 
00059 namespace astl {
00060 
00061   // regular expressions grammar:
00062   // exp    -> term exp2
00063   // exp2   -> + term exp2 | epsilon
00064   // term   -> form repeat term2
00065   // term2  -> . form repeat term2 | epsilon
00066   // repeat -> * repeat | ? repeat | + repeat | {n,m} repeat | epsilon
00067   // form   -> ( exp ) | letter 
00068   //
00069   // + is union, . is concatenation, epsilon is the empty word
00070   // a letter is a set of chars
00071 
00072   class regexp_cursor : public cursor_concept
00073   {
00074 #ifdef _MSC_VER
00075   public:
00076 #else
00077   protected:
00078 #endif
00079     struct node;
00080 #ifdef USE_TR1_UNORDERED_SET
00081     typedef tr1::unordered_set<node*> S;
00082 #else
00083 #ifdef USE_BOOST_UNORDERED_SET
00084     typedef boost::unordered_set<node*> S;
00085 #else
00086     typedef set<node*> S;
00087 #endif
00088 #endif
00089 
00090     typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON, 
00091                    LETTER, OPEN, CLOSE, QUESTION, PLUS,
00092                    ANY, FINAL } token_type;
00093   
00094     struct node {
00095       bitset<256> letter;
00096       token_type type;
00097       short n, m; // repeat operator
00098       bool annulable;
00099       S premiere_pos, derniere_pos, pos_suivante;
00100     
00101       node(token_type t, short n_ = 0, short m_ = 0)
00102         : type(t), n(n_), m(m_) { 
00103         if (type == ANY) {
00104           type = LETTER;
00105           letter.set();
00106         }
00107       }
00108 
00109       node(char t)
00110         : type(LETTER), annulable(false) { 
00111         letter.set((unsigned char) t);
00112       }
00113 
00114       node(const bitset<256> &t)
00115         : letter(t), type(LETTER), annulable(false)
00116       { }
00117 
00118       bool final() const {
00119         return type == FINAL;
00120       }
00121 
00122       char to_char() const {   // does not work with ranges
00123         size_t i;
00124         for(i = 0; i != letter.size() && letter.test(i) == false; ++i);
00125         return (char) (-128 + i);
00126       }
00127     };
00128 
00129     S q;  // current state
00130     smart_ptr<vector<node> > e;  // node "allocator"
00131     const char *errmsg;
00132     smart_ptr<S> I;
00133 
00134   public:
00135     smart_ptr<horspool_finder> horspool;
00136 
00137   protected:
00138     friend ostream& operator<<(ostream&, const regexp_cursor&);
00139 
00140     node* exp(vector<node>::iterator &first, vector<node>::iterator last);
00141     node* form(vector<node>::iterator &first, vector<node>::iterator last);
00142     node* term(vector<node>::iterator &first, vector<node>::iterator last);
00143     node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00144     node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00145     node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00146     void preprocess_repeats();
00147     void build_char_sets(map<string, bitset<256> >&);
00148     template <typename OutputIterator>
00149     int tokenize(const char *first, const char *last, OutputIterator x);
00150 
00151   public:
00152     friend ostream& operator<<(ostream &, const node&);
00153     ptrdiff_t errpos;
00154 
00155     typedef regexp_cursor    self;
00156     typedef S                state_type;
00157     typedef set<node*>       ordered_state_type;
00158     typedef plain            char_traits;
00159     typedef plain::char_type char_type;
00160     typedef empty_tag        tag_type;
00161 
00162     regexp_cursor(const string &expression = "", bool horspool_optimized = false)
00163       : errmsg(NULL), errpos(-1)
00164     {
00165       if (!expression.empty()) {
00166         e->reserve(128);
00167         errpos = tokenize(expression.c_str(), 
00168                           expression.c_str() + expression.size(), 
00169                           back_inserter(*e));
00170 
00171         if (errpos == -1) {
00172           preprocess_repeats();
00173           vector<node>::iterator first = e->begin();
00174           node *root = exp(first, e->end());
00175           // managed non-anchored alternatives
00176           if (root != NULL) {
00177             // if there is only one position for the root and this position
00178             // is not ^ nor ANY, then there may be a literal prefix that can
00179             // be matched more efficiently with a boyer-moore search
00180             if (horspool_optimized && root->premiere_pos.size() == 1) {
00181               // look for a litteral prefix of size > 1
00182               string prefix;
00183               node *n = *root->premiere_pos.begin();
00184               for(; n->letter.count() == 1 && !n->annulable && n->letter.test(0) == false; n = *(n->pos_suivante.begin())) {
00185                 for(int i = 0; i < 256; ++i)
00186                   if (n->letter.test(i) == true) {
00187                     prefix += (char) i;
00188                     break;
00189                   }
00190                 if (n->pos_suivante.size() != 1 || (*n->pos_suivante.begin())->type == FINAL) {
00191                   if (prefix.size() > 1) {
00192                     // initialize boyer-moore searcher and remove prefix from expression
00193                     horspool->init(prefix);
00194                     q = n->pos_suivante;
00195                     return;
00196                   }
00197                   break;
00198                 }
00199               }
00200               if (prefix.size() > 1) {
00201                 // initialize boyer-moore searcher and remove prefix from expression
00202                 horspool->init(prefix);
00203                 q.insert(n);
00204                 return;
00205               }
00206             }
00207 
00208             q = root->premiere_pos;
00209             for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00210               // if not beginning of line (anchor ^)
00211               if ((*i)->letter.count() == 256 || (*i)->letter.test(0) == false)
00212                 I->insert(*i);
00213           }
00214           else errpos = expression.size() - errpos;
00215         }
00216       }
00217     }
00218 
00219     ptrdiff_t error() const { // possible error position (-1 if expression is ok)
00220       return errpos;
00221     }
00222     
00223     const char* error_message() const {
00224       return errmsg;
00225     }
00226 
00227     self& operator=(const state_type &s) {
00228       q = s;
00229       return *this;
00230     }
00231 
00232 #ifdef USE_HASH_TABLE
00233     self& operator=(const ordered_state_type &s) {
00234       q.clear();
00235       q.insert(s.begin(), s.end());
00236       return *this;
00237     }
00238 #endif
00239     // #ifdef USE_HASH_TABLE
00240     //    state_type src() const {
00241     //      return state_type(q.begin(), q.end());
00242     //#else
00243     const state_type& src() const {
00244       return q;
00245       //#endif
00246     }
00247 
00248     bool exists(char a) const {
00249       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00250         if ((*i)->letter.test((unsigned char) a) == true)
00251           return true;
00252       return false;
00253     }
00254 
00255     bool forward(char letter) {
00256       state_type p = *I; // for non-anchored expressions
00257       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00258         if ((*i)->letter.test((unsigned char) letter) == true)
00259           p.insert((*i)->pos_suivante.begin(),
00260                    (*i)->pos_suivante.end());
00261       q.swap(p);   // optimized equivalent to q = p;
00262       return !q.empty();
00263     }
00264     
00265     bool src_final() const {
00266       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00267         if ((*i)->final()) return true;
00268       return false;
00269     }
00270      
00271     bool sink() const {
00272       return q.empty();
00273     }
00274  
00275     tag_type src_tag() const {
00276       return tag_type();
00277     }
00278 
00279     void clear() {
00280       q.clear();
00281       e->clear();
00282       I->clear();
00283     }
00284 
00285     void determinize(DFA_matrix<plain> &dfa) const {
00286       typedef DFA_matrix<plain> dfa_type;
00287       map<ordered_state_type, dfa_type::state_type> old2new;
00288       dfa_type::state_type i = dfa.new_state();
00289       dfa.initial(i);
00290       determinize_(dfa, old2new, old2new.insert(make_pair(ordered_state_type(q.begin(), q.end()), i)).first);
00291     }
00292 
00293     void determinize(DFA_matrix_mini<plain> &dfa) const {
00294       typedef DFA_matrix_mini<plain> dfa_type;
00295       map<ordered_state_type, dfa_type::state_type> old2new;
00296       dfa_type::state_type i = dfa.new_state();
00297       dfa.initial(i);
00298       determinize_(dfa, old2new, old2new.insert(make_pair(ordered_state_type(q.begin(), q.end()), i)).first);
00299     }
00300 
00301     template <typename DFA>
00302     void determinize_(DFA &dfa, 
00303                       map<ordered_state_type, typename DFA::state_type> &old2new,
00304                       typename map<ordered_state_type, typename DFA::state_type>::iterator p) const {
00305       for(int i = -128; i < 128; ++i) {
00306         ordered_state_type aim(I->begin(), I->end());    
00307         for(typename ordered_state_type::iterator s = p->first.begin(); s != p->first.end(); ++s) {
00308           if ((*s)->letter.test((unsigned char) i)) {
00309             aim.insert((*s)->pos_suivante.begin(), (*s)->pos_suivante.end());
00310           }
00311         }
00312         if (!aim.empty()) {
00313           pair<typename map<ordered_state_type, typename DFA::state_type>::iterator, bool> j = 
00314             old2new.insert(make_pair(aim, DFA::null_state));
00315           if (j.second == false) {
00316             dfa.set_trans(p->second, (char) (unsigned char) i, j.first->second);
00317           }
00318           else {
00319             j.first->second = dfa.new_state();
00320             dfa.set_trans(p->second, (char) (unsigned char) i, j.first->second);
00321             for(typename ordered_state_type::const_iterator ii = aim.begin(); ii != aim.end(); ++ii)
00322               if ((*ii)->final()) {
00323                 dfa.final(j.first->second) = true;
00324                 break;
00325               }
00326             aim.clear();
00327             determinize_(dfa, old2new, j.first);
00328           }
00329         }
00330       }
00331     }
00332 
00333   protected:
00334     ptrdiff_t token_error(ptrdiff_t pos, const char *message) {
00335       errmsg = message;
00336       errpos = pos;
00337       return pos;
00338     }
00339 
00340     node* syntax_error(ptrdiff_t pos, const char *message = NULL) {
00341       errmsg = message;
00342       errpos = pos;
00343       return NULL;
00344     }
00345 
00346     int next_char(bitset<256>& letter, const char *begin, const char*& first, const char *last, 
00347                   bool& literal, const map<string, bitset<256> >& char_sets, bool icase);
00348 
00349   };
00350 
00351   inline
00352   void regexp_cursor::preprocess_repeats() {
00353     vector<node> &v = *e;
00354     for(vector<node>::size_type p = 0; p < v.size();) {
00355       if (v[p].type != REPEAT) ++p;
00356       else {
00357         // look for what is repeated: letter, (exp), ?, +, *, ANY 
00358         vector<node>::size_type first = p - 1, last = p;
00359         for(; first > 0; --first) {
00360           if (v[first].type == LETTER || v[first].type == CLOSE || v[first].type == ANY)
00361             break;
00362         }
00363         if (v[first].type == CLOSE) {
00364           int count = 1;
00365           while(count != 0 && --first > 0)
00366             if (v[first].type == CLOSE) ++count;
00367             else if (v[first].type == OPEN) --count;
00368         }
00369         // the range [first, last) contains the pattern to be repeated
00370         // enclose it in parenthesis because of such things: a{0,2}{3}
00371         // which would otherwise be rewritten: a?a?{3} but shoud be written: (a?a?){3}
00372         // limiting to 128
00373         short n = (v[p].n > 128 ? 128 : v[p].n), 
00374           m = (v[p].m > 128 ? 128 : v[p].m);
00375         v.insert(v.begin() + first, node(OPEN));
00376         ++p;
00377         ++last;
00378         ++first;
00379         // remove repeat node
00380         if (n == 0)
00381           v[p] = node(QUESTION);
00382         else
00383           v.erase(v.begin() + p);
00384         vector<node> tmp;
00385         if (m == -1)
00386           tmp.reserve((last + 1 - first) * n + 1);
00387         else
00388           tmp.reserve((last + 1 - first) * m + m - n + 1);
00389 
00390         while(n > 0 || m > 0) {
00391           --n; 
00392           --m;
00393           if (n <= 0) { // end of mandatory repeats ?
00394             if (m < n) {  // case {n,}
00395               tmp.push_back(node(PLUS));
00396               continue; // end of processing
00397             }
00398             if (m == 0)
00399               continue; // end of processing
00400           }
00401           tmp.push_back(node(CONCAT));
00402           for(vector<node>::size_type tmp_p = first; tmp_p < last; ++tmp_p)
00403             tmp.push_back(v[tmp_p]);
00404           if (n <= 0)
00405             tmp.push_back(node(QUESTION));
00406         }
00407         tmp.push_back(node(CLOSE));
00408         v.insert(v.begin() + p, tmp.begin(), tmp.end());
00409         p += tmp.size();
00410       }
00411     }
00412   }
00413 
00414   inline
00415   ostream& operator<<(ostream &out, const regexp_cursor::node &x) 
00416   {
00417     switch(x.type) {
00418     case regexp_cursor::LETTER :
00419       if (x.letter.count() == 256)
00420         out << ".";
00421       else {
00422         if (x.letter.count() > 1)
00423           out << '[';
00424         int lower = 0;
00425         // rebuild approximatively the character set
00426         for(int i = 32; i < 257; ++i) {
00427           if (i < 256 && x.letter.test(i) == true) {
00428             if (lower == 0)
00429               lower = i;
00430           }
00431           else {
00432             if (lower > 0) {
00433               if (i - lower == 1) {
00434                 cout << (unsigned char) lower;
00435               }
00436               else if (i - lower == 2) {
00437                 cout << (unsigned char) lower << (unsigned char) (lower + 1);
00438               }
00439               else cout << (unsigned char) lower << '-' 
00440                         << (unsigned char) (i - 1);
00441               lower = 0;
00442             }
00443           }
00444         }
00445         if (x.letter.count() > 1)
00446           out << ']';
00447       }
00448       break;
00449         
00450     case regexp_cursor::OR : 
00451       out << "|"; 
00452       break;
00453         
00454     case regexp_cursor::FINAL : 
00455       out << "[EoE]";
00456       break;
00457         
00458     case regexp_cursor::OPEN : 
00459       out << "("; 
00460       break;
00461         
00462     case regexp_cursor::CLOSE : 
00463       out << ")"; 
00464       break;
00465         
00466     case regexp_cursor::CONCAT : 
00467       out << " "; 
00468       break;
00469         
00470     case regexp_cursor::STAR : 
00471       out << "*"; 
00472       break;
00473         
00474     case regexp_cursor::QUESTION : 
00475       out << "?"; 
00476       break;
00477         
00478     case regexp_cursor::PLUS : 
00479       out << "+"; 
00480       break;
00481         
00482     case regexp_cursor::REPEAT : 
00483       out << "{" << x.n << "," << x.m << "}";
00484       break;
00485         
00486     default :
00487       break;
00488     } 
00489     return out;
00490   }
00491 
00492   inline
00493   void regexp_cursor::build_char_sets(map<string, bitset<256> >& char_sets) {
00494     bitset<256> alpha, alnum, ascii, blank, cntrl, digit, word;
00495     bitset<256> graph, lower, print, punct, space, upper, xdigit;
00496     for(int c = 0; c < 256; ++c) {
00497       alnum.set(c, isalnum(c) != 0);
00498       alpha.set(c, isalpha(c) != 0);
00499       ascii.set(c, isascii(c) != 0);
00500       cntrl.set(c, iscntrl(c) != 0);
00501       digit.set(c, isdigit(c) != 0);
00502       graph.set(c, isgraph(c) != 0);
00503       lower.set(c, islower(c) != 0);
00504       print.set(c, isprint(c) != 0);
00505       punct.set(c, ispunct(c) != 0);
00506       space.set(c, isspace(c) != 0);
00507       upper.set(c, isupper(c) != 0);
00508       xdigit.set(c, isxdigit(c) != 0);
00509     }
00510     char_sets["[:alnum:]"] = alnum;
00511     char_sets["[:^alnum:]"] = ~ char_sets["[:alnum:]"];
00512     char_sets["[:alpha:]"] = alpha;
00513     char_sets["[:^alpha:]"] = ~ char_sets["[:alpha:]"];
00514     char_sets["[:ascii:]"] = ascii;
00515     char_sets["[:^ascii:]"] = ~ char_sets["[:ascii:]"];
00516     blank.set((unsigned char) ' ');
00517     blank.set((unsigned char) '\t');
00518     char_sets["[:blank:]"] = blank;
00519     char_sets["[:^blank:]"] = ~ char_sets["[:blank:"];
00520     char_sets["[:cntrl:]"] = cntrl;
00521     char_sets["[:^cntrl:]"] = ~ char_sets["[:cntrl:]"];
00522     char_sets["[:digit:]"] = digit;
00523     char_sets["[:^digit:]"] = ~ char_sets["[:digit:]"];
00524     char_sets["[:graph:]"] = graph;
00525     char_sets["[:^graph:]"] = ~ char_sets["[:graph:]"];
00526     char_sets["[:lower:]"] = lower;
00527     char_sets["[:^lower:]"] = ~ char_sets["[:lower:]"];
00528     char_sets["[:print:]"] = print;
00529     char_sets["[:^print:]"] = ~ char_sets["[:print:]"];
00530     char_sets["[:punct:]"] = punct;
00531     char_sets["[:^punct:]"] = ~ char_sets["[:punct:]"];
00532     char_sets["[:space:]"] = space;
00533     char_sets["[:^space:]"] = ~ char_sets["[:space:]"];
00534     char_sets["[:upper:]"] = upper;
00535     char_sets["[:^upper:]"] = ~ char_sets["[:upper:]"];
00536     char_sets["[:xdigit:]"] = xdigit;
00537     char_sets["[:^xdigit:]"] = ~ char_sets["[:xdigit:]"];
00538     alnum.set((unsigned char) '_');
00539     char_sets["[:word:]"] = alnum;
00540     char_sets["[:^word:]"] = ~ char_sets["[:word:]"];
00541     char_sets["\\d"] = digit;
00542     char_sets["\\D"] = ~ char_sets["\\d"];
00543     char_sets["\\s"] = space;
00544     char_sets["\\S"] = ~ char_sets["\\s"];
00545     char_sets["\\w"] = alnum;
00546     char_sets["\\W"] = ~alnum;
00547     char_sets["\\t"] = bitset<256>().set((unsigned char) '\t');
00548     char_sets["\\n"] = bitset<256>().set((unsigned char) '\n');
00549     char_sets["\\r"] = bitset<256>().set((unsigned char) '\r');
00550     char_sets["\\f"] = bitset<256>().set((unsigned char) '\f');
00551     char_sets["\\a"] = bitset<256>().set((unsigned char) '\a');
00552     char_sets["\\e"] = bitset<256>().set((unsigned char) '\x1B');
00553     char_sets["\\v"] = bitset<256>().set((unsigned char) '\v');
00554     char_sets["\\V"] = ~ char_sets["\\v"];
00555     char_sets["\\h"] = bitset<256>().set((unsigned char) ' ');
00556     char_sets["\\H"] = ~ char_sets["\\h"];
00557   }
00558 
00559   // read char at position first and advance first
00560   // may return an empty char when encountering \E
00561   inline
00562   int regexp_cursor::next_char(bitset<256>& letter, const char *begin, const char*& first, const char *last, 
00563                                bool& literal, const map<string, bitset<256> >& char_sets, bool icase) {
00564     letter.reset();
00565 
00566     // anchores
00567     if ((*first == '$' || *first == '^') && !literal) {
00568       letter.set((unsigned char) 0); // beginning/end of line is '\0'
00569       ++first;
00570       return -1;
00571     }
00572 
00573     if (*first == '\\') {
00574       if (++first == last) 
00575         return token_error(first - begin, "unterminated escaped sequence");
00576 
00577       // quote sequence \Q ... \E
00578       if (*first == 'Q') {
00579         if (literal) {
00580           letter.set((unsigned char) '\\');
00581           return -1;
00582         }
00583         else {
00584           literal = true;
00585           if (++first == last) 
00586             return token_error(first - begin, "unterminated quote sequence");
00587           return next_char(letter, begin, first, last, literal, char_sets, icase);
00588         }
00589       }
00590       if (*first == 'E') {
00591         if (!literal) {
00592           letter.set((unsigned char) 'E');
00593           if (icase)
00594             letter.set((unsigned char) 'e');
00595         }
00596         else literal = false;
00597         ++first;
00598         return -1;
00599       }
00600 
00601       if (literal) {
00602         letter.set((unsigned char) '\\');
00603         return -1;
00604       }
00605       
00606       if (*first == 'x') { // hexadecimal character \x01 through \xFF
00607         if (++first == last)
00608           return token_error(first - begin, "\\x used with no following hex digits");
00609         int n = 0;
00610         if (*first >= '0' && *first <= '9') {
00611           n = 16 * (*first - '0'); 
00612         }
00613         else 
00614           if (*first >= 'A' && *first <= 'F') {
00615             n = 16 * (10 + *first - 'A'); 
00616           }
00617           else 
00618             if (*first >= 'a' && *first <= 'f') {
00619               n = 16 * (10 + *first - 'a'); 
00620             }
00621             else return token_error(first - begin, "\\x used with no following hex digits");
00622         
00623         if (++first == last)
00624           return token_error(first - begin, "\\x used with only one following hex digit");
00625         
00626         if (*first >= '0' && *first <= '9') {
00627           n += *first - '0'; 
00628         }
00629         else 
00630           if (*first >= 'A' && *first <= 'F') {
00631             n += 10 + (*first - 'A'); 
00632           }
00633           else
00634             if (*first >= 'a' && *first <= 'f') {
00635               n += 10 + (*first - 'a'); 
00636             }
00637             else return token_error(first - begin, "\\x used with only one following hex digit");
00638         letter.set(n);
00639         ++first;
00640         return -1;
00641       }
00642       
00643       if (*first == 'c') { // control character \cA through \cZ maps to \x01 \x1A
00644         if (++first == last || *first < 'A' || *first > 'Z')
00645           return token_error(first - begin, "\\c used with no following upper-case letter");
00646         letter.set(*first - 'A' + 1);
00647         ++first;
00648         return -1;
00649       }
00650 
00651       // look for \s, \S, \w, \W, \d, \D, \n, \t, \v, \V, \r, \a, \e, \f, \h, \H
00652       std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string("\\") + *first);
00653       if (c != char_sets.end()) {
00654         letter = c->second;
00655         ++first;
00656         return -1;
00657       }
00658 
00659       if (*first == '.') {
00660         letter.set((unsigned char) '.');
00661         ++first;
00662         return -1;
00663       }
00664     }
00665 
00666     if (*first == '.') {
00667       if (literal)
00668         letter.set((unsigned char) '.');
00669       else
00670         letter.set();
00671     }
00672     else
00673       if (icase && isalpha(*first)) {
00674         letter.set((unsigned char) tolower((unsigned char) *first));
00675         letter.set((unsigned char) toupper((unsigned char) *first));
00676       }
00677       else letter.set((unsigned char) *first);
00678     ++first;
00679     return -1;
00680   }
00681 
00682   // Return error position or -1 if ok
00683   template <typename OutputIterator>
00684   int regexp_cursor::tokenize(const char *first, const char *last, OutputIterator x)
00685   {
00686     static map<string, bitset<256> > char_sets;
00687     if (char_sets.empty())
00688       build_char_sets(char_sets);
00689 
00690     *x++ = node(OPEN);
00691     node previous = node(OPEN);
00692   
00693     bool literal = false;
00694     std::vector<char> parenthesis; // contains '(' or 'i' for (?i:
00695     bool icase = false;
00696   
00697     for(const char *begin = first; first != last; ++first, ++x) {
00698       if (literal && *first != '\\') {
00699         if (previous.type != OPEN && previous.type != OR) 
00700           *x++ = node(CONCAT);
00701         if (icase && isalpha(*first)) {
00702           bitset<256> c;
00703           c.set((unsigned char) tolower(*first));
00704           c.set((unsigned char) toupper(*first));
00705           *x = previous = node(c);
00706         }
00707         else
00708           *x = previous = node(*first);
00709         continue;
00710       }
00711 
00712       switch (*first) {
00713       case '|' :
00714         if (first == begin)
00715           return token_error(0, "missing left part of alternative");
00716         *x = previous = node(OR);
00717         break;
00718 
00719       case '(' : 
00720         if (previous.type != OPEN && previous.type != OR) 
00721           *x++ = node(CONCAT);
00722         *x = previous = node(OPEN);
00723         if (first + 2 < last && first[1] == '?' && first[2] == ':') // non capturing parenthesis (?:
00724           first += 2;
00725         else
00726           if (first + 3 < last && first[1] == '?' && 
00727               first[2] == 'i' && first[3] == ':') { // case insensitive (?i:
00728             first += 3;
00729             if (icase == false) {
00730               parenthesis.push_back('i');
00731               icase = true;
00732               break;
00733             }
00734           }
00735         parenthesis.push_back('(');
00736         break;
00737 
00738       case ')' : 
00739         if (parenthesis.empty())
00740           return token_error(first - begin, "unbalanced parenthesis");
00741         if (parenthesis.back() == 'i')
00742           icase = false;
00743         parenthesis.pop_back();
00744         *x = previous = node(CLOSE);
00745         break;
00746 
00747       case '*' :
00748         if (first == begin)
00749           return token_error(0, "missing expression before operator *");
00750         *x = previous = node(STAR);
00751         break;
00752 
00753       case '?' :
00754         if (first == begin)
00755           return token_error(0, "missing expression before operator ?");
00756         *x = previous = node(QUESTION);
00757         break;
00758 
00759       case '+' :
00760         if (first == begin)
00761           return token_error(0, "missing expression before operator +");
00762         *x = previous = node(PLUS);
00763         break;
00764 
00765       case '{' : // {n,m}
00766         if (first == begin)
00767           return token_error(0, "missing expression before repeat operator");
00768         if (++first == last) 
00769           return token_error(first - begin, "unterminated expression");
00770         {
00771           int n = 0;
00772           const char *tmp = first;
00773           while (isdigit(*first)) {
00774             n = n * 10 + *first - '0';
00775             if (++first == last) 
00776               return token_error(first - begin, "unterminated repeat operator");
00777           }
00778           int m = n;
00779           if (*first == ',') {
00780             if (++first == last) 
00781               return token_error(first - begin, "unterminated repeat operator");
00782             tmp = first;
00783             for(m = 0; isdigit(*first);) {
00784               m = m * 10 + *first - '0';
00785               if (++first == last) 
00786                 return token_error(first - begin, "unterminated repeat operator");
00787             }
00788             if (tmp == first) m = -1; // comma but no upper bound: {n,}
00789           }
00790           if (*first != '}') 
00791             return token_error(first - begin, "invalid repeat operator");
00792 
00793           // errors: {0} {0,0} {,0}
00794           if (n > 32768 || m > 65535 || (n > m && m > -1) || (n == m && n == 0)) 
00795             return token_error(first - begin, "wrong range for repeat operator");
00796 
00797           // special cases: {0,1} {,1} {0,} {1,} {1,1}
00798           if (n == 0 && m == 1)
00799             *x = node(QUESTION);
00800           else if (n == 0 && m == -1)
00801             *x = node(STAR);
00802           else if (n == 1 && m == -1)
00803             *x = node(PLUS);
00804           else if (!(n == 1 &&  m == 1))
00805             *x = node(REPEAT, n, m);
00806         }
00807         break;
00808 
00809       case '[' :   // characters set
00810         {
00811           if (previous.type != OPEN && previous.type != OR) 
00812             *x++ = node(CONCAT);
00813           bitset<256> a;
00814           bool v = true;
00815           if (++first == last) 
00816             return token_error(first - begin, "unterminated expression");
00817           if (*first == '^') {  // a negation ?
00818             a.set();
00819             a.set(0, false);
00820             v = false;
00821             if (++first == last) 
00822               return token_error(first - begin, "unterminated expression");
00823           }
00824           if (*first == '-') {  // "escaped" hyphen at the beginning ?
00825             a.set((unsigned char) '-', v);
00826             ++first;
00827           }
00828           else
00829             if (*first == ']') {  // "escaped" closing square bracket at the beginning ?
00830               a.set((unsigned char) ']', v);
00831               ++first;
00832             }
00833           for(; first != last && *first != ']'; ++first) {
00834             if (*first == '[') {
00835               if (++first == last) break;
00836               if (*first != ':') {
00837                 a.set((unsigned char) '[', v);
00838               }
00839               else { // a predefined set as [:digits:]
00840                 const char *start = first - 1;
00841                 first = std::find(++first, last, ']');
00842                 if (first == last) break;
00843                 std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string(start, first + 1));
00844                 if (c == char_sets.end())
00845                   return token_error(start - begin, "unknown character set");
00846                 if (v == true)
00847                   a |= c->second;
00848                 else a &= ~c->second;
00849                 continue;
00850               }
00851             }
00852             // TODO: next_char manages ranges and return a character set (pass a boolean for 
00853             // saying this is a character set context and '-' as a special meaning, good for
00854             // . also, see below the ugly hack)
00855             bitset<256> tmp; // will possibly be used as a range lower bound
00856             if (*first != '-') {  // not a potential character range ?
00857               switch (*first) {
00858               case '^' :
00859               case '$' :
00860                 tmp.set((unsigned char) *first++);
00861                 break;
00862               default :
00863                 {
00864                   int r = next_char(tmp, begin, first, last, literal, char_sets, icase);
00865                   if (r != -1)
00866                     return r;
00867                   // ugly hack for dot
00868                   if (tmp.count() == 256) {
00869                     tmp.reset();
00870                     tmp.set((unsigned char) '.'); // . is a literal in a character set
00871                   }
00872                 }
00873               }
00874               if (v == true)
00875                 a |= tmp;
00876               else a &= ~ tmp; // negative definition
00877               if (*first != '-') {
00878                 --first;
00879                 continue;
00880               }
00881             }
00882             // now *first == '-'
00883             if (++first == last) 
00884               return token_error(first - begin, "unterminated expression");
00885             if (*first == ']') {  // unterminated character range ?
00886               a.set((unsigned char) '-', v);
00887               break; // end of character set
00888             }
00889             else { // character range [a-z]
00890               if (tmp.count() != 1) { // a bound must not be a character set
00891                 return token_error(first - 2 - begin, "wrong character range: the lower bound cannot be a character set");
00892               }
00893               bitset<256> upper_bound;
00894               switch (*first) {
00895               case '^' :
00896               case '$' :
00897                 upper_bound.set((unsigned char) *first);
00898                 break;
00899               default :
00900                 {
00901                   int r = next_char(upper_bound, begin, first, last, literal, char_sets, icase);
00902                   --first;
00903                   if (r != -1)
00904                     return token_error(first - begin, "wrong character range");
00905                   // ugly hack for dot
00906                   if (upper_bound.count() == 256) {
00907                     upper_bound.reset();
00908                     upper_bound.set((unsigned char) '.'); // . is a literal in a character set
00909                   }
00910                 }
00911               }
00912               if (upper_bound.count() != 1) // a bound must not be a character set
00913                 return token_error(first - begin, "wrong character range: the upper bound cannot be a character set");
00914 
00915               unsigned int int_lower_bound = 0, int_upper_bound = 0;
00916               while (tmp.test(int_lower_bound) == false) 
00917                 ++int_lower_bound;
00918               while (upper_bound.test(int_upper_bound) == false)
00919                 ++int_upper_bound;
00920 
00921               // guess what, lower bound should be lower or equal to upper bound
00922               if (int_upper_bound < int_lower_bound)
00923                 return token_error(first - begin, "wrong character range: lower bound cannot be greater than upper bound");
00924               else
00925                 for(unsigned int i = int_lower_bound; i <= int_upper_bound; ++i)
00926                   a.set(i, v);
00927             }
00928           }
00929           // first is supposed to point to ']'
00930           if (first == last) 
00931             return token_error(first - begin, "unterminated expression");
00932           *x = previous = node(a);
00933           break;
00934         }
00935 
00936       default :
00937         bitset<256> next_letter;
00938         int r = next_char(next_letter, begin, first, last, literal, char_sets, icase);
00939         if (r != -1)
00940           return r;
00941         if (next_letter.any()) {
00942           if (previous.type != OPEN && previous.type != OR) 
00943             *x++ = node(CONCAT);
00944           if (next_letter.count() == 256) // catch-all dot 
00945 #ifndef UTF8
00946             *x = previous = node(ANY);
00947 #else
00948           {
00949             *x++ = node(OPEN);
00950             next_letter.reset();
00951             for(int i = 1; i < 0x7F; ++i)
00952               next_letter.set(i);
00953             *x++ = node(next_letter);
00954             *x++ = node(OR);
00955             next_letter.reset();
00956             for(int i = 0; i < 32; ++i)
00957               next_letter.set(i | 192);
00958             *x++ = node(next_letter);
00959             *x++ = node(CONCAT);
00960             *x++ = node(ANY);
00961             *x = previous = node(CLOSE);
00962           }
00963 #endif
00964           else
00965             *x = previous = node(next_letter);
00966         }
00967         --first;
00968         break;
00969       }
00970     }
00971     *x++ = node(CLOSE);
00972     *x++ = node(CONCAT);
00973     *x++ = node(FINAL);
00974     return -1;
00975   }
00976 
00977   // exp -> term exp2    
00978   inline
00979   regexp_cursor::node*
00980   regexp_cursor::exp(vector<regexp_cursor::node>::iterator &first, 
00981                      vector<regexp_cursor::node>::iterator last)
00982   {
00983     if (first == last)
00984       return syntax_error(last - first, "empty expression");
00985     node *t = term(first, last);
00986     if (t == 0) return 0;
00987     return exp2(first, last, t);
00988   }
00989 
00990   // exp2 -> + term exp2 | epsilon
00991   inline
00992   regexp_cursor::node*
00993   regexp_cursor::exp2(vector<regexp_cursor::node>::iterator &first, 
00994                       vector<regexp_cursor::node>::iterator last, node *left)
00995   {
00996     if (first != last && first->type == OR) {
00997       node &root = *first;
00998       node *right = term(++first, last);
00999       if (right == 0) return 0;
01000       root.annulable = right->annulable || left->annulable;
01001       // premiere_pos(here) = premiere_pos(left) U premiere_pos(right): 
01002       root.premiere_pos.insert(left->premiere_pos.begin(),
01003                                left->premiere_pos.end()); 
01004       root.premiere_pos.insert(right->premiere_pos.begin(),
01005                                right->premiere_pos.end()); 
01006       root.derniere_pos.insert(left->derniere_pos.begin(),
01007                                left->derniere_pos.end()); 
01008       root.derniere_pos.insert(right->derniere_pos.begin(),
01009                                right->derniere_pos.end());
01010       return exp2(first, last, &root);
01011     }
01012     return left;
01013   }
01014 
01015   // term -> form repeat term2
01016   inline
01017   regexp_cursor::node*
01018   regexp_cursor::term(vector<regexp_cursor::node>::iterator &first, 
01019                       vector<regexp_cursor::node>::iterator last)
01020   {
01021     if (first == last) 
01022       return syntax_error(last - first, "unterminated expression");
01023     node *f = form(first, last);
01024     if (f == 0) return 0;
01025     node *s = repeat(first, last, f);  // return f if no repeat token is found
01026     return term2(first, last, s);
01027   }
01028 
01029   // repeat -> * repeat | ? repeat | + repeat | epsilon
01030   inline
01031   regexp_cursor::node*
01032   regexp_cursor::repeat(vector<regexp_cursor::node>::iterator &first, 
01033                         vector<regexp_cursor::node>::iterator last, node *left)
01034   {
01035     if (first != last)
01036       switch (first->type) {
01037       case STAR :
01038         {
01039           node &root = *first;
01040           root.annulable = true;
01041           root.premiere_pos = left->premiere_pos;
01042           root.derniere_pos = left->derniere_pos;
01043         
01044           // pos_suivante(derniere_pos(here)) += premiere_pos(here):
01045           for(S::const_iterator i = root.derniere_pos.begin(); 
01046               i != root.derniere_pos.end(); ++i)
01047             (*i)->pos_suivante.insert(root.premiere_pos.begin(),
01048                                       root.premiere_pos.end());
01049           return repeat(++first, last, &root);
01050         }
01051       case QUESTION : 
01052         {
01053           node &root = *first;
01054           root.annulable = true;
01055           root.premiere_pos = left->premiere_pos;
01056           root.derniere_pos = left->derniere_pos;
01057           return repeat(++first, last, &root);
01058         }
01059       case PLUS :
01060         {
01061           node &root = *first;
01062           root.annulable = false;
01063           root.premiere_pos = left->premiere_pos;
01064           root.derniere_pos = left->derniere_pos;
01065         
01066           // pos_suivante(derniere_pos(here)) += premiere_pos(here):
01067           for(S::const_iterator i = root.derniere_pos.begin(); 
01068               i != root.derniere_pos.end(); ++i)
01069             (*i)->pos_suivante.insert(root.premiere_pos.begin(),
01070                                       root.premiere_pos.end());
01071           return repeat(++first, last, &root);
01072         }
01073       default : 
01074         break;
01075       }
01076     return left;
01077   }
01078     
01079 
01080   // term2 -> . form repeat term2 | epsilon
01081   inline
01082   regexp_cursor::node*
01083   regexp_cursor::term2(vector<regexp_cursor::node>::iterator &first, 
01084                        vector<regexp_cursor::node>::iterator last, node *left)
01085   {
01086     if (first != last && first->type == CONCAT) {
01087       node &root = *first;
01088       node *f = form(++first, last);
01089       if (f == 0) return 0; 
01090       node *right = repeat(first, last, f);
01091       root.annulable = left->annulable && right->annulable;
01092       if (left->annulable) {
01093         root.premiere_pos.insert(left->premiere_pos.begin(),
01094                                  left->premiere_pos.end()); 
01095         root.premiere_pos.insert(right->premiere_pos.begin(),
01096                                  right->premiere_pos.end()); 
01097       }
01098       else
01099         root.premiere_pos = left->premiere_pos;
01100     
01101       if (right->annulable) {
01102         root.derniere_pos.insert(right->derniere_pos.begin(),
01103                                  right->derniere_pos.end()); 
01104         root.derniere_pos.insert(left->derniere_pos.begin(),
01105                                  left->derniere_pos.end());
01106       }
01107       else
01108         root.derniere_pos = right->derniere_pos;
01109       // pos_suivante(derniere_pos(left)) += premiere_pos(right):
01110       for(S::const_iterator i = left->derniere_pos.begin(); 
01111           i != left->derniere_pos.end(); ++i)
01112         (*i)->pos_suivante.insert(right->premiere_pos.begin(),
01113                                   right->premiere_pos.end());
01114       return term2(first, last, &root);
01115     }
01116     return left;
01117   }
01118 
01119 
01120   // form -> ( exp ) | letter
01121   inline
01122   regexp_cursor::node*
01123   regexp_cursor::form(vector<regexp_cursor::node>::iterator &first, 
01124                       vector<regexp_cursor::node>::iterator last)
01125   {
01126     if (first == last) 
01127       return syntax_error(last - first, "unterminated expression");
01128     if (first->type == OPEN) {
01129       node *e = exp(++first, last);
01130       if (e == 0) return 0;
01131       if (first->type == CLOSE) ++first;
01132       else return syntax_error(last - first, "unbalanced parenthesis");
01133       return e;
01134     }
01135     first->annulable = false;
01136     first->premiere_pos.insert(&*first);
01137     first->derniere_pos.insert(&*first);
01138     ++first;
01139     return &(first[-1]);
01140   }
01141 
01142   inline
01143   regexp_cursor regexpc(const string &exp) {
01144     return regexp_cursor(exp.c_str());
01145   }
01146 
01147   inline
01148   regexp_cursor regexpc() {
01149     return regexp_cursor();
01150   }
01151 
01152 
01153   template <template <typename T, typename U> class DFA>
01154   class lazy_cursor<regexp_cursor, DFA> : public cursor_concept
01155   {
01156   public:
01157     typedef regexp_cursor Cursor;
01158 #ifdef USE_HASH_TABLE
01159     typedef DFA<Cursor::char_traits, 
01160                 const regexp_cursor::ordered_state_type*> DFA_type;
01161 #else
01162     typedef DFA<Cursor::char_traits, 
01163                 const regexp_cursor::state_type*> DFA_type;
01164 #endif
01165 
01166   protected:
01167 #ifdef USE_HASH_TABLE
01168     typedef std::map<regexp_cursor::ordered_state_type, typename DFA_type::state_type> mapper;
01169 #else
01170     typedef std::map<regexp_cursor::state_type, typename DFA_type::state_type> mapper;
01171 #endif
01172 
01173     Cursor               c;
01174     smart_ptr<DFA_type>  dfa;
01175     smart_ptr<mapper>    mapping;  
01176     typename DFA_type::state_type my_sink;
01177     typename DFA_type::state_type q;
01178     
01179   public:
01180     typedef lazy_cursor                   self;
01181     typedef typename DFA_type::state_type state_type;
01182     typedef empty_tag                     tag_type;
01183     typedef Cursor::char_type    char_type;
01184     typedef Cursor::char_traits  char_traits;
01185 
01186     lazy_cursor(const Cursor &x)
01187       : c(x), my_sink(dfa->new_state()), q(DFA_type::null_state)
01188     { 
01189       if (!x.sink()) {
01190         q = dfa->new_state();
01191 #ifdef USE_HASH_TABLE
01192         dfa->tag(q) = &(mapping->insert(make_pair(regexp_cursor::ordered_state_type(c.src().begin(), c.src().end()), q)).first->first);
01193 #else
01194         dfa->tag(q) = &(mapping->insert(make_pair(c.src(), q)).first->first);
01195 #endif
01196         dfa->final(q) = c.src_final();
01197         dfa->initial(q);
01198       }
01199     }
01200 
01201     // deprecated:
01202     //   const Cursor& cursor() {
01203     //     c = dfa->tag(q);
01204     //     return c;
01205     //   }
01206 
01207     const Cursor& adaptee() {
01208       if (q != DFA_type::null_state)
01209         c = *dfa->tag(q);
01210       return c;
01211     }
01212     
01213     const DFA_type& cache() const {
01214       return *dfa;
01215     }
01216     
01217     state_type src() const {
01218       return q;
01219     }
01220     
01221     self& operator=(state_type p) {
01222       q = p;
01223       return *this;
01224     }
01225     
01226     bool src_final() const {
01227       return dfa->final(q);
01228     }
01229     
01230     empty_tag src_tag() const {
01231       return empty_tag();
01232     }
01233     
01234     bool sink() const {
01235       return q == DFA_type::null_state;
01236     }
01237     
01238     state_type sink_state() const {
01239       return DFA_type::null_state;
01240     }
01241 
01242     //   bool exists(const char_type &a) const {
01243     //     state_type aim = dfa->delta1(q, a);
01244     //     if (aim == my_sink) return false;
01245     //     Cursor tmp = c;
01246     //     tmp = dfa->tag(q);
01247     //     return tmp.exists(a);
01248     //   }
01249 
01250     bool forward(const char_type &a) {
01251       state_type tmp = dfa->delta1(q, a);
01252       if (tmp == my_sink) {
01253         q = DFA_type::null_state; // we already know there is no transition with 'a'
01254         return false;
01255       }
01256       if (tmp == DFA_type::null_state) {    // transition not in the cache ?
01257         c = *dfa->tag(q);
01258         if (c.forward(a)) {   // any transition labelled with 'a' ?
01259 #ifdef USE_HASH_TABLE
01260           std::pair<typename mapper::iterator, bool> p = 
01261             mapping->insert(std::make_pair(regexp_cursor::ordered_state_type(c.src().begin(), c.src().end()), DFA_type::null_state));
01262 #else
01263           std::pair<typename mapper::iterator, bool> p = 
01264             mapping->insert(std::make_pair(c.src(), DFA_type::null_state));
01265 #endif
01266           if (p.second == true) { // target state is not dejavu ?
01267             tmp = dfa->new_state();
01268             dfa->final(tmp) = c.src_final();
01269             dfa->tag(tmp) = &(p.first->first);
01270             p.first->second = tmp;
01271           }
01272           else tmp = p.first->second;
01273           dfa->set_trans(q, a, tmp);
01274         }
01275         else {
01276           dfa->set_trans(q, a, my_sink);
01277           q = DFA_type::null_state;
01278           return false;
01279         }
01280       }
01281       q = tmp;
01282       return true;
01283     }
01284   };
01285 
01286   // match algorithm specializations for regexp
01287 
01288   inline
01289   bool match(lazy_cursor<regexp_cursor>& c, const char *w)
01290   {
01291     if (!c.forward('\0')) // ^ anchoring
01292       return false;
01293     for(; *w != '\0' && c.forward(*w); ++w)
01294       if (c.src_final()) return true;
01295     return *w == '\0' && c.forward('\0') && c.src_final(); // $ anchoring
01296   }
01297 
01298   template <typename InputI>
01299   inline
01300   bool match(lazy_cursor<regexp_cursor>& c, InputI first, InputI last)
01301   {
01302     if (!c.forward('\0'))  // ^ anchoring
01303       return false;
01304     for(; first != last && c.forward(*first); ++first)
01305       if (c.src_final()) return true;
01306     return first == last && c.forward('\0') && c.src_final(); // $ anchoring
01307   }
01308 
01309   template <typename ForwardI>
01310   inline
01311   ForwardI first_match(lazy_cursor<regexp_cursor> &c, ForwardI first, ForwardI last)
01312   {
01313     if (!c.forward('\0')) return first;
01314     const ForwardI start = first;
01315     for(; first != last && c.forward(*first); ++first)
01316       if (c.src_final()) return ++first;
01317     return first == last && c.forward('\0') && c.src_final() ? first : start;
01318   }
01319 
01320   inline
01321   const char*
01322   first_match(lazy_cursor<regexp_cursor> &c, const char *text)
01323   {
01324     if (!c.forward('\0')) return text;
01325     const char *start = text;
01326     for(; *text != '\0' && c.forward(*text); ++text)
01327       if (c.src_final()) return ++text;
01328     return *text == '\0' && c.forward('\0') && c.src_final() ? text : start;
01329   }
01330 
01331 
01332   inline
01333   ostream& operator<<(ostream& out, const regexp_cursor& c) {
01334     for(vector<regexp_cursor::node>::const_iterator n = c.e->begin(); n != c.e->end(); ++n)
01335       if (c.q.find(const_cast<astl::regexp_cursor::node*>(&*n)) != c.q.end())
01336         out << "\033[1m" << *n << "\033[0m";
01337       else
01338         out << *n;
01339     return out;
01340   }
01341 
01342   static string regexp_fcursor_errmsg;
01343 
01344   // le regexp forward cursor ne gère pas les ensembles de lettres du
01345   // genre [A-Z] (trop cher pour le parcours des transitions sortantes) 
01346   class regexp_fcursor : public forward_cursor_concept
01347   {
01348   protected:
01349 #ifdef _MSC_VER
01350   public:
01351 #endif
01352     struct node;
01353     typedef set<node*> S;
01354 
01355     typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON, 
01356                    LETTER, OPEN, CLOSE, QUESTION, PLUS,
01357                    ANY, FINAL } token_type;
01358   
01359     struct node {
01360       token_type type;
01361       short letter;
01362       bool annulable;
01363       const char *pos;
01364       S premiere_pos, derniere_pos, pos_suivante;
01365     
01366       node(token_type t, const char *p = 0)
01367         : type(t), letter(128),pos(p)
01368       { }
01369 
01370       node(token_type type_, short letter_, const char *p = 0)
01371         : type(type_), letter(letter_), annulable(false), pos(p) 
01372       { }
01373 
01374       bool final() const {
01375         return type == FINAL;
01376       }
01377     };
01378 
01379     S q;  // current state
01380     smart_ptr<vector<node> > e;  // node "allocator"
01381     short c; // letter
01382 
01383     node* exp(vector<node>::iterator &first, vector<node>::iterator last);
01384     node* form(vector<node>::iterator &first, vector<node>::iterator last);
01385     node* term(vector<node>::iterator &first, vector<node>::iterator last);
01386     node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01387     node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01388     node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01389     template <typename OutputIterator>
01390     int tokenize(const char *first, const char *last, OutputIterator x);
01391 
01392   public:
01393     ptrdiff_t errpos;
01394 
01395     friend ostream& print_node(ostream&, const node&);
01396     typedef regexp_fcursor   self;
01397     typedef regexp_cursor    super;
01398     typedef S                state_type;
01399     typedef plain            char_traits;
01400     typedef plain::char_type char_type;
01401     typedef empty_tag        tag_type;
01402     typedef forward_cursor_concept concept;
01403 
01404     regexp_fcursor(const char *expression = "")
01405       : errpos(-1)
01406     {
01407       if (strlen(expression) > 0) {
01408         e->reserve(8);
01409         tokenize(expression, expression + strlen(expression),
01410                  back_inserter(*e));
01411         vector<node>::iterator first = e->begin();
01412         node *root = exp(first, e->end());
01413         if (root == 0) 
01414           errpos = first->pos - expression;
01415         else 
01416           q = root->premiere_pos;
01417       }
01418     }
01419 
01420     bool sink() const {
01421       return q.empty();
01422     }
01423 
01424     const string& error_message() const {
01425       return regexp_fcursor_errmsg;
01426     }
01427 
01428     self& operator=(const state_type &s) {
01429       q = s;
01430       return *this;
01431     }
01432   
01433     char_type letter() const {
01434       return (char_type) c;
01435     }
01436 
01437     bool first() {
01438       c = 128;
01439       for(S::const_iterator i = q.begin(); i != q.end(); ++i) 
01440         if ((*i)->type == ANY) {
01441           c = -128;
01442           return true;
01443         }
01444         else
01445           c = min__(c, (*i)->letter);
01446       return c != 128;
01447     }
01448     
01449     bool next() {
01450       short tmp = c;
01451       c = 128;
01452       for(S::const_iterator i = q.begin(); i != q.end(); ++i) {
01453         if ((*i)->type == ANY) {
01454           c = tmp + 1;
01455           break;
01456         }
01457         if ((*i)->letter > tmp)
01458           c = min__(c, (*i)->letter);
01459       }
01460       return c != 128;
01461     }
01462 
01463     state_type src() const {
01464       return q;
01465     }
01466 
01467     bool src_final() const {
01468       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
01469         if ((*i)->type == FINAL) return true;
01470       return false;
01471     }   
01472     
01473     // Brute force and massive ignorance:
01474     state_type aim() const { 
01475       regexp_fcursor tmp = *this;
01476       tmp.forward();
01477       return tmp.src();
01478     }
01479 
01480     // Brute force and massive ignorance:
01481     bool aim_final() const
01482     { 
01483       self tmp = *this;
01484       tmp.forward();
01485       return tmp.src_final();
01486     }
01487 
01488     void forward() {
01489       forward(c);
01490     }
01491 
01492     bool exists(char a) const {
01493       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
01494         if ((*i)->type == ANY || (*i)->letter == a)
01495           return true;
01496       return false;
01497     }
01498 
01499     bool forward(char a) {
01500       state_type p;
01501       for(S::const_iterator i = q.begin(); i != q.end(); ++i)
01502         if ((*i)->type == ANY || (*i)->letter == a)
01503           p.insert((*i)->pos_suivante.begin(),
01504                    (*i)->pos_suivante.end());
01505       q.swap(p);   // optimized equivalent to q = p;
01506       return !q.empty();
01507     }
01508 
01509     tag_type aim_tag() const {
01510       return tag_type();
01511     }
01512 
01513     bool operator==(const regexp_fcursor &y) const {
01514       return true;
01515     }
01516 
01517     tag_type src_tag() const {
01518       return tag_type();
01519     }
01520   };
01521 
01522   // Return error position or -1 if ok
01523   template <typename OutputIterator>
01524   int regexp_fcursor::tokenize(const char *first, const char *last, 
01525                                OutputIterator x)
01526   {
01527     *x++ = node(OPEN);
01528     for(const char *begin = first; first != last; ++first, ++x)
01529       switch (*first) {
01530       case '|' : 
01531         *x = node(OR, first); 
01532         break;
01533 
01534       case '(' : 
01535         if (first != begin && first[-1] != '(' && first[-1] != '|') 
01536           *x++ = node(CONCAT, *first);
01537         *x = node(OPEN, first); 
01538         break;
01539 
01540       case ')' : 
01541         *x = node(CLOSE, first);
01542         break;
01543 
01544       case '*' :
01545         *x = node(STAR, first);
01546         break;
01547 
01548       case '?' :
01549         *x = node(QUESTION, first);
01550         break;
01551 
01552       case '+' :
01553         *x = node(PLUS, first);
01554         break;
01555         // Escaped character:
01556       case '\\' : 
01557         if (first != begin && first[-1] != '|' && first[-1] != '(') 
01558           *x++ = node(CONCAT, first);
01559         if (++first == last) return first - begin;
01560         switch (*first) {
01561         case 'n' : *x = node(LETTER, '\n', first); break;
01562         case 't' : *x = node(LETTER, '\t', first); break;
01563         case 'r' : *x = node(LETTER, '\r', first); break;
01564         default : *x = node(LETTER, *first, first); 
01565         }
01566         break;
01567       
01568       default :
01569         if (first != begin && first[-1] != '|' && first[-1] != '(') 
01570           *x++ = node(CONCAT, first);
01571         if (*first == '.') *x = node(ANY, first);
01572         else *x = node(LETTER, *first, first); 
01573         break;
01574       }
01575     *x++ = node(CLOSE);
01576     *x++ = node(CONCAT);
01577     *x++ = node(FINAL);
01578     return 0;
01579   }
01580 
01581   // exp -> term exp2    
01582   inline
01583   regexp_fcursor::node*
01584   regexp_fcursor::exp(vector<regexp_fcursor::node>::iterator &first, 
01585                       vector<regexp_fcursor::node>::iterator last)
01586   {
01587     if (first == last) {
01588       regexp_fcursor_errmsg = "empty expression";
01589       return 0;
01590     }
01591     node *t = term(first, last);
01592     if (t == 0) return 0;
01593     return exp2(first, last, t);
01594   }
01595 
01596   // exp2 -> + term exp2 | epsilon
01597   inline
01598   regexp_fcursor::node*
01599   regexp_fcursor::exp2(vector<regexp_fcursor::node>::iterator &first, 
01600                        vector<regexp_fcursor::node>::iterator last, node *left)
01601   {
01602     if (first != last && first->type == OR) {
01603       node &root = *first;
01604       node *right = term(++first, last);
01605       if (right == 0) return 0;
01606       root.annulable = right->annulable || left->annulable;
01607       // premiere_pos(here) = premiere_pos(left) U premiere_pos(right): 
01608       root.premiere_pos.insert(left->premiere_pos.begin(),
01609                                left->premiere_pos.end()); 
01610       root.premiere_pos.insert(right->premiere_pos.begin(),
01611                                right->premiere_pos.end()); 
01612       root.derniere_pos.insert(left->derniere_pos.begin(),
01613                                left->derniere_pos.end()); 
01614       root.derniere_pos.insert(right->derniere_pos.begin(),
01615                                right->derniere_pos.end());
01616       return exp2(first, last, &root);
01617     }
01618     return left;
01619   }
01620 
01621   // term -> form repeat term2
01622   inline
01623   regexp_fcursor::node*
01624   regexp_fcursor::term(vector<regexp_fcursor::node>::iterator &first, 
01625                        vector<regexp_fcursor::node>::iterator last)
01626   {
01627     if (first == last) return 0;
01628     node *f = form(first, last);
01629     if (f == 0) return 0;
01630     node *s = repeat(first, last, f);  // return f if no repeat token is found
01631     return term2(first, last, s);
01632   }
01633 
01634   // repeat -> * repeat | ? repeat | + repeat | {n,m} repeat | epsilon
01635   inline
01636   regexp_fcursor::node*
01637   regexp_fcursor::repeat(vector<regexp_fcursor::node>::iterator &first, 
01638                          vector<regexp_fcursor::node>::iterator last, node *left)
01639   {
01640     if (first != last)
01641       switch (first->type) {
01642       case STAR :
01643         {
01644           node &root = *first;
01645           root.annulable = true;
01646           root.premiere_pos = left->premiere_pos;
01647           root.derniere_pos = left->derniere_pos;
01648         
01649           // pos_suivante(derniere_pos(here)) += premiere_pos(here):
01650           for(S::const_iterator i = root.derniere_pos.begin(); 
01651               i != root.derniere_pos.end(); ++i)
01652             (*i)->pos_suivante.insert(root.premiere_pos.begin(),
01653                                       root.premiere_pos.end());
01654           return repeat(++first, last, &root);
01655         }
01656       case QUESTION : 
01657         {
01658           node &root = *first;
01659           root.annulable = true;
01660           root.premiere_pos = left->premiere_pos;
01661           root.derniere_pos = left->derniere_pos;
01662           return repeat(++first, last, &root);
01663         }
01664       case PLUS :
01665         {
01666           node &root = *first;
01667           root.annulable = false;
01668           root.premiere_pos = left->premiere_pos;
01669           root.derniere_pos = left->derniere_pos;
01670         
01671           // pos_suivante(derniere_pos(here)) += premiere_pos(here):
01672           for(S::const_iterator i = root.derniere_pos.begin(); 
01673               i != root.derniere_pos.end(); ++i)
01674             (*i)->pos_suivante.insert(root.premiere_pos.begin(),
01675                                       root.premiere_pos.end());
01676           return repeat(++first, last, &root);
01677         }
01678       default : 
01679         return left;
01680       }
01681     return left;
01682   }
01683     
01684   // term2 -> . form repeat term2 | epsilon
01685   inline
01686   regexp_fcursor::node*
01687   regexp_fcursor::term2(vector<regexp_fcursor::node>::iterator &first, 
01688                         vector<regexp_fcursor::node>::iterator last, node *left)
01689   {
01690     if (first != last && first->type == CONCAT) {
01691       node &root = *first;
01692       node *f = form(++first, last);
01693       if (f == 0) return 0;
01694       node *right = repeat(first, last, f);
01695       root.annulable = left->annulable && right->annulable;
01696       if (left->annulable) {
01697         root.premiere_pos.insert(left->premiere_pos.begin(),
01698                                  left->premiere_pos.end()); 
01699         root.premiere_pos.insert(right->premiere_pos.begin(),
01700                                  right->premiere_pos.end()); 
01701       }
01702       else
01703         root.premiere_pos = left->premiere_pos;
01704     
01705       if (right->annulable) {
01706         root.derniere_pos.insert(right->derniere_pos.begin(),
01707                                  right->derniere_pos.end()); 
01708         root.derniere_pos.insert(left->derniere_pos.begin(),
01709                                  left->derniere_pos.end());
01710       }
01711       else
01712         root.derniere_pos = right->derniere_pos;
01713       // pos_suivante(derniere_pos(left)) += premiere_pos(right):
01714       for(S::const_iterator i = left->derniere_pos.begin(); 
01715           i != left->derniere_pos.end(); ++i)
01716         (*i)->pos_suivante.insert(right->premiere_pos.begin(),
01717                                   right->premiere_pos.end());
01718       return term2(first, last, &root);
01719     }
01720     return left;
01721   }
01722 
01723 
01724   // form -> ( exp ) | letter
01725   inline
01726   regexp_fcursor::node*
01727   regexp_fcursor::form(vector<regexp_fcursor::node>::iterator &first, 
01728                        vector<regexp_fcursor::node>::iterator last)
01729   {
01730     if (first == last) return 0;
01731     if (first->type == OPEN) {
01732       node *e = exp(++first, last);
01733       if (e == 0) return 0;
01734       if (first->type == CLOSE) ++first;
01735       else {
01736         regexp_fcursor_errmsg = "unbalanced parenthesis";
01737         return 0;
01738       }
01739       return e;
01740     }
01741     first->annulable = false;
01742     first->premiere_pos.insert(&*first);
01743     first->derniere_pos.insert(&*first);
01744     ++first;
01745     return &(first[-1]);
01746   }
01747 
01748   inline
01749   regexp_fcursor regexpfc(const string &exp) {
01750     return regexp_fcursor(exp.c_str());
01751   }
01752 
01753   inline
01754   std::ostream& print_node(std::ostream &out, const regexp_fcursor::node &x) 
01755   {
01756     switch(x.type) {
01757     case regexp_fcursor::LETTER :
01758       out << "[" << x.letter << "]";
01759       break;
01760 
01761     case regexp_fcursor::OR : 
01762       out << "|"; 
01763       break;
01764 
01765     case regexp_fcursor::FINAL : 
01766       out << "[EoE]";
01767       break;
01768 
01769     case regexp_fcursor::OPEN : 
01770       out << "("; 
01771       break;
01772 
01773     case regexp_fcursor::CLOSE : 
01774       out << ")"; 
01775       break;
01776 
01777     case regexp_fcursor::CONCAT : 
01778       out << " "; 
01779       break;
01780 
01781     case regexp_fcursor::STAR : 
01782       out << "*"; 
01783       break;
01784 
01785     case regexp_fcursor::QUESTION : 
01786       out << "?"; 
01787       break;
01788 
01789     case regexp_fcursor::PLUS : 
01790       out << "+"; 
01791       break;
01792 
01793     default :
01794       break;
01795     } 
01796     return out;
01797   }
01798 
01799   static const int EXTRACT_CURSOR_MAX_SUB_EXPRESSION = 16;
01800 
01801   class extract_cursor : public cursor_concept
01802   {
01803   public:
01804     struct node;
01805     typedef set<node*> S;
01806     typedef pair<int, int> match;
01807     struct token
01808     {
01809       node *state;
01810       vector<match> submatches;
01811       int ccount;  // char count along the way
01812       token(node *state_ = NULL)
01813         : state(state_), ccount(0)
01814       { }
01815       token(node *state_, const vector<match> &v, int count)
01816         : state(state_), submatches(v), ccount(count)
01817       { }
01818       bool operator<(const token &x) const {
01819         return state < x.state ||
01820           (state == x.state && ccount < x.ccount) ||
01821           (ccount == x.ccount && submatches < x.submatches);
01822       }
01823     };
01824 
01825     typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON, 
01826                    LETTER, OPEN, CLOSE, QUESTION, PLUS,
01827                    ANY, FINAL } token_type;
01828   
01829     struct node {
01830       token_type type;
01831       bitset<256> letter;
01832       bool annulable;
01833       const char *pos;
01834       S premiere_pos, derniere_pos, pos_suivante;
01835       bitset<EXTRACT_CURSOR_MAX_SUB_EXPRESSION> begin, end;  // starting & ending subexpressions
01836       int subexpression; // subexpression number if token == '('
01837     
01838       node(token_type t, const char *p = 0, int sub = 0) //, int n_ = 0, int m_ = 0)
01839         : type(t), pos(p), subexpression(sub) { 
01840         if (type == ANY) {
01841           type = LETTER;
01842           letter.set();
01843         }
01844       }
01845 
01846       node(char t, const char *p = 0)
01847         : type(LETTER), annulable(false), pos(p) {
01848         letter.set((unsigned char) t);
01849       }
01850 
01851       node(const bitset<256> &t, const char *p = 0)
01852         : type(LETTER), letter(t), annulable(false), pos(p)
01853       { }
01854 
01855       bool final() const {
01856         return type == FINAL;
01857       }
01858 
01859       char to_char() const {   // does not work with ranges
01860         size_t i;
01861         for(i = 0; i != letter.size() && letter.test(i) == false; ++i);
01862         return (char) (-128 + i);
01863       }
01864     };
01865 
01866   public:
01867     typedef set<token> state_type;
01868 
01869   protected:
01870     set<token>                 q;  // current state
01871     smart_ptr<vector<node> >   e;  // node "allocator"
01872     int                        sub_count; // subexpression count
01873     bitset<EXTRACT_CURSOR_MAX_SUB_EXPRESSION> statics;
01874     // sub with static start, i.e. different from (A)*
01875 
01876     node* exp(vector<node>::iterator &first, vector<node>::iterator last);
01877     node* form(vector<node>::iterator &first, vector<node>::iterator last);
01878     node* term(vector<node>::iterator &first, vector<node>::iterator last);
01879     node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01880     node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01881     node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01882     template <typename OutputIterator>
01883     int tokenize(const char *first, const char *last, OutputIterator x, 
01884                  bool kmp_style = false);
01885 
01886   public:
01887     friend std::ostream& operator<<(std::ostream &, const
01888                                     extract_cursor::node &);
01889     ptrdiff_t errpos;
01890 
01891     typedef extract_cursor    self;
01892     typedef plain            char_traits;
01893     typedef plain::char_type char_type;
01894     typedef vector<match>    tag_type;
01895 
01896     extract_cursor(const string &expression = ".", bool kmp_style = false, 
01897                    bool verbose = false)
01898       : sub_count(0), errpos(-1)
01899     {
01900       statics.set();
01901       e->reserve(128);
01902       tokenize(expression.c_str(), expression.c_str() + expression.size(),
01903                back_inserter(*e), kmp_style);
01904       if (verbose) {
01905         cout << "tokenized expression:" << endl;
01906         copy(e->begin(), e->end(), ostream_iterator<node>(cout));
01907         cout << endl;
01908       }
01909       vector<node>::iterator first = e->begin();
01910       node *root = exp(first, e->end());
01911       if (root == NULL) 
01912         errpos = first->pos - expression.c_str();
01913       else {
01914         for(set<node*>::iterator i = root->premiere_pos.begin(); i != root->premiere_pos.end(); ++i)
01915           q.insert(token(*i));
01916       }
01917     }
01918 
01919     ptrdiff_t error() const { // possible error position (-1 if ok)
01920       return errpos;
01921     }
01922 
01923     self& operator=(const state_type &s) {
01924       q = s;
01925       return *this;
01926     }
01927   
01928     state_type src() const {
01929       return q;
01930     }
01931 
01932     bool exists(char a) const {
01933       for(set<token>::const_iterator i = q.begin(); i != q.end(); ++i)
01934         if (i->state->letter[(unsigned char) a] == true)
01935           return true;
01936       return false;
01937     }
01938 
01939     bool forward(char letter) {
01940       static vector<match> tmp(EXTRACT_CURSOR_MAX_SUB_EXPRESSION);
01941       set<token> p;
01942       for(set<token>::iterator i = q.begin(); i != q.end(); ++i) {
01943         if (i->state->letter[(unsigned char) letter] == true) {
01944           tmp = i->submatches;
01945           if (i->state->begin.any() || i->state->end.any())
01946             for(int j = 0; j < subexp_count(); ++j) {
01947               if (i->state->begin[j]) { // sub-expression j starts here ?
01948                 if ((unsigned int) j >= tmp.size()) {
01949                   tmp.resize(j + 1, match(-1, -1));
01950                   tmp[j].first = i->ccount;
01951                 }
01952                 else
01953                   if (tmp[j].first == -1 || statics[j] == false)
01954                     tmp[j].first = i->ccount;  // update start position
01955               }
01956               if (i->state->end[j])   // sub-expression j ends here ?
01957                 tmp[j].second = i->ccount + 1;
01958             }
01959           for(S::const_iterator k = i->state->pos_suivante.begin();
01960               k != i->state->pos_suivante.end(); ++k)
01961             p.insert(token(*k, tmp, i->ccount + 1));
01962         }
01963       }
01964       q.swap(p);
01965       return !q.empty();
01966     }
01967 
01968     int subexp_count() const {
01969       return sub_count;  // always at least one subexpression
01970     }
01971 
01972     vector<pair<int, int> > submatches() const {
01973       for(set<token>::const_iterator i = q.begin(); i != q.end(); ++i)
01974         if (i->state->final()) return i->submatches;
01975       return vector<match>();
01976     }
01977 
01978     bool src_final() const {
01979       for(set<token>::const_iterator i = q.begin(); i != q.end(); ++i)
01980         if (i->state->final()) return true;
01981       return false;
01982     }
01983      
01984     bool sink() const {
01985       return q.empty();
01986     }
01987  
01988     tag_type src_tag() const {
01989       return submatches();
01990     }
01991   };
01992 
01993   inline
01994   std::ostream& operator<<(std::ostream &out, const extract_cursor::node &x) 
01995   {
01996     switch(x.type) {
01997     case extract_cursor::LETTER :
01998       out << "[";
01999       if (x.letter.count() == 256)   // Sigma
02000         out << "Sigma";
02001       else
02002         for(int i = 32; i < 256; ++i)
02003           if (x.letter.test(i) == true) out << (char) i;
02004       out << "]";
02005       break;
02006 
02007     case extract_cursor::OR : 
02008       out << "|"; 
02009       break;
02010 
02011     case extract_cursor::FINAL : 
02012       out << "[EoE]";
02013       break;
02014 
02015     case extract_cursor::OPEN : 
02016       out << "("; 
02017       break;
02018 
02019     case extract_cursor::CLOSE : 
02020       out << ")"; 
02021       break;
02022 
02023     case extract_cursor::CONCAT : 
02024       out << " "; 
02025       break;
02026 
02027     case extract_cursor::STAR : 
02028       out << "*"; 
02029       break;
02030 
02031     case extract_cursor::QUESTION : 
02032       out << "?"; 
02033       break;
02034 
02035     case extract_cursor::PLUS : 
02036       out << "+"; 
02037       break;
02038 
02039       //    case extract_cursor::REPEAT : 
02040       //      out << "{" << x.n << "," << x.m << "}";
02041       //      break;
02042 
02043     default :
02044       break;
02045     } 
02046     return out;
02047   }
02048 
02049   // Return error position or -1 if ok
02050   template <typename OutputIterator>
02051   int extract_cursor::tokenize(const char *first, const char *last, 
02052                                OutputIterator x, bool kmp_style)
02053   {
02054     int subexpression = 0;
02055 
02056     if (kmp_style) {  // prepend Sigma*
02057       *x++ = node(ANY);
02058       *x++ = node(STAR);
02059       *x++ = node(CONCAT); 
02060     }
02061     *x++ = node(OPEN, NULL, subexpression++); // the overall subexpression
02062     for(const char *begin = first; first != last; ++first, ++x)
02063       switch (*first) {
02064       case '|' : 
02065         *x = node(OR, first); 
02066         break;
02067 
02068       case '(' : 
02069         if (first != begin && first[-1] != '(' && first[-1] != '|') 
02070           *x++ = node(CONCAT, first);
02071         *x = node(OPEN, first, subexpression++); 
02072         break;
02073 
02074       case ')' : 
02075         *x = node(CLOSE, first);
02076         break;
02077 
02078       case '*' :
02079         *x = node(STAR, first);
02080         break;
02081 
02082       case '?' :
02083         *x = node(QUESTION, first);
02084         break;
02085 
02086       case '+' :
02087         *x = node(PLUS, first);
02088         break;
02089 
02090         /* not yet implemented 
02091            case '{' :
02092            if (++first == last) return first - begin;
02093            {
02094            int n = 0;
02095            const char *tmp = first;
02096            while (isdigit(*first)) {
02097            n = n * 10 + *first - '0';
02098            if (++first == last) return first - begin;
02099            }
02100            if (tmp == first) return first - begin;
02101            int m = n;
02102            if (*first == ',') {
02103            if (++first == last) return first - begin;
02104            tmp = first;
02105            for(m = 0; isdigit(*first);) {
02106            m = m * 10 + *first - '0';
02107            if (++first == last) return first - begin;
02108            }
02109            if (tmp == first) return first - begin;
02110            }
02111            if (*first != '}') return first - begin;
02112            if (n > m) return first - begin;
02113            *x = node(REPEAT, first, n, m);
02114            }
02115            break;
02116         */
02117 
02118       case '[' :   // characters set
02119         {
02120           if (first != begin && first[-1] != '|' && first[-1] != '(') 
02121             *x++ = node(CONCAT, first);
02122           bitset<256> a;
02123           bool v = true;
02124           if (++first == last) return first - begin;
02125           if (*first == '^') {
02126             a.set();
02127             v = false;
02128             if (++first == last) return first - begin;
02129           }
02130           while (first != last && *first != ']') {
02131             const unsigned char b = (unsigned char) *first;
02132             if (++first == last) break;
02133             if (*first != '-') {
02134               a.set(b, v);
02135               continue;
02136             }
02137             if (++first == last || *first == ']' || *first < b) return first - begin;
02138             for(unsigned int i = b; i <= (unsigned char) *first; ++i)
02139               a.set(i, v);
02140             ++first;
02141           }
02142           if (first == last) return first - begin; // first is supposed to point to ']'
02143           *x++ = node(a);
02144           break;
02145         }
02146           
02147 
02148         // Escaped character:
02149       case '\\' : 
02150         if (first != begin && first[-1] != '|' && first[-1] != '(') 
02151           *x++ = node(CONCAT, first);
02152         if (++first == last) return first - begin;
02153         switch (*first) {
02154         case 'n' : *x = node('\n', first); break;
02155         case 't' : *x = node('\t', first); break;
02156         case 'r' : *x = node('\r', first); break;
02157         default : *x = node(*first, first); 
02158         }
02159         break;
02160       
02161       default :
02162         if (first != begin && first[-1] != '|' && first[-1] != '(') 
02163           *x++ = node(CONCAT, first);
02164         if (*first == '.') *x = node(ANY, first);
02165         else
02166           *x = node(*first, first); 
02167         break;
02168       }
02169     *x++ = node(CLOSE);
02170     *x++ = node(CONCAT);
02171     *x++ = node(FINAL);
02172     return 0;
02173   }
02174 
02175   // exp -> term exp2    
02176   inline
02177   extract_cursor::node*
02178   extract_cursor::exp(vector<extract_cursor::node>::iterator &first, 
02179                       vector<extract_cursor::node>::iterator last)
02180   {
02181     if (first == last) return NULL;
02182     node *t = term(first, last);
02183     if (t == 0) return 0;
02184     return exp2(first, last, t);
02185   }
02186 
02187   // exp2 -> + term exp2 | epsilon
02188   inline
02189   extract_cursor::node*
02190   extract_cursor::exp2(vector<extract_cursor::node>::iterator &first, 
02191                        vector<extract_cursor::node>::iterator last, node *left)
02192   {
02193     if (first != last && first->type == OR) {
02194       node &root = *first;
02195       node *right = term(++first, last);
02196       if (right == 0) return 0;
02197       root.annulable = right->annulable || left->annulable;
02198       // premiere_pos(here) = premiere_pos(left) U premiere_pos(right): 
02199       root.premiere_pos.insert(left->premiere_pos.begin(),
02200                                left->premiere_pos.end()); 
02201       root.premiere_pos.insert(right->premiere_pos.begin(),
02202                                right->premiere_pos.end()); 
02203       root.derniere_pos.insert(left->derniere_pos.begin(),
02204                                left->derniere_pos.end()); 
02205       root.derniere_pos.insert(right->derniere_pos.begin(),
02206                                right->derniere_pos.end());
02207       return exp2(first, last, &root);
02208     }
02209     return left;
02210   }
02211 
02212   // term -> form repeat term2
02213   inline
02214   extract_cursor::node*
02215   extract_cursor::term(vector<extract_cursor::node>::iterator &first, 
02216                        vector<extract_cursor::node>::iterator last)
02217   {
02218     if (first == last) return 0;
02219     node *f = form(first, last);
02220     if (f == 0) return 0;
02221     node *s = repeat(first, last, f);  // return f if no repeat token is found
02222     return term2(first, last, s);
02223   }
02224 
02225   // repeat -> * repeat | ? repeat | + repeat | epsilon
02226   inline
02227   extract_cursor::node*
02228   extract_cursor::repeat(vector<extract_cursor::node>::iterator &first, 
02229                          vector<extract_cursor::node>::iterator last, node *left)
02230   {
02231     if (first != last)
02232       switch (first->type) {
02233       case STAR :
02234         {
02235           if (left->type == OPEN) statics &= ~left->begin;
02236 
02237           node &root = *first;
02238           root.annulable = true;
02239           root.premiere_pos = left->premiere_pos;
02240           root.derniere_pos = left->derniere_pos;
02241         
02242           // pos_suivante(derniere_pos(here)) += premiere_pos(here):
02243           for(S::const_iterator i = root.derniere_pos.begin(); 
02244               i != root.derniere_pos.end(); ++i)
02245             (*i)->pos_suivante.insert(root.premiere_pos.begin(),
02246                                       root.premiere_pos.end());
02247           return repeat(++first, last, &root);
02248         }
02249       case QUESTION : 
02250         {
02251           node &root = *first;
02252           root.annulable = true;
02253           root.premiere_pos = left->premiere_pos;
02254           root.derniere_pos = left->derniere_pos;
02255           return repeat(++first, last, &root);
02256         }
02257       case PLUS :
02258         {
02259           if (left->type == OPEN) statics &= ~left->begin;
02260 
02261           node &root = *first;
02262           root.annulable = false;
02263           root.premiere_pos = left->premiere_pos;
02264           root.derniere_pos = left->derniere_pos;
02265         
02266           // pos_suivante(derniere_pos(here)) += premiere_pos(here):
02267           for(S::const_iterator i = root.derniere_pos.begin(); 
02268               i != root.derniere_pos.end(); ++i)
02269             (*i)->pos_suivante.insert(root.premiere_pos.begin(),
02270                                       root.premiere_pos.end());
02271           return repeat(++first, last, &root);
02272         }
02273       default : 
02274         return left;
02275       }
02276     return left;
02277   }
02278     
02279 
02280   // term2 -> . form repeat term2 | epsilon
02281   inline
02282   extract_cursor::node*
02283   extract_cursor::term2(vector<extract_cursor::node>::iterator &first, 
02284                         vector<extract_cursor::node>::iterator last, node *left)
02285   {
02286     if (first != last && first->type == CONCAT) {
02287       node &root = *first;
02288       node *f = form(++first, last);
02289       if (f == 0) return 0;
02290       node *right = repeat(first, last, f);
02291       root.annulable = left->annulable && right->annulable;
02292       if (left->annulable) {
02293         root.premiere_pos.insert(left->premiere_pos.begin(),
02294                                  left->premiere_pos.end()); 
02295         root.premiere_pos.insert(right->premiere_pos.begin(),
02296                                  right->premiere_pos.end()); 
02297       }
02298       else
02299         root.premiere_pos = left->premiere_pos;
02300     
02301       if (right->annulable) {
02302         root.derniere_pos.insert(right->derniere_pos.begin(),
02303                                  right->derniere_pos.end()); 
02304         root.derniere_pos.insert(left->derniere_pos.begin(),
02305                                  left->derniere_pos.end());
02306       }
02307       else
02308         root.derniere_pos = right->derniere_pos;
02309       // pos_suivante(derniere_pos(left)) += premiere_pos(right):
02310       for(S::const_iterator i = left->derniere_pos.begin(); 
02311           i != left->derniere_pos.end(); ++i)
02312         (*i)->pos_suivante.insert(right->premiere_pos.begin(),
02313                                   right->premiere_pos.end());
02314       return term2(first, last, &root);
02315     }
02316     return left;
02317   }
02318 
02319 
02320   // form -> ( exp ) | letter
02321   inline
02322   extract_cursor::node*
02323   extract_cursor::form(vector<extract_cursor::node>::iterator &first, 
02324                        vector<extract_cursor::node>::iterator last)
02325   {
02326     if (first == last) return 0;
02327     if (first->type == OPEN) {
02328       int tmp = first->subexpression;
02329       vector<extract_cursor::node>::iterator tmp_first = first;
02330       node *e = exp(++first, last);
02331       if (e == 0) return 0;
02332       if (first->type == CLOSE) ++first;
02333       else return NULL; // unbalanced parenthesis
02334 
02335       if (tmp < EXTRACT_CURSOR_MAX_SUB_EXPRESSION) {
02336         for(S::iterator i = e->premiere_pos.begin(); 
02337             i != e->premiere_pos.end(); ++i)
02338           (*i)->begin[tmp] = true;
02339     
02340         for(S::iterator ii = e->derniere_pos.begin(); 
02341             ii != e->derniere_pos.end(); ++ii)
02342           (*ii)->end[tmp] = true;
02343         sub_count = max__(sub_count, tmp + 1);
02344       }    
02345       *tmp_first = *e;
02346       tmp_first->type = OPEN;
02347       return &*tmp_first;
02348     }
02349     first->annulable = false;
02350     first->premiere_pos.insert(&*first);
02351     first->derniere_pos.insert(&*first);
02352     ++first;
02353     return &(first[-1]);
02354   }
02355 
02356   inline
02357   extract_cursor extractc(const string &exp, bool kmp_style) {
02358     return extract_cursor(exp.c_str(), kmp_style);
02359   }
02360 
02361   inline
02362   extract_cursor extractc(const string &exp) {
02363     return extract_cursor(exp.c_str());
02364   }
02365 
02366   inline
02367   extract_cursor extractc() {
02368     return extract_cursor();
02369   }
02370 
02371   template <typename RandomAccessIterator>
02372   inline 
02373   vector<pair<RandomAccessIterator, RandomAccessIterator> > 
02374   extract(extract_cursor &c, 
02375           RandomAccessIterator i, RandomAccessIterator j) 
02376   {
02377     vector<pair<RandomAccessIterator, RandomAccessIterator> > r;
02378     return r;
02379   }
02380 
02381   inline
02382   ostream& operator<<(ostream &out, const extract_cursor::token &x) 
02383   {
02384     return out << x.state;
02385   }
02386 
02387 }
02388 
02389 #endif

Generated on Sun Mar 8 02:41:35 2009 for ASTL by  doxygen 1.5.7.1