00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef ASTL_REGEXP_H
00023 #define ASTL_REGEXP_H
00024
00025 #include <astl.h>
00026 #include <lazy.h>
00027 #include <vector>
00028 #include <iostream>
00029 #include <set>
00030 #include <iterator>
00031 #include <cctype>
00032 #include <bitset>
00033 #include <cstring>
00034 #include <sstream>
00035
00036 #define USE_HASH_TABLE
00037
00038
00039
00040
00041 #if defined(USE_HASH_TABLE) && defined(__sparc)
00042 #undef USE_HASH_TABLE
00043 #endif
00044
00045 #if defined(USE_HASH_TABLE)
00046
00047 #if defined(__GNUG__) && (__GNUC__ > 3)
00048 #define USE_TR1_UNORDERED_SET
00049 #include <tr1/unordered_set>
00050 #else
00051 #define USE_BOOST_UNORDERED_SET
00052 #include <boost/unordered_set.hpp>
00053 #endif
00054
00055 #endif // USE_HASH_TABLE
00056
00057 using namespace std;
00058
00059 namespace astl {
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072 class regexp_cursor : public cursor_concept
00073 {
00074 #ifdef _MSC_VER
00075 public:
00076 #else
00077 protected:
00078 #endif
00079 struct node;
00080 #ifdef USE_TR1_UNORDERED_SET
00081 typedef tr1::unordered_set<node*> S;
00082 #else
00083 #ifdef USE_BOOST_UNORDERED_SET
00084 typedef boost::unordered_set<node*> S;
00085 #else
00086 typedef set<node*> S;
00087 #endif
00088 #endif
00089
00090 typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON,
00091 LETTER, OPEN, CLOSE, QUESTION, PLUS,
00092 ANY, FINAL } token_type;
00093
00094 struct node {
00095 bitset<256> letter;
00096 token_type type;
00097 short n, m;
00098 bool annulable;
00099 S premiere_pos, derniere_pos, pos_suivante;
00100
00101 node(token_type t, short n_ = 0, short m_ = 0)
00102 : type(t), n(n_), m(m_) {
00103 if (type == ANY) {
00104 type = LETTER;
00105 letter.set();
00106 }
00107 }
00108
00109 node(char t)
00110 : type(LETTER), annulable(false) {
00111 letter.set((unsigned char) t);
00112 }
00113
00114 node(const bitset<256> &t)
00115 : letter(t), type(LETTER), annulable(false)
00116 { }
00117
00118 bool final() const {
00119 return type == FINAL;
00120 }
00121
00122 char to_char() const {
00123 size_t i;
00124 for(i = 0; i != letter.size() && letter.test(i) == false; ++i);
00125 return (char) (-128 + i);
00126 }
00127 };
00128
00129 S q;
00130 smart_ptr<vector<node> > e;
00131 const char *errmsg;
00132 smart_ptr<S> I;
00133
00134 public:
00135 smart_ptr<horspool_finder> horspool;
00136
00137 protected:
00138 friend ostream& operator<<(ostream&, const regexp_cursor&);
00139
00140 node* exp(vector<node>::iterator &first, vector<node>::iterator last);
00141 node* form(vector<node>::iterator &first, vector<node>::iterator last);
00142 node* term(vector<node>::iterator &first, vector<node>::iterator last);
00143 node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00144 node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00145 node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00146 void preprocess_repeats();
00147 void build_char_sets(map<string, bitset<256> >&);
00148 template <typename OutputIterator>
00149 int tokenize(const char *first, const char *last, OutputIterator x);
00150
00151 public:
00152 friend ostream& operator<<(ostream &, const node&);
00153 ptrdiff_t errpos;
00154
00155 typedef regexp_cursor self;
00156 typedef S state_type;
00157 typedef set<node*> ordered_state_type;
00158 typedef plain char_traits;
00159 typedef plain::char_type char_type;
00160 typedef empty_tag tag_type;
00161
00162 regexp_cursor(const string &expression = "", bool horspool_optimized = false)
00163 : errmsg(NULL), errpos(-1)
00164 {
00165 if (!expression.empty()) {
00166 e->reserve(128);
00167 errpos = tokenize(expression.c_str(),
00168 expression.c_str() + expression.size(),
00169 back_inserter(*e));
00170
00171 if (errpos == -1) {
00172 preprocess_repeats();
00173 vector<node>::iterator first = e->begin();
00174 node *root = exp(first, e->end());
00175
00176 if (root != NULL) {
00177
00178
00179
00180 if (horspool_optimized && root->premiere_pos.size() == 1) {
00181
00182 string prefix;
00183 node *n = *root->premiere_pos.begin();
00184 for(; n->letter.count() == 1 && !n->annulable && n->letter.test(0) == false; n = *(n->pos_suivante.begin())) {
00185 for(int i = 0; i < 256; ++i)
00186 if (n->letter.test(i) == true) {
00187 prefix += (char) i;
00188 break;
00189 }
00190 if (n->pos_suivante.size() != 1 || (*n->pos_suivante.begin())->type == FINAL) {
00191 if (prefix.size() > 1) {
00192
00193 horspool->init(prefix);
00194 q = n->pos_suivante;
00195 return;
00196 }
00197 break;
00198 }
00199 }
00200 if (prefix.size() > 1) {
00201
00202 horspool->init(prefix);
00203 q.insert(n);
00204 return;
00205 }
00206 }
00207
00208 q = root->premiere_pos;
00209 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00210
00211 if ((*i)->letter.count() == 256 || (*i)->letter.test(0) == false)
00212 I->insert(*i);
00213 }
00214 else errpos = expression.size() - errpos;
00215 }
00216 }
00217 }
00218
00219 ptrdiff_t error() const {
00220 return errpos;
00221 }
00222
00223 const char* error_message() const {
00224 return errmsg;
00225 }
00226
00227 self& operator=(const state_type &s) {
00228 q = s;
00229 return *this;
00230 }
00231
00232 #ifdef USE_HASH_TABLE
00233 self& operator=(const ordered_state_type &s) {
00234 q.clear();
00235 q.insert(s.begin(), s.end());
00236 return *this;
00237 }
00238 #endif
00239
00240
00241
00242
00243 const state_type& src() const {
00244 return q;
00245
00246 }
00247
00248 bool exists(char a) const {
00249 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00250 if ((*i)->letter.test((unsigned char) a) == true)
00251 return true;
00252 return false;
00253 }
00254
00255 bool forward(char letter) {
00256 state_type p = *I;
00257 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00258 if ((*i)->letter.test((unsigned char) letter) == true)
00259 p.insert((*i)->pos_suivante.begin(),
00260 (*i)->pos_suivante.end());
00261 q.swap(p);
00262 return !q.empty();
00263 }
00264
00265 bool src_final() const {
00266 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00267 if ((*i)->final()) return true;
00268 return false;
00269 }
00270
00271 bool sink() const {
00272 return q.empty();
00273 }
00274
00275 tag_type src_tag() const {
00276 return tag_type();
00277 }
00278
00279 void clear() {
00280 q.clear();
00281 e->clear();
00282 I->clear();
00283 }
00284
00285 void determinize(DFA_matrix<plain> &dfa) const {
00286 typedef DFA_matrix<plain> dfa_type;
00287 map<ordered_state_type, dfa_type::state_type> old2new;
00288 dfa_type::state_type i = dfa.new_state();
00289 dfa.initial(i);
00290 determinize_(dfa, old2new, old2new.insert(make_pair(ordered_state_type(q.begin(), q.end()), i)).first);
00291 }
00292
00293 void determinize(DFA_matrix_mini<plain> &dfa) const {
00294 typedef DFA_matrix_mini<plain> dfa_type;
00295 map<ordered_state_type, dfa_type::state_type> old2new;
00296 dfa_type::state_type i = dfa.new_state();
00297 dfa.initial(i);
00298 determinize_(dfa, old2new, old2new.insert(make_pair(ordered_state_type(q.begin(), q.end()), i)).first);
00299 }
00300
00301 template <typename DFA>
00302 void determinize_(DFA &dfa,
00303 map<ordered_state_type, typename DFA::state_type> &old2new,
00304 typename map<ordered_state_type, typename DFA::state_type>::iterator p) const {
00305 for(int i = -128; i < 128; ++i) {
00306 ordered_state_type aim(I->begin(), I->end());
00307 for(typename ordered_state_type::iterator s = p->first.begin(); s != p->first.end(); ++s) {
00308 if ((*s)->letter.test((unsigned char) i)) {
00309 aim.insert((*s)->pos_suivante.begin(), (*s)->pos_suivante.end());
00310 }
00311 }
00312 if (!aim.empty()) {
00313 pair<typename map<ordered_state_type, typename DFA::state_type>::iterator, bool> j =
00314 old2new.insert(make_pair(aim, DFA::null_state));
00315 if (j.second == false) {
00316 dfa.set_trans(p->second, (char) (unsigned char) i, j.first->second);
00317 }
00318 else {
00319 j.first->second = dfa.new_state();
00320 dfa.set_trans(p->second, (char) (unsigned char) i, j.first->second);
00321 for(typename ordered_state_type::const_iterator ii = aim.begin(); ii != aim.end(); ++ii)
00322 if ((*ii)->final()) {
00323 dfa.final(j.first->second) = true;
00324 break;
00325 }
00326 aim.clear();
00327 determinize_(dfa, old2new, j.first);
00328 }
00329 }
00330 }
00331 }
00332
00333 protected:
00334 ptrdiff_t token_error(ptrdiff_t pos, const char *message) {
00335 errmsg = message;
00336 errpos = pos;
00337 return pos;
00338 }
00339
00340 node* syntax_error(ptrdiff_t pos, const char *message = NULL) {
00341 errmsg = message;
00342 errpos = pos;
00343 return NULL;
00344 }
00345
00346 int next_char(bitset<256>& letter, const char *begin, const char*& first, const char *last,
00347 bool& literal, const map<string, bitset<256> >& char_sets, bool icase);
00348
00349 };
00350
00351 inline
00352 void regexp_cursor::preprocess_repeats() {
00353 vector<node> &v = *e;
00354 for(vector<node>::size_type p = 0; p < v.size();) {
00355 if (v[p].type != REPEAT) ++p;
00356 else {
00357
00358 vector<node>::size_type first = p - 1, last = p;
00359 for(; first > 0; --first) {
00360 if (v[first].type == LETTER || v[first].type == CLOSE || v[first].type == ANY)
00361 break;
00362 }
00363 if (v[first].type == CLOSE) {
00364 int count = 1;
00365 while(count != 0 && --first > 0)
00366 if (v[first].type == CLOSE) ++count;
00367 else if (v[first].type == OPEN) --count;
00368 }
00369
00370
00371
00372
00373 short n = (v[p].n > 128 ? 128 : v[p].n),
00374 m = (v[p].m > 128 ? 128 : v[p].m);
00375 v.insert(v.begin() + first, node(OPEN));
00376 ++p;
00377 ++last;
00378 ++first;
00379
00380 if (n == 0)
00381 v[p] = node(QUESTION);
00382 else
00383 v.erase(v.begin() + p);
00384 vector<node> tmp;
00385 if (m == -1)
00386 tmp.reserve((last + 1 - first) * n + 1);
00387 else
00388 tmp.reserve((last + 1 - first) * m + m - n + 1);
00389
00390 while(n > 0 || m > 0) {
00391 --n;
00392 --m;
00393 if (n <= 0) {
00394 if (m < n) {
00395 tmp.push_back(node(PLUS));
00396 continue;
00397 }
00398 if (m == 0)
00399 continue;
00400 }
00401 tmp.push_back(node(CONCAT));
00402 for(vector<node>::size_type tmp_p = first; tmp_p < last; ++tmp_p)
00403 tmp.push_back(v[tmp_p]);
00404 if (n <= 0)
00405 tmp.push_back(node(QUESTION));
00406 }
00407 tmp.push_back(node(CLOSE));
00408 v.insert(v.begin() + p, tmp.begin(), tmp.end());
00409 p += tmp.size();
00410 }
00411 }
00412 }
00413
00414 inline
00415 ostream& operator<<(ostream &out, const regexp_cursor::node &x)
00416 {
00417 switch(x.type) {
00418 case regexp_cursor::LETTER :
00419 if (x.letter.count() == 256)
00420 out << ".";
00421 else {
00422 if (x.letter.count() > 1)
00423 out << '[';
00424 int lower = 0;
00425
00426 for(int i = 32; i < 257; ++i) {
00427 if (i < 256 && x.letter.test(i) == true) {
00428 if (lower == 0)
00429 lower = i;
00430 }
00431 else {
00432 if (lower > 0) {
00433 if (i - lower == 1) {
00434 cout << (unsigned char) lower;
00435 }
00436 else if (i - lower == 2) {
00437 cout << (unsigned char) lower << (unsigned char) (lower + 1);
00438 }
00439 else cout << (unsigned char) lower << '-'
00440 << (unsigned char) (i - 1);
00441 lower = 0;
00442 }
00443 }
00444 }
00445 if (x.letter.count() > 1)
00446 out << ']';
00447 }
00448 break;
00449
00450 case regexp_cursor::OR :
00451 out << "|";
00452 break;
00453
00454 case regexp_cursor::FINAL :
00455 out << "[EoE]";
00456 break;
00457
00458 case regexp_cursor::OPEN :
00459 out << "(";
00460 break;
00461
00462 case regexp_cursor::CLOSE :
00463 out << ")";
00464 break;
00465
00466 case regexp_cursor::CONCAT :
00467 out << " ";
00468 break;
00469
00470 case regexp_cursor::STAR :
00471 out << "*";
00472 break;
00473
00474 case regexp_cursor::QUESTION :
00475 out << "?";
00476 break;
00477
00478 case regexp_cursor::PLUS :
00479 out << "+";
00480 break;
00481
00482 case regexp_cursor::REPEAT :
00483 out << "{" << x.n << "," << x.m << "}";
00484 break;
00485
00486 default :
00487 break;
00488 }
00489 return out;
00490 }
00491
00492 inline
00493 void regexp_cursor::build_char_sets(map<string, bitset<256> >& char_sets) {
00494 bitset<256> alpha, alnum, ascii, blank, cntrl, digit, word;
00495 bitset<256> graph, lower, print, punct, space, upper, xdigit;
00496 for(int c = 0; c < 256; ++c) {
00497 alnum.set(c, isalnum(c) != 0);
00498 alpha.set(c, isalpha(c) != 0);
00499 ascii.set(c, isascii(c) != 0);
00500 cntrl.set(c, iscntrl(c) != 0);
00501 digit.set(c, isdigit(c) != 0);
00502 graph.set(c, isgraph(c) != 0);
00503 lower.set(c, islower(c) != 0);
00504 print.set(c, isprint(c) != 0);
00505 punct.set(c, ispunct(c) != 0);
00506 space.set(c, isspace(c) != 0);
00507 upper.set(c, isupper(c) != 0);
00508 xdigit.set(c, isxdigit(c) != 0);
00509 }
00510 char_sets["[:alnum:]"] = alnum;
00511 char_sets["[:^alnum:]"] = ~ char_sets["[:alnum:]"];
00512 char_sets["[:alpha:]"] = alpha;
00513 char_sets["[:^alpha:]"] = ~ char_sets["[:alpha:]"];
00514 char_sets["[:ascii:]"] = ascii;
00515 char_sets["[:^ascii:]"] = ~ char_sets["[:ascii:]"];
00516 blank.set((unsigned char) ' ');
00517 blank.set((unsigned char) '\t');
00518 char_sets["[:blank:]"] = blank;
00519 char_sets["[:^blank:]"] = ~ char_sets["[:blank:"];
00520 char_sets["[:cntrl:]"] = cntrl;
00521 char_sets["[:^cntrl:]"] = ~ char_sets["[:cntrl:]"];
00522 char_sets["[:digit:]"] = digit;
00523 char_sets["[:^digit:]"] = ~ char_sets["[:digit:]"];
00524 char_sets["[:graph:]"] = graph;
00525 char_sets["[:^graph:]"] = ~ char_sets["[:graph:]"];
00526 char_sets["[:lower:]"] = lower;
00527 char_sets["[:^lower:]"] = ~ char_sets["[:lower:]"];
00528 char_sets["[:print:]"] = print;
00529 char_sets["[:^print:]"] = ~ char_sets["[:print:]"];
00530 char_sets["[:punct:]"] = punct;
00531 char_sets["[:^punct:]"] = ~ char_sets["[:punct:]"];
00532 char_sets["[:space:]"] = space;
00533 char_sets["[:^space:]"] = ~ char_sets["[:space:]"];
00534 char_sets["[:upper:]"] = upper;
00535 char_sets["[:^upper:]"] = ~ char_sets["[:upper:]"];
00536 char_sets["[:xdigit:]"] = xdigit;
00537 char_sets["[:^xdigit:]"] = ~ char_sets["[:xdigit:]"];
00538 alnum.set((unsigned char) '_');
00539 char_sets["[:word:]"] = alnum;
00540 char_sets["[:^word:]"] = ~ char_sets["[:word:]"];
00541 char_sets["\\d"] = digit;
00542 char_sets["\\D"] = ~ char_sets["\\d"];
00543 char_sets["\\s"] = space;
00544 char_sets["\\S"] = ~ char_sets["\\s"];
00545 char_sets["\\w"] = alnum;
00546 char_sets["\\W"] = ~alnum;
00547 char_sets["\\t"] = bitset<256>().set((unsigned char) '\t');
00548 char_sets["\\n"] = bitset<256>().set((unsigned char) '\n');
00549 char_sets["\\r"] = bitset<256>().set((unsigned char) '\r');
00550 char_sets["\\f"] = bitset<256>().set((unsigned char) '\f');
00551 char_sets["\\a"] = bitset<256>().set((unsigned char) '\a');
00552 char_sets["\\e"] = bitset<256>().set((unsigned char) '\x1B');
00553 char_sets["\\v"] = bitset<256>().set((unsigned char) '\v');
00554 char_sets["\\V"] = ~ char_sets["\\v"];
00555 char_sets["\\h"] = bitset<256>().set((unsigned char) ' ');
00556 char_sets["\\H"] = ~ char_sets["\\h"];
00557 }
00558
00559
00560
00561 inline
00562 int regexp_cursor::next_char(bitset<256>& letter, const char *begin, const char*& first, const char *last,
00563 bool& literal, const map<string, bitset<256> >& char_sets, bool icase) {
00564 letter.reset();
00565
00566
00567 if ((*first == '$' || *first == '^') && !literal) {
00568 letter.set((unsigned char) 0);
00569 ++first;
00570 return -1;
00571 }
00572
00573 if (*first == '\\') {
00574 if (++first == last)
00575 return token_error(first - begin, "unterminated escaped sequence");
00576
00577
00578 if (*first == 'Q') {
00579 if (literal) {
00580 letter.set((unsigned char) '\\');
00581 return -1;
00582 }
00583 else {
00584 literal = true;
00585 if (++first == last)
00586 return token_error(first - begin, "unterminated quote sequence");
00587 return next_char(letter, begin, first, last, literal, char_sets, icase);
00588 }
00589 }
00590 if (*first == 'E') {
00591 if (!literal) {
00592 letter.set((unsigned char) 'E');
00593 if (icase)
00594 letter.set((unsigned char) 'e');
00595 }
00596 else literal = false;
00597 ++first;
00598 return -1;
00599 }
00600
00601 if (literal) {
00602 letter.set((unsigned char) '\\');
00603 return -1;
00604 }
00605
00606 if (*first == 'x') {
00607 if (++first == last)
00608 return token_error(first - begin, "\\x used with no following hex digits");
00609 int n = 0;
00610 if (*first >= '0' && *first <= '9') {
00611 n = 16 * (*first - '0');
00612 }
00613 else
00614 if (*first >= 'A' && *first <= 'F') {
00615 n = 16 * (10 + *first - 'A');
00616 }
00617 else
00618 if (*first >= 'a' && *first <= 'f') {
00619 n = 16 * (10 + *first - 'a');
00620 }
00621 else return token_error(first - begin, "\\x used with no following hex digits");
00622
00623 if (++first == last)
00624 return token_error(first - begin, "\\x used with only one following hex digit");
00625
00626 if (*first >= '0' && *first <= '9') {
00627 n += *first - '0';
00628 }
00629 else
00630 if (*first >= 'A' && *first <= 'F') {
00631 n += 10 + (*first - 'A');
00632 }
00633 else
00634 if (*first >= 'a' && *first <= 'f') {
00635 n += 10 + (*first - 'a');
00636 }
00637 else return token_error(first - begin, "\\x used with only one following hex digit");
00638 letter.set(n);
00639 ++first;
00640 return -1;
00641 }
00642
00643 if (*first == 'c') {
00644 if (++first == last || *first < 'A' || *first > 'Z')
00645 return token_error(first - begin, "\\c used with no following upper-case letter");
00646 letter.set(*first - 'A' + 1);
00647 ++first;
00648 return -1;
00649 }
00650
00651
00652 std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string("\\") + *first);
00653 if (c != char_sets.end()) {
00654 letter = c->second;
00655 ++first;
00656 return -1;
00657 }
00658
00659 if (*first == '.') {
00660 letter.set((unsigned char) '.');
00661 ++first;
00662 return -1;
00663 }
00664 }
00665
00666 if (*first == '.') {
00667 if (literal)
00668 letter.set((unsigned char) '.');
00669 else
00670 letter.set();
00671 }
00672 else
00673 if (icase && isalpha(*first)) {
00674 letter.set((unsigned char) tolower((unsigned char) *first));
00675 letter.set((unsigned char) toupper((unsigned char) *first));
00676 }
00677 else letter.set((unsigned char) *first);
00678 ++first;
00679 return -1;
00680 }
00681
00682
00683 template <typename OutputIterator>
00684 int regexp_cursor::tokenize(const char *first, const char *last, OutputIterator x)
00685 {
00686 static map<string, bitset<256> > char_sets;
00687 if (char_sets.empty())
00688 build_char_sets(char_sets);
00689
00690 *x++ = node(OPEN);
00691 node previous = node(OPEN);
00692
00693 bool literal = false;
00694 std::vector<char> parenthesis;
00695 bool icase = false;
00696
00697 for(const char *begin = first; first != last; ++first, ++x) {
00698 if (literal && *first != '\\') {
00699 if (previous.type != OPEN && previous.type != OR)
00700 *x++ = node(CONCAT);
00701 if (icase && isalpha(*first)) {
00702 bitset<256> c;
00703 c.set((unsigned char) tolower(*first));
00704 c.set((unsigned char) toupper(*first));
00705 *x = previous = node(c);
00706 }
00707 else
00708 *x = previous = node(*first);
00709 continue;
00710 }
00711
00712 switch (*first) {
00713 case '|' :
00714 if (first == begin)
00715 return token_error(0, "missing left part of alternative");
00716 *x = previous = node(OR);
00717 break;
00718
00719 case '(' :
00720 if (previous.type != OPEN && previous.type != OR)
00721 *x++ = node(CONCAT);
00722 *x = previous = node(OPEN);
00723 if (first + 2 < last && first[1] == '?' && first[2] == ':')
00724 first += 2;
00725 else
00726 if (first + 3 < last && first[1] == '?' &&
00727 first[2] == 'i' && first[3] == ':') {
00728 first += 3;
00729 if (icase == false) {
00730 parenthesis.push_back('i');
00731 icase = true;
00732 break;
00733 }
00734 }
00735 parenthesis.push_back('(');
00736 break;
00737
00738 case ')' :
00739 if (parenthesis.empty())
00740 return token_error(first - begin, "unbalanced parenthesis");
00741 if (parenthesis.back() == 'i')
00742 icase = false;
00743 parenthesis.pop_back();
00744 *x = previous = node(CLOSE);
00745 break;
00746
00747 case '*' :
00748 if (first == begin)
00749 return token_error(0, "missing expression before operator *");
00750 *x = previous = node(STAR);
00751 break;
00752
00753 case '?' :
00754 if (first == begin)
00755 return token_error(0, "missing expression before operator ?");
00756 *x = previous = node(QUESTION);
00757 break;
00758
00759 case '+' :
00760 if (first == begin)
00761 return token_error(0, "missing expression before operator +");
00762 *x = previous = node(PLUS);
00763 break;
00764
00765 case '{' :
00766 if (first == begin)
00767 return token_error(0, "missing expression before repeat operator");
00768 if (++first == last)
00769 return token_error(first - begin, "unterminated expression");
00770 {
00771 int n = 0;
00772 const char *tmp = first;
00773 while (isdigit(*first)) {
00774 n = n * 10 + *first - '0';
00775 if (++first == last)
00776 return token_error(first - begin, "unterminated repeat operator");
00777 }
00778 int m = n;
00779 if (*first == ',') {
00780 if (++first == last)
00781 return token_error(first - begin, "unterminated repeat operator");
00782 tmp = first;
00783 for(m = 0; isdigit(*first);) {
00784 m = m * 10 + *first - '0';
00785 if (++first == last)
00786 return token_error(first - begin, "unterminated repeat operator");
00787 }
00788 if (tmp == first) m = -1;
00789 }
00790 if (*first != '}')
00791 return token_error(first - begin, "invalid repeat operator");
00792
00793
00794 if (n > 32768 || m > 65535 || (n > m && m > -1) || (n == m && n == 0))
00795 return token_error(first - begin, "wrong range for repeat operator");
00796
00797
00798 if (n == 0 && m == 1)
00799 *x = node(QUESTION);
00800 else if (n == 0 && m == -1)
00801 *x = node(STAR);
00802 else if (n == 1 && m == -1)
00803 *x = node(PLUS);
00804 else if (!(n == 1 && m == 1))
00805 *x = node(REPEAT, n, m);
00806 }
00807 break;
00808
00809 case '[' :
00810 {
00811 if (previous.type != OPEN && previous.type != OR)
00812 *x++ = node(CONCAT);
00813 bitset<256> a;
00814 bool v = true;
00815 if (++first == last)
00816 return token_error(first - begin, "unterminated expression");
00817 if (*first == '^') {
00818 a.set();
00819 a.set(0, false);
00820 v = false;
00821 if (++first == last)
00822 return token_error(first - begin, "unterminated expression");
00823 }
00824 if (*first == '-') {
00825 a.set((unsigned char) '-', v);
00826 ++first;
00827 }
00828 else
00829 if (*first == ']') {
00830 a.set((unsigned char) ']', v);
00831 ++first;
00832 }
00833 for(; first != last && *first != ']'; ++first) {
00834 if (*first == '[') {
00835 if (++first == last) break;
00836 if (*first != ':') {
00837 a.set((unsigned char) '[', v);
00838 }
00839 else {
00840 const char *start = first - 1;
00841 first = std::find(++first, last, ']');
00842 if (first == last) break;
00843 std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string(start, first + 1));
00844 if (c == char_sets.end())
00845 return token_error(start - begin, "unknown character set");
00846 if (v == true)
00847 a |= c->second;
00848 else a &= ~c->second;
00849 continue;
00850 }
00851 }
00852
00853
00854
00855 bitset<256> tmp;
00856 if (*first != '-') {
00857 switch (*first) {
00858 case '^' :
00859 case '$' :
00860 tmp.set((unsigned char) *first++);
00861 break;
00862 default :
00863 {
00864 int r = next_char(tmp, begin, first, last, literal, char_sets, icase);
00865 if (r != -1)
00866 return r;
00867
00868 if (tmp.count() == 256) {
00869 tmp.reset();
00870 tmp.set((unsigned char) '.');
00871 }
00872 }
00873 }
00874 if (v == true)
00875 a |= tmp;
00876 else a &= ~ tmp;
00877 if (*first != '-') {
00878 --first;
00879 continue;
00880 }
00881 }
00882
00883 if (++first == last)
00884 return token_error(first - begin, "unterminated expression");
00885 if (*first == ']') {
00886 a.set((unsigned char) '-', v);
00887 break;
00888 }
00889 else {
00890 if (tmp.count() != 1) {
00891 return token_error(first - 2 - begin, "wrong character range: the lower bound cannot be a character set");
00892 }
00893 bitset<256> upper_bound;
00894 switch (*first) {
00895 case '^' :
00896 case '$' :
00897 upper_bound.set((unsigned char) *first);
00898 break;
00899 default :
00900 {
00901 int r = next_char(upper_bound, begin, first, last, literal, char_sets, icase);
00902 --first;
00903 if (r != -1)
00904 return token_error(first - begin, "wrong character range");
00905
00906 if (upper_bound.count() == 256) {
00907 upper_bound.reset();
00908 upper_bound.set((unsigned char) '.');
00909 }
00910 }
00911 }
00912 if (upper_bound.count() != 1)
00913 return token_error(first - begin, "wrong character range: the upper bound cannot be a character set");
00914
00915 unsigned int int_lower_bound = 0, int_upper_bound = 0;
00916 while (tmp.test(int_lower_bound) == false)
00917 ++int_lower_bound;
00918 while (upper_bound.test(int_upper_bound) == false)
00919 ++int_upper_bound;
00920
00921
00922 if (int_upper_bound < int_lower_bound)
00923 return token_error(first - begin, "wrong character range: lower bound cannot be greater than upper bound");
00924 else
00925 for(unsigned int i = int_lower_bound; i <= int_upper_bound; ++i)
00926 a.set(i, v);
00927 }
00928 }
00929
00930 if (first == last)
00931 return token_error(first - begin, "unterminated expression");
00932 *x = previous = node(a);
00933 break;
00934 }
00935
00936 default :
00937 bitset<256> next_letter;
00938 int r = next_char(next_letter, begin, first, last, literal, char_sets, icase);
00939 if (r != -1)
00940 return r;
00941 if (next_letter.any()) {
00942 if (previous.type != OPEN && previous.type != OR)
00943 *x++ = node(CONCAT);
00944 if (next_letter.count() == 256)
00945 #ifndef UTF8
00946 *x = previous = node(ANY);
00947 #else
00948 {
00949 *x++ = node(OPEN);
00950 next_letter.reset();
00951 for(int i = 1; i < 0x7F; ++i)
00952 next_letter.set(i);
00953 *x++ = node(next_letter);
00954 *x++ = node(OR);
00955 next_letter.reset();
00956 for(int i = 0; i < 32; ++i)
00957 next_letter.set(i | 192);
00958 *x++ = node(next_letter);
00959 *x++ = node(CONCAT);
00960 *x++ = node(ANY);
00961 *x = previous = node(CLOSE);
00962 }
00963 #endif
00964 else
00965 *x = previous = node(next_letter);
00966 }
00967 --first;
00968 break;
00969 }
00970 }
00971 *x++ = node(CLOSE);
00972 *x++ = node(CONCAT);
00973 *x++ = node(FINAL);
00974 return -1;
00975 }
00976
00977
00978 inline
00979 regexp_cursor::node*
00980 regexp_cursor::exp(vector<regexp_cursor::node>::iterator &first,
00981 vector<regexp_cursor::node>::iterator last)
00982 {
00983 if (first == last)
00984 return syntax_error(last - first, "empty expression");
00985 node *t = term(first, last);
00986 if (t == 0) return 0;
00987 return exp2(first, last, t);
00988 }
00989
00990
00991 inline
00992 regexp_cursor::node*
00993 regexp_cursor::exp2(vector<regexp_cursor::node>::iterator &first,
00994 vector<regexp_cursor::node>::iterator last, node *left)
00995 {
00996 if (first != last && first->type == OR) {
00997 node &root = *first;
00998 node *right = term(++first, last);
00999 if (right == 0) return 0;
01000 root.annulable = right->annulable || left->annulable;
01001
01002 root.premiere_pos.insert(left->premiere_pos.begin(),
01003 left->premiere_pos.end());
01004 root.premiere_pos.insert(right->premiere_pos.begin(),
01005 right->premiere_pos.end());
01006 root.derniere_pos.insert(left->derniere_pos.begin(),
01007 left->derniere_pos.end());
01008 root.derniere_pos.insert(right->derniere_pos.begin(),
01009 right->derniere_pos.end());
01010 return exp2(first, last, &root);
01011 }
01012 return left;
01013 }
01014
01015
01016 inline
01017 regexp_cursor::node*
01018 regexp_cursor::term(vector<regexp_cursor::node>::iterator &first,
01019 vector<regexp_cursor::node>::iterator last)
01020 {
01021 if (first == last)
01022 return syntax_error(last - first, "unterminated expression");
01023 node *f = form(first, last);
01024 if (f == 0) return 0;
01025 node *s = repeat(first, last, f);
01026 return term2(first, last, s);
01027 }
01028
01029
01030 inline
01031 regexp_cursor::node*
01032 regexp_cursor::repeat(vector<regexp_cursor::node>::iterator &first,
01033 vector<regexp_cursor::node>::iterator last, node *left)
01034 {
01035 if (first != last)
01036 switch (first->type) {
01037 case STAR :
01038 {
01039 node &root = *first;
01040 root.annulable = true;
01041 root.premiere_pos = left->premiere_pos;
01042 root.derniere_pos = left->derniere_pos;
01043
01044
01045 for(S::const_iterator i = root.derniere_pos.begin();
01046 i != root.derniere_pos.end(); ++i)
01047 (*i)->pos_suivante.insert(root.premiere_pos.begin(),
01048 root.premiere_pos.end());
01049 return repeat(++first, last, &root);
01050 }
01051 case QUESTION :
01052 {
01053 node &root = *first;
01054 root.annulable = true;
01055 root.premiere_pos = left->premiere_pos;
01056 root.derniere_pos = left->derniere_pos;
01057 return repeat(++first, last, &root);
01058 }
01059 case PLUS :
01060 {
01061 node &root = *first;
01062 root.annulable = false;
01063 root.premiere_pos = left->premiere_pos;
01064 root.derniere_pos = left->derniere_pos;
01065
01066
01067 for(S::const_iterator i = root.derniere_pos.begin();
01068 i != root.derniere_pos.end(); ++i)
01069 (*i)->pos_suivante.insert(root.premiere_pos.begin(),
01070 root.premiere_pos.end());
01071 return repeat(++first, last, &root);
01072 }
01073 default :
01074 break;
01075 }
01076 return left;
01077 }
01078
01079
01080
01081 inline
01082 regexp_cursor::node*
01083 regexp_cursor::term2(vector<regexp_cursor::node>::iterator &first,
01084 vector<regexp_cursor::node>::iterator last, node *left)
01085 {
01086 if (first != last && first->type == CONCAT) {
01087 node &root = *first;
01088 node *f = form(++first, last);
01089 if (f == 0) return 0;
01090 node *right = repeat(first, last, f);
01091 root.annulable = left->annulable && right->annulable;
01092 if (left->annulable) {
01093 root.premiere_pos.insert(left->premiere_pos.begin(),
01094 left->premiere_pos.end());
01095 root.premiere_pos.insert(right->premiere_pos.begin(),
01096 right->premiere_pos.end());
01097 }
01098 else
01099 root.premiere_pos = left->premiere_pos;
01100
01101 if (right->annulable) {
01102 root.derniere_pos.insert(right->derniere_pos.begin(),
01103 right->derniere_pos.end());
01104 root.derniere_pos.insert(left->derniere_pos.begin(),
01105 left->derniere_pos.end());
01106 }
01107 else
01108 root.derniere_pos = right->derniere_pos;
01109
01110 for(S::const_iterator i = left->derniere_pos.begin();
01111 i != left->derniere_pos.end(); ++i)
01112 (*i)->pos_suivante.insert(right->premiere_pos.begin(),
01113 right->premiere_pos.end());
01114 return term2(first, last, &root);
01115 }
01116 return left;
01117 }
01118
01119
01120
01121 inline
01122 regexp_cursor::node*
01123 regexp_cursor::form(vector<regexp_cursor::node>::iterator &first,
01124 vector<regexp_cursor::node>::iterator last)
01125 {
01126 if (first == last)
01127 return syntax_error(last - first, "unterminated expression");
01128 if (first->type == OPEN) {
01129 node *e = exp(++first, last);
01130 if (e == 0) return 0;
01131 if (first->type == CLOSE) ++first;
01132 else return syntax_error(last - first, "unbalanced parenthesis");
01133 return e;
01134 }
01135 first->annulable = false;
01136 first->premiere_pos.insert(&*first);
01137 first->derniere_pos.insert(&*first);
01138 ++first;
01139 return &(first[-1]);
01140 }
01141
01142 inline
01143 regexp_cursor regexpc(const string &exp) {
01144 return regexp_cursor(exp.c_str());
01145 }
01146
01147 inline
01148 regexp_cursor regexpc() {
01149 return regexp_cursor();
01150 }
01151
01152
01153 template <template <typename T, typename U> class DFA>
01154 class lazy_cursor<regexp_cursor, DFA> : public cursor_concept
01155 {
01156 public:
01157 typedef regexp_cursor Cursor;
01158 #ifdef USE_HASH_TABLE
01159 typedef DFA<Cursor::char_traits,
01160 const regexp_cursor::ordered_state_type*> DFA_type;
01161 #else
01162 typedef DFA<Cursor::char_traits,
01163 const regexp_cursor::state_type*> DFA_type;
01164 #endif
01165
01166 protected:
01167 #ifdef USE_HASH_TABLE
01168 typedef std::map<regexp_cursor::ordered_state_type, typename DFA_type::state_type> mapper;
01169 #else
01170 typedef std::map<regexp_cursor::state_type, typename DFA_type::state_type> mapper;
01171 #endif
01172
01173 Cursor c;
01174 smart_ptr<DFA_type> dfa;
01175 smart_ptr<mapper> mapping;
01176 typename DFA_type::state_type my_sink;
01177 typename DFA_type::state_type q;
01178
01179 public:
01180 typedef lazy_cursor self;
01181 typedef typename DFA_type::state_type state_type;
01182 typedef empty_tag tag_type;
01183 typedef Cursor::char_type char_type;
01184 typedef Cursor::char_traits char_traits;
01185
01186 lazy_cursor(const Cursor &x)
01187 : c(x), my_sink(dfa->new_state()), q(DFA_type::null_state)
01188 {
01189 if (!x.sink()) {
01190 q = dfa->new_state();
01191 #ifdef USE_HASH_TABLE
01192 dfa->tag(q) = &(mapping->insert(make_pair(regexp_cursor::ordered_state_type(c.src().begin(), c.src().end()), q)).first->first);
01193 #else
01194 dfa->tag(q) = &(mapping->insert(make_pair(c.src(), q)).first->first);
01195 #endif
01196 dfa->final(q) = c.src_final();
01197 dfa->initial(q);
01198 }
01199 }
01200
01201
01202
01203
01204
01205
01206
01207 const Cursor& adaptee() {
01208 if (q != DFA_type::null_state)
01209 c = *dfa->tag(q);
01210 return c;
01211 }
01212
01213 const DFA_type& cache() const {
01214 return *dfa;
01215 }
01216
01217 state_type src() const {
01218 return q;
01219 }
01220
01221 self& operator=(state_type p) {
01222 q = p;
01223 return *this;
01224 }
01225
01226 bool src_final() const {
01227 return dfa->final(q);
01228 }
01229
01230 empty_tag src_tag() const {
01231 return empty_tag();
01232 }
01233
01234 bool sink() const {
01235 return q == DFA_type::null_state;
01236 }
01237
01238 state_type sink_state() const {
01239 return DFA_type::null_state;
01240 }
01241
01242
01243
01244
01245
01246
01247
01248
01249
01250 bool forward(const char_type &a) {
01251 state_type tmp = dfa->delta1(q, a);
01252 if (tmp == my_sink) {
01253 q = DFA_type::null_state;
01254 return false;
01255 }
01256 if (tmp == DFA_type::null_state) {
01257 c = *dfa->tag(q);
01258 if (c.forward(a)) {
01259 #ifdef USE_HASH_TABLE
01260 std::pair<typename mapper::iterator, bool> p =
01261 mapping->insert(std::make_pair(regexp_cursor::ordered_state_type(c.src().begin(), c.src().end()), DFA_type::null_state));
01262 #else
01263 std::pair<typename mapper::iterator, bool> p =
01264 mapping->insert(std::make_pair(c.src(), DFA_type::null_state));
01265 #endif
01266 if (p.second == true) {
01267 tmp = dfa->new_state();
01268 dfa->final(tmp) = c.src_final();
01269 dfa->tag(tmp) = &(p.first->first);
01270 p.first->second = tmp;
01271 }
01272 else tmp = p.first->second;
01273 dfa->set_trans(q, a, tmp);
01274 }
01275 else {
01276 dfa->set_trans(q, a, my_sink);
01277 q = DFA_type::null_state;
01278 return false;
01279 }
01280 }
01281 q = tmp;
01282 return true;
01283 }
01284 };
01285
01286
01287
01288 inline
01289 bool match(lazy_cursor<regexp_cursor>& c, const char *w)
01290 {
01291 if (!c.forward('\0'))
01292 return false;
01293 for(; *w != '\0' && c.forward(*w); ++w)
01294 if (c.src_final()) return true;
01295 return *w == '\0' && c.forward('\0') && c.src_final();
01296 }
01297
01298 template <typename InputI>
01299 inline
01300 bool match(lazy_cursor<regexp_cursor>& c, InputI first, InputI last)
01301 {
01302 if (!c.forward('\0'))
01303 return false;
01304 for(; first != last && c.forward(*first); ++first)
01305 if (c.src_final()) return true;
01306 return first == last && c.forward('\0') && c.src_final();
01307 }
01308
01309 template <typename ForwardI>
01310 inline
01311 ForwardI first_match(lazy_cursor<regexp_cursor> &c, ForwardI first, ForwardI last)
01312 {
01313 if (!c.forward('\0')) return first;
01314 const ForwardI start = first;
01315 for(; first != last && c.forward(*first); ++first)
01316 if (c.src_final()) return ++first;
01317 return first == last && c.forward('\0') && c.src_final() ? first : start;
01318 }
01319
01320 inline
01321 const char*
01322 first_match(lazy_cursor<regexp_cursor> &c, const char *text)
01323 {
01324 if (!c.forward('\0')) return text;
01325 const char *start = text;
01326 for(; *text != '\0' && c.forward(*text); ++text)
01327 if (c.src_final()) return ++text;
01328 return *text == '\0' && c.forward('\0') && c.src_final() ? text : start;
01329 }
01330
01331
01332 inline
01333 ostream& operator<<(ostream& out, const regexp_cursor& c) {
01334 for(vector<regexp_cursor::node>::const_iterator n = c.e->begin(); n != c.e->end(); ++n)
01335 if (c.q.find(const_cast<astl::regexp_cursor::node*>(&*n)) != c.q.end())
01336 out << "\033[1m" << *n << "\033[0m";
01337 else
01338 out << *n;
01339 return out;
01340 }
01341
01342 static string regexp_fcursor_errmsg;
01343
01344
01345
01346 class regexp_fcursor : public forward_cursor_concept
01347 {
01348 protected:
01349 #ifdef _MSC_VER
01350 public:
01351 #endif
01352 struct node;
01353 typedef set<node*> S;
01354
01355 typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON,
01356 LETTER, OPEN, CLOSE, QUESTION, PLUS,
01357 ANY, FINAL } token_type;
01358
01359 struct node {
01360 token_type type;
01361 short letter;
01362 bool annulable;
01363 const char *pos;
01364 S premiere_pos, derniere_pos, pos_suivante;
01365
01366 node(token_type t, const char *p = 0)
01367 : type(t), letter(128),pos(p)
01368 { }
01369
01370 node(token_type type_, short letter_, const char *p = 0)
01371 : type(type_), letter(letter_), annulable(false), pos(p)
01372 { }
01373
01374 bool final() const {
01375 return type == FINAL;
01376 }
01377 };
01378
01379 S q;
01380 smart_ptr<vector<node> > e;
01381 short c;
01382
01383 node* exp(vector<node>::iterator &first, vector<node>::iterator last);
01384 node* form(vector<node>::iterator &first, vector<node>::iterator last);
01385 node* term(vector<node>::iterator &first, vector<node>::iterator last);
01386 node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01387 node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01388 node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01389 template <typename OutputIterator>
01390 int tokenize(const char *first, const char *last, OutputIterator x);
01391
01392 public:
01393 ptrdiff_t errpos;
01394
01395 friend ostream& print_node(ostream&, const node&);
01396 typedef regexp_fcursor self;
01397 typedef regexp_cursor super;
01398 typedef S state_type;
01399 typedef plain char_traits;
01400 typedef plain::char_type char_type;
01401 typedef empty_tag tag_type;
01402 typedef forward_cursor_concept concept;
01403
01404 regexp_fcursor(const char *expression = "")
01405 : errpos(-1)
01406 {
01407 if (strlen(expression) > 0) {
01408 e->reserve(8);
01409 tokenize(expression, expression + strlen(expression),
01410 back_inserter(*e));
01411 vector<node>::iterator first = e->begin();
01412 node *root = exp(first, e->end());
01413 if (root == 0)
01414 errpos = first->pos - expression;
01415 else
01416 q = root->premiere_pos;
01417 }
01418 }
01419
01420 bool sink() const {
01421 return q.empty();
01422 }
01423
01424 const string& error_message() const {
01425 return regexp_fcursor_errmsg;
01426 }
01427
01428 self& operator=(const state_type &s) {
01429 q = s;
01430 return *this;
01431 }
01432
01433 char_type letter() const {
01434 return (char_type) c;
01435 }
01436
01437 bool first() {
01438 c = 128;
01439 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
01440 if ((*i)->type == ANY) {
01441 c = -128;
01442 return true;
01443 }
01444 else
01445 c = min__(c, (*i)->letter);
01446 return c != 128;
01447 }
01448
01449 bool next() {
01450 short tmp = c;
01451 c = 128;
01452 for(S::const_iterator i = q.begin(); i != q.end(); ++i) {
01453 if ((*i)->type == ANY) {
01454 c = tmp + 1;
01455 break;
01456 }
01457 if ((*i)->letter > tmp)
01458 c = min__(c, (*i)->letter);
01459 }
01460 return c != 128;
01461 }
01462
01463 state_type src() const {
01464 return q;
01465 }
01466
01467 bool src_final() const {
01468 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
01469 if ((*i)->type == FINAL) return true;
01470 return false;
01471 }
01472
01473
01474 state_type aim() const {
01475 regexp_fcursor tmp = *this;
01476 tmp.forward();
01477 return tmp.src();
01478 }
01479
01480
01481 bool aim_final() const
01482 {
01483 self tmp = *this;
01484 tmp.forward();
01485 return tmp.src_final();
01486 }
01487
01488 void forward() {
01489 forward(c);
01490 }
01491
01492 bool exists(char a) const {
01493 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
01494 if ((*i)->type == ANY || (*i)->letter == a)
01495 return true;
01496 return false;
01497 }
01498
01499 bool forward(char a) {
01500 state_type p;
01501 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
01502 if ((*i)->type == ANY || (*i)->letter == a)
01503 p.insert((*i)->pos_suivante.begin(),
01504 (*i)->pos_suivante.end());
01505 q.swap(p);
01506 return !q.empty();
01507 }
01508
01509 tag_type aim_tag() const {
01510 return tag_type();
01511 }
01512
01513 bool operator==(const regexp_fcursor &y) const {
01514 return true;
01515 }
01516
01517 tag_type src_tag() const {
01518 return tag_type();
01519 }
01520 };
01521
01522
01523 template <typename OutputIterator>
01524 int regexp_fcursor::tokenize(const char *first, const char *last,
01525 OutputIterator x)
01526 {
01527 *x++ = node(OPEN);
01528 for(const char *begin = first; first != last; ++first, ++x)
01529 switch (*first) {
01530 case '|' :
01531 *x = node(OR, first);
01532 break;
01533
01534 case '(' :
01535 if (first != begin && first[-1] != '(' && first[-1] != '|')
01536 *x++ = node(CONCAT, *first);
01537 *x = node(OPEN, first);
01538 break;
01539
01540 case ')' :
01541 *x = node(CLOSE, first);
01542 break;
01543
01544 case '*' :
01545 *x = node(STAR, first);
01546 break;
01547
01548 case '?' :
01549 *x = node(QUESTION, first);
01550 break;
01551
01552 case '+' :
01553 *x = node(PLUS, first);
01554 break;
01555
01556 case '\\' :
01557 if (first != begin && first[-1] != '|' && first[-1] != '(')
01558 *x++ = node(CONCAT, first);
01559 if (++first == last) return first - begin;
01560 switch (*first) {
01561 case 'n' : *x = node(LETTER, '\n', first); break;
01562 case 't' : *x = node(LETTER, '\t', first); break;
01563 case 'r' : *x = node(LETTER, '\r', first); break;
01564 default : *x = node(LETTER, *first, first);
01565 }
01566 break;
01567
01568 default :
01569 if (first != begin && first[-1] != '|' && first[-1] != '(')
01570 *x++ = node(CONCAT, first);
01571 if (*first == '.') *x = node(ANY, first);
01572 else *x = node(LETTER, *first, first);
01573 break;
01574 }
01575 *x++ = node(CLOSE);
01576 *x++ = node(CONCAT);
01577 *x++ = node(FINAL);
01578 return 0;
01579 }
01580
01581
01582 inline
01583 regexp_fcursor::node*
01584 regexp_fcursor::exp(vector<regexp_fcursor::node>::iterator &first,
01585 vector<regexp_fcursor::node>::iterator last)
01586 {
01587 if (first == last) {
01588 regexp_fcursor_errmsg = "empty expression";
01589 return 0;
01590 }
01591 node *t = term(first, last);
01592 if (t == 0) return 0;
01593 return exp2(first, last, t);
01594 }
01595
01596
01597 inline
01598 regexp_fcursor::node*
01599 regexp_fcursor::exp2(vector<regexp_fcursor::node>::iterator &first,
01600 vector<regexp_fcursor::node>::iterator last, node *left)
01601 {
01602 if (first != last && first->type == OR) {
01603 node &root = *first;
01604 node *right = term(++first, last);
01605 if (right == 0) return 0;
01606 root.annulable = right->annulable || left->annulable;
01607
01608 root.premiere_pos.insert(left->premiere_pos.begin(),
01609 left->premiere_pos.end());
01610 root.premiere_pos.insert(right->premiere_pos.begin(),
01611 right->premiere_pos.end());
01612 root.derniere_pos.insert(left->derniere_pos.begin(),
01613 left->derniere_pos.end());
01614 root.derniere_pos.insert(right->derniere_pos.begin(),
01615 right->derniere_pos.end());
01616 return exp2(first, last, &root);
01617 }
01618 return left;
01619 }
01620
01621
01622 inline
01623 regexp_fcursor::node*
01624 regexp_fcursor::term(vector<regexp_fcursor::node>::iterator &first,
01625 vector<regexp_fcursor::node>::iterator last)
01626 {
01627 if (first == last) return 0;
01628 node *f = form(first, last);
01629 if (f == 0) return 0;
01630 node *s = repeat(first, last, f);
01631 return term2(first, last, s);
01632 }
01633
01634
01635 inline
01636 regexp_fcursor::node*
01637 regexp_fcursor::repeat(vector<regexp_fcursor::node>::iterator &first,
01638 vector<regexp_fcursor::node>::iterator last, node *left)
01639 {
01640 if (first != last)
01641 switch (first->type) {
01642 case STAR :
01643 {
01644 node &root = *first;
01645 root.annulable = true;
01646 root.premiere_pos = left->premiere_pos;
01647 root.derniere_pos = left->derniere_pos;
01648
01649
01650 for(S::const_iterator i = root.derniere_pos.begin();
01651 i != root.derniere_pos.end(); ++i)
01652 (*i)->pos_suivante.insert(root.premiere_pos.begin(),
01653 root.premiere_pos.end());
01654 return repeat(++first, last, &root);
01655 }
01656 case QUESTION :
01657 {
01658 node &root = *first;
01659 root.annulable = true;
01660 root.premiere_pos = left->premiere_pos;
01661 root.derniere_pos = left->derniere_pos;
01662 return repeat(++first, last, &root);
01663 }
01664 case PLUS :
01665 {
01666 node &root = *first;
01667 root.annulable = false;
01668 root.premiere_pos = left->premiere_pos;
01669 root.derniere_pos = left->derniere_pos;
01670
01671
01672 for(S::const_iterator i = root.derniere_pos.begin();
01673 i != root.derniere_pos.end(); ++i)
01674 (*i)->pos_suivante.insert(root.premiere_pos.begin(),
01675 root.premiere_pos.end());
01676 return repeat(++first, last, &root);
01677 }
01678 default :
01679 return left;
01680 }
01681 return left;
01682 }
01683
01684
01685 inline
01686 regexp_fcursor::node*
01687 regexp_fcursor::term2(vector<regexp_fcursor::node>::iterator &first,
01688 vector<regexp_fcursor::node>::iterator last, node *left)
01689 {
01690 if (first != last && first->type == CONCAT) {
01691 node &root = *first;
01692 node *f = form(++first, last);
01693 if (f == 0) return 0;
01694 node *right = repeat(first, last, f);
01695 root.annulable = left->annulable && right->annulable;
01696 if (left->annulable) {
01697 root.premiere_pos.insert(left->premiere_pos.begin(),
01698 left->premiere_pos.end());
01699 root.premiere_pos.insert(right->premiere_pos.begin(),
01700 right->premiere_pos.end());
01701 }
01702 else
01703 root.premiere_pos = left->premiere_pos;
01704
01705 if (right->annulable) {
01706 root.derniere_pos.insert(right->derniere_pos.begin(),
01707 right->derniere_pos.end());
01708 root.derniere_pos.insert(left->derniere_pos.begin(),
01709 left->derniere_pos.end());
01710 }
01711 else
01712 root.derniere_pos = right->derniere_pos;
01713
01714 for(S::const_iterator i = left->derniere_pos.begin();
01715 i != left->derniere_pos.end(); ++i)
01716 (*i)->pos_suivante.insert(right->premiere_pos.begin(),
01717 right->premiere_pos.end());
01718 return term2(first, last, &root);
01719 }
01720 return left;
01721 }
01722
01723
01724
01725 inline
01726 regexp_fcursor::node*
01727 regexp_fcursor::form(vector<regexp_fcursor::node>::iterator &first,
01728 vector<regexp_fcursor::node>::iterator last)
01729 {
01730 if (first == last) return 0;
01731 if (first->type == OPEN) {
01732 node *e = exp(++first, last);
01733 if (e == 0) return 0;
01734 if (first->type == CLOSE) ++first;
01735 else {
01736 regexp_fcursor_errmsg = "unbalanced parenthesis";
01737 return 0;
01738 }
01739 return e;
01740 }
01741 first->annulable = false;
01742 first->premiere_pos.insert(&*first);
01743 first->derniere_pos.insert(&*first);
01744 ++first;
01745 return &(first[-1]);
01746 }
01747
01748 inline
01749 regexp_fcursor regexpfc(const string &exp) {
01750 return regexp_fcursor(exp.c_str());
01751 }
01752
01753 inline
01754 std::ostream& print_node(std::ostream &out, const regexp_fcursor::node &x)
01755 {
01756 switch(x.type) {
01757 case regexp_fcursor::LETTER :
01758 out << "[" << x.letter << "]";
01759 break;
01760
01761 case regexp_fcursor::OR :
01762 out << "|";
01763 break;
01764
01765 case regexp_fcursor::FINAL :
01766 out << "[EoE]";
01767 break;
01768
01769 case regexp_fcursor::OPEN :
01770 out << "(";
01771 break;
01772
01773 case regexp_fcursor::CLOSE :
01774 out << ")";
01775 break;
01776
01777 case regexp_fcursor::CONCAT :
01778 out << " ";
01779 break;
01780
01781 case regexp_fcursor::STAR :
01782 out << "*";
01783 break;
01784
01785 case regexp_fcursor::QUESTION :
01786 out << "?";
01787 break;
01788
01789 case regexp_fcursor::PLUS :
01790 out << "+";
01791 break;
01792
01793 default :
01794 break;
01795 }
01796 return out;
01797 }
01798
01799 static const int EXTRACT_CURSOR_MAX_SUB_EXPRESSION = 16;
01800
01801 class extract_cursor : public cursor_concept
01802 {
01803 public:
01804 struct node;
01805 typedef set<node*> S;
01806 typedef pair<int, int> match;
01807 struct token
01808 {
01809 node *state;
01810 vector<match> submatches;
01811 int ccount;
01812 token(node *state_ = NULL)
01813 : state(state_), ccount(0)
01814 { }
01815 token(node *state_, const vector<match> &v, int count)
01816 : state(state_), submatches(v), ccount(count)
01817 { }
01818 bool operator<(const token &x) const {
01819 return state < x.state ||
01820 (state == x.state && ccount < x.ccount) ||
01821 (ccount == x.ccount && submatches < x.submatches);
01822 }
01823 };
01824
01825 typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON,
01826 LETTER, OPEN, CLOSE, QUESTION, PLUS,
01827 ANY, FINAL } token_type;
01828
01829 struct node {
01830 token_type type;
01831 bitset<256> letter;
01832 bool annulable;
01833 const char *pos;
01834 S premiere_pos, derniere_pos, pos_suivante;
01835 bitset<EXTRACT_CURSOR_MAX_SUB_EXPRESSION> begin, end;
01836 int subexpression;
01837
01838 node(token_type t, const char *p = 0, int sub = 0)
01839 : type(t), pos(p), subexpression(sub) {
01840 if (type == ANY) {
01841 type = LETTER;
01842 letter.set();
01843 }
01844 }
01845
01846 node(char t, const char *p = 0)
01847 : type(LETTER), annulable(false), pos(p) {
01848 letter.set((unsigned char) t);
01849 }
01850
01851 node(const bitset<256> &t, const char *p = 0)
01852 : type(LETTER), letter(t), annulable(false), pos(p)
01853 { }
01854
01855 bool final() const {
01856 return type == FINAL;
01857 }
01858
01859 char to_char() const {
01860 size_t i;
01861 for(i = 0; i != letter.size() && letter.test(i) == false; ++i);
01862 return (char) (-128 + i);
01863 }
01864 };
01865
01866 public:
01867 typedef set<token> state_type;
01868
01869 protected:
01870 set<token> q;
01871 smart_ptr<vector<node> > e;
01872 int sub_count;
01873 bitset<EXTRACT_CURSOR_MAX_SUB_EXPRESSION> statics;
01874
01875
01876 node* exp(vector<node>::iterator &first, vector<node>::iterator last);
01877 node* form(vector<node>::iterator &first, vector<node>::iterator last);
01878 node* term(vector<node>::iterator &first, vector<node>::iterator last);
01879 node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01880 node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01881 node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
01882 template <typename OutputIterator>
01883 int tokenize(const char *first, const char *last, OutputIterator x,
01884 bool kmp_style = false);
01885
01886 public:
01887 friend std::ostream& operator<<(std::ostream &, const
01888 extract_cursor::node &);
01889 ptrdiff_t errpos;
01890
01891 typedef extract_cursor self;
01892 typedef plain char_traits;
01893 typedef plain::char_type char_type;
01894 typedef vector<match> tag_type;
01895
01896 extract_cursor(const string &expression = ".", bool kmp_style = false,
01897 bool verbose = false)
01898 : sub_count(0), errpos(-1)
01899 {
01900 statics.set();
01901 e->reserve(128);
01902 tokenize(expression.c_str(), expression.c_str() + expression.size(),
01903 back_inserter(*e), kmp_style);
01904 if (verbose) {
01905 cout << "tokenized expression:" << endl;
01906 copy(e->begin(), e->end(), ostream_iterator<node>(cout));
01907 cout << endl;
01908 }
01909 vector<node>::iterator first = e->begin();
01910 node *root = exp(first, e->end());
01911 if (root == NULL)
01912 errpos = first->pos - expression.c_str();
01913 else {
01914 for(set<node*>::iterator i = root->premiere_pos.begin(); i != root->premiere_pos.end(); ++i)
01915 q.insert(token(*i));
01916 }
01917 }
01918
01919 ptrdiff_t error() const {
01920 return errpos;
01921 }
01922
01923 self& operator=(const state_type &s) {
01924 q = s;
01925 return *this;
01926 }
01927
01928 state_type src() const {
01929 return q;
01930 }
01931
01932 bool exists(char a) const {
01933 for(set<token>::const_iterator i = q.begin(); i != q.end(); ++i)
01934 if (i->state->letter[(unsigned char) a] == true)
01935 return true;
01936 return false;
01937 }
01938
01939 bool forward(char letter) {
01940 static vector<match> tmp(EXTRACT_CURSOR_MAX_SUB_EXPRESSION);
01941 set<token> p;
01942 for(set<token>::iterator i = q.begin(); i != q.end(); ++i) {
01943 if (i->state->letter[(unsigned char) letter] == true) {
01944 tmp = i->submatches;
01945 if (i->state->begin.any() || i->state->end.any())
01946 for(int j = 0; j < subexp_count(); ++j) {
01947 if (i->state->begin[j]) {
01948 if ((unsigned int) j >= tmp.size()) {
01949 tmp.resize(j + 1, match(-1, -1));
01950 tmp[j].first = i->ccount;
01951 }
01952 else
01953 if (tmp[j].first == -1 || statics[j] == false)
01954 tmp[j].first = i->ccount;
01955 }
01956 if (i->state->end[j])
01957 tmp[j].second = i->ccount + 1;
01958 }
01959 for(S::const_iterator k = i->state->pos_suivante.begin();
01960 k != i->state->pos_suivante.end(); ++k)
01961 p.insert(token(*k, tmp, i->ccount + 1));
01962 }
01963 }
01964 q.swap(p);
01965 return !q.empty();
01966 }
01967
01968 int subexp_count() const {
01969 return sub_count;
01970 }
01971
01972 vector<pair<int, int> > submatches() const {
01973 for(set<token>::const_iterator i = q.begin(); i != q.end(); ++i)
01974 if (i->state->final()) return i->submatches;
01975 return vector<match>();
01976 }
01977
01978 bool src_final() const {
01979 for(set<token>::const_iterator i = q.begin(); i != q.end(); ++i)
01980 if (i->state->final()) return true;
01981 return false;
01982 }
01983
01984 bool sink() const {
01985 return q.empty();
01986 }
01987
01988 tag_type src_tag() const {
01989 return submatches();
01990 }
01991 };
01992
01993 inline
01994 std::ostream& operator<<(std::ostream &out, const extract_cursor::node &x)
01995 {
01996 switch(x.type) {
01997 case extract_cursor::LETTER :
01998 out << "[";
01999 if (x.letter.count() == 256)
02000 out << "Sigma";
02001 else
02002 for(int i = 32; i < 256; ++i)
02003 if (x.letter.test(i) == true) out << (char) i;
02004 out << "]";
02005 break;
02006
02007 case extract_cursor::OR :
02008 out << "|";
02009 break;
02010
02011 case extract_cursor::FINAL :
02012 out << "[EoE]";
02013 break;
02014
02015 case extract_cursor::OPEN :
02016 out << "(";
02017 break;
02018
02019 case extract_cursor::CLOSE :
02020 out << ")";
02021 break;
02022
02023 case extract_cursor::CONCAT :
02024 out << " ";
02025 break;
02026
02027 case extract_cursor::STAR :
02028 out << "*";
02029 break;
02030
02031 case extract_cursor::QUESTION :
02032 out << "?";
02033 break;
02034
02035 case extract_cursor::PLUS :
02036 out << "+";
02037 break;
02038
02039
02040
02041
02042
02043 default :
02044 break;
02045 }
02046 return out;
02047 }
02048
02049
02050 template <typename OutputIterator>
02051 int extract_cursor::tokenize(const char *first, const char *last,
02052 OutputIterator x, bool kmp_style)
02053 {
02054 int subexpression = 0;
02055
02056 if (kmp_style) {
02057 *x++ = node(ANY);
02058 *x++ = node(STAR);
02059 *x++ = node(CONCAT);
02060 }
02061 *x++ = node(OPEN, NULL, subexpression++);
02062 for(const char *begin = first; first != last; ++first, ++x)
02063 switch (*first) {
02064 case '|' :
02065 *x = node(OR, first);
02066 break;
02067
02068 case '(' :
02069 if (first != begin && first[-1] != '(' && first[-1] != '|')
02070 *x++ = node(CONCAT, first);
02071 *x = node(OPEN, first, subexpression++);
02072 break;
02073
02074 case ')' :
02075 *x = node(CLOSE, first);
02076 break;
02077
02078 case '*' :
02079 *x = node(STAR, first);
02080 break;
02081
02082 case '?' :
02083 *x = node(QUESTION, first);
02084 break;
02085
02086 case '+' :
02087 *x = node(PLUS, first);
02088 break;
02089
02090
02091
02092
02093
02094
02095
02096
02097
02098
02099
02100
02101
02102
02103
02104
02105
02106
02107
02108
02109
02110
02111
02112
02113
02114
02115
02116
02117
02118 case '[' :
02119 {
02120 if (first != begin && first[-1] != '|' && first[-1] != '(')
02121 *x++ = node(CONCAT, first);
02122 bitset<256> a;
02123 bool v = true;
02124 if (++first == last) return first - begin;
02125 if (*first == '^') {
02126 a.set();
02127 v = false;
02128 if (++first == last) return first - begin;
02129 }
02130 while (first != last && *first != ']') {
02131 const unsigned char b = (unsigned char) *first;
02132 if (++first == last) break;
02133 if (*first != '-') {
02134 a.set(b, v);
02135 continue;
02136 }
02137 if (++first == last || *first == ']' || *first < b) return first - begin;
02138 for(unsigned int i = b; i <= (unsigned char) *first; ++i)
02139 a.set(i, v);
02140 ++first;
02141 }
02142 if (first == last) return first - begin;
02143 *x++ = node(a);
02144 break;
02145 }
02146
02147
02148
02149 case '\\' :
02150 if (first != begin && first[-1] != '|' && first[-1] != '(')
02151 *x++ = node(CONCAT, first);
02152 if (++first == last) return first - begin;
02153 switch (*first) {
02154 case 'n' : *x = node('\n', first); break;
02155 case 't' : *x = node('\t', first); break;
02156 case 'r' : *x = node('\r', first); break;
02157 default : *x = node(*first, first);
02158 }
02159 break;
02160
02161 default :
02162 if (first != begin && first[-1] != '|' && first[-1] != '(')
02163 *x++ = node(CONCAT, first);
02164 if (*first == '.') *x = node(ANY, first);
02165 else
02166 *x = node(*first, first);
02167 break;
02168 }
02169 *x++ = node(CLOSE);
02170 *x++ = node(CONCAT);
02171 *x++ = node(FINAL);
02172 return 0;
02173 }
02174
02175
02176 inline
02177 extract_cursor::node*
02178 extract_cursor::exp(vector<extract_cursor::node>::iterator &first,
02179 vector<extract_cursor::node>::iterator last)
02180 {
02181 if (first == last) return NULL;
02182 node *t = term(first, last);
02183 if (t == 0) return 0;
02184 return exp2(first, last, t);
02185 }
02186
02187
02188 inline
02189 extract_cursor::node*
02190 extract_cursor::exp2(vector<extract_cursor::node>::iterator &first,
02191 vector<extract_cursor::node>::iterator last, node *left)
02192 {
02193 if (first != last && first->type == OR) {
02194 node &root = *first;
02195 node *right = term(++first, last);
02196 if (right == 0) return 0;
02197 root.annulable = right->annulable || left->annulable;
02198
02199 root.premiere_pos.insert(left->premiere_pos.begin(),
02200 left->premiere_pos.end());
02201 root.premiere_pos.insert(right->premiere_pos.begin(),
02202 right->premiere_pos.end());
02203 root.derniere_pos.insert(left->derniere_pos.begin(),
02204 left->derniere_pos.end());
02205 root.derniere_pos.insert(right->derniere_pos.begin(),
02206 right->derniere_pos.end());
02207 return exp2(first, last, &root);
02208 }
02209 return left;
02210 }
02211
02212
02213 inline
02214 extract_cursor::node*
02215 extract_cursor::term(vector<extract_cursor::node>::iterator &first,
02216 vector<extract_cursor::node>::iterator last)
02217 {
02218 if (first == last) return 0;
02219 node *f = form(first, last);
02220 if (f == 0) return 0;
02221 node *s = repeat(first, last, f);
02222 return term2(first, last, s);
02223 }
02224
02225
02226 inline
02227 extract_cursor::node*
02228 extract_cursor::repeat(vector<extract_cursor::node>::iterator &first,
02229 vector<extract_cursor::node>::iterator last, node *left)
02230 {
02231 if (first != last)
02232 switch (first->type) {
02233 case STAR :
02234 {
02235 if (left->type == OPEN) statics &= ~left->begin;
02236
02237 node &root = *first;
02238 root.annulable = true;
02239 root.premiere_pos = left->premiere_pos;
02240 root.derniere_pos = left->derniere_pos;
02241
02242
02243 for(S::const_iterator i = root.derniere_pos.begin();
02244 i != root.derniere_pos.end(); ++i)
02245 (*i)->pos_suivante.insert(root.premiere_pos.begin(),
02246 root.premiere_pos.end());
02247 return repeat(++first, last, &root);
02248 }
02249 case QUESTION :
02250 {
02251 node &root = *first;
02252 root.annulable = true;
02253 root.premiere_pos = left->premiere_pos;
02254 root.derniere_pos = left->derniere_pos;
02255 return repeat(++first, last, &root);
02256 }
02257 case PLUS :
02258 {
02259 if (left->type == OPEN) statics &= ~left->begin;
02260
02261 node &root = *first;
02262 root.annulable = false;
02263 root.premiere_pos = left->premiere_pos;
02264 root.derniere_pos = left->derniere_pos;
02265
02266
02267 for(S::const_iterator i = root.derniere_pos.begin();
02268 i != root.derniere_pos.end(); ++i)
02269 (*i)->pos_suivante.insert(root.premiere_pos.begin(),
02270 root.premiere_pos.end());
02271 return repeat(++first, last, &root);
02272 }
02273 default :
02274 return left;
02275 }
02276 return left;
02277 }
02278
02279
02280
02281 inline
02282 extract_cursor::node*
02283 extract_cursor::term2(vector<extract_cursor::node>::iterator &first,
02284 vector<extract_cursor::node>::iterator last, node *left)
02285 {
02286 if (first != last && first->type == CONCAT) {
02287 node &root = *first;
02288 node *f = form(++first, last);
02289 if (f == 0) return 0;
02290 node *right = repeat(first, last, f);
02291 root.annulable = left->annulable && right->annulable;
02292 if (left->annulable) {
02293 root.premiere_pos.insert(left->premiere_pos.begin(),
02294 left->premiere_pos.end());
02295 root.premiere_pos.insert(right->premiere_pos.begin(),
02296 right->premiere_pos.end());
02297 }
02298 else
02299 root.premiere_pos = left->premiere_pos;
02300
02301 if (right->annulable) {
02302 root.derniere_pos.insert(right->derniere_pos.begin(),
02303 right->derniere_pos.end());
02304 root.derniere_pos.insert(left->derniere_pos.begin(),
02305 left->derniere_pos.end());
02306 }
02307 else
02308 root.derniere_pos = right->derniere_pos;
02309
02310 for(S::const_iterator i = left->derniere_pos.begin();
02311 i != left->derniere_pos.end(); ++i)
02312 (*i)->pos_suivante.insert(right->premiere_pos.begin(),
02313 right->premiere_pos.end());
02314 return term2(first, last, &root);
02315 }
02316 return left;
02317 }
02318
02319
02320
02321 inline
02322 extract_cursor::node*
02323 extract_cursor::form(vector<extract_cursor::node>::iterator &first,
02324 vector<extract_cursor::node>::iterator last)
02325 {
02326 if (first == last) return 0;
02327 if (first->type == OPEN) {
02328 int tmp = first->subexpression;
02329 vector<extract_cursor::node>::iterator tmp_first = first;
02330 node *e = exp(++first, last);
02331 if (e == 0) return 0;
02332 if (first->type == CLOSE) ++first;
02333 else return NULL;
02334
02335 if (tmp < EXTRACT_CURSOR_MAX_SUB_EXPRESSION) {
02336 for(S::iterator i = e->premiere_pos.begin();
02337 i != e->premiere_pos.end(); ++i)
02338 (*i)->begin[tmp] = true;
02339
02340 for(S::iterator ii = e->derniere_pos.begin();
02341 ii != e->derniere_pos.end(); ++ii)
02342 (*ii)->end[tmp] = true;
02343 sub_count = max__(sub_count, tmp + 1);
02344 }
02345 *tmp_first = *e;
02346 tmp_first->type = OPEN;
02347 return &*tmp_first;
02348 }
02349 first->annulable = false;
02350 first->premiere_pos.insert(&*first);
02351 first->derniere_pos.insert(&*first);
02352 ++first;
02353 return &(first[-1]);
02354 }
02355
02356 inline
02357 extract_cursor extractc(const string &exp, bool kmp_style) {
02358 return extract_cursor(exp.c_str(), kmp_style);
02359 }
02360
02361 inline
02362 extract_cursor extractc(const string &exp) {
02363 return extract_cursor(exp.c_str());
02364 }
02365
02366 inline
02367 extract_cursor extractc() {
02368 return extract_cursor();
02369 }
02370
02371 template <typename RandomAccessIterator>
02372 inline
02373 vector<pair<RandomAccessIterator, RandomAccessIterator> >
02374 extract(extract_cursor &c,
02375 RandomAccessIterator i, RandomAccessIterator j)
02376 {
02377 vector<pair<RandomAccessIterator, RandomAccessIterator> > r;
02378 return r;
02379 }
02380
02381 inline
02382 ostream& operator<<(ostream &out, const extract_cursor::token &x)
02383 {
02384 return out << x.state;
02385 }
02386
02387 }
02388
02389 #endif