00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef ASTL_EXPERIMENT_H
00023 #define ASTL_EXPERIMENT_H
00024
00025
00026
00027
00028
00029
00030 #include "astl.h"
00031 #include "lazy.h"
00032 #include <vector>
00033 #include <iostream>
00034 #include <set>
00035 #include <iterator>
00036 #include <cctype>
00037 #include <bitset>
00038 #include <cstring>
00039 #include <sstream>
00040
00041
00042
00043 #include <cassert>
00044
00045 #include <tr1/unordered_map>
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083 #ifdef USE_HASH_TABLE
00084 #undef USE_HASH_TABLE
00085 #endif
00086
00087 using namespace std;
00088
00089 namespace astl {
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102 class non_deterministic_regexp_capture_cursor : public cursor_concept
00103 {
00104 public:
00105 typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON,
00106 LETTER, OPEN, CLOSE, QUESTION, PLUS,
00107 ANY, OPENCAPTURE, CLOSECAPTURE, FINAL, GRANDFINAL } token_type;
00108 protected:
00109 class node;
00110
00111 public:
00112 class capture
00113 {
00114 public:
00115 capture(const node *n)
00116 : n(n)
00117 { }
00118
00119 int id() const { return n->n; }
00120 int index() const { return n->m; }
00121 token_type type() const { return n->type; }
00122
00123 bool operator<(const capture &c) const {
00124 return id() < c.id();
00125 }
00126
00127 bool operator==(const capture &c) const {
00128 return n == c.n;
00129 }
00130
00131 protected:
00132 const node *n;
00133 };
00134
00135 class configuration;
00136 friend ostream& operator<<(ostream&, const configuration&);
00137
00138 class configuration : public map<capture, long>
00139 {
00140 public:
00141 configuration& operator--() {
00142 for(iterator i = begin(); i != end(); ++i)
00143 --(i->second);
00144 return *this;
00145 }
00146
00147
00148 bool operator<(const configuration &y) const {
00149
00150 cout << "comparing " << *this << " and " << y << " == ";
00151
00152 if (&y == this || empty())
00153 {
00154 cout << "false" << endl;
00155 return false;
00156 }
00157
00158 const_iterator i, j;
00159 for(i = begin(), j = y.begin();
00160 i != end() && j != y.end() && i->first.id() == j->first.id() && i->second == j->second;
00161 ++i, ++j);
00162 if (i == end() && j == y.end())
00163 {
00164 cout << "false" << endl;
00165 return false;
00166 }
00167
00168
00169 if (i == end())
00170 {
00171 cout << "false" << endl;
00172 return false;
00173 }
00174
00175
00176 if (j == y.end())
00177 {
00178 cout << "true" << endl;
00179 return true;
00180 }
00181
00182
00183 if (i->first.id() != j->first.id())
00184 {
00185 cout << (i->first.id() < j->first.id() ? "true" : "false") << endl;
00186 return i->first.id() < j->first.id();
00187 }
00188
00189
00190
00191
00192
00193 if (i->first.type() == OPENCAPTURE)
00194 {
00195 cout << (i->second < j->second ? "true" : "false") << endl;
00196 return i->second < j->second;
00197 }
00198 cout << (j->second < i->second ? "true" : "false") << endl;
00199 return j->second < i->second;
00200 }
00201
00202
00203 configuration intersection(const configuration &x) const {
00204 cout << "intersection(" << *this << ", " << x << ") == ";
00205 if (empty()) {
00206 cout << *this << endl;
00207 return *this;
00208 }
00209 if (x.empty()) {
00210 cout << x << endl;
00211 return x;
00212 }
00213 configuration r;
00214 configuration::const_iterator i = begin(), j = x.begin();
00215 while (i != end() && j != x.end()) {
00216 if (i->first.id() < j->first.id()) {
00217 ++i;
00218 }
00219 else if (j->first.id() < i->first.id()) {
00220 ++j;
00221 }
00222 else {
00223 if (i->second == j->second) {
00224 r.insert(*j);
00225 }
00226 ++j;
00227 ++i;
00228 }
00229 }
00230 cout << r << endl;
00231 return r;
00232 }
00233
00234 };
00235
00236 protected:
00237 typedef node* position;
00238 typedef map<position, configuration> S;
00239
00240 struct node {
00241 bitset<256> letter;
00242 token_type type;
00243 short n, m;
00244 bool nullable;
00245 S firstpos, lastpos, nextpos;
00246 configuration emptymatch;
00247
00248
00249 node(token_type t, short n_ = 0, short m_ = 0)
00250 : type(t), n(n_), m(m_), nullable(false) {
00251 switch (type) {
00252 case FINAL :
00253 letter.set(0);
00254 break;
00255 case ANY :
00256 type = LETTER;
00257 letter.set();
00258
00259
00260
00261
00262
00263
00264
00265 break;
00266 case OPENCAPTURE :
00267 case CLOSECAPTURE :
00268 nullable = true;
00269 break;
00270 default :;
00271 }
00272 }
00273
00274 node(char t)
00275 : type(LETTER), nullable(false) {
00276 letter.set((unsigned char) t);
00277 }
00278
00279 node(const bitset<256> &t)
00280 : letter(t), type(LETTER), nullable(false)
00281 { }
00282
00283 bool final() const {
00284 return type == FINAL;
00285 }
00286
00287 char to_char() const {
00288 size_t i;
00289 for(i = 0; i != letter.size() && letter.test(i) == false; ++i);
00290 return (char) (-128 + i);
00291 }
00292
00293 string pretty() const {
00294 ostringstream r;
00295 r << "[ " << *this;
00296 for(S::const_iterator i = nextpos.begin(); i != nextpos.end(); ++i) {
00297 r << ", {";
00298 for(configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
00299 r << ' ' << j->first.id();
00300 r << " }";
00301 }
00302 r << " ]";
00303 return r.str();
00304 }
00305 };
00306
00307 protected:
00308 S q;
00309 smart_ptr<vector<node> > e;
00310 const char *errmsg;
00311 smart_ptr<S> I;
00312 unsigned char capture_count_;
00313 configuration intersection;
00314
00315 public:
00316 smart_ptr<horspool_finder> horspool;
00317
00318 protected:
00319 friend ostream& operator<<(ostream&, const non_deterministic_regexp_capture_cursor&);
00320
00321 node* exp(vector<node>::iterator &first, vector<node>::iterator last);
00322 node* form(vector<node>::iterator &first, vector<node>::iterator last);
00323 node* term(vector<node>::iterator &first, vector<node>::iterator last);
00324 node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00325 node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00326 node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);
00327 void preprocess_repeats();
00328 void build_char_sets(map<string, bitset<256> >&);
00329 template <typename OutputIterator>
00330 int tokenize(const char *first, const char *last, OutputIterator x);
00331
00332 public:
00333 friend ostream& operator<<(ostream &, const node&);
00334 ptrdiff_t errpos;
00335
00336 typedef non_deterministic_regexp_capture_cursor self;
00337 typedef S state_type;
00338 typedef set<node*> ordered_state_type;
00339 typedef plain char_traits;
00340 typedef char_traits::char_type char_type;
00341 typedef configuration tag_type;
00342
00343 non_deterministic_regexp_capture_cursor(const string &expression = "", bool horspool_optimized = false)
00344 : errmsg(NULL), capture_count_(0), errpos(-1)
00345 {
00346 if (!expression.empty()) {
00347
00348
00349 const string enhanced = string(".*(") + expression + ')';
00350
00351 e->reserve(128);
00352
00353
00354
00355 errpos = tokenize(enhanced.c_str(),
00356 enhanced.c_str() + enhanced.size(),
00357 back_inserter(*e));
00358 if (errpos == -1) {
00359 preprocess_repeats();
00360 vector<node>::iterator first = e->begin();
00361 node *root = exp(first, e->end());
00362
00363
00364 if (root != NULL) {
00365 q = root->firstpos;
00366 #if 0
00367 for(S::iterator i = q.begin(); i != q.end(); ++i)
00368
00369 if ((*i).first->letter.count() == 256 || (*i).first->letter.test(0) == false)
00370 I->insert(make_pair(i->first, i->second));
00371 #endif
00372 }
00373 else errpos = expression.size() - errpos;
00374 }
00375 }
00376 }
00377
00378 ptrdiff_t error() const {
00379 return errpos;
00380 }
00381
00382 const char* error_message() const { return errmsg; }
00383
00384 self& operator=(const state_type &s) {
00385 q = s;
00386 return *this;
00387 }
00388
00389 const state_type& src() const { return q; }
00390
00391 bool exists(char a) const {
00392 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00393 if ((*i).first->letter.test((unsigned char) a) == true)
00394 return true;
00395 return false;
00396 }
00397
00398 int capture_count() const { return capture_count_; }
00399
00400 bool forward(char letter) {
00401 state_type p;
00402 cout << string(80, '_') << endl << "forward(" << (int) (unsigned char) letter << ")\tI == ";
00403 for(S::const_iterator i = I->begin(); i != I->end(); ++i) {
00404 cout << '\t' << i->first << ':' << i->first->pretty() << ":=" << i->second;
00405 }
00406 cout << endl;
00407
00408 for(S::const_iterator i = q.begin(); i != q.end(); ++i) {
00409 cout << "\033[1m" << "position: " << i->first << ':' << i->first->pretty()
00410 << ":=" << i->second << "\033[0m" << endl;
00411 if (i->first->letter.test((unsigned char) letter) == true) {
00412 state_type::const_iterator target = i->first->nextpos.begin();
00413 const state_type::const_iterator last = i->first->nextpos.end();
00414 for(; target != last; ++target) {
00415 configuration tmp = i->second;
00416 --tmp;
00417
00418 configuration::const_iterator cfirst = target->second.begin(), clast = target->second.end();
00419 for(; cfirst != clast; ++cfirst) {
00420
00421
00422 tmp[cfirst->first] = 0;
00423 }
00424 configuration &conf = p[target->first];
00425 cout << "configurations dedup (one/position): ";
00426 if (tmp < conf) conf = tmp;
00427 }
00428 }
00429 }
00430 q.swap(p);
00431
00432 return !q.empty();
00433 }
00434
00435 bool src_final() const {
00436 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00437 if (i->first->final()) return true;
00438 return false;
00439 }
00440
00441 bool sink() const { return q.empty(); }
00442
00443 const tag_type& src_tag() const {
00444
00445 S::const_iterator m = q.end();
00446 for(S::const_iterator i = q.begin(); i != q.end(); ++i)
00447 if (m == q.end() || i->second < m->second)
00448 m = i;
00449 return m->second;
00450 }
00451
00452 void clear() {
00453 q.clear();
00454 e->clear();
00455 I->clear();
00456 }
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497 protected:
00498 ptrdiff_t token_error(ptrdiff_t pos, const char *message) {
00499 errmsg = message;
00500 errpos = pos;
00501 return pos;
00502 }
00503
00504 node* syntax_error(ptrdiff_t pos, const char *message = NULL) {
00505 errmsg = message;
00506 errpos = pos;
00507 return NULL;
00508 }
00509
00510 int next_char(bitset<256>& letter, const char *begin, const char*& first, const char *last,
00511 bool& literal, const map<string, bitset<256> >& char_sets, bool icase);
00512
00513 };
00514
00515 inline
00516 ostream& operator<<(ostream &out, const non_deterministic_regexp_capture_cursor::configuration &c) {
00517 for(non_deterministic_regexp_capture_cursor::configuration::const_iterator i = c.begin();
00518 i != c.end(); ++i)
00519 out << i->first.id() << ':' << i->second << ' ';
00520 return out;
00521 }
00522
00523 inline
00524 void non_deterministic_regexp_capture_cursor::preprocess_repeats() {
00525 vector<node> &v = *e;
00526 for(vector<node>::size_type p = 0; p < v.size();) {
00527 if (v[p].type != REPEAT) ++p;
00528 else {
00529
00530 vector<node>::size_type first = p - 1, last = p;
00531 for(; first > 0; --first) {
00532 if (v[first].type == LETTER || v[first].type == CLOSE || v[first].type == ANY)
00533 break;
00534 }
00535 if (v[first].type == CLOSE) {
00536 int count = 1;
00537 while(count != 0 && --first > 0)
00538 if (v[first].type == CLOSE) ++count;
00539 else if (v[first].type == OPEN) --count;
00540 }
00541
00542
00543
00544
00545 short n = (v[p].n > 128 ? 128 : v[p].n),
00546 m = (v[p].m > 128 ? 128 : v[p].m);
00547 v.insert(v.begin() + first, node(OPEN));
00548 ++p;
00549 ++last;
00550 ++first;
00551
00552 if (n == 0)
00553 v[p] = node(QUESTION);
00554 else
00555 v.erase(v.begin() + p);
00556 vector<node> tmp;
00557 if (m == -1)
00558 tmp.reserve((last + 1 - first) * n + 1);
00559 else
00560 tmp.reserve((last + 1 - first) * m + m - n + 1);
00561
00562 while(n > 0 || m > 0) {
00563 --n;
00564 --m;
00565 if (n <= 0) {
00566 if (m < n) {
00567 tmp.push_back(node(PLUS));
00568 continue;
00569 }
00570 if (m == 0)
00571 continue;
00572 }
00573 tmp.push_back(node(CONCAT));
00574 for(vector<node>::size_type tmp_p = first; tmp_p < last; ++tmp_p)
00575 tmp.push_back(v[tmp_p]);
00576 if (n <= 0)
00577 tmp.push_back(node(QUESTION));
00578 }
00579 tmp.push_back(node(CLOSE));
00580 v.insert(v.begin() + p, tmp.begin(), tmp.end());
00581 p += tmp.size();
00582 }
00583 }
00584 }
00585
00586 inline
00587 ostream& operator<<(ostream &out, const non_deterministic_regexp_capture_cursor::node &x)
00588 {
00589 switch(x.type) {
00590 case non_deterministic_regexp_capture_cursor::LETTER :
00591
00592
00593
00594
00595
00596 if (x.letter.count() == 256)
00597
00598
00599
00600
00601 out << ".";
00602 else {
00603 if (x.letter.count() > 1)
00604 out << '[';
00605 int lower = 0;
00606
00607 for(int i = 32; i < 257; ++i) {
00608 if (i < 256 && x.letter.test(i) == true) {
00609 if (lower == 0)
00610 lower = i;
00611 }
00612 else {
00613 if (lower > 0) {
00614 if (i - lower == 1) {
00615 out << (unsigned char) lower;
00616 }
00617 else if (i - lower == 2) {
00618 out << (unsigned char) lower << (unsigned char) (lower + 1);
00619 }
00620 else out << (unsigned char) lower << '-'
00621 << (unsigned char) (i - 1);
00622 lower = 0;
00623 }
00624 }
00625 }
00626 if (x.letter.count() > 1)
00627 out << ']';
00628 }
00629 break;
00630
00631 case non_deterministic_regexp_capture_cursor::OR :
00632 out << "|";
00633 break;
00634
00635 case non_deterministic_regexp_capture_cursor::FINAL :
00636 out << "[EoE]";
00637 break;
00638
00639 case non_deterministic_regexp_capture_cursor::GRANDFINAL :
00640 out << "[GF]";
00641 break;
00642
00643 case non_deterministic_regexp_capture_cursor::OPEN :
00644 out << "(";
00645 break;
00646
00647 case non_deterministic_regexp_capture_cursor::CLOSE :
00648 out << ")";
00649 break;
00650
00651 case non_deterministic_regexp_capture_cursor::CONCAT :
00652 out << " ";
00653 break;
00654
00655 case non_deterministic_regexp_capture_cursor::STAR :
00656 out << "*";
00657 break;
00658
00659 case non_deterministic_regexp_capture_cursor::QUESTION :
00660 out << "?";
00661 break;
00662
00663 case non_deterministic_regexp_capture_cursor::PLUS :
00664 out << "+";
00665 break;
00666
00667 case non_deterministic_regexp_capture_cursor::REPEAT :
00668 out << "{" << x.n << "," << x.m << "}";
00669 break;
00670
00671 case non_deterministic_regexp_capture_cursor::OPENCAPTURE :
00672 out << "_(" << x.n << "_";
00673 break;
00674
00675 case non_deterministic_regexp_capture_cursor::CLOSECAPTURE :
00676 out << "_)" << x.n << "_";
00677 break;
00678
00679 default :
00680 break;
00681 }
00682 return out;
00683 }
00684
00685 inline
00686 ostream& operator<<(ostream& out, const non_deterministic_regexp_capture_cursor& c) {
00687 for(vector<non_deterministic_regexp_capture_cursor::node>::const_iterator n = c.e->begin(); n != c.e->end(); ++n)
00688 if (c.q.find(const_cast<astl::non_deterministic_regexp_capture_cursor::node*>(&*n)) != c.q.end())
00689 out << "\033[1m" << *n << "\033[0m";
00690 else
00691 out << *n;
00692 return out;
00693 }
00694
00695 inline
00696 void non_deterministic_regexp_capture_cursor::build_char_sets(map<string, bitset<256> >& char_sets) {
00697 bitset<256> alpha, alnum, ascii, blank, cntrl, digit, word;
00698 bitset<256> graph, lower, print, punct, space, upper, xdigit;
00699 for(int c = 0; c < 256; ++c) {
00700 alnum.set(c, isalnum(c) != 0);
00701 alpha.set(c, isalpha(c) != 0);
00702 ascii.set(c, isascii(c) != 0);
00703 cntrl.set(c, iscntrl(c) != 0);
00704 digit.set(c, isdigit(c) != 0);
00705 graph.set(c, isgraph(c) != 0);
00706 lower.set(c, islower(c) != 0);
00707 print.set(c, isprint(c) != 0);
00708 punct.set(c, ispunct(c) != 0);
00709 space.set(c, isspace(c) != 0);
00710 upper.set(c, isupper(c) != 0);
00711 xdigit.set(c, isxdigit(c) != 0);
00712 }
00713 char_sets["[:alnum:]"] = alnum;
00714 char_sets["[:^alnum:]"] = ~ char_sets["[:alnum:]"];
00715 char_sets["[:alpha:]"] = alpha;
00716 char_sets["[:^alpha:]"] = ~ char_sets["[:alpha:]"];
00717 char_sets["[:ascii:]"] = ascii;
00718 char_sets["[:^ascii:]"] = ~ char_sets["[:ascii:]"];
00719 blank.set((unsigned char) ' ');
00720 blank.set((unsigned char) '\t');
00721 char_sets["[:blank:]"] = blank;
00722 char_sets["[:^blank:]"] = ~ char_sets["[:blank:"];
00723 char_sets["[:cntrl:]"] = cntrl;
00724 char_sets["[:^cntrl:]"] = ~ char_sets["[:cntrl:]"];
00725 char_sets["[:digit:]"] = digit;
00726 char_sets["[:^digit:]"] = ~ char_sets["[:digit:]"];
00727 char_sets["[:graph:]"] = graph;
00728 char_sets["[:^graph:]"] = ~ char_sets["[:graph:]"];
00729 char_sets["[:lower:]"] = lower;
00730 char_sets["[:^lower:]"] = ~ char_sets["[:lower:]"];
00731 char_sets["[:print:]"] = print;
00732 char_sets["[:^print:]"] = ~ char_sets["[:print:]"];
00733 char_sets["[:punct:]"] = punct;
00734 char_sets["[:^punct:]"] = ~ char_sets["[:punct:]"];
00735 char_sets["[:space:]"] = space;
00736 char_sets["[:^space:]"] = ~ char_sets["[:space:]"];
00737 char_sets["[:upper:]"] = upper;
00738 char_sets["[:^upper:]"] = ~ char_sets["[:upper:]"];
00739 char_sets["[:xdigit:]"] = xdigit;
00740 char_sets["[:^xdigit:]"] = ~ char_sets["[:xdigit:]"];
00741 alnum.set((unsigned char) '_');
00742 char_sets["[:word:]"] = alnum;
00743 char_sets["[:^word:]"] = ~ char_sets["[:word:]"];
00744 char_sets["\\d"] = digit;
00745 char_sets["\\D"] = ~ char_sets["\\d"];
00746 char_sets["\\s"] = space;
00747 char_sets["\\S"] = ~ char_sets["\\s"];
00748 char_sets["\\w"] = alnum;
00749 char_sets["\\W"] = ~alnum;
00750 char_sets["\\t"] = bitset<256>().set((unsigned char) '\t');
00751 char_sets["\\n"] = bitset<256>().set((unsigned char) '\n');
00752 char_sets["\\r"] = bitset<256>().set((unsigned char) '\r');
00753 char_sets["\\f"] = bitset<256>().set((unsigned char) '\f');
00754 char_sets["\\a"] = bitset<256>().set((unsigned char) '\a');
00755 char_sets["\\e"] = bitset<256>().set((unsigned char) '\x1B');
00756 char_sets["\\v"] = bitset<256>().set((unsigned char) '\v');
00757 char_sets["\\V"] = ~ char_sets["\\v"];
00758 char_sets["\\h"] = bitset<256>().set((unsigned char) ' ');
00759 char_sets["\\H"] = ~ char_sets["\\h"];
00760 }
00761
00762
00763
00764 inline
00765 int non_deterministic_regexp_capture_cursor::next_char(bitset<256>& letter, const char *begin, const char*& first,
00766 const char *last, bool& literal,
00767 const map<string, bitset<256> >& char_sets, bool icase) {
00768 letter.reset();
00769
00770
00771 if ((*first == '$' || *first == '^') && !literal) {
00772 letter.set((unsigned char) 0);
00773 ++first;
00774 return -1;
00775 }
00776
00777 if (*first == '\\') {
00778 if (++first == last)
00779 return token_error(first - begin, "unterminated escaped sequence");
00780
00781
00782 if (*first == 'Q') {
00783 if (literal) {
00784 letter.set((unsigned char) '\\');
00785 return -1;
00786 }
00787 else {
00788 literal = true;
00789 if (++first == last)
00790 return token_error(first - begin, "unterminated quote sequence");
00791 return next_char(letter, begin, first, last, literal, char_sets, icase);
00792 }
00793 }
00794 if (*first == 'E') {
00795 if (!literal) {
00796 letter.set((unsigned char) 'E');
00797 if (icase)
00798 letter.set((unsigned char) 'e');
00799 }
00800 else literal = false;
00801 ++first;
00802 return -1;
00803 }
00804
00805 if (literal) {
00806 letter.set((unsigned char) '\\');
00807 return -1;
00808 }
00809
00810 if (*first == 'x') {
00811 if (++first == last)
00812 return token_error(first - begin, "\\x used with no following hex digits");
00813 int n = 0;
00814 if (*first >= '0' && *first <= '9') {
00815 n = 16 * (*first - '0');
00816 }
00817 else
00818 if (*first >= 'A' && *first <= 'F') {
00819 n = 16 * (10 + *first - 'A');
00820 }
00821 else
00822 if (*first >= 'a' && *first <= 'f') {
00823 n = 16 * (10 + *first - 'a');
00824 }
00825 else return token_error(first - begin, "\\x used with no following hex digits");
00826
00827 if (++first == last)
00828 return token_error(first - begin, "\\x used with only one following hex digit");
00829
00830 if (*first >= '0' && *first <= '9') {
00831 n += *first - '0';
00832 }
00833 else
00834 if (*first >= 'A' && *first <= 'F') {
00835 n += 10 + (*first - 'A');
00836 }
00837 else
00838 if (*first >= 'a' && *first <= 'f') {
00839 n += 10 + (*first - 'a');
00840 }
00841 else return token_error(first - begin, "\\x used with only one following hex digit");
00842 letter.set(n);
00843 ++first;
00844 return -1;
00845 }
00846
00847 if (*first == 'c') {
00848 if (++first == last || *first < 'A' || *first > 'Z')
00849 return token_error(first - begin, "\\c used with no following upper-case letter");
00850 letter.set(*first - 'A' + 1);
00851 ++first;
00852 return -1;
00853 }
00854
00855
00856 std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string("\\") + *first);
00857 if (c != char_sets.end()) {
00858 letter = c->second;
00859 ++first;
00860 return -1;
00861 }
00862
00863 if (*first == '.') {
00864 letter.set((unsigned char) '.');
00865 ++first;
00866 return -1;
00867 }
00868 }
00869
00870 if (*first == '.') {
00871 if (literal)
00872 letter.set((unsigned char) '.');
00873 else {
00874 letter.set();
00875
00876
00877
00878
00879
00880
00881
00882
00883
00884
00885
00886 }
00887 }
00888 else
00889 if (icase && isalpha(*first)) {
00890 letter.set((unsigned char) tolower((unsigned char) *first));
00891 letter.set((unsigned char) toupper((unsigned char) *first));
00892 }
00893 else letter.set((unsigned char) *first);
00894 ++first;
00895 return -1;
00896 }
00897
00898
00899 template <typename OutputIterator>
00900 int non_deterministic_regexp_capture_cursor::tokenize(const char *first, const char *last, OutputIterator x)
00901 {
00902 static map<string, bitset<256> > char_sets;
00903 if (char_sets.empty())
00904 build_char_sets(char_sets);
00905
00906 *x++ = node(OPEN);
00907 node previous = node(OPEN);
00908
00909 bool literal = false;
00910 std::vector<char> parenthesis;
00911 bool icase = false;
00912 std::vector<int> open_captures;
00913 int current_open_capture = 0;
00914
00915 for(const char *begin = first; first != last; ++first, ++x) {
00916 if (literal && *first != '\\') {
00917 if (previous.type != OPEN && previous.type != OR)
00918 *x++ = node(CONCAT);
00919 if (icase && isalpha(*first)) {
00920 bitset<256> c;
00921 c.set((unsigned char) tolower(*first));
00922 c.set((unsigned char) toupper(*first));
00923 *x = previous = node(c);
00924 }
00925 else
00926 *x = previous = node(*first);
00927 continue;
00928 }
00929
00930 switch (*first) {
00931 case '|' :
00932 if (first == begin)
00933 return token_error(0, "missing left part of alternative");
00934 *x = previous = node(OR);
00935 break;
00936
00937 case '(' :
00938 if (previous.type != OPEN && previous.type != OR)
00939 *x++ = node(CONCAT);
00940 if (first + 2 < last && first[1] == '?' && first[2] == ':') {
00941 first += 2;
00942 parenthesis.push_back('(');
00943 }
00944 else {
00945 if (first + 3 < last && first[1] == '?' &&
00946 first[2] == 'i' && first[3] == ':') {
00947 first += 3;
00948 if (icase == false) {
00949 parenthesis.push_back('i');
00950 icase = true;
00951 }
00952 else parenthesis.push_back('(');
00953 }
00954 else {
00955 ++capture_count_;
00956 *x++ = node(OPEN);
00957 *x++ = node(OPENCAPTURE, current_open_capture, current_open_capture / 2);
00958 open_captures.push_back(current_open_capture);
00959 current_open_capture += 2;
00960 *x++ = node(CONCAT);
00961 parenthesis.push_back('c');
00962 }
00963 }
00964 *x = previous = node(OPEN);
00965 break;
00966
00967 case ')' :
00968 if (parenthesis.empty())
00969 return token_error(first - begin, "unbalanced parenthesis");
00970 if (parenthesis.back() == 'i')
00971 icase = false;
00972 else
00973 if (parenthesis.back() == 'c') {
00974 ++capture_count_;
00975 *x++ = node(CLOSE);
00976 *x++ = node(CONCAT);
00977 *x++ = node(CLOSECAPTURE, open_captures.back() + 1, open_captures.back() / 2);
00978 open_captures.pop_back();
00979 }
00980 *x = previous = node(CLOSE);
00981 parenthesis.pop_back();
00982 break;
00983
00984 case '*' :
00985 if (first == begin)
00986 return token_error(0, "missing expression before operator *");
00987 *x = previous = node(STAR);
00988 break;
00989
00990 case '?' :
00991 if (first == begin)
00992 return token_error(0, "missing expression before operator ?");
00993 *x = previous = node(QUESTION);
00994 break;
00995
00996 case '+' :
00997 if (first == begin)
00998 return token_error(0, "missing expression before operator +");
00999 *x = previous = node(PLUS);
01000 break;
01001
01002 case '{' :
01003 if (first == begin)
01004 return token_error(0, "missing expression before repeat operator");
01005 if (++first == last)
01006 return token_error(first - begin, "unterminated expression");
01007 {
01008 int n = 0;
01009 const char *tmp = first;
01010 while (isdigit(*first)) {
01011 n = n * 10 + *first - '0';
01012 if (++first == last)
01013 return token_error(first - begin, "unterminated repeat operator");
01014 }
01015 int m = n;
01016 if (*first == ',') {
01017 if (++first == last)
01018 return token_error(first - begin, "unterminated repeat operator");
01019 tmp = first;
01020 for(m = 0; isdigit(*first);) {
01021 m = m * 10 + *first - '0';
01022 if (++first == last)
01023 return token_error(first - begin, "unterminated repeat operator");
01024 }
01025 if (tmp == first) m = -1;
01026 }
01027 if (*first != '}')
01028 return token_error(first - begin, "invalid repeat operator");
01029
01030
01031 if (n > 32768 || m > 65535 || (n > m && m > -1) || (n == m && n == 0))
01032 return token_error(first - begin, "wrong range for repeat operator");
01033
01034
01035 if (n == 0 && m == 1)
01036 *x = node(QUESTION);
01037 else if (n == 0 && m == -1)
01038 *x = node(STAR);
01039 else if (n == 1 && m == -1)
01040 *x = node(PLUS);
01041 else if (!(n == 1 && m == 1))
01042 *x = node(REPEAT, n, m);
01043 }
01044 break;
01045
01046 case '[' :
01047 {
01048 if (previous.type != OPEN && previous.type != OPENCAPTURE && previous.type != OR)
01049 *x++ = node(CONCAT);
01050 bitset<256> a;
01051 bool v = true;
01052 if (++first == last)
01053 return token_error(first - begin, "unterminated expression");
01054 if (*first == '^') {
01055 a.set();
01056 a.set(0, false);
01057 v = false;
01058 if (++first == last)
01059 return token_error(first - begin, "unterminated expression");
01060 }
01061 if (*first == '-') {
01062 a.set((unsigned char) '-', v);
01063 ++first;
01064 }
01065 else
01066 if (*first == ']') {
01067 a.set((unsigned char) ']', v);
01068 ++first;
01069 }
01070 for(; first != last && *first != ']'; ++first) {
01071 if (*first == '[') {
01072 if (++first == last) break;
01073 if (*first != ':') {
01074 a.set((unsigned char) '[', v);
01075 }
01076 else {
01077 const char *start = first - 1;
01078 first = std::find(++first, last, ']');
01079 if (first == last) break;
01080 std::map<string, bitset<256> >::const_iterator c = char_sets.find(std::string(start, first + 1));
01081 if (c == char_sets.end())
01082 return token_error(start - begin, "unknown character set");
01083 if (v == true)
01084 a |= c->second;
01085 else a &= ~c->second;
01086 continue;
01087 }
01088 }
01089
01090
01091
01092 bitset<256> tmp;
01093 if (*first != '-') {
01094 switch (*first) {
01095 case '^' :
01096 case '$' :
01097 tmp.set((unsigned char) *first++);
01098 break;
01099 default :
01100 {
01101 int r = next_char(tmp, begin, first, last, literal, char_sets, icase);
01102 if (r != -1)
01103 return r;
01104
01105 if (tmp.count() == 256) {
01106 tmp.reset();
01107 tmp.set((unsigned char) '.');
01108 }
01109 }
01110 }
01111 if (v == true)
01112 a |= tmp;
01113 else a &= ~ tmp;
01114 if (*first != '-') {
01115 --first;
01116 continue;
01117 }
01118 }
01119
01120 if (++first == last)
01121 return token_error(first - begin, "unterminated expression");
01122 if (*first == ']') {
01123 a.set((unsigned char) '-', v);
01124 break;
01125 }
01126 else {
01127 if (tmp.count() != 1) {
01128 return token_error(first - 2 - begin, "wrong character range: the lower bound cannot be a character set");
01129 }
01130 bitset<256> upper_bound;
01131 switch (*first) {
01132 case '^' :
01133 case '$' :
01134 upper_bound.set((unsigned char) *first);
01135 break;
01136 default :
01137 {
01138 int r = next_char(upper_bound, begin, first, last, literal, char_sets, icase);
01139 --first;
01140 if (r != -1)
01141 return token_error(first - begin, "wrong character range");
01142
01143 if (upper_bound.count() == 256) {
01144 upper_bound.reset();
01145 upper_bound.set((unsigned char) '.');
01146 }
01147 }
01148 }
01149 if (upper_bound.count() != 1)
01150 return token_error(first - begin, "wrong character range: the upper bound cannot be a character set");
01151
01152 unsigned int int_lower_bound = 0, int_upper_bound = 0;
01153 while (tmp.test(int_lower_bound) == false)
01154 ++int_lower_bound;
01155 while (upper_bound.test(int_upper_bound) == false)
01156 ++int_upper_bound;
01157
01158
01159 if (int_upper_bound < int_lower_bound)
01160 return token_error(first - begin, "wrong character range: lower bound cannot be greater than upper bound");
01161 else
01162 for(unsigned int i = int_lower_bound; i <= int_upper_bound; ++i)
01163 a.set(i, v);
01164 }
01165 }
01166
01167 if (first == last)
01168 return token_error(first - begin, "unterminated expression");
01169 *x = previous = node(a);
01170 break;
01171 }
01172
01173 default :
01174 bitset<256> next_letter;
01175 int r = next_char(next_letter, begin, first, last, literal, char_sets, icase);
01176 if (r != -1)
01177 return r;
01178 if (next_letter.any()) {
01179 if (previous.type != OPEN && previous.type != OR)
01180 *x++ = node(CONCAT);
01181
01182
01183
01184 if (next_letter.count() == 256)
01185
01186
01187
01188 #ifndef UTF8
01189 *x = previous = node(ANY);
01190 #else
01191 {
01192 *x++ = node(OPEN);
01193 next_letter.reset();
01194 for(int i = 1; i < 0x7F; ++i)
01195 next_letter.set(i);
01196 *x++ = node(next_letter);
01197 *x++ = node(OR);
01198 next_letter.reset();
01199 for(int i = 0; i < 32; ++i)
01200 next_letter.set(i | 192);
01201 *x++ = node(next_letter);
01202 *x++ = node(CONCAT);
01203 *x++ = node(ANY);
01204 *x = previous = node(CLOSE);
01205 }
01206 #endif
01207 else
01208 *x = previous = node(next_letter);
01209 }
01210 --first;
01211 break;
01212 }
01213 }
01214 *x++ = node(CLOSE);
01215 *x++ = node(CONCAT);
01216 *x++ = node(FINAL);
01217 *x++ = node(CONCAT);
01218 *x++ = node(GRANDFINAL);
01219 return -1;
01220 }
01221
01222
01223 inline
01224 non_deterministic_regexp_capture_cursor::node*
01225 non_deterministic_regexp_capture_cursor::exp(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first,
01226 vector<non_deterministic_regexp_capture_cursor::node>::iterator last)
01227 {
01228 if (first == last)
01229 return syntax_error(last - first, "empty expression");
01230 node *t = term(first, last);
01231 assert(t != 0);
01232 if (t == 0) return 0;
01233 return exp2(first, last, t);
01234 }
01235
01236
01237 inline
01238 non_deterministic_regexp_capture_cursor::node*
01239 non_deterministic_regexp_capture_cursor::exp2(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first,
01240 vector<non_deterministic_regexp_capture_cursor::node>::iterator last, node *left)
01241 {
01242 assert(left != 0);
01243 if (first != last && first->type == OR) {
01244 node &root = *first;
01245 node *right = term(++first, last);
01246 if (right == 0) return 0;
01247 root.nullable = right->nullable || left->nullable;
01248
01249 root.firstpos = left->firstpos;
01250 root.firstpos.insert(right->firstpos.begin(), right->firstpos.end());
01251 root.lastpos = left->lastpos;
01252 root.lastpos.insert(right->lastpos.begin(), right->lastpos.end());
01253 if (left->nullable)
01254 root.emptymatch = left->emptymatch;
01255 else
01256 root.emptymatch = right->emptymatch;
01257 return exp2(first, last, &root);
01258 }
01259 return left;
01260 }
01261
01262
01263 inline
01264 non_deterministic_regexp_capture_cursor::node*
01265 non_deterministic_regexp_capture_cursor::term(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first,
01266 vector<non_deterministic_regexp_capture_cursor::node>::iterator last)
01267 {
01268 if (first == last)
01269 return syntax_error(last - first, "unterminated expression");
01270 node *f = form(first, last);
01271 assert(f != 0);
01272 if (f == 0) return 0;
01273 node *s = repeat(first, last, f);
01274 return term2(first, last, s);
01275 }
01276
01277
01278 inline
01279 non_deterministic_regexp_capture_cursor::node*
01280 non_deterministic_regexp_capture_cursor::repeat(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first,
01281 vector<non_deterministic_regexp_capture_cursor::node>::iterator last, node *left)
01282 {
01283 assert(left != 0);
01284 if (first != last)
01285 switch (first->type) {
01286 case STAR :
01287 case PLUS :
01288 {
01289 node &root = *first;
01290 root.nullable = first->type == STAR;
01291 root.firstpos = left->firstpos;
01292 root.lastpos = left->lastpos;
01293
01294 if (left->nullable)
01295 root.emptymatch = left->emptymatch;
01296
01297
01298 for(S::const_iterator i = root.lastpos.begin(); i != root.lastpos.end(); ++i)
01299 for(S::const_iterator j = root.firstpos.begin(); j != root.firstpos.end(); ++j) {
01300 configuration tmp = i->second;
01301 tmp.insert(j->second.begin(), j->second.end());
01302 (*i).first->nextpos.insert(make_pair(j->first, tmp));
01303 }
01304 return repeat(++first, last, &root);
01305 }
01306 case QUESTION :
01307 {
01308 node &root = *first;
01309 root.nullable = true;
01310 root.firstpos = left->firstpos;
01311 root.lastpos = left->lastpos;
01312 if (left->nullable)
01313 root.emptymatch = left->emptymatch;
01314 return repeat(++first, last, &root);
01315 }
01316 default :
01317 break;
01318 }
01319 return left;
01320 }
01321
01322
01323
01324 inline
01325 non_deterministic_regexp_capture_cursor::node*
01326 non_deterministic_regexp_capture_cursor::term2(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first,
01327 vector<non_deterministic_regexp_capture_cursor::node>::iterator last, node *left)
01328 {
01329 if (first != last && first->type == CONCAT) {
01330 node &root = *first;
01331 node *f = form(++first, last);
01332 assert(f != 0);
01333 if (f == 0) return 0;
01334 node *right = repeat(first, last, f);
01335 root.nullable = left->nullable && right->nullable;
01336
01337 root.firstpos = left->firstpos;
01338 if (left->nullable) {
01339 for(S::const_iterator s = right->firstpos.begin(); s != right->firstpos.end(); ++s) {
01340 configuration tmp = s->second;
01341 tmp.insert(left->emptymatch.begin(), left->emptymatch.end());
01342 root.firstpos.insert(make_pair(s->first, tmp));
01343 }
01344 }
01345
01346 root.lastpos = right->lastpos;
01347 if (right->nullable) {
01348 for(S::const_iterator s = left->lastpos.begin(); s != left->lastpos.end(); ++s) {
01349 configuration tmp = s->second;
01350 tmp.insert(right->emptymatch.begin(), right->emptymatch.end());
01351 root.lastpos.insert(make_pair(s->first, tmp));
01352 }
01353 }
01354
01355 root.emptymatch = left->emptymatch;
01356 root.emptymatch.insert(right->emptymatch.begin(), right->emptymatch.end());
01357
01358
01359 for(S::const_iterator i = left->lastpos.begin(); i != left->lastpos.end(); ++i)
01360 for(S::const_iterator j = right->firstpos.begin(); j != right->firstpos.end(); ++j) {
01361 configuration tmp = j->second;
01362 tmp.insert(i->second.begin(), i->second.end());
01363 i->first->nextpos.insert(make_pair(j->first, tmp));
01364 }
01365 return term2(first, last, &root);
01366 }
01367 return left;
01368 }
01369
01370
01371
01372 inline
01373 non_deterministic_regexp_capture_cursor::node*
01374 non_deterministic_regexp_capture_cursor::form(vector<non_deterministic_regexp_capture_cursor::node>::iterator &first,
01375 vector<non_deterministic_regexp_capture_cursor::node>::iterator last)
01376 {
01377 if (first == last)
01378 return syntax_error(last - first, "unterminated expression");
01379 if (first->type == OPEN) {
01380 node *e = exp(++first, last);
01381 assert(e != 0);
01382 if (e == 0) return 0;
01383 if (first->type == CLOSE) ++first;
01384 else return syntax_error(last - first, "unbalanced parenthesis");
01385 return e;
01386 }
01387 if (first->type == OPENCAPTURE || first->type == CLOSECAPTURE) {
01388 first->emptymatch[&*first] = 0;
01389 ++first;
01390 return &(first[-1]);
01391 }
01392 first->nullable = false;
01393 first->firstpos.insert(make_pair(&*first, configuration()));
01394 first->lastpos.insert(make_pair(&*first, configuration()));
01395 ++first;
01396 return &(first[-1]);
01397 }
01398
01399
01400
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410 class capture_cursor : public non_deterministic_regexp_capture_cursor
01411 {
01412 public:
01413 typedef capture_cursor self;
01414 typedef non_deterministic_regexp_capture_cursor super;
01415 typedef configuration tag_type;
01416
01417 capture_cursor(const string &exp, bool use_horspool)
01418 : super(exp, use_horspool)
01419 { }
01420
01421 bool forward(char_type c) {
01422 const bool r = super::forward(c);
01423 intersection.clear();
01424
01425 if (q.size() == 1) {
01426
01427 intersection.swap(q.begin()->second);
01428 cout << "INTERSECTION " << "\033[47;31m" << intersection << "\033[0m" << endl;
01429 return r;
01430 }
01431 else {
01432
01433 for(state_type::const_iterator i = q.begin(); i != q.end(); ++i) {
01434
01435
01436
01437
01438 if (i == q.begin())
01439 intersection = i->second;
01440 else
01441 intersection = intersection.intersection(i->second);
01442 }
01443
01444
01445
01446
01447
01448 }
01449 cout << "INTERSECTION " << "\033[47;31m" << intersection << "\033[0m" << endl;
01450 for(state_type::iterator i = q.begin(); i != q.end(); ++i) {
01451 configuration diff;
01452 set_difference(i->second.begin(), i->second.end(), intersection.begin(), intersection.end(), inserter(diff, diff.end()));
01453 i->second = diff;
01454 cout << "\033[1m" << "remain: " << i->first << ':' << i->first->pretty()
01455 << ":=" << i->second << "\033[0m" << endl;
01456 }
01457 return r;
01458 }
01459
01460 void finalize() {
01461 super::forward('\0');
01462 state_type::const_iterator best = q.end();
01463 for(state_type::const_iterator i = q.begin(); i != q.end(); ++i) {
01464 if (i->first->type == GRANDFINAL &&
01465 (best == q.end() || i->second < best->second))
01466 best = i;
01467 }
01468 intersection = best->second;
01469 }
01470
01471 self& operator=(const state_type &s) {
01472 super::operator=(s);
01473 return *this;
01474 }
01475
01476 const tag_type& src_tag() const {
01477 return intersection;
01478 }
01479
01480 protected:
01481 configuration intersection;
01482 };
01483
01484
01485 template <>
01486 class lazy_cursor<capture_cursor, DFT_matrix_mini> : public cursor_concept
01487 {
01488 public:
01489 typedef capture_cursor Cursor;
01490 #ifdef USE_HASH_TABLE
01491 typedef DFT_matrix_mini<const capture_cursor::ordered_state_type*,
01492 capture_cursor::tag_type> DFA_type;
01493 #else
01494 typedef DFT_matrix_mini<const capture_cursor::state_type*,
01495 capture_cursor::tag_type> DFA_type;
01496 #endif
01497
01498 protected:
01499 #ifdef USE_HASH_TABLE
01500 typedef std::map<capture_cursor::ordered_state_type, DFA_type::state_type> mapper;
01501 #else
01502 typedef std::map<capture_cursor::state_type, DFA_type::state_type> mapper;
01503 #endif
01504
01505 typedef unsigned long offset_type;
01506
01507 Cursor c;
01508 smart_ptr<DFA_type> dfa;
01509 smart_ptr<mapper> mapping;
01510 DFA_type::state_type my_sink;
01511 DFA_type::state_type q;
01512
01513 public:
01514 typedef lazy_cursor self;
01515 typedef DFA_type::state_type state_type;
01516 typedef Cursor::char_type char_type;
01517 typedef Cursor::char_traits char_traits;
01518 typedef Cursor::configuration configuration;
01519 typedef configuration tag_type;
01520
01521 protected:
01522 const configuration *conf;
01523
01524 public:
01525 lazy_cursor(const Cursor &x)
01526 : c(x), my_sink(dfa->new_state()), q(DFA_type::null_state), conf(NULL)
01527 {
01528 if (!x.sink()) {
01529 q = dfa->new_state();
01530 #ifdef USE_HASH_TABLE
01531 dfa->tag(q) = &(mapping->insert(make_pair(capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), q)).first->first);
01532 #else
01533 dfa->tag(q) = &(mapping->insert(make_pair(c.src(), q)).first->first);
01534 #endif
01535 dfa->final(q) = c.src_final();
01536 dfa->initial(q);
01537 }
01538 }
01539
01540 int capture_count() const { return c.capture_count(); }
01541
01542 const Cursor& adaptee() {
01543 if (q != DFA_type::null_state)
01544 c = capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01545 return c;
01546 }
01547
01548 const DFA_type& cache() const {
01549 return *dfa;
01550 }
01551
01552 state_type src() const {
01553 return q;
01554 }
01555
01556 self& operator=(state_type p) {
01557 q = p;
01558 return *this;
01559 }
01560
01561 bool src_final() const {
01562 return dfa->final(q);
01563 }
01564
01565
01566 const tag_type& src_tag() const {
01567 static const configuration empty;
01568 if (conf != NULL)
01569 return *conf;
01570 return empty;
01571 }
01572
01573 bool sink() const {
01574 return q == DFA_type::null_state;
01575 }
01576
01577 state_type sink_state() const {
01578 return DFA_type::null_state;
01579 }
01580
01581 bool exists(const char_type &a) const {
01582 state_type aim = dfa->delta1(q, a);
01583 if (aim == DFA_type::null_state) {
01584 Cursor tmp = c;
01585 #ifdef USE_HASH_TABLE
01586 tmp = capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01587 #else
01588 tmp = *dfa->tag(q);
01589 #endif
01590 return tmp.exists(a);
01591 }
01592 return aim != my_sink;
01593 }
01594
01595 bool forward(const char_type &a) {
01596 state_type tmp = dfa->delta1(q, a);
01597 if (tmp == my_sink) {
01598 q = DFA_type::null_state;
01599 return false;
01600 }
01601 if (tmp == DFA_type::null_state) {
01602 c = capture_cursor::state_type(dfa->tag(q)->begin(), dfa->tag(q)->end());
01603 if (c.forward(a)) {
01604 #ifdef USE_HASH_TABLE
01605 std::pair<mapper::iterator, bool> p =
01606 mapping->insert(std::make_pair(capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), DFA_type::null_state));
01607 #else
01608 std::pair<mapper::iterator, bool> p =
01609 mapping->insert(std::make_pair(c.src(), DFA_type::null_state));
01610 #endif
01611 if (p.second == true) {
01612 tmp = dfa->new_state();
01613 dfa->final(tmp) = c.src_final();
01614 dfa->tag(tmp) = &(p.first->first);
01615 p.first->second = tmp;
01616 }
01617 else tmp = p.first->second;
01618 dfa->set_trans(q, a, tmp);
01619 if (!c.src_tag().empty()) {
01620
01621 dfa->set_output(q, a, c.src_tag());
01622 }
01623 }
01624 else {
01625 dfa->set_trans(q, a, my_sink);
01626 q = DFA_type::null_state;
01627 return false;
01628 }
01629 }
01630
01631
01632
01633
01634
01635
01636
01637
01638
01639
01640 conf = dfa->output(q, a);
01641
01642 q = tmp;
01643 return true;
01644 }
01645 };
01646
01647 #if 0
01648 template <>
01649 class lazy_cursor<non_deterministic_regexp_capture_cursor, DFT_matrix_mini> : public cursor_concept
01650 {
01651 public:
01652 typedef non_deterministic_regexp_capture_cursor Cursor;
01653 #ifdef USE_HASH_TABLE
01654 typedef DFT_matrix_mini<const non_deterministic_regexp_capture_cursor::ordered_state_type*,
01655 non_deterministic_regexp_capture_cursor::tag_type> DFA_type;
01656 #else
01657 typedef DFT_matrix_mini<const non_deterministic_regexp_capture_cursor::state_type*,
01658 non_deterministic_regexp_capture_cursor::tag_type> DFA_type;
01659 #endif
01660
01661 protected:
01662 #ifdef USE_HASH_TABLE
01663 typedef std::map<non_deterministic_regexp_capture_cursor::ordered_state_type, DFA_type::state_type> mapper;
01664 #else
01665 typedef std::map<non_deterministic_regexp_capture_cursor::state_type, DFA_type::state_type> mapper;
01666 #endif
01667
01668 typedef unsigned long offset_type;
01669
01670 Cursor c;
01671 smart_ptr<DFA_type> dfa;
01672 smart_ptr<mapper> mapping;
01673 smart_ptr<vector<offset_type> > captures;
01674 DFA_type::state_type my_sink;
01675 DFA_type::state_type q;
01676 offset_type offset;
01677
01678 public:
01679 typedef lazy_cursor self;
01680 typedef DFA_type::state_type state_type;
01681 typedef vector<offset_type> tag_type;
01682 typedef Cursor::char_type char_type;
01683 typedef Cursor::char_traits char_traits;
01684
01685 lazy_cursor(const Cursor &x)
01686 : c(x), my_sink(dfa->new_state()), q(DFA_type::null_state), offset(0)
01687 {
01688 if (!x.sink()) {
01689 q = dfa->new_state();
01690 #ifdef USE_HASH_TABLE
01691 dfa->tag(q) = &(mapping->insert(make_pair(non_deterministic_regexp_capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), q)).first->first);
01692 #else
01693 dfa->tag(q) = &(mapping->insert(make_pair(c.src(), q)).first->first);
01694 #endif
01695 dfa->final(q) = c.src_final();
01696 dfa->initial(q);
01697 captures->resize(x.capture_count(), 0);
01698 }
01699 }
01700
01701 const Cursor& adaptee() {
01702 if (q != DFA_type::null_state)
01703 c = *dfa->tag(q);
01704 return c;
01705 }
01706
01707 const DFA_type& cache() const {
01708 return *dfa;
01709 }
01710
01711 state_type src() const {
01712 return q;
01713 }
01714
01715 self& operator=(state_type p) {
01716 q = p;
01717 offset = 0;
01718 fill(captures->begin(), captures->end(), 0);
01719 return *this;
01720 }
01721
01722 bool src_final() const {
01723 return dfa->final(q);
01724 }
01725
01726 const tag_type& src_tag() const {
01727 return *captures;
01728 }
01729
01730 bool sink() const {
01731 return q == DFA_type::null_state;
01732 }
01733
01734 state_type sink_state() const {
01735 return DFA_type::null_state;
01736 }
01737
01738
01739
01740
01741
01742
01743
01744
01745
01746 bool forward(const char_type &a) {
01747 state_type tmp = dfa->delta1(q, a);
01748 if (tmp == my_sink) {
01749 q = DFA_type::null_state;
01750 return false;
01751 }
01752 if (tmp == DFA_type::null_state) {
01753 c = *dfa->tag(q);
01754 if (c.forward(a)) {
01755 #ifdef USE_HASH_TABLE
01756 std::pair<mapper::iterator, bool> p =
01757 mapping->insert(std::make_pair(non_deterministic_regexp_capture_cursor::ordered_state_type(c.src().begin(), c.src().end()), DFA_type::null_state));
01758 #else
01759 std::pair<mapper::iterator, bool> p =
01760 mapping->insert(std::make_pair(c.src(), DFA_type::null_state));
01761 #endif
01762 if (p.second == true) {
01763 tmp = dfa->new_state();
01764 dfa->final(tmp) = c.src_final();
01765 dfa->tag(tmp) = &(p.first->first);
01766 p.first->second = tmp;
01767 }
01768 else tmp = p.first->second;
01769 dfa->set_trans(q, a, tmp);
01770 if (!c.src_tag().empty()) {
01771
01772 dfa->set_output(q, a, c.src_tag());
01773 }
01774 }
01775 else {
01776 dfa->set_trans(q, a, my_sink);
01777 q = DFA_type::null_state;
01778 return false;
01779 }
01780 }
01781 non_deterministic_regexp_capture_cursor::configuration *conf = dfa->output(q, a);
01782 if (conf != NULL) {
01783 for(non_deterministic_regexp_capture_cursor::configuration::const_iterator ic = conf->begin(); ic != conf->end(); ++ic) {
01784
01785 (*captures)[ic->first.id()] = offset + ic->second;
01786 }
01787 }
01788 q = tmp;
01789 ++offset;
01790 return true;
01791 }
01792 };
01793
01794
01795
01796 inline
01797 bool match(lazy_cursor<non_deterministic_regexp_capture_cursor>& c, const char *w)
01798 {
01799 if (!c.forward('\0'))
01800 return false;
01801 for(; *w != '\0' && c.forward(*w); ++w)
01802 if (c.src_final()) return true;
01803 return *w == '\0' && c.forward('\0') && c.src_final();
01804 }
01805
01806 template <typename InputI>
01807 inline
01808 bool match(lazy_cursor<non_deterministic_regexp_capture_cursor>& c, InputI first, InputI last)
01809 {
01810 if (!c.forward('\0'))
01811 return false;
01812 for(; first != last && c.forward(*first); ++first)
01813 if (c.src_final()) return true;
01814 return first == last && c.forward('\0') && c.src_final();
01815 }
01816
01817 template <typename ForwardI>
01818 inline
01819 ForwardI first_match(lazy_cursor<non_deterministic_regexp_capture_cursor> &c, ForwardI first, ForwardI last)
01820 {
01821 if (!c.forward('\0')) return first;
01822 const ForwardI start = first;
01823 for(; first != last && c.forward(*first); ++first)
01824 if (c.src_final()) return ++first;
01825 return first == last && c.forward('\0') && c.src_final() ? first : start;
01826 }
01827
01828 inline
01829 const char*
01830 first_match(lazy_cursor<non_deterministic_regexp_capture_cursor> &c, const char *text)
01831 {
01832 if (!c.forward('\0')) return text;
01833 const char *start = text;
01834 for(; *text != '\0' && c.forward(*text); ++text)
01835 if (c.src_final()) return ++text;
01836 return *text == '\0' && c.forward('\0') && c.src_final() ? text : start;
01837 }
01838
01839 template <typename ForwardI>
01840 inline
01841 bool capture(lazy_cursor<non_deterministic_regexp_capture_cursor, DFT_matrix_mini> &c, ForwardI first, ForwardI last, vector<unsigned long> &offsets) {
01842 for(; first != last && c.forward(*first); ++first)
01843 if (first == last && c.forward('\0') && c.src_final()) {
01844 offsets = c.src_tag();
01845 return true;
01846 }
01847 return false;
01848 }
01849
01850
01851
01852
01853
01854
01855
01856
01857
01858
01859
01860
01861
01862 template <typename ForwardI>
01863 inline
01864 ForwardI first_match(non_deterministic_regexp_capture_cursor &c, ForwardI first, ForwardI last)
01865 {
01866
01867 const ForwardI start = first;
01868 for(non_deterministic_regexp_capture_cursor::state_type::const_iterator i = c.src().begin(); i != c.src().end(); ++i) {
01869 cout << "< " << i->first << ':' << *i->first << ", { ";
01870 for(non_deterministic_regexp_capture_cursor::configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
01871 cout << (int) *j << " ";
01872 cout << "} > ";
01873 }
01874 cout << endl;
01875 for(; first != last && c.forward(*first); ++first) {
01876 for(non_deterministic_regexp_capture_cursor::state_type::const_iterator i = c.src().begin(); i != c.src().end(); ++i) {
01877 cout << "< " << i->first << ':' << *i->first << ", { ";
01878 for(non_deterministic_regexp_capture_cursor::configuration::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
01879 cout << (int) *j << " ";
01880 cout << "} > ";
01881 }
01882 cout << endl;
01883 if (c.src_final()) return ++first;
01884 }
01885 return first == last && c.forward('\0') && c.src_final() ? first : start;
01886 }
01887 #endif
01888
01889 }
01890
01891 #endif