| 1 |
|
|---|
// Boost.Regex match_partial implementation is incorrect
//
// Submitted by Dr. Robert van Engelen, engelen@genivia.com
// Related Ticket: #11776
// Affects: RE/flex project https://sourceforge.net/projects/re-flex/
//
// Problem: Using iterators with match_partial to search incremental input
// (such as interactive input), or to search for long patterns in files, is
// impossible when greedy repetition patterns are used (*, +, etc.):
// the repetitions may behave as lazy repetitions (*?, +?, etc.).
| 12 |
|
|---|
| 13 | #include <iostream>
|
|---|
| 14 | #include <fstream>
|
|---|
| 15 | #include <sstream>
|
|---|
| 16 | #include <string>
|
|---|
| 17 | #include <cctype>
|
|---|
| 18 | #include <boost/regex.hpp>
|
|---|
| 19 |
|
|---|
| 20 | void search(std::istream& is, const boost::regex& e, unsigned n)
|
|---|
| 21 | {
|
|---|
| 22 | // buffer we'll be searching in:
|
|---|
| 23 | char buf[4096];
|
|---|
| 24 | // current position
|
|---|
| 25 | const char *cur = buf;
|
|---|
| 26 | // end of filled buffer text
|
|---|
| 27 | char *pos = buf;
|
|---|
| 28 | // flag to indicate whether there is more input to come:
|
|---|
| 29 | bool have_more = true;
|
|---|
| 30 | // flags
|
|---|
| 31 | boost::match_flag_type flg = boost::match_default | boost::regex_constants::match_not_null | boost::match_partial;
|
|---|
| 32 | // iterator end
|
|---|
| 33 | boost::cregex_iterator b;
|
|---|
| 34 |
|
|---|
| 35 | while (have_more)
|
|---|
| 36 | {
|
|---|
| 37 | // fill more of the buffer by adding n chars
|
|---|
| 38 | is.read(pos, n);
|
|---|
| 39 | unsigned read = is.gcount();
|
|---|
| 40 | // check to see if we've run out of text:
|
|---|
| 41 | have_more = read == n;
|
|---|
| 42 | // update the end of filled buffer text
|
|---|
| 43 | pos += read;
|
|---|
| 44 | // and then iterate:
|
|---|
| 45 | boost::cregex_iterator a(
|
|---|
| 46 | cur,
|
|---|
| 47 | pos,
|
|---|
| 48 | e,
|
|---|
| 49 | flg);
|
|---|
| 50 | // while more matches
|
|---|
| 51 | while (a != b)
|
|---|
| 52 | {
|
|---|
| 53 | if ((*a)[0].matched == false)
|
|---|
| 54 | {
|
|---|
| 55 | // Partial match
|
|---|
| 56 | break;
|
|---|
| 57 | }
|
|---|
| 58 | else if ((*a)[0].second < pos || !have_more) // as per documentation
|
|---|
| 59 | {
|
|---|
| 60 | // full match:
|
|---|
| 61 | std::string full((*a)[0].first, (*a)[0].second - (*a)[0].first);
|
|---|
| 62 | std::cout << "matched: " << full << std::endl;
|
|---|
| 63 | size_t cap, groups = (*a).size();
|
|---|
| 64 | for (cap = 1; cap < groups && !(*a)[cap].matched; ++cap)
|
|---|
| 65 | continue;
|
|---|
| 66 | if (cap < groups)
|
|---|
| 67 | std::cout << "group = " << cap << std::endl;
|
|---|
| 68 | // save next current position
|
|---|
| 69 | cur = (*a)[0].second;
|
|---|
| 70 | }
|
|---|
| 71 |
|
|---|
| 72 | // move to next match:
|
|---|
| 73 | ++a;
|
|---|
| 74 | }
|
|---|
| 75 | }
|
|---|
| 76 | }
|
|---|
| 77 |
|
|---|
| 78 | int main()
|
|---|
| 79 | {
|
|---|
| 80 | boost::regex e("a.*c");
|
|---|
| 81 | std::stringstream s("abc abc");
|
|---|
| 82 |
|
|---|
| 83 | // read blocks of 100 chars at a time, OK because match fits in 100 bytes
|
|---|
| 84 | std::cout << "\nCorrectly ";
|
|---|
| 85 | search(s, e, 100);
|
|---|
| 86 |
|
|---|
| 87 | // read by one char at a time (interactive), pattern behaves as a lazy .*?
|
|---|
| 88 | std::cout << "\nIncorrectly ";
|
|---|
| 89 | s.clear();
|
|---|
| 90 | s.seekg(0);
|
|---|
| 91 | search(s, e, 1);
|
|---|
| 92 |
|
|---|
| 93 | return 0;
|
|---|
| 94 | }
|
|---|