// Boost.Regex partial_match implementation is incorrect
//
// Submitted by Dr. Robert van Engelen, engelen@genivia.com
// Related Ticket: #11776
// Affects: RE/flex project https://sourceforge.net/projects/re-flex/
//
// Problem: Using iterators with partial_match to search incremental input
// (such as interactive input) and searching long patterns in files is
// impossible when greedy repetition patterns are used (*, +, etc).
// Greedy repetitions may behave as lazy repetitions (*?, +?, etc).

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include <boost/regex.hpp>

20 | void search(std::istream& is, const boost::regex& e, unsigned n)
|
---|
21 | {
|
---|
22 | // buffer we'll be searching in:
|
---|
23 | char buf[4096];
|
---|
24 | // current position
|
---|
25 | const char *cur = buf;
|
---|
26 | // end of filled buffer text
|
---|
27 | char *pos = buf;
|
---|
28 | // flag to indicate whether there is more input to come:
|
---|
29 | bool have_more = true;
|
---|
30 | // flags
|
---|
31 | boost::match_flag_type flg = boost::match_default | boost::regex_constants::match_not_null | boost::match_partial;
|
---|
32 | // iterator end
|
---|
33 | boost::cregex_iterator b;
|
---|
34 |
|
---|
35 | while (have_more)
|
---|
36 | {
|
---|
37 | // fill more of the buffer by adding n chars
|
---|
38 | is.read(pos, n);
|
---|
39 | unsigned read = is.gcount();
|
---|
40 | // check to see if we've run out of text:
|
---|
41 | have_more = read == n;
|
---|
42 | // update the end of filled buffer text
|
---|
43 | pos += read;
|
---|
44 | // and then iterate:
|
---|
45 | boost::cregex_iterator a(
|
---|
46 | cur,
|
---|
47 | pos,
|
---|
48 | e,
|
---|
49 | flg);
|
---|
50 | // while more matches
|
---|
51 | while (a != b)
|
---|
52 | {
|
---|
53 | if ((*a)[0].matched == false)
|
---|
54 | {
|
---|
55 | // Partial match
|
---|
56 | break;
|
---|
57 | }
|
---|
58 | else if ((*a)[0].second < pos || !have_more) // as per documentation
|
---|
59 | {
|
---|
60 | // full match:
|
---|
61 | std::string full((*a)[0].first, (*a)[0].second - (*a)[0].first);
|
---|
62 | std::cout << "matched: " << full << std::endl;
|
---|
63 | size_t cap, groups = (*a).size();
|
---|
64 | for (cap = 1; cap < groups && !(*a)[cap].matched; ++cap)
|
---|
65 | continue;
|
---|
66 | if (cap < groups)
|
---|
67 | std::cout << "group = " << cap << std::endl;
|
---|
68 | // save next current position
|
---|
69 | cur = (*a)[0].second;
|
---|
70 | }
|
---|
71 |
|
---|
72 | // move to next match:
|
---|
73 | ++a;
|
---|
74 | }
|
---|
75 | }
|
---|
76 | }
|
---|
77 |
|
---|
78 | int main()
|
---|
79 | {
|
---|
80 | boost::regex e("a.*c");
|
---|
81 | std::stringstream s("abc abc");
|
---|
82 |
|
---|
83 | // read blocks of 100 chars at a time, OK because match fits in 100 bytes
|
---|
84 | std::cout << "\nCorrectly ";
|
---|
85 | search(s, e, 100);
|
---|
86 |
|
---|
87 | // read by one char at a time (interactive), pattern behaves as a lazy .*?
|
---|
88 | std::cout << "\nIncorrectly ";
|
---|
89 | s.clear();
|
---|
90 | s.seekg(0);
|
---|
91 | search(s, e, 1);
|
---|
92 |
|
---|
93 | return 0;
|
---|
94 | }
|
---|