Opened 6 years ago
Last modified 5 years ago
#12959 new Bugs
Regex class negation
| Reported by: | Owned by: | John Maddock | |
|---|---|---|---|
| Milestone: | To Be Determined | Component: | regex | 
| Version: | Boost 1.61.0 | Severity: | Showstopper | 
| Keywords: | Cc: | 
Description
Pertains to boost::regex. Tested on version 1.61.
Flags: Perl
Target string: abc092efg
Regex: [^\W\D]+
Function: regex_search
Matches:  abc092efg
Should match:  092
Notes
Negative class resolution:  'Not-Not Word' AND 'Not-Not Digit'
The intersection of word AND digits is digits.
Every other regex engine does this correctly.
This includes Perl, PCRE, JS, C++11, Python, etc..
In this engine, [^\W\D] matches what [\w\d] does.
That is, [^\W\D] is not being evaluated as an intersection, even though
the operator between the items of a negative class is AND.
Fwiw - this behavior is seen with all negated shorthand elements of a
negative class, e.g. [^\S\W] matches all whitespace OR all word characters.
Change History (3)
comment:1 by , 6 years ago
comment:2 by , 6 years ago
Sorry, there were some typos in my last comment. The result, as it is actually used now:
<boost\regex\v4\basic_regex_creator.hpp>
  // Record the negated class mask `m`. When m_negate is set (presumably: the
  // enclosing set is itself negated, i.e. [^...] -- confirm against the full
  // class), the mask is kept as its own entry in m_NegNeg_Class; otherwise it
  // is OR-ed into the combined m_negated_classes mask.
  if ( m_negate )
   {
      // if it's not already there, add it ..
      // (std::find returns end() when no stored entry equals m)
      if ( false == (std::find(m_NegNeg_Class.begin(), m_NegNeg_Class.end(), m) != m_NegNeg_Class.end()) )
         m_NegNeg_Class.insert( m_NegNeg_Class.end(), (unsigned __int64)m );
   }
   else
      m_negated_classes |= m;
   // Either way the set is now flagged as non-empty.
   m_empty = false;
<boost\regex\v4\perl_matcher.hpp>
  // try and match a single character from the neg-neg classes
  // NOTE(review): i, p, col and traits_inst are declared outside this excerpt;
  // p is read below as pointing at cNegNegClasses consecutive uint64_t masks.
   if ( set_->cNegNegClasses ) //&& set_->isnot )
   {
       for(i = 0; i < set_->cNegNegClasses; i++)
       {
          // Read the next stored class mask from the data at p.
          uint64_t mask = *((uint64_t*)p);
          // As soon as col is NOT a member of one stored class, return:
          // `next` unchanged when the set is negated, advanced otherwise.
          if(traits_inst.isctype(col, (mask_type)mask) == false)
             return set_->isnot ? next : ++next;
          // Step p past one uint64_t, expressed in units of charT.
          p += (sizeof(uint64_t) / sizeof(charT));
       }
   }
 


A solution could be to keep a vector of individual classes
instead of a single mask for all classes (cnclasses).
This is needed only in a negated class, and only negative classes need to be tracked.
The rest remains unchanged.
Something like this works (tested):
<boost\regex\v4\basic_regex_creator.hpp>
template <class charT, class traits> class basic_char_set { private: std::vector<unsigned __int64> m_NegNeg_Class; public: typedef typename std::vector<uint64_t>::const_iterator cNNclass_list_iterator; bool has_NegNegClasses()const { return m_NegNeg_Class.size() > 0 ? true : false; } cNNclass_list_iterator cn_begin()const { return m_NegNeg_Class.begin(); } cNNclass_list_iterator cn_end()const { return m_NegNeg_Class.end(); } void add_negated_class(m_type m) { if ( m_negate ) { // if it's not already there, add it .. bool bDoAdd = true; for ( int i = 0; i < m_NegNeg_Class.size(); i++ ) { if ( m_NegNeg_Class[i] == (unsigned __int64)m ) { bDoAdd = false; break; } } if ( bDoAdd ) m_NegNeg_Class.insert( m_NegNeg_Class.end(), (unsigned __int64)m ); } else m_negated_classes |= m; m_empty = false; } } template <class charT, class traits> re_syntax_base* basic_regex_creator<charT, traits>::append_set( const basic_char_set<charT, traits>& char_set, mpl::false_*) { typedef typename basic_char_set<uint64_t, traits>::prop_list_iterator cNegNegClas_item_iterator; result->cNegNegClasses = 0; if ( char_set.is_negated() ) { result->cNegNegClasses = static_cast<uint32_t>(::boost::BOOST_REGEX_DETAIL_NS::distance(char_set.cn_begin(), char_set.cn_end())); } // // now extend with all the negated negative character classes: // if ( result->isnot == true ) { cNegNegClas_item_iterator cnfirst, cnlast; cnfirst = char_set.cn_begin(); cnlast = char_set.cn_end(); while(cnfirst != cnlast) { uint64_t* p = static_cast<uint64_t*>(this->m_pdata->m_data.extend(sizeof(uint64_t) * 1)); p[0] = *cnfirst; if(flags() & regbase::icase) { // adjust class as needed: if(((p[0] & m_lower_mask) == m_lower_mask) || ((p[0] & m_upper_mask) == m_upper_mask)) p[0] |= m_alpha_mask; } ++cnfirst; } } }<boost\regex\v4\perl_matcher.hpp>
// Sketch of the addition to re_is_set_member. Only the new neg-neg test and
// the class tests that follow it are shown; the declarations of i, p, col,
// traits_inst and mask_type, and the earlier matching steps, are not part of
// this excerpt.
template <class iterator, class charT, class traits_type, class char_classT>
iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
                                           iterator last,
                                           const re_set_long<char_classT>* set_,
                                           const regex_data<charT, traits_type>& e,
                                           bool icase)
{
   // try and match a single character from the neg-neg classes
   if ( set_->cNegNegClasses && set_->isnot )
   {
      for(i = 0; i < set_->cNegNegClasses; i++)
      {
         // Each stored entry is one uint64_t class mask.
         uint64_t mask = *((uint64_t*)p);
         // col falls outside one of the negated classes: return right away.
         if(traits_inst.isctype(col, (mask_type)mask) == false)
            return set_->isnot ? next : ++next;
         // Step over one stored mask, measured in charT units. This must use
         // the size of the mask itself (uint64_t), not of a pointer to it:
         // the two differ wherever pointers are not 8 bytes wide.
         p += (sizeof(uint64_t) / sizeof(charT));
      }
   }
   // the rest unchanged
   if( set_->cclasses != 0 )
   {
      if(traits_inst.isctype(col, set_->cclasses) == true)
         return set_->isnot ? next : ++next;
   }
   if( set_->cnclasses != 0 )
   {
      if(traits_inst.isctype(col, set_->cnclasses) == false)
         return set_->isnot ? next : ++next;
   }
   return set_->isnot ? ++next : next;
}