Opened 6 years ago

Last modified 5 years ago

#12959 new Bugs

Regex class negation

Reported by: robic@… Owned by: John Maddock
Milestone: To Be Determined Component: regex
Version: Boost 1.61.0 Severity: Showstopper
Keywords: Cc:

Description

Pertains to boost::regex. Tested on version 1.61.

Flags: Perl
Target string: abc092efg
Regex: [^\W\D]+

Function: regex_search

Matches: abc092efg

Should match: 092

Notes

Negative class resolution: 'Not-Not Word' AND 'Not-Not Digit'
The intersection of word AND digits is digits.

Every other regex engine does this correctly.
This includes Perl, PCRE, JS, C++11, Python, etc..

In this engine, [^\W\D] matches what [\w\d] does.
That is, [^\W\D] is not being treated as an intersection, even though
the operator between the items of a negated class is AND.

FWIW, this behavior is seen with all negated shorthand elements of a
negated class, e.g. [^\S\W] matches all whitespace OR all word characters.

Change History (3)

comment:1 by robic@…, 6 years ago

A solution could be to keep a vector of individual class masks
instead of a single combined mask (cnclasses) for all classes.

Only in a negated class, and only negative classes need to be tracked.
The rest remains unchanged.

Something like this works (tested):

<boost\regex\v4\basic_regex_creator.hpp>

//
// Excerpt of basic_char_set showing only the members added for this fix.
// m_type, m_negate, m_negated_classes and m_empty are existing members of
// the class and are not repeated here.
//
template <class charT, class traits>
class basic_char_set
{
private:
   // Masks of the negated classes added while the set itself is negated,
   // kept one per entry instead of being OR-ed into a single mask.
   std::vector<uint64_t> m_NegNeg_Class;

public:

   typedef typename std::vector<uint64_t>::const_iterator  cNNclass_list_iterator;

   // True when at least one negated class has been recorded individually.
   bool has_NegNegClasses()const
   {
      return !m_NegNeg_Class.empty();
   }
   // Start of the individually recorded negated-class masks.
   cNNclass_list_iterator cn_begin()const
   {
      return m_NegNeg_Class.begin();
   }
   // One past the last individually recorded negated-class mask.
   cNNclass_list_iterator cn_end()const
   {
      return m_NegNeg_Class.end();
   }

   //
   // Record the negated class mask m.
   // In a negated set each distinct mask is stored separately; otherwise
   // the mask is merged into m_negated_classes as before.
   //
   void add_negated_class(m_type m)
   {
      if ( m_negate )
      {
         // if it's not already there, add it ..
         const uint64_t mask = static_cast<uint64_t>(m);
         bool bDoAdd = true;

         for ( cNNclass_list_iterator it = m_NegNeg_Class.begin(); it != m_NegNeg_Class.end(); ++it )
         {
            if ( *it == mask )
            {
               bDoAdd = false;
               break;
            }
         }
         if ( bDoAdd )
            m_NegNeg_Class.push_back( mask );
      }
      else
         m_negated_classes |= m;
      m_empty = false;
   }
};

//
// Excerpt of append_set showing only the statements added for this fix.
// `result` is declared in the part of the function that is not shown.
//
// The added code stores the number of individually tracked negated classes
// in result->cNegNegClasses and appends one uint64_t mask per class to the
// state machine's raw data.
//
template <class charT, class traits>
re_syntax_base* basic_regex_creator<charT, traits>::append_set(
   const basic_char_set<charT, traits>& char_set, mpl::false_*)
{
   // Must be the iterator type that cn_begin()/cn_end() return for this set.
   typedef typename basic_char_set<charT, traits>::cNNclass_list_iterator cNegNegClas_item_iterator;

   result->cNegNegClasses = 0;
   if ( char_set.is_negated() )
   {
      result->cNegNegClasses = static_cast<uint32_t>(::boost::BOOST_REGEX_DETAIL_NS::distance(char_set.cn_begin(), char_set.cn_end()));
   }

   //
   // now extend with all the negated negative character classes:
   //
   if ( result->isnot == true )
   {
      cNegNegClas_item_iterator cnfirst, cnlast;
   
      cnfirst = char_set.cn_begin();
      cnlast  = char_set.cn_end();
      
      while(cnfirst != cnlast)
      {
         // reserve room for exactly one mask and copy it in
         uint64_t* p = static_cast<uint64_t*>(this->m_pdata->m_data.extend(sizeof(uint64_t)));
         p[0] = *cnfirst;

         if(flags() & regbase::icase)
         {
            // adjust class as needed:
            // a mask that covers all of lower or all of upper gains alpha too
            if(((p[0] & m_lower_mask) == m_lower_mask) || ((p[0] & m_upper_mask) == m_upper_mask))
               p[0] |= m_alpha_mask;
         }
         ++cnfirst;
      }
   }
}

<boost\regex\v4\perl_matcher.hpp>

//
// Excerpt of re_is_set_member showing the added neg-neg class test.
// `i`, `p`, `col` and `traits_inst` are declared in the part of the
// function that is not shown.
//
// For a negated set, every individually stored negated-class mask must
// accept the character; the first mask that rejects it returns early.
//
template <class iterator, class charT, class traits_type, class char_classT>
iterator BOOST_REGEX_CALL re_is_set_member(iterator next, 
                          iterator last, 
                          const re_set_long<char_classT>* set_, 
                          const regex_data<charT, traits_type>& e, bool icase)
{   

   // try and match a single character from the neg-neg classes
   if ( set_->cNegNegClasses && set_->isnot )
   {
       for(i = 0; i < set_->cNegNegClasses; i++)
       {
          // NOTE(review): reads a uint64_t through p; confirm the stored
          // masks are suitably aligned for this access on all targets.
          uint64_t mask = *((uint64_t*)p);
          if(traits_inst.isctype(col, (mask_type)mask) == false)
             return set_->isnot ? next : ++next;
          // step over one stored mask: the stride is the size of the mask
          // itself, not the size of a pointer to it
          p += (sizeof(uint64_t) / sizeof(charT));
       }
   }

   // the rest unchanged
   if( set_->cclasses != 0 )
   {
      if(traits_inst.isctype(col, set_->cclasses) == true)
         return set_->isnot ? next : ++next;
   }

   if( set_->cnclasses != 0 )
   {
       if(traits_inst.isctype(col, set_->cnclasses) == false)
         return set_->isnot ? next : ++next;
   }
   return set_->isnot ? ++next : next;
}

comment:2 by robic@…, 6 years ago

Sorry, there were some typos in my last comment. The result, as actually used now:

<boost\regex\v4\basic_regex_creator.hpp>

  // Body of add_negated_class(m): in a negated set each distinct mask is
  // stored separately, otherwise it is merged into m_negated_classes.
  if ( m_negate )
   {
      // if it's not already there, add it ..
      if ( std::find(m_NegNeg_Class.begin(), m_NegNeg_Class.end(), m) == m_NegNeg_Class.end() )
         m_NegNeg_Class.push_back( static_cast<uint64_t>(m) );
   }
   else
      m_negated_classes |= m;
   m_empty = false;


<boost\regex\v4\perl_matcher.hpp>

  // try and match a single character from the neg-neg classes
  // (i, p, col and traits_inst belong to the enclosing function, not shown)
  // The count alone guards the loop; the isnot test is left commented out.
   if ( set_->cNegNegClasses ) //&& set_->isnot )
   {
       for(i = 0; i < set_->cNegNegClasses; i++)
       {
          // read the next stored mask from the data p points at
          uint64_t mask = *((uint64_t*)p);
          // the first mask that rejects the character ends the test early
          if(traits_inst.isctype(col, (mask_type)mask) == false)
             return set_->isnot ? next : ++next;
          // advance p by one uint64_t, measured in charT units
          p += (sizeof(uint64_t) / sizeof(charT));
       }
   }
 

comment:3 by John Maddock, 5 years ago

Thanks, I fear you are correct :(

Note: See TracTickets for help on using tickets.