id,summary,reporter,owner,description,type,status,milestone,component,version,severity,resolution,keywords,cc 6057,Lazy variance in accumulator gives wrong result when too high statistics,Johan ,Matthias Troyer,"Hello, I was trying to compare the performance of various ways to compute the variance of a distribution of integer values, because I need a very fast one. I tried to use lazy_variance and variance, a custom algorithm (and also the TProfile class from ROOT). What I saw is that using lazy_variance gives the correct mean value but a weird variance value when the number of events becomes large (n > 0x3FFFF). The bug was first seen with boost 1.44.0 on a 32-bit Fedora 14 machine (using gcc 4.5.1), and also with boost 1.46.1 on a 64-bit Mandriva 2011 (using gcc 4.6.1). Below is the source code to reproduce the bug (ROOT-related code commented out): {{{ #include #include #include // accumulators #include #include #include #include // root //#include // timing info #include // Nbr aléatoires #include #include #include using namespace std; using namespace boost::accumulators; using namespace boost; class MyStat { public: MyStat() : NbrEvt( 0u ), Mean( 0. ), MeanSq( 0. ) {}; ~MyStat() {}; void fill( const int& ev ) { Mean = ( Mean * NbrEvt + ev ) / double( NbrEvt + 1 ); MeanSq = ( MeanSq * NbrEvt + ev * ev ) / double( NbrEvt + 1 ); NbrEvt++; }; double mean() const { return Mean; }; double variance() const { return MeanSq - Mean * Mean; }; protected: long unsigned NbrEvt; double Mean; double MeanSq; }; int main( int argc, char* argv[] ) { lagged_fibonacci44497 GenerateurBase( 12345 ); uniform_int< int > Distribution( -128, 127 ); variate_generator< lagged_fibonacci44497&, uniform_int< int > > Generateur( GenerateurBase, Distribution ); vector< int > Data; long unsigned i, total( 0xFFFF ); for ( i = 0; i < total; i++ ) Data . 
push_back( Generateur() ); cout << ""Test using "" << total << "" data points."" << endl << endl; accumulator_set< signed, stats< tag::mean, tag::lazy_variance > > lazyVar; accumulator_set< signed, stats< tag::mean, tag::variance > > immedVar; //TProfile * profile = new TProfile( ""profile"", ""My profile"", 1, -.5, .5, ""S"" ); MyStat simpleStat; progress_timer * chrono = 0; cout << ""Lazy variance accumulator : ""; chrono = new progress_timer(); for ( i = 0; i < total; i++ ) lazyVar( Data[ i ] ); delete chrono; cout << ""Mean : "" << extract_result< tag::mean >( lazyVar ) << endl << ""Variance : "" << extract_result< tag::lazy_variance >( lazyVar ) << endl << endl; cout << ""Immediate variance accumulator : ""; chrono = new progress_timer(); for ( i = 0; i < total; i++ ) immedVar( Data[ i ] ); delete chrono; cout << ""Mean : "" << extract_result< tag::mean >( immedVar ) << endl << ""Variance : "" << extract_result< tag::variance >( immedVar ) << endl << endl; /* cout << ""TProfile : ""; chrono = new progress_timer(); for ( i = 0; i < total; i++ ) profile -> Fill( 0., double( Data[ i ] ) ); delete chrono; cout << ""Mean : "" << profile -> GetBinContent( 1 ) << endl << ""Variance : "" << profile -> GetBinError( 1 ) * profile -> GetBinError( 1 ) << endl << endl; */ cout << ""Simple classe : ""; chrono = new progress_timer(); for ( i = 0; i < total; i++ ) simpleStat . fill( Data[ i ] ); delete chrono; cout << ""Mean : "" << simpleStat . mean() << endl << ""Variance : "" << simpleStat . variance() << endl << endl << flush; delete profile; return 0; } }}} And here is the output : {{{ Test using 268435455 data points. Lazy variance accumulator : 0.27 s Mean : -0.497278 Variance : 5.30971 Immediate variance accumulator : 9.07 s Mean : -0.497278 Variance : 5461.31 TProfile : 15.56 s Mean : -0.497278 Variance : 5461.31 Simple classe : 7.28 s Mean : -0.497278 Variance : 5461.31 }}} Note that all mean values agree, and all variance values but the lazy one do too. 
Cheers, johan",Bugs,closed,To Be Determined,accumulator,Boost 1.44.0,Problem,wontfix,accumulator lazy_variance,