Ticket #6992: CalculateMedianMeanOfInputColumn.cc

File CalculateMedianMeanOfInputColumn.cc, 5.6 KB (added by polyactis@…, 10 years ago)

source code

Line 
1/*
2 * 2012.6.14
3 * a c++ version of vervet/src/mapper/CalculateMedianModeFromSAMtoolsDepthOutput.py
4 *
5 * Example:
6 * CalculateMedianMeanOfInputColumn -i /tmp/input.tsv.gz -o /tmp/output -f 0.7
7 */
8
9#include "CalculateMedianMeanOfInputColumn.h"
10
11CalculateMedianMeanOfInputColumn::CalculateMedianMeanOfInputColumn(string _inputFname, string _outputFname, int _alignmentID,
12 float _fractionToSample, int _noOfLinesInHeader, int _whichColumn, int _maxNumberOfSamplings) \
13 :inputFname(_inputFname), outputFname(_outputFname), alignmentID(_alignmentID), fractionToSample(_fractionToSample),\
14 noOfLinesInHeader(_noOfLinesInHeader), whichColumn(_whichColumn), maxNumberOfSamplings(_maxNumberOfSamplings)
15{
16 int inputFnameLength = inputFname.length();
17 if (inputFname.substr(inputFnameLength-2, 2)=="gz"){
18 inputFile.open(inputFname.c_str(), std::ios::in | std::ios::binary);
19 input.push(boost::iostreams::gzip_decompressor());
20 }
21 else{
22 inputFile.open(inputFname.c_str(), std::ios::in );
23 }
24 input.push(inputFile);
25
26 outputFile.open(outputFname.c_str(), std::ios::out);
27 outputFile<<"alignmentID\ttotal_base_count\tsampled_base_count\tmeanDepth\tmedianDepth\tmodeDepth"<<endl;
28
29}
30
31
32
33CalculateMedianMeanOfInputColumn::~CalculateMedianMeanOfInputColumn(){
34 //inputFile.close();
35 stat2Occurrence.clear();
36 outputFile.flush();
37 outputFile.close();
38}
39
40
41void CalculateMedianMeanOfInputColumn::run(){
42
43 std::string line;
44 size_t bytes_read = 0;
45 std::istream incoming(&input);
46 boost::char_separator<char> sep(" \t,"); //blank or '\t' or ',' is the separator
47 typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
48
49 std::getline(incoming, line);
50 //while (incoming.good() && !incoming.eof())
51 statList.set_size(10000, 1);
52 int noOfData = 0;
53 int noOfSampledData = 0;
54 base_generator_type generator(42);
55 //boost::mt19937 generator;
56 boost::uniform_real<> uni_dist(0,1);
57 boost::variate_generator<base_generator_type&, boost::uniform_real<> > uni(generator, uni_dist);
58 //boost::uniform_01<> uni();
59 __gnu_cxx::hash_map<int, int >::iterator stat2OccurrenceIter = stat2Occurrence.begin();
60 __gnu_cxx::hash_map<int, int >::iterator modeStatIter = stat2Occurrence.begin();
61
62 double toss; //random number
63 while (!line.empty()){
64 noOfData++;
65 toss=uni();
66 if (toss<=fractionToSample && noOfSampledData<maxNumberOfSamplings){
67 noOfSampledData ++;
68 tokenizer line_toks(line, sep);
69 tokenizer::iterator tokenizer_iter = line_toks.begin();
70 for (int i=0;i<whichColumn;i++){
71 ++tokenizer_iter;
72 }
73 if (noOfSampledData>statList.n_elem){
74 statList.reshape(statList.n_elem+10000,1);
75 }
76 string statInStr = *tokenizer_iter;
77 int stat = atoi(statInStr.c_str());
78 cout<< stat << endl;
79 //statList << atoi(stat.c_str()) << endr; //endr is not recognized
80 statList(noOfSampledData-1,0) = stat;
81 acc(stat);
82 //for the mode statistic
83 stat2OccurrenceIter = stat2Occurrence.find(stat);
84 if (stat2OccurrenceIter==stat2Occurrence.end()){
85 stat2Occurrence[stat] = 0;
86 }
87 stat2Occurrence[stat]++;
88 stat2OccurrenceIter = stat2Occurrence.find(stat);
89 modeStatIter = stat2Occurrence.find(modeStat);
90 if (modeStatIter==stat2Occurrence.end()){ //doesn't exist yet
91 modeStat = stat;
92 }
93 else{
94 if (stat2OccurrenceIter->second > modeStatIter->second){
95 modeStat = stat;
96 }
97 }
98 }
99 //bytes_read += line.size();
100 // progress dlg with bytes_read / uncompressed size
101 std::getline(incoming, line);
102 }
103 statList.reshape(noOfSampledData,1);
104 //cout<< statList << endl;
105 double medianStat;
106 double meanStat;
107 if (noOfSampledData>0){
108 medianStat = arma::median(statList.col(0));
109 meanStat = arma::mean(statList.col(0));
110
111 // Display the results by boost accumulators ...
112 std::cout << "Mean: " << ba::mean(acc) << std::endl;
113 std::cout << "Median: " << ba::median(acc) << std::endl;
114 }
115 outputFile<< alignmentID << "\t" << noOfData << "\t" << noOfSampledData << "\t"<< meanStat << "\t"
116 << medianStat << "\t" << modeStat <<endl;
117 //cout<< medianStat << endl;
118 //cout<< meanStat << endl;
119 //output it
120}
121
122int main(int argc, char* argv[]) {
123 int alignmentID;
124 float fractionToSample;
125 int maxNumberOfSamplings;
126 int whichColumn;
127 int noOfLinesInHeader;
128 string inputFname;
129 string outputFname;
130 po::options_description desc("Allowed options");
131 desc.add_options()("help,h", "produce help message")
132 ("alignmentID,a", po::value<int>(&alignmentID)->default_value(0),
133 "ID of this alignment from which all the stats are extracted.")
134 ("fractionToSample,f", po::value<float>(&fractionToSample)->default_value(0.001),
135 "fraction of input data to sample for median/mean calculation.")
136 ("noOfLinesInHeader,n", po::value<int >(&noOfLinesInHeader)->default_value(1), "how many lines the header contains")
137 ("whichColumn,w", po::value<int >(&whichColumn)->default_value(1), "which column of inputFname is the target stat.")
138 ("outputFname,o", po::value<string>(&outputFname), "output filename")
139 ("inputFname,i", po::value<string>(&inputFname), "input filename, space/tab/coma-delimited, gzipped or not.")
140 ("maxNumberOfSamplings,m", po::value<int>(&maxNumberOfSamplings)->default_value(10000000),
141 "max number of samples to take into memory for median/mode/mean calculation to avoid memory blowup.");
142
143 //po::positional_options_description p;
144 //p.add("input-file", -1);
145
146 po::variables_map vm;
147 po::store(po::parse_command_line(argc, argv, desc), vm);
148 po::notify(vm);
149 if (vm.count("help") || inputFname.empty() || outputFname.empty()){
150 cout << desc << endl;
151 return 1;
152 }
153
154
155
156 CalculateMedianMeanOfInputColumn instance(inputFname, outputFname,
157 alignmentID, fractionToSample, noOfLinesInHeader, whichColumn, maxNumberOfSamplings);
158 instance.run();
159}