Context Navigation

Back to Ticket #6992

Ticket #6992: CalculateMedianMeanOfInputColumn.cc

File CalculateMedianMeanOfInputColumn.cc, 5.6 KB (added by polyactis@…, 10 years ago)
source code

Line
1	/*
2	* 2012.6.14
3	* a c++ version of vervet/src/mapper/CalculateMedianModeFromSAMtoolsDepthOutput.py
4	*
5	* Example:
6	* CalculateMedianMeanOfInputColumn -i /tmp/input.tsv.gz -o /tmp/output -f 0.7
7	*/
8
9	#include "CalculateMedianMeanOfInputColumn.h"
10
11	CalculateMedianMeanOfInputColumn::CalculateMedianMeanOfInputColumn(string _inputFname, string _outputFname, int _alignmentID,
12	float _fractionToSample, int _noOfLinesInHeader, int _whichColumn, int _maxNumberOfSamplings) \
13	:inputFname(_inputFname), outputFname(_outputFname), alignmentID(_alignmentID), fractionToSample(_fractionToSample),\
14	noOfLinesInHeader(_noOfLinesInHeader), whichColumn(_whichColumn), maxNumberOfSamplings(_maxNumberOfSamplings)
15	{
16	int inputFnameLength = inputFname.length();
17	if (inputFname.substr(inputFnameLength-2, 2)=="gz"){
18	inputFile.open(inputFname.c_str(), std::ios::in \| std::ios::binary);
19	input.push(boost::iostreams::gzip_decompressor());
20	}
21	else{
22	inputFile.open(inputFname.c_str(), std::ios::in );
23	}
24	input.push(inputFile);
25
26	outputFile.open(outputFname.c_str(), std::ios::out);
27	outputFile<<"alignmentID\ttotal_base_count\tsampled_base_count\tmeanDepth\tmedianDepth\tmodeDepth"<<endl;
28
29	}
30
31
32
33	CalculateMedianMeanOfInputColumn::~CalculateMedianMeanOfInputColumn(){
34	//inputFile.close();
35	stat2Occurrence.clear();
36	outputFile.flush();
37	outputFile.close();
38	}
39
40
41	void CalculateMedianMeanOfInputColumn::run(){
42
43	std::string line;
44	size_t bytes_read = 0;
45	std::istream incoming(&input);
46	boost::char_separator<char> sep(" \t,"); //blank or '\t' or ',' is the separator
47	typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
48
49	std::getline(incoming, line);
50	//while (incoming.good() && !incoming.eof())
51	statList.set_size(10000, 1);
52	int noOfData = 0;
53	int noOfSampledData = 0;
54	base_generator_type generator(42);
55	//boost::mt19937 generator;
56	boost::uniform_real<> uni_dist(0,1);
57	boost::variate_generator<base_generator_type&, boost::uniform_real<> > uni(generator, uni_dist);
58	//boost::uniform_01<> uni();
59	__gnu_cxx::hash_map<int, int >::iterator stat2OccurrenceIter = stat2Occurrence.begin();
60	__gnu_cxx::hash_map<int, int >::iterator modeStatIter = stat2Occurrence.begin();
61
62	double toss; //random number
63	while (!line.empty()){
64	noOfData++;
65	toss=uni();
66	if (toss<=fractionToSample && noOfSampledData<maxNumberOfSamplings){
67	noOfSampledData ++;
68	tokenizer line_toks(line, sep);
69	tokenizer::iterator tokenizer_iter = line_toks.begin();
70	for (int i=0;i<whichColumn;i++){
71	++tokenizer_iter;
72	}
73	if (noOfSampledData>statList.n_elem){
74	statList.reshape(statList.n_elem+10000,1);
75	}
76	string statInStr = *tokenizer_iter;
77	int stat = atoi(statInStr.c_str());
78	cout<< stat << endl;
79	//statList << atoi(stat.c_str()) << endr; //endr is not recognized
80	statList(noOfSampledData-1,0) = stat;
81	acc(stat);
82	//for the mode statistic
83	stat2OccurrenceIter = stat2Occurrence.find(stat);
84	if (stat2OccurrenceIter==stat2Occurrence.end()){
85	stat2Occurrence[stat] = 0;
86	}
87	stat2Occurrence[stat]++;
88	stat2OccurrenceIter = stat2Occurrence.find(stat);
89	modeStatIter = stat2Occurrence.find(modeStat);
90	if (modeStatIter==stat2Occurrence.end()){ //doesn't exist yet
91	modeStat = stat;
92	}
93	else{
94	if (stat2OccurrenceIter->second > modeStatIter->second){
95	modeStat = stat;
96	}
97	}
98	}
99	//bytes_read += line.size();
100	// progress dlg with bytes_read / uncompressed size
101	std::getline(incoming, line);
102	}
103	statList.reshape(noOfSampledData,1);
104	//cout<< statList << endl;
105	double medianStat;
106	double meanStat;
107	if (noOfSampledData>0){
108	medianStat = arma::median(statList.col(0));
109	meanStat = arma::mean(statList.col(0));
110
111	// Display the results by boost accumulators ...
112	std::cout << "Mean: " << ba::mean(acc) << std::endl;
113	std::cout << "Median: " << ba::median(acc) << std::endl;
114	}
115	outputFile<< alignmentID << "\t" << noOfData << "\t" << noOfSampledData << "\t"<< meanStat << "\t"
116	<< medianStat << "\t" << modeStat <<endl;
117	//cout<< medianStat << endl;
118	//cout<< meanStat << endl;
119	//output it
120	}
121
122	int main(int argc, char* argv[]) {
123	int alignmentID;
124	float fractionToSample;
125	int maxNumberOfSamplings;
126	int whichColumn;
127	int noOfLinesInHeader;
128	string inputFname;
129	string outputFname;
130	po::options_description desc("Allowed options");
131	desc.add_options()("help,h", "produce help message")
132	("alignmentID,a", po::value<int>(&alignmentID)->default_value(0),
133	"ID of this alignment from which all the stats are extracted.")
134	("fractionToSample,f", po::value<float>(&fractionToSample)->default_value(0.001),
135	"fraction of input data to sample for median/mean calculation.")
136	("noOfLinesInHeader,n", po::value<int >(&noOfLinesInHeader)->default_value(1), "how many lines the header contains")
137	("whichColumn,w", po::value<int >(&whichColumn)->default_value(1), "which column of inputFname is the target stat.")
138	("outputFname,o", po::value<string>(&outputFname), "output filename")
139	("inputFname,i", po::value<string>(&inputFname), "input filename, space/tab/coma-delimited, gzipped or not.")
140	("maxNumberOfSamplings,m", po::value<int>(&maxNumberOfSamplings)->default_value(10000000),
141	"max number of samples to take into memory for median/mode/mean calculation to avoid memory blowup.");
142
143	//po::positional_options_description p;
144	//p.add("input-file", -1);
145
146	po::variables_map vm;
147	po::store(po::parse_command_line(argc, argv, desc), vm);
148	po::notify(vm);
149	if (vm.count("help") \|\| inputFname.empty() \|\| outputFname.empty()){
150	cout << desc << endl;
151	return 1;
152	}
153
154
155
156	CalculateMedianMeanOfInputColumn instance(inputFname, outputFname,
157	alignmentID, fractionToSample, noOfLinesInHeader, whichColumn, maxNumberOfSamplings);
158	instance.run();
159	}

Download in other formats:

Original Format