/
ReadDistribution.h
134 lines (119 loc) · 4.42 KB
/
ReadDistribution.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#ifndef READDISTRIBUTION_H
#define READDISTRIBUTION_H
#include<vector>
#include<map>
using namespace std;
#include "TranscriptInfo.h"
#include "TranscriptExpression.h"
#include "TranscriptSequence.h"
#include "samtools/bam.h"
#include "samtools/sam.h"
namespace ns_rD {
// Defaults: {{{
const char LOW_PROB_MISSES = 6;
const char MAX_NODE_PAR = 2;
const long trSizes [] = { 1334,2104,2977,4389};
const char trSizesN = 4;
const char trNumberOfBins = 20;
const char vlmmNodeDependence [] = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0};
const char vlmmNodesN = 21;
const char vlmmStartOffset = 8;
const long pows4 [] = {1,4,16,64,256,1024,4096};
//}}}
struct fragmentT{//{{{
bam1_t *first,*second;
bool paired;
fragmentT(){
first = bam_init1();
second = bam_init1();
paired = true;
}
~fragmentT(){
bam_destroy1(first);
bam_destroy1(second);
}
void copyFragment(const fragmentT *sourceF){
paired = sourceF->paired;
bam_copy1(first, sourceF->first);
bam_copy1(second, sourceF->second);
}
};
typedef fragmentT *fragmentP;
//}}}
class VlmmNode{//{{{
private:
long parentsN;
vector<double> probs;
public:
VlmmNode(){parentsN = 0;}
VlmmNode(long p);
void setParentsN(long p);
void update(double Iexp, char b, char bp, char bpp);
void normalize();
double getP(char b, char bp, char bpp) const;
double getPsum(char b) const;
};//}}}
enum biasT { readM_5, readM_3, uniformM_5, uniformM_3, weight_5, weight_3};
enum readT { mate_5, mate_3, FullPair };
} // namespace ns_rD
class ReadDistribution{
private:
long procN,M,fragSeen,singleReadLength,minFragLen;
double lMu,lSigma,logLengthSum,logLengthSqSum;
long lowProbMismatches;
bool verbose,warnFirst,uniform,unstranded,lengthSet,gotExpression,normalized;
bool validLength;
long warnPos, warnTIDmismatch, warnUnknownTID, noteFirstMateDown;
TranscriptInfo* trInf;
TranscriptSequence* trSeq;
TranscriptExpression* trExp;
// for each transcript, remember seen fragments in map: length->(sum of probs)
vector<map<long,double> > trFragSeen5,trFragSeen3;
// cache for already computed weight norms for:
// (single reads 5',3', Pair) x Transcript x Length
vector<vector<map<long, double> > > weightNorms;
// position probability arrays (RE-FACTOR to array of 4 vectors)
vector<vector<vector<double> > > posProb;
vector<vector<ns_rD::VlmmNode> > seqProb;
// Cache probabilities for Phred score.
vector<double> lProbMis;
vector<double> lProbHit;
// Mismatch likelihods along read.
vector<double> lFreqHit;
vector<double> lFreqMis;
// Cache length probabilities.
vector<double> lLengthP,lLengthNorm;
map<long,long> fragLengths;
double getLengthLP(long len) const;
double computeLengthLP(double len) const;
double getLengthLNorm(long trLen) const;
void computeLengthProb();
void updateMismatchFreq(bam1_t *samA);
void updatePosBias(long pos, ns_rD::biasT bias, long tid, double Iexp);
void updateSeqBias(long pos, ns_rD::biasT bias, long tid, double Iexp);
double getPosBias(long start, long end, ns_rD::readT read,
long trLen) const;
double getSeqBias(long pos, ns_rD::readT read, long tid) const;
inline char getBase(long pos, const string &fSeq) const;
double getSeqBias(long start, long end, ns_rD::readT read,
const string &fSeq) const;
//inline char complementBase(char base) const;
double getWeightNorm(long len, ns_rD::readT read, long tid);
pair<double, double> getSequenceLProb(bam1_t *samA) const;
public:
ReadDistribution();
void setProcN(long procN);
void showFirstWarnings();
void writeWarnings();
bool init(long m, TranscriptInfo* trI, TranscriptSequence* trS, TranscriptExpression* trE, bool unstranded, bool verb = true);
bool initUniform(long m, TranscriptInfo* trI, TranscriptSequence* trS, bool verb = true);
void setLowProbMismatches(long m);
void setLength(double mu, double sigma);
bool observed(ns_rD::fragmentP frag);
void normalize();
void logProfiles(string logFileName = "");
bool getP(ns_rD::fragmentP frag,double &prob,double &probNoise);
long getWeightNormCount() const;
vector<double> getEffectiveLengths();
};
#endif