Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added mutual information via whichFst #208

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 17 additions & 1 deletion misc/realSFS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,23 @@ int fst_index(int argc,char **argv){

int fst(int argc,char**argv){
if(argc==0){
fprintf(stderr,"\t-> Possible options: index print\n");
fprintf(stderr,"realSFS fst [index|print|stats|stats2]\n");
fprintf(stderr,"--------------------------------------\n");
fprintf(stderr,"\tindex [<pop1>.saf.idx <pop2>.saf.idx ...] -sfs [<pop1-pop2>.sfs] -fstout [<prefix>] -whichFst 0\n");
fprintf(stderr,"\t\t-sfs \tFlattened pairwise 2d site frequency spectra (if more than two populations, use multiple times)\n");
fprintf(stderr,"\t\t-fstout \tOutput prefix\n");
fprintf(stderr,"\t\t-whichFst\t0: default, Fst estimator from Reynolds et al (1983) Genetics 3: 767-779\n");
fprintf(stderr,"\t\t \t1: Fst estimator from Bhatia et al (2013) Genome Res 23: 1514-1521\n");
fprintf(stderr,"\t\t \t2: mutual information/Shannon differentiation (not an Fst estimator)\n");
fprintf(stderr,"\tprint [<fstout>.fst.idx]\n");
fprintf(stderr,"\tstats [<fstout>.fst.idx]\n");
fprintf(stderr,"\tstats2 [<fstout>.fst.idx]\n");
fprintf(stderr,"\t\t-win \tWindow size\n");
fprintf(stderr,"\t\t-step \tStep size, e.g. shift in bp from one window to next\n");
fprintf(stderr,"\t\t-type \t0: default, split genome into blocks and use the first window that has data for the entire window\n");
fprintf(stderr,"\t\t \t1: use first position with data as leftmost position of first window\n");
fprintf(stderr,"\t\t \t2: use first coordinate as leftmost position of first window, regardless of whether there is data\n");
fprintf(stderr,"Examples and output formats at www.popgen.dk/angsd/index.php/Fst\n");
return 0;
}
if(!strcasecmp(*argv,"index"))
Expand Down
43 changes: 42 additions & 1 deletion misc/safstat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,52 @@ void bhatiaFst(int sfs1,int sfs2,double **aMat,double **bMat){
at++;
}
}
//nspope; mutual information across 2d sfs bins
//the mutual information is saved in aMat
//the maximum possible mutual information is saved in bMat
//thus fst_stat calculates the normalized mutual information (shannon differentiation) which is bounded [0,1]
//note that mutual information isn't fst, although the "labels" from fst_stat will say fst
void mutualInf(int sfs1,int sfs2,double **aMat,double **bMat){
fprintf(stderr,"\t-> [%s] sfs1:%d sfs2:%d dimspace:%d \n",__FUNCTION__,sfs1,sfs2,(sfs1+1)*(sfs2+1));
fprintf(stderr,"\t-> WARNING! calculating mutual information instead of FST, pbs statistics will be meaningless\n");
*aMat = new double[(sfs1+1)*(sfs2+1)];
*bMat = new double[(sfs1+1)*(sfs2+1)];
double rat = static_cast<double>(sfs1)/static_cast<double>(sfs1+sfs2);
double K = -rat * log(rat) - (1.-rat) * log(1.-rat);
int at=0;
for(int a1=0;a1<=sfs1;a1++)
for(int a2=0;a2<=sfs2;a2++){
double p1 = static_cast<double>(a1);
double p2 = static_cast<double>(a2);
double q1 = static_cast<double>(sfs1-a1);
double q2 = static_cast<double>(sfs2-a2);
double z1 = static_cast<double>(a1+a2);
double z2 = static_cast<double>(sfs1+sfs2)-z1;
double f11 = z1*rat;
double f12 = z2*rat;
double f21 = z1*(1.-rat);
double f22 = z2*(1.-rat);
double lrt = 0.;
if (p1*f11 > 0.0) lrt += p1*log(p1/f11);
if (p2*f21 > 0.0) lrt += p2*log(p2/f21);
if (q1*f12 > 0.0) lrt += q1*log(q1/f12);
if (q2*f22 > 0.0) lrt += q2*log(q2/f22);
lrt /= (z2 + z1);

(*aMat)[at] = lrt;
(*bMat)[at] = K;

//fprintf(stderr,"(%d,%d) al:%f bal:%f\n",a1,a2,(*aMat)[at],(*bMat)[at]);
at++;
}
}
void calcCoef(int sfs1,int sfs2,double **aMat,double **bMat,int whichFst){
if(whichFst==0)
reynoldFst(sfs1,sfs2,aMat,bMat);
else if(whichFst==1)
bhatiaFst(sfs1,sfs2,aMat,bMat);
else if(whichFst==2)//nspope
mutualInf(sfs1,sfs2,aMat,bMat);
else{
fprintf(stderr,"\t-> Fst option is not implemented\n");
exit(0);
Expand Down Expand Up @@ -171,7 +212,7 @@ int fst_print(int argc,char **argv){
for(size_t s=first;s<last;s++){
fprintf(stdout,"%s\t%d",it->first,ppos[s]+1);
for(int i=0;i<choose((int)pf->names.size(),2);i++)
fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]);
fprintf(stdout,"\t%.7g\t%.7g",ares[i][s],bres[i][s]);
fprintf(stdout,"\n");
}
for(int i=0;i<choose((int)pf->names.size(),2);i++){
Expand Down