Copying latest version from Google Code

mahmoudibrahim · Aug 4, 2015 · 4fa4548 · 4fa4548
1 parent 508dca1
commit 4fa4548
Show file tree

Hide file tree

Showing 12 changed files with 3,167 additions and 0 deletions.
diff --git a/JAMM.sh b/JAMM.sh
diff --git a/SignalGenerator.sh b/SignalGenerator.sh
@@ -0,0 +1,333 @@
+########################################################################
+# JAMMv1.0.7rev2 is a peak finder for joint analysis of NGS replicates.
+# Copyright (C) 2014-2015  Mahmoud Ibrahim
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: mahmoud.ibrahim@mdc-berlin.de
+########################################################################
+
+##Finding out the path
+sPath="`dirname \"$0\"`"
+sPath="`( cd \"$sPath\" && pwd )`"
+
+
+
+usage()
+{
+cat << EOF
+Welcome to JAMM v1.0.7rev2 Signal Generator Script (GNU GPLv3). Copyright (C) 2014-2015  Mahmoud Ibrahim.
+
+This program comes with ABSOLUTELY NO WARRANTY; for details visit http://www.gnu.org/licenses/gpl.html. This is free software, and you are welcome to redistribute it under certain conditions; visit http://www.gnu.org/licenses/gpl.html for details.
+
+OPTIONS:
+   -s      Directory containing sample files (required)
+   -g      Genome size file (required)
+   -o      Output Directory (required)
+   -c      directory containing input or Control files
+   -r 	   file with Regions to get signal for (required)
+   -b      Bin size for signal generation (default: 10)
+   -f      Fragment lengths (required if -t is "single")
+   -p	   Number of processors used by R scripts (default: 1)
+   -t	   Alignment type, paired or single (default: single)
+   -n	   Normalization method, chromAverage or depth (default: chromAverage)
+EOF
+}
+
+
+# ========================= 
+# Process Input parameters
+# =========================
+
+
+sdir=""
+gsize=""
+out=""
+signal="10"
+regs=""
+fraglen=""
+wdir=$(mktemp -d)
+ran=$RANDOM
+cores="1"
+type="single"
+normal="chromAverage"
+
+while getopts "s:o:c:r:b:f:g:p:t:n:" OPTION
+do
+	case $OPTION in
+	s) sdir=$OPTARG
+	;;
+	g) gsize=$OPTARG
+	;;
+	o) out=$OPTARG
+	;;	
+	c) bdir=$OPTARG
+	;;
+	r) regs=$OPTARG
+	;;
+	b) signal=$OPTARG
+	;;
+	f) fraglen=$OPTARG
+	;;
+	p) cores=$OPTARG
+	;;
+	t) type=$OPTARG
+	;;
+	n) normal=$OPTARG
+	;;
+	?)
+	usage
+	exit
+	;;
+	esac
+done
+
+
+if [[ -z $sdir ]] || [[ -z $regs ]] || [[ -z $out ]]
+then
+     usage
+     exit 1
+fi
+
+if [[ -z $fraglen ]]
+then
+	if [ $type == "single" ]
+	then
+     usage
+     exit 1
+	fi
+fi
+if [[ -d "$out/signal" ]]; then
+	printf "\n\nOutput directory $out/signal already exists. I can't override existing results!\n\n"
+	exit 0
+fi
+#=======================> DONE!
+
+
+
+# ============================= 
+# Step One: Initial Processing
+# =============================
+printf "\n\n==========================================================\nStarted JAMM1.0.7rev2 Signal Generator Pipeline...Hang on!\n==========================================================\n\n"
+
+if [ ! -d "$wdir" ]; then
+	mkdir $wdir #make working directory
+fi
+if [ ! -d "$out" ]; then
+	mkdir $out #make working directory
+fi
+mkdir $wdir/bkgd.$ran/ #directory to store background files
+mkdir $wdir/sizes.$ran/ #chromosomes and sizes
+mkdir $wdir/samples.$ran/ #store sample files
+
+dupnum=$(ls -1 $sdir | wc -l) #count how many sample files
+
+
+#separate chromosome sizes
+printf "Loading genome size file..."
+ext="$wdir/sizes.$ran/"
+awk -v ext="$ext" '{ print >> ext"/size." $1 ".bed" }' $gsize
+printf "Done!\n"
+
+
+printf "Processing sample files..."
+#load each chromosome from each sample file
+for i in $sdir/*.bed; do
+samplefile=$(basename $i)	
+	for f in $wdir/sizes.$ran/*; do
+		sizefile=$(basename $f)
+		chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}');
+		awk -v chr="$chr" -v ext="$wdir/samples.$ran/" -v samplefile="$samplefile" -F"\t" '$1 == chr { print $2"\t"$6 >> ext"sample."chr"."samplefile }' "$i" 
+	done
+done
+printf "Done!\n"
+
+
+if [ ! -z $bdir ]; then
+#concatenate all background files into one file
+printf "Processing control files..."
+cat $bdir/*.bed > $wdir/bkgd.$ran/ctrl.bed
+
+for f in $wdir/sizes.$ran/*; do
+	sizefile=$(basename $f)
+	chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}');
+	awk -v chr="$chr" -v ext="$wdir/bkgd.$ran/" -F"\t" '$1 == chr { print $2"\t"$6 >> ext"bkgd."chr".ctrl.bed" }' "$wdir/bkgd.$ran/ctrl.bed"
+done
+
+
+printf "Done!\n"
+fi
+
+
+#determine average read lengths
+if [ $type == "single" ]; then
+printf "Getting average read lengths..."
+	if [ ! -z $bdir ]; then
+		readC=$(awk '{a=$3-$2;print a;}' "$wdir/bkgd.$ran/ctrl.bed" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{print int($1)}')
+	fi
+	readL=""
+	for s in $sdir/*.bed; do #and for each sample file
+		read=$(awk '{a=$3-$2;print a;}' "$s" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{print int($1)}')
+		readL="$readL,$read"
+	done
+	readL=${readL#","}
+printf "Done!\n"
+fi
+#=======================> DONE!
+
+
+
+
+
+
+# =========================== 
+# Step Three: Getting Signal
+# ===========================
+mkdir $wdir/signal.$ran/
+mkdir $out/signal #store signal
+
+
+printf "Generating Signal...(bin size: $signal)\n"
+
+if [ $type == "single" ]; then
+counting=1;			
+for f in $wdir/sizes.$ran/*; do #for each chromosome
+		samplelist=""
+		frag=""
+		frag=$fraglen
+		k=1
+
+		sizefile=$(basename $f)
+		chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}');
+		chrSize=$(cat $f | cut -f2);
+		printf "Chromosome $chr: "
+
+		#list of sample bed files and fragment lengths
+		for s in $wdir/samples.$ran/*.bed; do #and for each sample file
+			samplefile=$(basename $s)
+			chr2=$(echo $samplefile | awk -F"." '{print $2}');
+			if [ $chr == $chr2 ] #belonging to this chromosome
+			then
+				samplelist="$samplelist,$wdir/samples.$ran/ext.$samplefile"
+				samplename=$(echo $samplefile | awk -F"." '{ print $3 }')
+				samplefilename=$(echo $samplefile | cut -d'.' -f 3-)
+				shift=$(echo "$frag" | cut -f "$k" -d ",")
+				read=$(echo "$readL" | cut -f "$k" -d ",")	
+				k=$(($k+1))
+				perl "$sPath/readshifter.pl" "$wdir/samples.$ran/$samplefile" $shift $read > "$wdir/samples.$ran/ext.$samplefile"
+			fi
+		done
+
+		#control file
+		bkgdfile="None"
+		if [ ! -z $bdir ]; then
+			l=$(($dupnum+1))
+			bshift=$(echo $frag | cut -f "$l" -d ",")
+			perl "$sPath/readshifter.pl" "$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed" $bshift $readC > "$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed"
+			bkgdfile="$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed"
+		fi
+
+		#remove leading comma
+		samplelist=${samplelist#","}
+		frag=${frag#","}
+
+		depth="0";
+		if [ $normal == "depth" ]; then
+			genomeSize=$(awk '{a=$2;print a;}' "$gsize" | perl -lane '$a+=$_;END{print $a}' | awk '{print int($1)}');
+			depthTemp="";
+			for s in $sdir/*.bed; do #and for each sample file
+				readnum=$(wc -l $s | cut -d" " -f1);
+				depthTemping=$(awk -v rn=$readnum -v gs=$genomeSize 'BEGIN { print rn / gs }');
+				depthTemp="$depthTemp,$depthTemping";
+			done
+			if [ ! -z $bdir ]; then
+				readnum=$(wc -l "$wdir/bkgd.$ran/ctrl.bed" | cut -d" " -f1);
+				depthTemping=$(awk -v rn=$readnum -v gs=$genomeSize 'BEGIN { print rn / gs }');
+				depthTemp="$depthTemp,$depthTemping";
+			fi
+			depth=${depthTemp#","}
+		fi
+		#call the peak calling R script
+		Rscript "$sPath/signalmaker.r" -chromoS="$chrSize" -chromo="$chr" -bednames=$samplelist -frag=$frag -bkgd=$bkgdfile -out="$wdir/signal.$ran/" -sig="$signal" -regions="$regs" -p="$cores" -chrcount="$counting" -t="$type" -normal="$normal" -depth="$depth"
+		cat $wdir/signal.$ran/*.bedSignal | awk -F"\t" -v j=0 '$4 > j' > "$out/signal/$chr.bedGraph"
+		rm $wdir/signal.$ran/*.bedSignal
+		counting=$(($counting+1));
+done
+counting=1;
+fi
+
+
+
+if [ $type == "paired" ]; then
+counting=1;
+for f in $wdir/sizes.$ran/*; do #for each chromosome
+	samplelist=""
+
+	sizefile=$(basename $f)
+	chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}');
+	chrSize=$(cat $f | cut -f2);
+	printf "Chromosome $chr: "
+
+	#list of sample bed files and fragment lengths
+	for s in $wdir/samples.$ran/*.bed; do #and for each sample file
+		samplefile=$(basename $s)
+		chr2=$(echo $samplefile | awk -F"." '{print $2}');
+		if [ $chr == $chr2 ] #belonging to this chromosome
+		then
+			samplelist="$samplelist,$wdir/samples.$ran/$samplefile"
+			samplename=$(echo $samplefile | awk -F"." '{ print $3 }')
+			samplefilename=$(echo $samplefile | cut -d'.' -f 3-)
+			x="$sdir/$samplefilename"
+		fi
+	done
+
+	#control file
+	bkgdfile="None"
+	if [ ! -z $bdir ]; then
+		bkgdfile="$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed"
+	fi
+
+	#remove leading comma
+	samplelist=${samplelist#","}
+	frag=${frag#","}
+
+	depth="0";
+	if [ $normal == "depth" ]; then
+		genomeSize=$(awk '{a=$2;print a;}' "$gsize" | perl -lane '$a+=$_;END{print $a}' | awk '{print int($1)}');
+		depthTemp="";
+		for s in $sdir/*.bed; do #and for each sample file
+			readnum=$(wc -l $s | cut -d" " -f1);
+			depthTemping=$(awk -v rn=$readnum -v gs=$genomeSize 'BEGIN { print rn / gs }');
+			depthTemp="$depthTemp,$depthTemping";
+		done
+		if [ ! -z $bdir ]; then
+			readnum=$(wc -l "$wdir/bkgd.$ran/ctrl.bed" | cut -d" " -f1);
+			depthTemping=$(awk -v rn=$readnum -v gs=$genomeSize 'BEGIN { print rn / gs }');
+			depthTemp="$depthTemp,$depthTemping";
+		fi
+		depth=${depthTemp#","}
+	fi
+	#call the peak calling R script
+	Rscript "$sPath/signalmaker.r" -chromoS="$chrSize" -chromo="$chr" -bednames=$samplelist -frag=$frag -bkgd=$bkgdfile -out="$wdir/signal.$ran/" -sig="$signal" -regions="$regs" -p="$cores" -chrcount="$counting" -t="$type" -normal="$normal" -depth="$depth"
+	cat $wdir/signal.$ran/*.bedSignal | awk -F"\t" -v j=0 '$4 > j' > "$out/signal/$chr.bedGraph"
+	rm $wdir/signal.$ran/*.bedSignal
+	counting=$(($counting+1));
+done
+counting=1;
+fi
+#=======================> DONE!
+
+rm -rf $wdir
+
+printf "\n\n========================================\nWe're done...Congratulations!\n========================================\n\n"