forked from kweitemier/fastq_collapse
/
fastq_collapse.py
executable file
·138 lines (124 loc) · 5.35 KB
/
fastq_collapse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python
from os import remove
from os import path #Importing two methods from os module
from re import sub #This imports regular expression usage
from optparse import OptionParser #Imports the option parser module
###### OPTIONS and USAGE ######
parser = OptionParser(usage = """fastq_collapse.py -i INFILE -o OUTFILE -j JOB_ID
fastq_collapse.py - Discards exact fastq sequence duplicates, preserving the
highest average quality score. It also makes a file containing all the
duplicated reads.
by
Kevin Weitemier
February 2013
Copyright (c) 2011,2012,2013 Kevin Weitemier.
Version 0.03
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version. A copy of this license is available at <http://www.gnu.org/licenses/>.
Great effort has been taken to make this software perform its said
task, however, this software comes with ABSOLUTELY NO WARRANTY,
not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Input - A fastq file where each sequence occupies four (4) lines.""")
parser.add_option("-i", action="store", type="string", dest="inname",
help="Input filename", default="")
parser.add_option("-o", action="store", type="string", dest="outname",
help="Output filename", default="")
parser.add_option("-j", action="store", type="string", dest="job_id",
default="1", help="""Unique job ID for performing multiple runs at once in
the same directory. Don't use characters that might cause problems in filenames.
(Default = "1")""")
(options, args) = parser.parse_args()
# Makes sure all filenames are given
if options.inname == "":
parser.error("Please include an input file using -i.")
if options.outname == "":
parser.error("Please include an output file using -o.")
###### OPENING INPUT/OUTPUT FILES ######
if path.isfile(options.outname):
parser.error("""The output filename exists. Please delete it first or \
choose a new name.""")
DupName = "%s_Duplicated_reads" % (options.outname)
if path.isfile(DupName):
parser.error("""The duplicate reads filename exists. Please delete is first\
or choose a new name.""")
InFile = open(options.inname, 'r')
OutFile = open(options.outname, 'w')
DupFile = open(DupName, 'w')
Line = InFile.readline()
Bases = ['A','T','C','G','N']
#Temp files
PathDict = {}
HandleDict = {}
for Base in Bases:
PathDict[Base] = "%s.%s_reads.temp" % (options.job_id,Base)
HandleDict[Base] = open(PathDict[Base], 'w')
###### ANALYSES ######
###### Parsing reads to temp files ##############
print ("Parsing reads to temp files.")
while Line: #Reads file line-by-line
#Reads and stores the 4 fastq lines
AtLine = Line.strip()
Line = InFile.readline() #This should be the sequence line
Seq = Line.strip()
Line = InFile.readline() #This should be the qual ID line
PlusLine = Line.strip()
Line = InFile.readline() #This should be the quality line
Qual = Line.strip()
for Base, Handle in HandleDict.iteritems():
if Seq[0] == Base or Seq[0] == Base.lower(): #write the reads to appropriate files.
Handle.write("%s\n%s\n%s\n%s\n" % (AtLine,Seq,PlusLine,Qual))
break
Line = InFile.readline() #This goes to the next @ line to continue loop.
InFile.close()
# Closing the temp files so they can be opened to read later.
for Handle in HandleDict.values():
Handle.close()
###### ANALYZE EACH TEMP FILE ##################
for Base in Bases:
print ("Checking for duplicates starting with'%s'." % (Base))
TempFile = open(PathDict[Base], 'r')
Line = TempFile.readline()
SeqDict = {}
DupDict = {}
while Line:
Line = Line.strip()
SeqID = sub('^@(.+)$',r"\1",Line)
Line = TempFile.readline() #This should be Seq line
Seq = Line.strip()
QualID = TempFile.readline().strip() #This should be qual ID line
Line = TempFile.readline() #This should be Qual line
Qual = Line.strip()
if Seq in SeqDict: #Tests if sequence has already been seen
#Add to the duplicate dictionary
DupDict[SeqID] = [Seq,Qual,QualID]
DupDict[SeqDict[Seq][0]] = [Seq,SeqDict[Seq][1],SeqDict[Seq][2]]
#Do comparison
OldScore = 0
NewScore = 0
for Position in Qual:
Score = ord(Position) - 64
NewScore += Score
for Place in SeqDict[Seq][1]:
Score = ord(Place) - 64
OldScore += Score
if NewScore > OldScore:
SeqDict[Seq] = [SeqID,Qual,QualID]
else:
SeqDict[Seq] = [SeqID,Qual,QualID]
Line = TempFile.readline() #Advance to next @line to continue
print ("Writing reads starting with '%s' to file." % (Base))
for Key, Value in SeqDict.iteritems():
OutFile.write('@%s\n%s\n%s\n%s\n' % (Value[0],Key,Value[2],Value[1]))
for Key, Value in DupDict.iteritems():
DupFile.write('@%s\n%s\n%s\n%s\n' % (Key,Value[0],Value[2],Value[1]))
TempFile.close()
SeqDict.clear()
DupDict.clear()
###### CLEANUP ##########
for Path in PathDict.values():
remove(Path)
OutFile.close()
DupFile.close()
###### END OF FILE ########