Skip to content

Commit

Permalink
Merge pull request #96 from briney/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
briney committed Apr 28, 2021
2 parents dc49798 + 54d1760 commit 0b58a3f
Show file tree
Hide file tree
Showing 73 changed files with 5,269 additions and 85 deletions.
10 changes: 5 additions & 5 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ include abstar/assigners/germline_dbs/mouse/blast/*
include abstar/assigners/germline_dbs/mouse/ungapped/*
include abstar/assigners/germline_dbs/mouse/imgt_gapped/*
include abstar/assigners/germline_dbs/mouse/isotypes/*
include abstar/assigners/germline_dbs/vrc01mouse/*
include abstar/assigners/germline_dbs/vrc01mouse/blast/*
include abstar/assigners/germline_dbs/vrc01mouse/ungapped/*
include abstar/assigners/germline_dbs/vrc01mouse/imgt_gapped/*
include abstar/assigners/germline_dbs/vrc01mouse/isotypes/*
include abstar/assigners/germline_dbs/humouse/*
include abstar/assigners/germline_dbs/humouse/blast/*
include abstar/assigners/germline_dbs/humouse/ungapped/*
include abstar/assigners/germline_dbs/humouse/imgt_gapped/*
include abstar/assigners/germline_dbs/humouse/isotypes/*
include abstar/test_data/*
include abstar/utils/*
include abstar/utils/queue/*
Expand Down
19 changes: 10 additions & 9 deletions abstar/assigners/assigner.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def __call__(self, seqs, species):
``GermlineSegment`` objects contain information about the assigned germline gene segment (V, D or J).
Instantiation of a ``GermlineSegment`` object requires only the name of the germline gene segment (in
IMGT format, like 'IGHV3-23*01') and the species. Several other optional arguments can be included
IMGT format, like 'IGHV3-23*01') and the database name. Several other optional arguments can be included
at instantiation. ``score`` is the assignment score, typically an ``int`` or ``float``. ``strand``
indicates the strand orientation of the input sequence (``'+'`` or ``'-'``). ``others`` is a list
of additional high scoring ``GermlineSegment`` objects. ``assigner_name`` is the name of the custom
Expand Down Expand Up @@ -233,8 +233,8 @@ def __call__(self, seqs, species):
class MyAssigner(BaseAssigner):
def __init__(self, species):
super(MyAssigner, self).__init__(species)
def __init__(self, db_name):
super(MyAssigner, self).__init__(db_name)
self.binary = os.path.join(self.binary_directory, 'mybinary_{}'.format(platform.system()))
def __call__(self, sequence_file, file_format):
Expand Down Expand Up @@ -270,7 +270,7 @@ def __call__(self, sequence_file, file_format):
return vdj
def assign_germline(self, vdj, segment):
germ_db = os.path.join(self.germline_directory, '{}/ungapped/{}.fasta'.format(self.species,
germ_db = os.path.join(self.germline_directory, '{}/ungapped/{}.fasta'.format(self.db_name,
segment.lower()))
# do stuff to assign the germline gene, using the germline DB selected by ``db_name``
Expand All @@ -283,17 +283,18 @@ def assign_germline(self, vdj, segment):
vdj.log('{}-ASSIGNMENT ERROR:'.format(segment),
'Score ({}) is too low'.format(germs[0].score))
return None
others = [GermlineSegment(germ.name, self.species, score=germ.score) for germ in germs[1:6]]
return GermlineSegment(germs[0].name, self.species, score=germs[0].score, others=others)
others = [GermlineSegment(germ.name, self.db_name, score=germ.score) for germ in germs[1:6]]
return GermlineSegment(germs[0].name, self.db_name, score=germs[0].score, others=others)
"""

__metaclass__ = abc.ABCMeta

def __init__(self, species):
def __init__(self, db_name):
super(BaseAssigner, self).__init__()
self.name = self.__class__.__name__.lower()
self.species = species
self.species = db_name
self.db_name = db_name
self._assigned = None
self._unassigned = None
self._germline_directory = None
Expand All @@ -308,7 +309,7 @@ def __call__(self, sequence_file, file_format):
@property
def germline_directory(self):
if self._germline_directory is None:
self._germline_directory = get_germline_database_directory(self.species)
self._germline_directory = get_germline_database_directory(self.db_name)
return self._germline_directory

@germline_directory.setter
Expand Down
41 changes: 24 additions & 17 deletions abstar/assigners/blastn.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ class Blastn(BaseAssigner):
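Germline assigner that uses BLASTn to assign V- and J-genes and aligns the remaining query against the ungapped D-gene FASTA database to assign D-genes.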
"""

def __init__(self, species):
super(Blastn, self).__init__(species)
def __init__(self, db_name):
super(Blastn, self).__init__(db_name)


def __call__(self, sequence_file, file_format):
Expand All @@ -59,7 +59,7 @@ def __call__(self, sequence_file, file_format):
handle.write('\n'.join(s.fasta for s in seqs))

# assign V-genes
vblast_records = self.blast(sequence_file, self.species, 'V')
vblast_records = self.blast(sequence_file, 'V')
# if there aren't any vblast_records, that means that none of the
# sequences in the input file contained sequences with a significant
# match to any germline V-gene. These are likely all non-antibody sequences.
Expand All @@ -74,7 +74,7 @@ def __call__(self, sequence_file, file_format):
jquery_seqs = []
for seq, vbr in zip(seqs, vblast_records):
try:
germ = self.process_blast_record(vbr, self.species)
germ = self.process_blast_record(vbr)
vdj = VDJ(seq, v=germ)
self.orient_query(vdj, vbr)
jquery = self.get_jquery_sequence(vdj.oriented, vbr)
Expand Down Expand Up @@ -105,10 +105,10 @@ def __call__(self, sequence_file, file_format):
_vdjs = []
dquery_seqs = []
jblast_infile = self.build_jblast_input(jquery_seqs)
jblast_records = self.blast(jblast_infile, self.species, 'J')
jblast_records = self.blast(jblast_infile, 'J')
for vdj, jquery, jbr in zip(vdjs, jquery_seqs, jblast_records):
try:
germ = self.process_blast_record(jbr, self.species)
germ = self.process_blast_record(jbr)
vdj.j = germ
# sanity check to make sure there's not an obvious problem with the V/J
# assignments (likely due to poor germline matches to a non-antibody sequence)
Expand All @@ -131,7 +131,7 @@ def __call__(self, sequence_file, file_format):
for vdj, dquery in zip(vdjs, dquery_seqs):
if all([vdj.v.chain == 'heavy', dquery]):
try:
germ = self.assign_dgene(dquery, self.species)
germ = self.assign_dgene(dquery)
vdj.d = germ
except:
vdj.exception('D-GENE ASSIGNMENT ERROR:', traceback.format_exc())
Expand All @@ -146,7 +146,7 @@ def __call__(self, sequence_file, file_format):
# return 'blastn'


def blast(self, seq_file, species, segment):
def blast(self, seq_file, segment):
'''
Runs BLASTn against an antibody germline database.
Expand All @@ -155,9 +155,6 @@ def blast(self, seq_file, species, segment):
seq_file (str): Path to a FASTA-formatted file of input sequences.
species (str): Species of origin of the antibody sequences in ``seq_file``.
Options are: ``human``, ``macaque``, ``mouse`` and ``rabbit``.
segment (str): Germline segment to query. Options are ``V`` and ``J``.
'''
blast_path = os.path.join(self.binary_directory, 'blastn_{}'.format(platform.system().lower()))
Expand All @@ -182,7 +179,7 @@ def blast(self, seq_file, species, segment):
return blast_records


def assign_dgene(self, seq, species):
def assign_dgene(self, seq):
db_file = os.path.join(self.germline_directory, 'ungapped/d.fasta')
with open(db_file, 'r') as db_handle:
germs = [Sequence(s) for s in SeqIO.parse(db_handle, 'fasta')]
Expand All @@ -192,22 +189,32 @@ def assign_dgene(self, seq, species):
gap_open=-20, gap_extend=-2)
alignments.sort(key=lambda x: x.score, reverse=True)
all_gls = [a.target.id for a in alignments]
if '__' in all_gls[0]:
species = all_gls[0].split('__')[-1].replace('-', ' ')
# all_gls = [gl.split('__')[0] for gl in all_gls]
else:
species = self.db_name
all_scores = [a.score for a in alignments]
if not all([all_gls, all_scores]):
return None
top_gl = all_gls[0]
top_score = all_scores[0]
others = [GermlineSegment(germ, species, score=score) for germ, score in zip(all_gls[1:6], all_scores[1:6])]
return GermlineSegment(top_gl, species, score=top_score, others=others, assigner_name=self.name)
others = [GermlineSegment(germ, species, self.db_name, score=score) for germ, score in zip(all_gls[1:6], all_scores[1:6])]
return GermlineSegment(top_gl, species, self.db_name, score=top_score, others=others, assigner_name=self.name)


def process_blast_record(self, blast_record, species):
def process_blast_record(self, blast_record):
all_gls = [a.title.split()[0] for a in blast_record.alignments]
if '__' in all_gls[0]:
species = all_gls[0].split('__')[-1].replace('-', ' ')
# all_gls = [gl.split('__')[0] for gl in all_gls]
else:
species = self.db_name
all_scores = [a.hsps[0].bits for a in blast_record.alignments]
top_gl = all_gls[0]
top_score = all_scores[0]
others = [GermlineSegment(germ, species, score=score) for germ, score in zip(all_gls[1:], all_scores[1:])]
return GermlineSegment(top_gl, species, score=top_score, others=others[:5], assigner_name=self.name)
others = [GermlineSegment(germ, species, self.db_name, score=score) for germ, score in zip(all_gls[1:], all_scores[1:])]
return GermlineSegment(top_gl, species, self.db_name, score=top_score, others=others[:5], assigner_name=self.name)


@staticmethod
Expand Down
9 changes: 9 additions & 0 deletions abstar/assigners/germline_dbs/humouse/blast/d.blastlog
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@


Building a new DB, current time: 04/27/2021 21:31:35
New DB name: /Users/bryanbriney/google_drive/brineylab/projects/ab[x]/abstar_germline-dbs/humouse_042721/humouse/blast/d
New DB title: d
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 63 sequences in 0.00511813 seconds.
Empty file.
Binary file added abstar/assigners/germline_dbs/humouse/blast/d.nhr
Binary file not shown.
Binary file added abstar/assigners/germline_dbs/humouse/blast/d.nin
Binary file not shown.
Binary file added abstar/assigners/germline_dbs/humouse/blast/d.nog
Binary file not shown.
126 changes: 126 additions & 0 deletions abstar/assigners/germline_dbs/humouse/blast/d.nsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
ighd1-1*01__homo-sapies53
ighd1-1*01__mus-musculus26
ighd1-1*02__mus-musculus36
ighd1-2*01__mus-musculus28
ighd1-20*01__homo-sapies54
ighd1-26*01__homo-sapies55
ighd1-3*01__mus-musculus9
ighd1-7*01__homo-sapies39
ighd2-1*01__mus-musculus10
ighd2-10*01__mus-musculus30
ighd2-10*02__mus-musculus11
ighd2-11*01__mus-musculus31
ighd2-11*02__mus-musculus12
ighd2-12*01__mus-musculus7
ighd2-13*01__mus-musculus8
ighd2-14*01__mus-musculus13
ighd2-15*01__homo-sapies20
ighd2-2*01__homo-sapies18
ighd2-2*01__mus-musculus29
ighd2-2*02__homo-sapies56
ighd2-2*03__homo-sapies37
ighd2-21*01__homo-sapies21
ighd2-21*02__homo-sapies57
ighd2-3*01__mus-musculus17
ighd2-4*01__mus-musculus23
ighd2-5*01__mus-musculus38
ighd2-6*01__mus-musculus0
ighd2-7*01__mus-musculus27
ighd2-8*01__homo-sapies40
ighd2-8*01__mus-musculus24
ighd2-8*02__homo-sapies19
ighd2-9*01__mus-musculus25
ighd2-9*02__mus-musculus14
ighd3-1*01__mus-musculus1
ighd3-10*01__homo-sapies41
ighd3-10*02__homo-sapies50
ighd3-16*01__homo-sapies49
ighd3-16*02__homo-sapies58
ighd3-2*01__mus-musculus34
ighd3-2*02__mus-musculus2
ighd3-22*01__homo-sapies51
ighd3-3*01__homo-sapies42
ighd3-3*01__mus-musculus15
ighd3-3*02__homo-sapies52
ighd3-9*01__homo-sapies43
ighd4-1*01__mus-musculus33
ighd4-1*02__mus-musculus32
ighd4-17*01__homo-sapies59
ighd4-4*01__homo-sapies44
ighd5-12*01__homo-sapies45
ighd5-18*01__homo-sapies60
ighd5-5*01__homo-sapies46
ighd6-1*01__mus-musculus3
ighd6-1*02__mus-musculus35
ighd6-13*01__homo-sapies47
ighd6-19*01__homo-sapies61
ighd6-2*01__mus-musculus4
ighd6-2*02__mus-musculus16
ighd6-25*01__homo-sapies62
ighd6-3*01__mus-musculus5
ighd6-4*01__mus-musculus6
ighd6-6*01__homo-sapies48
ighd7-27*01__homo-sapies22
lcl|ighd1-1*01__homo-sapies53
lcl|ighd1-1*01__mus-musculus26
lcl|ighd1-1*02__mus-musculus36
lcl|ighd1-2*01__mus-musculus28
lcl|ighd1-20*01__homo-sapies54
lcl|ighd1-26*01__homo-sapies55
lcl|ighd1-3*01__mus-musculus9
lcl|ighd1-7*01__homo-sapies39
lcl|ighd2-1*01__mus-musculus10
lcl|ighd2-10*01__mus-musculus30
lcl|ighd2-10*02__mus-musculus11
lcl|ighd2-11*01__mus-musculus31
lcl|ighd2-11*02__mus-musculus12
lcl|ighd2-12*01__mus-musculus7
lcl|ighd2-13*01__mus-musculus8
lcl|ighd2-14*01__mus-musculus13
lcl|ighd2-15*01__homo-sapies20
lcl|ighd2-2*01__homo-sapies18
lcl|ighd2-2*01__mus-musculus29
lcl|ighd2-2*02__homo-sapies56
lcl|ighd2-2*03__homo-sapies37
lcl|ighd2-21*01__homo-sapies21
lcl|ighd2-21*02__homo-sapies57
lcl|ighd2-3*01__mus-musculus17
lcl|ighd2-4*01__mus-musculus23
lcl|ighd2-5*01__mus-musculus38
lcl|ighd2-6*01__mus-musculus0
lcl|ighd2-7*01__mus-musculus27
lcl|ighd2-8*01__homo-sapies40
lcl|ighd2-8*01__mus-musculus24
lcl|ighd2-8*02__homo-sapies19
lcl|ighd2-9*01__mus-musculus25
lcl|ighd2-9*02__mus-musculus14
lcl|ighd3-1*01__mus-musculus1
lcl|ighd3-10*01__homo-sapies41
lcl|ighd3-10*02__homo-sapies50
lcl|ighd3-16*01__homo-sapies49
lcl|ighd3-16*02__homo-sapies58
lcl|ighd3-2*01__mus-musculus34
lcl|ighd3-2*02__mus-musculus2
lcl|ighd3-22*01__homo-sapies51
lcl|ighd3-3*01__homo-sapies42
lcl|ighd3-3*01__mus-musculus15
lcl|ighd3-3*02__homo-sapies52
lcl|ighd3-9*01__homo-sapies43
lcl|ighd4-1*01__mus-musculus33
lcl|ighd4-1*02__mus-musculus32
lcl|ighd4-17*01__homo-sapies59
lcl|ighd4-4*01__homo-sapies44
lcl|ighd5-12*01__homo-sapies45
lcl|ighd5-18*01__homo-sapies60
lcl|ighd5-5*01__homo-sapies46
lcl|ighd6-1*01__mus-musculus3
lcl|ighd6-1*02__mus-musculus35
lcl|ighd6-13*01__homo-sapies47
lcl|ighd6-19*01__homo-sapies61
lcl|ighd6-2*01__mus-musculus4
lcl|ighd6-2*02__mus-musculus16
lcl|ighd6-25*01__homo-sapies62
lcl|ighd6-3*01__mus-musculus5
lcl|ighd6-4*01__mus-musculus6
lcl|ighd6-6*01__homo-sapies48
lcl|ighd7-27*01__homo-sapies22
Binary file added abstar/assigners/germline_dbs/humouse/blast/d.nsi
Binary file not shown.
Binary file added abstar/assigners/germline_dbs/humouse/blast/d.nsq
Binary file not shown.
9 changes: 9 additions & 0 deletions abstar/assigners/germline_dbs/humouse/blast/j.blastlog
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@


Building a new DB, current time: 04/27/2021 21:31:35
New DB name: /Users/bryanbriney/google_drive/brineylab/projects/ab[x]/abstar_germline-dbs/humouse_042721/humouse/blast/j
New DB title: j
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 47 sequences in 0.00339293 seconds.
Empty file.
Binary file added abstar/assigners/germline_dbs/humouse/blast/j.nhr
Binary file not shown.
Binary file added abstar/assigners/germline_dbs/humouse/blast/j.nin
Binary file not shown.
Binary file added abstar/assigners/germline_dbs/humouse/blast/j.nog
Binary file not shown.

0 comments on commit 0b58a3f

Please sign in to comment.