Skip to content

Commit

Permalink
Add duplicate chain file support to alignment DB script
Browse files Browse the repository at this point in the history
This makes it more straightforward to create an alignment database directly from the flattened RODA downloads.
  • Loading branch information
ljarosch committed Mar 20, 2024
1 parent e678050 commit ee0c5db
Showing 1 changed file with 38 additions and 2 deletions.
40 changes: 38 additions & 2 deletions scripts/alignment_db_scripts/create_alignment_db_sharded.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def create_shard(
def main(args):
alignment_dir = args.alignment_dir
output_dir = args.output_db_path
output_dir.mkdir(exist_ok=True, parents=True)
output_db_name = args.output_db_name
n_shards = args.n_shards

Expand Down Expand Up @@ -165,6 +166,30 @@ def main(args):
super_index.update(shard_index)
print("\nCreated all shards.")

if args.duplicate_chains_file:
print("Extending super index with duplicate chains...")
duplicates_added = 0
with open(args.duplicate_chains_file, "r") as fp:
duplicate_chains = [line.strip().split() for line in fp]

for chains in duplicate_chains:
# find representative with alignment
for chain in chains:
if chain in super_index:
representative_chain = chain
break
else:
print(f"No representative chain found for {chains}, skipping...")
continue

# add duplicates to index
for chain in chains:
if chain != representative_chain:
super_index[chain] = super_index[representative_chain]
duplicates_added += 1

print(f"Added {duplicates_added} duplicate chains to index.")

# write super index to file
print("\nWriting super index...")
index_path = output_dir / f"{output_db_name}.index"
Expand All @@ -191,8 +216,8 @@ def main(args):
parser.add_argument(
"alignment_dir",
type=Path,
help="""Path to precomputed alignment directory, with one subdirectory
per chain.""",
help="""Path to precomputed flattened alignment directory, with one
subdirectory per chain.""",
)
parser.add_argument("output_db_path", type=Path)
parser.add_argument("output_db_name", type=str)
Expand All @@ -202,6 +227,17 @@ def main(args):
help="Number of shards to split the database into",
default=10,
)
parser.add_argument(
"--duplicate_chains_file",
type=Path,
help="""
Optional path to file containing duplicate chain information, where each
line contains chains that are 100% sequence identical. If provided,
duplicate chains will be added to the index and point to the same
underlying database entry as their representatives in the alignment dir.
""",
default=None,
)

args = parser.parse_args()

Expand Down

0 comments on commit ee0c5db

Please sign in to comment.