Skip to content

Commit

Permalink
Add script for expanding the alignment dir with duplicates
Browse files Browse the repository at this point in the history
This adds support for duplicate chain expansion for the alignment dir format. This script can be run on the flattened non-redundant RODA alignments to add explicit directories for all of the duplicate chains in the duplicate_chains file, symlinked to their representative chain alignment directory.
  • Loading branch information
ljarosch committed Mar 20, 2024
1 parent ee0c5db commit 94819bf
Showing 1 changed file with 75 additions and 0 deletions.
75 changes: 75 additions & 0 deletions scripts/expand_roda_duplicates.py
@@ -0,0 +1,75 @@
"""
The RODA database is non-redundant, meaning that it only stores one explicit
representative alignment directory for all PDB chains in a 100% sequence
identity cluster. In order to add explicit alignments for all PDB chains, this
script will add the missing chain directories and symlink them to their
representative alignment directories.
"""

from argparse import ArgumentParser
from pathlib import Path

from tqdm import tqdm


def create_duplicate_dirs(duplicate_chains: list[list[str]], alignment_dir: Path):
"""
Create duplicate directory symlinks for all chains in the given duplicate lists.
Args:
duplicate_lists (list[list[str]]): A list of lists, where each inner list
contains chains that are 100% sequence identical.
alignment_dir (Path): Path to flattened alignment directory, with one
subdirectory per chain.
"""
print("Creating duplicate directory symlinks...")
dirs_created = 0
for chains in tqdm(duplicate_chains):
# find the chain that has an alignment
for chain in chains:
if (alignment_dir / chain).exists():
representative_chain = chain
break
else:
print(f"No representative chain found for {chains}, skipping...")
continue

# create symlinks for all other chains
for chain in chains:
if chain != representative_chain:
target_path = alignment_dir / chain
if target_path.exists():
print(f"Chain {chain} already exists, skipping...")
else:
(target_path).symlink_to(alignment_dir / representative_chain)
dirs_created += 1

print(f"Created directories for {dirs_created} duplicate chains.")


def main(alignment_dir: Path, duplicate_chains_file: Path):
# read duplicate chains file
with open(duplicate_chains_file, "r") as fp:
duplicate_chains = [list(line.strip().split()) for line in fp]

create_duplicate_dirs(duplicate_chains, alignment_dir)


if __name__ == "__main__":
parser = ArgumentParser(description=__doc__)
parser.add_argument(
"alignment_dir",
type=Path,
help="""Path to flattened alignment directory, with one subdirectory
per chain.""",
)
parser.add_argument(
"duplicate_chains_file",
type=Path,
help="""Path to file containing duplicate chains, where each line
contains a space-separated list of chains that are 100%%
sequence identical.
""",
)
args = parser.parse_args()
main(args.alignment_dir, args.duplicate_chains_file)

0 comments on commit 94819bf

Please sign in to comment.