Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add script for expanding the alignment dir with duplicates
This adds support for duplicate chain expansion for the alignment dir format. This script can be run on the flattened non-redundant RODA alignments to add explicit directories for all of the duplicate chains in the duplicate_chains file, symlinked to their representative chain alignment directory.
- Loading branch information
Showing
1 changed file
with
75 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
""" | ||
The RODA database is non-redundant, meaning that it only stores one explicit | ||
representative alignment directory for all PDB chains in a 100% sequence | ||
identity cluster. In order to add explicit alignments for all PDB chains, this | ||
script will add the missing chain directories and symlink them to their | ||
representative alignment directories. | ||
""" | ||
|
||
from argparse import ArgumentParser | ||
from pathlib import Path | ||
|
||
from tqdm import tqdm | ||
|
||
|
||
def create_duplicate_dirs(duplicate_chains: list[list[str]], alignment_dir: Path):
    """
    Create duplicate directory symlinks for all chains in the given duplicate lists.

    For each group of chains, the first chain that already has an entry in
    ``alignment_dir`` is taken as the representative. Every other chain in the
    group that has no entry yet gets a symlink, created inside
    ``alignment_dir``, that points at the representative's entry. Groups
    without any existing entry are reported and skipped.

    Args:
        duplicate_chains (list[list[str]]): A list of lists, where each inner list
            contains chains that are 100% sequence identical.
        alignment_dir (Path): Path to flattened alignment directory, with one
            subdirectory per chain.
    """
    print("Creating duplicate directory symlinks...")
    dirs_created = 0
    for chains in tqdm(duplicate_chains):
        # find the chain that has an alignment
        for chain in chains:
            if (alignment_dir / chain).exists():
                representative_chain = chain
                break
        else:
            print(f"No representative chain found for {chains}, skipping...")
            continue

        # create symlinks for all other chains
        for chain in chains:
            if chain == representative_chain:
                continue

            target_path = alignment_dir / chain
            # exists() follows symlinks and therefore reports False for a
            # dangling link; check is_symlink() too so that a leftover link
            # is skipped instead of making symlink_to() raise FileExistsError.
            if target_path.exists() or target_path.is_symlink():
                print(f"Chain {chain} already exists, skipping...")
                continue

            # A relative symlink target is resolved against the directory
            # that contains the link, not the current working directory.
            # Linking to the bare sibling name keeps the link valid even when
            # alignment_dir was given as a relative path, and keeps it valid
            # if the whole alignment directory is later moved.
            target_path.symlink_to(representative_chain)
            dirs_created += 1

    print(f"Created directories for {dirs_created} duplicate chains.")
|
||
|
||
def main(alignment_dir: Path, duplicate_chains_file: Path):
    """
    Read the duplicate chains file and create the duplicate chain symlinks.

    Args:
        alignment_dir (Path): Path to flattened alignment directory, with one
            subdirectory per chain.
        duplicate_chains_file (Path): Path to a text file in which each line is
            a whitespace-separated list of chains belonging to one group.
    """
    # read duplicate chains file
    with open(duplicate_chains_file, "r") as fp:
        # split() without arguments already ignores leading/trailing
        # whitespace. Blank lines are dropped so they do not turn into empty
        # groups that would only trigger "no representative" messages.
        duplicate_chains = [line.split() for line in fp if line.strip()]

    create_duplicate_dirs(duplicate_chains, alignment_dir)
|
||
|
||
if __name__ == "__main__":
    # The module docstring doubles as the description shown by --help.
    cli = ArgumentParser(description=__doc__)

    # Both arguments are positional; argparse converts each to a Path.
    cli.add_argument(
        "alignment_dir",
        type=Path,
        help="""Path to flattened alignment directory, with one subdirectory
                per chain.""",
    )
    cli.add_argument(
        "duplicate_chains_file",
        type=Path,
        # "%%" is an escaped percent sign: argparse %-formats help strings.
        help="""Path to file containing duplicate chains, where each line
                contains a space-separated list of chains that are 100%%
                sequence identical.
        """,
    )

    parsed = cli.parse_args()
    main(parsed.alignment_dir, parsed.duplicate_chains_file)