diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c2f1f7d..80ccadf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,12 +12,15 @@ * [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes. * [Patch] - [#189](https://github.com/a-r-j/graphein/pull/189) fixes bug where chains and PDB identifiers were not properly aligned in `ml.ProteinGraphDataset`. +* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Adds missing `MSE` to `graphein.protein.resi_atoms.RESI_NAMES` and `graphein.protein.resi_atoms.RESI_THREE_TO_1`. [#200](https://github.com/a-r-j/graphein/issues/200) +* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where the same-chain check in `add_atomic_edges` always evaluated as False, so atom pairs in different chains were never skipped. [#199](https://github.com/a-r-j/graphein/issues/199) +* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where deprotonation would only remove hydrogens based on `atom_name` rather than `element_symbol`. [#198](https://github.com/a-r-j/graphein/issues/198) +* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug in `ml.ProteinGraphDataset` input validation. #### Breaking Changes * [#189](https://github.com/a-r-j/graphein/pull/189/) refactors PDB download util. Now returns path to download file, does not accept a config object but instead receives the output directory path directly. 
- ### 1.5.0 #### Protein diff --git a/docs/source/conf.py b/docs/source/conf.py index 2dd8f823..f3a0a9ff 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -29,7 +29,7 @@ author = "Arian Jamasb" # The full version, including alpha/beta/rc tags -release = "1.5.0" +release = "1.5.1" # -- General configuration --------------------------------------------------- diff --git a/graphein/__init__.py b/graphein/__init__.py index 8b58fc84..97a72298 100644 --- a/graphein/__init__.py +++ b/graphein/__init__.py @@ -12,7 +12,7 @@ from .testing import * __author__ = "Arian Jamasb " -__version__ = "1.5.0" +__version__ = "1.5.1" logger.configure( diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py index 9f464fd7..3665918f 100644 --- a/graphein/ml/datasets/torch_geometric_dataset.py +++ b/graphein/ml/datasets/torch_geometric_dataset.py @@ -9,7 +9,7 @@ import logging as log import os from pathlib import Path -from typing import Callable, Dict, List, Optional +from typing import Callable, Dict, Generator, List, Optional import networkx as nx from tqdm import tqdm @@ -414,9 +414,9 @@ def __init__( if chain_selections is not None: self.chain_selection_map = dict(enumerate(chain_selections)) else: - self.graph_label_map = None + self.chain_selection_map = None self.validate_input() - self.bad_pdbs: List[str] = [] + self.bad_pdbs: List[str] = [] # Configs self.config = graphein_config @@ -451,23 +451,26 @@ def processed_file_names(self) -> List[str]: return [f"{pdb}.pt" for pdb in self.structures] def validate_input(self): - assert len(self.structures) == len( - self.graph_label_map - ), "Number of proteins and graph labels must match" - assert len(self.structures) == len( - self.node_label_map - ), "Number of proteins and node labels must match" - assert len(self.structures) == len( - self.chain_selection_map - ), "Number of proteins and chain selections must match" - assert len( - { - f"{pdb}_{chain}" - for pdb, chain 
in zip( - self.structures, self.chain_selection_map - ) - } - ) == len(self.structures), "Duplicate protein/chain combinations" + if self.graph_label_map is not None: + assert len(self.structures) == len( + self.graph_label_map + ), "Number of proteins and graph labels must match" + if self.node_label_map is not None: + assert len(self.structures) == len( + self.node_label_map + ), "Number of proteins and node labels must match" + if self.chain_selection_map is not None: + assert len(self.structures) == len( + self.chain_selection_map + ), "Number of proteins and chain selections must match" + assert len( + { + f"{pdb}_{chain}" + for pdb, chain in zip( + self.structures, self.chain_selection_map + ) + } + ) == len(self.structures), "Duplicate protein/chain combinations" def download(self): """Download the PDB files from RCSB or Alphafold.""" @@ -530,7 +533,7 @@ def process(self): # Chunk dataset for parallel processing chunk_size = 128 - def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]: + def divide_chunks(l: List[str], n: int = 2) -> Generator: for i in range(0, len(l), n): yield l[i : i + n] @@ -584,12 +587,16 @@ def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]: data_list = [self.pre_transform(data) for data in data_list] for i, (pdb, chain) in enumerate(zip(pdbs, chain_selections)): - - torch.save( - data_list[i], - os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"), - ) - idx += 1 + if self.chain_selection_map is None: + torch.save( + data_list[i], + os.path.join(self.processed_dir, f"{pdb}.pt"), + ) + else: + torch.save( + data_list[i], + os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"), + ) def get(self, idx: int): """ diff --git a/graphein/protein/edges/atomic.py b/graphein/protein/edges/atomic.py index d5b76f8f..a98ae694 100644 --- a/graphein/protein/edges/atomic.py +++ b/graphein/protein/edges/atomic.py @@ -134,7 +134,7 @@ def add_atomic_edges(G: nx.Graph, tolerance: float = 0.56) -> nx.Graph: continue # Check atoms are 
in the same chain - if not (chain_1 and chain_2): + if chain_1 != chain_2: continue if G.has_edge(node_1, node_2): diff --git a/graphein/protein/graphs.py b/graphein/protein/graphs.py index da278559..3ece7d36 100644 --- a/graphein/protein/graphs.py +++ b/graphein/protein/graphs.py @@ -149,7 +149,7 @@ def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame: "Deprotonating protein. This removes H atoms from the pdb_df dataframe" ) return filter_dataframe( - df, by_column="atom_name", list_of_values=["H"], boolean=False + df, by_column="element_symbol", list_of_values=["H"], boolean=False ) diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py index 560acd49..b2244035 100644 --- a/graphein/protein/resi_atoms.py +++ b/graphein/protein/resi_atoms.py @@ -338,6 +338,7 @@ "LYS", "MET", "MLE", + "MSE", "MVA", "NH2", "NLE", @@ -434,6 +435,7 @@ "LYS": "K", "MET": "M", "MLE": "L", + "MSE": "M", "MVA": "V", "NH2": "X", "NLE": "L", diff --git a/setup.py b/setup.py index c20c0416..df550f23 100644 --- a/setup.py +++ b/setup.py @@ -135,7 +135,7 @@ def run(self): setup( name="graphein", - version="1.5.0", + version="1.5.1", # versioneer.get_version(), # cmdclass=versioneer.get_cmdclass(), description="Protein & Interactomic Graph Construction for Machine Learning",