Improve graphein.protein.tensor.io.to_pdb & prep for 1.7.5 release (#352)

* update pdb writing util
* bump version strings to 1.7.5
* update changelog

---------

Co-authored-by: Arian Jamasb <arian.jamasb@roche.com>
a-r-j · Oct 27, 2023 · 3d7af1f · 3d7af1f
1 parent 27c065b
commit 3d7af1f
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+### 1.7.5 - 27/10/2023
+
+* Improves the tensor->PDB writer (`graphein.protein.tensor.io.to_pdb`) by automatically unravelling residue-level b-factor predictions/annotations ([#352](https://github.com/a-r-j/graphein/pull/352)).
+
 ### 1.7.4 - 26/10/2023
 
 * Adds support for PyG 2.4+ ([#350](https://www.github.com/a-r-j/graphein/pull/339))

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -34,7 +34,7 @@
 copyright = f"{datetime.datetime.now().year}, {author}"
 
 # The full version, including alpha/beta/rc tags
-release = "1.7.4"
+release = "1.7.5"
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/graphein/__init__.py b/graphein/__init__.py
@@ -12,7 +12,7 @@
 from .testing import *
 
 __author__ = "Arian Jamasb <arian@jamasb.io>"
-__version__ = "1.7.4"
+__version__ = "1.7.5"
 
 
 logger.configure(

diff --git a/graphein/protein/tensor/io.py b/graphein/protein/tensor/io.py
@@ -392,7 +392,9 @@ def to_dataframe(
     :param insertions: List of insertion codes, defaults to ``None`` (``""``).
     :type insertions: Optional[List[Union[str, float]]], optional
     :param b_factors: List or tensor of b factors (length: num residues),
-        defaults to ``None`` (``""``).
+        defaults to ``None`` (``""``). If ``b_factors`` is of length/shape
+        number of residues (as opposed to number of atoms) it is automatically
+        unravelled to the correct length.
     :type b_factors: Optional[List[Union[str, float]]], optional
     :param occupancy: List or tensor of occupancy values (length: num residues),
         defaults to ``None`` (``1.0``).
@@ -434,12 +436,25 @@ def to_dataframe(
     element_symbols = [ELEMENT_SYMBOL_MAP[a] for a in atom_type]
 
     chains = ["A"] * len(res_nums) if chains is None else chains[res_nums - 1]
+    if b_factors is not None:
+        num_b_factors = (
+            len(b_factors)
+            if isinstance(b_factors, list)
+            else b_factors.shape[0]
+        )
+        b_factors = (
+            b_factors[res_nums - 1]
+            if num_b_factors == x.shape[0]
+            else b_factors
+        )
+        if isinstance(b_factors, torch.Tensor):
+            b_factors = b_factors.tolist()
+    else:
+        b_factors = [0.0] * len(res_nums)
     if segment_id is None:
         segment_id = [""] * len(res_nums)
     if insertions is None:
         insertions = [""] * len(res_nums)
-    if b_factors is None:
-        b_factors = [0.0] * len(res_nums)
     if occupancy is None:
         occupancy = [1.0] * len(res_nums)
     if charge is None:
@@ -480,7 +495,6 @@ def to_dataframe(
         "line_idx": atom_number,
     }
     df = pd.DataFrame().from_dict(out)
-
     if biopandas:
         ppdb = PandasPdb()
         ppdb.df["ATOM"] = df
@@ -501,7 +515,7 @@ def to_pdb(x: AtomTensor, out_path: str, gz: bool = False, **kwargs):
     :type x: AtomTensor
     :param out_path: Path to output pdb file.
     :type out_path: str
-    :param gz: Whether to gzip out the ouput, defaults to ``False``.
+    :param gz: Whether to gzip out the output, defaults to ``False``.
     :type gz: bool, optional
     :param kwargs: Keyword args for :func:`graphein.protein.tensor.to_dataframe`
     """

diff --git a/setup.py b/setup.py
@@ -134,7 +134,7 @@ def run(self):
 
 setup(
     name="graphein",
-    version="1.7.4",
+    version="1.7.5",
     description="Protein & Interactomic Graph Construction for Machine Learning",
     long_description=long_description,
     long_description_content_type="text/markdown",