Skip to content

Commit

Permalink
Fix DSSP on local files and asteroid plot hover name (#176)
Browse files Browse the repository at this point in the history
* adds node name to hover

* fixes relative paths (threw errors before)

* refactored protein graphs to always have as params: name, pdb_code, pdb_path. Also fixes #171, which was not properly fixed by #172

* fixed notebook execution failure, ran black, fixed docstring

* adds test for PR #176: dssp with pdb code or local pdb

* ran black, added notebook show_edges visualization, added myself to CONTRIBUTORS.md

* dssp now reconstructs a pdb instead of downloading one if none available. pdb_dir default changed to /tmp

* re-ran black

* fixed tmp security issue. Updated changelog.
  • Loading branch information
avivko committed May 23, 2022
1 parent dcf294f commit 0a2d5e3
Show file tree
Hide file tree
Showing 12 changed files with 35,915 additions and 3,678 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
### 1.5.0 - UNRELEASED

* [Patch] - [#178](https://github.com/a-r-j/graphein/pull/178) Fixes [#171](https://github.com/a-r-j/graphein/pull/171) and optimizes `graphein.protein.features.nodes.dssp`. Contribution by @avivko.
* [Feature] - [#170](https://github.com/a-r-j/graphein/pull/170) Adds support for viewing edges in `graphein.protein.visualisation.asteroid_plot`. Contribution by @avivko.
* [Feature] - #163 Adds support for conformer generation for SMILES inputs to molecule graph construction.
* [Feature] - #163 Adds support for molecule graph generation from an RDKit.Chem.Mol input.
Expand Down
2 changes: 2 additions & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@ Ryan Greenhalgh: [@rg314](https://github.com/rg314)
Rico Meinl: [@ricomnl](https://github.com/ricomnl)

Alex Morehead [@amorehead](https://github.com/amorehead)

Aviv Korman [@avivko](https://github.com/avivko)
1 change: 1 addition & 0 deletions graphein/protein/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ def plot_graph_metric_property_correlation(
labels={
col: col.replace("_", " ") for col in dataf.columns
}, # remove underscore
hover_name=dataf.index,
color=colour_by,
height=height,
width=width,
Expand Down
15 changes: 9 additions & 6 deletions graphein/protein/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# Code Repository: https://github.com/a-r-j/graphein
from __future__ import annotations

import os
from functools import partial
from pathlib import Path
from typing import Any, Callable, List, Optional, Union
Expand Down Expand Up @@ -42,9 +43,13 @@ class GetContactsConfig(BaseModel):

get_contacts_path: Path = Path(
"/Users/arianjamasb/github/getcontacts/"
).resolve() # TODO: get rid of this absolute path
contacts_dir: Path = Path(
os.path.join(os.path.dirname(__file__), "../examples/contacts/")
).resolve()
pdb_dir: Path = Path(
os.path.join(os.path.dirname(__file__), "../examples/pdbs/")
).resolve()
contacts_dir: Path = Path("../examples/contacts/").resolve()
pdb_dir: Path = Path("../examples/pdbs/").resolve()
granularity: str = "CA"


Expand Down Expand Up @@ -114,7 +119,7 @@ class ProteinGraphConfig(BaseModel):
:param insertions: Controls whether or not insertions are allowed.
:type insertions: bool
:param pdb_dir: Specifies path to download protein structures into.
:type pdb_dir: pathlib.Path
:type pdb_dir: pathlib.Path. Optional.
:param verbose: Specifies verbosity of graph creation process.
:type verbose: bool
:param exclude_waters: Specifies whether or not water molecules are excluded from the structure
Expand Down Expand Up @@ -147,9 +152,7 @@ class ProteinGraphConfig(BaseModel):
granularity: Union[GraphAtoms, GranularityOpts] = "CA"
keep_hets: bool = False
insertions: bool = False
pdb_dir: Path = Path(
"../examples/pdbs/"
) # Also suggest to avoid hard-coding paths if possible!
pdb_dir: Optional[Path] = None
verbose: bool = False
exclude_waters: bool = True
deprotonate: bool = False
Expand Down
37 changes: 28 additions & 9 deletions graphein/protein/features/nodes/dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@

import os
from typing import Any, Dict, Optional
import tempfile

import networkx as nx
import pandas as pd
from Bio.Data.IUPACData import protein_letters_1to3
from Bio.PDB.DSSP import dssp_dict_from_pdb_file, residue_max_acc

from graphein.protein.utils import download_pdb, is_tool
from graphein.protein.utils import is_tool, save_pdb_df_to_pdb

DSSP_COLS = [
"chain",
Expand Down Expand Up @@ -94,7 +95,10 @@ def process_dssp_df(df: pd.DataFrame) -> pd.DataFrame:
return df


def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph:
def add_dssp_df(
G: nx.Graph,
dssp_config: Optional[DSSPConfig],
) -> nx.Graph:
"""
Construct DSSP dataframe and add as graph level variable to protein graph
Expand All @@ -107,7 +111,9 @@ def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph:
"""

config = G.graph["config"]
pdb_id = G.graph["pdb_id"]
pdb_code = G.graph["pdb_code"]
pdb_path = G.graph["pdb_path"]
pdb_name = G.graph["name"]

# Extract DSSP executable
executable = dssp_config.executable
Expand All @@ -117,17 +123,30 @@ def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph:
executable
), "DSSP must be on PATH and marked as an executable"

# Check for existence of pdb file. If not, download it.
if not os.path.isfile(config.pdb_dir / (pdb_id + ".pdb")):
pdb_file = download_pdb(config, pdb_id)
pdb_file = None
if pdb_path:
if os.path.isfile(pdb_path):
pdb_file = pdb_path
else:
pdb_file = config.pdb_dir / (pdb_id + ".pdb")
if config.pdb_dir:
if os.path.isfile(config.pdb_dir / (pdb_code + ".pdb")):
pdb_file = config.pdb_dir / (pdb_code + ".pdb")

# Use the existing PDB file if one was found; otherwise reconstruct it from the raw dataframe.
if pdb_file:
dssp_dict = dssp_dict_from_pdb_file(pdb_file, DSSP=executable)
else:
with tempfile.TemporaryDirectory() as tmpdirname:
save_pdb_df_to_pdb(
G.graph["raw_pdb_df"], tmpdirname + f"/{pdb_name}.pdb"
)
dssp_dict = dssp_dict_from_pdb_file(
tmpdirname + f"/{pdb_name}.pdb", DSSP=executable
)

if config.verbose:
print(f"Using DSSP executable '{executable}'")

# Run DSSP
dssp_dict = dssp_dict_from_pdb_file(pdb_file, DSSP=executable)
dssp_dict = parse_dssp_df(dssp_dict)
dssp_dict = process_dssp_df(dssp_dict)

Expand Down
66 changes: 49 additions & 17 deletions graphein/protein/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,11 @@ def read_pdb_to_dataframe(
:returns: ``pd.DataFrame`` containing protein structure
:rtype: pd.DataFrame
"""
if pdb_code is None and pdb_path is None:
raise NameError("One of pdb_code or pdb_path must be specified!")

assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), (
"Either a PDB ID or a path to a local PDB file"
" must be specified to read a PDB"
)

atomic_df = (
PandasPdb().read_pdb(pdb_path)
Expand Down Expand Up @@ -349,8 +352,10 @@ def select_chains(
def initialise_graph_with_metadata(
protein_df: pd.DataFrame,
raw_pdb_df: pd.DataFrame,
pdb_id: str,
granularity: str,
name: Optional[str] = None,
pdb_code: Optional[str] = None,
pdb_path: Optional[str] = None,
) -> nx.Graph:
"""
Initializes the nx Graph object with initial metadata.
Expand All @@ -359,17 +364,35 @@ def initialise_graph_with_metadata(
:type protein_df: pd.DataFrame
:param raw_pdb_df: Unprocessed dataframe of protein structure for comparison and traceability downstream.
:type raw_pdb_df: pd.DataFrame
:param pdb_id: PDB Accession code.
:type pdb_id: str
:param granularity: Granularity of the graph (eg ``"atom"``, ``"CA"``, ``"CB"`` etc or ``"centroid"``).
See: :const:`~graphein.protein.config.GRAPH_ATOMS` and :const:`~graphein.protein.config.GRANULARITY_OPTS`.
:type granularity: str
:param name: Optional name for the graph. If ``None``, the PDB code or the file name will be used to name the graph.
:type name: Optional[str], defaults to ``None``
:param pdb_code: PDB ID / Accession code, if the PDB is available on the PDB database.
:type pdb_code: Optional[str], defaults to ``None``
:param pdb_path: path to local PDB file, if constructing a graph from a local file.
:type pdb_path: Optional[str], defaults to ``None``
:return: Returns initial protein structure graph with metadata.
:rtype: nx.Graph
"""

assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), (
"Either a PDB ID or a path to a local PDB file"
" must be specified to read a PDB"
)

# Get name for graph if no name was provided
if not name:
if pdb_path:
name = get_protein_name_from_filename(pdb_path)
else:
name = pdb_code

G = nx.Graph(
name=pdb_id,
pdb_id=pdb_id,
name=name,
pdb_code=pdb_code,
pdb_path=pdb_path,
chain_ids=list(protein_df["chain_id"].unique()),
pdb_df=protein_df,
raw_pdb_df=raw_pdb_df,
Expand Down Expand Up @@ -501,6 +524,7 @@ def compute_edges(

def construct_graph(
config: Optional[ProteinGraphConfig] = None,
name: Optional[str] = None,
pdb_path: Optional[str] = None,
pdb_code: Optional[str] = None,
chain_selection: str = "all",
Expand All @@ -520,10 +544,12 @@ def construct_graph(
:param config: :class:`~graphein.protein.config.ProteinGraphConfig` object. If None, defaults to config in ``graphein.protein.config``.
:type config: graphein.protein.config.ProteinGraphConfig, optional
:param pdb_path: Path to ``pdb_file`` to build graph from. Default is ``None``.
:type pdb_path: str, optional
:param pdb_code: 4-character PDB accession pdb_code to build graph from. Default is ``None``.
:type pdb_code: str, optional
:param name: An optional name for the graph. The PDB ID or PDB file name will be used if not specified.
:type name: str, optional
:param pdb_path: Path to ``pdb_file`` when constructing a graph from a local pdb file. Default is ``None``.
:type pdb_path: Optional[str], defaults to ``None``
:param pdb_code: A 4-character PDB ID / accession to be used to construct the graph, if available. Default is ``None``.
:type pdb_code: Optional[str], defaults to ``None``
:param chain_selection: String of polypeptide chains to include in graph. E.g ``"ABDF"`` or ``"all"``. Default is ``"all"``.
:type chain_selection: str
:param df_processing_funcs: List of dataframe processing functions. Default is ``None``.
Expand All @@ -540,14 +566,15 @@ def construct_graph(
:type: nx.Graph
"""

assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), (
"Either a PDB ID or a path to a local PDB file"
" must be specified to construct a graph"
)

# If no config is provided, use default
if config is None:
config = ProteinGraphConfig()

# Get name from pdb_file is no pdb_code is provided
if pdb_path and (pdb_code is None):
pdb_code = get_protein_name_from_filename(pdb_path)

# If config params are provided, overwrite them
config.protein_df_processing_functions = (
df_processing_funcs
Expand Down Expand Up @@ -582,14 +609,19 @@ def construct_graph(
granularity=config.granularity,
)
protein_df = process_dataframe(
raw_df, chain_selection=chain_selection, granularity=config.granularity, insertions=config.insertions
raw_df,
chain_selection=chain_selection,
granularity=config.granularity,
insertions=config.insertions,
)

# Initialise graph with metadata
g = initialise_graph_with_metadata(
protein_df=protein_df,
raw_pdb_df=raw_df.df["ATOM"],
pdb_id=pdb_code,
name=name,
pdb_code=pdb_code,
pdb_path=pdb_path,
granularity=config.granularity,
)
# Add nodes to graph
Expand Down
3 changes: 2 additions & 1 deletion graphein/protein/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import os
import tempfile
from functools import lru_cache
from pathlib import Path
from shutil import which
Expand Down Expand Up @@ -68,7 +69,7 @@ def download_pdb(config, pdb_code: str) -> Path:
"""
pdb_code = pdb_code.lower()
if not config.pdb_dir:
config.pdb_dir = Path("/tmp/")
config.pdb_dir = Path(tempfile.TemporaryDirectory().name)

# Initialise class and download pdb file
pdbl = PDBList()
Expand Down
29 changes: 16 additions & 13 deletions graphein/protein/visualisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,15 @@ def colour_edges(
raise NotImplementedError(
"Other edge colouring methods not implemented."
)

assert (0.0 <= set_alpha <= 1.0), f"Alpha value {set_alpha} must be between 0.0 and 1.0"

assert (
0.0 <= set_alpha <= 1.0
), f"Alpha value {set_alpha} must be between 0.0 and 1.0"
colors = [c[:3] + (set_alpha,) for c in colors]
if return_as_rgba:
return [
f"rgba{tuple(list(co.convert_to_RGB_255(c[:3])) + [c[3]])}" for c in colors
f"rgba{tuple(list(co.convert_to_RGB_255(c[:3])) + [c[3]])}"
for c in colors
]
return colors

Expand Down Expand Up @@ -735,11 +738,16 @@ def asteroid_plot(

if show_edges:
edge_colors = colour_edges(
subgraph, colour_map=edge_colour_map, colour_by=colour_edges_by,
set_alpha=edge_alpha, return_as_rgba=True
subgraph,
colour_map=edge_colour_map,
colour_by=colour_edges_by,
set_alpha=edge_alpha,
return_as_rgba=True,
)
show_legend_bools = [(True if x not in edge_colors[:i] else False)
for i, x in enumerate(edge_colors)]
show_legend_bools = [
(True if x not in edge_colors[:i] else False)
for i, x in enumerate(edge_colors)
]
edge_trace = []
for i, (u, v) in enumerate(subgraph.edges()):
x0, y0 = subgraph.nodes[u]["pos"]
Expand Down Expand Up @@ -812,12 +820,7 @@ def asteroid_plot(
width=width,
height=height,
titlefont_size=16,
legend=dict(
yanchor="top",
y=1,
xanchor="left",
x=1.10
),
legend=dict(yanchor="top", y=1, xanchor="left", x=1.10),
showlegend=True if show_legend else False,
hovermode="closest",
margin=dict(b=20, l=5, r=5, t=40),
Expand Down

0 comments on commit 0a2d5e3

Please sign in to comment.