Source code for dgllife.utils.mol_to_graph

# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Convert molecules into DGLGraphs
#
# pylint: disable= no-member, arguments-differ, invalid-name

from functools import partial
import torch
import dgl

from sklearn.neighbors import NearestNeighbors

try:
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles, rdmolops
except ImportError:
    pass

__all__ = ['mol_to_graph',
           'smiles_to_bigraph',
           'mol_to_bigraph',
           'smiles_to_complete_graph',
           'mol_to_complete_graph',
           'k_nearest_neighbors',
           'mol_to_nearest_neighbor_graph',
           'smiles_to_nearest_neighbor_graph',
           'ToGraph',
           'MolToBigraph',
           'SMILESToBigraph']

# pylint: disable=I1101
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order, explicit_hydrogens=False, num_virtual_nodes=0):
    """Build a DGLGraph from an RDKit molecule and attach features to it.

    This is the generic entry point: the topology is delegated to
    ``graph_constructor``, so any kind of ``DGLGraph`` can be produced from an
    RDKit molecule instance.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, whose result is used to
        update ``ndata`` of the graph. Skipped if None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, whose result is used to
        update ``edata`` of the graph. Skipped if None.
    canonical_atom_order : bool
        Whether to renumber the atoms with the ranks returned by
        ``rdmolfiles.CanonicalRankAtoms`` before constructing the graph. Setting
        it to True might change the order of atoms in the graph constructed.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        ``rdkit.Chem.AddHs(mol)`` is called. Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. Every virtual node is connected to
        every real node in both directions. If the graph has any node/edge
        feature, one extra binary column is appended to each feature; it is 1 for
        virtual nodes/edges and 0 otherwise. Default to 0.

    Returns
    -------
    DGLGraph or None
        Converted DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    # Hydrogens only become nodes if they exist as explicit atoms
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    if canonical_atom_order:
        atom_ranks = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, atom_ranks)

    g = graph_constructor(mol)

    if node_featurizer is not None:
        node_feats = node_featurizer(mol)
        g.ndata.update(node_feats)

    if edge_featurizer is not None:
        edge_feats = edge_featurizer(mol)
        g.edata.update(edge_feats)

    if not num_virtual_nodes > 0:
        return g

    num_real = g.num_nodes()
    real_ids = list(range(num_real))
    g.add_nodes(num_virtual_nodes)

    # For each virtual node: first real -> virtual edges, then virtual -> real edges
    src, dst = [], []
    for virtual_id in range(num_real, num_real + num_virtual_nodes):
        virtual_ids = [virtual_id] * num_real
        src += real_ids + virtual_ids
        dst += virtual_ids + real_ids
    g.add_edges(src, dst)

    # Virtual nodes were appended last, so they occupy the trailing rows
    for name, feats in g.ndata.items():
        flag = torch.zeros(g.num_nodes(), 1)
        flag[-num_virtual_nodes:] = 1
        g.ndata[name] = torch.cat([feats, flag], dim=1)

    # Likewise the virtual edges are the trailing rows of every edge feature
    num_virtual_edges = 2 * num_virtual_nodes * num_real
    for name, feats in g.edata.items():
        flag = torch.zeros(g.num_edges(), 1)
        flag[-num_virtual_edges:] = 1
        g.edata[name] = torch.cat([feats, flag], dim=1)

    return g
def construct_bigraph_from_mol(mol, add_self_loop=False):
    """Create the bi-directed, feature-free DGLGraph topology of a molecule.

    Node **i** of the returned graph is atom ``mol.GetAtomWithIdx(i)``.
    Bond **i**, i.e. ``mol.GetBondWithIdx(i)``, yields edge **2i** from **u** to
    **v** and edge **2i+1** from **v** to **u**, where **u** is
    ``bond.GetBeginAtomIdx()`` and **v** is ``bond.GetEndAtomIdx()``.

    If self loops are requested, they are the last **n** edges, one per atom in
    the order ``0, 1, ..., n-1``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.

    Returns
    -------
    g : DGLGraph
        Empty bigraph topology of the molecule
    """
    g = dgl.graph(([], []), idtype=torch.int32)

    # Register all atoms up front so that atoms without bonds are kept as nodes
    g.add_nodes(mol.GetNumAtoms())

    src_list, dst_list = [], []
    for bond_id in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_id)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # One chemical bond becomes a pair of opposite directed edges
        src_list += [begin, end]
        dst_list += [end, begin]

    if add_self_loop:
        loop_ids = g.nodes().tolist()
        src_list += loop_ids
        dst_list += loop_ids

    g.add_edges(torch.IntTensor(src_list), torch.IntTensor(dst_list))
    return g
def mol_to_bigraph(mol, add_self_loop=False, node_featurizer=None, edge_featurizer=None,
                   canonical_atom_order=True, explicit_hydrogens=False, num_virtual_nodes=0):
    """Turn an RDKit molecule into a featurized bi-directed DGLGraph.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
        Default to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected
        to all real nodes with virtual edges. If the returned graph has any
        node/edge feature, an additional column of binary values will be used for
        each feature to indicate the identity of virtual node/edges. The features
        of the virtual nodes/edges will be zero vectors except for the additional
        column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Bi-directed DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Examples
    --------
    >>> from rdkit import Chem
    >>> from dgllife.utils import mol_to_bigraph

    >>> mol = Chem.MolFromSmiles('CCO')
    >>> g = mol_to_bigraph(mol)
    >>> print(g)
    DGLGraph(num_nodes=3, num_edges=4,
             ndata_schemes={}
             edata_schemes={})

    Node/edge features can be initialized while the graph is constructed.

    >>> import torch

    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> def featurize_bonds(mol):
    ...     feats = []
    ...     bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    ...                   Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    ...     for bond in mol.GetBonds():
    ...         btype = bond_types.index(bond.GetBondType())
    ...         # One bond between atom u and v corresponds to two edges (u, v) and (v, u)
    ...         feats.extend([btype, btype])
    ...     return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    >>> g = mol_to_bigraph(mol, node_featurizer=featurize_atoms,
    ...                    edge_featurizer=featurize_bonds)
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['type'])
    tensor([[0.],
            [0.],
            [0.],
            [0.]])

    Hydrogens are not nodes by default; they can be made explicit as follows.

    >>> g = mol_to_bigraph(mol, explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=9, num_edges=16,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    smiles_to_bigraph
    """
    # Bind the self-loop choice so that the constructor only needs the molecule
    bigraph_constructor = partial(construct_bigraph_from_mol, add_self_loop=add_self_loop)
    return mol_to_graph(mol=mol,
                        graph_constructor=bigraph_constructor,
                        node_featurizer=node_featurizer,
                        edge_featurizer=edge_featurizer,
                        canonical_atom_order=canonical_atom_order,
                        explicit_hydrogens=explicit_hydrogens,
                        num_virtual_nodes=num_virtual_nodes)
def smiles_to_bigraph(smiles, add_self_loop=False, node_featurizer=None, edge_featurizer=None,
                      canonical_atom_order=True, explicit_hydrogens=False, num_virtual_nodes=0):
    """Turn a SMILES string into a featurized bi-directed DGLGraph.

    The SMILES is parsed with ``Chem.MolFromSmiles`` and the resulting molecule
    is handed to :func:`mol_to_bigraph`.

    Parameters
    ----------
    smiles : str
        String of SMILES
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
        Default to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected
        to all real nodes with virtual edges. If the returned graph has any
        node/edge feature, an additional column of binary values will be used for
        each feature to indicate the identity of virtual node/edges. The features
        of the virtual nodes/edges will be zero vectors except for the additional
        column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Bi-directed DGLGraph for the molecule if :attr:`smiles` is valid and None otherwise.

    Examples
    --------
    >>> from dgllife.utils import smiles_to_bigraph

    >>> g = smiles_to_bigraph('CCO')
    >>> print(g)
    DGLGraph(num_nodes=3, num_edges=4,
             ndata_schemes={}
             edata_schemes={})

    Node/edge features can be initialized while the graph is constructed.

    >>> import torch
    >>> from rdkit import Chem

    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> def featurize_bonds(mol):
    ...     feats = []
    ...     bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    ...                   Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    ...     for bond in mol.GetBonds():
    ...         btype = bond_types.index(bond.GetBondType())
    ...         # One bond between atom u and v corresponds to two edges (u, v) and (v, u)
    ...         feats.extend([btype, btype])
    ...     return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    >>> g = smiles_to_bigraph('CCO', node_featurizer=featurize_atoms,
    ...                       edge_featurizer=featurize_bonds)
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['type'])
    tensor([[0.],
            [0.],
            [0.],
            [0.]])

    Hydrogens are not nodes by default; they can be made explicit as follows.

    >>> g = smiles_to_bigraph('CCO', explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=9, num_edges=16,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    mol_to_bigraph
    """
    # An unparsable SMILES yields None, which mol_to_bigraph reports and passes back
    parsed_mol = Chem.MolFromSmiles(smiles)
    return mol_to_bigraph(parsed_mol,
                          add_self_loop=add_self_loop,
                          node_featurizer=node_featurizer,
                          edge_featurizer=edge_featurizer,
                          canonical_atom_order=canonical_atom_order,
                          explicit_hydrogens=explicit_hydrogens,
                          num_virtual_nodes=num_virtual_nodes)
def construct_complete_graph_from_mol(mol, add_self_loop=False):
    """Construct a complete graph with topology only for the molecule

    The **i** th atom in the molecule, i.e. ``mol.GetAtomWithIdx(i)``, corresponds to the
    **i** th node in the returned DGLGraph.

    Edges are ``(src, dst)`` pairs enumerated with the source as the outer index, i.e. in
    the order of (0, 0), (0, 1), (0, 2), ... (1, 0), (1, 1), (1, 2), ... If self loops
    are not created, (0, 0), (1, 1), ... are left out.

    The number of nodes is always the number of atoms, including for molecules that
    produce no edges at all (a single atom without self loops).

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.

    Returns
    -------
    g : DGLGraph
        Empty complete graph topology of the molecule
    """
    num_atoms = mol.GetNumAtoms()
    pairs = [(i, j)
             for i in range(num_atoms)
             for j in range(num_atoms)
             if i != j or add_self_loop]
    src = [i for i, _ in pairs]
    dst = [j for _, j in pairs]

    # Without an explicit node count DGL infers it from the edge list, which gives a
    # graph with 0 nodes when there are no edges (e.g. one atom and no self loops).
    g = dgl.graph((torch.IntTensor(src), torch.IntTensor(dst)),
                  num_nodes=num_atoms, idtype=torch.int32)

    return g
def mol_to_complete_graph(mol, add_self_loop=False, node_featurizer=None,
                          edge_featurizer=None, canonical_atom_order=True,
                          explicit_hydrogens=False, num_virtual_nodes=0):
    """Turn an RDKit molecule into a featurized complete DGLGraph.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
        Default to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected
        to all real nodes with virtual edges. If the returned graph has any
        node/edge feature, an additional column of binary values will be used for
        each feature to indicate the identity of virtual node/edges. The features
        of the virtual nodes/edges will be zero vectors except for the additional
        column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Complete DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Examples
    --------
    >>> from rdkit import Chem
    >>> from dgllife.utils import mol_to_complete_graph

    >>> mol = Chem.MolFromSmiles('CCO')
    >>> g = mol_to_complete_graph(mol)
    >>> print(g)
    DGLGraph(num_nodes=3, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Node/edge features can be initialized while the graph is constructed.

    >>> import torch
    >>> from functools import partial

    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> def featurize_edges(mol, add_self_loop=False):
    ...     feats = []
    ...     num_atoms = mol.GetNumAtoms()
    ...     distance_matrix = Chem.GetDistanceMatrix(mol)
    ...     for i in range(num_atoms):
    ...         for j in range(num_atoms):
    ...             if i != j or add_self_loop:
    ...                 feats.append(float(distance_matrix[i, j]))
    ...     return {'dist': torch.tensor(feats).reshape(-1, 1).float()}

    >>> add_self_loop = True
    >>> g = mol_to_complete_graph(
    ...         mol, add_self_loop=add_self_loop, node_featurizer=featurize_atoms,
    ...         edge_featurizer=partial(featurize_edges, add_self_loop=add_self_loop))
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['dist'])
    tensor([[0.],
            [2.],
            [1.],
            [2.],
            [0.],
            [1.],
            [1.],
            [1.],
            [0.]])

    Hydrogens are not nodes by default; they can be made explicit as follows.

    >>> g = mol_to_complete_graph(mol, explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=9, num_edges=72,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    smiles_to_complete_graph
    """
    # Bind the self-loop choice so that the constructor only needs the molecule
    complete_graph_constructor = partial(construct_complete_graph_from_mol,
                                         add_self_loop=add_self_loop)
    return mol_to_graph(mol=mol,
                        graph_constructor=complete_graph_constructor,
                        node_featurizer=node_featurizer,
                        edge_featurizer=edge_featurizer,
                        canonical_atom_order=canonical_atom_order,
                        explicit_hydrogens=explicit_hydrogens,
                        num_virtual_nodes=num_virtual_nodes)
def smiles_to_complete_graph(smiles, add_self_loop=False, node_featurizer=None,
                             edge_featurizer=None, canonical_atom_order=True,
                             explicit_hydrogens=False, num_virtual_nodes=0):
    """Turn a SMILES string into a featurized complete DGLGraph.

    The SMILES is parsed with ``Chem.MolFromSmiles`` and the resulting molecule
    is handed to :func:`mol_to_complete_graph`.

    Parameters
    ----------
    smiles : str
        String of SMILES
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
        Default to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected
        to all real nodes with virtual edges. If the returned graph has any
        node/edge feature, an additional column of binary values will be used for
        each feature to indicate the identity of virtual node/edges. The features
        of the virtual nodes/edges will be zero vectors except for the additional
        column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Complete DGLGraph for the molecule if :attr:`smiles` is valid and None otherwise.

    Examples
    --------
    >>> from dgllife.utils import smiles_to_complete_graph

    >>> g = smiles_to_complete_graph('CCO')
    >>> print(g)
    DGLGraph(num_nodes=3, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Node/edge features can be initialized while the graph is constructed.

    >>> import torch
    >>> from rdkit import Chem
    >>> from functools import partial

    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> def featurize_edges(mol, add_self_loop=False):
    ...     feats = []
    ...     num_atoms = mol.GetNumAtoms()
    ...     distance_matrix = Chem.GetDistanceMatrix(mol)
    ...     for i in range(num_atoms):
    ...         for j in range(num_atoms):
    ...             if i != j or add_self_loop:
    ...                 feats.append(float(distance_matrix[i, j]))
    ...     return {'dist': torch.tensor(feats).reshape(-1, 1).float()}

    >>> add_self_loop = True
    >>> g = smiles_to_complete_graph(
    ...         'CCO', add_self_loop=add_self_loop, node_featurizer=featurize_atoms,
    ...         edge_featurizer=partial(featurize_edges, add_self_loop=add_self_loop))
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['dist'])
    tensor([[0.],
            [2.],
            [1.],
            [2.],
            [0.],
            [1.],
            [1.],
            [1.],
            [0.]])

    Hydrogens are not nodes by default; they can be made explicit as follows.

    >>> g = smiles_to_complete_graph('CCO', explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=9, num_edges=72,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    mol_to_complete_graph
    """
    # An unparsable SMILES yields None, which the mol-based function reports and passes back
    parsed_mol = Chem.MolFromSmiles(smiles)
    return mol_to_complete_graph(parsed_mol,
                                 add_self_loop=add_self_loop,
                                 node_featurizer=node_featurizer,
                                 edge_featurizer=edge_featurizer,
                                 canonical_atom_order=canonical_atom_order,
                                 explicit_hydrogens=explicit_hydrogens,
                                 num_virtual_nodes=num_virtual_nodes)
def k_nearest_neighbors(coordinates, neighbor_cutoff, max_num_neighbors=None,
                        p_distance=2, self_loops=False):
    """Find k nearest neighbors for each atom

    All atoms within ``neighbor_cutoff`` of an atom are collected with a radius query.
    If more than ``max_num_neighbors`` are found for an atom, only the closest ones are
    kept. Each kept neighbor contributes one edge from the neighbor (source) to the atom
    being queried (destination).

    We do not guarantee that the edges are sorted according to the distance
    between atoms.

    Parameters
    ----------
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    self_loops : bool
        Whether to allow a node to be its own neighbor. Default to False.

    Returns
    -------
    srcs : list of int
        Source nodes.
    dsts : list of int
        Destination nodes, corresponding to ``srcs``.
    distances : list of float
        Distances between the end nodes, corresponding to
        ``srcs`` and ``dsts``.

    Examples
    --------
    >>> from dgllife.utils import get_mol_3d_coordinates, k_nearest_neighbors
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> srcs, dsts, dists = k_nearest_neighbors(coords, neighbor_cutoff=1.25)
    >>> print(srcs)
    [8, 7, 11, 10, 20, 19]
    >>> print(dsts)
    [7, 8, 10, 11, 19, 20]
    >>> print(dists)
    [1.2084666104583117, 1.2084666104583117, 1.226457824344217,
     1.226457824344217, 1.2230522248065987, 1.2230522248065987]

    See Also
    --------
    get_mol_3d_coordinates
    mol_to_nearest_neighbor_graph
    smiles_to_nearest_neighbor_graph
    """
    num_atoms = coordinates.shape[0]
    model = NearestNeighbors(radius=neighbor_cutoff, p=p_distance)
    model.fit(coordinates)
    dists_, nbrs = model.radius_neighbors(coordinates)
    srcs, dsts, dists = [], [], []
    for i in range(num_atoms):
        dists_i = dists_[i].tolist()
        nbrs_i = nbrs[i].tolist()
        if not self_loops and i in nbrs_i:
            # Drop the atom itself by its position in the neighbor list. Searching the
            # distances for an exact 0 fails whenever the computed self-distance is
            # only approximately zero.
            self_pos = nbrs_i.index(i)
            del nbrs_i[self_pos]
            del dists_i[self_pos]
        if max_num_neighbors is not None and len(nbrs_i) > max_num_neighbors:
            # Sort neighbors based on distance from smallest to largest
            # and keep the closest ones only
            closest = sorted(zip(dists_i, nbrs_i),
                             key=lambda tup: tup[0])[:max_num_neighbors]
            dists_i = [dist for dist, _ in closest]
            nbrs_i = [nbr for _, nbr in closest]
        dsts.extend([i] * len(nbrs_i))
        srcs.extend(nbrs_i)
        dists.extend(dists_i)

    return srcs, dsts, dists
# pylint: disable=E1102
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False,
                                  num_virtual_nodes=0):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates. Row ``i`` is expected to belong
        to atom ``i`` of the molecule as it is after the optional hydrogen addition
        and before any canonical renumbering.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. The
        coordinates are reordered together with the atoms. Default to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges (1 for virtual, 0 for real).
        The features of the virtual nodes/edges will be zero vectors except for the
        additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Raises
    ------
    AssertionError
        If the number of atoms differs from the number of rows in ``coordinates``, or
        if ``keep_dists`` is True and ``dist_field`` is already used by an edge feature.

    Examples
    --------
    >>> from dgllife.utils import get_mol_3d_coordinates, mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)
        # RenumberAtoms makes old atom new_order[i] the new atom i. Apply the same
        # permutation to the coordinates so that the neighbor search is carried out
        # in the atom order that the featurizers will see.
        coordinates = coordinates[list(new_order)]

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = dgl.graph(([], []), idtype=torch.int32)

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    if num_virtual_nodes > 0:
        num_real_nodes = g.num_nodes()
        real_nodes = list(range(num_real_nodes))
        g.add_nodes(num_virtual_nodes)

        # Change Topology: connect every virtual node with every real node in
        # both directions
        virtual_src = []
        virtual_dst = []
        for count in range(num_virtual_nodes):
            virtual_node = num_real_nodes + count
            virtual_node_copy = [virtual_node] * num_real_nodes
            virtual_src.extend(real_nodes)
            virtual_src.extend(virtual_node_copy)
            virtual_dst.extend(virtual_node_copy)
            virtual_dst.extend(real_nodes)
        g.add_edges(virtual_src, virtual_dst)

        # Virtual nodes/edges were appended last, so they are the trailing rows.
        # They get 1 in the indicator column, as documented and as in mol_to_graph.
        for nk, nv in g.ndata.items():
            nv = torch.cat([nv, torch.zeros(g.num_nodes(), 1)], dim=1)
            nv[-num_virtual_nodes:, -1] = 1
            g.ndata[nk] = nv

        for ek, ev in g.edata.items():
            ev = torch.cat([ev, torch.zeros(g.num_edges(), 1)], dim=1)
            ev[-num_virtual_nodes * num_real_nodes * 2:, -1] = 1
            g.edata[ek] = ev

    return g
def smiles_to_nearest_neighbor_graph(smiles,
                                     coordinates,
                                     neighbor_cutoff,
                                     max_num_neighbors=None,
                                     p_distance=2,
                                     add_self_loop=False,
                                     node_featurizer=None,
                                     edge_featurizer=None,
                                     canonical_atom_order=True,
                                     keep_dists=False,
                                     dist_field='dist',
                                     explicit_hydrogens=False,
                                     num_virtual_nodes=0):
    """Turn a SMILES string into a featurized nearest neighbor graph.

    The SMILES is parsed with ``Chem.MolFromSmiles`` and the resulting molecule
    is handed to :func:`mol_to_nearest_neighbor_graph`.

    Unlike a bigraph or a complete graph, a nearest neighbor graph may not be
    symmetric: i being the closest neighbor of j does not necessarily suggest
    the other way.

    Parameters
    ----------
    smiles : str
        String of SMILES
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
        Default to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected
        to all real nodes with virtual edges. If the returned graph has any
        node/edge feature, an additional column of binary values will be used for
        each feature to indicate the identity of virtual node/edges. The features
        of the virtual nodes/edges will be zero vectors except for the additional
        column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`smiles` is valid and None otherwise.

    Examples
    --------
    >>> from dgllife.utils import get_mol_3d_coordinates, smiles_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> smiles = 'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'
    >>> mol = Chem.MolFromSmiles(smiles)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = smiles_to_nearest_neighbor_graph(smiles, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    The distance between the end atoms of each edge can be kept with

    >>> g = smiles_to_nearest_neighbor_graph(smiles, coords, neighbor_cutoff=1.25,
    ...                                      keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    Hydrogens are not nodes by default; they can be made explicit as follows.

    >>> mol = Chem.MolFromSmiles(smiles)
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = smiles_to_nearest_neighbor_graph(smiles, coords, neighbor_cutoff=1.25,
    ...                                      explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    mol_to_nearest_neighbor_graph
    """
    # An unparsable SMILES yields None, which the mol-based function reports and passes back
    parsed_mol = Chem.MolFromSmiles(smiles)
    return mol_to_nearest_neighbor_graph(
        parsed_mol,
        coordinates,
        neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        p_distance=p_distance,
        add_self_loop=add_self_loop,
        node_featurizer=node_featurizer,
        edge_featurizer=edge_featurizer,
        canonical_atom_order=canonical_atom_order,
        keep_dists=keep_dists,
        dist_field=dist_field,
        explicit_hydrogens=explicit_hydrogens,
        num_virtual_nodes=num_virtual_nodes)
class ToGraph:
    """Base class that graph constructors derive from.

    A subclass is expected to override :meth:`__call__`; the implementation
    here only raises ``NotImplementedError``.
    """

    def __call__(self, data_obj):
        """Convert ``data_obj`` into a graph (must be provided by a subclass).

        Raises
        ------
        NotImplementedError
            Always, since this base class defines no conversion.
        """
        raise NotImplementedError

    def __repr__(self):
        """Return the name of the concrete class followed by ``()``."""
        return '{}()'.format(self.__class__.__name__)
class MolToBigraph(ToGraph):
    """Callable that turns RDKit molecule objects into featurized bi-directed DGLGraphs.

    The constructor arguments are stored on the instance and forwarded to
    :func:`mol_to_bigraph` each time the instance is called.

    Parameters
    ----------
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used
        to update ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used
        to update edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting
        it to true might change the order of atoms in the graph constructed.
        Default to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph.
        If True, it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be
        connected to all real nodes with virtual edges. If the returned graph
        has any node/edge feature, an additional column of binary values will
        be used for each feature to indicate the identity of virtual
        node/edges. The features of the virtual nodes/edges will be zero
        vectors except for the additional column. Default to 0.

    Examples
    --------
    >>> import torch
    >>> from rdkit import Chem
    >>> from dgllife.utils import MolToBigraph

    >>> # A custom node featurizer
    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> # A custom edge featurizer
    >>> def featurize_bonds(mol):
    ...     feats = []
    ...     bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    ...                   Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    ...     for bond in mol.GetBonds():
    ...         btype = bond_types.index(bond.GetBondType())
    ...         # One bond between atom u and v corresponds to two edges (u, v) and (v, u)
    ...         feats.extend([btype, btype])
    ...     return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    >>> mol_to_g = MolToBigraph(node_featurizer=featurize_atoms,
    ...                         edge_featurizer=featurize_bonds)
    >>> mol = Chem.MolFromSmiles('CCO')
    >>> g = mol_to_g(mol)
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['type'])
    tensor([[0.],
            [0.],
            [0.],
            [0.]])
    """

    def __init__(self,
                 add_self_loop=False,
                 node_featurizer=None,
                 edge_featurizer=None,
                 canonical_atom_order=True,
                 explicit_hydrogens=False,
                 num_virtual_nodes=0):
        # Settings are only recorded here; they are consumed in __call__.
        self.add_self_loop = add_self_loop
        self.node_featurizer = node_featurizer
        self.edge_featurizer = edge_featurizer
        self.canonical_atom_order = canonical_atom_order
        self.explicit_hydrogens = explicit_hydrogens
        self.num_virtual_nodes = num_virtual_nodes

    def __call__(self, mol):
        """Construct and featurize the graph of one molecule.

        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            RDKit molecule holder

        Returns
        -------
        DGLGraph or None
            Bi-directed DGLGraph for the molecule if :attr:`mol` is valid and
            None otherwise.
        """
        # Stored settings, in the positional order used by the delegate call.
        stored_options = (self.add_self_loop,
                          self.node_featurizer,
                          self.edge_featurizer,
                          self.canonical_atom_order,
                          self.explicit_hydrogens,
                          self.num_virtual_nodes)
        return mol_to_bigraph(mol, *stored_options)
class SMILESToBigraph(ToGraph):
    """Callable that turns SMILES strings into featurized bi-directed DGLGraphs.

    The constructor arguments are stored on the instance and forwarded to
    :func:`smiles_to_bigraph` each time the instance is called.

    Parameters
    ----------
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used
        to update ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used
        to update edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting
        it to true might change the order of atoms in the graph constructed.
        Default to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph.
        If True, it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be
        connected to all real nodes with virtual edges. If the returned graph
        has any node/edge feature, an additional column of binary values will
        be used for each feature to indicate the identity of virtual
        node/edges. The features of the virtual nodes/edges will be zero
        vectors except for the additional column. Default to 0.

    Examples
    --------
    >>> import torch
    >>> from rdkit import Chem
    >>> from dgllife.utils import SMILESToBigraph

    >>> # A custom node featurizer
    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> # A custom edge featurizer
    >>> def featurize_bonds(mol):
    ...     feats = []
    ...     bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    ...                   Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    ...     for bond in mol.GetBonds():
    ...         btype = bond_types.index(bond.GetBondType())
    ...         # One bond between atom u and v corresponds to two edges (u, v) and (v, u)
    ...         feats.extend([btype, btype])
    ...     return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    >>> smi_to_g = SMILESToBigraph(node_featurizer=featurize_atoms,
    ...                            edge_featurizer=featurize_bonds)
    >>> g = smi_to_g('CCO')
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['type'])
    tensor([[0.],
            [0.],
            [0.],
            [0.]])
    """

    def __init__(self,
                 add_self_loop=False,
                 node_featurizer=None,
                 edge_featurizer=None,
                 canonical_atom_order=True,
                 explicit_hydrogens=False,
                 num_virtual_nodes=0):
        # Settings are only recorded here; they are consumed in __call__.
        self.add_self_loop = add_self_loop
        self.node_featurizer = node_featurizer
        self.edge_featurizer = edge_featurizer
        self.canonical_atom_order = canonical_atom_order
        self.explicit_hydrogens = explicit_hydrogens
        self.num_virtual_nodes = num_virtual_nodes

    def __call__(self, smiles):
        """Construct and featurize the graph of the molecule a SMILES string encodes.

        Parameters
        ----------
        smiles : str
            SMILES string.

        Returns
        -------
        DGLGraph or None
            Bi-directed DGLGraph for the molecule if :attr:`smiles` is valid
            and None otherwise.
        """
        # Stored settings, in the positional order used by the delegate call.
        stored_options = (self.add_self_loop,
                          self.node_featurizer,
                          self.edge_featurizer,
                          self.canonical_atom_order,
                          self.explicit_hydrogens,
                          self.num_virtual_nodes)
        return smiles_to_bigraph(smiles, *stored_options)