# Source code for dgllife.utils.mol_to_graph

# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Convert molecules into DGLGraphs
#
# pylint: disable= no-member, arguments-differ, invalid-name

from functools import partial
import torch
import dgl

from sklearn.neighbors import NearestNeighbors

try:
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles, rdmolops
except ImportError:
    pass

__all__ = ['mol_to_graph',
           'smiles_to_bigraph',
           'mol_to_bigraph',
           'smiles_to_complete_graph',
           'mol_to_complete_graph',
           'k_nearest_neighbors',
           'mol_to_nearest_neighbor_graph',
           'smiles_to_nearest_neighbor_graph',
           'ToGraph',
           'MolToBigraph',
           'SMILESToBigraph']

# pylint: disable=I1101
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order, explicit_hydrogens=False, num_virtual_nodes=0):
    """Convert an RDKit molecule object into a DGLGraph and featurize for it.

    This function can be used to construct any arbitrary ``DGLGraph`` from an
    RDKit molecule instance.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Skipped if None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Skipped if None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). If False, it will do nothing.
        Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Converted DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Notes
    -----
    The virtual-node indicator column is concatenated along dimension 1, so every
    node/edge feature is expected to be a 2D tensor when ``num_virtual_nodes > 0``.

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    # Whether to have hydrogen atoms as explicit nodes
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    if canonical_atom_order:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)
    g = graph_constructor(mol)

    # Featurize after any reordering so that features follow the final atom order
    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if num_virtual_nodes > 0:
        num_real_nodes = g.num_nodes()
        real_nodes = list(range(num_real_nodes))
        g.add_nodes(num_virtual_nodes)

        # Change topology: connect every virtual node with every real node in
        # both directions. The new edges are appended after the existing ones.
        virtual_src = []
        virtual_dst = []
        for count in range(num_virtual_nodes):
            virtual_node = num_real_nodes + count
            virtual_node_copy = [virtual_node] * num_real_nodes
            virtual_src.extend(real_nodes)
            virtual_src.extend(virtual_node_copy)
            virtual_dst.extend(virtual_node_copy)
            virtual_dst.extend(real_nodes)
        g.add_edges(virtual_src, virtual_dst)
        num_virtual_edges = len(virtual_src)

        # Append a binary indicator column to each feature. The column is created
        # on the same device as the feature so that the concatenation cannot fail
        # for features that do not live on the default device.
        for nk, nv in list(g.ndata.items()):
            indicator = torch.zeros(g.num_nodes(), 1, device=nv.device)
            nv = torch.cat([nv, indicator], dim=1)
            # Virtual nodes are the last ``num_virtual_nodes`` nodes
            nv[-num_virtual_nodes:, -1] = 1
            g.ndata[nk] = nv

        for ek, ev in list(g.edata.items()):
            indicator = torch.zeros(g.num_edges(), 1, device=ev.device)
            ev = torch.cat([ev, indicator], dim=1)
            # Guard against ``-0:``, which would select every row instead of none
            if num_virtual_edges > 0:
                ev[-num_virtual_edges:, -1] = 1
            g.edata[ek] = ev

    return g

def construct_bigraph_from_mol(mol, add_self_loop=False):
    """Build the topology-only, bi-directed DGLGraph of a molecule.

    Node **i** of the returned graph stands for atom **i** of the molecule,
    i.e. ``mol.GetAtomWithIdx(i)``.

    Bond **i**, i.e. ``mol.GetBondWithIdx(i)``, contributes edges **2i** and
    **2i+1**: first the edge from **u** to **v**, then the edge from **v** to
    **u**, where **u** is ``bond.GetBeginAtomIdx()`` and **v** is
    ``bond.GetEndAtomIdx()``.

    When self loops are requested they are appended after all bond edges, one
    per atom in the order ``0, 1, ..., n-1``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.

    Returns
    -------
    g : DGLGraph
        Bigraph topology of the molecule, without any features
    """
    graph = dgl.graph(([], []), idtype=torch.int32)

    # Nodes go in first so that atoms without bonds are still represented
    graph.add_nodes(mol.GetNumAtoms())

    sources, targets = [], []
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # One bond becomes two directed edges: begin -> end, then end -> begin
        sources += [begin, end]
        targets += [end, begin]

    if add_self_loop:
        loop_nodes = graph.nodes().tolist()
        sources += loop_nodes
        targets += loop_nodes

    graph.add_edges(torch.IntTensor(sources), torch.IntTensor(targets))

    return graph

def mol_to_bigraph(mol, add_self_loop=False,
                   node_featurizer=None,
                   edge_featurizer=None,
                   canonical_atom_order=True,
                   explicit_hydrogens=False,
                   num_virtual_nodes=0):
    """Convert an RDKit molecule object into a bi-directed DGLGraph and featurize for it.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Bi-directed DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Examples
    --------
    >>> from rdkit import Chem
    >>> from dgllife.utils import mol_to_bigraph

    >>> mol = Chem.MolFromSmiles('CCO')
    >>> g = mol_to_bigraph(mol)
    >>> print(g)
    DGLGraph(num_nodes=3, num_edges=4,
             ndata_schemes={}
             edata_schemes={})

    We can also initialize node/edge features when constructing graphs.

    >>> import torch
    >>> from rdkit import Chem
    >>> from dgllife.utils import mol_to_bigraph

    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> def featurize_bonds(mol):
    ...     feats = []
    ...     bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    ...                   Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    ...     for bond in mol.GetBonds():
    ...         btype = bond_types.index(bond.GetBondType())
    ...         # One bond between atom u and v corresponds to two edges (u, v) and (v, u)
    ...         feats.extend([btype, btype])
    ...     return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    >>> mol = Chem.MolFromSmiles('CCO')
    >>> g = mol_to_bigraph(mol, node_featurizer=featurize_atoms,
    ...                    edge_featurizer=featurize_bonds)
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['type'])
    tensor([[0.],
            [0.],
            [0.],
            [0.]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> g = mol_to_bigraph(mol, explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=9, num_edges=16,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    smiles_to_bigraph
    """
    # Bind the self-loop option so that the constructor only takes the molecule,
    # which is the interface mol_to_graph expects.
    graph_constructor = partial(construct_bigraph_from_mol, add_self_loop=add_self_loop)
    return mol_to_graph(mol, graph_constructor,
                        node_featurizer, edge_featurizer,
                        canonical_atom_order, explicit_hydrogens, num_virtual_nodes)

def smiles_to_bigraph(smiles, add_self_loop=False,
                      node_featurizer=None,
                      edge_featurizer=None,
                      canonical_atom_order=True,
                      explicit_hydrogens=False,
                      num_virtual_nodes=0):
    """Convert a SMILES into a bi-directed DGLGraph and featurize for it.

    Parameters
    ----------
    smiles : str
        String of SMILES
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Bi-directed DGLGraph for the molecule if :attr:`smiles` is valid and None otherwise.

    Examples
    --------
    >>> from dgllife.utils import smiles_to_bigraph

    >>> g = smiles_to_bigraph('CCO')
    >>> print(g)
    DGLGraph(num_nodes=3, num_edges=4,
             ndata_schemes={}
             edata_schemes={})

    We can also initialize node/edge features when constructing graphs.

    >>> import torch
    >>> from rdkit import Chem
    >>> from dgllife.utils import smiles_to_bigraph

    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> def featurize_bonds(mol):
    ...     feats = []
    ...     bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    ...                   Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    ...     for bond in mol.GetBonds():
    ...         btype = bond_types.index(bond.GetBondType())
    ...         # One bond between atom u and v corresponds to two edges (u, v) and (v, u)
    ...         feats.extend([btype, btype])
    ...     return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    >>> g = smiles_to_bigraph('CCO', node_featurizer=featurize_atoms,
    ...                       edge_featurizer=featurize_bonds)
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['type'])
    tensor([[0.],
            [0.],
            [0.],
            [0.]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> g = smiles_to_bigraph('CCO', explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=9, num_edges=16,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    mol_to_bigraph
    """
    # The parsed molecule is passed on unchecked; mol_to_bigraph handles a None
    # molecule and returns None in that case.
    mol = Chem.MolFromSmiles(smiles)
    return mol_to_bigraph(mol,
                          add_self_loop=add_self_loop,
                          node_featurizer=node_featurizer,
                          edge_featurizer=edge_featurizer,
                          canonical_atom_order=canonical_atom_order,
                          explicit_hydrogens=explicit_hydrogens,
                          num_virtual_nodes=num_virtual_nodes)

def construct_complete_graph_from_mol(mol, add_self_loop=False):
    """Build the topology-only complete DGLGraph of a molecule.

    Node **i** of the returned graph stands for atom **i** of the molecule,
    i.e. ``mol.GetAtomWithIdx(i)``.

    Edges are enumerated source-major: as (source, destination) pairs they come
    in the order (0, 0), (0, 1), (0, 2), ... (1, 0), (1, 1), (1, 2), ...
    Without self loops the pairs (0, 0), (1, 1), ... are left out.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.

    Returns
    -------
    g : DGLGraph
        Complete graph topology of the molecule, without any features
    """
    atom_ids = range(mol.GetNumAtoms())
    # All ordered atom pairs, keeping (i, i) only when self loops are requested
    pairs = [(u, v) for u in atom_ids for v in atom_ids
             if add_self_loop or u != v]
    sources = [u for u, _ in pairs]
    targets = [v for _, v in pairs]

    return dgl.graph((torch.IntTensor(sources), torch.IntTensor(targets)),
                     idtype=torch.int32)

def mol_to_complete_graph(mol, add_self_loop=False,
                          node_featurizer=None,
                          edge_featurizer=None,
                          canonical_atom_order=True,
                          explicit_hydrogens=False,
                          num_virtual_nodes=0):
    """Convert an RDKit molecule into a complete DGLGraph and featurize for it.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Complete DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Examples
    --------
    >>> from rdkit import Chem
    >>> from dgllife.utils import mol_to_complete_graph

    >>> mol = Chem.MolFromSmiles('CCO')
    >>> g = mol_to_complete_graph(mol)
    >>> print(g)
    DGLGraph(num_nodes=3, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    We can also initialize node/edge features when constructing graphs.

    >>> import torch
    >>> from rdkit import Chem
    >>> from dgllife.utils import mol_to_complete_graph
    >>> from functools import partial

    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> def featurize_edges(mol, add_self_loop=False):
    ...     feats = []
    ...     num_atoms = mol.GetNumAtoms()
    ...     atoms = list(mol.GetAtoms())
    ...     distance_matrix = Chem.GetDistanceMatrix(mol)
    ...     for i in range(num_atoms):
    ...         for j in range(num_atoms):
    ...             if i != j or add_self_loop:
    ...                 feats.append(float(distance_matrix[i, j]))
    ...     return {'dist': torch.tensor(feats).reshape(-1, 1).float()}

    >>> mol = Chem.MolFromSmiles('CCO')
    >>> add_self_loop = True
    >>> g = mol_to_complete_graph(
    ...         mol, add_self_loop=add_self_loop, node_featurizer=featurize_atoms,
    ...         edge_featurizer=partial(featurize_edges, add_self_loop=add_self_loop))
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['dist'])
    tensor([[0.],
            [2.],
            [1.],
            [2.],
            [0.],
            [1.],
            [1.],
            [1.],
            [0.]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> g = mol_to_complete_graph(mol, explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=9, num_edges=72,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    smiles_to_complete_graph
    """
    # Bind the self-loop option so that the constructor only takes the molecule,
    # which is the interface mol_to_graph expects.
    graph_constructor = partial(construct_complete_graph_from_mol,
                                add_self_loop=add_self_loop)
    return mol_to_graph(mol, graph_constructor,
                        node_featurizer, edge_featurizer,
                        canonical_atom_order, explicit_hydrogens, num_virtual_nodes)

def smiles_to_complete_graph(smiles, add_self_loop=False,
                             node_featurizer=None,
                             edge_featurizer=None,
                             canonical_atom_order=True,
                             explicit_hydrogens=False,
                             num_virtual_nodes=0):
    """Convert a SMILES into a complete DGLGraph and featurize for it.

    Parameters
    ----------
    smiles : str
        String of SMILES
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Complete DGLGraph for the molecule if :attr:`smiles` is valid and None otherwise.

    Examples
    --------
    >>> from dgllife.utils import smiles_to_complete_graph

    >>> g = smiles_to_complete_graph('CCO')
    >>> print(g)
    DGLGraph(num_nodes=3, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    We can also initialize node/edge features when constructing graphs.

    >>> import torch
    >>> from rdkit import Chem
    >>> from dgllife.utils import smiles_to_complete_graph
    >>> from functools import partial

    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> def featurize_edges(mol, add_self_loop=False):
    ...     feats = []
    ...     num_atoms = mol.GetNumAtoms()
    ...     atoms = list(mol.GetAtoms())
    ...     distance_matrix = Chem.GetDistanceMatrix(mol)
    ...     for i in range(num_atoms):
    ...         for j in range(num_atoms):
    ...             if i != j or add_self_loop:
    ...                 feats.append(float(distance_matrix[i, j]))
    ...     return {'dist': torch.tensor(feats).reshape(-1, 1).float()}

    >>> add_self_loop = True
    >>> g = smiles_to_complete_graph(
    ...         'CCO', add_self_loop=add_self_loop, node_featurizer=featurize_atoms,
    ...         edge_featurizer=partial(featurize_edges, add_self_loop=add_self_loop))
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['dist'])
    tensor([[0.],
            [2.],
            [1.],
            [2.],
            [0.],
            [1.],
            [1.],
            [1.],
            [0.]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> g = smiles_to_complete_graph('CCO', explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=9, num_edges=72,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    mol_to_complete_graph
    """
    # The parsed molecule is passed on unchecked; mol_to_complete_graph handles a
    # None molecule and returns None in that case.
    mol = Chem.MolFromSmiles(smiles)
    return mol_to_complete_graph(mol,
                                 add_self_loop=add_self_loop,
                                 node_featurizer=node_featurizer,
                                 edge_featurizer=edge_featurizer,
                                 canonical_atom_order=canonical_atom_order,
                                 explicit_hydrogens=explicit_hydrogens,
                                 num_virtual_nodes=num_virtual_nodes)

def k_nearest_neighbors(coordinates, neighbor_cutoff, max_num_neighbors=None,
                        p_distance=2, self_loops=False):
    """Find k nearest neighbors for each atom

    We do not guarantee that the edges are sorted according to the distance
    between atoms.

    Parameters
    ----------
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. When an atom has more candidates than that, the
        closest ones are kept. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    self_loops : bool
        Whether to allow a node to be its own neighbor. Default to False.

    Returns
    -------
    srcs : list of int
        Source nodes.
    dsts : list of int
        Destination nodes, corresponding to ``srcs``.
    distances : list of float
        Distances between the end nodes, corresponding to ``srcs`` and ``dsts``.

    Examples
    --------
    >>> from dgllife.utils import get_mol_3d_coordinates, k_nearest_neighbors
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> srcs, dsts, dists = k_nearest_neighbors(coords, neighbor_cutoff=1.25)
    >>> print(srcs)
    [8, 7, 11, 10, 20, 19]
    >>> print(dsts)
    [7, 8, 10, 11, 19, 20]
    >>> print(dists)
    [1.2084666104583117, 1.2084666104583117, 1.226457824344217,
     1.226457824344217, 1.2230522248065987, 1.2230522248065987]

    See Also
    --------
    get_mol_3d_coordinates
    mol_to_nearest_neighbor_graph
    smiles_to_nearest_neighbor_graph
    """
    num_atoms = coordinates.shape[0]
    srcs, dsts, dists = [], [], []
    if num_atoms == 0:
        # Nothing to search; avoid fitting a model on an empty array
        return srcs, dsts, dists

    model = NearestNeighbors(radius=neighbor_cutoff, p=p_distance)
    model.fit(coordinates)
    dists_, nbrs = model.radius_neighbors(coordinates)
    for i in range(num_atoms):
        # Keep each distance paired with its neighbor so the two can never
        # get out of sync when entries are dropped or reordered.
        candidates = list(zip(dists_[i].tolist(), nbrs[i].tolist()))
        if not self_loops:
            # Drop atom i by index rather than by "distance == 0": another atom
            # at identical coordinates also has distance 0, and the computed
            # self-distance is not guaranteed to be exactly 0.
            candidates = [(dist, nbr) for dist, nbr in candidates if nbr != i]
        if max_num_neighbors is not None and len(candidates) > max_num_neighbors:
            # Sort neighbors based on distance from smallest to largest
            candidates.sort(key=lambda pair: pair[0])
            candidates = candidates[:max_num_neighbors]
        # Edges point from each neighbor to atom i
        dsts.extend([i] * len(candidates))
        srcs.extend(nbr for _, nbr in candidates)
        dists.extend(dist for dist, _ in candidates)

    return srcs, dsts, dists

# pylint: disable=E1102
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False,
                                  num_virtual_nodes=0):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    >>>                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = dgl.graph(([], []), idtype=torch.int32)

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    if num_virtual_nodes > 0:
        num_real_nodes = g.num_nodes()
        real_nodes = list(range(num_real_nodes))
        g.add_nodes(num_virtual_nodes)

        # Change Topology
        virtual_src = []
        virtual_dst = []
        for count in range(num_virtual_nodes):
            virtual_node = num_real_nodes + count
            virtual_node_copy = [virtual_node] * num_real_nodes
            virtual_src.extend(real_nodes)
            virtual_src.extend(virtual_node_copy)
            virtual_dst.extend(virtual_node_copy)
            virtual_dst.extend(real_nodes)
        g.add_edges(virtual_src, virtual_dst)

        for nk, nv in g.ndata.items():
            nv = torch.cat([nv, torch.zeros(g.num_nodes(), 1)], dim=1)
            nv[:-num_virtual_nodes, -1] = 1
            g.ndata[nk] = nv

        for ek, ev in g.edata.items():
            ev = torch.cat([ev, torch.zeros(g.num_edges(), 1)], dim=1)
            ev[:-num_virtual_nodes * num_real_nodes * 2, -1] = 1
            g.edata[ek] = ev

    return g

def smiles_to_nearest_neighbor_graph(smiles,
                                     coordinates,
                                     neighbor_cutoff,
                                     max_num_neighbors=None,
                                     p_distance=2,
                                     add_self_loop=False,
                                     node_featurizer=None,
                                     edge_featurizer=None,
                                     canonical_atom_order=True,
                                     keep_dists=False,
                                     dist_field='dist',
                                     explicit_hydrogens=False,
                                     num_virtual_nodes=0):
    """Convert a SMILES into a nearest neighbor graph and featurize for it.

    The SMILES string is parsed with RDKit and the resulting molecule is handed to
    :func:`mol_to_nearest_neighbor_graph` together with all remaining arguments.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric, since i being the closest neighbor of j does not
    necessarily imply the other way around.

    Parameters
    ----------
    smiles : str
        String of SMILES
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates. N must match the number of
        atoms of the molecule after hydrogens have been added when
        ``explicit_hydrogens=True``; a mismatch fails an assertion in
        :func:`mol_to_nearest_neighbor_graph`.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`smiles` is valid and None otherwise.

    Examples
    --------
    >>> from dgllife.utils import smiles_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> smiles = 'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'
    >>> mol = Chem.MolFromSmiles(smiles)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = smiles_to_nearest_neighbor_graph(smiles, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = smiles_to_nearest_neighbor_graph(smiles, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> smiles = 'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'
    >>> mol = Chem.MolFromSmiles(smiles)
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = smiles_to_nearest_neighbor_graph(smiles, coords, neighbor_cutoff=1.25,
    ...                                      explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    mol_to_nearest_neighbor_graph
    """
    # A None molecule is passed through unchanged; mol_to_nearest_neighbor_graph
    # reports it and returns None, so no separate check is needed here.
    mol = Chem.MolFromSmiles(smiles)
    return mol_to_nearest_neighbor_graph(
        mol, coordinates, neighbor_cutoff, max_num_neighbors, p_distance,
        add_self_loop, node_featurizer, edge_featurizer, canonical_atom_order,
        keep_dists, dist_field, explicit_hydrogens, num_virtual_nodes)

class ToGraph:
    r"""An abstract class for writing graph constructors.

    Subclasses override :meth:`__call__` to turn a data object into a graph.
    """
    def __call__(self, data_obj):
        """Construct a graph for the given data object.

        Parameters
        ----------
        data_obj
            The object to convert. Its expected type is defined by the subclass.

        Raises
        ------
        NotImplementedError
            Always, since the base class does not implement any conversion.
        """
        raise NotImplementedError

    def __repr__(self):
        """Return the name of the concrete class followed by ``()``."""
        return self.__class__.__name__ + '()'

class MolToBigraph(ToGraph):
    """Convert RDKit molecule objects into bi-directed DGLGraphs and featurize for them.

    The constructor only stores the configuration; each call forwards the molecule
    and the stored configuration to :func:`mol_to_bigraph`.

    Parameters
    ----------
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Examples
    --------
    >>> import torch
    >>> from rdkit import Chem
    >>> from dgllife.utils import MolToBigraph

    >>> # A custom node featurizer
    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> # A custom edge featurizer
    >>> def featurize_bonds(mol):
    ...     feats = []
    ...     bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    ...                   Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    ...     for bond in mol.GetBonds():
    ...         btype = bond_types.index(bond.GetBondType())
    ...         # One bond between atom u and v corresponds to two edges (u, v) and (v, u)
    ...         feats.extend([btype, btype])
    ...     return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    >>> mol_to_g = MolToBigraph(node_featurizer=featurize_atoms, edge_featurizer=featurize_bonds)
    >>> mol = Chem.MolFromSmiles('CCO')
    >>> g = mol_to_g(mol)
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['type'])
    tensor([[0.],
            [0.],
            [0.],
            [0.]])
    """
    def __init__(self,
                 add_self_loop=False,
                 node_featurizer=None,
                 edge_featurizer=None,
                 canonical_atom_order=True,
                 explicit_hydrogens=False,
                 num_virtual_nodes=0):
        # Store the configuration as-is; it is only read back in __call__.
        self.add_self_loop = add_self_loop
        self.node_featurizer = node_featurizer
        self.edge_featurizer = edge_featurizer
        self.canonical_atom_order = canonical_atom_order
        self.explicit_hydrogens = explicit_hydrogens
        self.num_virtual_nodes = num_virtual_nodes

    def __call__(self, mol):
        """Construct graph for the molecule and featurize it.

        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            RDKit molecule holder

        Returns
        -------
        DGLGraph or None
            Bi-directed DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.
        """
        # Arguments are passed positionally, in the order the options were stored.
        return mol_to_bigraph(mol,
                              self.add_self_loop,
                              self.node_featurizer,
                              self.edge_featurizer,
                              self.canonical_atom_order,
                              self.explicit_hydrogens,
                              self.num_virtual_nodes)

class SMILESToBigraph(ToGraph):
    """Convert SMILES strings into bi-directed DGLGraphs and featurize for them.

    The constructor only stores the configuration; each call forwards the SMILES
    string and the stored configuration to :func:`smiles_to_bigraph`.

    Parameters
    ----------
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Examples
    --------
    >>> import torch
    >>> from rdkit import Chem
    >>> from dgllife.utils import SMILESToBigraph

    >>> # A custom node featurizer
    >>> def featurize_atoms(mol):
    ...     feats = []
    ...     for atom in mol.GetAtoms():
    ...         feats.append(atom.GetAtomicNum())
    ...     return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    >>> # A custom edge featurizer
    >>> def featurize_bonds(mol):
    ...     feats = []
    ...     bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    ...                   Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    ...     for bond in mol.GetBonds():
    ...         btype = bond_types.index(bond.GetBondType())
    ...         # One bond between atom u and v corresponds to two edges (u, v) and (v, u)
    ...         feats.extend([btype, btype])
    ...     return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    >>> smi_to_g = SMILESToBigraph(node_featurizer=featurize_atoms,
    ...                            edge_featurizer=featurize_bonds)
    >>> g = smi_to_g('CCO')
    >>> print(g.ndata['atomic'])
    tensor([[6.],
            [8.],
            [6.]])
    >>> print(g.edata['type'])
    tensor([[0.],
            [0.],
            [0.],
            [0.]])
    """
    def __init__(self,
                 add_self_loop=False,
                 node_featurizer=None,
                 edge_featurizer=None,
                 canonical_atom_order=True,
                 explicit_hydrogens=False,
                 num_virtual_nodes=0):
        # Store the configuration as-is; it is only read back in __call__.
        self.add_self_loop = add_self_loop
        self.node_featurizer = node_featurizer
        self.edge_featurizer = edge_featurizer
        self.canonical_atom_order = canonical_atom_order
        self.explicit_hydrogens = explicit_hydrogens
        self.num_virtual_nodes = num_virtual_nodes

    def __call__(self, smiles):
        """Construct graph for the molecule and featurize it.

        Parameters
        ----------
        smiles : str
            SMILES string.

        Returns
        -------
        DGLGraph or None
            Bi-directed DGLGraph for the molecule if :attr:`smiles` is valid and None otherwise.
        """
        # Arguments are passed positionally, in the order the options were stored.
        return smiles_to_bigraph(smiles,
                                 self.add_self_loop,
                                 self.node_featurizer,
                                 self.edge_featurizer,
                                 self.canonical_atom_order,
                                 self.explicit_hydrogens,
                                 self.num_virtual_nodes)