Source code for dgllife.data.esol

# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ESOL from MoleculeNet for the prediction of water solubility

import pandas as pd

from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive

from .csv_dataset import MoleculeCSVDataset

__all__ = ['ESOL']

class ESOL(MoleculeCSVDataset):
    r"""ESOL from MoleculeNet for the prediction of water solubility

    Quoting [1], "ESOL is a small dataset consisting of water solubility data for 1128 compounds.
    The dataset has been used to train models that estimate solubility directly from chemical
    structures (as encoded in SMILES strings). Note that these structures don't include 3D
    coordinates, since solubility is a property of a molecule and not of its particular
    conformers."

    References:

        * [1] MoleculeNet: A Benchmark for Molecular Machine Learning.
        * [2] ESOL: estimating aqueous solubility directly from molecular structure.
        * [3] DeepChem

    Parameters
    ----------
    smiles_to_graph : callable, str -> DGLGraph
        A function turning a SMILES string into a DGLGraph. If None, it uses
        :func:`dgllife.utils.SMILESToBigraph` by default.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    load : bool
        Whether to load the previously pre-processed dataset or pre-process from scratch.
        ``load`` should be False when we want to try different graph construction and
        featurization methods and need to preprocess from scratch. Default to False.
    log_every : int
        Print a message every time ``log_every`` molecules are processed. Default to 1000.
    cache_file_path : str
        Path to the cached DGLGraphs, default to './esol_dglgraph.bin'.
    n_jobs : int
        The maximum number of concurrently running jobs for graph construction and featurization,
        using joblib backend. Default to 1.

    Examples
    --------

    >>> from dgllife.data import ESOL
    >>> from dgllife.utils import SMILESToBigraph, CanonicalAtomFeaturizer

    >>> smiles_to_g = SMILESToBigraph(node_featurizer=CanonicalAtomFeaturizer())
    >>> dataset = ESOL(smiles_to_g)
    >>> # Get size of the dataset
    >>> len(dataset)
    1128
    >>> # Get the 0th datapoint, consisting of SMILES, DGLGraph and solubility
    >>> dataset[0]
    ('OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O ',
     DGLGraph(num_nodes=32, num_edges=68,
              ndata_schemes={}
              edata_schemes={}),
     tensor([-0.7700]))

    We also provide information for the name, estimated solubility, minimum atom
    degree, molecular weight, number of h bond donors, number of rings,
    number of rotatable bonds, and polar surface area of the compound

    >>> # Access the information mentioned above for the ith datapoint
    >>> dataset.compound_names[i]
    >>> dataset.estimated_solubility[i]
    >>> dataset.min_degree[i]
    >>> dataset.mol_weight[i]
    >>> dataset.num_h_bond_donors[i]
    >>> dataset.num_rings[i]
    >>> dataset.num_rotatable_bonds[i]
    >>> dataset.polar_surface_area[i]

    We can also get all of this information along with SMILES, DGLGraph and solubility
    at once.

    >>> dataset.load_full = True
    >>> dataset[0]
    ('OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O ',
     DGLGraph(num_nodes=32, num_edges=68,
              ndata_schemes={}
              edata_schemes={}),
     tensor([-0.7700]),
     'Amigdalin',
     -0.974,
     1,
     457.43200000000013,
     7,
     3,
     7,
     202.32)
    """
    def __init__(self,
                 smiles_to_graph=None,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./esol_dglgraph.bin',
                 n_jobs=1):

        # Fetch the archive into the DGL download directory (an existing copy
        # is not overwritten), unpack it and read the processed CSV.
        self._url = 'dataset/ESOL.zip'
        download_dir = get_download_dir()
        data_path = download_dir + '/ESOL.zip'
        dir_path = download_dir + '/ESOL'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/delaney-processed.csv')

        super(ESOL, self).__init__(df=df,
                                   smiles_to_graph=smiles_to_graph,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   smiles_column='smiles',
                                   cache_file_path=cache_file_path,
                                   task_names=['measured log solubility in mols per litre'],
                                   load=load,
                                   log_every=log_every,
                                   init_mask=False,
                                   n_jobs=n_jobs)

        def valid_values(column):
            """Return the entries of ``df[column]`` at the positions in ``self.valid_ids``."""
            # ``self.valid_ids`` is not set in this class; it comes from the parent
            # class (presumably the rows it kept as datapoints -- confirm there).
            values = df[column].tolist()
            return [values[i] for i in self.valid_ids]

        # When True, ``__getitem__`` also returns the extra per-compound fields below.
        self.load_full = False
        # Compound names in PubChem
        self.compound_names = valid_values('Compound ID')
        # Estimated solubility
        self.estimated_solubility = valid_values(
            'ESOL predicted log solubility in mols per litre')
        # Minimum atom degree
        self.min_degree = valid_values('Minimum Degree')
        # Molecular weight
        self.mol_weight = valid_values('Molecular Weight')
        # Number of H-Bond Donors
        self.num_h_bond_donors = valid_values('Number of H-Bond Donors')
        # Number of rings
        self.num_rings = valid_values('Number of Rings')
        # Number of rotatable bonds
        self.num_rotatable_bonds = valid_values('Number of Rotatable Bonds')
        # Polar Surface Area
        self.polar_surface_area = valid_values('Polar Surface Area')

    def __getitem__(self, item):
        """Get datapoint with index

        The result is a tuple of three elements by default, and of eleven
        elements when ``self.load_full`` is True.

        Parameters
        ----------
        item : int
            Datapoint index

        Returns
        -------
        str
            SMILES for the ith datapoint
        DGLGraph
            DGLGraph for the ith datapoint
        Tensor of dtype float32 and shape (1)
            Labels of the ith datapoint
        str, optional
            Name for the ith compound, returned only when ``self.load_full`` is True.
        float, optional
            Estimated solubility for the ith compound,
            returned only when ``self.load_full`` is True.
        int, optional
            Minimum atom degree for the ith datapoint, returned only when
            ``self.load_full`` is True.
        float, optional
            Molecular weight for the ith datapoint, returned only when
            ``self.load_full`` is True.
        int, optional
            Number of h bond donors for the ith datapoint, returned only when
            ``self.load_full`` is True.
        int, optional
            Number of rings in the ith datapoint, returned only when
            ``self.load_full`` is True.
        int, optional
            Number of rotatable bonds in the ith datapoint, returned only when
            ``self.load_full`` is True.
        float, optional
            Polar surface area for the ith datapoint, returned only when
            ``self.load_full`` is True.
        """
        datapoint = (self.smiles[item], self.graphs[item], self.labels[item])
        if not self.load_full:
            return datapoint

        # Append the extra descriptors in the order documented above.
        return datapoint + (
            self.compound_names[item],
            self.estimated_solubility[item],
            self.min_degree[item],
            self.mol_weight[item],
            self.num_h_bond_donors[item],
            self.num_rings[item],
            self.num_rotatable_bonds[item],
            self.polar_surface_area[item],
        )