# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Sider from MoleculeNet for the prediction of drug side-effects
import pandas as pd
from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive
from .csv_dataset import MoleculeCSVDataset
# Names exported by ``from <this module> import *``; only the dataset class is public.
__all__ = ['SIDER']
class SIDER(MoleculeCSVDataset):
    r"""SIDER from MoleculeNet for the prediction of grouped drug side-effects

    Quoting [1], "The Side Effect Resource (SIDER) is a database of marketed drugs and adverse
    drug reactions (ADR). The version of the SIDER dataset in DeepChem has grouped drug
    side-effects into 27 system organ classes following MedDRA classifications measured for 1427
    approved drugs (following previous usage)."

    References:

        * [1] MoleculeNet: A Benchmark for Molecular Machine Learning.
        * [2] DeepChem

    Parameters
    ----------
    smiles_to_graph: callable, str -> DGLGraph
        A function turning a SMILES string into a DGLGraph. If None, it uses
        :func:`dgllife.utils.SMILESToBigraph` by default.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    load : bool
        Whether to load the previously pre-processed dataset or pre-process from scratch.
        ``load`` should be False when we want to try different graph construction and
        featurization methods and need to preprocess from scratch. Default to False.
    log_every : int
        Print a message every time ``log_every`` molecules are processed. Default to 1000.
    cache_file_path : str
        Path to the cached DGLGraphs, default to './sider_dglgraph.bin'.
    n_jobs : int
        The maximum number of concurrently running jobs for graph construction and featurization,
        using joblib backend. Default to 1.

    Examples
    --------

    >>> import torch
    >>> from dgllife.data import SIDER
    >>> from dgllife.utils import SMILESToBigraph, CanonicalAtomFeaturizer

    >>> smiles_to_g = SMILESToBigraph(node_featurizer=CanonicalAtomFeaturizer())
    >>> dataset = SIDER(smiles_to_g)
    >>> # Get size of the dataset
    >>> len(dataset)
    1427
    >>> # Get the 0th datapoint, consisting of SMILES, DGLGraph, labels, and masks
    >>> dataset[0]
    ('C(CNCCNCCNCCN)N',
     Graph(num_nodes=13, num_edges=24,
           ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}
           edata_schemes={}),
     tensor([1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
             0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 0.]),
     tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))

    To address the imbalance between positive and negative samples, we can re-weight
    positive samples for each task based on the training datapoints.

    >>> train_ids = torch.arange(500)
    >>> dataset.task_pos_weights(train_ids)
    tensor([ 1.1368,  0.4793, 49.0000,  0.7123,  0.2626,  0.5015,  0.1211,  5.2500,
             0.4205,  1.0325,  3.1667,  0.1312,  3.9505,  5.9444,  0.3263,  0.7544,
             0.0823,  4.9524,  0.3889,  0.3812,  0.4706,  0.6447, 11.5000,  1.4272,
             0.5060,  0.1136,  0.5106])
    """

    def __init__(self,
                 smiles_to_graph=None,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./sider_dglgraph.bin',
                 n_jobs=1):
        # Location of the archive relative to the DGL dataset host.
        self._url = 'dataset/sider.zip'

        # Both the archive and its extracted folder live in the DGL download directory.
        download_dir = get_download_dir()
        data_path = download_dir + '/sider.zip'
        dir_path = download_dir + '/sider'

        # overwrite=False: an archive already present at ``data_path`` is not replaced.
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/sider.csv')

        # The SMILES strings are read from the 'smiles' column of the CSV.
        # init_mask=True is required here because __getitem__ returns ``self.mask``.
        super(SIDER, self).__init__(df=df,
                                    smiles_to_graph=smiles_to_graph,
                                    node_featurizer=node_featurizer,
                                    edge_featurizer=edge_featurizer,
                                    smiles_column='smiles',
                                    cache_file_path=cache_file_path,
                                    load=load,
                                    log_every=log_every,
                                    init_mask=True,
                                    n_jobs=n_jobs)

    def __getitem__(self, item):
        """Get datapoint with index

        Parameters
        ----------
        item : int
            Datapoint index

        Returns
        -------
        str
            SMILES for the ith datapoint
        DGLGraph
            DGLGraph for the ith datapoint
        Tensor of dtype float32 and shape (T)
            Labels of the ith datapoint for all tasks. T for the number of tasks.
        Tensor of dtype float32 and shape (T)
            Binary masks of the ith datapoint indicating the existence of labels for all tasks.
        """
        return self.smiles[item], self.graphs[item], self.labels[item], self.mask[item]