diff --git a/README.md b/README.md index ff08d6b..788a956 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,99 @@ -# rdkit +# Molecular Fingerprinting +*author: shiyu* + +
+ +## Description + +Molecular Fingerprinting encodes a Simplified Molecular Input Line Entry Specification (SMILES) as a fingerprint. Fingerprints can represent elements, atom pairs, functional groups, etc., and are often used for substructure searches and similarity searches in drug discovery. + +This operator uses [RDKit](https://www.rdkit.org/docs/index.html) to generate the molecular fingerprint. + + + +
+ + + +## Code Example + +> Before running the following code, you need to install RDKit; refer to https://www.rdkit.org/docs/Install.html. +> +> ``` +> # install rdkit with conda +> $ conda install -c conda-forge rdkit +> ``` + +An example that uses the Morgan algorithm to generate a fingerprint of the SMILES string 'Cc1ccc(cc1)S(=O)(=O)N'. + + *Write the pipeline in simplified style:* + +```python +import towhee + +towhee.dc(['Cc1ccc(cc1)S(=O)(=O)N']) \ + .molecular_fingerprinting.rdkit() \ + .show() +``` + + +*Write the same pipeline with explicit input/output name specifications:* + +```python +import towhee + +towhee.dc['smiles'](['Cc1ccc(cc1)S(=O)(=O)N']) \ + .molecular_fingerprinting.rdkit['smiles', 'fingerprint']() \ + .show() +``` + + + + +
+ + + +## Factory Constructor + +Create the operator via the following factory method: + +***molecular_fingerprinting.rdkit(algorithm: str = 'morgan', size: int = 2048)*** + + + +**Parameters:** + +***algorithm:*** *str* + +Which algorithm to use for fingerprinting: one of 'morgan', 'daylight', 'ap', or 'maccs'. Defaults to 'morgan'. See the [list of available fingerprints](https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-fingerprints). + + + +***size:*** *int* + +The bit vector size, used only by the 'morgan' and 'daylight' algorithms. Defaults to 2048. + + + +
+ + + +## Interface + +A molecular fingerprinting operator takes a SMILES string as input. +It uses the RDKit fingerprinting algorithm selected by name to generate a fingerprint for that SMILES. + +**Parameters:** + +***smiles:*** *str* + +A Simplified Molecular Input Line Entry Specification (SMILES). + + + +**Returns:** *bytes* + +The molecular fingerprint. diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..ce44dff --- /dev/null +++ b/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2021 Zilliz. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .rdkit import Rdkit + + +def rdkit(**kwargs): + return Rdkit(**kwargs) diff --git a/rdkit.py b/rdkit.py new file mode 100644 index 0000000..0d90cde --- /dev/null +++ b/rdkit.py @@ -0,0 +1,53 @@ +# Copyright 2021 Zilliz. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from towhee import register
from towhee.operator import Operator

from rdkit import DataStructs, Chem


@register(output_schema=['fingerprint'])
class Rdkit(Operator):
    """
    Generate molecular fingerprint with RDKit.

    Args:
        algorithm (`str`):
            Which algorithm to use for fingerprinting: one of 'morgan',
            'daylight', 'ap' or 'maccs'. Defaults to 'morgan'.
        size (`int`):
            The bit vector size, defaults to 2048. Only passed to the
            'morgan' and 'daylight' algorithms; 'ap' and 'maccs' ignore it.
    """

    def __init__(self, algorithm: str = 'morgan', size: int = 2048):
        super().__init__()
        self.algorithm = algorithm
        self.size = size

    def __call__(self, smiles: str) -> bytes:
        """
        Compute the fingerprint of one molecule.

        Args:
            smiles (`str`):
                The molecule written as a SMILES string.

        Returns:
            `bytes`: the fingerprint bit vector, obtained by decoding the
            hex text produced by `DataStructs.BitVectToFPSText`.

        Raises:
            ValueError: if `smiles` cannot be parsed by RDKit, or if
                `self.algorithm` is not one of the supported names.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with a clear message instead of deep inside RDKit.
            raise ValueError(f'Invalid SMILES: {smiles!r}')

        # RDKit submodules are imported lazily so that only the selected
        # algorithm's module is loaded.
        if self.algorithm == 'daylight':
            fp = Chem.RDKFingerprint(mol, fpSize=self.size)
        elif self.algorithm == 'morgan':
            from rdkit.Chem import AllChem
            # Radius 2, folded to `size` bits.
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, self.size)
        elif self.algorithm == 'ap':
            from rdkit.Chem.AtomPairs import Pairs
            # NOTE(review): confirm that the bit-vector type returned here is
            # accepted by DataStructs.BitVectToFPSText below.
            fp = Pairs.GetAtomPairFingerprintAsBitVect(mol)
        elif self.algorithm == 'maccs':
            from rdkit.Chem import MACCSkeys
            fp = MACCSkeys.GenMACCSKeys(mol)
        else:
            # Previously an unknown name fell through and crashed with an
            # UnboundLocalError on `fp`.
            raise ValueError(
                f'Unsupported algorithm: {self.algorithm!r}; expected one of '
                "'morgan', 'daylight', 'ap', 'maccs'."
            )

        hex_fp = DataStructs.BitVectToFPSText(fp)
        return bytes.fromhex(hex_fp)


# Remaining entries of the original patch (no Python content):
# diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29
# diff --git a/result1.png b/result1.png new file mode 100644 index 0000000..fa598ba Binary files /dev/null and b/result1.png differ
# diff --git a/result2.png b/result2.png new file mode 100644 index 0000000..6d1f424 Binary files /dev/null and b/result2.png differ