Source code for Capricho.chembl.api.webresource

"""Module holding functionalities for the ChEMBL API using the webresource client as the backend."""

from typing import Optional, Tuple, Union

import pandas as pd
from chembl_webresource_client.new_client import new_client

from ...core.pandas_helper import find_dict_in_dataframe
from ...core.rate_limit import rate_limit
from ...logger import logger
from ..exceptions import BioactivitiesNotFoundError
from ..parsing import parse_compound_response

# Info on Chirality:
# The chirality flag shows whether a drug is dosed as a racemic mixture (0), a single stereoisomer (1), or an achiral molecule (2). For unchecked compounds, the chirality flag is -1.
# source: https://chembl.gitbook.io/chembl-interface-documentation/frequently-asked-questions/drug-and-compound-questions#:~:text=Blog%20post.-,Can%20you%20provide%20more%20details%20on%20the%20chirality%20flag%3F,-The%20chirality%20flag


def get_document_table(document_chembl_ids: list) -> pd.DataFrame:
    """From a list of ChEMBL document IDs, get the publication details.

    Args:
        document_chembl_ids: list of ChEMBL document IDs. When None, no ID filter is
            added to the query.

    Returns:
        pd.DataFrame: one row per document, with the columns `document_chembl_id`,
            `doc_type`, `authors`, `doi`, `journal`, `volume`, `year`, `title` and
            `chembl_release`.
    """
    detail_fields = (
        "doc_type",
        "authors",
        "doi",
        "journal",
        "volume",
        "year",
        "title",
        "chembl_release",
    )

    query_kwargs = {}
    if document_chembl_ids is not None:
        # The filterable field is `document_chembl_id` (singular), matching the name
        # requested in `.only()` below. A misspelled lookup does not restrict the query.
        query_kwargs["document_chembl_id__in"] = document_chembl_ids

    documents = new_client.document.filter(**query_kwargs).only("document_chembl_id", *detail_fields)

    publications_details = {}
    for doc_data in documents:
        if not doc_data:
            continue
        document_id = doc_data.get("document_chembl_id")
        if not document_id:  # Skip if no document ID
            continue

        details = {field: doc_data.get(field) for field in detail_fields}
        # Handle nested structure:
        # {'chembl_release': 'CHEMBL_7', 'creation_date': '2010-09-29'}
        release = details["chembl_release"]
        if isinstance(release, dict):
            details["chembl_release"] = release.get("chembl_release")
        publications_details[document_id] = details

    if not publications_details:
        # Keep the column schema stable so callers can still merge on / access the fields.
        return pd.DataFrame(columns=["document_chembl_id", *detail_fields])

    return pd.DataFrame.from_dict(publications_details, orient="index").reset_index(
        names=["document_chembl_id"]
    )
def get_compound_table(molecule_chembl_ids: list) -> pd.DataFrame:
    """Get information on molecules from ChEMBL.

    Args:
        molecule_chembl_ids: a list of molecule ChEMBL IDs.

    Raises:
        ValueError: if the query returns nothing, or only empty entries.

    Returns:
        pd.DataFrame: a DataFrame with the molecule information, with rows ordered as the
            IDs appear in `molecule_chembl_ids`.
    """
    result = new_client.molecule.filter(molecule_chembl_id__in=molecule_chembl_ids).only(
        "molecule_chembl_id",
        "molecule_hierarchy",
        "molecule_structures",
        "chemical_probes",
        "chirality",
        "oral",
        "prodrug",
        "max_phase",
        "therapeutical_flag",
        "withdrawn_flag",
        "indication_class",
    )
    if not result:
        raise ValueError(f"No information found for {molecule_chembl_ids}")

    extracted = {}
    for idx, r in enumerate(result):
        # Check for an empty entry *before* touching it; calling `.get` on None would raise.
        if r is None:
            logger.warning(f"No information found for molecule at response index {idx}")
            continue
        extracted[idx] = parse_compound_response(r, r.get("molecule_chembl_id"))

    if not extracted:
        # Without any parsed row there is no `molecule_chembl_id` column to sort on.
        raise ValueError(f"No information found for {molecule_chembl_ids}")

    # Position of the first occurrence of each requested ID: a single O(n) pass instead
    # of one `list.index` scan per row.
    request_order = {}
    for position, mol_id in enumerate(molecule_chembl_ids):
        request_order.setdefault(mol_id, position)

    return (
        pd.DataFrame.from_dict(extracted, orient="index")
        # IDs that were not requested map to NaN and are therefore sorted last.
        .sort_values(by="molecule_chembl_id", key=lambda col: col.map(request_order))
        .reset_index(drop=True)
    )
@rate_limit(max_per_second=5)
def get_similarity_compound_table(smi: str, similarity: float) -> pd.DataFrame:
    """Fetch compounds similar to a query SMILES through the ChEMBL similarity API.

    Args:
        smi: single SMILES string to find similar molecules to.
        similarity: similarity threshold to use for the search. Value should be between
            40 and 100.

    Raises:
        ValueError: If the similarity is not between 40 and 100.

    Returns:
        pd.DataFrame: a DataFrame with the similar molecules, each row carrying the query
            under `querySmiles`. When nothing is found, a warning is logged and an empty
            DataFrame is returned.
    """
    threshold_ok = 40 <= similarity <= 100
    if not threshold_ok:
        raise ValueError("Similarity must be between 40 and 100")

    requested_fields = (
        "molecule_chembl_id",
        "molecule_hierarchy",
        "molecule_structures",
        "chemical_probes",
        "chirality",
        "oral",
        "prodrug",
        "max_phase",
        "therapeutical_flag",
        "withdrawn_flag",
        "indication_class",
        "similarity",
    )
    hits = new_client.similarity.filter(smiles=smi, similarity=similarity).only(*requested_fields)

    rows = {}
    if not hits:
        logger.warning(f"No similar molecules were found to {smi}")
        return pd.DataFrame.from_dict(rows, orient="index")

    for position, hit in enumerate(hits):
        if hit is None:
            logger.warning(f"No similar molecules were found for reponse {position}")
            continue
        # Keys are the response positions, so skipped entries leave gaps in the index.
        rows[position] = {"querySmiles": smi, **parse_compound_response(hit, smi)}

    return pd.DataFrame.from_dict(rows, orient="index")
def get_assay_table(
    assay_chembl_ids: list,
    confidence_scores: list | None = None,
    **kwargs,
) -> pd.DataFrame:
    """Fetch the ChEMBL assay records for a list of assay ChEMBL IDs.

    Args:
        assay_chembl_ids: a list of assay ChEMBL IDs.
        confidence_scores: confidence scores the assays must have. Defaults to None,
            in which case all scores from 0 to 9 are accepted.
        kwargs: extra keyword arguments forwarded as filters to the assay query.

    Raises:
        ValueError: if the query returns no assays.

    Returns:
        pd.DataFrame: a DataFrame with the assays, where `description` is renamed to
            `assay_description`.
    """
    scores = list(range(0, 10)) if confidence_scores is None else confidence_scores
    query = {
        "assay_chembl_id__in": assay_chembl_ids,
        "confidence_score__in": scores,
        **kwargs,
    }
    requested_fields = (
        "assay_chembl_id",
        "description",
        "relationship_type",
        "assay_type",
        "assay_organism",
        "assay_category",
        "assay_tax_id",
        "assay_strain",
        "assay_tissue",
        "assay_cell_type",
        "assay_subcellular_fraction",
        "bao_format",
        "confidence_score",
        "document_chembl_id",
        "target_chembl_id",
        "variant_sequence",
    )
    assays = new_client.assay.filter(**query).only(*requested_fields)

    if not assays:
        # The IDs are reported separately in the message, so drop them from the parameters.
        query.pop("assay_chembl_id__in")
        raise ValueError(
            f"No assays found for the ids: {assay_chembl_ids} with the parameters: {query}"
        )

    assays_df = pd.DataFrame.from_records(assays)

    if find_dict_in_dataframe(assays_df) is not None:
        logger.warning("Keeping only mutation info from `variant_sequence`.")

        def _mutation_of(variant):
            # Dict entries are reduced to their `mutation` value; anything else is kept as is.
            return variant.get("mutation") if isinstance(variant, dict) else variant

        assays_df = assays_df.assign(mutation=assays_df.variant_sequence.apply(_mutation_of))
        assays_df = assays_df.assign(mutation=assays_df.mutation.replace({None: "WT"}))
        assays_df = assays_df.drop(columns=["variant_sequence"])

    return assays_df.rename(columns={"description": "assay_description"})
def get_activity_table(
    molecule_chembl_ids: Optional[list] = None,
    target_chembl_ids: Optional[list] = None,
    assay_chembl_ids: Optional[list] = None,
    document_chembl_ids: Optional[list] = None,
    **kwargs,
) -> Tuple[pd.DataFrame, dict]:
    """Fetch ChEMBL bioactivities for any combination of molecule, target, assay or document IDs.

    Args:
        molecule_chembl_ids: list of molecule ChEMBL IDs to fetch bioactivities. Defaults to None.
        target_chembl_ids: list of target ChEMBL IDs to fetch bioactivities. Defaults to None.
        assay_chembl_ids: list of assay ChEMBL IDs to fetch bioactivities. Defaults to None.
        document_chembl_ids: list of document ChEMBL IDs to fetch bioactivities. Defaults to None.
        kwargs: example -> `standard_relation=["="], assay_type__in=["B", "F"]`.

    Raises:
        ValueError: if the query returns no bioactivities.
        BioactivitiesNotFoundError: if the query returned something, but the resulting
            DataFrame is empty.

    Returns:
        Tuple[pd.DataFrame, dict]: a DataFrame with the bioactivities and the parameters used to
            fetch them.
    """
    id_filters = {
        "molecule_chembl_id__in": molecule_chembl_ids,
        "target_chembl_id__in": target_chembl_ids,
        "assay_chembl_id__in": assay_chembl_ids,
        "document_chembl_id__in": document_chembl_ids,
    }
    # Only the ID lists that were actually provided become query filters.
    activity_kwargs = {**kwargs}
    activity_kwargs.update({key: ids for key, ids in id_filters.items() if ids is not None})

    logger.debug(f"Fetching bioactivities with the parameters: {activity_kwargs}")
    bioactivities = new_client.activity.filter(**activity_kwargs).only(
        "activity_id",
        "assay_chembl_id",
        "assay_description",
        "assay_type",
        "molecule_chembl_id",
        "standard_flag",
        "standard_relation",
        "standard_type",
        "standard_units",
        "standard_value",
        "pchembl_value",
        "target_chembl_id",
        "target_organism",
        "data_validity_comment",
        "potential_duplicate",
    )

    if not bioactivities:
        # Report every filter used: the query may not involve molecule IDs at all.
        raise ValueError(f"No bioactivities found for the parameters: {activity_kwargs}")

    activity_df = pd.DataFrame.from_records(bioactivities)
    if activity_df.empty:
        raise BioactivitiesNotFoundError(parameters=activity_kwargs)

    return activity_df, activity_kwargs
def get_full_activity_data(
    molecule_chembl_ids: Optional[list] = None,
    target_chembl_ids: Optional[list] = None,
    assay_chembl_ids: Optional[list] = None,
    document_chembl_ids: Optional[list] = None,
    confidence_scores: Union[list, Tuple] = (9, 8),
    assay_types: Union[list, Tuple] = ("B", "F"),
    chembl_release: Optional[int] = None,
    add_document_info: bool = True,
) -> pd.DataFrame:
    """Retrieve ChEMBL bioactivity data from any combination of molecule, target, assay, or
    document IDs.

    Data is retrieved using the ChEMBL webresource client, then merged into a single DataFrame:

    1. Fetch bioactivities for the given IDs, restricted to the `=` standard relation and the
       designated assay types (binding or functional), using `new_client.activity` from the
       `chembl_webresource_client` package.
    2. Extract the unique assay IDs from the bioactivities, fetch those assays with the
       designated confidence scores and merge them into the bioactivities.
    3. Fetch the compound information for the remaining molecules and merge it in.
    4. Optionally fetch the publication details and filter by ChEMBL release.

    Args:
        molecule_chembl_ids: list of ChEMBL molecule IDs to fetch data for. Defaults to None.
        target_chembl_ids: list of ChEMBL target IDs to fetch data for. Defaults to None.
        assay_chembl_ids: list of ChEMBL assay IDs to fetch data for. Defaults to None.
        document_chembl_ids: list of ChEMBL document IDs to fetch data for. Defaults to None.
        confidence_scores: list of confidence scores to filter the fetched assay data.
            Defaults to (9, 8).
        assay_types: list of assay types to be fetched from ChEMBL. Defaults to binding (B) and
            functional (F) data.
        chembl_release: specify latest ChEMBL release to extract data from (e.g., 28). When
            given, the publication details are always fetched, since the release is read from
            them. Defaults to None, which applies no release filter.
        add_document_info: whether to add publication-related fields to the final DataFrame.
            Setting to False requires one less query to be made to ChEMBL (unless
            `chembl_release` is given), but fields like `year` will be lacking.
            Defaults to True.

    Raises:
        BioactivitiesNotFoundError: if the fetched bioactivities reference no assay.

    Returns:
        pd.DataFrame: Merged DataFrame with molecule, bioactivity, and assay information.
    """
    activity_df, parameters = get_activity_table(
        molecule_chembl_ids=molecule_chembl_ids,
        target_chembl_ids=target_chembl_ids,
        assay_chembl_ids=assay_chembl_ids,
        document_chembl_ids=document_chembl_ids,
        standard_relation__in=["="],
        assay_type__in=assay_types,
    )

    unique_assay_ids = activity_df["assay_chembl_id"].unique().tolist()
    if not unique_assay_ids:
        raise BioactivitiesNotFoundError(parameters=parameters)

    logger.debug(f"Fetching assays of type {assay_types} with confidence scores: {list(confidence_scores)}")
    assays_df = get_assay_table(
        unique_assay_ids, confidence_scores=list(confidence_scores), assay_type__in=assay_types
    )

    merged_df = pd.merge(
        activity_df,
        assays_df,
        on=["assay_chembl_id", "target_chembl_id", "assay_type"],
        how="right",  # keep only the bioactivities with respective assays
    ).drop_duplicates()
    logger.debug(f"Columns in the merged DataFrame: {merged_df.columns}")

    # Get the molecule structures & merge to the dataset.
    # A right join can leave assay rows without any activity; their molecule ID is NaN and
    # must not be sent to the compound lookup (the inner merge below drops those rows).
    unique_mol_chembl_ids = merged_df["molecule_chembl_id"].dropna().unique().tolist()
    logger.debug(f"Fetching structural information for {len(unique_mol_chembl_ids)} molecule ChEMBL IDs.")
    mol_data = get_compound_table(unique_mol_chembl_ids)

    # Merge using molecule_chembl_id
    full_df = merged_df.merge(mol_data, on="molecule_chembl_id", how="inner")

    # Reorder columns with molecule_chembl_id and canonical_smiles first
    leading_columns = ["molecule_chembl_id", "canonical_smiles"]
    full_df = full_df[leading_columns + [col for col in full_df.columns if col not in leading_columns]]
    logger.debug(f"Columns in the DataFrame with molecular structures: {full_df.columns}")

    if chembl_release is not None or add_document_info:
        document_ids = full_df["document_chembl_id"].unique().tolist()
        logger.info("Fetching publication details for the documents.")
        publications_df = get_document_table(document_ids)
        full_df = pd.merge(full_df, publications_df, on="document_chembl_id", how="left").assign(
            # "CHEMBL_28" -> 28
            chembl_release=lambda x: x.chembl_release.str.replace("CHEMBL_", "").astype(int)
        )
        # Only filter when a release was requested; comparing against None is an error.
        if chembl_release is not None:
            full_df = full_df[full_df["chembl_release"] <= chembl_release]
        full_df = full_df.reset_index(drop=True)

    return full_df