Source code for Capricho.chembl.api.webresource

"""Module holding functionalities for the ChEMBL API using the webresource client as the backend."""

from typing import Optional, Tuple, Union

import pandas as pd
from chembl_webresource_client.new_client import new_client

from ...core.pandas_helper import find_dict_in_dataframe
from ...core.rate_limit import rate_limit
from ...logger import logger
from ..exceptions import BioactivitiesNotFoundError
from ..parsing import parse_compound_response

# Info on Chirality:
# The chirality flag shows whether a drug is dosed as a racemic mixture (0), single stereoisomer (1) or as an achiral molecule (2), for unchecked compounds the chirality flag = -1.
# source: https://chembl.gitbook.io/chembl-interface-documentation/frequently-asked-questions/drug-and-compound-questions#:~:text=Blog%20post.-,Can%20you%20provide%20more%20details%20on%20the%20chirality%20flag%3F,-The%20chirality%20flag



[docs]
def get_document_table(document_chembl_ids: list) -> pd.DataFrame:
    """From a list of ChEMBL document IDs, get the publication details.
    Args:
        document_chembl_ids: list of ChEMBL document IDs. If None, no ID filter is added
            to the query.
    Returns:
        pd.DataFrame: one row per retrieved document, with the column `document_chembl_id`
            followed by `doc_type`, `authors`, `doi`, `journal`, `volume`, `year`, `title`
            and `chembl_release`. The columns are present even when no document is retrieved.
    """
    detail_fields = (
        "doc_type",
        "authors",
        "doi",
        "journal",
        "volume",
        "year",
        "title",
        "chembl_release",
    )
    query_kwargs = {}
    if document_chembl_ids is not None:
        # The filter must name the resource field `document_chembl_id` (singular), which is
        # the same key `get_activity_table` uses for its document filter.
        query_kwargs["document_chembl_id__in"] = document_chembl_ids

    documents = new_client.document.filter(**query_kwargs).only("document_chembl_id", *detail_fields)

    publications_details = {}
    for doc_data in documents:
        if not doc_data:
            continue

        document_id = doc_data.get("document_chembl_id")
        if not document_id:  # Skip if no document ID
            continue

        details = {field: doc_data.get(field) for field in detail_fields}

        # Handle nested structure:
        # {'chembl_release': 'CHEMBL_7', 'creation_date': '2010-09-29'}
        release = details["chembl_release"]
        if isinstance(release, dict):
            details["chembl_release"] = release.get("chembl_release")

        publications_details[document_id] = details

    # Passing `columns` keeps the schema stable when nothing was retrieved.
    return pd.DataFrame.from_dict(
        publications_details, orient="index", columns=list(detail_fields)
    ).reset_index(names=["document_chembl_id"])




[docs]
def get_compound_table(molecule_chembl_ids: list) -> pd.DataFrame:
    """Get information on a list of molecules from ChEMBL.
    Args:
        molecule_chembl_ids: a list of molecule ChEMBL IDs.
    Raises:
        ValueError: if the query returns nothing, or no usable entry, for the given IDs.
    Returns:
        pd.DataFrame: a DataFrame with the molecule information, with rows sorted by the
            position of their `molecule_chembl_id` in `molecule_chembl_ids`. Rows whose ID
            is not in the input list are placed last.
    """
    compounds_api = new_client.molecule
    result = compounds_api.filter(molecule_chembl_id__in=molecule_chembl_ids).only(
        "molecule_chembl_id",
        "molecule_hierarchy",
        "molecule_structures",
        "chemical_probes",
        "chirality",
        "oral",
        "prodrug",
        "max_phase",
        "therapeutical_flag",
        "withdrawn_flag",
        "indication_class",
    )
    if not result:
        raise ValueError(f"No information found for {molecule_chembl_ids}")

    extracted = {}
    for idx, r in enumerate(result):
        # Check for a missing entry *before* touching it; calling `.get` on None would raise.
        if r is None:
            logger.warning(f"No information found for the molecule at response position {idx}")
            continue
        extracted[idx] = parse_compound_response(r, r.get("molecule_chembl_id"))

    if not extracted:
        raise ValueError(f"No information found for {molecule_chembl_ids}")

    # Position of the first occurrence of each requested ID (same result as `list.index`,
    # but a constant-time lookup per row and no exception for IDs absent from the list).
    request_order = {}
    for position, mol_id in enumerate(molecule_chembl_ids):
        request_order.setdefault(mol_id, position)

    return (
        pd.DataFrame.from_dict(extracted, orient="index")
        .sort_values(by="molecule_chembl_id", key=lambda col: col.map(request_order))
        .reset_index(drop=True)
    )




[docs]
@rate_limit(max_per_second=5)
def get_similarity_compound_table(smi: str, similarity: float) -> pd.DataFrame:
    """Query the ChEMBL similarity API for compounds similar to a SMILES string.

    Args:
        smi: single smiles string to find similar molecules to.
        similarity: similarity threshold to use for the search. Value should be between 40 and 100.

    Raises:
        ValueError: If the similarity is not between 40 and 100.

    Returns:
        pd.DataFrame: a DataFrame with the similar molecules, each row carrying the query in
            the `querySmiles` column. When nothing is found a warning is logged and the
            DataFrame is empty.
    """
    within_bounds = 40 <= similarity <= 100
    if not within_bounds:
        raise ValueError("Similarity must be between 40 and 100")

    requested_fields = (
        "molecule_chembl_id",
        "molecule_hierarchy",
        "molecule_structures",
        "chemical_probes",
        "chirality",
        "oral",
        "prodrug",
        "max_phase",
        "therapeutical_flag",
        "withdrawn_flag",
        "indication_class",
        "similarity",
    )
    hits = new_client.similarity.filter(smiles=smi, similarity=similarity).only(*requested_fields)

    rows = {}
    if not hits:
        logger.warning(f"No similar molecules were found to {smi}")
        return pd.DataFrame.from_dict(rows, orient="index")

    for position, hit in enumerate(hits):
        if hit is None:
            logger.warning(f"No similar molecules were found for reponse {position}")
            continue
        # Row keys are the positions in the response, so skipped entries leave gaps.
        rows[position] = {"querySmiles": smi, **parse_compound_response(hit, smi)}

    return pd.DataFrame.from_dict(rows, orient="index")




[docs]
def get_assay_table(
    assay_chembl_ids: list,
    confidence_scores: Optional[list] = None,
    **kwargs,
) -> pd.DataFrame:
    """Take a list of assay chembl ids and get their respective assays in ChEMBL.
    Args:
        assay_chembl_ids: a list of assay ChEMBL IDs.
        confidence_scores: confidence scores the assays are filtered on. Defaults to None,
            in which case the scores 0 to 9 are used.
        kwargs: keywords arguments to filter the assays.
    Raises:
        ValueError: if no assay is found for the given IDs and filters.
    Returns:
        pd.DataFrame: a DataFrame with the assays, with `description` renamed to
            `assay_description`.
    """
    if confidence_scores is None:
        confidence_scores = list(range(0, 10))
    assay_kwargs = {
        "assay_chembl_id__in": assay_chembl_ids,
        "confidence_score__in": confidence_scores,
        **kwargs,
    }
    assays_api = new_client.assay
    assays = assays_api.filter(**assay_kwargs).only(
        "assay_chembl_id",
        "description",
        "relationship_type",
        "assay_type",
        "assay_organism",
        "assay_category",
        "assay_tax_id",
        "assay_strain",
        "assay_tissue",
        "assay_cell_type",
        "assay_subcellular_fraction",
        "bao_format",
        "confidence_score",
        "document_chembl_id",
        "target_chembl_id",
        "variant_sequence",
    )
    if not assays:
        # The IDs are reported separately in the message, so drop them from the filter dump.
        assay_kwargs.pop("assay_chembl_id__in")
        raise ValueError(
            f"No assays found for the ids: {assay_chembl_ids} with the parameters: {assay_kwargs}"
        )

    assays_df = pd.DataFrame.from_records(assays)
    if find_dict_in_dataframe(assays_df) is not None:
        logger.warning("Keeping only mutation info from `variant_sequence`.")
        assays_df = (
            assays_df.assign(
                # Dict entries are reduced to their "mutation" value; other values pass through.
                mutation=lambda x: x.variant_sequence.apply(
                    lambda y: y.get("mutation") if isinstance(y, dict) else y
                )
            )
            # Entries without mutation information are labelled "WT".
            .assign(mutation=lambda x: x.mutation.replace({None: "WT"}))
            .drop(columns=["variant_sequence"])
        )
    return assays_df.rename(columns={"description": "assay_description"})




[docs]
def get_activity_table(
    molecule_chembl_ids: Optional[list] = None,
    target_chembl_ids: Optional[list] = None,
    assay_chembl_ids: Optional[list] = None,
    document_chembl_ids: Optional[list] = None,
    **kwargs,
) -> Tuple[pd.DataFrame, dict]:
    """Fetch ChEMBL bioactivities for any combination of molecule, target, assay or document IDs.
    Args:
        molecule_chembl_ids: list of molecule ChEMBL IDs to fetch bioactivities. Defaults to None.
        target_chembl_ids: list of target ChEMBL IDs to fetch bioactivities. Defaults to None.
        assay_chembl_ids: list of assay ChEMBL IDs to fetch bioactivities. Defaults to None.
        document_chembl_ids: list of document ChEMBL IDs to fetch bioactivities. Defaults to None.
        kwargs: example -> `standard_relation=["="], assay_type__in=["B", "F"]`.
    Raises:
        ValueError: if the query returns no bioactivities.
        BioactivitiesNotFoundError: if the query result converts to an empty DataFrame.
    Returns:
        Tuple[pd.DataFrame, dict]: a DataFrame with the bioactivities and the parameters used to fetch them.
    """
    id_filters = {
        "molecule_chembl_id__in": molecule_chembl_ids,
        "target_chembl_id__in": target_chembl_ids,
        "assay_chembl_id__in": assay_chembl_ids,
        "document_chembl_id__in": document_chembl_ids,
    }
    # Only the ID lists that were actually provided become query filters.
    activity_kwargs = {**kwargs}
    activity_kwargs.update({key: ids for key, ids in id_filters.items() if ids is not None})

    activity_api = new_client.activity
    logger.debug(f"Fetching bioactivities with the parameters: {activity_kwargs}")
    bioactivities = activity_api.filter(**activity_kwargs).only(
        "activity_id",
        "assay_chembl_id",
        "assay_description",
        "assay_type",
        "molecule_chembl_id",
        "standard_flag",
        "standard_relation",
        "standard_type",
        "standard_units",
        "standard_value",
        "pchembl_value",
        "target_chembl_id",
        "target_organism",
        "data_validity_comment",
        "potential_duplicate",
    )
    if not bioactivities:
        # Report every filter in use: the query may not involve molecule IDs at all.
        raise ValueError(f"No bioactivities found for the parameters: {activity_kwargs}")

    activities_df = pd.DataFrame.from_records(bioactivities)
    if activities_df.empty:
        raise BioactivitiesNotFoundError(parameters=activity_kwargs)
    return activities_df, activity_kwargs




[docs]
def get_full_activity_data(
    molecule_chembl_ids: Optional[list] = None,
    target_chembl_ids: Optional[list] = None,
    assay_chembl_ids: Optional[list] = None,
    document_chembl_ids: Optional[list] = None,
    confidence_scores: Union[list, Tuple] = (9, 8),
    assay_types: Union[list, Tuple] = ("B", "F"),
    chembl_release: Optional[int] = None,
    add_document_info: bool = True,
) -> pd.DataFrame:
    """
    Retrieve ChEMBL bioactivity data from any combination of molecule, target, assay, or document IDs.
    Data is retrieved using the ChEMBL webresource client, merged, and returned as a DataFrame with
    the bioactivity data.

    1. Fetch bioactivities for the given IDs with `get_activity_table`, restricted to the
        `=` standard relation and the designated assay types (binding or functional).
    2. Extract unique assay IDs from the bioactivities DataFrame, fetch those assays with the
        designated confidence scores & merge this information into the final DataFrame.
    3. Fetch the molecule information for the remaining molecule IDs & merge it.
    4. Optionally fetch the publication details & filter by ChEMBL release.

    Args:
        molecule_chembl_ids: list of ChEMBL molecule IDs to fetch data for. Defaults to None.
        target_chembl_ids: list of ChEMBL target IDs to fetch data for. Defaults to None.
        assay_chembl_ids: list of ChEMBL assay IDs to fetch data for. Defaults to None.
        document_chembl_ids: list of ChEMBL document IDs to fetch data for. Defaults to None.
        confidence_scores: list of confidence scores to filter the fetched assay data.
            Defaults to (9, 8).
        assay_types: list of assay types to be fetched from ChEMBL. Defaults to binding (B) and
            functional (F) data.
        chembl_release: specify latest ChEMBL release to extract data from (e.g., 28). When given,
            rows whose release is above this value, or cannot be determined, are dropped.
            Defaults to None (no release filter).
        add_document_info: whether to add publication-related fields to the final DataFrame. Setting
            to False requires one less query to be made to ChEMBL (as long as `chembl_release` is
            None, since the release filter needs the document table), but fields like `year` will
            be lacking. Defaults to True.
    Raises:
        BioactivitiesNotFoundError: if the fetched bioactivities reference no assay IDs.
    Returns:
        pd.DataFrame: Merged DataFrame with molecule, bioactivity, and assay information.
    """
    activity_df, parameters = get_activity_table(
        molecule_chembl_ids=molecule_chembl_ids,
        target_chembl_ids=target_chembl_ids,
        assay_chembl_ids=assay_chembl_ids,
        document_chembl_ids=document_chembl_ids,
        standard_relation__in=["="],
        assay_type__in=assay_types,
    )
    unique_assay_ids = activity_df["assay_chembl_id"].unique().tolist()
    if not unique_assay_ids:
        raise BioactivitiesNotFoundError(parameters=parameters)

    logger.debug(f"Fetching assays of type {assay_types} with confidence scores: {list(confidence_scores)}")
    assays_df = get_assay_table(
        unique_assay_ids, confidence_scores=list(confidence_scores), assay_type__in=assay_types
    )

    merged_df = pd.merge(
        activity_df,
        assays_df,
        on=["assay_chembl_id", "target_chembl_id", "assay_type"],
        how="right",  # keep only the bioactivities with respective assays
    ).drop_duplicates()
    logger.debug(f"Columns in the merged DataFrame: {merged_df.columns}")

    # Get the molecule structures & merge to the dataset
    unique_mol_chembl_ids = merged_df["molecule_chembl_id"].unique().tolist()
    logger.debug(f"Fetching structural information for {len(unique_mol_chembl_ids)} molecule ChEMBL IDs.")
    mol_data = get_compound_table(unique_mol_chembl_ids)

    # Merge using molecule_chembl_id
    full_df = merged_df.merge(mol_data, on="molecule_chembl_id", how="inner")
    leading_columns = ["molecule_chembl_id", "canonical_smiles"]
    full_df = full_df[  # reorder columns with molecule_chembl_id and canonical_smiles first
        leading_columns + [col for col in full_df.columns if col not in leading_columns]
    ]
    logger.debug(f"Columns in the DataFrame with molecular structures: {full_df.columns}")

    # The document table is needed both for the publication fields and for the release filter.
    if chembl_release is not None or add_document_info:
        document_ids = full_df["document_chembl_id"].unique().tolist()
        logger.info("Fetching publication details for the documents.")
        publications_df = get_document_table(document_ids)
        full_df = pd.merge(full_df, publications_df, on="document_chembl_id", how="left")

        # "CHEMBL_7" -> 7. Values that cannot be parsed (e.g. documents without a release)
        # become NaN instead of raising, as a plain integer cast would.
        release_numbers = pd.to_numeric(
            full_df["chembl_release"].astype(str).str.replace("CHEMBL_", "", regex=False),
            errors="coerce",
        )
        full_df = full_df.assign(chembl_release=release_numbers)

        # Only filter when a release was requested; comparing against None is not meaningful.
        if chembl_release is not None:
            full_df = full_df[release_numbers <= chembl_release]
        full_df = full_df.reset_index(drop=True)

    return full_df