"""Module holding functionalities for the ChEMBL API."""
from typing import List, Literal, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd
from ..core.pandas_helper import add_comment
from ..logger import logger
from .api.downloader import get_assay_size_sql, get_full_activity_data_sql
from .data_flag_functions import (
flag_calculated_pchembl,
flag_incompatible_units,
flag_potential_duplicate,
flag_with_data_validity_comment,
)
from .exceptions import BioactivitiesNotFoundError
[docs]
def convert_to_log10(df: pd.DataFrame) -> pd.DataFrame:
"""Function to be applied to the whole DataFrame. Will convert the standard_value
column to pchembl_value column, if the standard_units are in nM, µM or uM.
Activities with incompatible units are flagged and preserved with pchembl_value=NaN
for transparency.
Args:
df: a bioactivity DataFrame. e.g.: output from `get_activity_table`.
Returns:
pd.DataFrame: the DataFrame with the pchembl_value column added. Activities with
incompatible units are flagged and have pchembl_value=NaN.
"""
def compute_log(row):
unit = row["standard_units"]
if pd.isna(unit):
return np.nan
value = row["standard_value"]
if value == 0: # avoid division by zero
return np.nan
if unit == "mM":
value_in_M = value * 1e-3
elif unit == "µM" or unit == "uM":
value_in_M = value * 1e-6
elif unit == "nM":
value_in_M = value * 1e-9
else:
return np.nan # Unit not compatible with pChEMBL calculation
return -np.log10(value_in_M)
desired_units = ["nM", "µM", "uM", "mM"] # noqa: F841
df = df.copy().pipe(flag_incompatible_units) # flag incompatible units w/ pchembl calculation e.g.; %
# Filter to convertible units for calculation, but preserve incompatible ones after
convertible_mask = df["standard_units"].isin(desired_units) | df["standard_units"].isna()
convertible_df = df[convertible_mask].copy()
incompatible_df = df[~convertible_mask].copy()
if "pchembl_value" not in convertible_df.columns:
raise ValueError("pchembl_value column not found in input DataFrame.")
elif (~convertible_df.pchembl_value.isna()).any():
raise ValueError(
"input DataFrame should only have pchembl_value.isna() values. If not, use "
"chembl.processing.process_bioactivities instead."
)
if convertible_df.shape[0] > 0: # Calculate pChEMBL for convertible units
convertible_df = convertible_df.pipe(flag_calculated_pchembl)
convertible_df = convertible_df.assign(pchembl_value=lambda x: x.apply(compute_log, axis=1))
with pd.option_context("future.no_silent_downcasting", True):
pchembl_inf_or_nan = convertible_df.replace([np.inf, -np.inf], np.nan).query(
"pchembl_value.isna()"
)
if not pchembl_inf_or_nan.empty:
debug_cols = [
"target_chembl_id",
"assay_chembl_id",
"assay_type",
"molecule_chembl_id",
"standard_units",
"standard_value",
]
_info = pchembl_inf_or_nan.loc[pchembl_inf_or_nan.index[:6], debug_cols]
comment = "Infinite or NaN pchembl_value after calculation"
logger.info(f"Flagging {len(pchembl_inf_or_nan)} rows: {comment}:\n{_info}")
convertible_df = add_comment(
df=convertible_df,
comment=comment,
criteria_func=lambda x: x.index.isin(pchembl_inf_or_nan.index),
target_column="pchembl_value", # Dummy target, criteria_func handles selection
comment_type="d",
)
if incompatible_df.shape[0] > 0: # Recombine convertible and incompatible dataframes
# Ensure incompatible_df has pchembl_value column (will be NaN)
if "pchembl_value" not in incompatible_df.columns:
incompatible_df["pchembl_value"] = np.nan
result_df = pd.concat([convertible_df, incompatible_df], ignore_index=True)
else:
result_df = convertible_df
return result_df
[docs]
def curate_activity_pairs(
df: pd.DataFrame,
mol_id_col: str = "molecule_chembl_id",
assay_id_col: str = "assay_chembl_id",
activity_value_col: str = "pchembl_value",
) -> pd.DataFrame:
"""Curate activity pairs for the same molecule across different assays. Removes or flags pairs
of measurements if their activity values (e.g., pChEMBL values) differ by approximately 3.0 or 6.0
Filter inspired on Landrum & Riniker, 2024, where the authors state:
> Given the very low probability of two separate experiments producing exactly the same
results, the exact matches are most likely cases where values from a previous paper are
copied into a new one; this was discussed in the earlier work by Kramer et al. (10) and
spot-checked with a number of assay pairs here.
Args:
df: DataFrame with bioactivity data.
mol_id_col: column name for molecule IDs. Defaults to "molecule_chembl_id" for ChEMBL data.
assay_id_col: column name for assay IDs. Defaults to "assay_chembl_id" for ChEMBL data.
activity_value_col: column name for activity values. Defaults to "pchembl_value" for ChEMBL data.
Returns:
pd.DataFrame: The curated DataFrame.
"""
df = df.copy()
if not {mol_id_col, assay_id_col, activity_value_col}.issubset(df.columns):
logger.warning(
"Skipping activity pair curation: Required columns "
f"({mol_id_col}, {assay_id_col}, {activity_value_col}) not found."
)
return df
if df.empty:
logger.info("Input DataFrame is empty. Skipping activity pair curation.")
return df
# Prepare DataFrame for merge by adding original index as a new column
df_for_merge = df.copy()
# Create a unique name for the temporary column holding original index values
temp_orig_idx_col = "__original_index__"
_i = 0
while temp_orig_idx_col in df_for_merge.columns:
temp_orig_idx_col = f"__original_index__{_i}"
_i += 1
df_for_merge[temp_orig_idx_col] = df.index # Use original df.index
# Self-merge on molecule ID
merged_df = pd.merge(df_for_merge, df_for_merge, on=mol_id_col, suffixes=("_L", "_R"))
# Define column names for easier access
orig_idx_col_L = temp_orig_idx_col + "_L"
orig_idx_col_R = temp_orig_idx_col + "_R"
assay_col_L = assay_id_col + "_L"
assay_col_R = assay_id_col + "_R"
activity_col_L = activity_value_col + "_L"
activity_col_R = activity_value_col + "_R"
# Filter for pairs:
# 1. From different assays
condition_diff_assays = merged_df[assay_col_L] != merged_df[assay_col_R]
# 2. Unique pairs of original rows (avoid self-comparison and duplicate (rowA,rowB)/(rowB,rowA) pairs)
condition_unique_rows = merged_df[orig_idx_col_L] < merged_df[orig_idx_col_R]
valid_pairs = merged_df[condition_diff_assays & condition_unique_rows].copy()
if valid_pairs.empty:
logger.info(
"No potential activity pairs found after initial structural filtering (diff assays, unique rows)."
)
return df
# Handle NaNs in activity values for the pairs
valid_pairs.dropna(subset=[activity_col_L, activity_col_R], inplace=True)
if valid_pairs.empty:
logger.info("No valid pairs with non-NaN activity values found for curation.")
return df
# Calculate absolute difference in activity values
valid_pairs["abs_diff"] = np.abs(valid_pairs[activity_col_L] - valid_pairs[activity_col_R])
# Check if the absolute difference is close to 3.0 and 6.0
# we use np.isclose to handle floating point precision issues (e.g.: 3.000000001)
error_in_exact_3 = np.isclose(valid_pairs["abs_diff"], 3.0, rtol=1e-9, atol=1e-9)
error_in_exact_6 = np.isclose(valid_pairs["abs_diff"], 6.0, rtol=1e-9, atol=1e-9)
problematic_pairs = valid_pairs[error_in_exact_3 | error_in_exact_6]
rows_to_flag_indices = set()
if not problematic_pairs.empty:
indices_L = problematic_pairs[orig_idx_col_L]
indices_R = problematic_pairs[orig_idx_col_R]
rows_to_flag_indices.update(indices_L.unique())
rows_to_flag_indices.update(indices_R.unique())
for _, row_pair in problematic_pairs.iterrows(): # iterrows on small problematic_pairs df for logging
logger.debug(
f"Marking/flagging rows for molecule {row_pair[mol_id_col]} (indices: {row_pair[orig_idx_col_L]}, {row_pair[orig_idx_col_R]}), "
f"assays {row_pair[assay_col_L]} (value: {row_pair[activity_col_L]}) and "
f"{row_pair[assay_col_R]} (value: {row_pair[activity_col_R]}) "
f"due to activity value difference of 3.0 or 6.0."
)
if rows_to_flag_indices:
comment = "Unit Annotation Error"
to_rm_idxs = list(rows_to_flag_indices)
logger.info(f"Activity Curation: Flagging {len(to_rm_idxs)} measurements due to {comment.lower()}")
df = add_comment(
df=df,
comment=comment,
criteria_func=lambda x: x.index.isin(to_rm_idxs),
target_column=activity_value_col,
comment_type="d",
)
else:
logger.info("No activity pairs found meeting the curation criteria (activity diff ~3.0 | ~6.0).")
return df
[docs]
def process_bioactivities(
bioactivities_df: pd.DataFrame,
calculate_pchembl: bool = True,
curate_annotation_errors: bool = True,
require_document_date: bool = False,
value_col: str = "pchembl_value",
) -> pd.DataFrame:
"""Processes the bioactivities DataFrame. Will convert the standard_value
column to pchembl_value column if the standard_units are in mM, µM, uM, or nM. If the
standard_units are in log, the original value in the pchembl_value is preserved.
Args:
bioactivities_df: bioactivity dataframe, e.g.: output from `get_activity_table`.
calculate_pchembl: Whether to calculate pChEMBL values. When aggregating on
standard_value (value_col != "pchembl_value"), this enables calculation of
pchembl_value for compatible units (e.g., nM, µM, mM). Though those are available
for high quality data, censored data does not have a pChEMBL value readily available
and they'll need to be calculated.
curate_annotation_errors: Whether to apply activity curation based on pChEMBL values
diverging in exactly 3.0 (indicate possible annotation errors). Defaults to True.
require_document_date: Whether to filter out activities without a document year.
value_col: Column name for aggregation values. Defaults to "pchembl_value".
When set to "standard_value", pchembl_value filtering is skipped.
Returns:
pd.DataFrame: the processed bioactivities DataFrame.
"""
bioactivities_df = bioactivities_df.astype({"standard_value": "float32", "pchembl_value": "float32"})
with pd.option_context("future.no_silent_downcasting", True):
bioactivities_df = bioactivities_df.replace({None: np.nan}).infer_objects(copy=False)
bioactivities_df = (
bioactivities_df.pipe(flag_with_data_validity_comment)
# .query("data_validity_comment.isna()")
.pipe(flag_potential_duplicate)
# .query("potential_duplicate == 0")
.drop(columns=["data_validity_comment", "potential_duplicate"])
)
# When aggregating on pchembl_value (default), opportunistically calculate missing values.
# When aggregating on standard_value, skip this filtering to avoid unnecessary work.
if value_col == "pchembl_value":
# Separate activities with and without pChEMBL values
with_pchembl = bioactivities_df.query("~pchembl_value.isna()").copy()
without_pchembl = bioactivities_df.query("pchembl_value.isna()").copy()
if calculate_pchembl and not without_pchembl.empty:
# Calculate pChEMBL for activities without it (preserves incompatible units)
without_pchembl = convert_to_log10(without_pchembl)
bioactivities_df = pd.concat([with_pchembl, without_pchembl], ignore_index=True)
elif not calculate_pchembl and not without_pchembl.empty:
# Keep activities without pChEMBL (incompatible units already flagged)
bioactivities_df = pd.concat([with_pchembl, without_pchembl], ignore_index=True)
else:
# Either calculate_pchembl=True and no missing values, or calculate_pchembl=False and all have values
bioactivities_df = with_pchembl if without_pchembl.empty else bioactivities_df
else:
# When aggregating on non-pchembl values (e.g., standard_value), still opportunistically
# calculate pchembl_value for compatible units if requested, but don't filter based on it
if calculate_pchembl:
without_pchembl = bioactivities_df.query("pchembl_value.isna()").copy()
if not without_pchembl.empty:
# Calculate pChEMBL for activities without it (preserves incompatible units)
without_pchembl = convert_to_log10(without_pchembl)
with_pchembl = bioactivities_df.query("~pchembl_value.isna()").copy()
bioactivities_df = pd.concat([with_pchembl, without_pchembl], ignore_index=True)
if curate_annotation_errors:
bioactivities_df = curate_activity_pairs(bioactivities_df)
if require_document_date:
if "year" not in bioactivities_df.columns:
logger.warning(
"Document date curation enabled, but 'year' column not found. Skipping this curation."
)
else:
original_count = len(bioactivities_df)
bioactivities_df = bioactivities_df[bioactivities_df["year"].notna()]
removed_count = original_count - len(bioactivities_df)
if removed_count > 0:
logger.info(
f"Document Date Curation: Removed {removed_count} measurements lacking a document year."
)
return bioactivities_df.reset_index(drop=True)
[docs]
def get_bioactivities_workflow(
molecule_chembl_ids: Optional[Union[list, str]] = None,
target_chembl_ids: Optional[Union[list, str]] = None,
assay_chembl_ids: Optional[Union[list, str]] = None,
document_chembl_ids: Optional[Union[list, str]] = None,
standard_relation: Optional[List[str]] = None,
standard_type: Optional[List[str]] = None,
standard_units: Optional[List[str]] = None,
confidence_scores: Union[list, Tuple] = (9, 8),
assay_types: Union[list, Tuple] = ("B", "F"),
chembl_release: Optional[int] = None,
additional_fields: Optional[List[str]] = None,
prefix: Optional[Sequence[str]] = None,
version: Optional[Union[int, str]] = None,
calculate_pchembl: bool = False,
curate_annotation_errors: bool = True,
require_document_date: bool = False,
backend: Literal["downloader", "webresource"] = "downloader",
value_col: str = "pchembl_value",
):
"""Perform the first step of the bioactivity data retrieval workflow. These are:
1. Get ChEMBL data using any of the input identifiers: molecule_chembl_ids, target_chembl_ids,
assay_chembl_ids, or document_chembl_ids. Additional filters are supported by other input
parameters.
2. Once the data is retrieved, the bioactivities are processed according to the `calculate_pchembl`
parameter. ChEMBL calculates pChEMBL values for activity data with the following criteria:
- "standard_type" in "[IC50", "XC50", "EC50", "AC50", "Ki", "Kd", "Potency", "ED50"];
- "standard_relation" == "=" & "standard_units" == "nM";
- "standard_value" > 0 & "data_validity_comment"].isnull()) | "data_validity_comment" == "Manually validated"
By passing this parameter to True, pchembl values will be calculated for bioactivities reported in
nM, µM or uM `standard_unit`, or -Log|Log `standard_type`.
3. The first quality filter is applied. Data containing "data_validity_description" or
"potential_duplicate" flags are immediatelly removed from the DataFrame.
Args:
molecule_chembl_ids: list of ChEMBL molecule IDs to fetch data for. Defaults to None.
target_chembl_ids: list of ChEMBL target IDs to fetch data for. Defaults to None.
assay_chembl_ids: list of ChEMBL assay IDs to fetch data for. Defaults to None.
document_chembl_ids: list of ChEMBL document IDs to fetch data for. Defaults to None.
standard_relation: Optional filter for standard relation types (e.g., ["=", "<", ">"])
standard_type: Optional filter for activity types (e.g., ["IC50", "Ki", "EC50"])
confidence_scores: list of confidence scores to filter the fetched assay data.
Defaults to (9, 8).
assay_types: list of assay types to be fetched from ChEMBL. Defaults to binding (B) and
functional (F) data.
chembl_release: Not to confuse for `version`. This is the ChEMBL release number used to
filter the data. Defaults to None.
additional_fields: `backend=="downloader"` only! "Optional list of additional fields to
include in the sql query. E.g.: ["vs.sequence"], to retrieve the sequence of the
variant, if available. Defaults to None.
prefix: `backend=="downloader"` only! prefix to be used by pystow for storing the data
on a custom directory. Defaults to None.
version: `backend=="downloader"` only! version of the ChEMBL database to be downloaded by
chembl_downloader. If left as None, the latest version will be downloaded. Defaults to None.
curate_annotation_errors: Whether to apply activity curation based on pChEMBL values diverging
in exactly 3.0 (indicate possible annotation errors). Defaults to True.
calculate_pchembl: calculate pChEMBL values for bioactivities reported in nM, µM or uM `standard_unit`
or -Log|Log `standard_type`. Defaults to False
backend: the backend to be used for fetching the data. If downloader, the ChEMBL sql database
is downloaded and extracted first. Defaults to "downloader".
value_col: Column name for values to be used during aggregation. Defaults to "pchembl_value".
When set to "standard_value", pchembl filtering is skipped and pchembl values are
only calculated opportunistically for compatible units. Defaults to "pchembl_value".
Raises:
BioactivitiesNotFoundError: If the retrieved bioactivity dataframe is empty.
"""
if backend == "downloader":
bioactivities_df = get_full_activity_data_sql(
molecule_chembl_ids=molecule_chembl_ids,
target_chembl_ids=target_chembl_ids,
assay_chembl_ids=assay_chembl_ids,
document_chembl_ids=document_chembl_ids,
standard_relation=standard_relation,
standard_type=standard_type,
standard_units=standard_units,
confidence_scores=confidence_scores,
assay_types=assay_types,
chembl_release=chembl_release,
additional_fields=additional_fields,
prefix=prefix,
version=version,
)
assay_sizes = get_assay_size_sql(
assay_chembl_ids=bioactivities_df.assay_chembl_id.unique().tolist(),
prefix=prefix,
version=version,
)
bioactivities_df = bioactivities_df.merge(assay_sizes, on="assay_chembl_id")
elif backend == "webresource":
from .api.webresource import get_full_activity_data
bioactivities_df = get_full_activity_data(
molecule_chembl_ids=molecule_chembl_ids,
target_chembl_ids=target_chembl_ids,
assay_chembl_ids=assay_chembl_ids,
document_chembl_ids=document_chembl_ids,
confidence_scores=confidence_scores,
assay_types=assay_types,
chembl_release=chembl_release,
).sort_values(by=["molecule_chembl_id", "activity_id", "standard_value"])
if bioactivities_df.empty:
raise BioactivitiesNotFoundError(
"No bioactivities found for the given query: "
f"molecule_chembl_ids={molecule_chembl_ids}, "
f"target_chembl_ids={target_chembl_ids}, "
f"assay_chembl_ids={assay_chembl_ids}, "
f"document_chembl_ids={document_chembl_ids}, "
)
bioactivities_df = process_bioactivities(
bioactivities_df,
calculate_pchembl=calculate_pchembl,
curate_annotation_errors=curate_annotation_errors,
require_document_date=require_document_date,
value_col=value_col,
)
if bioactivities_df.empty:
raise BioactivitiesNotFoundError(
"No bioactivities found after calculating pchembl_values. To investigate, use "
"either methods CompoundMapper.chembl.api.downloader.get_full_activity_data_sql or "
"CompoundMapper.chembl.api.webresource.get_full_activity_data."
)
return bioactivities_df