Source code for Capricho.chembl.processing

"""Module holding functionalities for the ChEMBL API."""

from typing import List, Literal, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd

from ..core.pandas_helper import add_comment
from ..logger import logger
from .api.downloader import get_assay_size_sql, get_full_activity_data_sql
from .data_flag_functions import (
    flag_calculated_pchembl,
    flag_incompatible_units,
    flag_potential_duplicate,
    flag_with_data_validity_comment,
)
from .exceptions import BioactivitiesNotFoundError


[docs] def convert_to_log10(df: pd.DataFrame) -> pd.DataFrame: """Function to be applied to the whole DataFrame. Will convert the standard_value column to pchembl_value column, if the standard_units are in nM, µM or uM. Activities with incompatible units are flagged and preserved with pchembl_value=NaN for transparency. Args: df: a bioactivity DataFrame. e.g.: output from `get_activity_table`. Returns: pd.DataFrame: the DataFrame with the pchembl_value column added. Activities with incompatible units are flagged and have pchembl_value=NaN. """ def compute_log(row): unit = row["standard_units"] if pd.isna(unit): return np.nan value = row["standard_value"] if value == 0: # avoid division by zero return np.nan if unit == "mM": value_in_M = value * 1e-3 elif unit == "µM" or unit == "uM": value_in_M = value * 1e-6 elif unit == "nM": value_in_M = value * 1e-9 else: return np.nan # Unit not compatible with pChEMBL calculation return -np.log10(value_in_M) desired_units = ["nM", "µM", "uM", "mM"] # noqa: F841 df = df.copy().pipe(flag_incompatible_units) # flag incompatible units w/ pchembl calculation e.g.; % # Filter to convertible units for calculation, but preserve incompatible ones after convertible_mask = df["standard_units"].isin(desired_units) | df["standard_units"].isna() convertible_df = df[convertible_mask].copy() incompatible_df = df[~convertible_mask].copy() if "pchembl_value" not in convertible_df.columns: raise ValueError("pchembl_value column not found in input DataFrame.") elif (~convertible_df.pchembl_value.isna()).any(): raise ValueError( "input DataFrame should only have pchembl_value.isna() values. If not, use " "chembl.processing.process_bioactivities instead." ) if convertible_df.shape[0] > 0: # Calculate pChEMBL for convertible units convertible_df = convertible_df.pipe(flag_calculated_pchembl) convertible_df = convertible_df.assign(pchembl_value=lambda x: x.apply(compute_log, axis=1)) with pd.option_context("future.no_silent_downcasting", True): pchembl_inf_or_nan = convertible_df.replace([np.inf, -np.inf], np.nan).query( "pchembl_value.isna()" ) if not pchembl_inf_or_nan.empty: debug_cols = [ "target_chembl_id", "assay_chembl_id", "assay_type", "molecule_chembl_id", "standard_units", "standard_value", ] _info = pchembl_inf_or_nan.loc[pchembl_inf_or_nan.index[:6], debug_cols] comment = "Infinite or NaN pchembl_value after calculation" logger.info(f"Flagging {len(pchembl_inf_or_nan)} rows: {comment}:\n{_info}") convertible_df = add_comment( df=convertible_df, comment=comment, criteria_func=lambda x: x.index.isin(pchembl_inf_or_nan.index), target_column="pchembl_value", # Dummy target, criteria_func handles selection comment_type="d", ) if incompatible_df.shape[0] > 0: # Recombine convertible and incompatible dataframes # Ensure incompatible_df has pchembl_value column (will be NaN) if "pchembl_value" not in incompatible_df.columns: incompatible_df["pchembl_value"] = np.nan result_df = pd.concat([convertible_df, incompatible_df], ignore_index=True) else: result_df = convertible_df return result_df
[docs] def curate_activity_pairs( df: pd.DataFrame, mol_id_col: str = "molecule_chembl_id", assay_id_col: str = "assay_chembl_id", activity_value_col: str = "pchembl_value", ) -> pd.DataFrame: """Curate activity pairs for the same molecule across different assays. Removes or flags pairs of measurements if their activity values (e.g., pChEMBL values) differ by approximately 3.0 or 6.0 Filter inspired on Landrum & Riniker, 2024, where the authors state: > Given the very low probability of two separate experiments producing exactly the same results, the exact matches are most likely cases where values from a previous paper are copied into a new one; this was discussed in the earlier work by Kramer et al. (10) and spot-checked with a number of assay pairs here. Args: df: DataFrame with bioactivity data. mol_id_col: column name for molecule IDs. Defaults to "molecule_chembl_id" for ChEMBL data. assay_id_col: column name for assay IDs. Defaults to "assay_chembl_id" for ChEMBL data. activity_value_col: column name for activity values. Defaults to "pchembl_value" for ChEMBL data. Returns: pd.DataFrame: The curated DataFrame. """ df = df.copy() if not {mol_id_col, assay_id_col, activity_value_col}.issubset(df.columns): logger.warning( "Skipping activity pair curation: Required columns " f"({mol_id_col}, {assay_id_col}, {activity_value_col}) not found." ) return df if df.empty: logger.info("Input DataFrame is empty. Skipping activity pair curation.") return df # Prepare DataFrame for merge by adding original index as a new column df_for_merge = df.copy() # Create a unique name for the temporary column holding original index values temp_orig_idx_col = "__original_index__" _i = 0 while temp_orig_idx_col in df_for_merge.columns: temp_orig_idx_col = f"__original_index__{_i}" _i += 1 df_for_merge[temp_orig_idx_col] = df.index # Use original df.index # Self-merge on molecule ID merged_df = pd.merge(df_for_merge, df_for_merge, on=mol_id_col, suffixes=("_L", "_R")) # Define column names for easier access orig_idx_col_L = temp_orig_idx_col + "_L" orig_idx_col_R = temp_orig_idx_col + "_R" assay_col_L = assay_id_col + "_L" assay_col_R = assay_id_col + "_R" activity_col_L = activity_value_col + "_L" activity_col_R = activity_value_col + "_R" # Filter for pairs: # 1. From different assays condition_diff_assays = merged_df[assay_col_L] != merged_df[assay_col_R] # 2. Unique pairs of original rows (avoid self-comparison and duplicate (rowA,rowB)/(rowB,rowA) pairs) condition_unique_rows = merged_df[orig_idx_col_L] < merged_df[orig_idx_col_R] valid_pairs = merged_df[condition_diff_assays & condition_unique_rows].copy() if valid_pairs.empty: logger.info( "No potential activity pairs found after initial structural filtering (diff assays, unique rows)." ) return df # Handle NaNs in activity values for the pairs valid_pairs.dropna(subset=[activity_col_L, activity_col_R], inplace=True) if valid_pairs.empty: logger.info("No valid pairs with non-NaN activity values found for curation.") return df # Calculate absolute difference in activity values valid_pairs["abs_diff"] = np.abs(valid_pairs[activity_col_L] - valid_pairs[activity_col_R]) # Check if the absolute difference is close to 3.0 and 6.0 # we use np.isclose to handle floating point precision issues (e.g.: 3.000000001) error_in_exact_3 = np.isclose(valid_pairs["abs_diff"], 3.0, rtol=1e-9, atol=1e-9) error_in_exact_6 = np.isclose(valid_pairs["abs_diff"], 6.0, rtol=1e-9, atol=1e-9) problematic_pairs = valid_pairs[error_in_exact_3 | error_in_exact_6] rows_to_flag_indices = set() if not problematic_pairs.empty: indices_L = problematic_pairs[orig_idx_col_L] indices_R = problematic_pairs[orig_idx_col_R] rows_to_flag_indices.update(indices_L.unique()) rows_to_flag_indices.update(indices_R.unique()) for _, row_pair in problematic_pairs.iterrows(): # iterrows on small problematic_pairs df for logging logger.debug( f"Marking/flagging rows for molecule {row_pair[mol_id_col]} (indices: {row_pair[orig_idx_col_L]}, {row_pair[orig_idx_col_R]}), " f"assays {row_pair[assay_col_L]} (value: {row_pair[activity_col_L]}) and " f"{row_pair[assay_col_R]} (value: {row_pair[activity_col_R]}) " f"due to activity value difference of 3.0 or 6.0." ) if rows_to_flag_indices: comment = "Unit Annotation Error" to_rm_idxs = list(rows_to_flag_indices) logger.info(f"Activity Curation: Flagging {len(to_rm_idxs)} measurements due to {comment.lower()}") df = add_comment( df=df, comment=comment, criteria_func=lambda x: x.index.isin(to_rm_idxs), target_column=activity_value_col, comment_type="d", ) else: logger.info("No activity pairs found meeting the curation criteria (activity diff ~3.0 | ~6.0).") return df
[docs] def process_bioactivities( bioactivities_df: pd.DataFrame, calculate_pchembl: bool = True, curate_annotation_errors: bool = True, require_document_date: bool = False, value_col: str = "pchembl_value", ) -> pd.DataFrame: """Processes the bioactivities DataFrame. Will convert the standard_value column to pchembl_value column if the standard_units are in mM, µM, uM, or nM. If the standard_units are in log, the original value in the pchembl_value is preserved. Args: bioactivities_df: bioactivity dataframe, e.g.: output from `get_activity_table`. calculate_pchembl: Whether to calculate pChEMBL values. When aggregating on standard_value (value_col != "pchembl_value"), this enables calculation of pchembl_value for compatible units (e.g., nM, µM, mM). Though those are available for high quality data, censored data does not have a pChEMBL value readily available and they'll need to be calculated. curate_annotation_errors: Whether to apply activity curation based on pChEMBL values diverging in exactly 3.0 (indicate possible annotation errors). Defaults to True. require_document_date: Whether to filter out activities without a document year. value_col: Column name for aggregation values. Defaults to "pchembl_value". When set to "standard_value", pchembl_value filtering is skipped. Returns: pd.DataFrame: the processed bioactivities DataFrame. """ bioactivities_df = bioactivities_df.astype({"standard_value": "float32", "pchembl_value": "float32"}) with pd.option_context("future.no_silent_downcasting", True): bioactivities_df = bioactivities_df.replace({None: np.nan}).infer_objects(copy=False) bioactivities_df = ( bioactivities_df.pipe(flag_with_data_validity_comment) # .query("data_validity_comment.isna()") .pipe(flag_potential_duplicate) # .query("potential_duplicate == 0") .drop(columns=["data_validity_comment", "potential_duplicate"]) ) # When aggregating on pchembl_value (default), opportunistically calculate missing values. # When aggregating on standard_value, skip this filtering to avoid unnecessary work. if value_col == "pchembl_value": # Separate activities with and without pChEMBL values with_pchembl = bioactivities_df.query("~pchembl_value.isna()").copy() without_pchembl = bioactivities_df.query("pchembl_value.isna()").copy() if calculate_pchembl and not without_pchembl.empty: # Calculate pChEMBL for activities without it (preserves incompatible units) without_pchembl = convert_to_log10(without_pchembl) bioactivities_df = pd.concat([with_pchembl, without_pchembl], ignore_index=True) elif not calculate_pchembl and not without_pchembl.empty: # Keep activities without pChEMBL (incompatible units already flagged) bioactivities_df = pd.concat([with_pchembl, without_pchembl], ignore_index=True) else: # Either calculate_pchembl=True and no missing values, or calculate_pchembl=False and all have values bioactivities_df = with_pchembl if without_pchembl.empty else bioactivities_df else: # When aggregating on non-pchembl values (e.g., standard_value), still opportunistically # calculate pchembl_value for compatible units if requested, but don't filter based on it if calculate_pchembl: without_pchembl = bioactivities_df.query("pchembl_value.isna()").copy() if not without_pchembl.empty: # Calculate pChEMBL for activities without it (preserves incompatible units) without_pchembl = convert_to_log10(without_pchembl) with_pchembl = bioactivities_df.query("~pchembl_value.isna()").copy() bioactivities_df = pd.concat([with_pchembl, without_pchembl], ignore_index=True) if curate_annotation_errors: bioactivities_df = curate_activity_pairs(bioactivities_df) if require_document_date: if "year" not in bioactivities_df.columns: logger.warning( "Document date curation enabled, but 'year' column not found. Skipping this curation." ) else: original_count = len(bioactivities_df) bioactivities_df = bioactivities_df[bioactivities_df["year"].notna()] removed_count = original_count - len(bioactivities_df) if removed_count > 0: logger.info( f"Document Date Curation: Removed {removed_count} measurements lacking a document year." ) return bioactivities_df.reset_index(drop=True)
[docs] def get_bioactivities_workflow( molecule_chembl_ids: Optional[Union[list, str]] = None, target_chembl_ids: Optional[Union[list, str]] = None, assay_chembl_ids: Optional[Union[list, str]] = None, document_chembl_ids: Optional[Union[list, str]] = None, standard_relation: Optional[List[str]] = None, standard_type: Optional[List[str]] = None, standard_units: Optional[List[str]] = None, confidence_scores: Union[list, Tuple] = (9, 8), assay_types: Union[list, Tuple] = ("B", "F"), chembl_release: Optional[int] = None, additional_fields: Optional[List[str]] = None, prefix: Optional[Sequence[str]] = None, version: Optional[Union[int, str]] = None, calculate_pchembl: bool = False, curate_annotation_errors: bool = True, require_document_date: bool = False, backend: Literal["downloader", "webresource"] = "downloader", value_col: str = "pchembl_value", ): """Perform the first step of the bioactivity data retrieval workflow. These are: 1. Get ChEMBL data using any of the input identifiers: molecule_chembl_ids, target_chembl_ids, assay_chembl_ids, or document_chembl_ids. Additional filters are supported by other input parameters. 2. Once the data is retrieved, the bioactivities are processed according to the `calculate_pchembl` parameter. ChEMBL calculates pChEMBL values for activity data with the following criteria: - "standard_type" in "[IC50", "XC50", "EC50", "AC50", "Ki", "Kd", "Potency", "ED50"]; - "standard_relation" == "=" & "standard_units" == "nM"; - "standard_value" > 0 & "data_validity_comment"].isnull()) | "data_validity_comment" == "Manually validated" By passing this parameter to True, pchembl values will be calculated for bioactivities reported in nM, µM or uM `standard_unit`, or -Log|Log `standard_type`. 3. The first quality filter is applied. Data containing "data_validity_description" or "potential_duplicate" flags are immediatelly removed from the DataFrame. Args: molecule_chembl_ids: list of ChEMBL molecule IDs to fetch data for. Defaults to None. target_chembl_ids: list of ChEMBL target IDs to fetch data for. Defaults to None. assay_chembl_ids: list of ChEMBL assay IDs to fetch data for. Defaults to None. document_chembl_ids: list of ChEMBL document IDs to fetch data for. Defaults to None. standard_relation: Optional filter for standard relation types (e.g., ["=", "<", ">"]) standard_type: Optional filter for activity types (e.g., ["IC50", "Ki", "EC50"]) confidence_scores: list of confidence scores to filter the fetched assay data. Defaults to (9, 8). assay_types: list of assay types to be fetched from ChEMBL. Defaults to binding (B) and functional (F) data. chembl_release: Not to confuse for `version`. This is the ChEMBL release number used to filter the data. Defaults to None. additional_fields: `backend=="downloader"` only! "Optional list of additional fields to include in the sql query. E.g.: ["vs.sequence"], to retrieve the sequence of the variant, if available. Defaults to None. prefix: `backend=="downloader"` only! prefix to be used by pystow for storing the data on a custom directory. Defaults to None. version: `backend=="downloader"` only! version of the ChEMBL database to be downloaded by chembl_downloader. If left as None, the latest version will be downloaded. Defaults to None. curate_annotation_errors: Whether to apply activity curation based on pChEMBL values diverging in exactly 3.0 (indicate possible annotation errors). Defaults to True. calculate_pchembl: calculate pChEMBL values for bioactivities reported in nM, µM or uM `standard_unit` or -Log|Log `standard_type`. Defaults to False backend: the backend to be used for fetching the data. If downloader, the ChEMBL sql database is downloaded and extracted first. Defaults to "downloader". value_col: Column name for values to be used during aggregation. Defaults to "pchembl_value". When set to "standard_value", pchembl filtering is skipped and pchembl values are only calculated opportunistically for compatible units. Defaults to "pchembl_value". Raises: BioactivitiesNotFoundError: If the retrieved bioactivity dataframe is empty. """ if backend == "downloader": bioactivities_df = get_full_activity_data_sql( molecule_chembl_ids=molecule_chembl_ids, target_chembl_ids=target_chembl_ids, assay_chembl_ids=assay_chembl_ids, document_chembl_ids=document_chembl_ids, standard_relation=standard_relation, standard_type=standard_type, standard_units=standard_units, confidence_scores=confidence_scores, assay_types=assay_types, chembl_release=chembl_release, additional_fields=additional_fields, prefix=prefix, version=version, ) assay_sizes = get_assay_size_sql( assay_chembl_ids=bioactivities_df.assay_chembl_id.unique().tolist(), prefix=prefix, version=version, ) bioactivities_df = bioactivities_df.merge(assay_sizes, on="assay_chembl_id") elif backend == "webresource": from .api.webresource import get_full_activity_data bioactivities_df = get_full_activity_data( molecule_chembl_ids=molecule_chembl_ids, target_chembl_ids=target_chembl_ids, assay_chembl_ids=assay_chembl_ids, document_chembl_ids=document_chembl_ids, confidence_scores=confidence_scores, assay_types=assay_types, chembl_release=chembl_release, ).sort_values(by=["molecule_chembl_id", "activity_id", "standard_value"]) if bioactivities_df.empty: raise BioactivitiesNotFoundError( "No bioactivities found for the given query: " f"molecule_chembl_ids={molecule_chembl_ids}, " f"target_chembl_ids={target_chembl_ids}, " f"assay_chembl_ids={assay_chembl_ids}, " f"document_chembl_ids={document_chembl_ids}, " ) bioactivities_df = process_bioactivities( bioactivities_df, calculate_pchembl=calculate_pchembl, curate_annotation_errors=curate_annotation_errors, require_document_date=require_document_date, value_col=value_col, ) if bioactivities_df.empty: raise BioactivitiesNotFoundError( "No bioactivities found after calculating pchembl_values. To investigate, use " "either methods CompoundMapper.chembl.api.downloader.get_full_activity_data_sql or " "CompoundMapper.chembl.api.webresource.get_full_activity_data." ) return bioactivities_df