Enforce pre-defined validation constraints¶

In a previous guide, you defined validation constraints ad-hoc when initializing Curator objects.

Often, you want to enforce a pre-defined set of validation constraints, as the CELLxGENE curator does (see Curate AnnData based on the CELLxGENE schema).

This guide shows how to subclass Curator to enforce pre-defined constraints.

Define a custom curator¶

Consider the example of electronic health records (EHR). We want to ensure that

- every record has the fields disease, phenotype, developmental_stage, and age
- values for disease, phenotype, and developmental_stage map against specific versions of pre-defined ontologies

The following implementation achieves the goal by subclassing DataFrameCurator.

EHR Curator¶

import bionty as bt
import pandas as pd
from lamindb.core import DataFrameCurator, logger
from lamindb.core.types import UPathStr

__version__ = "0.1.0"


class EHRCurator(DataFrameCurator):
    """Custom curation flow for electronic health record (EHR) data.

    The curator enforces a fixed, pre-defined set of constraints:

    - the columns ``disease``, ``phenotype``, ``developmental_stage`` and
      ``age`` must be present (checked in :meth:`validate`);
    - ``disease``, ``phenotype`` and ``developmental_stage`` are curated as
      categoricals against pinned ontology sources.
    """

    def __init__(self, data: pd.DataFrame | UPathStr):
        """Fill in default values and set up the pre-defined constraints.

        Args:
            data: The EHR records to curate. The object is stored on
                ``self.data`` and modified in place: default-valued columns
                are added when missing and missing values in them are filled.
        """
        # Curate these columns against the specified fields
        DEFAULT_CATEGORICALS = {
            "disease": bt.Disease.name,
            "phenotype": bt.Phenotype.name,
            "developmental_stage": bt.DevelopmentalStage.name,
        }

        # If columns or values are missing, we substitute with these defaults.
        # The keys must match the column names used in DEFAULT_CATEGORICALS and
        # in `validate`, otherwise a default silently creates an unrelated column.
        DEFAULT_VALUES = {
            "disease": "normal",
            "developmental_stage": "unknown",
            "phenotype": "unknown",
        }

        # Validate values onto the following ontology versions
        DEFAULT_SOURCES = {
            "disease": bt.Source.get(
                entity="bionty.Disease", name="mondo", version="2023-04-04"
            ),
            "developmental_stage": bt.Source.get(
                entity="bionty.DevelopmentalStage", name="hsapdv", version="2020-03-10"
            ),
            "phenotype": bt.Source.get(
                entity="bionty.Phenotype",
                name="hp",
                version="2023-06-17",
                organism="human",
            ),
        }

        # Keep a reference to the caller's object (no copy), so later in-place
        # fixes made by the caller are visible to `validate`.
        # NOTE(review): the signature also admits a path, but the default
        # filling below needs a DataFrame (`.columns`) - confirm whether paths
        # have to be loaded before this point.
        self.data = data

        for col, default in DEFAULT_VALUES.items():
            if col not in self.data.columns:
                # Column absent: create it, filled entirely with the default
                self.data[col] = default
            else:
                # Column present: only replace the missing entries
                self.data[col] = self.data[col].fillna(default)

        super().__init__(
            df=self.data,
            categoricals=DEFAULT_CATEGORICALS,
            sources=DEFAULT_SOURCES,
            organism="human",
        )

    def validate(self, organism: str | None = None) -> bool:
        """Validate the data against the internal EHR standard.

        Args:
            organism: Forwarded unchanged to ``DataFrameCurator.validate``.

        Returns:
            ``False`` if any required column is missing (an error is logged
            and the parent validation is not run); otherwise the result of
            ``DataFrameCurator.validate``.
        """
        required_columns = {"disease", "phenotype", "developmental_stage", "age"}
        # Sorted so that the log message is deterministic across runs
        missing_columns = sorted(required_columns - set(self.data.columns))
        if missing_columns:
            logger.error(
                f"Columns {', '.join(map(repr, missing_columns))} are missing but required."
            )
            return False

        return super().validate(organism)

Use the custom curator¶

!lamin init --storage ./subclass-curator --modules bionty

→ initialized lamindb: testuser1/subclass-curator

import lamindb as ln
import bionty as bt
import pandas as pd
from ehrcurator import EHRCurator

ln.track("2XEr2IA4n1w40000")

→ connected lamindb: testuser1/subclass-curator

→ created Transform('2XEr2IA4n1w40000'), started new Run('YhbaIXbd...') at 2025-01-12 14:05:04 UTC

→ notebook imports: bionty==1.0a1 ehrcurator lamindb==1.0a2 pandas==2.2.3

# Build an example DataFrame that contains all mandatory columns, except that
# the age column is wrongly named 'patient_age'
column_names = ["disease", "phenotype", "developmental_stage", "patient_age"]
records = [
    ("Alzheimer disease", "Mental deterioration", "Adult", 70),
    ("diabetes mellitus", "Hyperglycemia", "Adult", 55),
    ("breast cancer", "Tumor growth", "Adult", 60),
    ("Hypertension", "Increased blood pressure", "Adult", 65),
    ("asthma", "Airway inflammation", "Child", 12),
]
# Transpose the per-patient records into a column-oriented mapping
data = {name: list(values) for name, values in zip(column_names, zip(*records))}
df = pd.DataFrame(data)
df

Show code cell output Hide code cell output

	disease	phenotype	developmental_stage	patient_age
0	Alzheimer disease	Mental deterioration	Adult	70
1	diabetes mellitus	Hyperglycemia	Adult	55
2	breast cancer	Tumor growth	Adult	60
3	Hypertension	Increased blood pressure	Adult	65
4	asthma	Airway inflammation	Child	12

# Create the curator on the example DataFrame. This first validation returns
# False and logs an error, because the required 'age' column is missing
# (the DataFrame only has 'patient_age').
ehrcurator = EHRCurator(df)
ehrcurator.validate()

# Fix the name of the wrongly named column. `df` is the same object that was
# passed to EHRCurator, so the existing curator is re-validated, not re-created.
df.columns = df.columns.str.replace("patient_age", "age")
ehrcurator.validate()

# Use lookup objects over the public ontologies to curate the values
# (presumably the values replaced below are the ones the previous validate()
# call reported as not validated - confirm against its output)
disease_lo = bt.Disease.public().lookup()
phenotype_lo = bt.Phenotype.public().lookup()
developmental_stage_lo = bt.DevelopmentalStage.public().lookup()

# Each column is replaced by assigning back into `df`, so the curator sees the
# curated values on the next validate() call
df["disease"] = df["disease"].replace(
    {"Hypertension": disease_lo.hypertensive_disorder.name}
)
df["phenotype"] = df["phenotype"].replace(
    {
        "Tumor growth": phenotype_lo.neoplasm.name,
        "Airway inflammation": phenotype_lo.bronchitis.name,
    }
)
# NOTE(review): 'Adult' is mapped to the adolescent stage term - confirm this
# is intended rather than an adult stage term of the ontology
df["developmental_stage"] = df["developmental_stage"].replace(
    {
        "Adult": developmental_stage_lo.adolescent_stage.name,
        "Child": developmental_stage_lo.child_stage.name,
    }
)

# Re-run the validation on the curated values
ehrcurator.validate()