Skip to content

Constraints

This section demonstrates how to use the Constraints module in ydata-sdk.

Don't forget to set up your license key before running the example:

    # Store the license key in the YDATA_LICENSE_KEY environment variable;
    # replace the '{add-your-key}' placeholder with your own key.
    import os

    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

import numpy as np
import pandas as pd

from ydata.constraints.engine import ConstraintEngine
from ydata.constraints.rows import (CustomRowConstraint, EqualColumnConstraint, GreaterThanColumnConstraint,
                                    IntervalColumnConstraint, LowerThanColumnConstraint)
from ydata.dataset import Dataset


def create_dummy_dataset(size: int = 100) -> Dataset:
    """Build a small synthetic ``Dataset`` for the constraint examples.

    Args:
        size: number of rows to generate.

    Returns:
        A ``Dataset`` wrapping a DataFrame with five columns:
        ``constant`` (all ones), ``low_cardinality`` (random 0/1 integers),
        ``ascending`` (0 .. size-1), ``negatives`` (0 .. -(size-1)) and
        ``missings`` (all NaN).
    """
    row_index = np.arange(size)
    columns = {
        "constant": np.ones(size),
        # randint(2, ...) draws integers from {0, 1}.
        "low_cardinality": np.random.randint(2, size=size),
        "ascending": row_index,
        # Starts at 0, so this column is non-positive rather than strictly negative.
        "negatives": -1 * row_index,
        "missings": [np.nan] * size,
    }
    frame = pd.DataFrame(columns)
    return Dataset(frame)


def nunique(column):
    """Return the number of distinct values reported by ``column.nunique()``."""
    distinct_count = column.nunique()
    return distinct_count


def col_sum(column):
    """Return the total reported by ``column.sum()``."""
    total = column.sum()
    return total


def all_negatives(column):
    """Return True when every element of ``column`` is strictly below zero."""
    is_negative = column < 0
    return all(is_negative)


def validate_constraints(dataset):
    """Validate ``dataset`` against a fixed set of example constraints.

    Six constraints are built, registered on a fresh ``ConstraintEngine``
    and then validated against ``dataset``. The constraints refer to the
    columns produced by ``create_dummy_dataset``.

    Args:
        dataset: the dataset passed to ``ConstraintEngine.validate``.

    Returns:
        The ``ConstraintEngine`` after validation has run.
    """
    example_constraints = [
        # Expected to fail on "missings": the column holds only NaN values.
        GreaterThanColumnConstraint(
            check=nunique, columns=["constant", "missings"], value=0
        ),
        # Expected to pass.
        LowerThanColumnConstraint(
            check=col_sum, columns=["negatives"], value=0),
        # Expected to fail: the maximum of "negatives" is 0.
        EqualColumnConstraint(
            check=all_negatives, columns="negatives", value=True),
        # Expected to fail on exactly one row.
        CustomRowConstraint(
            check=lambda x: x < 0, columns="negatives", name="negative_rows"
        ),
        # Expected to pass: the cardinality of these columns is at most 2.
        EqualColumnConstraint(
            nunique, columns=["constant", "low_cardinality"], value=1, tolerance=1
        ),
        # Expected to fail: "ascending" has high cardinality.
        IntervalColumnConstraint(
            nunique, columns=["ascending"], lower_bound=0, upper_bound=1
        ),
    ]

    engine = ConstraintEngine()
    engine.add_constraints(example_constraints)
    engine.validate(dataset)
    return engine


if __name__ == "__main__":
    # Build the dummy dataset and validate it in one step.
    constraint_engine = validate_constraints(create_dummy_dataset())

    # Printing the engine gives a summary of the constraint engine results.
    print(constraint_engine)

    # The repr shows only which constraints have been added to the engine.
    print(repr(constraint_engine))