Skip to content

Synthetic Data with Calculated Features

This section demonstrates how to use the Calculated Features module, also known as Business Rules, in ydata-sdk.

Don't forget to set up your license key before running the example:

    import os

    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

from pathlib import Path
from warnings import filterwarnings

from examples.local import setting_dask_env
from ydata.connectors import GCSConnector
from ydata.metadata.metadata import Metadata
from ydata.synthesizers.regular import RegularSynthesizer
from ydata.utils.data_types import CATEGORICAL_DTYPES, DataType
from ydata.utils.formats import read_json

# Suppress every warning for the remainder of the run.
filterwarnings("ignore")


# When running this example locally, set the environment variable
# RUNNING_ENV=LOCAL before launching it.
# NOTE(review): setting_dask_env presumably reads that variable to configure
# Dask -- confirm in examples.local.
setting_dask_env()


def get_token(token_name):
    """Read a token file from the project's ``.secrets`` directory.

    The ``.secrets`` directory is looked up two levels above the directory
    that contains this file (i.e. ``<file>/../../../.secrets``).

    Args:
        token_name: File name of the token inside ``.secrets``.

    Returns:
        Whatever ``read_json`` yields for that path (presumably the parsed
        JSON credentials -- confirm against ``ydata.utils.formats``).
    """
    this_file = Path(__file__).absolute()
    secrets_root = this_file.parent.parent.parent
    return read_json(secrets_root / ".secrets" / token_name)


if __name__ == "__main__":
    # Reading our data file from the GCS cloud
    connector = GCSConnector(
        project_id="bucketname", keyfile_dict=get_token("gcs_credentials.json")
    )
    real_ds = connector.read_file(
        "gs://path-to-file/data.csv.zip", file_type="csv"
    )

    # Inspecting head of the dataset. When this runs as a script (not in a
    # notebook cell) a bare expression is evaluated and thrown away, so the
    # transposed head has to be printed explicitly to be visible.
    print(real_ds._data.head().T)

    # Build the metadata by calling a fresh Metadata object on the dataset.
    metadata = Metadata()
    metadata(real_ds)

    # Only term feature datatype should be categorical: every other column
    # detected as one of CATEGORICAL_DTYPES is switched to numerical.
    # updated_dtypes records the names of the columns that were changed.
    updated_dtypes = []
    for column in metadata.columns.values():
        if column.datatype in CATEGORICAL_DTYPES and column.name != "term":
            metadata.columns[column.name].datatype = DataType.NUMERICAL
            updated_dtypes.append(column.name)

    # Number of synthetic rows requested from each synthesizer.
    SAMPLE_SIZE = 1000
    # Fit a synthesizer without using Calculated Features
    synth_reg = RegularSynthesizer()
    synth_reg.fit(real_ds, metadata=metadata)

    # Obtaining samples
    samples_reg = synth_reg.sample(SAMPLE_SIZE)

    def get_revolving_util(revol_bal, total_rev_hi_lim):
        """Compute the revolving credit utilization.

        The current revolving balance is divided element-wise by the credit
        limit, and the ``.values`` of that quotient are returned.
        """
        utilization = revol_bal / total_rev_hi_lim
        return utilization.values

    def get_installment(int_rate, loan_amnt, term):
        """Compute the monthly installment of an amortizing loan.

        Uses the amortization formula
        ``loan_amnt * r * (1 + r) ** n / ((1 + r) ** n - 1)`` with
        ``r = int_rate / 12`` and ``n`` the number of periods.

        Args:
            int_rate: Annual interest rate per loan (assumed to be expressed
                as a fraction, not a percentage -- TODO confirm against the
                dataset).
            loan_amnt: Principal per loan.
            term: String column holding the number of periods with an
                optional trailing ``m`` (e.g. ``"36m"``).

        Returns:
            ``loan_amnt`` multiplied by the per-row installment factor. The
            factor is applied positionally (via ``.values``), so the result
            keeps the type and index of ``loan_amnt``.
        """
        n = term.str.rstrip("m").astype("int")  # The total number of periods
        period_int = (
            int_rate / 12
        )  # The adjusted annual interest for the monthly installment periods
        growth = (1 + period_int) ** n
        factor = (period_int * growth) / (growth - 1)
        # A zero rate makes the formula evaluate 0 / 0 (NaN); in that case the
        # principal is simply spread evenly over the n periods.
        factor = factor.where(period_int != 0, 1 / n)
        return loan_amnt * factor.values

    def get_total_payment(total_rec_int, total_rec_late_fee, total_rec_prncp):
        """Compute the total payment as the sum of all payment parcels.

        Adds interest, late fees and principal element-wise and returns the
        ``.values`` of the resulting sum.
        """
        interest_and_fees = total_rec_int + total_rec_late_fee
        total = interest_and_fees + total_rec_prncp
        return total.values

    # One rule per derived column: the column to compute, the function that
    # computes it and the columns it is computed from.
    # NOTE(review): the entries of "calculated_from" match the parameter names
    # of each function -- presumably that is how the library binds them; confirm.
    calculated_features = [
        dict(
            calculated_features="revol_util",
            function=get_revolving_util,
            calculated_from=["revol_bal", "total_rev_hi_lim"],
        ),
        dict(
            calculated_features="installment",
            function=get_installment,
            calculated_from=["int_rate", "loan_amnt", "term"],
        ),
        dict(
            calculated_features="total_pymnt",
            function=get_total_payment,
            calculated_from=[
                "total_rec_int",
                "total_rec_late_fee",
                "total_rec_prncp",
            ],
        ),
    ]

    # Second synthesizer: same data and metadata, this time fitted with the
    # Calculated Features rules.
    synth_calcft = RegularSynthesizer()
    synth_calcft.fit(
        real_ds,
        metadata=metadata,
        calculated_features=calculated_features,
    )

    # Draw the same number of rows as for the baseline synthesizer.
    samples_calcft = synth_calcft.sample(SAMPLE_SIZE)

    # Show the tail of both samples, baseline first. Without calculated
    # features, total_pymnt does not match the sum of total_rec_int,
    # total_rec_late_fee and total_rec_prncp.
    for sampled in (samples_reg, samples_calcft):
        print(sampled._data.tail())