Synthetic Data with Calculated Features
This section demonstrates how to use the Calculated Features
module, also known as Business Rules, in ydata-sdk.
Don't forget to set up your license key
Example Code
from pathlib import Path
from warnings import filterwarnings
from examples.local import setting_dask_env
from ydata.connectors import GCSConnector
from ydata.metadata.metadata import Metadata
from ydata.synthesizers.regular import RegularSynthesizer
from ydata.utils.data_types import CATEGORICAL_DTYPES, DataType
from ydata.utils.formats import read_json
# Install an "ignore" filter so that warnings are not displayed during the run.
filterwarnings("ignore")
# When running this example locally, set the environment variable RUNNING_ENV=LOCAL first.
# NOTE(review): setting_dask_env is imported from examples.local; presumably it
# configures the Dask environment for this run - confirm against that module.
setting_dask_env()
def get_token(token_name):
    """Read a credentials file from the project's ``.secrets`` directory.

    The ``.secrets`` directory is looked up three ``parent`` steps above the
    absolute path of this file.

    Args:
        token_name: File name of the token inside ``.secrets``
            (for example ``"gcs_credentials.json"``).

    Returns:
        Whatever ``read_json`` returns for that path (presumably the parsed
        JSON content - verify against ``ydata.utils.formats.read_json``).
    """
    this_file = Path(__file__).absolute()
    secrets_dir = this_file.parent.parent.parent / ".secrets"
    return read_json(secrets_dir / token_name)
def get_revolving_util(revol_bal, total_rev_hi_lim):
    """Compute the revolving credit utilization.

    The utilization is the current revolving balance divided by the
    revolving credit limit.

    Args:
        revol_bal: Current revolving balance; must support element-wise
            division and expose ``.values`` (e.g. a pandas Series).
        total_rev_hi_lim: Revolving credit limit, same kind of object.

    Returns:
        The element-wise ratio, as the array returned by ``.values``.
    """
    return (revol_bal / total_rev_hi_lim).values


def get_installment(int_rate, loan_amnt, term):
    """Compute the monthly installment of an amortized loan.

    Uses the standard amortization formula
    ``loan_amnt * r * (1 + r) ** n / ((1 + r) ** n - 1)`` with the periodic
    rate ``r = int_rate / 12`` and ``n`` monthly periods. When the rate is
    zero the formula degenerates to 0/0, so the zero-interest limit
    ``loan_amnt / n`` is returned instead of NaN.

    Args:
        int_rate: Annual interest rate per loan.
            NOTE(review): assumed to be a fraction (0.12 for 12%), not a
            percentage - confirm against the dataset.
        loan_amnt: Loan principal per loan.
        term: Loan term as strings with a trailing ``"m"`` (e.g. ``"36m"``).

    Returns:
        ``loan_amnt`` multiplied element-wise by the amortization factor
        (same container type as ``loan_amnt``).
    """
    n = term.str.rstrip("m").astype("int")  # The total number of periods
    # The annual interest adjusted to the monthly installment periods
    period_int = int_rate / 12
    growth = (1 + period_int) ** n
    factor = (period_int * growth) / (growth - 1)
    # Zero interest: the payment is simply the principal split over n periods.
    factor = factor.where(period_int != 0, 1 / n)
    return loan_amnt * factor.values


def get_total_payment(total_rec_int, total_rec_late_fee, total_rec_prncp):
    """Compute the total payment as the sum of all payment parcels.

    Args:
        total_rec_int: Interest received.
        total_rec_late_fee: Late fees received.
        total_rec_prncp: Principal received.

    Returns:
        The element-wise sum, as the array returned by ``.values``.
    """
    return (total_rec_int + total_rec_late_fee + total_rec_prncp).values


if __name__ == "__main__":
    # Read the data file from Google Cloud Storage.
    connector = GCSConnector(
        project_id="bucketname",
        keyfile_dict=get_token("gcs_credentials.json"),
    )
    real_ds = connector.read_file(
        "gs://path-to-file/data.csv.zip", file_type="csv"
    )

    # Inspect the head of the dataset (printed so it is visible when run
    # as a script, not only in an interactive session).
    print(real_ds._data.head().T)

    metadata = Metadata()
    metadata(real_ds)

    # Only the "term" feature should stay categorical: every other column
    # detected as categorical is switched to numerical.
    updated_dtypes = []
    for column in metadata.columns.values():
        if column.datatype in CATEGORICAL_DTYPES and column.name != "term":
            metadata.columns[column.name].datatype = DataType.NUMERICAL
            updated_dtypes.append(column.name)

    SAMPLE_SIZE = 1000

    # Fit a synthesizer without using Calculated Features
    synth_reg = RegularSynthesizer()
    synth_reg.fit(real_ds, metadata=metadata)

    # Obtain samples
    samples_reg = synth_reg.sample(SAMPLE_SIZE)

    # Each entry names the feature to compute, the function that computes
    # it, and the columns passed to that function.
    calculated_features = [
        {
            "calculated_features": "revol_util",
            "function": get_revolving_util,
            "calculated_from": ["revol_bal", "total_rev_hi_lim"],
        },
        {
            "calculated_features": "installment",
            "function": get_installment,
            "calculated_from": ["int_rate", "loan_amnt", "term"],
        },
        {
            "calculated_features": "total_pymnt",
            "function": get_total_payment,
            "calculated_from": [
                "total_rec_int",
                "total_rec_late_fee",
                "total_rec_prncp",
            ],
        },
    ]

    # Fit a synthesizer using Calculated Features
    synth_calcft = RegularSynthesizer()
    synth_calcft.fit(
        real_ds, metadata=metadata, calculated_features=calculated_features
    )

    # Obtain samples
    samples_calcft = synth_calcft.sample(SAMPLE_SIZE)

    # Note that, without calculated features, total_pymnt does not match the
    # sum of total_rec_int, total_rec_late_fee and total_rec_prncp.
    print(samples_reg._data.tail())
    print(samples_calcft._data.tail())