Synthetic Data with Calculated Features
This section demonstrates how to use the Calculated Features
module, also known as Business Rules, in ydata-sdk.
Don't forget to set up your license key before running the example.
Example Code
from pathlib import Path
from warnings import filterwarnings
from ydata.connectors import GCSConnector
from ydata.metadata.metadata import Metadata
from ydata.synthesizers.regular import RegularSynthesizer
from ydata.utils.data_types import CATEGORICAL_DTYPES, DataType
from ydata.utils.formats import read_json
# Ignore every warning for the remainder of the run (presumably to keep the
# example's printed output uncluttered).
filterwarnings("ignore")
def get_token(token_name):
    """Read a JSON credentials file from the ``.secrets`` directory.

    The ``.secrets`` directory is resolved relative to this file: it is
    expected three ``parent`` steps above this file's absolute path.

    Args:
        token_name: Name of the credentials file inside ``.secrets``.

    Returns:
        Whatever ``read_json`` returns for the resolved path.
    """
    this_file = Path(__file__).absolute()
    secrets_dir = this_file.parent.parent.parent / ".secrets"
    return read_json(secrets_dir / token_name)
if __name__ == "__main__":
    # Read the data file from the GCS cloud, authenticating with the
    # credentials loaded by get_token.
    connector = GCSConnector(
        project_id="bucketname",
        keyfile_dict=get_token("gcs_credentials.json"),
    )
    real_ds = connector.read_file(
        "gs://path-to-file/data.csv.zip", file_type="csv"
    )

    # Inspect the head of the dataset (transposed). It is printed explicitly:
    # a bare expression is only echoed in a notebook/REPL, never in a script.
    print(real_ds._data.head().T)

    # Compute the metadata by calling a fresh Metadata object on the dataset.
    metadata = Metadata()
    metadata(real_ds)

    # Only the "term" feature should keep a categorical datatype: every other
    # column currently typed as categorical is switched to numerical.
    updated_dtypes = []  # names of the columns whose datatype was changed
    for column in metadata.columns.values():
        if column.datatype in CATEGORICAL_DTYPES and column.name != "term":
            metadata.columns[column.name].datatype = DataType.NUMERICAL
            updated_dtypes.append(column.name)

    # Number of synthetic rows requested from each synthesizer.
    SAMPLE_SIZE = 1000

    # Fit a synthesizer without using Calculated Features (the baseline).
    synth_reg = RegularSynthesizer()
    synth_reg.fit(real_ds, metadata=metadata)

    # Obtain the baseline samples.
    samples_reg = synth_reg.sample(SAMPLE_SIZE)
def get_revolving_util(revol_bal, total_rev_hi_lim):
    """Compute the revolving credit utilization.

    The utilization is the current revolving balance divided by the credit
    limit.

    Args:
        revol_bal: Current revolving balance; must support ``/`` and yield a
            result exposing ``.values`` (e.g. a pandas Series).
        total_rev_hi_lim: Credit limit, same kind of object as ``revol_bal``.

    Returns:
        The ``.values`` of the ratio ``revol_bal / total_rev_hi_lim``.
    """
    utilization = revol_bal / total_rev_hi_lim
    return utilization.values
def get_installment(int_rate, loan_amnt, term):
    """Compute the monthly installment of an amortization loan schedule.

    Applies the annuity formula
    ``loan_amnt * r * (1 + r) ** n / ((1 + r) ** n - 1)`` where
    ``r = int_rate / 12`` is the per-period rate and ``n`` the number of
    periods. When the denominator is exactly zero (a zero interest rate), the
    formula degenerates to 0/0, so its limit ``loan_amnt / n`` is used instead
    of returning NaN.

    Args:
        int_rate: Annual interest rate as a pandas Series. It is divided by 12
            as-is, so it is assumed to be a fraction rather than a
            percentage -- TODO confirm against the dataset.
        loan_amnt: Loan principal as a pandas Series.
        term: Loan term as a pandas Series of strings with a trailing ``"m"``
            (e.g. ``"36m"``).

    Returns:
        ``loan_amnt`` multiplied position-wise by the installment factor (a
        Series carrying ``loan_amnt``'s index).
    """
    # The total number of periods: strip the trailing "m" and parse as int.
    n = term.str.rstrip("m").astype("int")
    # The adjusted annual interest for the monthly installment periods.
    period_int = int_rate / 12

    growth = (1 + period_int) ** n
    denominator = growth - 1
    factor = (period_int * growth) / denominator
    # Zero denominator <=> no interest accrues: use the formula's limit 1 / n.
    # NaN denominators compare as "!= 0", so missing inputs still yield NaN.
    factor = factor.where(denominator != 0, 1 / n)

    # .values drops the factor's index so the product keeps loan_amnt's index
    # and is combined by position rather than by label.
    return loan_amnt * factor.values
def get_total_payment(total_rec_int, total_rec_late_fee, total_rec_prncp):
    """Compute the total payment as the sum of all payment parcels.

    Args:
        total_rec_int: Interest received; must support ``+`` and yield a
            result exposing ``.values`` (e.g. a pandas Series).
        total_rec_late_fee: Late fees received, same kind of object.
        total_rec_prncp: Principal received, same kind of object.

    Returns:
        The ``.values`` of
        ``total_rec_int + total_rec_late_fee + total_rec_prncp``.
    """
    total = total_rec_int + total_rec_late_fee + total_rec_prncp
    return total.values
if __name__ == "__main__":
    # One row per business rule: (target column, function computing it,
    # columns listed as its inputs, in the function's parameter order).
    _rules = (
        ("revol_util", get_revolving_util, ["revol_bal", "total_rev_hi_lim"]),
        ("installment", get_installment, ["int_rate", "loan_amnt", "term"]),
        (
            "total_pymnt",
            get_total_payment,
            ["total_rec_int", "total_rec_late_fee", "total_rec_prncp"],
        ),
    )
    # Expand the table into the list of dictionaries passed to the
    # synthesizer's `calculated_features` argument.
    calculated_features = [
        {
            "calculated_features": target,
            "function": function,
            "calculated_from": sources,
        }
        for target, function, sources in _rules
    ]

    # Fit a second synthesizer, this time using Calculated Features.
    synth_calcft = RegularSynthesizer()
    synth_calcft.fit(
        real_ds,
        metadata=metadata,
        calculated_features=calculated_features,
    )

    # Obtain samples from the synthesizer fitted with the rules.
    samples_calcft = synth_calcft.sample(SAMPLE_SIZE)

    # Print the tail of both sample sets, baseline first. Note that without
    # calculated features (samples_reg) total_pymnt does not match the sum of
    # total_rec_int, total_rec_late_fee and total_rec_prncp.
    for _samples in (samples_reg, samples_calcft):
        print(_samples._data.tail())