Multitable Anonymization
This section demonstrates how to use the Anonymization for Databases module in ydata-sdk.
Don't forget to set up your license key
Example Code
from ydata.metadata.multimetadata import MultiMetadata
from ydata.preprocessors.methods.anonymization import AnonymizerType
from ydata.synthesizers.multitable.model import MultiTableSynthesizer
if __name__ == "__main__":
    # Example: anonymize columns of a multi-table MySQL database while
    # fitting a MultiTableSynthesizer.
    from ydata.connectors.storages.rdbms_connector import MySQLConnector

    # --- Connection settings (placeholders; replace with real values) ---
    DB_TYPE = 'mysql'
    DB_PYTHON_LIBRARY = 'pymysql'
    USERNAME = 'username'
    PASSWORD = 'password'
    HOSTNAME = 'hostname'
    PORT = '3306'
    DATABASE_NAME = 'database_name'

    # NOTE(review): URI_STRUCTURE is built but never referenced again in this
    # example; the connector below is configured from the `conn_str` dict.
    URI_STRUCTURE = f"{DB_TYPE}+{DB_PYTHON_LIBRARY}://{USERNAME}:{PASSWORD}@{HOSTNAME}:{PORT}/{DATABASE_NAME}"

    conn_str = {
        "hostname": HOSTNAME,
        "username": USERNAME,
        "password": PASSWORD,
        "port": PORT,
        "database": DATABASE_NAME,
    }
    conn = MySQLConnector(conn_string=conn_str)

    # NOTE(review): `schema` is fetched but not used further in this example.
    schema = conn.get_database_schema()
    dataset = conn.read_database()

    # Declare the "date" column of these tables as datetime, parsed with the
    # "%y%m%d" format.
    for table_name in ('trans', 'account', 'loan'):
        dataset[table_name].astype(column="date", vartype="datetime", format="%y%m%d")

    # --- Metadata: 'trans' is declared a time series sorted by 'date' ---
    dataset_attrs = {
        'trans': {
            'sortbykey': 'date',
        },
    }
    dataset_type = {
        'trans': 'timeseries',
    }
    m = MultiMetadata(dataset, dataset_attrs=dataset_attrs, dataset_type=dataset_type)

    # The three `config` assignments below are alternatives: each assignment
    # replaces the previous one, so only the LAST value is passed to `fit`.

    # Option 1 - per-table configuration.
    # The anonymizer for MultiDataset supports the same configuration as for a
    # Dataset. The difference is that the configuration is also indexed on the
    # tables. If a column is a foreign key in another table, the anonymizer
    # will automatically anonymize the corresponding columns.
    config = {
        'district': {
            'a1': r'[0-9]{4}-[A-Z]{5}',
            'a_label': {
                # One list entry per column name (presumably 'a_label' is a
                # free-form label for this group of columns - TODO confirm).
                'cols': ['a2', 'a3'],
                'type': AnonymizerType.HOSTNAME,
            },
        },
        'disp': {
            'account_id': r'[0-9]{4}-[A-Z]{5}',
        },
    }

    # Option 2 - global primary-key anonymization.
    # It is possible to automatically have the primary keys anonymized
    # globally. The configuration below will anonymize all primary keys and
    # their references everywhere in the MultiDataset.
    config = {
        'anonymize_primary_keys': True,
        'disp': {
            'account_id': {
                "type": "regex",
                "regex": r'[0-9]{4}-[A-Z]{5}',
            },
        },
    }

    # Option 3 - primary-key anonymization for selected tables only.
    # Alternatively, it is possible to specify which table should have its
    # primary key columns anonymized.
    config = {
        'district': {
            'anonymize_primary_keys': True,
        },
        'disp': {
            'account_id': r'[0-9]{4}-[A-Z]{5}',
        },
    }

    # Fit the synthesizer with the anonymization configuration selected above.
    synth = MultiTableSynthesizer()
    synth.fit(dataset, m, anonymize=config, reference_table_limit=50_000)