Skip to content

Multitable Anonymization

This section demonstrates how to use the Anonymization for Databases module in ydata-sdk.

Don't forget to set up your license key

    import os

    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

from ydata.metadata.multimetadata import MultiMetadata
from ydata.preprocessors.methods.anonymization import AnonymizerType
from ydata.synthesizers.multitable.model import MultiTableSynthesizer

if __name__ == "__main__":
    from ydata.connectors.storages.rdbms_connector import MySQLConnector

    # Connection settings -- replace the placeholder values with real ones.
    DB_TYPE = 'mysql'
    DB_PYTHON_LIBRARY = 'pymysql'
    USERNAME = 'username'
    PASSWORD = 'password'
    HOSTNAME = 'hostname'
    PORT = '3306'
    DATABASE_NAME = 'database_name'
    # Connection URI assembled from the settings above. It is shown for
    # reference only: the connector below is configured from `conn_str`.
    URI_STRUCTURE = f"{DB_TYPE}+{DB_PYTHON_LIBRARY}://{USERNAME}:{PASSWORD}@{HOSTNAME}:{PORT}/{DATABASE_NAME}"

    conn_str = {
        "hostname": HOSTNAME,
        "username": USERNAME,
        "password": PASSWORD,
        "port": PORT,
        "database": DATABASE_NAME,
    }
    conn = MySQLConnector(conn_string=conn_str)
    # The schema is fetched here but not used by the rest of this example.
    schema = conn.get_database_schema()
    dataset = conn.read_database()

    # Cast the 'date' column of each of these tables to datetime, using the
    # "%y%m%d" format.
    for table_name in ('trans', 'account', 'loan'):
        dataset[table_name].astype(column="date", vartype="datetime", format="%y%m%d")

    # Per-table attributes: 'trans' is sorted by its 'date' column ...
    dataset_attrs = {
        'trans': {
            'sortbykey': 'date',
        },
    }

    # ... and is declared as a time-series table.
    dataset_type = {
        'trans': 'timeseries',
    }

    m = MultiMetadata(dataset, dataset_attrs=dataset_attrs, dataset_type=dataset_type)

    # NOTE: the three `config` assignments below illustrate alternative
    # configurations. Each one replaces the previous, so only the last one is
    # actually passed to `synth.fit`.

    # Option 1 -- per-table configuration.
    # The anonymizer for a MultiDataset supports the same configuration as for
    # a Dataset. The difference is that the configuration is also indexed by
    # table. If a column is a foreign key in another table, the anonymizer
    # will automatically anonymize the corresponding columns.
    config = {
        'district': {
            'a1': r'[0-9]{4}-[A-Z]{5}',
            'a_label': {
                # Each column name is its own list entry.
                'cols': ['a2', 'a3'],
                'type': AnonymizerType.HOSTNAME,
            },
        },
        'disp': {
            'account_id': r'[0-9]{4}-[A-Z]{5}',
        },
    }

    # Option 2 -- anonymize primary keys globally.
    # It is possible to have the primary keys anonymized automatically. The
    # configuration below will anonymize all primary keys and their references
    # everywhere in the MultiDataset.
    config = {
        'anonymize_primary_keys': True,
        'disp': {
            'account_id': {
                "type": "regex",
                "regex": r'[0-9]{4}-[A-Z]{5}',
            },
        },
    }

    # Option 3 -- anonymize primary keys of selected tables only.
    # Alternatively, it is possible to specify which tables should have their
    # primary key columns anonymized.
    config = {
        'district': {
            'anonymize_primary_keys': True,
        },
        'disp': {
            'account_id': r'[0-9]{4}-[A-Z]{5}',
        },
    }

    synth = MultiTableSynthesizer()
    synth.fit(dataset, m, anonymize=config, reference_table_limit=50_000)