Source code for mip_dmp.process.mapping
# Copyright 2023 The HIP team, University Hospital of Lausanne (CHUV), Switzerland & Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module that provides functions to support the mapping of datasets to a specific CDEs metadata schema."""
# External imports
import ast
import os
import numpy as np
import pandas as pd
# Disable Tensorflow warnings, other options are:
# - 0 (default): all messages are logged (default behavior)
# - 1: INFO messages are not printed
# - 2: INFO and WARNING messages are not printed
# - 3: INFO, WARNING, and ERROR messages are not printed
# Note: this has to be done before importing tensorflow
# that is done when importing chars2vec in mip_dmp/io.py
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # noqa
# Constants
MAPPING_TABLE_COLUMNS = {
"dataset_column": [],
"cde_code": [],
"cde_type": [],
"transform_type": [],
"transform": [],
}
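# Illustrative note: this dictionary of empty column lists can be used to
# initialize an empty mapping table, e.g. pd.DataFrame(MAPPING_TABLE_COLUMNS);
# how such a table is populated depends on the calling code.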
def map_dataset(dataset, mappings, cde_codes):
"""Map the dataset to the schema.
Parameters
----------
dataset : pandas.DataFrame
Dataset to be mapped.
mappings : list of dict
List of mappings of the dataset columns to the schema CDE codes.
cde_codes : list
List of codes of the CDE metadata schema.
Returns
-------
pandas.DataFrame
Mapped dataset.
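
Examples
--------
A minimal, hypothetical illustration; the column name, CDE code, and
mapping values below are made up for demonstration and do not come from
a real CDE metadata schema:

>>> dataset = pd.DataFrame({"sex": ["M", "F", "M"]})
>>> mappings = [
...     {
...         "dataset_column": "sex",
...         "cde_code": "gender",
...         "cde_type": "binomial",
...         "transform_type": "map",
...         "transform": "{'M': 'male', 'F': 'female'}",
...     }
... ]
>>> map_dataset(dataset, mappings, ["gender"])  # doctest: +SKIP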
"""
# Create a list to hold the mapped columns.
mapped_columns = []
# Convert the list of mappings to a dictionary using cde_code as the key
mapping_dict = {mapping["cde_code"]: mapping for mapping in mappings}
print(f"len(mapping_dict) = {len(mapping_dict)}")
# Map and apply transformation to each dataset column described in the
# mapping JSON file.
for cde_code in cde_codes:
if cde_code in mapping_dict:
mapping = mapping_dict[cde_code]
# Extract the mapping information of the column.
dataset_column = mapping["dataset_column"]
cde_code = mapping["cde_code"]
cde_type = mapping["cde_type"]
transform_type = mapping["transform_type"]
transform = mapping["transform"]
print(
f" > Process column {dataset_column} with CDE code {cde_code}, CDE type {cde_type}, transform type {transform_type}, and transform {transform}"
)
# If the column is present in the dataset, copy the dataset column to
# the mapped dataset for which the column name is the CDE code, map
# the input data to the CDE code, apply the transformation, and append
# to the list of mapped columns.
if dataset_column in dataset.columns:
mapped_columns.append(
transform_dataset_column(
dataset[dataset_column].rename(cde_code),
cde_code,
cde_type,
transform_type,
transform,
)
)
else:
print(f"WARNING: No mapping found for CDE code {cde_code}. Fill with NaN.")
mapped_columns.append(pd.DataFrame(columns=[cde_code]))
mapped_dataset = pd.concat(mapped_columns, axis=1)
# Return the mapped dataset.
print(mapped_dataset)
return mapped_dataset
def transform_dataset_column(
dataset_column, cde_code, cde_type, transform_type, transform
):
"""Transform the dataset column.
Parameters
----------
dataset_column : pandas.Series
Dataset column to be transformed.
cde_code : str
CDE code of the dataset column.
cde_type : str
CDE type of the dataset column. Can be "binomial", "multinomial", "integer" or "real".
transform_type : str
Type of transformation to be applied to the dataset column.
Can be "map" or "scale".
transform : str
Transformation to be applied to the dataset column.
Can be a JSON string for the "map" transformation type or a scaling factor.
Returns
-------
dataset_column: pandas.Series
The transformed dataset column.
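
Examples
--------
A hypothetical scaling of an age column from years to days; the column
name, CDE code, and scaling factor are illustrative only:

>>> column = pd.Series([2, 4], name="age_days")
>>> transform_dataset_column(column, "age_days", "integer", "scale", "365")  # doctest: +SKIP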
"""
# Apply the transformation only if not NaN.
if transform_type == "map" and transform != "nan":
dataset_column = apply_transform_map(dataset_column, transform)
elif transform_type == "scale" and transform != "nan":
# Apply the scaling factor.
scaling_factor = float(transform)
dataset_column = apply_transform_scale(
dataset_column, cde_code, cde_type, scaling_factor
)
else:
print(f"WARNING: No transformation applied for output column {cde_code}.")
return dataset_column
def apply_transform_map(dataset_column, transform):
"""Apply the transform map for binomial and multinominal variables.
Parameters
----------
dataset_column : pandas.Series
Dataset column to be transformed.
transform : str
Transformation to be applied to the dataset column,
given as a JSON-like string that maps old values to new values.
Returns
-------
dataset_column: pandas.Series
The transformed dataset column.
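
Examples
--------
A hypothetical mapping of free-text answers to CDE categories; the column
name and values are illustrative only:

>>> column = pd.Series(["Yes", "No"], name="smoking")
>>> apply_transform_map(column, "{'yes': 1, 'no': 0}")  # doctest: +SKIP
"""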
# Parse the mapping values from the JSON-like string.
# ast.literal_eval only evaluates Python literals and is safer than eval.
mapping_values = ast.literal_eval(transform)
dataset_column = dataset_column.map(
lambda x: x.lower() if isinstance(x, str) else x
)
# Map the values.
for old_value, new_value in mapping_values.items():
old_value = old_value.lower()
dataset_column.loc[dataset_column == old_value] = new_value
return dataset_column
def apply_transform_scale(dataset_column, cde_code, cde_type, scaling_factor):
"""Apply the transform scale for real and integer variables.
Parameters
----------
dataset_column : pandas.Series
Dataset column to be transformed.
cde_code : str
CDE code of the dataset column.
cde_type : str
CDE type of the dataset column. Can be "binomial", "multinomial", "integer" or "real".
scaling_factor : float
Scaling factor to be applied to the dataset column.
Returns
-------
dataset_column: pandas.Series
The transformed dataset column.
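
Examples
--------
A hypothetical rescaling of a weight column from kilograms to grams; the
CDE code and scaling factor are illustrative only:

>>> column = pd.Series([70.5, None], name="weight_grams")
>>> apply_transform_scale(column, "weight_grams", "real", 1000.0)  # doctest: +SKIP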
"""
# Check if the column contains any NaN values. If not, the whole column is
# cast and scaled at once; otherwise, only the non-NaN values are cast and scaled.
if not dataset_column.isnull().values.any():
# Cast the column to the correct type and apply the scaling factor.
try:
if cde_type == "integer":
dataset_column = dataset_column.astype(int) * int(scaling_factor)
elif cde_type == "real":
dataset_column = dataset_column.astype(float) * scaling_factor
except ValueError:
print(f"WARNING: The column {cde_code} could not be cast to {cde_type}.")
else:
# Cast and scale only the non-NaN values.
if cde_type == "integer":
dataset_column_list = [
np.nan if pd.isnull(x) else int(float(x)) * int(scaling_factor)
for x in dataset_column
]
elif cde_type == "real":
dataset_column_list = [
np.nan if pd.isnull(x) else float(x) * scaling_factor
for x in dataset_column
]
# Rebuild the column as a Series, preserving the original index.
dataset_column = pd.Series(dataset_column_list, index=dataset_column.index, name=cde_code)
return dataset_column