pdstools.explanations.Preprocess¶
Classes¶
Module Contents¶
- class Preprocess(explanations: pdstools.explanations.Explanations.Explanations)¶
Bases: pdstools.utils.namespaces.LazyNamespace
- Parameters:
explanations (pdstools.explanations.Explanations.Explanations)
- dependencies = ['duckdb', 'polars']¶
- dependency_group = 'explanations'¶
- SEP = ', '¶
- LEFT_PREFIX = 'l'¶
- RIGHT_PREFIX = 'r'¶
- explanations¶
- explanations_folder¶
- data_file¶
- data_foldername = 'aggregated_data'¶
- data_folderpath¶
- from_date¶
- to_date¶
- model_name¶
- model_context_limit¶
- query_batch_limit¶
- file_batch_limit¶
- memory_limit¶
- thread_count¶
- progress_bar¶
- _conn = None¶
- unique_contexts_filename = 'Instance of pathlib.Path/unique_contexts.json'¶
- generate()¶
Process explanation parquet files and save calculated aggregates.
This method reads the explanation data from the provided location and creates aggregates for multiple contexts, which are used to create global explanation plots.
The different context aggregates are as follows:
i) Overall Numeric Predictor Contributions
The average contribution towards predicted model propensity for each numeric predictor value decile.
ii) Overall Symbolic Predictor Contributions
The average contribution towards predicted model propensity for each symbolic predictor value.
iii) Context Specific Numeric Predictor Contributions
The average contribution towards predicted model propensity for each numeric predictor value decile, grouped by context key partition.
iv) Context Specific Symbolic Predictor Contributions
The average contribution towards predicted model propensity for each symbolic predictor value, grouped by context key partition.
Each of the aggregates is written to parquet files in a temporary output directory.
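The per-decile averaging behind the "Overall Numeric Predictor Contributions" aggregate can be sketched in plain Python. This is illustrative only: the library computes these aggregates with DuckDB SQL over parquet files, and the toy data and variable names here are assumptions, not the actual schema.

```python
from statistics import mean

# Toy data: (numeric_value, contribution) pairs for one numeric predictor.
# In the real pipeline these would come from the explanation parquet files.
rows = [(float(v), 0.01 * v) for v in range(1, 101)]

# Rank-order the values, assign each row to a decile (0-9), then average
# the contributions per decile -- the shape of the "Overall Numeric
# Predictor Contributions" aggregate described above.
rows.sort(key=lambda r: r[0])
n = len(rows)
deciles: dict[int, list[float]] = {}
for i, (value, contribution) in enumerate(rows):
    d = min(i * 10 // n, 9)  # clamp so the last row stays in decile 9
    deciles.setdefault(d, []).append(contribution)

avg_contrib = {d: mean(cs) for d, cs in sorted(deciles.items())}
```

The context-specific variants (iii and iv) apply the same averaging after first grouping rows by their context key partition.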
- static _clean_query(query)¶
- _is_cached()¶
- _validate_explanations_folder()¶
- static _validate_raw_data(file_paths: list[str]) None¶
Validate that raw explanation parquet files have the expected schema.
Reads only the parquet metadata (no row scan) for each file and checks that every required column is present. Also verifies that at least one of symbolic_value / numeric_value is present, since the SQL aggregations need at least one of them to bin against.
- Raises:
ValueError – If any required column is missing, or neither symbolic_value nor numeric_value is present, with a message naming the offending file and missing columns.
- Parameters:
file_paths (list[str])
- Return type:
None
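The column checks described above can be sketched as pure Python over an already-extracted list of column names. This is a hypothetical helper, not the library's implementation: the required column set is an assumption, and in the real method the column names would be read from parquet metadata (e.g. via pyarrow.parquet.read_schema) without scanning any rows.

```python
def validate_columns(file_name: str, columns: list[str], required: list[str]) -> None:
    """Raise ValueError if `columns` fails the checks _validate_raw_data describes."""
    # Every required column must be present; name the file and what is missing.
    missing = [c for c in required if c not in columns]
    if missing:
        raise ValueError(f"{file_name}: missing required columns {missing}")
    # At least one value column is needed for the SQL aggregations to bin against.
    if "symbolic_value" not in columns and "numeric_value" not in columns:
        raise ValueError(
            f"{file_name}: neither 'symbolic_value' nor 'numeric_value' is present"
        )
```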
- _run_agg(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- _create_in_mem_table(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- static _create_unique_contexts_file(filename, data)¶
- _get_contexts(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- _agg_in_batches(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- _agg_overall(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE, where_condition='TRUE')¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- _delete_in_mem_table(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- static _get_table_name(predictor_type) pdstools.explanations.ExplanationsUtils._TABLE_NAME¶
- Return type:
pdstools.explanations.ExplanationsUtils._TABLE_NAME
- _get_create_table_sql_formatted(tbl_name: pdstools.explanations.ExplanationsUtils._TABLE_NAME, predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- Parameters:
tbl_name (pdstools.explanations.ExplanationsUtils._TABLE_NAME)
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- _parquet_in_batches(file_batch_nb: str, query_batches: dict[str, list[str]], predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- _parquet_overall(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE, where_condition='TRUE')¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- _write_to_parquet(df: polars.DataFrame, file_name: str)¶
- Parameters:
df (polars.DataFrame)
file_name (str)
- _read_overall_sql_file(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- _read_batch_sql_file(predictor_type: pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)¶
- Parameters:
predictor_type (pdstools.explanations.ExplanationsUtils._PREDICTOR_TYPE)
- _read_resource_file(package_name, filename_w_ext)¶
- _get_model_contexts_sql_formatted(tbl_name: pdstools.explanations.ExplanationsUtils._TABLE_NAME)¶
- Parameters:
tbl_name (pdstools.explanations.ExplanationsUtils._TABLE_NAME)
- _get_overall_sql_formatted(sql, tbl_name: pdstools.explanations.ExplanationsUtils._TABLE_NAME, where_condition)¶
- Parameters:
tbl_name (pdstools.explanations.ExplanationsUtils._TABLE_NAME)
- _get_batch_sql_formatted(sql, tbl_name: pdstools.explanations.ExplanationsUtils._TABLE_NAME, where_condition='TRUE')¶
- Parameters:
tbl_name (pdstools.explanations.ExplanationsUtils._TABLE_NAME)
- _get_selected_files()¶
- _populate_selected_files()¶
- _populate_selected_files_from_local()¶