gamcoach.gamcoach

Main module for GAM Coach.

GAM Coach implements a simple and flexible method to generate counterfactual explanations for generalized additive models (GAMs).

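The listing below is a minimal usage sketch, not part of the module itself. It assumes a trained ExplainableBoostingClassifier and NumPy arrays x_train, y_train, and x_test (illustrative names), and uses only the public GAMCoach API shown in the source that follows.

import numpy as np
from interpret.glassbox import ExplainableBoostingClassifier

from gamcoach.gamcoach import GAMCoach

# Train an EBM with pairwise interactions (x_train and y_train are assumed
# to be your own NumPy arrays).
ebm = ExplainableBoostingClassifier(interactions=10)
ebm.fit(x_train, y_train)

# GAMCoach computes feature distances (MADs and level frequencies)
# from the training data.
coach = GAMCoach(ebm, x_train)

# Generate five counterfactuals for one rejected example, letting only a
# few (illustrative) features change and keeping one of them an integer.
cfs = coach.generate_cfs(
    cur_example=x_test[0],
    total_cfs=5,
    features_to_vary=["loan_amnt", "term", "fico_score"],
    continuous_integer_features=["fico_score"],
)

The returned Counterfactuals object stores each solution together with its distance and the changed features (see generate_cfs below).
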
   1"""Main module for GAM Coach.
   2
   3GAM Coach implements a simple and flexible method to generate counterfactual
   4explanations for generalized additive models (GAMs).
   5"""
   6
   7import numpy as np
   8import pandas as pd
   9import re
  10import pulp
  11from tqdm import tqdm
  12from scipy.stats import gaussian_kde
  13from interpret.glassbox import (
  14    ExplainableBoostingClassifier,
  15    ExplainableBoostingRegressor,
  16)
  17from collections import Counter
  18from typing import Union
  19
  20from .counterfactuals import Counterfactuals
  21
  22SEED = 922
  23
  24
  25class GAMCoach:
  26    """Main class for GAM Coach."""
  27
  28    def __init__(
  29        self,
  30        ebm: Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor],
  31        x_train: np.ndarray,
  32        cont_mads=None,
  33        cat_distances=None,
  34        adjust_cat_distance=True,
  35    ):
  36        """Initialize a GAMCoach object.
  37
  38        Args:
  39            ebm (Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor]):
  40                The trained EBM model. It can be either a classifier or a regressor.
  41            x_train (np.ndarray): The training data. It is used to compute the
  42                distance for different features.
  43            cont_mads (dict, optional): `feature_name` -> `median absolute
  44                deviation score`. If it is provided, it is used to overwrite the
  45                computed MADs for continuous variables. It is useful when you
  46                want to provide a custom normalization function to compute the
  47                distance between continuous features.
  48            cat_distances (dict, optional): `feature_name` -> {`level_name` -> `distance`}.
  49                Level distance of categorical variables. By default, the distance
  50                is computed by (1 - frequency(level)) for each level. It implies
  51                that it is easier to move to a more frequent level. If `cat_distances`
  52                is provided, it will overwrite the default distance for
  53                categorical variables.
  54            adjust_cat_distance (bool, optional): If true, we use (1 -
  55                frequency(level)) for each level. Otherwise, we give distance = 1
  56                for different levels and 0 for the same level.
  57        """
  58
  59        self.ebm: Union[
  60            ExplainableBoostingClassifier, ExplainableBoostingRegressor
  61        ] = ebm
  62        """The trained EBM model."""
  63
  64        self.x_train: np.ndarray = x_train
  65
  66        self.cont_mads: dict = cont_mads
  67        """Median absolute deviation (MAD) of continuous variables."""
  68
  69        self.cat_distances: dict = cat_distances
  70        """Level distance of categorical variables. By default, the distance is
  71        computed by $(1 - \\frac{\\text{count of} L_i}{\\text{count of all L}})$
  72        for one level $L_i$. It implies that it is easier to move to a more
  73        frequent level.
  74        """
  75
  76        self.adjust_cat_distance: bool = adjust_cat_distance
  77
  78        # If cont_mads is not given, we compute it from the training data
  79        if self.cont_mads is None:
  80            ebm_cont_indexes = np.array(
  81                [
  82                    i
  83                    for i in range(len(self.ebm.feature_names))
  84                    if self.ebm.feature_types[i] == "continuous"
  85                ]
  86            )
  87
  88            self.cont_mads = {}
  89
  90            for i in ebm_cont_indexes:
  91                self.cont_mads[ebm.feature_names[i]] = self.compute_mad(
  92                    self.x_train[:, i]
  93                )
  94
  95        # If cat_distances is not given, we compute it from the training data
  96        if self.cat_distances is None:
  97            ebm_cat_indexes = np.array(
  98                [
  99                    i
 100                    for i in range(len(self.ebm.feature_names))
 101                    if self.ebm.feature_types[i] == "categorical"
 102                ]
 103            )
 104
 105            self.cat_distances = {}
 106
 107            if self.adjust_cat_distance:
 108                for i in ebm_cat_indexes:
 109                    self.cat_distances[
 110                        self.ebm.feature_names[i]
 111                    ] = GAMCoach.compute_frequency_distance(self.x_train[:, i])
 112            else:
 113                for i in ebm_cat_indexes:
 114                    self.cat_distances[
 115                        self.ebm.feature_names[i]
 116                    ] = GAMCoach.compute_naive_cat_distance(self.x_train[:, i])
 117
 118        # Determine if the ebm is a classifier or a regressor
 119        self.is_classifier = isinstance(self.ebm.intercept_, np.ndarray)
 120        """True if the ebm model is a classifier, false if it is a regressor."""
 121
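Both cont_mads and cat_distances can be supplied directly to override the values the constructor derives from x_train. A hedged sketch, with entirely illustrative feature names and numbers (ebm and x_train are carried over from the example above):

custom_mads = {"loan_amnt": 5000.0, "annual_inc": 20000.0}
custom_cat_distances = {
    "home_ownership": {"RENT": 0.2, "MORTGAGE": 0.4, "OWN": 0.6},
}

coach = GAMCoach(
    ebm,
    x_train,
    cont_mads=custom_mads,               # overrides the computed MADs
    cat_distances=custom_cat_distances,  # overrides the (1 - frequency) distances
)
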
 122    def generate_cfs(
 123        self,
 124        cur_example: np.ndarray,
 125        total_cfs: int = 1,
 126        target_range: tuple = None,
 127        sim_threshold_factor: float = 0.005,
 128        sim_threshold: float = None,
 129        categorical_weight: Union[float, str] = "auto",
 130        features_to_vary: list = None,
 131        max_num_features_to_vary: int = None,
 132        feature_ranges: dict = None,
 133        continuous_integer_features: list = None,
 134        verbose: int = 1,
 135    ) -> Counterfactuals:
 136        """Generate counterfactual examples.
 137
 138        Use mixed-integer linear programming to generate optimal counterfactual
 139        examples for the given data point.
 140
 141        Args:
 142            cur_example (np.ndarray): The data point of interest. This function
 143                aims to find similar examples for which the model gives
 144                different predictions.
 145            total_cfs (int, optional): The total number of counterfactuals to
 146                generate. Defaults to 1.
 147            target_range (tuple, optional): The targeted prediction range. This
 148                parameter is required if the EBM is a regressor.
 149            sim_threshold_factor (float, optional): A positive float to automatically
 150                generate a similarity threshold. This parameter has no effect if
 151                `sim_threshold` is provided. If `sim_threshold` is
 152                not provided, we compute `sim_threshold` as `sim_threshold_factor`
 153                * average additive score range of all continuous features. If
 154                `sim_threshold_factor` is too small, it takes a longer time to
 155                generate CFs. If `sim_threshold_factor` is too large, the
 156                algorithm might miss some optimal CFs.
 157            sim_threshold (float, optional): A positive float to determine how we
 158                decide if two bins of a continuous feature have similar scores.
 159                Two bins $b_1$ and $b_2$ are similar (the distant one will be
 160                removed) if $|b_1 - b_2| \\leq$ `sim_threshold`.
 161            categorical_weight (Union[float, str], optional): A positive float
 162                to scale the distances of options for categorical variables. Since
 163                we have very different distance functions for continuous and
 164                categorical features, we need to scale them so they are at a
 165                comparable range. To do that, we multiply the categorical feature's
 166                distances by `categorical_weight`. By default ('auto'), we scale
 167                the distances of categorical features so that they have the same
 168                mean distance as continuous features.
 169            features_to_vary ([str], optional): A list of feature names that
 170                the CFs can change. If it is `None`, this function will use all
 171                features.
 172            max_num_features_to_vary (int, optional): The max number of features
 173                that the CF can vary. Default is no maximum.
 174            feature_ranges (dict, optional): A dictionary to control the permitted
 175                ranges/values for continuous/categorical features. It maps
 176                `feature_name` -> [`min_value`, `max_value`] for continuous features,
 177                `feature_name` -> [`level1`, `level2`, ...] for categorical features.
 178            continuous_integer_features (list, optional): A list of names of
 179                continuous features that need to be integers (e.g., age, FICO score).
 180            verbose (int): 0: no output, 1: show a progress bar, 2: show internal
 181                optimization details.
 182
 183        Returns:
 184            Counterfactuals: The generated counterfactual examples with their
 185                associated distances and change information.
 186        """
 187
 188        # Transforming some parameters
 189        if len(cur_example.shape) == 1:
 190            cur_example = cur_example.reshape(1, -1)
 191
 192        if features_to_vary is None:
 193            features_to_vary = [
 194                self.ebm.feature_names[i]
 195                for i in range(len(self.ebm.feature_types))
 196                if self.ebm.feature_types[i] != "interaction"
 197            ]
 198
 199        # Step 1: Find the current score for each feature
 200        # This is done by ebm.explain_local()
 201        cur_scores = {}
 202
 203        if self.is_classifier:
 204            cur_scores["intercept"] = self.ebm.intercept_[0]
 205        else:
 206            cur_scores["intercept"] = self.ebm.intercept_
 207
 208        local_data = self.ebm.explain_local(cur_example)._internal_obj
 209
 210        for i in range(len(self.ebm.feature_names)):
 211            cur_feature_name = self.ebm.feature_names[i]
 212            cur_feature_type = self.ebm.feature_types[i]
 213
 214            cur_scores[cur_feature_name] = local_data["specific"][0]["scores"][i]
 215
 216        # Find the CF direction
 217
 218        # Binary classification
 219        # Predicted 0 => +1
 220        # Predicted 1 => -1
 221        if self.is_classifier:
 222            cf_direction = self.ebm.predict(cur_example)[0] * (-2) + 1
 223            total_score = np.sum([cur_scores[k] for k in cur_scores])
 224            needed_score_gain = -total_score
 225            score_gain_bound = None
 226
 227        else:
 228            # Regression
 229            # Increase +1
 230            # Decrease -1
 231            if target_range is None:
 232                raise ValueError(
 233                    "target_range cannot be None when the model is a regressor"
 234                )
 235
 236            predicted_value = self.ebm.predict(cur_example)[0]
 237            if (
 238                predicted_value >= target_range[0]
 239                and predicted_value <= target_range[1]
 240            ):
 241                raise ValueError("The target_range must not include the current prediction")
 242
 243            elif predicted_value < target_range[0]:
 244                cf_direction = 1
 245                needed_score_gain = target_range[0] - predicted_value
 246                score_gain_bound = target_range[1] - predicted_value
 247            else:
 248                cf_direction = -1
 249                needed_score_gain = target_range[1] - predicted_value
 250                score_gain_bound = target_range[0] - predicted_value
 251
 252        # Step 2: Generate continuous and categorical options
 253        options = {}
 254
 255        # Generate a similarity threshold if it is not provided
 256        if sim_threshold is None:
 257            additive_ranges = []
 258
 259            for i in range(len(self.ebm.feature_names)):
 260                if self.ebm.feature_types[i] == "continuous":
 261                    cur_values = self.ebm.additive_terms_[i]
 262                    additive_ranges.append(np.max(cur_values) - np.min(cur_values))
 263
 264            sim_threshold = np.mean(additive_ranges) * sim_threshold_factor
 265
 266        # To make it faster to solve the MILP problem, we can decrease the
 267        # number of variables by filtering out unhelpful and redundant options
 268        #
 269        # (1) Unhelpful options: options that move the score to an undesirable
 270        # direction. For example, if we want to flip 0 to 1, options that decrease
 271        # the score are unhelpful.
 272        #
 273        # (2) Redundant options: for a set of options that give similar score
 274        # gains (bounded by a parameter epsilon), we only need to include one
 275        # option that has the lowest distance. This is only relevant for
 276        # continuous variables. Users can set the parameter epsilon. The default
 277        # should be relatively small, otherwise we might miss the optimal solution.
 278
 279        # Step 2.1: Find all good options from continuous and categorical features
 280        for cur_feature_id in range(len(self.ebm.feature_names)):
 281
 282            cur_feature_name = self.ebm.feature_names[cur_feature_id]
 283            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 284            cur_feature_index = self.ebm.feature_groups_[cur_feature_id][0]
 285
 286            if cur_feature_type == "interaction":
 287                continue
 288
 289            elif cur_feature_type == "continuous":
 290                # The parameter epsilon controls the threshold of how we determine
 291                # "similar" options for continuous variables
 292                epsilon = sim_threshold
 293
 294                cur_feature_score = cur_scores[cur_feature_name]
 295                cur_feature_value = float(cur_example[0][cur_feature_id])
 296
 297                # Users can require the continuous feature to have integer values
 298                # For example, age, FICO score, and number of accounts
 299                need_to_be_int = False
 300                if (
 301                    continuous_integer_features
 302                    and cur_feature_name in continuous_integer_features
 303                ):
 304                    need_to_be_int = True
 305
 306                cur_cont_options = self.generate_cont_options(
 307                    cf_direction,
 308                    cur_feature_index,
 309                    cur_feature_name,
 310                    cur_feature_value,
 311                    cur_feature_score,
 312                    self.cont_mads,
 313                    cur_example[0],
 314                    score_gain_bound,
 315                    epsilon,
 316                    need_to_be_int,
 317                )
 318
 319                options[cur_feature_name] = cur_cont_options
 320
 321            elif cur_feature_type == "categorical":
 322                cur_feature_score = cur_scores[cur_feature_name]
 323                cur_feature_value = str(cur_example[0][cur_feature_id])
 324                cur_cat_distance = self.cat_distances[cur_feature_name]
 325
 326                cur_cat_options = self.generate_cat_options(
 327                    cf_direction,
 328                    cur_feature_index,
 329                    cur_feature_value,
 330                    cur_feature_score,
 331                    cur_cat_distance,
 332                    cur_example[0],
 333                    score_gain_bound,
 334                )
 335
 336                options[cur_feature_name] = cur_cat_options
 337
 338        # Step 2.2: Filter out undesired options (based on feature_ranges)
 339        if feature_ranges is not None:
 340            for f_name in feature_ranges:
 341                cur_range = feature_ranges[f_name]
 342                f_index = self.ebm.feature_names.index(f_name)
 343                f_type = self.ebm.feature_types[f_index]
 344
 345                if f_type == "continuous":
 346                    # Delete options that target out-of-range values
 347                    for o in range(len(options[f_name]) - 1, -1, -1):
 348                        cur_target = options[f_name][o][0]
 349                        if cur_target < cur_range[0] or cur_target > cur_range[1]:
 350                            options[f_name].pop(o)
 351                elif f_type == "categorical":
 352                    for o in range(len(options[f_name]) - 1, -1, -1):
 353                        if options[f_name][o][0] not in cur_range:
 354                            options[f_name].pop(o)
 355
 356        # Step 2.3: Compute the interaction offsets for all possible options
 357        for cur_feature_id in range(len(self.ebm.feature_names)):
 358
 359            cur_feature_name = self.ebm.feature_names[cur_feature_id]
 360            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 361
 362            if cur_feature_type == "interaction":
 363
 364                cur_feature_index_1 = self.ebm.feature_groups_[cur_feature_id][0]
 365                cur_feature_index_2 = self.ebm.feature_groups_[cur_feature_id][1]
 366
 367                cur_feature_score = cur_scores[cur_feature_name]
 368                options[cur_feature_name] = self.generate_inter_options(
 369                    cur_feature_id,
 370                    cur_feature_index_1,
 371                    cur_feature_index_2,
 372                    cur_feature_score,
 373                    options,
 374                )
 375
 376        # Step 2.4: Rescale categorical distances so that they have the same mean
 377        # as continuous variables (default)
 378        if categorical_weight == "auto":
 379            cont_distances = []
 380            cat_distances = []
 381
 382            for f_name in options:
 383                f_index = self.ebm.feature_names.index(f_name)
 384                f_type = self.ebm.feature_types[f_index]
 385
 386                if f_type == "continuous":
 387                    for option in options[f_name]:
 388                        cont_distances.append(option[2])
 389                elif f_type == "categorical":
 390                    for option in options[f_name]:
 391                        cat_distances.append(option[2])
 392
 393            categorical_weight = np.mean(cont_distances) / np.mean(cat_distances)
 394
 395        for f_name in options:
 396            f_index = self.ebm.feature_names.index(f_name)
 397            f_type = self.ebm.feature_types[f_index]
 398
 399            if f_type == "categorical":
 400                for option in options[f_name]:
 401                    option[2] = option[2] * categorical_weight
 402
 403        # Step 3. Formulate the MILP model and solve it
 404
 405        # Find diverse solutions by iteratively muting variables used in previous optimal solutions
 406        solutions = []
 407        muted_variables = []
 408        is_successful = True
 409
 410        for _ in tqdm(range(total_cfs), disable=verbose == 0):
 411            model, variables = self.create_milp(
 412                cf_direction,
 413                needed_score_gain,
 414                features_to_vary,
 415                options,
 416                max_num_features_to_vary,
 417                muted_variables=muted_variables,
 418            )
 419
 420            model.solve(pulp.apis.PULP_CBC_CMD(msg=verbose > 0, warmStart=True))
 421
 422            if model.status != 1:
 423                is_successful = False
 424
 425            if verbose == 2:
 426                print("solver runs for {:.2f} seconds".format(model.solutionTime))
 427                print("status: {}".format(pulp.LpStatus[model.status]))
 428
 429            active_variables = []
 430
 431            # Print the optimal solution
 432            for key in variables:
 433                for x in variables[key]:
 434                    if x.varValue > 0:
 435                        active_variables.append(x)
 436
 437            if verbose == 2:
 438                print("\nFound solutions:")
 439                self.print_solution(cur_example, active_variables, options)
 440
 441            # Collect the current solution and mute the associated variables
 442            solutions.append([active_variables, pulp.value(model.objective)])
 443
 444            for var in active_variables:
 445                if " x " not in var.name:
 446                    muted_variables.append(var.name)
 447
 448        cfs = Counterfactuals(
 449            solutions, is_successful, model, variables, self.ebm, cur_example, options
 450        )
 451
 452        return cfs
 453
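A hedged sketch of constraining the search (the coach object and feature names are illustrative, carried over from the examples above). For a regressor, target_range must be supplied because there is no class boundary to cross:

cfs = coach.generate_cfs(
    cur_example=x_test[0],
    total_cfs=3,
    max_num_features_to_vary=2,
    feature_ranges={
        "loan_amnt": [1000, 10000],         # continuous: [min_value, max_value]
        "home_ownership": ["OWN", "RENT"],  # categorical: allowed levels
    },
    verbose=0,
)

# With an ExplainableBoostingRegressor, specify the desired prediction range:
# cfs = coach.generate_cfs(x_test[0], total_cfs=3, target_range=(15000, 20000))
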
 454    def generate_cont_options(
 455        self,
 456        cf_direction,
 457        cur_feature_index,
 458        cur_feature_name,
 459        cur_feature_value,
 460        cur_feature_score,
 461        cont_mads,
 462        cur_example,
 463        score_gain_bound=None,
 464        epsilon=0.005,
 465        need_to_be_int=False,
 466        skip_unhelpful=True,
 467    ):
 468        """
 469        Generate all alternative options for this continuous variable. This function
 470        filters out all options that are:
 471
 472        1. Not helpful for the counterfactual generation.
 473        2. Giving a similar score gain but requiring a larger distance.
 474
 475        Args:
 476            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
 477                (classification); `+1` if we need to increase the prediction,
 478                `-1` if decrease (regression).
 479            cur_feature_index (int): The index of the current continuous feature.
 480            cur_feature_name (str): Name of the current feature.
 481            cur_feature_value (float): The current feature value.
 482            cur_feature_score (float): The score for the current feature value.
 483            cont_mads (dict): A map of feature_name => MAD score.
 484            cur_example (list): Current sample values
 485            score_gain_bound (float): Bound of the score gain. We do not collect
 486                options that give `score_gain` > `score_gain_bound` (when
 487                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
 488                `cf_direction=-1`)
 489            epsilon (float): The threshold to determine if two options give similar
 490                score gains. Score gains $s_1$ and $s_2$ are similar if
 491                $|s_1 - s_2| <$ epsilon. Smaller epsilon significantly increases
 492                the time to solve the MILP. Large epsilon might filter out the
 493                optimal CF. Defaults to 0.005.
 494            need_to_be_int (bool): True if the target values for this continuous
 495                variable need to have integer values.
 496            skip_unhelpful (bool): True to skip options from main
 497                effects that give score gain in the opposite direction. It is rare
 498                that a positive score gain from a pair interaction outweighs the
 499                negative score gain from two main effects plus the distance penalty.
 500
 501        Returns:
 502            list: List of options [target, score_gain, distance, bin_index, inter_score_gains]
 503        """
 504
 505        # For each continuous feature, each bin is a variable
 506        # For each bin, we need to compute (1) score gain, (2) distance
 507        # (1) score gain is the difference between new bin and current bin
 508        # (2) distance is L1 distance divided by median absolute deviation (MAD)
 509
 510        # Get the additive scores of this feature
 511        additives = self.ebm.additive_terms_[cur_feature_index][1:]
 512
 513        # Get the bin edges of this feature
 514        bin_starts = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)[:-1]
 515
 516        # Create "options", each option is a tuple (target, score_gain, distance,
 517        # bin_index)
 518        cont_options = []
 519
 520        # Identify which bin this value falls into
 521        cur_bin_id = search_sorted_lower_index(bin_starts, cur_feature_value)
 522        assert additives[cur_bin_id] == cur_feature_score
 523
 524        # Identify interaction terms that we need to consider
 525        associated_interactions = []
 526
 527        for cur_feature_id in range(len(self.ebm.feature_names)):
 528            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 529            if cur_feature_type == "interaction":
 530
 531                indexes = self.ebm.feature_groups_[cur_feature_id]
 532
 533                if cur_feature_index in indexes:
 534                    feature_position = 0 if indexes[0] == cur_feature_index else 1
 535
 536                    other_position = 1 - feature_position
 537                    other_index = indexes[other_position]
 538                    other_type = self.ebm.feature_types[other_index]
 539
 540                    # Get the current additive scores and bin edges
 541                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 542
 543                    # Have to skip the max edge if it is continuous
 544                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
 545                        cur_feature_index
 546                    )[:-1]
 547
 548                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
 549                        other_index
 550                    )
 551                    if other_type == "continuous":
 552                        bin_starts_other = bin_starts_other[:-1]
 553
 554                    # Get the current interaction term score
 555                    other_bin = None
 556                    if other_type == "continuous":
 557                        other_bin = search_sorted_lower_index(
 558                            bin_starts_other, float(cur_example[other_index])
 559                        )
 560                    else:
 561                        other_bin = bin_starts_other.index(cur_example[other_index])
 562
 563                    feature_bin = search_sorted_lower_index(
 564                        bin_starts_feature, cur_feature_value
 565                    )
 566
 567                    feature_inter_score = 0
 568
 569                    if feature_position == 0:
 570                        feature_inter_score = inter_additives[feature_bin, other_bin]
 571                    else:
 572                        feature_inter_score = inter_additives[other_bin, feature_bin]
 573
 574                    # Extract the row or column where we fix the other feature and
 575                    # vary the current feature
 576                    feature_inter_bin_starts = bin_starts_feature
 577                    feature_inter_additives = []
 578
 579                    if feature_position == 0:
 580                        for i in range(len(inter_additives)):
 581                            feature_inter_additives.append(
 582                                inter_additives[i, other_bin]
 583                            )
 584                    else:
 585                        for i in range(len(inter_additives[0])):
 586                            feature_inter_additives.append(
 587                                inter_additives[other_bin, i]
 588                            )
 589
 590                    # Register this interaction term
 591                    associated_interactions.append(
 592                        {
 593                            "inter_index": indexes,
 594                            "cur_interaction_id": cur_feature_id,
 595                            "feature_inter_score": feature_inter_score,
 596                            "feature_inter_bin_starts": feature_inter_bin_starts,
 597                            "feature_inter_additives": feature_inter_additives,
 598                        }
 599                    )
 600
 601        for i in range(len(additives)):
 602            # Because of the special binning structure of EBM, the distance from
 603            # bins on the left to the current value is computed differently from
 604            # that of bins on the right
 605            #
 606            # For bins on the left, the raw distance is abs(bin_start[i + 1] - x)
 607            # For bins on the right, the raw distance is abs(bin_start[i] - x)
 608            target = cur_feature_value
 609            distance = 0
 610
 611            if i < cur_bin_id:
 612                # First consider whether the target needs to be an integer
 613                # If so, it is the closest integer to the right bin edge
 614                if need_to_be_int:
 615                    target = float(int(bin_starts[i + 1]))
 616                    if target == bin_starts[i + 1]:
 617                        target -= 1
 618
 619                    # Skip this option if it is not possible to find an int value
 620                    if target < bin_starts[i]:
 621                        continue
 622
 623                    distance = np.abs(target - cur_feature_value)
 624
 625                else:
 626                    target = bin_starts[i + 1]
 627                    distance = np.abs(target - cur_feature_value)
 628
 629                    # Subtract a very small value to make the target
 630                    # technically fall into the left bin
 631                    target -= 1e-4
 632
 633            elif i > cur_bin_id:
 634                # First consider whether the target needs to be an integer
 635                # If so, it is the closest integer to the left bin edge
 636                if need_to_be_int:
 637                    target = float(np.ceil(bin_starts[i]))
 638                    if target == bin_starts[i]:
 639                        target += 1
 640
 641                    # Skip this option if it is not possible to find an int value
 642                    if i + 1 < len(additives) and target >= bin_starts[i + 1]:
 643                        continue
 644
 645                    distance = np.abs(target - cur_feature_value)
 646
 647                else:
 648                    target = bin_starts[i]
 649                    distance = np.abs(target - cur_feature_value)
 650
 651            # Scale the distance based on the deviation of the feature (how changeable it is)
 652            if cont_mads[cur_feature_name] > 0:
 653                distance /= cont_mads[cur_feature_name]
 654
 655            # Compute score gain which has two parts:
 656            # (1) gain from the change of main effect
 657            # (2) gain from the change of interaction effect
 658
 659            # Main effect
 660            main_score_gain = additives[i] - cur_feature_score
 661
 662            # Interaction terms
 663            # A list to track all interaction score gain offsets
 664            # [[interaction id, interaction score gain]]
 665            inter_score_gain = 0
 666            inter_score_gains = []
 667
 668            for d in associated_interactions:
 669                inter_bin_id = search_sorted_lower_index(
 670                    d["feature_inter_bin_starts"], target
 671                )
 672                inter_score_gain += (
 673                    d["feature_inter_additives"][inter_bin_id]
 674                    - d["feature_inter_score"]
 675                )
 676                inter_score_gains.append(
 677                    [
 678                        d["cur_interaction_id"],
 679                        d["feature_inter_additives"][inter_bin_id]
 680                        - d["feature_inter_score"],
 681                    ]
 682                )
 683
 684            score_gain = main_score_gain + inter_score_gain
 685
 686            if cf_direction * score_gain <= 0 and skip_unhelpful:
 687                continue
 688
 689            # Filter out of bound options
 690            if score_gain_bound and skip_unhelpful:
 691                if cf_direction == 1 and score_gain > score_gain_bound:
 692                    continue
 693                if cf_direction == -1 and score_gain < score_gain_bound:
 694                    continue
 695
 696            cont_options.append([target, score_gain, distance, i, inter_score_gains])
 697
 698        # Now we can apply the second round of filtering to remove redundant options
 699        # Redundant options refer to bins that give similar score gain with larger distance
 700        cont_options = sorted(cont_options, key=lambda x: x[2])
 701
 702        start = 0
 703        while start < len(cont_options):
 704            for i in range(len(cont_options) - 1, start, -1):
 705                if np.abs(cont_options[i][1] - cont_options[start][1]) < epsilon:
 706                    cont_options.pop(i)
 707
 708            start += 1
 709
 710        return cont_options
 711
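For reference, each entry returned by this method is a five-element list; the values below are purely illustrative:

# [target, score_gain, distance, bin_index, inter_score_gains]
example_option = [
    705.0,         # target: the value to move this feature to
    0.35,          # score_gain: main-effect gain plus interaction offsets
    1.20,          # distance: |target - current value| / MAD of this feature
    8,             # bin_index: index of the target bin
    [[12, 0.02]],  # inter_score_gains: [interaction term id, score gain] pairs
]
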
 712    def generate_cat_options(
 713        self,
 714        cf_direction,
 715        cur_feature_index,
 716        cur_feature_value,
 717        cur_feature_score,
 718        cur_cat_distance,
 719        cur_example,
 720        score_gain_bound=None,
 721        skip_unhelpful=True,
 722    ):
 723        """
 724        Generate all alternative options for this categorical variable. This function
 725        filters out all options that are not helpful for the counterfactual
 726        generation.
 727
 728        Args:
 729            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
 730                (classification); `+1` if we need to increase the prediction,
 731                `-1` if decrease (regression).
 732            cur_feature_index (int): The index of the current categorical feature.
 733            cur_feature_value (str): The current feature value.
 734            cur_feature_score (float): The score for the current feature value.
 735            cur_cat_distance (dict): A map of feature_level => 1 - frequency.
 736            cur_example (list): Current sample values.
 737            score_gain_bound (float): Bound of the score gain. We do not collect
 738                options that give `score_gain` > `score_gain_bound` (when
 739                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
 740                `cf_direction=-1`)
 741            skip_unhelpful (bool): True to skip options from main
 742                effects that give score gain in the opposite direction. It is rare
 743                that a positive score gain from a pair interaction outweighs the
 744                negative score gain from two main effects plus the distance penalty.
 745
 746        Returns:
 747            list: List of options [target, score_gain, distance, bin_index, inter_score_gains].
 748        """
 749
 750        # Find other options for this categorical variable
 751        # For each option, we compute the (1) score gain, and (2) distance
 752        #
 753        # (1) Score gain is the same as continuous variables
 754        # (2) The distance is determined by 1 - the level frequency in the
 755        # training data. It implies that levels with high frequency are easier
 756        # to "move to"
 757
 758        # Get the additive scores of this feature
 759        additives = self.ebm.additive_terms_[cur_feature_index][1:]
 760
 761        # Get the levels of this feature
 762        levels = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)
 763
 764        # Create "options", each option is a tuple (target, score_gain, distance, bin_index)
 765        cat_options = []
 766
 767        # Identify interaction terms that we need to consider
 768        associated_interactions = []
 769
 770        for cur_feature_id in range(len(self.ebm.feature_names)):
 771            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 772            if cur_feature_type == "interaction":
 773
 774                indexes = self.ebm.feature_groups_[cur_feature_id]
 775
 776                if cur_feature_index in indexes:
 777                    feature_position = 0 if indexes[0] == cur_feature_index else 1
 778
 779                    other_position = 1 - feature_position
 780                    other_index = indexes[other_position]
 781                    other_type = self.ebm.feature_types[other_index]
 782                    other_name = self.ebm.feature_names[other_index]
 783
 784                    # Get the current additive scores and bin edges
 785                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 786
 787                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
 788                        cur_feature_index
 789                    )
 790                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
 791                        other_index
 792                    )
 793
 794                    # Have to skip the max edge if it is continuous
 795                    if other_type == "continuous":
 796                        bin_starts_other = bin_starts_other[:-1]
 797
 798                    # Get the current interaction term score
 799                    other_bin = None
 800                    if other_type == "continuous":
 801                        other_bin = search_sorted_lower_index(
 802                            bin_starts_other, float(cur_example[other_index])
 803                        )
 804                    else:
 805                        other_bin = bin_starts_other.index(cur_example[other_index])
 806
 807                    feature_bin = bin_starts_feature.index(cur_feature_value)
 808
 809                    feature_inter_score = 0
 810
 811                    if feature_position == 0:
 812                        feature_inter_score = inter_additives[feature_bin, other_bin]
 813                    else:
 814                        feature_inter_score = inter_additives[other_bin, feature_bin]
 815
 816                    # Extract the row or column where we fix the other features and
 817                    # vary the current feature
 818                    feature_inter_bin_starts = bin_starts_feature
 819                    feature_inter_additives = []
 820
 821                    if feature_position == 0:
 822                        for i in range(len(inter_additives)):
 823                            feature_inter_additives.append(
 824                                inter_additives[i, other_bin]
 825                            )
 826                    else:
 827                        for i in range(len(inter_additives[0])):
 828                            feature_inter_additives.append(
 829                                inter_additives[other_bin, i]
 830                            )
 831
 832                    # Register this interaction term
 833                    associated_interactions.append(
 834                        {
 835                            "inter_index": indexes,
 836                            "cur_interaction_id": cur_feature_id,
 837                            "feature_inter_score": feature_inter_score,
 838                            "feature_inter_bin_starts": feature_inter_bin_starts,
 839                            "feature_inter_additives": feature_inter_additives,
 840                        }
 841                    )
 842
 843        for i in range(len(additives)):
 844            if levels[i] != cur_feature_value:
 845                target = levels[i]
 846                distance = cur_cat_distance[target]
 847
 848                # Compute score gain which has two parts:
 849                # (1) gain from the change of main effect
 850                # (2) gain from the change of interaction effect
 851
 852                # Main effect
 853                main_score_gain = additives[i] - cur_feature_score
 854
 855                # Interaction terms
 856                # A list to track all interaction score gain offsets
 857                # [[interaction id, interaction score gain]]
 858                inter_score_gain = 0
 859                inter_score_gains = []
 860
 861                for d in associated_interactions:
 862                    inter_bin_id = d["feature_inter_bin_starts"].index(target)
 863                    inter_score_gain += (
 864                        d["feature_inter_additives"][inter_bin_id]
 865                        - d["feature_inter_score"]
 866                    )
 867                    inter_score_gains.append(
 868                        [
 869                            d["cur_interaction_id"],
 870                            d["feature_inter_additives"][inter_bin_id]
 871                            - d["feature_inter_score"],
 872                        ]
 873                    )
 874
 875                score_gain = main_score_gain + inter_score_gain
 876
 877                # Skip unhelpful options
 878                if cf_direction * score_gain <= 0 and skip_unhelpful:
 879                    continue
 880
 881                # Filter out of bound options
 882                if score_gain_bound and skip_unhelpful:
 883                    if cf_direction == 1 and score_gain > score_gain_bound:
 884                        continue
 885                    if cf_direction == -1 and score_gain < score_gain_bound:
 886                        continue
 887
 888                cat_options.append([target, score_gain, distance, i, inter_score_gains])
 889
 890        return cat_options
 891
 892    def generate_inter_options(
 893        self,
 894        cur_feature_id,
 895        cur_feature_index_1,
 896        cur_feature_index_2,
 897        cur_feature_score,
 898        options,
 899    ):
 900        """
 901        Generate all possible options for this interaction variable.
 902
 903        Interaction terms are interesting in this MILP. Each option counts as a
 904        variable, but each variable only affects the score gain, not the distance.
 905
 906        Note that in EBM, the bin definitions for interaction terms can be different
 907        from their definitions for individual continuous variables.
 908
 909        To model an interaction term, we can treat each of its options as a
 910        binary variable whose value is determined by the product of two main
 911        effect variables. Each interaction variable describes a combination of
 912        two main effect variables. Therefore, if continuous variable A has $x$
 913        possible options and another continuous variable B has $y$ possible
 914        options, we should add $x \\times y$ binary variables to offset their
 915        possible interaction effects.
 916
 917        Args:
 918            cur_feature_id (int): The id of this interaction effect.
 919            cur_feature_index_1 (int): The index of the first main effect.
 920            cur_feature_index_2 (int): The index of the second main effect.
 921            cur_feature_score (float): The score for the current feature value.
 922            options (dict): The current option list, feature_name ->
 923                [`target`, `score_gain`, `distance`, `bin_id`].
 924
 925        Returns:
 926            List of options [[target_1, target_2], score_gain, 0, [bin_1, bin_2], 0]
 927        """
 928
 929        # Get the sub-types for this interaction term
 930        cur_feature_type_1 = self.ebm.feature_types[cur_feature_index_1]
 931        cur_feature_type_2 = self.ebm.feature_types[cur_feature_index_2]
 932
 933        # Get the sub-names for this interaction term
 934        cur_feature_name_1 = self.ebm.feature_names[cur_feature_index_1]
 935        cur_feature_name_2 = self.ebm.feature_names[cur_feature_index_2]
 936
 937        # The first column and row are reserved for missing values (even with
 938        # categorical features)
 939        additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 940
 941        # Four possibilities here: cont x cont, cont x cat, cat x cont, cat x cat.
 942        # Each has a different way to lookup the bin table.
 943        inter_options = []
 944
 945        # Iterate through all possible combinations of options from these two
 946        # variables
 947        for opt_1 in options[cur_feature_name_1]:
 948            for opt_2 in options[cur_feature_name_2]:
 949
 950                bin_starts_1 = self.ebm.pair_preprocessor_._get_bin_labels(
 951                    cur_feature_index_1
 952                )
 953                bin_starts_2 = self.ebm.pair_preprocessor_._get_bin_labels(
 954                    cur_feature_index_2
 955                )
 956
 957                bin_1 = None
 958                bin_2 = None
 959
 960                if cur_feature_type_1 == "continuous":
 961                    if cur_feature_type_2 == "continuous":
 962                        # cont x cont
 963                        bin_starts_1 = bin_starts_1[:-1]
 964                        bin_starts_2 = bin_starts_2[:-1]
 965
 966                        # locate the bin for each option value
 967                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 968                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 969
 970                    else:
 971                        # cont x cat
 972                        bin_starts_1 = bin_starts_1[:-1]
 973
 974                        # locate the bin for each option value
 975                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 976                        bin_2 = bin_starts_2.index(opt_2[0])
 977
 978                else:
 979                    if cur_feature_type_2 == "continuous":
 980                        # cat x cont
 981                        bin_starts_2 = bin_starts_2[:-1]
 982
 983                        # locate the bin for each option value
 984                        bin_1 = bin_starts_1.index(opt_1[0])
 985                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 986
 987                    else:
 988                        # cat x cat
 989
 990                        # locate the bin for each option value
 991                        bin_1 = bin_starts_1.index(opt_1[0])
 992                        bin_2 = bin_starts_2.index(opt_2[0])
 993
 994                new_score = additives[bin_1, bin_2]
 995                score_gain = new_score - cur_feature_score
 996
 997                # The score gain on the interaction term needs to offset the interaction
 998                # score gain we have already counted on the main effect options. That
 999                # score is saved in the option tuple.
1000
1001                # We first need to find the common interaction id
1002                common_index = [-1, -1]
1003                for m in range(len(opt_1[4])):
1004                    for n in range(len(opt_2[4])):
1005                        if opt_1[4][m][0] == opt_2[4][n][0]:
1006                            common_index = [m, n]
1007                            break
1008
1009                    if common_index[0] != -1 and common_index[1] != -1:
1010                        break
1011
1012                score_gain -= opt_1[4][common_index[0]][1]
1013                score_gain -= opt_2[4][common_index[1]][1]
1014
1015                inter_options.append(
1016                    [[opt_1[0], opt_2[0]], score_gain, 0, [opt_1[3], opt_2[3]], 0]
1017                )
1018
1019        return inter_options
1020
1021    @staticmethod
1022    def create_milp(
1023        cf_direction,
1024        needed_score_gain,
1025        features_to_vary,
1026        options,
1027        max_num_features_to_vary=None,
1028        muted_variables=[],
1029    ):
1030        """
1031        Create a MILP to find counterfactuals (CF) using PuLP.
1032
1033        Args:
1034            cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification),
1035                +1 if we need to increase the prediction, -1 if decrease (regression).
1036            needed_score_gain (float): The score gain needed to achieve the CF goal.
1037            features_to_vary (list[str]): Feature names of features that the
1038                generated CF can change.
1039            options (dict): Possible options for each variable. Each option is a
1040                list [target, score_gain, distance, bin_index].
1041            max_num_features_to_vary (int, optional): Max number of features that the
1042                generated CF can change. If the value is `None`, the CFs can
1043                change any number of features.
1044            muted_variables (list[str], optional): Variables that this MILP should
1045                not use. This is useful to mute optimal variables so we can explore
1046                diverse solutions. This list should not include interaction variables.
1047
1048        Returns:
1049            A tuple (`model`, `variables`), where `model` is a pulp.LpProblem
1050            model that encodes the MILP problem, and `variables` is a dict of
1051            variables used in the `model`: `feature_name` => [`variables`].
1052        """
1053
1054        # Create a model (minimizing the distance)
1055        model = pulp.LpProblem("ebmCounterfactual", pulp.LpMinimize)
1056
1057        distance = 0
1058        score_gain = 0
1059
1060        muted_variables_set = set(muted_variables)
1061
1062        # Create variables
1063        variables = {}
1064        for f in features_to_vary:
1065            # Each variable encodes an option (0: not use this option,
1066            # 1: use this option)
1067            cur_variables = []
1068
1069            for option in options[f]:
1070                var_name = "{}:{}".format(f, option[3])
1071
1072                # Skip the muted variables
1073                if var_name in muted_variables_set:
1074                    continue
1075
1076                x = pulp.LpVariable(var_name, lowBound=0, upBound=1, cat="Binary")
1077                x.setInitialValue(0)
1078
1079                score_gain += option[1] * x
1080                distance += option[2] * x
1081
1082                cur_variables.append(x)
1083
1084            variables[f] = cur_variables
1085
1086            # A local constraint: we can select at most one option
1087            # per feature
1088            model += pulp.lpSum(cur_variables) <= 1
1089
1090        # Users can also set `max_num_features_to_vary` to control the total
1091        # number of features to vary
1092        if max_num_features_to_vary is not None:
1093            main_variables = []
1094            for f in variables:
1095                main_variables.extend(variables[f])
1096
1097            model += pulp.lpSum(main_variables) <= max_num_features_to_vary
1098
1099        # Create variables for interaction effects
1100        for opt_name in options:
1101            if " x " in opt_name:
1102                f1_name = re.sub(r"(.+)\sx\s.+", r"\1", opt_name)
1103                f2_name = re.sub(r".+\sx\s(.+)", r"\1", opt_name)
1104
1105                if f1_name in features_to_vary and f2_name in features_to_vary:
1106
1107                    # We need to include this interaction effect
1108                    cur_variables = []
1109
1110                    for option in options[opt_name]:
1111                        z = pulp.LpVariable(
1112                            "{}:{},{}".format(opt_name, option[3][0], option[3][1]),
1113                            lowBound=0,
1114                            upBound=1,
1115                            cat="Continuous",
1116                        )
1117                        z.setInitialValue(0)
1118
1119                        # Need to iterate through existing variables for f1 and f2 to find
1120                        # the corresponding variables
1121                        x_f1 = None
1122                        x_f2 = None
1123
1124                        # Skip if this interaction variable involves a muted main variable
1125                        x_f1_name = "{}:{}".format(f1_name, option[3][0])
1126                        x_f2_name = "{}:{}".format(f2_name, option[3][1])
1127
1128                        if (
1129                            x_f1_name in muted_variables_set
1130                            or x_f2_name in muted_variables_set
1131                        ):
1132                            continue
1133
1134                        for x in variables[f1_name]:
1135                            if x.name == x_f1_name:
1136                                x_f1 = x
1137                                break
1138
1139                        for x in variables[f2_name]:
1140                            if x.name == x_f2_name:
1141                                x_f2 = x
1142                                break
1143
1144                        assert x_f1 is not None and x_f2 is not None
1145
1146                        # variable z is actually the product of x_f1 and x_f2
1147                        # We can linearize it by 3 constraints
1148                        model += z <= x_f1
1149                        model += z <= x_f2
1150                        model += z >= x_f1 + x_f2 - 1
1151
1152                        cur_variables.append(z)
1153
1154                    variables[opt_name] = cur_variables
1155
1156        # Use constraint to express counterfactual
1157        if cf_direction == 1:
1158            model += score_gain >= needed_score_gain
1159        else:
1160            model += score_gain <= needed_score_gain
1161
1162        # We want to minimize the distance
1163        model += distance
1164
1165        return model, variables
1166
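The product constraint used above can be checked in isolation. This is a standalone PuLP sketch (not part of the module) of the three-constraint linearization that forces a continuous variable z to equal the product of two binary variables:

import pulp

model = pulp.LpProblem("linearizationDemo", pulp.LpMaximize)

x_1 = pulp.LpVariable("x_1", cat="Binary")
x_2 = pulp.LpVariable("x_2", cat="Binary")
z = pulp.LpVariable("z", lowBound=0, upBound=1, cat="Continuous")

# The three constraints make z behave like x_1 * x_2:
# z cannot exceed either factor, and it must be at least x_1 + x_2 - 1.
model += z <= x_1
model += z <= x_2
model += z >= x_1 + x_2 - 1

# Maximizing z pushes the solver to set x_1 = x_2 = 1, hence z = 1.
model += z
model.solve(pulp.PULP_CBC_CMD(msg=False))

print(x_1.varValue, x_2.varValue, z.varValue)  # 1.0 1.0 1.0
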
1167    def print_solution(self, cur_example, active_variables, options):
1168        """
1169        Print the optimal solution.
1170
1171        Args:
1172            cur_example (np.ndarray): the original data point.
1173            active_variables (list[variable]): binary variables with value 1.
1174            options (dict): all the possible options for all features.
1175        """
1176
1177        for var in active_variables:
1178            # Main-effect variables (interaction variables are handled below)
1179            if "_x_" not in var.name:
1180                f_name = re.sub(r"(.+):\d+", r"\1", var.name)
1181                bin_i = int(re.sub(r".+:(\d+)", r"\1", var.name))
1182
1183                # Find the original value
1184                org_value = cur_example[0][self.ebm.feature_names.index(f_name)]
1185
1186                # Find the target bin
1187                f_index = self.ebm.feature_names.index(f_name)
1188                f_type = self.ebm.feature_types[f_index]
1189
1190                if f_type == "continuous":
1191                    bin_starts = self.ebm.preprocessor_._get_bin_labels(f_index)[:-1]
1192
1193                    target_bin = "[{},".format(bin_starts[bin_i])
1194
1195                    if bin_i + 1 < len(bin_starts):
1196                        target_bin += " {})".format(bin_starts[bin_i + 1])
1197                    else:
1198                        target_bin += " inf)"
1199                else:
1200                    target_bin = ""
1201                    org_value = '"{}"'.format(org_value)
1202
1203                for option in options[f_name]:
1204                    if option[3] == bin_i:
1205                        print(
1206                            "Change <{}> from {} to {} {}".format(
1207                                f_name, org_value, option[0], target_bin
1208                            )
1209                        )
1210                        print(
1211                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1212                                option[1], option[2]
1213                            )
1214                        )
1215                        break
1216
1217            else:
1218                f_name = re.sub(r"(.+):.+", r"\1", var.name)
1219                f_name = f_name.replace("_x_", " x ")
1220                bin_0 = int(re.sub(r".+:(\d+),\d+", r"\1", var.name))
1221                bin_1 = int(re.sub(r".+:\d+,(\d+)", r"\1", var.name))
1222
1223                for option in options[f_name]:
1224                    if option[3][0] == bin_0 and option[3][1] == bin_1:
1225                        print("Trigger interaction term: <{}>".format(f_name))
1226                        print(
1227                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1228                                option[1], 0
1229                            )
1230                        )
1231                        break
1232        print()
1233
1234    @staticmethod
1235    def compute_mad(xs):
1236        """
1237        Compute the median absolute deviation of a continuous feature.
1238
1239        Args:
1240            xs (np.ndarray): A column of continuous values.
1241
1242        Returns:
1243            float: MAD value of xs.
1244        """
1245        xs_median = np.median(xs.astype(float))
1246        mad = np.median(np.abs(xs.astype(float) - xs_median))
1247        return mad
1248
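    # A minimal sketch of the computation (illustrative values, not from the
    # library): for the column [1, 2, 3, 4, 100], the median is 3, the absolute
    # deviations are [2, 1, 0, 1, 97], and their median (the MAD) is 1:
    #
    #   GAMCoach.compute_mad(np.array([1, 2, 3, 4, 100]))  # -> 1.0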
1249    @staticmethod
1250    def compute_frequency_distance(xs):
1251        """
1252        For categorical variables, we compute 1 - frequency as their distance. It implies
1253        that switching to a frequent value takes less effort.
1254
1255        Args:
1256            xs (np.ndarray): A column of categorical values.
1257
1258        Returns:
1259            dict: category level -> 1 - frequency.
1260        """
1261        counter = Counter(xs)
1262
1263        results = {}
1264
1265        for key in counter:
1266            results[key] = 1 - (counter[key] / len(xs))
1267
1268        return results
1269
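    # A minimal sketch (illustrative values): for the column
    # ['a', 'a', 'a', 'b'], the frequencies are {'a': 0.75, 'b': 0.25}, so the
    # distances are {'a': 0.25, 'b': 0.75}; moving to the frequent level 'a'
    # is cheaper than moving to the rare level 'b':
    #
    #   GAMCoach.compute_frequency_distance(np.array(['a', 'a', 'a', 'b']))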
1270    @staticmethod
1271    def compute_naive_cat_distance(xs):
1272        """
1273        Alternative to compute_frequency_distance() to compute the distance for
1274        categorical variables. The distance is 1 for different levels and 0 for
1275        the same level. Here we give every level distance 1, because same-level
1276        options are filtered out when we create categorical options for the
1277        optimization program.
1278
1279        Args:
1280            xs (np.ndarray): A column of categorical values.
1281
1282        Returns:
1283            dict: category level -> 1.
1284        """
1285        counter = Counter(xs)
1286        results = {}
1287
1288        for key in counter:
1289            results[key] = 1
1290
1291        return results
1292
1293
1294def search_sorted_lower_index(sorted_edges, value):
1295    """Binary search to locate the correct bin for continuous features."""
1296    left = 0
1297    right = len(sorted_edges) - 1
1298
1299    while right - left > 1:
1300        i = left + int((right - left) / 2)
1301
1302        if value > sorted_edges[i]:
1303            left = i
1304        elif value < sorted_edges[i]:
1305            right = i
1306        else:
1307            return i
1308
1309    # Handle out of bound issues
1310    if value >= sorted_edges[right]:
1311        return right
1312    if value < sorted_edges[left]:
1313        return left
1314
1315    return right - 1
1316
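# A minimal sketch (illustrative values): with bin starts [0, 10, 20, 30],
#
#   search_sorted_lower_index([0, 10, 20, 30], 15)  # -> 1, i.e., bin [10, 20)
#   search_sorted_lower_index([0, 10, 20, 30], 35)  # -> 3, the last bin [30, inf)
#
# Values smaller than the first edge are clamped into bin 0.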
1317
1318def sigmoid(x):
1319    """Sigmoid function."""
1320    return 1 / (1 + np.exp(-x))
1321
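# Quick sanity check: sigmoid(0) -> 0.5, and sigmoid(x) + sigmoid(-x) == 1 for any x.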
1322
1323def _resort_categorical_level(col_mapping):
1324    """
1325    Resort the levels in the categorical encoders if all levels can be converted
1326    to numbers (integer or float).
1327
1328    Args:
1329        col_mapping: the dictionary that maps level string to int
1330
1331    Returns:
1332        New col_mapping if all levels can be converted to numbers, otherwise
1333        the original col_mapping
1334    """
1335
1336    def is_number(string):
1337        try:
1338            float(string)
1339            return True
1340        except ValueError:
1341            return False
1342
1343    if all(map(is_number, col_mapping.keys())):
1344
1345        key_tuples = [(k, float(k)) for k in col_mapping.keys()]
1346        sorted_key_tuples = sorted(key_tuples, key=lambda x: x[1])
1347
1348        new_mapping = {}
1349        value = 1
1350
1351        for t in sorted_key_tuples:
1352            new_mapping[t[0]] = value
1353            value += 1
1354
1355        return new_mapping
1356
1357    else:
1358        return col_mapping
1359
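# A minimal sketch (illustrative values): when every level name parses as a
# number, the mapping is re-indexed in increasing numeric order starting from 1,
#
#   _resort_categorical_level({'10': 1, '2': 2, '5': 3})  # -> {'2': 1, '5': 2, '10': 3}
#
# while a mapping with a non-numeric level, e.g. {'low': 1, 'high': 2}, is
# returned unchanged.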
1360
1361def _init_feature_descriptions(ebm, label_encoder):
1362    # Initialize the feature description dictionary
1363    feature_descriptions = {}
1364
1365    for i in range(len(ebm.feature_names)):
1366        cur_name = ebm.feature_names[i]
1367        cur_type = ebm.feature_types[i]
1368
1369        # Use the feature name as the default display name
1370        if cur_type == "continuous":
1371            feature_descriptions[cur_name] = {
1372                "displayName": cur_name,
1373                "description": "",
1374            }
1375
1376        # For categorical features, we can also give display name and description
1377        # for different levels
1378        elif cur_type == "categorical":
1379
1380            level_descriptions = {}
1381
1382            for level in label_encoder[cur_name]:
1383                level_descriptions[level] = {
1384                    "displayName": label_encoder[cur_name][level],
1385                    "description": "",
1386                }
1387
1388            feature_descriptions[cur_name] = {
1389                "displayName": cur_name,
1390                "description": "",
1391                "levelDescription": level_descriptions,
1392            }
1393
1394        else:
1395            continue
1396
1397    return feature_descriptions
1398
1399
1400def _init_feature_configuration(ebm):
1401    # Initialize the feature configuration dictionary
1402    feature_configuration = {}
1403
1404    for i in range(len(ebm.feature_names)):
1405        cur_name = ebm.feature_names[i]
1406        cur_type = ebm.feature_types[i]
1407
1408        # Use the feature name as the default display name
1409        if cur_type == "continuous" or cur_type == "categorical":
1410            feature_configuration[cur_name] = {
1411                "difficulty": 3,
1412                "requiresInt": False,
1413                "requiresIncreasing": False,
1414                "requiresDecreasing": False,
1415                "usesTransform": None,
1416                "acceptableRange": None,
1417            }
1418        else:
1419            continue
1420
1421    return feature_configuration
1422
1423
1424def _get_kde_sample(xs, n_sample=200):
1425    """
1426    Compute kernel density estimation.
1427    """
1428    kernel = gaussian_kde(xs.astype(float))
1429
1430    sample_x = np.linspace(np.min(xs), np.max(xs), n_sample)
1431    sample_y = kernel(sample_x)
1432
1433    return sample_x, sample_y
1434
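# A minimal sketch (illustrative values): sample a 200-point density curve for a
# continuous column, as used for the density plots in the UI,
#
#   xs = np.random.normal(size=1000)
#   sample_x, sample_y = _get_kde_sample(xs)  # both arrays have length 200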
1435
1436def get_model_data(
1437    ebm,
1438    x_train,
1439    model_info,
1440    resort_categorical=False,
1441    feature_info=None,
1442    feature_level_info=None,
1443    feature_config=None,
1444):
1445    """
1446    Get the model data for GAM Coach.
1447    Args:
1448        ebm: Trained EBM model. ExplainableBoostingClassifier or
1449            ExplainableBoostingRegressor object.
1450        x_train: Training data. We use it to compute the median absolute deviation
1451            score for continuous features, and frequency scores for categorical
1452            features.
1453        model_info: Information about the model (class names, regression target
1454            name). For classification, the order of classes matters. It should
1455            be consistent with the class encoding index. For example, the first
1456            element should be the name for class 0.
1457            It has format:
1458            `{'classes': ['loan rejection', 'loan approval']}` or
1459            `{'regressionName': 'interest rate'}`
1460        resort_categorical: Whether to sort the levels in categorical variables
1461            by increasing order if all levels can be converted to numbers.
1462        feature_info: You can provide a dictionary to give a separate display
1463            name and optional description for each feature. By default, the
1464            display name is the same as the feature name, and the description
1465            is an empty string. `feature_info` can be partial (only including
1466            some features). It has format:
1467            `{'feature_name': ['display_name', 'description']}`
1468        feature_level_info: You can provide a dictionary to give separate display
1469            name and optional description for each level of categorical features.
1470            By default, the display name is the same as the level name, and the
1471            description is an empty string. `feature_level_info` can be partial
1472            (e.g., only including some levels from some categorical features).
1473            It has format:
1474            `{'feature_name': {level_id: ['display_name', 'description']}}`
1475        feature_config: You can provide a dictionary to configure the difficulty,
1476            integer requirement, and acceptable range of individual features.
1477            The difficulty is an integer between 1 and 6: 1 (very easy to change),
1478            2 (easy), 3 (default), 4 (hard), 5 (very hard), 6 (impossible to change).
1479            By default, difficulty is set to 3 for all features, requiresInt is
1480            False for continuous variables, and acceptableRange is None (search
1481            the full range).
1482            The dictionary property has the following format:
1483            `{'difficulty': 3, 'requiresInt': True, 'acceptableRange': None}`
1484    Returns:
1485        A Python dictionary of model data
1486    """
1487    ROUND = 6
1488
1489    # Main model info on each feature
1490    features = []
1491
1492    # Track the encoding of categorical feature levels
1493    labelEncoder = {}
1494
1495    # Track the score range
1496    score_range = [np.inf, -np.inf]
1497
1498    for i in tqdm(range(len(ebm.feature_names))):
1499        cur_feature = {}
1500        cur_feature["name"] = ebm.feature_names[i]
1501        cur_feature["type"] = ebm.feature_types[i]
1502        cur_feature["importance"] = ebm.feature_importances_[i]
1503
1504        # Handle interaction term differently from cont/cat
1505        if cur_feature["type"] == "interaction":
1506            cur_id = ebm.feature_groups_[i]
1507            cur_feature["id"] = list(cur_id)
1508
1509            # Info for each individual feature
1510            cur_feature["name1"] = ebm.feature_names[cur_id[0]]
1511            cur_feature["name2"] = ebm.feature_names[cur_id[1]]
1512
1513            cur_feature["type1"] = ebm.feature_types[cur_id[0]]
1514            cur_feature["type2"] = ebm.feature_types[cur_id[1]]
1515
1516            # Skip the first item from both dimensions
1517            cur_feature["additive"] = np.round(ebm.additive_terms_[i], ROUND)[
1518                1:, 1:
1519            ].tolist()
1520            cur_feature["error"] = np.round(ebm.term_standard_deviations_[i], ROUND)[
1521                1:, 1:
1522            ].tolist()
1523
1524            # Get the bin label info
1525            cur_feature["binLabel1"] = ebm.pair_preprocessor_._get_bin_labels(cur_id[0])
1526            cur_feature["binLabel2"] = ebm.pair_preprocessor_._get_bin_labels(cur_id[1])
1527
1528            # Encode categorical levels as integers
1529            if cur_feature["type1"] == "categorical":
1530                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[0]]
1531                cur_feature["binLabel1"] = list(
1532                    map(lambda x: level_str_to_int[x], cur_feature["binLabel1"])
1533                )
1534
1535            if cur_feature["type2"] == "categorical":
1536                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[1]]
1537                cur_feature["binLabel2"] = list(
1538                    map(lambda x: level_str_to_int[x], cur_feature["binLabel2"])
1539                )
1540
1541            # Get density info
1542            if cur_feature["type1"] == "categorical":
1543                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[0]]
1544                cur_feature["histEdge1"] = ebm.preprocessor_._get_hist_edges(cur_id[0])
1545                cur_feature["histEdge1"] = list(
1546                    map(lambda x: level_str_to_int[x], cur_feature["histEdge1"])
1547                )
1548                cur_feature["histCount1"] = np.round(
1549                    ebm.preprocessor_._get_hist_counts(cur_id[0]), ROUND
1550                ).tolist()
1551            else:
1552                # Use KDE to draw density plots for cont features
1553                edges, counts = _get_kde_sample(x_train[:, cur_id[0]])
1554                cur_feature["histEdge1"] = edges.tolist()
1555                cur_feature["histCount1"] = counts.tolist()
1556
1557            if cur_feature["type2"] == "categorical":
1558                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[1]]
1559                cur_feature["histEdge2"] = ebm.preprocessor_._get_hist_edges(cur_id[1])
1560                cur_feature["histEdge2"] = list(
1561                    map(lambda x: level_str_to_int[x], cur_feature["histEdge2"])
1562                )
1563                cur_feature["histCount2"] = np.round(
1564                    ebm.preprocessor_._get_hist_counts(cur_id[1]), ROUND
1565                ).tolist()
1566            else:
1567                # Use KDE to draw density plots for cont features
1568                edges, counts = _get_kde_sample(x_train[:, cur_id[1]])
1569                cur_feature["histEdge2"] = edges.tolist()
1570                cur_feature["histCount2"] = counts.tolist()
1571
1572        else:
1573            # Skip the first item (reserved for missing value)
1574            cur_feature["additive"] = np.round(ebm.additive_terms_[i], ROUND).tolist()[
1575                1:
1576            ]
1577            cur_feature["error"] = np.round(
1578                ebm.term_standard_deviations_[i], ROUND
1579            ).tolist()[1:]
1580            cur_feature["id"] = ebm.feature_groups_[i]
1581            cur_id = ebm.feature_groups_[i][0]
1582            cur_feature["count"] = ebm.preprocessor_.col_bin_counts_[cur_id].tolist()[
1583                1:
1584            ]
1585
1586            # Track the global score range
1587            score_range[0] = min(
1588                score_range[0],
1589                np.min(ebm.additive_terms_[i] - ebm.term_standard_deviations_[i]),
1590            )
1591            score_range[1] = max(
1592                score_range[1],
1593                np.max(ebm.additive_terms_[i] + ebm.term_standard_deviations_[i]),
1594            )
1595
1596            # Add the binning information for continuous features
1597            if cur_feature["type"] == "continuous":
1598                # Add the bin information
1599                cur_feature["binEdge"] = ebm.preprocessor_._get_bin_labels(cur_id)
1600
1601                # Use KDE to draw density plots for cont features
1602                edges, counts = _get_kde_sample(x_train[:, cur_id])
1603
1604                cur_feature["histEdge"] = edges.tolist()
1605                cur_feature["histCount"] = counts.tolist()
1606
1607            elif cur_feature["type"] == "categorical":
1608                # Get the level value mapping
1609                level_str_to_int = ebm.preprocessor_.col_mapping_[cur_id]
1610
1611                if resort_categorical:
1612                    level_str_to_int = _resort_categorical_level(level_str_to_int)
1613
1614                cur_feature["binLabel"] = list(
1615                    map(
1616                        lambda x: level_str_to_int[x],
1617                        ebm.preprocessor_._get_bin_labels(cur_id),
1618                    )
1619                )
1620
1621                # Add the hist information
1622                # For categorical data, the edges are strings
1623                cur_feature["histEdge"] = list(
1624                    map(
1625                        lambda x: level_str_to_int[x],
1626                        ebm.preprocessor_._get_hist_edges(cur_id),
1627                    )
1628                )
1629
1630                cur_feature["histCount"] = np.round(
1631                    ebm.preprocessor_._get_hist_counts(cur_id), ROUND
1632                ).tolist()
1633
1634                if resort_categorical:
1635                    cur_bin_info = list(
1636                        zip(
1637                            cur_feature["binLabel"],
1638                            cur_feature["additive"],
1639                            cur_feature["error"],
1640                            cur_feature["count"],
1641                        )
1642                    )
1643                    cur_bin_info = sorted(cur_bin_info, key=lambda x: x[0])
1644
1645                    cur_feature["binLabel"] = [k[0] for k in cur_bin_info]
1646                    cur_feature["additive"] = [k[1] for k in cur_bin_info]
1647                    cur_feature["error"] = [k[2] for k in cur_bin_info]
1648                    cur_feature["count"] = [k[3] for k in cur_bin_info]
1649
1650                    cur_hist_info = list(
1651                        zip(cur_feature["histEdge"], cur_feature["histCount"])
1652                    )
1653                    cur_hist_info = sorted(cur_hist_info, key=lambda x: x[0])
1654
1655                    cur_feature["histEdge"] = [k[0] for k in cur_hist_info]
1656                    cur_feature["histCount"] = [k[1] for k in cur_hist_info]
1657
1658                # Add the label encoding information
1659                labelEncoder[cur_feature["name"]] = {
1660                    i: s for s, i in level_str_to_int.items()
1661                }
1662
1663        features.append(cur_feature)
1664
1665    score_range = list(map(lambda x: round(x, ROUND), score_range))
1666
1667    feature_names = []
1668    feature_types = []
1669
1670    # Sample data does not record interaction features
1671    for i in range(len(ebm.feature_names)):
1672        if ebm.feature_types[i] != "interaction":
1673            feature_names.append(ebm.feature_names[i])
1674            feature_types.append(ebm.feature_types[i])
1675
1676    # Compute the MAD scores and frequencies
1677    ebm_cont_indexes = np.array(
1678        [i for i in range(len(feature_names)) if feature_types[i] == "continuous"]
1679    )
1680
1681    contMads = {}
1682
1683    for i in ebm_cont_indexes:
1684        contMads[ebm.feature_names[i]] = GAMCoach.compute_mad(x_train[:, i])
1685
1686    ebm_cat_indexes = np.array(
1687        [i for i in range(len(feature_names)) if feature_types[i] == "categorical"]
1688    )
1689
1690    catDistances = {}
1691
1692    for i in ebm_cat_indexes:
1693        catDistances[feature_names[i]] = GAMCoach.compute_frequency_distance(
1694            x_train[:, i]
1695        )
1696
1697    # Initialize a feature description dictionary (provide more information about
1698    # each feature in the UI)
1699    feature_descriptions = _init_feature_descriptions(ebm, labelEncoder)
1700
1701    # Overwrite some entries in the default feature_descriptions
1702    if feature_info:
1703        for feature in feature_info:
1704            feature_descriptions[feature]["displayName"] = feature_info[feature][0]
1705            feature_descriptions[feature]["description"] = feature_info[feature][1]
1706
1707    if feature_level_info:
1708        for feature in feature_level_info:
1709            for level in feature_level_info[feature]:
1710                display_name = feature_level_info[feature][level][0]
1711                description = feature_level_info[feature][level][1]
1712                feature_descriptions[feature]["levelDescription"][level][
1713                    "displayName"
1714                ] = display_name
1715                feature_descriptions[feature]["levelDescription"][level][
1716                    "description"
1717                ] = description
1718
1719    # Put descriptions under the 'features' key
1720    for feature in features:
1721        if feature["name"] in feature_descriptions:
1722            feature["description"] = feature_descriptions[feature["name"]]
1723
1724    # Set the feature configurations
1725    feature_configurations = _init_feature_configuration(ebm)
1726
1727    if feature_config:
1728        for feature in feature_config:
1729            cur_config = feature_config[feature]
1730            for k in [
1731                "requiresInt",
1732                "difficulty",
1733                "acceptableRange",
1734                "requiresIncreasing",
1735                "requiresDecreasing",
1736                "usesTransform",
1737            ]:
1738                if k in cur_config:
1739                    feature_configurations[feature][k] = cur_config[k]
1740
1741    # Attach the configuration to the feature field
1742    for feature in features:
1743        if feature["name"] in feature_configurations:
1744            feature["config"] = feature_configurations[feature["name"]]
1745
1746    data = {
1747        "intercept": ebm.intercept_[0] if hasattr(ebm, "classes_") else ebm.intercept_,
1748        "isClassifier": hasattr(ebm, "classes_"),
1749        "modelInfo": model_info,
1750        "features": features,
1751        "labelEncoder": labelEncoder,
1752        "scoreRange": score_range,
1753        "featureNames": feature_names,
1754        "featureTypes": feature_types,
1755        "contMads": contMads,
1756        "catDistances": catDistances,
1757    }
1758
1759    return data
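A minimal usage sketch for `get_model_data` (the trained `ebm`, the training matrix `x_train`, and the output filename below are placeholders, not part of this module): it bundles the model's bin/score tables and training-data statistics into one plain dictionary that can be serialized for the GAM Coach interface.

    import json

    model_data = get_model_data(
        ebm,      # a trained ExplainableBoostingClassifier
        x_train,  # the matrix used to train it
        {"classes": ["loan rejection", "loan approval"]},
    )

    with open("model_data.json", "w") as fp:  # hypothetical output path
        json.dump(model_data, fp)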
class GAMCoach:
  26class GAMCoach:
  27    """Main class for GAM Coach."""
  28
  29    def __init__(
  30        self,
  31        ebm: Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor],
  32        x_train: np.ndarray,
  33        cont_mads=None,
  34        cat_distances=None,
  35        adjust_cat_distance=True,
  36    ):
  37        """Initialize a GAMCoach object.
  38
  39        Args:
  40            ebm (Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor]):
  41                The trained EBM model. It can be either a classifier or a regressor.
  42            x_train (np.ndarray): The training data. It is used to compute the
  43                distance for different features.
  44            cont_mads (dict, optional): `feature_name` -> `median absolute
  45                deviation score`. If it is provided, it is used to overwrite the
  46                computed MADs for continuous variables. It is useful when you
  47                want to provide a custom normalization function to compute the
  48                distance between continuous features.
  49            cat_distances (dict, optional): `feature_name` -> {`level_name` -> `distance`}.
  50                Level distance of categorical variables. By default, the distance
  51                is computed by (1 - frequency(level)) for each level. It implies
  52                that it is easier to move to a more frequent level. If `cat_distances`
  53                is provided, it will overwrite the default distance for
  54                categorical variables.
  55            adjust_cat_distance (bool, optional): If true, we use (1 -
  56                frequency(level)) for each level. Otherwise, we give distance = 1
  57                for different levels and 0 for the same level.
  58        """
  59
  60        self.ebm: Union[
  61            ExplainableBoostingClassifier, ExplainableBoostingRegressor
  62        ] = ebm
  63        """The trained EBM model."""
  64
  65        self.x_train: np.ndarray = x_train
  66
  67        self.cont_mads: dict = cont_mads
  68        """Median absolute deviation (MAD) of continuous variables."""
  69
  70        self.cat_distances: dict = cat_distances
  71        """Level distance of categorical variables. By default, the distance is
  72        computed by $(1 - \\frac{\\text{count of} L_i}{\\text{count of all L}})$
  73        for one level $L_i$. It implies that it is easier to move to a more
  74        frequent level.
  75        """
  76
  77        self.adjust_cat_distance: bool = adjust_cat_distance
  78
  79        # If cont_mads is not given, we compute it from the training data
  80        if self.cont_mads is None:
  81            ebm_cont_indexes = np.array(
  82                [
  83                    i
  84                    for i in range(len(self.ebm.feature_names))
  85                    if self.ebm.feature_types[i] == "continuous"
  86                ]
  87            )
  88
  89            self.cont_mads = {}
  90
  91            for i in ebm_cont_indexes:
  92                self.cont_mads[ebm.feature_names[i]] = self.compute_mad(
  93                    self.x_train[:, i]
  94                )
  95
  96        # If cat_distance is not given, we compute it from the training data
  97        if self.cat_distances is None:
  98            ebm_cat_indexes = np.array(
  99                [
 100                    i
 101                    for i in range(len(self.ebm.feature_names))
 102                    if self.ebm.feature_types[i] == "categorical"
 103                ]
 104            )
 105
 106            self.cat_distances = {}
 107
 108            if self.adjust_cat_distance:
 109                for i in ebm_cat_indexes:
 110                    self.cat_distances[
 111                        self.ebm.feature_names[i]
 112                    ] = GAMCoach.compute_frequency_distance(self.x_train[:, i])
 113            else:
 114                for i in ebm_cat_indexes:
 115                    self.cat_distances[
 116                        self.ebm.feature_names[i]
 117                    ] = GAMCoach.compute_naive_cat_distance(self.x_train[:, i])
 118
 119        # Determine if the ebm is a classifier or a regressor
 120        self.is_classifier = isinstance(self.ebm.intercept_, np.ndarray)
 121        """True if the ebm model is a classifier, false if it is a regressor."""
 122
 123    def generate_cfs(
 124        self,
 125        cur_example: np.ndarray,
 126        total_cfs: int = 1,
 127        target_range: tuple = None,
 128        sim_threshold_factor: float = 0.005,
 129        sim_threshold: float = None,
 130        categorical_weight: Union[float, str] = "auto",
 131        features_to_vary: list = None,
 132        max_num_features_to_vary: int = None,
 133        feature_ranges: dict = None,
 134        continuous_integer_features: list = None,
 135        verbose: int = 1,
 136    ) -> Counterfactuals:
 137        """Generate counterfactual examples.
 138
 139        Use mixed-integer linear programming to generate optimal counterfactual
 140        examples for the given data point.
 141
 142        Args:
  143            cur_example (np.ndarray): The data point of interest. This function
  144                aims to find similar examples for which the model gives a
  145                different prediction.
  146            total_cfs (int, optional): The total number of counterfactuals to
  147                generate. Defaults to 1.
  148            target_range (tuple, optional): The targeted prediction range. This
 149                parameter is required if the EBM is a regressor.
 150            sim_threshold_factor (float, optional): A positive float to automatically
 151                generate a similarity threshold. This parameter has no effect if
 152                `sim_threshold` is provided. If `sim_threshold` is
 153                not provided, we compute `sim_threshold` as `sim_threshold_factor`
 154                * average additive score range of all continuous features. If
  155                `sim_threshold_factor` is too small, it takes longer to
 156                generate CFs. If `sim_threshold_factor` is too large, the
 157                algorithm might miss some optimal CFs.
 158            sim_threshold (float, optional): A positive float to determine how we
 159                decide if two bins of a continuous feature have similar scores.
 160                Two bins $b_1$ and $b_2$ are similar (the distant one will be
 161                removed) if $|b_1 - b_2| \\leq$ `sim_threshold`.
 162            categorical_weight (Union[float, str], optional): A positive float
 163                to scale the distances of options for categorical variables. Since
 164                we have very different distance functions for continuous and
 165                categorical features, we need to scale them so they are at a
 166                comparable range. To do that, we multiply the categorical feature's
 167                distances by `categorical_weight`. By default ('auto'), we scale
  168                the distances of categorical features so that they have the same
  169                mean distance as continuous features.
 170            features_to_vary ([str], optional): A list of feature names that
 171                the CFs can change. If it is `None`, this function will use all
 172                features.
 173            max_num_features_to_vary (int, optional): The max number of features
 174                that the CF can vary. Default is no maximum.
 175            feature_ranges (dict, optional): A dictionary to control the permitted
 176                ranges/values for continuous/categorical features. It maps
 177                `feature_name` -> [`min_value`, `max_value`] for continuous features,
 178                `feature_name` -> [`level1`, `level2`, ...] for categorical features.
 179            continuous_integer_features (list, optional): A list of names of
 180                continuous features that need to be integers (e.g., age, FICO score)
  181            verbose (int): 0: no output, 1: show progress bar, 2: show internal
 182                optimization details
 183
 184        Returns:
 185            Counterfactuals: The generated counterfactual examples with their
 186                associated distances and change information.
 187        """
 188
 189        # Transforming some parameters
 190        if len(cur_example.shape) == 1:
 191            cur_example = cur_example.reshape(1, -1)
 192
 193        if features_to_vary is None:
 194            features_to_vary = [
 195                self.ebm.feature_names[i]
 196                for i in range(len(self.ebm.feature_types))
 197                if self.ebm.feature_types[i] != "interaction"
 198            ]
 199
 200        # Step 1: Find the current score for each feature
 201        # This is done by ebm.explain_local()
 202        cur_scores = {}
 203
 204        if self.is_classifier:
 205            cur_scores["intercept"] = self.ebm.intercept_[0]
 206        else:
 207            cur_scores["intercept"] = self.ebm.intercept_
 208
 209        local_data = self.ebm.explain_local(cur_example)._internal_obj
 210
 211        for i in range(len(self.ebm.feature_names)):
 212            cur_feature_name = self.ebm.feature_names[i]
 213            cur_feature_type = self.ebm.feature_types[i]
 214
 215            cur_scores[cur_feature_name] = local_data["specific"][0]["scores"][i]
 216
 217        # Find the CF direction
 218
 219        # Binary classification
 220        # Predicted 0 => +1
 221        # Predicted 1 => -1
 222        if self.is_classifier:
 223            cf_direction = self.ebm.predict(cur_example)[0] * (-2) + 1
 224            total_score = np.sum([cur_scores[k] for k in cur_scores])
 225            needed_score_gain = -total_score
 226            score_gain_bound = None
 227
 228        else:
 229            # Regression
 230            # Increase +1
 231            # Decrease -1
 232            if target_range is None:
 233                raise ValueError(
 234                    "target_range cannot be None when the model is a regressor"
 235                )
 236
 237            predicted_value = self.ebm.predict(cur_example)[0]
 238            if (
 239                predicted_value >= target_range[0]
 240                and predicted_value <= target_range[1]
 241            ):
 242                raise ValueError("The target_range cannot cover the current prediction")
 243
 244            elif predicted_value < target_range[0]:
 245                cf_direction = 1
 246                needed_score_gain = target_range[0] - predicted_value
 247                score_gain_bound = target_range[1] - predicted_value
 248            else:
 249                cf_direction = -1
 250                needed_score_gain = target_range[1] - predicted_value
 251                score_gain_bound = target_range[0] - predicted_value
 252
 253        # Step 2: Generate continuous and categorical options
 254        options = {}
 255
 256        # Generate a similarity threshold if it is not provided
 257        if sim_threshold is None:
 258            additive_ranges = []
 259
 260            for i in range(len(self.ebm.feature_names)):
 261                if self.ebm.feature_types[i] == "continuous":
 262                    cur_values = self.ebm.additive_terms_[i]
 263                    additive_ranges.append(np.max(cur_values) - np.min(cur_values))
 264
 265            sim_threshold = np.mean(additive_ranges) * sim_threshold_factor
 266
 267        # To make it faster to solve the MILP problem, we can decrease the
 268        # number of variables by filtering out unhelpful and redundant options
 269        #
 270        # (1) Unhelpful options: options that move the score to an undesirable
 271        # direction. For example, if we want to flip 0 to 1, options that decrease
 272        # the score are unhelpful.
 273        #
 274        # (2) Redundant options: for a set of options that give similar score
  275        # gains (bounded by a parameter epsilon), we only need to include one
 276        # option that has the lowest distance. This is only relevant for
 277        # continuous variables. Users can set the parameter epsilon. The default
 278        # should be relatively small, otherwise we might miss the optimal solution.
 279
 280        # Step 2.1: Find all good options from continuous and categorical features
 281        for cur_feature_id in range(len(self.ebm.feature_names)):
 282
 283            cur_feature_name = self.ebm.feature_names[cur_feature_id]
 284            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 285            cur_feature_index = self.ebm.feature_groups_[cur_feature_id][0]
 286
 287            if cur_feature_type == "interaction":
 288                continue
 289
 290            elif cur_feature_type == "continuous":
 291                # The parameter epsilon controls the threshold of how we determine
 292                # "similar" options for continuous variables
 293                epsilon = sim_threshold
 294
 295                cur_feature_score = cur_scores[cur_feature_name]
 296                cur_feature_value = float(cur_example[0][cur_feature_id])
 297
 298                # Users can require the continuous feature to have integer values
 299                # For example, age, FICO score, and number of accounts
 300                need_to_be_int = False
 301                if (
 302                    continuous_integer_features
 303                    and cur_feature_name in continuous_integer_features
 304                ):
 305                    need_to_be_int = True
 306
 307                cur_cont_options = self.generate_cont_options(
 308                    cf_direction,
 309                    cur_feature_index,
 310                    cur_feature_name,
 311                    cur_feature_value,
 312                    cur_feature_score,
 313                    self.cont_mads,
 314                    cur_example[0],
 315                    score_gain_bound,
 316                    epsilon,
 317                    need_to_be_int,
 318                )
 319
 320                options[cur_feature_name] = cur_cont_options
 321
 322            elif cur_feature_type == "categorical":
 323                cur_feature_score = cur_scores[cur_feature_name]
 324                cur_feature_value = str(cur_example[0][cur_feature_id])
 325                cur_cat_distance = self.cat_distances[cur_feature_name]
 326
 327                cur_cat_options = self.generate_cat_options(
 328                    cf_direction,
 329                    cur_feature_index,
 330                    cur_feature_value,
 331                    cur_feature_score,
 332                    cur_cat_distance,
 333                    cur_example[0],
 334                    score_gain_bound,
 335                )
 336
 337                options[cur_feature_name] = cur_cat_options
 338
 339        # Step 2.2: Filter out undesired options (based on the feature_range)
 340        if feature_ranges is not None:
 341            for f_name in feature_ranges:
 342                cur_range = feature_ranges[f_name]
 343                f_index = self.ebm.feature_names.index(f_name)
 344                f_type = self.ebm.feature_types[f_index]
 345
 346                if f_type == "continuous":
 347                    # Delete options that use out-of-range options
 348                    for o in range(len(options[f_name]) - 1, -1, -1):
 349                        cur_target = options[f_name][o][0]
 350                        if cur_target < cur_range[0] or cur_target > cur_range[1]:
 351                            options[f_name].pop(o)
 352                elif f_type == "categorical":
 353                    for o in range(len(options[f_name]) - 1, -1, -1):
 354                        if options[f_name][o][0] not in cur_range:
 355                            options[f_name].pop(o)
 356
 357        # Step 2.3: Compute the interaction offsets for all possible options
 358        for cur_feature_id in range(len(self.ebm.feature_names)):
 359
 360            cur_feature_name = self.ebm.feature_names[cur_feature_id]
 361            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 362
 363            if cur_feature_type == "interaction":
 364
 365                cur_feature_index_1 = self.ebm.feature_groups_[cur_feature_id][0]
 366                cur_feature_index_2 = self.ebm.feature_groups_[cur_feature_id][1]
 367
 368                cur_feature_score = cur_scores[cur_feature_name]
 369                options[cur_feature_name] = self.generate_inter_options(
 370                    cur_feature_id,
 371                    cur_feature_index_1,
 372                    cur_feature_index_2,
 373                    cur_feature_score,
 374                    options,
 375                )
 376
 377        # Step 2.4: Rescale categorical distances so that they have the same mean
 378        # as continuous variables (default)
 379        if categorical_weight == "auto":
 380            cont_distances = []
 381            cat_distances = []
 382
 383            for f_name in options:
 384                f_index = self.ebm.feature_names.index(f_name)
 385                f_type = self.ebm.feature_types[f_index]
 386
 387                if f_type == "continuous":
 388                    for option in options[f_name]:
 389                        cont_distances.append(option[2])
 390                elif f_type == "categorical":
 391                    for option in options[f_name]:
 392                        cat_distances.append(option[2])
 393
 394            categorical_weight = np.mean(cont_distances) / np.mean(cat_distances)
 395
 396        for f_name in options:
 397            f_index = self.ebm.feature_names.index(f_name)
 398            f_type = self.ebm.feature_types[f_index]
 399
 400            if f_type == "categorical":
 401                for option in options[f_name]:
 402                    option[2] = option[2] * categorical_weight
 403
 404        # Step 3. Formulate the MILP model and solve it
 405
  406        # Find diverse solutions by cumulatively muting variables used in previous optimal solutions
 407        solutions = []
 408        muted_variables = []
 409        is_successful = True
 410
 411        for _ in tqdm(range(total_cfs), disable=verbose == 0):
 412            model, variables = self.create_milp(
 413                cf_direction,
 414                needed_score_gain,
 415                features_to_vary,
 416                options,
 417                max_num_features_to_vary,
 418                muted_variables=muted_variables,
 419            )
 420
 421            model.solve(pulp.apis.PULP_CBC_CMD(msg=verbose > 0, warmStart=True))
 422
 423            if model.status != 1:
 424                is_successful = False
 425
 426            if verbose == 2:
 427                print("solver runs for {:.2f} seconds".format(model.solutionTime))
 428                print("status: {}".format(pulp.LpStatus[model.status]))
 429
 430            active_variables = []
 431
 432            # Print the optimal solution
 433            for key in variables:
 434                for x in variables[key]:
 435                    if x.varValue > 0:
 436                        active_variables.append(x)
 437
 438            if verbose == 2:
 439                print("\nFound solutions:")
 440                self.print_solution(cur_example, active_variables, options)
 441
 442            # Collect the current solution and mute the associated variables
 443            solutions.append([active_variables, pulp.value(model.objective)])
 444
 445            for var in active_variables:
 446                if " x " not in var.name:
 447                    muted_variables.append(var.name)
 448
 449        cfs = Counterfactuals(
 450            solutions, is_successful, model, variables, self.ebm, cur_example, options
 451        )
 452
 453        return cfs
 454
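    # A minimal usage sketch (the feature names and ranges below are
    # hypothetical, not from the library):
    #
    #   coach = GAMCoach(ebm, x_train)
    #   cfs = coach.generate_cfs(
    #       cur_example=x_test[0],
    #       total_cfs=3,
    #       features_to_vary=["loan_amnt", "term"],       # hypothetical features
    #       feature_ranges={"loan_amnt": [1000, 20000]},  # hypothetical range
    #   )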
 455    def generate_cont_options(
 456        self,
 457        cf_direction,
 458        cur_feature_index,
 459        cur_feature_name,
 460        cur_feature_value,
 461        cur_feature_score,
 462        cont_mads,
 463        cur_example,
 464        score_gain_bound=None,
 465        epsilon=0.005,
 466        need_to_be_int=False,
 467        skip_unhelpful=True,
 468    ):
 469        """
 470        Generate all alternative options for this continuous variable. This function
 471        would filter out all options that are:
 472
 473        1. Not helpful for the counterfactual generation.
  474        2. Giving a similar score gain but requiring a larger distance.
 475
 476        Args:
 477            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
 478                (classification); `+1` if we need to increase the prediction,
 479                `-1` if decrease (regression).
 480            cur_feature_index (int): The index of the current continuous feature.
 481            cur_feature_name (str): Name of the current feature.
 482            cur_feature_value (float): The current feature value.
 483            cur_feature_score (float): The score for the current feature value.
 484            cont_mads (dict): A map of feature_name => MAD score.
 485            cur_example (list): Current sample values
 486            score_gain_bound (float): Bound of the score gain. We do not collect
 487                options that give `score_gain` > `score_gain_bound` (when
 488                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
 489                `cf_direction=-1`)
 490            epsilon (float): The threshold to determine if two options give similar
 491                score gains. Score gains $s_1$ and $s_2$ are similar if
 492                $|s_1 - s_2| <$ epsilon. Smaller epsilon significantly increases
 493                the time to solve the MILP. Large epsilon might filter out the
 494                optimal CF. Defaults to 0.005.
 495            need_to_be_int (bool): True if the target values for this continuous
 496                variable need to have integer values.
  497            skip_unhelpful (bool): True if options from main effects that give
  498                an opposite score gain should be skipped. It is rare for a positive
  499                score gain from a pair-interaction to outweigh the negative score
  500                gain from two main effects plus the distance penalty.
 501
 502        Returns:
  503            list: List of options [target, score_gain, distance, bin_index, inter_score_gains]
 504        """
 505
 506        # For each continuous feature, each bin is a variable
 507        # For each bin, we need to compute (1) score gain, (2) distance
 508        # (1) score gain is the difference between new bin and current bin
 509        # (2) distance is L1 distance divided by median absolute deviation (MAD)
 510
 511        # Get the additive scores of this feature
 512        additives = self.ebm.additive_terms_[cur_feature_index][1:]
 513
 514        # Get the bin edges of this feature
 515        bin_starts = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)[:-1]
 516
 517        # Create "options", each option is a tuple (target, score_gain, distance,
 518        # bin_index)
 519        cont_options = []
 520
 521        # Identify which bin this value falls into
 522        cur_bin_id = search_sorted_lower_index(bin_starts, cur_feature_value)
 523        assert additives[cur_bin_id] == cur_feature_score
 524
 525        # Identify interaction terms that we need to consider
 526        associated_interactions = []
 527
 528        for cur_feature_id in range(len(self.ebm.feature_names)):
 529            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 530            if cur_feature_type == "interaction":
 531
 532                indexes = self.ebm.feature_groups_[cur_feature_id]
 533
 534                if cur_feature_index in indexes:
 535                    feature_position = 0 if indexes[0] == cur_feature_index else 1
 536
 537                    other_position = 1 - feature_position
 538                    other_index = indexes[other_position]
 539                    other_type = self.ebm.feature_types[other_index]
 540
 541                    # Get the current additive scores and bin edges
 542                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 543
 544                    # Have to skip the max edge if it is continuous
 545                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
 546                        cur_feature_index
 547                    )[:-1]
 548
 549                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
 550                        other_index
 551                    )
 552                    if other_type == "continuous":
 553                        bin_starts_other = bin_starts_other[:-1]
 554
 555                    # Get the current interaction term score
 556                    other_bin = None
 557                    if other_type == "continuous":
 558                        other_bin = search_sorted_lower_index(
 559                            bin_starts_other, float(cur_example[other_index])
 560                        )
 561                    else:
 562                        other_bin = bin_starts_other.index(cur_example[other_index])
 563
 564                    feature_bin = search_sorted_lower_index(
 565                        bin_starts_feature, cur_feature_value
 566                    )
 567
 568                    feature_inter_score = 0
 569
 570                    if feature_position == 0:
 571                        feature_inter_score = inter_additives[feature_bin, other_bin]
 572                    else:
 573                        feature_inter_score = inter_additives[other_bin, feature_bin]
 574
 575                    # Extract the row or column where we fix the other feature and
 576                    # vary the current feature
 577                    feature_inter_bin_starts = bin_starts_feature
 578                    feature_inter_additives = []
 579
 580                    if feature_position == 0:
 581                        for i in range(len(inter_additives)):
 582                            feature_inter_additives.append(
 583                                inter_additives[i, other_bin]
 584                            )
 585                    else:
 586                        for i in range(len(inter_additives[0])):
 587                            feature_inter_additives.append(
 588                                inter_additives[other_bin, i]
 589                            )
 590
 591                    # Register this interaction term
 592                    associated_interactions.append(
 593                        {
 594                            "inter_index": indexes,
 595                            "cur_interaction_id": cur_feature_id,
 596                            "feature_inter_score": feature_inter_score,
 597                            "feature_inter_bin_starts": feature_inter_bin_starts,
 598                            "feature_inter_additives": feature_inter_additives,
 599                        }
 600                    )
 601
 602        for i in range(len(additives)):
 603            # Because of the special binning structure of EBM, the distance of
 604            # bins on the left to the current value is different from the bins
 605            # that are on the right
 606            #
 607            # For bins on the left, the raw distance is abs(bin_start[i + 1] - x)
 608            # For bins on the right, the raw distance is abs(bin_start[i] - x)
 609            target = cur_feature_value
 610            distance = 0
 611
 612            if i < cur_bin_id:
  613                # First we need to consider whether it needs to be an integer
 614                # If so, it would be the closest integer to the right point
 615                if need_to_be_int:
 616                    target = float(int(bin_starts[i + 1]))
 617                    if target == bin_starts[i + 1]:
 618                        target -= 1
 619
 620                    # Skip this option if it is not possible to find an int value
 621                    if target < bin_starts[i]:
 622                        continue
 623
 624                    distance = np.abs(target - cur_feature_value)
 625
 626                else:
 627                    target = bin_starts[i + 1]
 628                    distance = np.abs(target - cur_feature_value)
 629
  630                    # Subtract a very small value so that the target
  631                    # technically falls into the left bin
 632                    target -= 1e-4
 633
 634            elif i > cur_bin_id:
 635                # First need to consider if it should be an integer value
 636                # If so, it would be the closest integer to the left point
 637                if need_to_be_int:
 638                    target = float(np.ceil(bin_starts[i]))
 639                    if target == bin_starts[i]:
 640                        target += 1
 641
 642                    # Skip this option if it is not possible to find an int value
 643                    if i + 1 < len(additives) and target >= bin_starts[i + 1]:
 644                        continue
 645
 646                    distance = np.abs(target - cur_feature_value)
 647
 648                else:
 649                    target = bin_starts[i]
 650                    distance = np.abs(target - cur_feature_value)
 651
 652            # Scale the distance based on the deviation of the feature (how changeable it is)
 653            if cont_mads[cur_feature_name] > 0:
 654                distance /= cont_mads[cur_feature_name]
 655
 656            # Compute score gain which has two parts:
 657            # (1) gain from the change of main effect
 658            # (2) gain from the change of interaction effect
 659
 660            # Main effect
 661            main_score_gain = additives[i] - cur_feature_score
 662
 663            # Interaction terms
 664            # A list to track all interaction score gain offsets
 665            # [[interaction id, interaction score gain]]
 666            inter_score_gain = 0
 667            inter_score_gains = []
 668
 669            for d in associated_interactions:
 670                inter_bin_id = search_sorted_lower_index(
 671                    d["feature_inter_bin_starts"], target
 672                )
 673                inter_score_gain += (
 674                    d["feature_inter_additives"][inter_bin_id]
 675                    - d["feature_inter_score"]
 676                )
 677                inter_score_gains.append(
 678                    [
 679                        d["cur_interaction_id"],
 680                        d["feature_inter_additives"][inter_bin_id]
 681                        - d["feature_inter_score"],
 682                    ]
 683                )
 684
 685            score_gain = main_score_gain + inter_score_gain
 686
 687            if cf_direction * score_gain <= 0 and skip_unhelpful:
 688                continue
 689
 690            # Filter out of bound options
 691            if score_gain_bound and skip_unhelpful:
 692                if cf_direction == 1 and score_gain > score_gain_bound:
 693                    continue
 694                if cf_direction == -1 and score_gain < score_gain_bound:
 695                    continue
 696
 697            cont_options.append([target, score_gain, distance, i, inter_score_gains])
 698
 699        # Now we can apply the second round of filtering to remove redundant options
  700        # Redundant options refer to bins that give a similar score gain but require a larger distance
 701        cont_options = sorted(cont_options, key=lambda x: x[2])
 702
 703        start = 0
 704        while start < len(cont_options):
 705            for i in range(len(cont_options) - 1, start, -1):
 706                if np.abs(cont_options[i][1] - cont_options[start][1]) < epsilon:
 707                    cont_options.pop(i)
 708
 709            start += 1
 710
 711        return cont_options
 712
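    # Each collected option is a list of the form
    #   [target_value, score_gain, distance, bin_index, inter_score_gains]
    # For example (hypothetical numbers), [15000.0, 0.31, 0.8, 7, [[12, 0.02]]]
    # means: moving this feature to 15000 changes the model score by +0.31 at a
    # MAD-normalized distance of 0.8, lands in bin 7, and includes a +0.02
    # offset from the interaction term with id 12.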
 713    def generate_cat_options(
 714        self,
 715        cf_direction,
 716        cur_feature_index,
 717        cur_feature_value,
 718        cur_feature_score,
 719        cur_cat_distance,
 720        cur_example,
 721        score_gain_bound=None,
 722        skip_unhelpful=True,
 723    ):
 724        """
 725        Generate all alternative options for this categorical variable. This function
 726        would filter out all options that are not helpful for the counterfactual
 727        generation.
 728
 729        Args:
 730            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
 731                (classification); `+1` if we need to increase the prediction,
 732                `-1` if decrease (regression).
  733            cur_feature_index (int): The index of the current categorical feature.
  734            cur_feature_value (str): The current feature value (level name).
 735            cur_feature_score (float): The score for the current feature value.
 736            cur_cat_distance (dict): A map of feature_level => 1 - frequency.
 737            cur_example (list): Current sample values.
 738            score_gain_bound (float): Bound of the score gain. We do not collect
 739                options that give `score_gain` > `score_gain_bound` (when
 740                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
 741                `cf_direction=-1`)
  742            skip_unhelpful (bool): True if options from main effects that give
  743                an opposite score gain should be skipped. It is rare for a positive
  744                score gain from a pair-interaction to outweigh the negative score
  745                gain from two main effects plus the distance penalty.
 746
 747        Returns:
 748            list: List of option tuples (target, score_gain, distance, bin_index).
 749        """
 750
 751        # Find other options for this categorical variable
 752        # For each option, we compute the (1) score gain, and (2) distance
 753        #
 754        # (1) Score gain is the same as continuous variables
 755        # (2) The distance is determined by 1 - the level frequency in the
 756        # training data. It implies that levels with high frequency are easier
 757        # to "move to"
 758
 759        # Get the additive scores of this feature
 760        additives = self.ebm.additive_terms_[cur_feature_index][1:]
 761
 762        # Get the bin edges of this feature
 763        levels = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)
 764
 765        # Create "options", each option is a tuple (target, score_gain, distance, bin_index)
 766        cat_options = []
 767
 768        # Identify interaction terms that we need to consider
 769        associated_interactions = []
 770
 771        for cur_feature_id in range(len(self.ebm.feature_names)):
 772            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 773            if cur_feature_type == "interaction":
 774
 775                indexes = self.ebm.feature_groups_[cur_feature_id]
 776
 777                if cur_feature_index in indexes:
 778                    feature_position = 0 if indexes[0] == cur_feature_index else 1
 779
 780                    other_position = 1 - feature_position
 781                    other_index = indexes[other_position]
 782                    other_type = self.ebm.feature_types[other_index]
 783                    other_name = self.ebm.feature_names[other_index]
 784
 785                    # Get the current additive scores and bin edges
 786                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 787
 788                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
 789                        cur_feature_index
 790                    )
 791                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
 792                        other_index
 793                    )
 794
 795                    # Have to skip the max edge if it is continuous
 796                    if other_type == "continuous":
 797                        bin_starts_other = bin_starts_other[:-1]
 798
 799                    # Get the current interaction term score
 800                    other_bin = None
 801                    if other_type == "continuous":
 802                        other_bin = search_sorted_lower_index(
 803                            bin_starts_other, float(cur_example[other_index])
 804                        )
 805                    else:
 806                        other_bin = bin_starts_other.index(cur_example[other_index])
 807
 808                    feature_bin = bin_starts_feature.index(cur_feature_value)
 809
 810                    feature_inter_score = 0
 811
 812                    if feature_position == 0:
 813                        feature_inter_score = inter_additives[feature_bin, other_bin]
 814                    else:
 815                        feature_inter_score = inter_additives[other_bin, feature_bin]
 816
 817                    # Extract the row or column where we fix the other features and
 818                    # vary the current feature
 819                    feature_inter_bin_starts = bin_starts_feature
 820                    feature_inter_additives = []
 821
 822                    if feature_position == 0:
 823                        for i in range(len(inter_additives)):
 824                            feature_inter_additives.append(
 825                                inter_additives[i, other_bin]
 826                            )
 827                    else:
 828                        for i in range(len(inter_additives[0])):
 829                            feature_inter_additives.append(
 830                                inter_additives[other_bin, i]
 831                            )
 832
 833                    # Register this interaction term
 834                    associated_interactions.append(
 835                        {
 836                            "inter_index": indexes,
 837                            "cur_interaction_id": cur_feature_id,
 838                            "feature_inter_score": feature_inter_score,
 839                            "feature_inter_bin_starts": feature_inter_bin_starts,
 840                            "feature_inter_additives": feature_inter_additives,
 841                        }
 842                    )
 843
 844        for i in range(len(additives)):
 845            if levels[i] != cur_feature_value:
 846                target = levels[i]
 847                distance = cur_cat_distance[target]
 848
 849                # Compute score gain which has two parts:
 850                # (1) gain from the change of main effect
 851                # (2) gain from the change of interaction effect
 852
 853                # Main effect
 854                main_score_gain = additives[i] - cur_feature_score
 855
 856                # Interaction terms
 857                # A list to track all interaction score gain offsets
 858                # [[interaction id, interaction score gain]]
 859                inter_score_gain = 0
 860                inter_score_gains = []
 861
 862                for d in associated_interactions:
 863                    inter_bin_id = d["feature_inter_bin_starts"].index(target)
 864                    inter_score_gain += (
 865                        d["feature_inter_additives"][inter_bin_id]
 866                        - d["feature_inter_score"]
 867                    )
 868                    inter_score_gains.append(
 869                        [
 870                            d["cur_interaction_id"],
 871                            d["feature_inter_additives"][inter_bin_id]
 872                            - d["feature_inter_score"],
 873                        ]
 874                    )
 875
 876                score_gain = main_score_gain + inter_score_gain
 877
 878                # Skip unhelpful options
 879                if cf_direction * score_gain <= 0 and skip_unhelpful:
 880                    continue
 881
 882                # Filter out of bound options
 883                if score_gain_bound and skip_unhelpful:
 884                    if cf_direction == 1 and score_gain > score_gain_bound:
 885                        continue
 886                    if cf_direction == -1 and score_gain < score_gain_bound:
 887                        continue
 888
 889                cat_options.append([target, score_gain, distance, i, inter_score_gains])
 890
 891        return cat_options
 892
 893    def generate_inter_options(
 894        self,
 895        cur_feature_id,
 896        cur_feature_index_1,
 897        cur_feature_index_2,
 898        cur_feature_score,
 899        options,
 900    ):
 901        """
 902        Generate all possible options for this interaction variable.
 903
 904        Interaction terms are interesting in this MILP. Each option counts as a
 905        variable, but each variable only affects the score gain, not the distance.
 906
 907        Note that in EBM, the bin definitions for interaction terms can be different
 908        from their definitions for individual continuous variables.
 909
 910        To model interaction terms, we can think of each option as a binary
 911        variable whose value is determined by the product of two main effect
 912        variables. Each interaction variable describes a combination of two
 913        main effect variables. Therefore, if continuous variable A has $x$
 914        possible options and another continuous variable B has $y$ possible
 915        options, we should add $x \\times y$ binary variables to offset their
 916        possible interaction effects.
 917
 918        Args:
 919            cur_feature_id (int): The id of this interaction effect.
 920            cur_feature_index_1 (int): The index of the first main effect.
 921            cur_feature_index_2 (int): The index of the second main effect.
 922            cur_feature_score (float): The score for the current feature value.
 923            options (dict): The current option list, feature_name ->
 924                [`target`, `score_gain`, `distance`, `bin_id`].
 925
 926        Returns:
 927            List of option tuples (target, score_gain, distance, bin_index)
 928        """
 929
 930        # Get the sub-types for this interaction term
 931        cur_feature_type_1 = self.ebm.feature_types[cur_feature_index_1]
 932        cur_feature_type_2 = self.ebm.feature_types[cur_feature_index_2]
 933
 934        # Get the sub-names for this interaction term
 935        cur_feature_name_1 = self.ebm.feature_names[cur_feature_index_1]
 936        cur_feature_name_2 = self.ebm.feature_names[cur_feature_index_2]
 937
 938        # The first column and row are reserved for missing values (even with
 939        # categorical features)
 940        additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 941
 942        # Four possibilities here: cont x cont, cont x cat, cat x cont, cat x cat.
 943        # Each has a different way to lookup the bin table.
 944        inter_options = []
 945
 946        # Iterate through all possible combinations of options from these two
 947        # variables
 948        for opt_1 in options[cur_feature_name_1]:
 949            for opt_2 in options[cur_feature_name_2]:
 950
 951                bin_starts_1 = self.ebm.pair_preprocessor_._get_bin_labels(
 952                    cur_feature_index_1
 953                )
 954                bin_starts_2 = self.ebm.pair_preprocessor_._get_bin_labels(
 955                    cur_feature_index_2
 956                )
 957
 958                bin_1 = None
 959                bin_2 = None
 960
 961                if cur_feature_type_1 == "continuous":
 962                    if cur_feature_type_2 == "continuous":
 963                        # cont x cont
 964                        bin_starts_1 = bin_starts_1[:-1]
 965                        bin_starts_2 = bin_starts_2[:-1]
 966
 967                        # locate the bin for each option value
 968                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 969                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 970
 971                    else:
 972                        # cont x cat
 973                        bin_starts_1 = bin_starts_1[:-1]
 974
 975                        # locate the bin for each option value
 976                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 977                        bin_2 = bin_starts_2.index(opt_2[0])
 978
 979                else:
 980                    if cur_feature_type_2 == "continuous":
 981                        # cat x cont
 982                        bin_starts_2 = bin_starts_2[:-1]
 983
 984                        # locate the bin for each option value
 985                        bin_1 = bin_starts_1.index(opt_1[0])
 986                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 987
 988                    else:
 989                        # cat x cat
 990
 991                        # locate the bin for each option value
 992                        bin_1 = bin_starts_1.index(opt_1[0])
 993                        bin_2 = bin_starts_2.index(opt_2[0])
 994
 995                new_score = additives[bin_1, bin_2]
 996                score_gain = new_score - cur_feature_score
 997
 998                # The score gain on the interaction term needs to offset the interaction
 999                # score gain we have already counted on the main effect options. That
1000                # score is saved in the option tuple.
1001
1002                # We first need to find the common interaction id
1003                common_index = [-1, -1]
1004                for m in range(len(opt_1[4])):
1005                    for n in range(len(opt_2[4])):
1006                        if opt_1[4][m][0] == opt_2[4][n][0]:
1007                            common_index = [m, n]
1008                            break
1009
1010                    if common_index[0] != -1 and common_index[1] != -1:
1011                        break
1012
1013                score_gain -= opt_1[4][common_index[0]][1]
1014                score_gain -= opt_2[4][common_index[1]][1]
1015
1016                inter_options.append(
1017                    [[opt_1[0], opt_2[0]], score_gain, 0, [opt_1[3], opt_2[3]], 0]
1018                )
1019
1020        return inter_options
1021
1022    @staticmethod
1023    def create_milp(
1024        cf_direction,
1025        needed_score_gain,
1026        features_to_vary,
1027        options,
1028        max_num_features_to_vary=None,
1029        muted_variables=[],
1030    ):
1031        """
1032        Create a MILP to find counterfactuals (CF) using PuLP.
1033
1034        Args:
1035            cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification),
1036                +1 if we need to increase the prediction, -1 if decrease (regression).
1037            needed_score_gain (float): The score gain needed to achieve the CF goal.
1038            features_to_vary (list[str]): Feature names of features that the
1039                generated CF can change.
1040            options (dict): Possible options for each variable. Each option is a
1041                list [target, score_gain, distance, bin_index].
1042            max_num_features_to_vary (int, optional): Max number of features that the
1043                generated CF can change. If the value is `None`, the CFs can
1044                change any number of features.
1045            muted_variables (list[str], optional): Variables that this MILP should
1046                not use. This is useful to mute optimal variables so we can explore
1047                diverse solutions. This list should not include interaction variables.
1048
1049        Returns:
1050            A tuple (`model`, `variables`), where `model` is a pulp.LpProblem
1051            model that encodes the MILP problem, and `variables` is a dict of
1052            variables used in the `model`: `feature_name` => [`variables`].
1053        """
1054
1055        # Create a model (minimizing the distance)
1056        model = pulp.LpProblem("ebmCounterfactual", pulp.LpMinimize)
1057
1058        distance = 0
1059        score_gain = 0
1060
1061        muted_variables_set = set(muted_variables)
1062
1063        # Create variables
1064        variables = {}
1065        for f in features_to_vary:
1066            # Each variable encodes an option (0: not use this option,
1067            # 1: use this option)
1068            cur_variables = []
1069
1070            for option in options[f]:
1071                var_name = "{}:{}".format(f, option[3])
1072
1073                # Skip the muted variables
1074                if var_name in muted_variables_set:
1075                    continue
1076
1077                x = pulp.LpVariable(var_name, lowBound=0, upBound=1, cat="Binary")
1078                x.setInitialValue(0)
1079
1080                score_gain += option[1] * x
1081                distance += option[2] * x
1082
1083                cur_variables.append(x)
1084
1085            variables[f] = cur_variables
1086
1087            # A local constraint is that we can select at most one option
1088            # from one feature
1089            model += pulp.lpSum(cur_variables) <= 1
1090
1091        # Users can also set `max_num_features_to_vary` to control the total
1092        # number of features to vary
1093        if max_num_features_to_vary is not None:
1094            main_variables = []
1095            for f in variables:
1096                main_variables.extend(variables[f])
1097
1098            model += pulp.lpSum(main_variables) <= max_num_features_to_vary
1099
1100        # Create variables for interaction effects
1101        for opt_name in options:
1102            if " x " in opt_name:
1103                f1_name = re.sub(r"(.+)\sx\s.+", r"\1", opt_name)
1104                f2_name = re.sub(r".+\sx\s(.+)", r"\1", opt_name)
1105
1106                if f1_name in features_to_vary and f2_name in features_to_vary:
1107
1108                    # We need to include this interaction effect
1109                    cur_variables = []
1110
1111                    for option in options[opt_name]:
1112                        z = pulp.LpVariable(
1113                            "{}:{},{}".format(opt_name, option[3][0], option[3][1]),
1114                            lowBound=0,
1115                            upBound=1,
1116                            cat="Continuous",
1117                        )
1118                        z.setInitialValue(0)
1119
1120                        # Need to iterate through existing variables for f1 and f2 to find
1121                        # the corresponding variables
1122                        x_f1 = None
1123                        x_f2 = None
1124
1125                        # Skip this interaction variable if it involves a muted main variable
1126                        x_f1_name = "{}:{}".format(f1_name, option[3][0])
1127                        x_f2_name = "{}:{}".format(f2_name, option[3][1])
1128
1129                        if (
1130                            x_f1_name in muted_variables_set
1131                            or x_f2_name in muted_variables_set
1132                        ):
1133                            continue
1134
1135                        for x in variables[f1_name]:
1136                            if x.name == x_f1_name:
1137                                x_f1 = x
1138                                break
1139
1140                        for x in variables[f2_name]:
1141                            if x.name == x_f2_name:
1142                                x_f2 = x
1143                                break
1144
1145                        assert x_f1 is not None and x_f2 is not None
1146
1147                        # variable z is actually the product of x_f1 and x_f2
1148                        # We can linearize it by 3 constraints
1149                        model += z <= x_f1
1150                        model += z <= x_f2
1151                        model += z >= x_f1 + x_f2 - 1
1152
1153                        cur_variables.append(z)
1154
1155                    variables[opt_name] = cur_variables
1156
1157        # Use constraint to express counterfactual
1158        if cf_direction == 1:
1159            model += score_gain >= needed_score_gain
1160        else:
1161            model += score_gain <= needed_score_gain
1162
1163        # We want to minimize the distance
1164        model += distance
1165
1166        return model, variables
1167
1168    def print_solution(self, cur_example, active_variables, options):
1169        """
1170        Print the optimal solution.
1171
1172        Args:
1173            cur_example (np.ndarray): the original data point.
1174            active_variables (list[variable]): binary variables with value 1.
1175            options (dict): all the possible options for all features.
1176        """
1177
1178        for var in active_variables:
1179            # Main-effect variables; interaction variables are handled in the else branch
1180            if "_x_" not in var.name:
1181                f_name = re.sub(r"(.+):\d+", r"\1", var.name)
1182                bin_i = int(re.sub(r".+:(\d+)", r"\1", var.name))
1183
1184                # Find the original value
1185                org_value = cur_example[0][self.ebm.feature_names.index(f_name)]
1186
1187                # Find the target bin
1188                f_index = self.ebm.feature_names.index(f_name)
1189                f_type = self.ebm.feature_types[f_index]
1190
1191                if f_type == "continuous":
1192                    bin_starts = self.ebm.preprocessor_._get_bin_labels(f_index)[:-1]
1193
1194                    target_bin = "[{},".format(bin_starts[bin_i])
1195
1196                    if bin_i + 1 < len(bin_starts):
1197                        target_bin += " {})".format(bin_starts[bin_i + 1])
1198                    else:
1199                        target_bin += " inf)"
1200                else:
1201                    target_bin = ""
1202                    org_value = '"{}"'.format(org_value)
1203
1204                for option in options[f_name]:
1205                    if option[3] == bin_i:
1206                        print(
1207                            "Change <{}> from {} to {} {}".format(
1208                                f_name, org_value, option[0], target_bin
1209                            )
1210                        )
1211                        print(
1212                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1213                                option[1], option[2]
1214                            )
1215                        )
1216                        break
1217
1218            else:
1219                f_name = re.sub(r"(.+):.+", r"\1", var.name)
1220                f_name = f_name.replace("_x_", " x ")
1221                bin_0 = int(re.sub(r".+:(\d+),\d+", r"\1", var.name))
1222                bin_1 = int(re.sub(r".+:\d+,(\d+)", r"\1", var.name))
1223
1224                for option in options[f_name]:
1225                    if option[3][0] == bin_0 and option[3][1] == bin_1:
1226                        print("Trigger interaction term: <{}>".format(f_name))
1227                        print(
1228                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1229                                option[1], 0
1230                            )
1231                        )
1232                        break
1233        print()
1234
1235    @staticmethod
1236    def compute_mad(xs):
1237        """
1238        Compute the median absolute deviation of a continuous feature.
1239
1240        Args:
1241            xs (np.ndarray): A column of continuous values.
1242
1243        Returns:
1244            float: MAD value of xs.
1245        """
1246        xs_median = np.median(xs.astype(float))
1247        mad = np.median(np.abs(xs.astype(float) - xs_median))
1248        return mad
1249
1250    @staticmethod
1251    def compute_frequency_distance(xs):
1252        """
1253        For categorical variables, we compute 1 - frequency as their distance. It implies
1254        that switching to a frequent value takes less effort.
1255
1256        Args:
1257            xs (np.ndarray): A column of categorical values.
1258
1259        Returns:
1260            dict: category level -> 1 - frequency.
1261        """
1262        counter = Counter(xs)
1263
1264        results = {}
1265
1266        for key in counter:
1267            results[key] = 1 - (counter[key] / len(xs))
1268
1269        return results
1270
1271    @staticmethod
1272    def compute_naive_cat_distance(xs):
1273        """
1274        Alternative to compute_frequency_distance() to compute distance for
1275        categorical variables. The distance is 1 for different levels and 0 for
1276        the same level. Here we give them all score 1, because same-level
1277        options will be filtered out when we create categorical options for the
1278        optimization program.
1279
1280        Args:
1281            xs (np.ndarray): A column of categorical values.
1282
1283        Returns:
1284            dict: category level -> 1.
1285        """
1286        counter = Counter(xs)
1287        results = {}
1288
1289        for key in counter:
1290            results[key] = 1
1291
1292        return results

Main class for GAM Coach.

GAMCoach( ebm: Union[interpret.glassbox.ebm.ebm.ExplainableBoostingClassifier, interpret.glassbox.ebm.ebm.ExplainableBoostingRegressor], x_train: numpy.ndarray, cont_mads=None, cat_distances=None, adjust_cat_distance=True)
 29    def __init__(
 30        self,
 31        ebm: Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor],
 32        x_train: np.ndarray,
 33        cont_mads=None,
 34        cat_distances=None,
 35        adjust_cat_distance=True,
 36    ):
 37        """Initialize a GAMCoach object.
 38
 39        Args:
 40            ebm (Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor]):
 41                The trained EBM model. It can be either a classifier or a regressor.
 42            x_train (np.ndarray): The training data. It is used to compute the
 43                distance for different features.
 44            cont_mads (dict, optional): `feature_name` -> `median absolute
 45                deviation score`. If it is provided, it is used to overwrite the
 46                computed MADs for continuous variables. It is useful when you
 47                want to provide a custom normalization function to compute the
 48                distance between continuous features.
 49            cat_distances (dict, optional): `feature_name` -> {`level_name` -> `distance`}.
 50                Level distance of categorical variables. By default, the distance
 51                is computed by (1 - frequency(level)) for each level. It implies
 52                that it is easier to move to a more frequent level. If `cat_distances`
 53                is provided, it will overwrite the default distance for
 54                categorical variables.
 55            adjust_cat_distance (bool, optional): If true, we use (1 -
 56                frequency(level)) for each level. Otherwise, we give distance = 1
 57                for different levels and 0 for the same level.
 58        """
 59
 60        self.ebm: Union[
 61            ExplainableBoostingClassifier, ExplainableBoostingRegressor
 62        ] = ebm
 63        """The trained EBM model."""
 64
 65        self.x_train: np.ndarray = x_train
 66
 67        self.cont_mads: dict = cont_mads
 68        """Median absolute deviation (MAD) of continuous variables."""
 69
 70        self.cat_distances: dict = cat_distances
 71        """Level distance of categorical variables. By default, the distance is
 72        computed by $(1 - \\frac{\\text{count of} L_i}{\\text{count of all L}})$
 73        for one level $L_i$. It implies that it is easier to move to a more
 74        frequent level.
 75        """
 76
 77        self.adjust_cat_distance: bool = adjust_cat_distance
 78
 79        # If cont_mads is not given, we compute it from the training data
 80        if self.cont_mads is None:
 81            ebm_cont_indexes = np.array(
 82                [
 83                    i
 84                    for i in range(len(self.ebm.feature_names))
 85                    if self.ebm.feature_types[i] == "continuous"
 86                ]
 87            )
 88
 89            self.cont_mads = {}
 90
 91            for i in ebm_cont_indexes:
 92                self.cont_mads[ebm.feature_names[i]] = self.compute_mad(
 93                    self.x_train[:, i]
 94                )
 95
 96        # If cat_distance is not given, we compute it from the training data
 97        if self.cat_distances is None:
 98            ebm_cat_indexes = np.array(
 99                [
100                    i
101                    for i in range(len(self.ebm.feature_names))
102                    if self.ebm.feature_types[i] == "categorical"
103                ]
104            )
105
106            self.cat_distances = {}
107
108            if self.adjust_cat_distance:
109                for i in ebm_cat_indexes:
110                    self.cat_distances[
111                        self.ebm.feature_names[i]
112                    ] = GAMCoach.compute_frequency_distance(self.x_train[:, i])
113            else:
114                for i in ebm_cat_indexes:
115                    self.cat_distances[
116                        self.ebm.feature_names[i]
117                    ] = GAMCoach.compute_naive_cat_distance(self.x_train[:, i])
118
119        # Determine if the ebm is a classifier or a regressor
120        self.is_classifier = isinstance(self.ebm.intercept_, np.ndarray)
121        """True if the ebm model is a classifier, false if it is a regressor."""

Initialize a GAMCoach object.

Args
  • ebm (Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor]): The trained EBM model. It can be either a classifier or a regressor.
  • x_train (np.ndarray): The training data. It is used to compute the distance for different features.
  • cont_mads (dict, optional): feature_name -> median absolute deviation score. If it is provided, it is used to overwrite the computed MADs for continuous variables. It is useful when you want to provide a custom normalization function to compute the distance between continuous features.
  • cat_distances (dict, optional): feature_name -> {level_name -> distance}. Level distance of categorical variables. By default, the distance is computed by (1 - frequency(level)) for each level. It implies that it is easier to move to a more frequent level. If cat_distances is provided, it will overwrite the default distance for categorical variables.
  • adjust_cat_distance (bool, optional): If true, we use (1 - frequency(level)) for each level. Otherwise, we give distance = 1 for different levels and 0 for the same level.
ebm: Union[interpret.glassbox.ebm.ebm.ExplainableBoostingClassifier, interpret.glassbox.ebm.ebm.ExplainableBoostingRegressor]

The trained EBM model.

cont_mads: dict

Median absolute deviation (MAD) of continuous variables.

cat_distances: dict

Level distance of categorical variables. By default, the distance is computed by $(1 - \frac{\text{count of } L_i}{\text{count of all } L})$ for one level $L_i$. It implies that it is easier to move to a more frequent level.

is_classifier

True if the ebm model is a classifier, false if it is a regressor.
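
A minimal construction sketch (not part of the module source): it trains a toy EBM classifier on synthetic data and wraps it in a GAMCoach. The feature names, the synthetic data, and the top-level import path are illustrative assumptions.

    import numpy as np
    from interpret.glassbox import ExplainableBoostingClassifier

    from gamcoach import GAMCoach  # top-level import path assumed

    rng = np.random.default_rng(0)

    # Synthetic training data with two continuous features (illustrative only)
    x_train = rng.normal(size=(500, 2))
    y_train = (x_train[:, 0] + x_train[:, 1] > 0).astype(int)

    ebm = ExplainableBoostingClassifier(
        feature_names=["feature_a", "feature_b"],
        feature_types=["continuous", "continuous"],
    )
    ebm.fit(x_train, y_train)

    # MADs for continuous features (and level distances for categorical ones,
    # if any) are derived from x_train unless cont_mads / cat_distances are given
    coach = GAMCoach(ebm, x_train)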

def generate_cfs( self, cur_example: numpy.ndarray, total_cfs: int = 1, target_range: tuple = None, sim_threshold_factor: float = 0.005, sim_threshold: float = None, categorical_weight: Union[float, str] = 'auto', features_to_vary: list = None, max_num_features_to_vary: int = None, feature_ranges: dict = None, continuous_integer_features: list = None, verbose: int = 1) -> gamcoach.counterfactuals.Counterfactuals:
123    def generate_cfs(
124        self,
125        cur_example: np.ndarray,
126        total_cfs: int = 1,
127        target_range: tuple = None,
128        sim_threshold_factor: float = 0.005,
129        sim_threshold: float = None,
130        categorical_weight: Union[float, str] = "auto",
131        features_to_vary: list = None,
132        max_num_features_to_vary: int = None,
133        feature_ranges: dict = None,
134        continuous_integer_features: list = None,
135        verbose: int = 1,
136    ) -> Counterfactuals:
137        """Generate counterfactual examples.
138
139        Use mixed-integer linear programming to generate optimal counterfactual
140        examples for the given data point.
141
142        Args:
143            cur_example (np.ndarray): The data point of interest. This function
144                aims to find similar examples for which the model gives different
145                predictions.
146            total_cfs (int, optional): The total number of counterfactuals to
147                generate. Defaults to 1.
148            target_range (tuple, optional): The targeted prediction range. This
149                parameter is required if the EBM is a regressor.
150            sim_threshold_factor (float, optional): A positive float to automatically
151                generate a similarity threshold. This parameter has no effect if
152                `sim_threshold` is provided. If `sim_threshold` is
153                not provided, we compute `sim_threshold` as `sim_threshold_factor`
154                * average additive score range of all continuous features. If
155                `sim_threshold_factor` is too small, it takes longer time to
156                generate CFs. If `sim_threshold_factor` is too large, the
157                algorithm might miss some optimal CFs.
158            sim_threshold (float, optional): A positive float to determine how we
159                decide if two bins of a continuous feature have similar scores.
160                Two bins $b_1$ and $b_2$ are similar (the distant one will be
161                removed) if $|b_1 - b_2| \\leq$ `sim_threshold`.
162            categorical_weight (Union[float, str], optional): A positive float
163                to scale the distances of options for categorical variables. Since
164                we have very different distance functions for continuous and
165                categorical features, we need to scale them so they are at a
166                comparable range. To do that, we multiply the categorical feature's
167                distances by `categorical_weight`. By default ('auto'), we scale
168                the distances of categorical features so that they have the same
169                mean distance as continuous features.
170            features_to_vary ([str], optional): A list of feature names that
171                the CFs can change. If it is `None`, this function will use all
172                features.
173            max_num_features_to_vary (int, optional): The max number of features
174                that the CF can vary. Default is no maximum.
175            feature_ranges (dict, optional): A dictionary to control the permitted
176                ranges/values for continuous/categorical features. It maps
177                `feature_name` -> [`min_value`, `max_value`] for continuous features,
178                `feature_name` -> [`level1`, `level2`, ...] for categorical features.
179            continuous_integer_features (list, optional): A list of names of
180                continuous features that need to be integers (e.g., age, FICO score)
181            verbose (int): 0: no output, 1: show progress bar, 2: show internal
182                optimization details
183
184        Returns:
185            Counterfactuals: The generated counterfactual examples with their
186                associated distances and change information.
187        """
188
189        # Transforming some parameters
190        if len(cur_example.shape) == 1:
191            cur_example = cur_example.reshape(1, -1)
192
193        if features_to_vary is None:
194            features_to_vary = [
195                self.ebm.feature_names[i]
196                for i in range(len(self.ebm.feature_types))
197                if self.ebm.feature_types[i] != "interaction"
198            ]
199
200        # Step 1: Find the current score for each feature
201        # This is done by ebm.explain_local()
202        cur_scores = {}
203
204        if self.is_classifier:
205            cur_scores["intercept"] = self.ebm.intercept_[0]
206        else:
207            cur_scores["intercept"] = self.ebm.intercept_
208
209        local_data = self.ebm.explain_local(cur_example)._internal_obj
210
211        for i in range(len(self.ebm.feature_names)):
212            cur_feature_name = self.ebm.feature_names[i]
213            cur_feature_type = self.ebm.feature_types[i]
214
215            cur_scores[cur_feature_name] = local_data["specific"][0]["scores"][i]
216
217        # Find the CF direction
218
219        # Binary classification
220        # Predicted 0 => +1
221        # Predicted 1 => -1
222        if self.is_classifier:
223            cf_direction = self.ebm.predict(cur_example)[0] * (-2) + 1
224            total_score = np.sum([cur_scores[k] for k in cur_scores])
225            needed_score_gain = -total_score
226            score_gain_bound = None
227
228        else:
229            # Regression
230            # Increase +1
231            # Decrease -1
232            if target_range is None:
233                raise ValueError(
234                    "target_range cannot be None when the model is a regressor"
235                )
236
237            predicted_value = self.ebm.predict(cur_example)[0]
238            if (
239                predicted_value >= target_range[0]
240                and predicted_value <= target_range[1]
241            ):
242                raise ValueError("The target_range cannot cover the current prediction")
243
244            elif predicted_value < target_range[0]:
245                cf_direction = 1
246                needed_score_gain = target_range[0] - predicted_value
247                score_gain_bound = target_range[1] - predicted_value
248            else:
249                cf_direction = -1
250                needed_score_gain = target_range[1] - predicted_value
251                score_gain_bound = target_range[0] - predicted_value
252
253        # Step 2: Generate continuous and categorical options
254        options = {}
255
256        # Generate a similarity threshold if it is not provided
257        if sim_threshold is None:
258            additive_ranges = []
259
260            for i in range(len(self.ebm.feature_names)):
261                if self.ebm.feature_types[i] == "continuous":
262                    cur_values = self.ebm.additive_terms_[i]
263                    additive_ranges.append(np.max(cur_values) - np.min(cur_values))
264
265            sim_threshold = np.mean(additive_ranges) * sim_threshold_factor
266
267        # To make it faster to solve the MILP problem, we can decrease the
268        # number of variables by filtering out unhelpful and redundant options
269        #
270        # (1) Unhelpful options: options that move the score to an undesirable
271        # direction. For example, if we want to flip 0 to 1, options that decrease
272        # the score are unhelpful.
273        #
274        # (2) Redundant options: for a set of options that give similar score
275        # gains (bounded by a parameter epsilon), we only need to include one
276        # option that has the lowest distance. This is only relevant for
277        # continuous variables. Users can set the parameter epsilon. The default
278        # should be relatively small, otherwise we might miss the optimal solution.
279
280        # Step 2.1: Find all good options from continuous and categorical features
281        for cur_feature_id in range(len(self.ebm.feature_names)):
282
283            cur_feature_name = self.ebm.feature_names[cur_feature_id]
284            cur_feature_type = self.ebm.feature_types[cur_feature_id]
285            cur_feature_index = self.ebm.feature_groups_[cur_feature_id][0]
286
287            if cur_feature_type == "interaction":
288                continue
289
290            elif cur_feature_type == "continuous":
291                # The parameter epsilon controls the threshold of how we determine
292                # "similar" options for continuous variables
293                epsilon = sim_threshold
294
295                cur_feature_score = cur_scores[cur_feature_name]
296                cur_feature_value = float(cur_example[0][cur_feature_id])
297
298                # Users can require the continuous feature to have integer values
299                # For example, age, FICO score, and number of accounts
300                need_to_be_int = False
301                if (
302                    continuous_integer_features
303                    and cur_feature_name in continuous_integer_features
304                ):
305                    need_to_be_int = True
306
307                cur_cont_options = self.generate_cont_options(
308                    cf_direction,
309                    cur_feature_index,
310                    cur_feature_name,
311                    cur_feature_value,
312                    cur_feature_score,
313                    self.cont_mads,
314                    cur_example[0],
315                    score_gain_bound,
316                    epsilon,
317                    need_to_be_int,
318                )
319
320                options[cur_feature_name] = cur_cont_options
321
322            elif cur_feature_type == "categorical":
323                cur_feature_score = cur_scores[cur_feature_name]
324                cur_feature_value = str(cur_example[0][cur_feature_id])
325                cur_cat_distance = self.cat_distances[cur_feature_name]
326
327                cur_cat_options = self.generate_cat_options(
328                    cf_direction,
329                    cur_feature_index,
330                    cur_feature_value,
331                    cur_feature_score,
332                    cur_cat_distance,
333                    cur_example[0],
334                    score_gain_bound,
335                )
336
337                options[cur_feature_name] = cur_cat_options
338
339        # Step 2.2: Filter out undesired options (based on the feature_range)
340        if feature_ranges is not None:
341            for f_name in feature_ranges:
342                cur_range = feature_ranges[f_name]
343                f_index = self.ebm.feature_names.index(f_name)
344                f_type = self.ebm.feature_types[f_index]
345
346                if f_type == "continuous":
347                    # Delete options that use out-of-range options
348                    for o in range(len(options[f_name]) - 1, -1, -1):
349                        cur_target = options[f_name][o][0]
350                        if cur_target < cur_range[0] or cur_target > cur_range[1]:
351                            options[f_name].pop(o)
352                elif f_type == "categorical":
353                    for o in range(len(options[f_name]) - 1, -1, -1):
354                        if options[f_name][o][0] not in cur_range:
355                            options[f_name].pop(o)
356
357        # Step 2.3: Compute the interaction offsets for all possible options
358        for cur_feature_id in range(len(self.ebm.feature_names)):
359
360            cur_feature_name = self.ebm.feature_names[cur_feature_id]
361            cur_feature_type = self.ebm.feature_types[cur_feature_id]
362
363            if cur_feature_type == "interaction":
364
365                cur_feature_index_1 = self.ebm.feature_groups_[cur_feature_id][0]
366                cur_feature_index_2 = self.ebm.feature_groups_[cur_feature_id][1]
367
368                cur_feature_score = cur_scores[cur_feature_name]
369                options[cur_feature_name] = self.generate_inter_options(
370                    cur_feature_id,
371                    cur_feature_index_1,
372                    cur_feature_index_2,
373                    cur_feature_score,
374                    options,
375                )
376
377        # Step 2.4: Rescale categorical distances so that they have the same mean
378        # as continuous variables (default)
379        if categorical_weight == "auto":
380            cont_distances = []
381            cat_distances = []
382
383            for f_name in options:
384                f_index = self.ebm.feature_names.index(f_name)
385                f_type = self.ebm.feature_types[f_index]
386
387                if f_type == "continuous":
388                    for option in options[f_name]:
389                        cont_distances.append(option[2])
390                elif f_type == "categorical":
391                    for option in options[f_name]:
392                        cat_distances.append(option[2])
393
394            categorical_weight = np.mean(cont_distances) / np.mean(cat_distances)
395
396        for f_name in options:
397            f_index = self.ebm.feature_names.index(f_name)
398            f_type = self.ebm.feature_types[f_index]
399
400            if f_type == "categorical":
401                for option in options[f_name]:
402                    option[2] = option[2] * categorical_weight
403
404        # Step 3. Formulate the MILP model and solve it
405
406        # Find diverse solutions by accumulatively muting the optimal solutions
407        solutions = []
408        muted_variables = []
409        is_successful = True
410
411        for _ in tqdm(range(total_cfs), disable=verbose == 0):
412            model, variables = self.create_milp(
413                cf_direction,
414                needed_score_gain,
415                features_to_vary,
416                options,
417                max_num_features_to_vary,
418                muted_variables=muted_variables,
419            )
420
421            model.solve(pulp.apis.PULP_CBC_CMD(msg=verbose > 0, warmStart=True))
422
423            if model.status != 1:
424                is_successful = False
425
426            if verbose == 2:
427                print("solver runs for {:.2f} seconds".format(model.solutionTime))
428                print("status: {}".format(pulp.LpStatus[model.status]))
429
430            active_variables = []
431
432            # Print the optimal solution
433            for key in variables:
434                for x in variables[key]:
435                    if x.varValue > 0:
436                        active_variables.append(x)
437
438            if verbose == 2:
439                print("\nFound solutions:")
440                self.print_solution(cur_example, active_variables, options)
441
442            # Collect the current solution and mute the associated variables
443            solutions.append([active_variables, pulp.value(model.objective)])
444
445            for var in active_variables:
446                if " x " not in var.name:
447                    muted_variables.append(var.name)
448
449        cfs = Counterfactuals(
450            solutions, is_successful, model, variables, self.ebm, cur_example, options
451        )
452
453        return cfs

Generate counterfactual examples.

Use mixed-integer linear programming to generate optimal counterfactual examples for the given data point.

Args
  • cur_example (np.ndarray): The data point of interest. This function aims to find similar examples for which the model gives different predictions.
  • total_cfs (int, optional): The total number of counterfactuals to generate. Defaults to 1.
  • target_range (tuple, optional): The targeted prediction range. This parameter is required if the EBM is a regressor.
  • sim_threshold_factor (float, optional): A positive float to automatically generate a similarity threshold. This parameter has no effect if sim_threshold is provided. If sim_threshold is not provided, we compute sim_threshold as sim_threshold_factor * average additive score range of all continuous features. If sim_threshold_factor is too small, it takes longer time to generate CFs. If sim_threshold_factor is too large, the algorithm might miss some optimal CFs.
  • sim_threshold (float, optional): A positive float to determine how we decide if two bins of a continuous feature have similar scores. Two bins $b_1$ and $b_2$ are similar (the distant one will be removed) if $|b_1 - b_2| \leq$ sim_threshold.
  • categorical_weight (Union[float, str], optional): A positive float to scale the distances of options for categorical variables. Since we have very different distance functions for continuous and categorical features, we need to scale them so they are at a comparable range. To do that, we multiply the categorical feature's distances by categorical_weight. By default ('auto'), we scale the distances of categorical features so that they have the same mean distance as continuous features.
  • features_to_vary ([str], optional): A list of feature names that the CFs can change. If it is None, this function will use all features.
  • max_num_features_to_vary (int, optional): The max number of features that the CF can vary. Default is no maximum.
  • feature_ranges (dict, optional): A dictionary to control the permitted ranges/values for continuous/categorical features. It maps feature_name -> [min_value, max_value] for continuous features, feature_name -> [level1, level2, ...] for categorical features.
  • continuous_integer_features (list, optional): A list of names of continuous features that need to be integers (e.g., age, FICO score)
  • verbose (int): 0: no output, 1: show progress bar, 2: show internal optimization details
Returns

Counterfactuals: The generated counterfactual examples with their associated distances and change information.
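
A minimal usage sketch, continuing the construction example above; the feature names, permitted range, and number of counterfactuals are illustrative assumptions.

    # Ask for five diverse counterfactuals for one training example, letting
    # only "feature_a" change and keeping it inside an assumed permitted range
    cur_example = x_train[0]

    cfs = coach.generate_cfs(
        cur_example,
        total_cfs=5,
        features_to_vary=["feature_a"],
        feature_ranges={"feature_a": [-2.0, 2.0]},
        verbose=0,
    )

For a regressor, target_range is required (e.g., target_range=(low, high)), and it must not contain the current prediction.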

def generate_cont_options( self, cf_direction, cur_feature_index, cur_feature_name, cur_feature_value, cur_feature_score, cont_mads, cur_example, score_gain_bound=None, epsilon=0.005, need_to_be_int=False, skip_unhelpful=True):
455    def generate_cont_options(
456        self,
457        cf_direction,
458        cur_feature_index,
459        cur_feature_name,
460        cur_feature_value,
461        cur_feature_score,
462        cont_mads,
463        cur_example,
464        score_gain_bound=None,
465        epsilon=0.005,
466        need_to_be_int=False,
467        skip_unhelpful=True,
468    ):
469        """
470        Generate all alternative options for this continuous variable. This function
471        would filter out all options that are:
472
473        1. Not helpful for the counterfactual generation.
474        2. Giving a similar score gain but requiring a larger distance.
475
476        Args:
477            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
478                (classification); `+1` if we need to increase the prediction,
479                `-1` if decrease (regression).
480            cur_feature_index (int): The index of the current continuous feature.
481            cur_feature_name (str): Name of the current feature.
482            cur_feature_value (float): The current feature value.
483            cur_feature_score (float): The score for the current feature value.
484            cont_mads (dict): A map of feature_name => MAD score.
485            cur_example (list): Current sample values.
486            score_gain_bound (float): Bound of the score gain. We do not collect
487                options that give `score_gain` > `score_gain_bound` (when
488                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
489                `cf_direction=-1`)
490            epsilon (float): The threshold to determine if two options give similar
491                score gains. Score gains $s_1$ and $s_2$ are similar if
492                $|s_1 - s_2| <$ epsilon. Smaller epsilon significantly increases
493                the time to solve the MILP. Large epsilon might filter out the
494                optimal CF. Defaults to 0.005.
495            need_to_be_int (bool): True if the target values for this continuous
496                variable need to have integer values.
497            skip_unhelpful (bool): True to skip options from main
498                effects that give score gain in the opposite direction. It is
499                rare for a positive score gain from a pair interaction to outweigh
500                the negative score gain from two main effects plus the distance penalty.
501
502        Returns:
503            list: List of option tuples (target, score_gain, distance, bin_index, inter_score_gains)
504        """
505
506        # For each continuous feature, each bin is a variable
507        # For each bin, we need to compute (1) score gain, (2) distance
508        # (1) score gain is the difference between new bin and current bin
509        # (2) distance is L1 distance divided by median absolute deviation (MAD)
510
511        # Get the additive scores of this feature
512        additives = self.ebm.additive_terms_[cur_feature_index][1:]
513
514        # Get the bin edges of this feature
515        bin_starts = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)[:-1]
516
517        # Create "options", each option is a tuple (target, score_gain, distance,
518        # bin_index)
519        cont_options = []
520
521        # Identify which bin this value falls into
522        cur_bin_id = search_sorted_lower_index(bin_starts, cur_feature_value)
523        assert additives[cur_bin_id] == cur_feature_score
524
525        # Identify interaction terms that we need to consider
526        associated_interactions = []
527
528        for cur_feature_id in range(len(self.ebm.feature_names)):
529            cur_feature_type = self.ebm.feature_types[cur_feature_id]
530            if cur_feature_type == "interaction":
531
532                indexes = self.ebm.feature_groups_[cur_feature_id]
533
534                if cur_feature_index in indexes:
535                    feature_position = 0 if indexes[0] == cur_feature_index else 1
536
537                    other_position = 1 - feature_position
538                    other_index = indexes[other_position]
539                    other_type = self.ebm.feature_types[other_index]
540
541                    # Get the current additive scores and bin edges
542                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
543
544                    # Have to skip the max edge if it is continuous
545                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
546                        cur_feature_index
547                    )[:-1]
548
549                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
550                        other_index
551                    )
552                    if other_type == "continuous":
553                        bin_starts_other = bin_starts_other[:-1]
554
555                    # Get the current interaction term score
556                    other_bin = None
557                    if other_type == "continuous":
558                        other_bin = search_sorted_lower_index(
559                            bin_starts_other, float(cur_example[other_index])
560                        )
561                    else:
562                        other_bin = bin_starts_other.index(cur_example[other_index])
563
564                    feature_bin = search_sorted_lower_index(
565                        bin_starts_feature, cur_feature_value
566                    )
567
568                    feature_inter_score = 0
569
570                    if feature_position == 0:
571                        feature_inter_score = inter_additives[feature_bin, other_bin]
572                    else:
573                        feature_inter_score = inter_additives[other_bin, feature_bin]
574
575                    # Extract the row or column where we fix the other feature and
576                    # vary the current feature
577                    feature_inter_bin_starts = bin_starts_feature
578                    feature_inter_additives = []
579
580                    if feature_position == 0:
581                        for i in range(len(inter_additives)):
582                            feature_inter_additives.append(
583                                inter_additives[i, other_bin]
584                            )
585                    else:
586                        for i in range(len(inter_additives[0])):
587                            feature_inter_additives.append(
588                                inter_additives[other_bin, i]
589                            )
590
591                    # Register this interaction term
592                    associated_interactions.append(
593                        {
594                            "inter_index": indexes,
595                            "cur_interaction_id": cur_feature_id,
596                            "feature_inter_score": feature_inter_score,
597                            "feature_inter_bin_starts": feature_inter_bin_starts,
598                            "feature_inter_additives": feature_inter_additives,
599                        }
600                    )
601
602        for i in range(len(additives)):
603            # Because of the special binning structure of EBM, the distance of
604            # bins on the left to the current value is different from the bins
605            # that are on the right
606            #
607            # For bins on the left, the raw distance is abs(bin_start[i + 1] - x)
608            # For bins on the right, the raw distance is abs(bin_start[i] - x)
609            target = cur_feature_value
610            distance = 0
611
612            if i < cur_bin_id:
613                # First need to consider if it is need to be an integer
614                # If so, it would be the closest integer to the right point
615                if need_to_be_int:
616                    target = float(int(bin_starts[i + 1]))
617                    if target == bin_starts[i + 1]:
618                        target -= 1
619
620                    # Skip this option if it is not possible to find an int value
621                    if target < bin_starts[i]:
622                        continue
623
624                    distance = np.abs(target - cur_feature_value)
625
626                else:
627                    target = bin_starts[i + 1]
628                    distance = np.abs(target - cur_feature_value)
629
630                    # Subtract a very small value to make the target
631                    # technically fall into the left bin
632                    target -= 1e-4
633
634            elif i > cur_bin_id:
635                # First need to consider if it should be an integer value
636                # If so, it would be the closest integer to the left point
637                if need_to_be_int:
638                    target = float(np.ceil(bin_starts[i]))
639                    if target == bin_starts[i]:
640                        target += 1
641
642                    # Skip this option if it is not possible to find an int value
643                    if i + 1 < len(additives) and target >= bin_starts[i + 1]:
644                        continue
645
646                    distance = np.abs(target - cur_feature_value)
647
648                else:
649                    target = bin_starts[i]
650                    distance = np.abs(target - cur_feature_value)
651
652            # Scale the distance based on the deviation of the feature (how changeable it is)
653            if cont_mads[cur_feature_name] > 0:
654                distance /= cont_mads[cur_feature_name]
655
656            # Compute score gain which has two parts:
657            # (1) gain from the change of main effect
658            # (2) gain from the change of interaction effect
659
660            # Main effect
661            main_score_gain = additives[i] - cur_feature_score
662
663            # Interaction terms
664            # A list to track all interaction score gain offsets
665            # [[interaction id, interaction score gain]]
666            inter_score_gain = 0
667            inter_score_gains = []
668
669            for d in associated_interactions:
670                inter_bin_id = search_sorted_lower_index(
671                    d["feature_inter_bin_starts"], target
672                )
673                inter_score_gain += (
674                    d["feature_inter_additives"][inter_bin_id]
675                    - d["feature_inter_score"]
676                )
677                inter_score_gains.append(
678                    [
679                        d["cur_interaction_id"],
680                        d["feature_inter_additives"][inter_bin_id]
681                        - d["feature_inter_score"],
682                    ]
683                )
684
685            score_gain = main_score_gain + inter_score_gain
686
687            if cf_direction * score_gain <= 0 and skip_unhelpful:
688                continue
689
690            # Filter out of bound options
691            if score_gain_bound and skip_unhelpful:
692                if cf_direction == 1 and score_gain > score_gain_bound:
693                    continue
694                if cf_direction == -1 and score_gain < score_gain_bound:
695                    continue
696
697            cont_options.append([target, score_gain, distance, i, inter_score_gains])
698
699        # Now we can apply the second round of filtering to remove redundant options
700        # Redundant options refer to bins that give similar score gain with larger distance
701        cont_options = sorted(cont_options, key=lambda x: x[2])
702
703        start = 0
704        while start < len(cont_options):
705            for i in range(len(cont_options) - 1, start, -1):
706                if np.abs(cont_options[i][1] - cont_options[start][1]) < epsilon:
707                    cont_options.pop(i)
708
709            start += 1
710
711        return cont_options

Generate all alternative options for this continuous variable. This function filters out all options that:

  1. Are not helpful for the counterfactual generation.
  2. Give a similar score gain but require a larger distance.

Args
  • cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification); +1 if we need to increase the prediction, -1 if decrease (regression).
  • cur_feature_index (int): The index of the current continuous feature.
  • cur_feature_name (str): Name of the current feature.
  • cur_feature_value (float): The current feature value.
  • cur_feature_score (float): The score for the current feature value.
  • cont_mads (dict): A map of feature_name => MAD score.
  • cur_example (list): Current sample values
  • score_gain_bound (float): Bound of the score gain. We do not collect options that give score_gain > score_gain_bound (when cf_direction=1), or score_gain < score_gain_bound (when cf_direction=-1)
  • epsilon (float): The threshold to determine if two options give similar score gains. Score gains $s_1$ and $s_2$ are similar if $|s_1 - s_2| <$ epsilon. Smaller epsilon significantly increases the time to solve the MILP. Large epsilon might filter out the optimal CF. Defaults to 0.005.
  • need_to_be_int (bool): True if the target values for this continuous variable need to have integer values.
  • skip_unhelpful (bool): True if options from main effects that give an opposite score gain should be skipped. It is rare that a positive score gain from a pair interaction outweighs the negative score gain from two main effects plus the distance penalty.
Returns

list: List of option tuples (target, score gain, distance, bin_index)
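
The second filtering round described above can be illustrated in isolation. The sketch below uses made-up option tuples (not taken from a real model): options are sorted by distance, then any farther option whose score gain is within epsilon of an already-kept closer option is dropped, mirroring the loop at the end of this method.

    import numpy as np

    # Hypothetical option tuples: [target, score_gain, distance, bin_index, inter_score_gains]
    options = [
        [5.0, 0.400, 1.2, 3, []],
        [4.0, 0.150, 0.5, 2, []],
        [6.0, 0.403, 2.0, 4, []],  # nearly the same gain as the first option, but farther
    ]
    epsilon = 0.005

    # Sort by distance, then drop any farther option whose score gain is
    # within epsilon of an already-kept closer option
    options = sorted(options, key=lambda x: x[2])
    start = 0
    while start < len(options):
        for i in range(len(options) - 1, start, -1):
            if np.abs(options[i][1] - options[start][1]) < epsilon:
                options.pop(i)
        start += 1

    print(len(options))  # 2: the redundant 2.0-distance option is removed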

def generate_cat_options( self, cf_direction, cur_feature_index, cur_feature_value, cur_feature_score, cur_cat_distance, cur_example, score_gain_bound=None, skip_unhelpful=True):
713    def generate_cat_options(
714        self,
715        cf_direction,
716        cur_feature_index,
717        cur_feature_value,
718        cur_feature_score,
719        cur_cat_distance,
720        cur_example,
721        score_gain_bound=None,
722        skip_unhelpful=True,
723    ):
724        """
725        Generate all alternative options for this categorical variable. This function
726        would filter out all options that are not helpful for the counterfactual
727        generation.
728
729        Args:
730            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
731                (classification); `+1` if we need to increase the prediction,
732                `-1` if decrease (regression).
733            cur_feature_index (int): The index of the current categorical feature.
734            cur_feature_value (float): The current feature value.
735            cur_feature_score (float): The score for the current feature value.
736            cur_cat_distance (dict): A map of feature_level => 1 - frequency.
737            cur_example (list): Current sample values.
738            score_gain_bound (float): Bound of the score gain. We do not collect
739                options that give `score_gain` > `score_gain_bound` (when
740                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
741                `cf_direction=-1`)
742            skip_unhelpful (bool): True if options from main effects that give
743                an opposite score gain should be skipped. It is rare that a
744                positive score gain from a pair interaction outweighs the
745                negative score gain from two main effects plus the distance penalty.
746
747        Returns:
748            list: List of option tuples (target, score_gain, distance, bin_index).
749        """
750
751        # Find other options for this categorical variable
752        # For each option, we compute the (1) score gain, and (2) distance
753        #
754        # (1) Score gain is the same as continuous variables
755        # (2) The distance is determined by 1 - the level frequency in the
756        # training data. It implies that levels with high frequency are easier
757        # to "move to"
758
759        # Get the additive scores of this feature
760        additives = self.ebm.additive_terms_[cur_feature_index][1:]
761
762        # Get the bin edges of this feature
763        levels = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)
764
765        # Create "options", each option is a tuple (target, score_gain, distance, bin_index)
766        cat_options = []
767
768        # Identify interaction terms that we need to consider
769        associated_interactions = []
770
771        for cur_feature_id in range(len(self.ebm.feature_names)):
772            cur_feature_type = self.ebm.feature_types[cur_feature_id]
773            if cur_feature_type == "interaction":
774
775                indexes = self.ebm.feature_groups_[cur_feature_id]
776
777                if cur_feature_index in indexes:
778                    feature_position = 0 if indexes[0] == cur_feature_index else 1
779
780                    other_position = 1 - feature_position
781                    other_index = indexes[other_position]
782                    other_type = self.ebm.feature_types[other_index]
783                    other_name = self.ebm.feature_names[other_index]
784
785                    # Get the current additive scores and bin edges
786                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
787
788                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
789                        cur_feature_index
790                    )
791                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
792                        other_index
793                    )
794
795                    # Have to skip the max edge if it is continuous
796                    if other_type == "continuous":
797                        bin_starts_other = bin_starts_other[:-1]
798
799                    # Get the current interaction term score
800                    other_bin = None
801                    if other_type == "continuous":
802                        other_bin = search_sorted_lower_index(
803                            bin_starts_other, float(cur_example[other_index])
804                        )
805                    else:
806                        other_bin = bin_starts_other.index(cur_example[other_index])
807
808                    feature_bin = bin_starts_feature.index(cur_feature_value)
809
810                    feature_inter_score = 0
811
812                    if feature_position == 0:
813                        feature_inter_score = inter_additives[feature_bin, other_bin]
814                    else:
815                        feature_inter_score = inter_additives[other_bin, feature_bin]
816
817                    # Extract the row or column where we fix the other features and
818                    # vary the current feature
819                    feature_inter_bin_starts = bin_starts_feature
820                    feature_inter_additives = []
821
822                    if feature_position == 0:
823                        for i in range(len(inter_additives)):
824                            feature_inter_additives.append(
825                                inter_additives[i, other_bin]
826                            )
827                    else:
828                        for i in range(len(inter_additives[0])):
829                            feature_inter_additives.append(
830                                inter_additives[other_bin, i]
831                            )
832
833                    # Register this interaction term
834                    associated_interactions.append(
835                        {
836                            "inter_index": indexes,
837                            "cur_interaction_id": cur_feature_id,
838                            "feature_inter_score": feature_inter_score,
839                            "feature_inter_bin_starts": feature_inter_bin_starts,
840                            "feature_inter_additives": feature_inter_additives,
841                        }
842                    )
843
844        for i in range(len(additives)):
845            if levels[i] != cur_feature_value:
846                target = levels[i]
847                distance = cur_cat_distance[target]
848
849                # Compute score gain which has two parts:
850                # (1) gain from the change of main effect
851                # (2) gain from the change of interaction effect
852
853                # Main effect
854                main_score_gain = additives[i] - cur_feature_score
855
856                # Interaction terms
857                # A list to track all interaction score gain offsets
858                # [[interaction id, interaction score gain]]
859                inter_score_gain = 0
860                inter_score_gains = []
861
862                for d in associated_interactions:
863                    inter_bin_id = d["feature_inter_bin_starts"].index(target)
864                    inter_score_gain += (
865                        d["feature_inter_additives"][inter_bin_id]
866                        - d["feature_inter_score"]
867                    )
868                    inter_score_gains.append(
869                        [
870                            d["cur_interaction_id"],
871                            d["feature_inter_additives"][inter_bin_id]
872                            - d["feature_inter_score"],
873                        ]
874                    )
875
876                score_gain = main_score_gain + inter_score_gain
877
878                # Skip unhelpful options
879                if cf_direction * score_gain <= 0 and skip_unhelpful:
880                    continue
881
882                # Filter out of bound options
883                if score_gain_bound and skip_unhelpful:
884                    if cf_direction == 1 and score_gain > score_gain_bound:
885                        continue
886                    if cf_direction == -1 and score_gain < score_gain_bound:
887                        continue
888
889                cat_options.append([target, score_gain, distance, i, inter_score_gains])
890
891        return cat_options

Generate all alternative options for this categorical variable. This function would filter out all options that are not helpful for the counterfactual generation.

Args
  • cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification); +1 if we need to increase the prediction, -1 if decrease (regression).
  • cur_feature_index (int): The index of the current categorical feature.
  • cur_feature_value (float): The current feature value.
  • cur_feature_score (float): The score for the current feature value.
  • cur_cat_distance (dict): A map of feature_level => 1 - frequency.
  • cur_example (list): Current sample values.
  • score_gain_bound (float): Bound of the score gain. We do not collect options that give score_gain > score_gain_bound (when cf_direction=1), or score_gain < score_gain_bound (when cf_direction=-1)
  • skip_unhelpful (bool): True if options from main effects that give an opposite score gain should be skipped. It is rare that a positive score gain from a pair interaction outweighs the negative score gain from two main effects plus the distance penalty.
Returns

list: List of option tuples (target, score_gain, distance, bin_index).

def generate_inter_options( self, cur_feature_id, cur_feature_index_1, cur_feature_index_2, cur_feature_score, options):
 893    def generate_inter_options(
 894        self,
 895        cur_feature_id,
 896        cur_feature_index_1,
 897        cur_feature_index_2,
 898        cur_feature_score,
 899        options,
 900    ):
 901        """
 902        Generate all possible options for this interaction variable.
 903
 904        Interaction terms are interesting in this MILP. Each option counts as a
 905        variable, but each variable only affects the score gain, not the distance.
 906
 907        Note that in EBM, the bin definitions for interaction terms can be different
 908        from their definitions for individual continuous variables.
 909
 910        To model interaction terms, we can think of each interaction term as a
 911        binary variable whose value is determined by the product of two main
 912        effect variables. Each interaction variable describes a combination of
 913        two main effect options. Therefore, if continuous variable A has $x$
 914        possible options and another continuous variable B has $y$ possible
 915        options, then we should add $x \\times y$ binary variables to offset
 916        their possible interaction effects.
 917
 918        Args:
 919            cur_feature_id (int): The id of this interaction effect.
 920            cur_feature_index_1 (int): The index of the first main effect.
 921            cur_feature_index_2 (int): The index of the second main effect.
 922            cur_feature_score (float): The score for the current feature value.
 923            options (dict): The current option list, feature_name ->
 924                [`target`, `score_gain`, `distance`, `bin_id`].
 925
 926        Returns:
 927            List of option tuples (target, score_gain, distance, bin_index)
 928        """
 929
 930        # Get the sub-types for this interaction term
 931        cur_feature_type_1 = self.ebm.feature_types[cur_feature_index_1]
 932        cur_feature_type_2 = self.ebm.feature_types[cur_feature_index_2]
 933
 934        # Get the sub-names for this interaction term
 935        cur_feature_name_1 = self.ebm.feature_names[cur_feature_index_1]
 936        cur_feature_name_2 = self.ebm.feature_names[cur_feature_index_2]
 937
 938        # The first column and row are reserved for missing values (even with
 939        # categorical features)
 940        additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 941
 942        # Four possibilities here: cont x cont, cont x cat, cat x cont, cat x cat.
 943        # Each has a different way to lookup the bin table.
 944        inter_options = []
 945
 946        # Iterate through all possible combinations of options from these two
 947        # variables
 948        for opt_1 in options[cur_feature_name_1]:
 949            for opt_2 in options[cur_feature_name_2]:
 950
 951                bin_starts_1 = self.ebm.pair_preprocessor_._get_bin_labels(
 952                    cur_feature_index_1
 953                )
 954                bin_starts_2 = self.ebm.pair_preprocessor_._get_bin_labels(
 955                    cur_feature_index_2
 956                )
 957
 958                bin_1 = None
 959                bin_2 = None
 960
 961                if cur_feature_type_1 == "continuous":
 962                    if cur_feature_type_2 == "continuous":
 963                        # cont x cont
 964                        bin_starts_1 = bin_starts_1[:-1]
 965                        bin_starts_2 = bin_starts_2[:-1]
 966
 967                        # locate the bin for each option value
 968                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 969                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 970
 971                    else:
 972                        # cont x cat
 973                        bin_starts_1 = bin_starts_1[:-1]
 974
 975                        # locate the bin for each option value
 976                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 977                        bin_2 = bin_starts_2.index(opt_2[0])
 978
 979                else:
 980                    if cur_feature_type_2 == "continuous":
 981                        # cat x cont
 982                        bin_starts_2 = bin_starts_2[:-1]
 983
 984                        # locate the bin for each option value
 985                        bin_1 = bin_starts_1.index(opt_1[0])
 986                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 987
 988                    else:
 989                        # cat x cat
 990
 991                        # locate the bin for each option value
 992                        bin_1 = bin_starts_1.index(opt_1[0])
 993                        bin_2 = bin_starts_2.index(opt_2[0])
 994
 995                new_score = additives[bin_1, bin_2]
 996                score_gain = new_score - cur_feature_score
 997
 998                # The score gain on the interaction term needs to offset the interaction
 999                # score gain we have already counted on the main effect options. That
1000                # score is saved in the option tuple.
1001
1002                # We first need to find the common interaction id
1003                common_index = [-1, -1]
1004                for m in range(len(opt_1[4])):
1005                    for n in range(len(opt_2[4])):
1006                        if opt_1[4][m][0] == opt_2[4][n][0]:
1007                            common_index = [m, n]
1008                            break
1009
1010                    if common_index[0] != -1 and common_index[1] != -1:
1011                        break
1012
1013                score_gain -= opt_1[4][common_index[0]][1]
1014                score_gain -= opt_2[4][common_index[1]][1]
1015
1016                inter_options.append(
1017                    [[opt_1[0], opt_2[0]], score_gain, 0, [opt_1[3], opt_2[3]], 0]
1018                )
1019
1020        return inter_options

Generate all possible options for this interaction variable.

Interaction terms are interesting in this MILP. Each option counts as a variable, but each variable only affects the score gain, not the distance.

Note that in EBM, the bin definitions for interaction terms can be different from their definitions for individual continuous variables.

To model interaction terms, we can think of each interaction term as a binary variable whose value is determined by the product of two main effect variables. Each interaction variable describes a combination of two main effect options. Therefore, if continuous variable A has $x$ possible options and another continuous variable B has $y$ possible options, then we should add $x \times y$ binary variables to offset their possible interaction effects.

Args
  • cur_feature_id (int): The id of this interaction effect.
  • cur_feature_index_1 (int): The index of the first main effect.
  • cur_feature_index_2 (int): The index of the second main effect.
  • cur_feature_score (float): The score for the current feature value.
  • options (dict): The current option list, feature_name -> [target, score_gain, distance, bin_id].
Returns

List of option tuples (target, score_gain, distance, bin_index)
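
As a hypothetical illustration of the $x \times y$ enumeration: if one feature currently has three surviving main-effect options and the other has two, six interaction options are created. Each records only a score-gain offset (the real value would come from the pair additive table; a 0.0 placeholder is used here), a zero distance, and the pair of bin indices.

    # Hypothetical main effect options: [target, score_gain, distance, bin_index, inter_score_gains]
    options_a = [[30.0, 0.2, 0.5, 1, []], [40.0, 0.5, 1.1, 2, []], [55.0, 0.9, 2.3, 4, []]]
    options_b = [[1000.0, 0.3, 0.8, 0, []], [2000.0, 0.7, 1.6, 3, []]]

    inter_options = []
    for opt_a in options_a:
        for opt_b in options_b:
            # In the real method the score gain is looked up in the pair additive
            # table; 0.0 stands in for it here
            inter_options.append(
                [[opt_a[0], opt_b[0]], 0.0, 0, [opt_a[3], opt_b[3]], 0]
            )

    print(len(inter_options))  # 3 x 2 = 6 interaction variables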

@staticmethod
def create_milp( cf_direction, needed_score_gain, features_to_vary, options, max_num_features_to_vary=None, muted_variables=[]):
1022    @staticmethod
1023    def create_milp(
1024        cf_direction,
1025        needed_score_gain,
1026        features_to_vary,
1027        options,
1028        max_num_features_to_vary=None,
1029        muted_variables=[],
1030    ):
1031        """
1032        Create a MILP to find counterfactuals (CF) using PuLP.
1033
1034        Args:
1035            cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification),
1036                +1 if we need to increase the prediction, -1 if decrease (regression).
1037            needed_score_gain (float): The score gain needed to achieve the CF goal.
1038            features_to_vary (list[str]): Feature names of features that the
1039                generated CF can change.
1040            options (dict): Possible options for each variable. Each option is a
1041                list [target, score_gain, distance, bin_index].
1042            max_num_features_to_vary (int, optional): Max number of features that the
1043                generated CF can change. If the value is `None`, the CFs can
1044                change any number of features.
1045            muted_variables (list[str], optional): Variables that this MILP should
1046                not use. This is useful to mute optimal variables so we can explore
1047                diverse solutions. This list should not include interaction variables.
1048
1049        Returns:
1050            A tuple (`model`, `variables`), where `model` is a pulp.LpProblem
1051            model that encodes the MILP problem, and `variables` is a dict of
1052            variables used in the `model`: `feature_name` => [`variables`].
1053        """
1054
1055        # Create a model (minimizing the distance)
1056        model = pulp.LpProblem("ebmCounterfactual", pulp.LpMinimize)
1057
1058        distance = 0
1059        score_gain = 0
1060
1061        muted_variables_set = set(muted_variables)
1062
1063        # Create variables
1064        variables = {}
1065        for f in features_to_vary:
1066            # Each variable encodes an option (0: not use this option,
1067            # 1: use this option)
1068            cur_variables = []
1069
1070            for option in options[f]:
1071                var_name = "{}:{}".format(f, option[3])
1072
1073                # Skip the muted variables
1074                if var_name in muted_variables_set:
1075                    continue
1076
1077                x = pulp.LpVariable(var_name, lowBound=0, upBound=1, cat="Binary")
1078                x.setInitialValue(0)
1079
1080                score_gain += option[1] * x
1081                distance += option[2] * x
1082
1083                cur_variables.append(x)
1084
1085            variables[f] = cur_variables
1086
1087            # A local constraint is that we can select at most one option from
1088            # each feature
1089            model += pulp.lpSum(cur_variables) <= 1
1090
1091        # Users can also set `max_num_features_to_vary` to control the total
1092        # number of features to vary
1093        if max_num_features_to_vary is not None:
1094            main_variables = []
1095            for f in variables:
1096                main_variables.extend(variables[f])
1097
1098            model += pulp.lpSum(main_variables) <= max_num_features_to_vary
1099
1100        # Create variables for interaction effects
1101        for opt_name in options:
1102            if " x " in opt_name:
1103                f1_name = re.sub(r"(.+)\sx\s.+", r"\1", opt_name)
1104                f2_name = re.sub(r".+\sx\s(.+)", r"\1", opt_name)
1105
1106                if f1_name in features_to_vary and f2_name in features_to_vary:
1107
1108                    # We need to include this interaction effect
1109                    cur_variables = []
1110
1111                    for option in options[opt_name]:
1112                        z = pulp.LpVariable(
1113                            "{}:{},{}".format(opt_name, option[3][0], option[3][1]),
1114                            lowBound=0,
1115                            upBound=1,
1116                            cat="Continuous",
1117                        )
1118                        z.setInitialValue(0)
1119
1120                        # Need to iterate through existing variables for f1 and f2 to find
1121                        # the corresponding variables
1122                        x_f1 = None
1123                        x_f2 = None
1124
1125                        # Skip this interaction variable if it involves a muted main variable
1126                        x_f1_name = "{}:{}".format(f1_name, option[3][0])
1127                        x_f2_name = "{}:{}".format(f2_name, option[3][1])
1128
1129                        if (
1130                            x_f1_name in muted_variables_set
1131                            or x_f2_name in muted_variables_set
1132                        ):
1133                            continue
1134
1135                        for x in variables[f1_name]:
1136                            if x.name == x_f1_name:
1137                                x_f1 = x
1138                                break
1139
1140                        for x in variables[f2_name]:
1141                            if x.name == x_f2_name:
1142                                x_f2 = x
1143                                break
1144
1145                        assert x_f1 is not None and x_f2 is not None
1146
1147                        # variable z is actually the product of x_f1 and x_f2
1148                        # We can linearize it by 3 constraints
1149                        model += z <= x_f1
1150                        model += z <= x_f2
1151                        model += z >= x_f1 + x_f2 - 1
1152
1153                        cur_variables.append(z)
1154
1155                    variables[opt_name] = cur_variables
1156
1157        # Use constraint to express counterfactual
1158        if cf_direction == 1:
1159            model += score_gain >= needed_score_gain
1160        else:
1161            model += score_gain <= needed_score_gain
1162
1163        # We want to minimize the distance
1164        model += distance
1165
1166        return model, variables

Create a MILP to find counterfactuals (CF) using PuLP.

Args
  • cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification), +1 if we need to increase the prediction, -1 if decrease (regression).
  • needed_score_gain (float): The score gain needed to achieve the CF goal.
  • features_to_vary (list[str]): Feature names of features that the generated CF can change.
  • options (dict): Possible options for each variable. Each option is a list [target, score_gain, distance, bin_index].
  • max_num_features_to_vary (int, optional): Max number of features that the generated CF can change. If the value is None, the CFs can change any number of features.
  • muted_variables (list[str], optional): Variables that this MILP should not use. This is useful to mute optimal variables so we can explore diverse solutions. This list should not include interaction variables.
Returns

A tuple (model, variables), where model is a pulp.LpProblem model that encodes the MILP problem, and variables is a dict of variables used in the model: feature_name => [variables].
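
For orientation, here is a minimal standalone PuLP sketch (not part of the module, with made-up features, gains, and distances) of the structure this method encodes: binary option variables, an at-most-one-option constraint per feature, a required total score gain, and a minimized distance objective. It assumes PuLP's bundled CBC solver is available.

    import pulp

    # Two hypothetical features, each with two options: (score_gain, distance)
    feature_options = {
        "income": [(0.4, 1.0), (0.9, 2.5)],
        "loan_amount": [(0.3, 0.6), (0.6, 1.8)],
    }
    needed_score_gain = 1.0  # made-up gain required to flip the prediction

    model = pulp.LpProblem("toyCounterfactual", pulp.LpMinimize)
    score_gain, distance, variables = 0, 0, {}

    for name, opts in feature_options.items():
        cur_variables = []
        for i, (gain, dist) in enumerate(opts):
            x = pulp.LpVariable("{}:{}".format(name, i), cat="Binary")
            score_gain += gain * x
            distance += dist * x
            cur_variables.append(x)
        variables[name] = cur_variables
        model += pulp.lpSum(cur_variables) <= 1  # at most one option per feature

    model += score_gain >= needed_score_gain  # counterfactual constraint (cf_direction = 1)
    model += distance  # objective: minimize total distance

    model.solve(pulp.PULP_CBC_CMD(msg=False))
    print([x.name for vs in variables.values() for x in vs if x.varValue == 1])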

def print_solution(self, cur_example, active_variables, options):
1168    def print_solution(self, cur_example, active_variables, options):
1169        """
1170        Print the optimal solution.
1171
1172        Args:
1173            cur_example (np.ndarray): the original data point.
1174            active_variables (list[variable]): binary variables with value 1.
1175            options (dict): all the possible options for all features.
1176        """
1177
1178        for var in active_variables:
1179            # Main effect variables (interaction vars are handled in the else branch)
1180            if "_x_" not in var.name:
1181                f_name = re.sub(r"(.+):\d+", r"\1", var.name)
1182                bin_i = int(re.sub(r".+:(\d+)", r"\1", var.name))
1183
1184                # Find the original value
1185                org_value = cur_example[0][self.ebm.feature_names.index(f_name)]
1186
1187                # Find the target bin
1188                f_index = self.ebm.feature_names.index(f_name)
1189                f_type = self.ebm.feature_types[f_index]
1190
1191                if f_type == "continuous":
1192                    bin_starts = self.ebm.preprocessor_._get_bin_labels(f_index)[:-1]
1193
1194                    target_bin = "[{},".format(bin_starts[bin_i])
1195
1196                    if bin_i + 1 < len(bin_starts):
1197                        target_bin += " {})".format(bin_starts[bin_i + 1])
1198                    else:
1199                        target_bin += " inf)"
1200                else:
1201                    target_bin = ""
1202                    org_value = '"{}"'.format(org_value)
1203
1204                for option in options[f_name]:
1205                    if option[3] == bin_i:
1206                        print(
1207                            "Change <{}> from {} to {} {}".format(
1208                                f_name, org_value, option[0], target_bin
1209                            )
1210                        )
1211                        print(
1212                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1213                                option[1], option[2]
1214                            )
1215                        )
1216                        break
1217
1218            else:
1219                f_name = re.sub(r"(.+):.+", r"\1", var.name)
1220                f_name = f_name.replace("_x_", " x ")
1221                bin_0 = int(re.sub(r".+:(\d+),\d+", r"\1", var.name))
1222                bin_1 = int(re.sub(r".+:\d+,(\d+)", r"\1", var.name))
1223
1224                for option in options[f_name]:
1225                    if option[3][0] == bin_0 and option[3][1] == bin_1:
1226                        print("Trigger interaction term: <{}>".format(f_name))
1227                        print(
1228                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1229                                option[1], 0
1230                            )
1231                        )
1232                        break
1233        print()

Print the optimal solution.

Args
  • cur_example (np.ndarray): the original data point.
  • active_variables (list[variable]): binary variables with value 1.
  • options (dict): all the possible options for all features.
@staticmethod
def compute_mad(xs):
1235    @staticmethod
1236    def compute_mad(xs):
1237        """
1238        Compute the median absolute deviation of a continuous feature.
1239
1240        Args:
1241            xs (np.ndarray): A column of continuous values.
1242
1243        Returns:
1244            float: MAD value of xs.
1245        """
1246        xs_median = np.median(xs.astype(float))
1247        mad = np.median(np.abs(xs.astype(float) - xs_median))
1248        return mad

Compute the median absolute deviation of a continuous feature.

Args
  • xs (np.ndarray): A column of continuous values.
Returns

float: MAD value of xs.
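
A quick usage sketch with made-up income values:

    import numpy as np
    from gamcoach.gamcoach import GAMCoach

    incomes = np.array([30000, 42000, 55000, 61000, 80000])
    print(GAMCoach.compute_mad(incomes))  # 13000.0 = median(|x - median(x)|)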

@staticmethod
def compute_frequency_distance(xs):
1250    @staticmethod
1251    def compute_frequency_distance(xs):
1252        """
1253        For categorical variables, we compute 1 - frequency as their distance. It implies
1254        that switching to a frequent value takes less effort.
1255
1256        Args:
1257            xs (np.ndarray): A column of categorical values.
1258
1259        Returns:
1260            dict: category level -> 1 - frequency.
1261        """
1262        counter = Counter(xs)
1263
1264        results = {}
1265
1266        for key in counter:
1267            results[key] = 1 - (counter[key] / len(xs))
1268
1269        return results

For categorical variables, we compute 1 - frequency as their distance. It implies that switching to a frequent value takes less effort.

Args
  • xs (np.ndarray): A column of categorical values.
Returns

dict: category level -> 1 - frequency.

@staticmethod
def compute_naive_cat_distance(xs):
1271    @staticmethod
1272    def compute_naive_cat_distance(xs):
1273        """
1274        Alternative to compute_frequency_distance() to compute distance for
1275        categorical variables. The distance is 1 for different levels and 0 for
1276        the same level. Here we give them all score 1, because same-level
1277        options will be filtered out when we create categorical options for the
1278        optimization program.
1279
1280        Args:
1281            xs (np.ndarray): A column of categorical values.
1282
1283        Returns:
1284            dict: category level -> 1.
1285        """
1286        counter = Counter(xs)
1287        results = {}
1288
1289        for key in counter:
1290            results[key] = 1
1291
1292        return results

Alternative to compute_frequency_distance() to compute distance for categorical variables. The distance is 1 for different levels and 0 for the same level. Here we give them all score 1, because same-level options will be filtered out when we create categorical options for the optimization program.

Args
  • xs (np.ndarray): A column of categorical values.
Returns

dict: category level -> 1.

def search_sorted_lower_index(sorted_edges, value):
1295def search_sorted_lower_index(sorted_edges, value):
1296    """Binary search to locate the correct bin for continuous features."""
1297    left = 0
1298    right = len(sorted_edges) - 1
1299
1300    while right - left > 1:
1301        i = left + int((right - left) / 2)
1302
1303        if value > sorted_edges[i]:
1304            left = i
1305        elif value < sorted_edges[i]:
1306            right = i
1307        else:
1308            return i
1309
1310    # Handle out of bound issues
1311    if value >= sorted_edges[right]:
1312        return right
1313    if value < sorted_edges[left]:
1314        return left
1315
1316    return right - 1

Binary search to locate the correct bin for continuous features.
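
A few hypothetical calls showing how values map to bin indices (the edges below are made up):

    from gamcoach.gamcoach import search_sorted_lower_index

    bin_starts = [0.0, 10.0, 25.0, 50.0]

    print(search_sorted_lower_index(bin_starts, 12.0))  # 1 -> value falls in bin [10, 25)
    print(search_sorted_lower_index(bin_starts, 50.0))  # 3 -> last bin
    print(search_sorted_lower_index(bin_starts, -5.0))  # 0 -> clamped to the first bin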

def sigmoid(x):
1319def sigmoid(x):
1320    """Sigmoid function."""
1321    return 1 / (1 + np.exp(-x))

Sigmoid function.

def get_model_data( ebm, x_train, model_info, resort_categorical=False, feature_info=None, feature_level_info=None, feature_config=None):
1437def get_model_data(
1438    ebm,
1439    x_train,
1440    model_info,
1441    resort_categorical=False,
1442    feature_info=None,
1443    feature_level_info=None,
1444    feature_config=None,
1445):
1446    """
1447    Get the model data for GAM Coach.
1448    Args:
1449        ebm: Trained EBM model. ExplainableBoostingClassifier or
1450            ExplainableBoostingRegressor object.
1451        x_train: Training data. We use it to compute the median absolute deviation
1452            score for continuous features, and frequency scores for categorical
1453            features.
1454        model_info: Information about the model (class names, regression target
1455            name). For classification, the order of classes matters. It should
1456            be consistent with the class encoding index. For example, the first
1457            element should be the name for class 0.
1458            It has format:
1459            `{'classes': ['loan rejection', 'loan approval']}` or
1460            `{'regressionName': 'interest rate'}`
1461        resort_categorical: Whether to sort the levels in categorical variable
1462        resort_categorical: Whether to sort the levels in a categorical variable
1463        feature_info: You can provide a dictionary to give a separate display
1464            name and optional description for each feature. By default, the
1465            display name is the same as the feature name, and the description
1466            is an empty string. `feature_info` can be partial (only including
1467            some features). It has format:
1468            `{'feature_name': ['display_name', 'description']}`
1469        feature_level_info: You can provide a dictionary to give separate display
1470            name and optional description for each level of categorical features.
1471            By default, the display name is the same as the level name, and the
1472            description is an empty string. `feature_level_info` can be partial
1473            (e.g., only including some levels from some categorical features).
1474            It has format:
1475            `{'feature_name': {level_id: ['display_name', 'description']}}`
1476        feature_config: You can provide a dictionary to configure the difficulty,
1477            integer requirement, and acceptable range of individual features.
1478            The difficulty is an integer between 1 and 6: 1 (very easy to change),
1479            2 (easy), 3 (default), 4 (hard), 5 (very hard), 6 (impossible to change).
1480            By default, difficulty is set to 3 for all features, requiresInt is
1481            False for continuous variables, and acceptableRange is None (search
1482            the full range).
1483            The dictionary property has the following format:
1484            `{'difficulty': 3, 'requiresInt': True, 'acceptableRange': None}`
1485    Returns:
1486        A Python dictionary of model data
1487    """
1488    ROUND = 6
1489
1490    # Main model info on each feature
1491    features = []
1492
1493    # Track the encoding of categorical feature levels
1494    labelEncoder = {}
1495
1496    # Track the score range
1497    score_range = [np.inf, -np.inf]
1498
1499    for i in tqdm(range(len(ebm.feature_names))):
1500        cur_feature = {}
1501        cur_feature["name"] = ebm.feature_names[i]
1502        cur_feature["type"] = ebm.feature_types[i]
1503        cur_feature["importance"] = ebm.feature_importances_[i]
1504
1505        # Handle interaction term differently from cont/cat
1506        if cur_feature["type"] == "interaction":
1507            cur_id = ebm.feature_groups_[i]
1508            cur_feature["id"] = list(cur_id)
1509
1510            # Info for each individual feature
1511            cur_feature["name1"] = ebm.feature_names[cur_id[0]]
1512            cur_feature["name2"] = ebm.feature_names[cur_id[1]]
1513
1514            cur_feature["type1"] = ebm.feature_types[cur_id[0]]
1515            cur_feature["type2"] = ebm.feature_types[cur_id[1]]
1516
1517            # Skip the first item from both dimensions
1518            cur_feature["additive"] = np.round(ebm.additive_terms_[i], ROUND)[
1519                1:, 1:
1520            ].tolist()
1521            cur_feature["error"] = np.round(ebm.term_standard_deviations_[i], ROUND)[
1522                1:, 1:
1523            ].tolist()
1524
1525            # Get the bin label info
1526            cur_feature["binLabel1"] = ebm.pair_preprocessor_._get_bin_labels(cur_id[0])
1527            cur_feature["binLabel2"] = ebm.pair_preprocessor_._get_bin_labels(cur_id[1])
1528
1529            # Encode categorical levels as integers
1530            if cur_feature["type1"] == "categorical":
1531                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[0]]
1532                cur_feature["binLabel1"] = list(
1533                    map(lambda x: level_str_to_int[x], cur_feature["binLabel1"])
1534                )
1535
1536            if cur_feature["type2"] == "categorical":
1537                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[1]]
1538                cur_feature["binLabel2"] = list(
1539                    map(lambda x: level_str_to_int[x], cur_feature["binLabel2"])
1540                )
1541
1542            # Get density info
1543            if cur_feature["type1"] == "categorical":
1544                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[0]]
1545                cur_feature["histEdge1"] = ebm.preprocessor_._get_hist_edges(cur_id[0])
1546                cur_feature["histEdge1"] = list(
1547                    map(lambda x: level_str_to_int[x], cur_feature["histEdge1"])
1548                )
1549                cur_feature["histCount1"] = np.round(
1550                    ebm.preprocessor_._get_hist_counts(cur_id[0]), ROUND
1551                ).tolist()
1552            else:
1553                # Use KDE to draw density plots for cont features
1554                edges, counts = _get_kde_sample(x_train[:, cur_id[0]])
1555                cur_feature["histEdge1"] = edges.tolist()
1556                cur_feature["histCount1"] = counts.tolist()
1557
1558            if cur_feature["type2"] == "categorical":
1559                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[1]]
1560                cur_feature["histEdge2"] = ebm.preprocessor_._get_hist_edges(cur_id[1])
1561                cur_feature["histEdge2"] = list(
1562                    map(lambda x: level_str_to_int[x], cur_feature["histEdge2"])
1563                )
1564                cur_feature["histCount2"] = np.round(
1565                    ebm.preprocessor_._get_hist_counts(cur_id[1]), ROUND
1566                ).tolist()
1567            else:
1568                # Use KDE to draw density plots for cont features
1569                edges, counts = _get_kde_sample(x_train[:, cur_id[1]])
1570                cur_feature["histEdge2"] = edges.tolist()
1571                cur_feature["histCount2"] = counts.tolist()
1572
1573        else:
1574            # Skip the first item (reserved for missing value)
1575            cur_feature["additive"] = np.round(ebm.additive_terms_[i], ROUND).tolist()[
1576                1:
1577            ]
1578            cur_feature["error"] = np.round(
1579                ebm.term_standard_deviations_[i], ROUND
1580            ).tolist()[1:]
1581            cur_feature["id"] = ebm.feature_groups_[i]
1582            cur_id = ebm.feature_groups_[i][0]
1583            cur_feature["count"] = ebm.preprocessor_.col_bin_counts_[cur_id].tolist()[
1584                1:
1585            ]
1586
1587            # Track the global score range
1588            score_range[0] = min(
1589                score_range[0],
1590                np.min(ebm.additive_terms_[i] - ebm.term_standard_deviations_[i]),
1591            )
1592            score_range[1] = max(
1593                score_range[1],
1594                np.max(ebm.additive_terms_[i] + ebm.term_standard_deviations_[i]),
1595            )
1596
1597            # Add the binning information for continuous features
1598            if cur_feature["type"] == "continuous":
1599                # Add the bin information
1600                cur_feature["binEdge"] = ebm.preprocessor_._get_bin_labels(cur_id)
1601
1602                # Use KDE to draw density plots for cont features
1603                edges, counts = _get_kde_sample(x_train[:, cur_id])
1604
1605                cur_feature["histEdge"] = edges.tolist()
1606                cur_feature["histCount"] = counts.tolist()
1607
1608            elif cur_feature["type"] == "categorical":
1609                # Get the level value mapping
1610                level_str_to_int = ebm.preprocessor_.col_mapping_[cur_id]
1611
1612                if resort_categorical:
1613                    level_str_to_int = _resort_categorical_level(level_str_to_int)
1614
1615                cur_feature["binLabel"] = list(
1616                    map(
1617                        lambda x: level_str_to_int[x],
1618                        ebm.preprocessor_._get_bin_labels(cur_id),
1619                    )
1620                )
1621
1622                # Add the hist information
1623                # For categorical data, the edges are strings
1624                cur_feature["histEdge"] = list(
1625                    map(
1626                        lambda x: level_str_to_int[x],
1627                        ebm.preprocessor_._get_hist_edges(cur_id),
1628                    )
1629                )
1630
1631                cur_feature["histCount"] = np.round(
1632                    ebm.preprocessor_._get_hist_counts(cur_id), ROUND
1633                ).tolist()
1634
1635                if resort_categorical:
1636                    cur_bin_info = list(
1637                        zip(
1638                            cur_feature["binLabel"],
1639                            cur_feature["additive"],
1640                            cur_feature["error"],
1641                            cur_feature["count"],
1642                        )
1643                    )
1644                    cur_bin_info = sorted(cur_bin_info, key=lambda x: x[0])
1645
1646                    cur_feature["binLabel"] = [k[0] for k in cur_bin_info]
1647                    cur_feature["additive"] = [k[1] for k in cur_bin_info]
1648                    cur_feature["error"] = [k[2] for k in cur_bin_info]
1649                    cur_feature["count"] = [k[3] for k in cur_bin_info]
1650
1651                    cur_hist_info = list(
1652                        zip(cur_feature["histEdge"], cur_feature["histCount"])
1653                    )
1654                    cur_hist_info = sorted(cur_hist_info, key=lambda x: x[0])
1655
1656                    cur_feature["histEdge"] = [k[0] for k in cur_hist_info]
1657                    cur_feature["histCount"] = [k[1] for k in cur_hist_info]
1658
1659                # Add the label encoding information
1660                labelEncoder[cur_feature["name"]] = {
1661                    i: s for s, i in level_str_to_int.items()
1662                }
1663
1664        features.append(cur_feature)
1665
1666    score_range = list(map(lambda x: round(x, ROUND), score_range))
1667
1668    feature_names = []
1669    feature_types = []
1670
1671    # Sample data does not record interaction features
1672    for i in range(len(ebm.feature_names)):
1673        if ebm.feature_types[i] != "interaction":
1674            feature_names.append(ebm.feature_names[i])
1675            feature_types.append(ebm.feature_types[i])
1676
1677    # Compute the MAD scores and frequencies
1678    ebm_cont_indexes = np.array(
1679        [i for i in range(len(feature_names)) if feature_types[i] == "continuous"]
1680    )
1681
1682    contMads = {}
1683
1684    for i in ebm_cont_indexes:
1685        contMads[ebm.feature_names[i]] = GAMCoach.compute_mad(x_train[:, i])
1686
1687    ebm_cat_indexes = np.array(
1688        [i for i in range(len(feature_names)) if feature_types[i] == "categorical"]
1689    )
1690
1691    catDistances = {}
1692
1693    for i in ebm_cat_indexes:
1694        catDistances[feature_names[i]] = GAMCoach.compute_frequency_distance(
1695            x_train[:, i]
1696        )
1697
1698    # Initialize a feature description dictionary (provide more information about
1699    # each feature in the UI)
1700    feature_descriptions = _init_feature_descriptions(ebm, labelEncoder)
1701
1702    # Overwrite some entries in the default feature_descriptions
1703    if feature_info:
1704        for feature in feature_info:
1705            feature_descriptions[feature]["displayName"] = feature_info[feature][0]
1706            feature_descriptions[feature]["description"] = feature_info[feature][1]
1707
1708    if feature_level_info:
1709        for feature in feature_level_info:
1710            for level in feature_level_info[feature]:
1711                display_name = feature_level_info[feature][level][0]
1712                description = feature_level_info[feature][level][1]
1713                feature_descriptions[feature]["levelDescription"][level][
1714                    "displayName"
1715                ] = display_name
1716                feature_descriptions[feature]["levelDescription"][level][
1717                    "description"
1718                ] = description
1719
1720    # Put descriptions under the 'features' key
1721    for feature in features:
1722        if feature["name"] in feature_descriptions:
1723            feature["description"] = feature_descriptions[feature["name"]]
1724
1725    # Set the feature configurations
1726    feature_configurations = _init_feature_configuration(ebm)
1727
1728    if feature_config:
1729        for feature in feature_config:
1730            cur_config = feature_config[feature]
1731            for k in [
1732                "requiresInt",
1733                "difficulty",
1734                "acceptableRange",
1735                "requiresIncreasing",
1736                "requiresDecreasing",
1737                "usesTransform",
1738            ]:
1739                if k in cur_config:
1740                    feature_configurations[feature][k] = cur_config[k]
1741
1742    # Attach the configuration to the feature field
1743    for feature in features:
1744        if feature["name"] in feature_configurations:
1745            feature["config"] = feature_configurations[feature["name"]]
1746
1747    data = {
1748        "intercept": ebm.intercept_[0] if hasattr(ebm, "classes_") else ebm.intercept_,
1749        "isClassifier": hasattr(ebm, "classes_"),
1750        "modelInfo": model_info,
1751        "features": features,
1752        "labelEncoder": labelEncoder,
1753        "scoreRange": score_range,
1754        "featureNames": feature_names,
1755        "featureTypes": feature_types,
1756        "contMads": contMads,
1757        "catDistances": catDistances,
1758    }
1759
1760    return data

Get the model data for GAM Coach.

Args
  • ebm: Trained EBM model. ExplainableBoostingClassifier or ExplainableBoostingRegressor object.
  • x_train: Training data. We use it to compute the median absolute deviation score for continuous features, and frequency scores for categorical features.
  • model_info: Information about the model (class names, regression target name). For classification, the order of classes matters. It should be consistent with the class encoding index. For example, the first element should be the name for class 0. It has format: {'classes': ['loan rejection', 'loan approval']} or {'regressionName': 'interest rate'}
  • resort_categorical: Whether to sort the levels in a categorical variable by increasing order if all levels can be converted to numbers.
  • feature_info: You can provide a dictionary to give a separate display name and optional description for each feature. By default, the display name is the same as the feature name, and the description is an empty string. feature_info can be partial (only including some features). It has format: {'feature_name': ['display_name', 'description']}
  • feature_level_info: You can provide a dictionary to give separate display name and optional description for each level of categorical features. By default, the display name is the same as the level name, and the description is an empty string. feature_level_info can be partial (e.g., only including some levels from some categorical features). It has format: {'feature_name': {level_id: ['display_name', 'description']}}
  • feature_config: You can provide a dictionary to configure the difficulty, integer requirement, and acceptable range of individual features. The difficulty is an integer between 1 and 6: 1 (very easy to change), 2 (easy), 3 (default), 4 (hard), 5 (very hard), 6 (impossible to change). By default, difficulty is set to 3 for all features, requiresInt is False for continuous variables, and acceptableRange is None (search the full range). The dictionary property has the following format: {'difficulty': 3, 'requiresInt': True, 'acceptableRange': None}
Returns

A Python dictionary of model data
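
A hedged end-to-end sketch: `ebm`, `x_train`, and the feature name below are hypothetical placeholders rather than objects from this documentation, and the output is simply serialized to JSON so it can be loaded by a GAM Coach interface.

    import json
    from gamcoach.gamcoach import get_model_data

    # `ebm` is a trained ExplainableBoostingClassifier and `x_train` its training
    # matrix; both are assumed to exist already (placeholders in this sketch)
    model_info = {"classes": ["loan rejection", "loan approval"]}

    feature_info = {
        # "fico_score" is a hypothetical feature name for illustration
        "fico_score": ["FICO Score", "Credit score reported by the bureau"],
    }
    feature_config = {
        "fico_score": {"difficulty": 4, "requiresInt": True},
    }

    model_data = get_model_data(
        ebm,
        x_train,
        model_info,
        feature_info=feature_info,
        feature_config=feature_config,
    )

    # Serialize the model data for the interface
    with open("model_data.json", "w") as fp:
        json.dump(model_data, fp)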