gamcoach.gamcoach

Main module for GAM Coach.

GAM Coach implements a simple and flexible method to generate counterfactual explanations for generalized additive models (GAMs).

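The listing below is a minimal usage sketch, not part of the module itself. It assumes a trained ExplainableBoostingClassifier and NumPy arrays x_train, y_train, and x_test (illustrative names), and uses only the public GAMCoach API shown in the source that follows.

import numpy as np
from interpret.glassbox import ExplainableBoostingClassifier

from gamcoach.gamcoach import GAMCoach

# Train an EBM with pairwise interactions (x_train and y_train are assumed
# to be your own NumPy arrays).
ebm = ExplainableBoostingClassifier(interactions=10)
ebm.fit(x_train, y_train)

# GAMCoach computes feature distances (MADs and level frequencies)
# from the training data.
coach = GAMCoach(ebm, x_train)

# Generate five counterfactuals for one rejected example, letting only a
# few (illustrative) features change and keeping one of them an integer.
cfs = coach.generate_cfs(
    cur_example=x_test[0],
    total_cfs=5,
    features_to_vary=["loan_amnt", "term", "fico_score"],
    continuous_integer_features=["fico_score"],
)

The returned Counterfactuals object stores each solution together with its distance and the changed features (see generate_cfs below).
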
   1"""Main module for GAM Coach.
   2
   3GAM Coach implements a simple and flexible method to generate counterfactual
   4explanations for generalized additive models (GAMs).
   5"""
   6
   7import numpy as np
   8import pandas as pd
   9import re
  10import pulp
  11from tqdm import tqdm
  12from scipy.stats import gaussian_kde
  13from interpret.glassbox import (
  14    ExplainableBoostingClassifier,
  15    ExplainableBoostingRegressor,
  16)
  17from collections import Counter
  18from typing import Union
  19
  20from .counterfactuals import Counterfactuals
  21
  22SEED = 922
  23
  24
  25class GAMCoach:
  26    """Main class for GAM Coach."""
  27
  28    def __init__(
  29        self,
  30        ebm: Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor],
  31        x_train: np.ndarray,
  32        cont_mads=None,
  33        cat_distances=None,
  34        adjust_cat_distance=True,
  35    ):
  36        """Initialize a GAMCoach object.
  37
  38        Args:
  39            ebm (Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor]):
  40                The trained EBM model. It can be either a classifier or a regressor.
  41            x_train (np.ndarray): The training data. It is used to compute the
  42                distance for different features.
  43            cont_mads (dict, optional): `feature_name` -> `median absolute
  44                deviation score`. If it is provided, it is used to overwrite the
  45                computed MADs for continuous variables. It is useful when you
  46                want to provide a custom normalization function to compute the
  47                distance between continuous features.
  48            cat_distances (dict, optional): `feature_name` -> {`level_name` -> `distance`}.
  49                Level distance of categorical variables. By default, the distance
  50                is computed by (1 - frequency(level)) for each level. It implies
  51                that it is easier to move to a more frequent level. If `cat_distances`
  52                is provided, it will overwrite the default distance for
  53                categorical variables.
  54            adjust_cat_distance (bool, optional): If true, we use (1 -
  55                frequency(level)) for each level. Otherwise, we give distance = 1
  56                for different levels and 0 for the same level.
  57        """
  58
  59        self.ebm: Union[
  60            ExplainableBoostingClassifier, ExplainableBoostingRegressor
  61        ] = ebm
  62        """The trained EBM model."""
  63
  64        self.x_train: np.ndarray = x_train
  65
  66        self.cont_mads: dict = cont_mads
  67        """Median absolute deviation (MAD) of continuous variables."""
  68
  69        self.cat_distances: dict = cat_distances
  70        """Level distance of categorical variables. By default, the distance is
  71        computed by $(1 - \\frac{\\text{count of} L_i}{\\text{count of all L}})$
  72        for one level $L_i$. It implies that it is easier to move to a more
  73        frequent level.
  74        """
  75
  76        self.adjust_cat_distance: bool = adjust_cat_distance
  77
  78        # If cont_mads is not given, we compute it from the training data
  79        if self.cont_mads is None:
  80            ebm_cont_indexes = np.array(
  81                [
  82                    i
  83                    for i in range(len(self.ebm.feature_names))
  84                    if self.ebm.feature_types[i] == "continuous"
  85                ]
  86            )
  87
  88            self.cont_mads = {}
  89
  90            for i in ebm_cont_indexes:
  91                self.cont_mads[ebm.feature_names[i]] = self.compute_mad(
  92                    self.x_train[:, i]
  93                )
  94
  95        # If cat_distances is not given, we compute it from the training data
  96        if self.cat_distances is None:
  97            ebm_cat_indexes = np.array(
  98                [
  99                    i
 100                    for i in range(len(self.ebm.feature_names))
 101                    if self.ebm.feature_types[i] == "categorical"
 102                ]
 103            )
 104
 105            self.cat_distances = {}
 106
 107            if self.adjust_cat_distance:
 108                for i in ebm_cat_indexes:
 109                    self.cat_distances[
 110                        self.ebm.feature_names[i]
 111                    ] = GAMCoach.compute_frequency_distance(self.x_train[:, i])
 112            else:
 113                for i in ebm_cat_indexes:
 114                    self.cat_distances[
 115                        self.ebm.feature_names[i]
 116                    ] = GAMCoach.compute_naive_cat_distance(self.x_train[:, i])
 117
 118        # Determine if the ebm is a classifier or a regressor
 119        self.is_classifier = isinstance(self.ebm.intercept_, np.ndarray)
 120        """True if the ebm model is a classifier, false if it is a regressor."""
 121
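Both cont_mads and cat_distances can be supplied directly to override the values the constructor derives from x_train. A hedged sketch, with entirely illustrative feature names and numbers (ebm and x_train are carried over from the example above):

custom_mads = {"loan_amnt": 5000.0, "annual_inc": 20000.0}
custom_cat_distances = {
    "home_ownership": {"RENT": 0.2, "MORTGAGE": 0.4, "OWN": 0.6},
}

coach = GAMCoach(
    ebm,
    x_train,
    cont_mads=custom_mads,               # overrides the computed MADs
    cat_distances=custom_cat_distances,  # overrides the (1 - frequency) distances
)
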
 122    def generate_cfs(
 123        self,
 124        cur_example: np.ndarray,
 125        total_cfs: int = 1,
 126        target_range: tuple = None,
 127        sim_threshold_factor: float = 0.005,
 128        sim_threshold: float = None,
 129        categorical_weight: Union[float, str] = "auto",
 130        features_to_vary: list = None,
 131        max_num_features_to_vary: int = None,
 132        feature_ranges: dict = None,
 133        continuous_integer_features: list = None,
 134        verbose: int = 1,
 135    ) -> Counterfactuals:
 136        """Generate counterfactual examples.
 137
 138        Use mixed-integer linear programming to generate optimal counterfactual
 139        examples for the given data point.
 140
 141        Args:
 142            cur_example (np.ndarray): The data point of interest. This function
 143                aims to find similar examples for which the model gives
 144                different predictions.
 145            total_cfs (int, optional): The total number of counterfactuals to
 146                generate. Defaults to 1.
 147            target_range (tuple, optional): The targeted prediction range. This
 148                parameter is required if the EBM is a regressor.
 149            sim_threshold_factor (float, optional): A positive float to automatically
 150                generate a similarity threshold. This parameter has no effect if
 151                `sim_threshold` is provided. If `sim_threshold` is
 152                not provided, we compute `sim_threshold` as `sim_threshold_factor`
 153                * average additive score range of all continuous features. If
 154                `sim_threshold_factor` is too small, it takes a longer time to
 155                generate CFs. If `sim_threshold_factor` is too large, the
 156                algorithm might miss some optimal CFs.
 157            sim_threshold (float, optional): A positive float to determine how we
 158                decide if two bins of a continuous feature have similar scores.
 159                Two bins $b_1$ and $b_2$ are similar (the distant one will be
 160                removed) if $|b_1 - b_2| \\leq$ `sim_threshold`.
 161            categorical_weight (Union[float, str], optional): A positive float
 162                to scale the distances of options for categorical variables. Since
 163                we have very different distance functions for continuous and
 164                categorical features, we need to scale them so they are at a
 165                comparable range. To do that, we multiply the categorical feature's
 166                distances by `categorical_weight`. By default ('auto'), we scale
 167                the distances of categorical features so that they have the same
 168                mean distance as continuous features.
 169            features_to_vary ([str], optional): A list of feature names that
 170                the CFs can change. If it is `None`, this function will use all
 171                features.
 172            max_num_features_to_vary (int, optional): The max number of features
 173                that the CF can vary. Default is no maximum.
 174            feature_ranges (dict, optional): A dictionary to control the permitted
 175                ranges/values for continuous/categorical features. It maps
 176                `feature_name` -> [`min_value`, `max_value`] for continuous features,
 177                `feature_name` -> [`level1`, `level2`, ...] for categorical features.
 178            continuous_integer_features (list, optional): A list of names of
 179                continuous features that need to be integers (e.g., age, FICO score).
 180            verbose (int): 0: no output, 1: show a progress bar, 2: show internal
 181                optimization details.
 182
 183        Returns:
 184            Counterfactuals: The generated counterfactual examples with their
 185                associated distances and change information.
 186        """
 187
 188        # Transforming some parameters
 189        if len(cur_example.shape) == 1:
 190            cur_example = cur_example.reshape(1, -1)
 191
 192        if features_to_vary is None:
 193            features_to_vary = [
 194                self.ebm.feature_names[i]
 195                for i in range(len(self.ebm.feature_types))
 196                if self.ebm.feature_types[i] != "interaction"
 197            ]
 198
 199        # Step 1: Find the current score for each feature
 200        # This is done by ebm.explain_local()
 201        cur_scores = {}
 202
 203        if self.is_classifier:
 204            cur_scores["intercept"] = self.ebm.intercept_[0]
 205        else:
 206            cur_scores["intercept"] = self.ebm.intercept_
 207
 208        local_data = self.ebm.explain_local(cur_example)._internal_obj
 209
 210        for i in range(len(self.ebm.feature_names)):
 211            cur_feature_name = self.ebm.feature_names[i]
 212            cur_feature_type = self.ebm.feature_types[i]
 213
 214            cur_scores[cur_feature_name] = local_data["specific"][0]["scores"][i]
 215
 216        # Find the CF direction
 217
 218        # Binary classification
 219        # Predicted 0 => +1
 220        # Predicted 1 => -1
 221        if self.is_classifier:
 222            cf_direction = self.ebm.predict(cur_example)[0] * (-2) + 1
 223            total_score = np.sum([cur_scores[k] for k in cur_scores])
 224            needed_score_gain = -total_score
 225            score_gain_bound = None
 226
 227        else:
 228            # Regression
 229            # Increase +1
 230            # Decrease -1
 231            if target_range is None:
 232                raise ValueError(
 233                    "target_range cannot be None when the model is a regressor"
 234                )
 235
 236            predicted_value = self.ebm.predict(cur_example)[0]
 237            if (
 238                predicted_value >= target_range[0]
 239                and predicted_value <= target_range[1]
 240            ):
 241                raise ValueError("The target_range must not include the current prediction")
 242
 243            elif predicted_value < target_range[0]:
 244                cf_direction = 1
 245                needed_score_gain = target_range[0] - predicted_value
 246                score_gain_bound = target_range[1] - predicted_value
 247            else:
 248                cf_direction = -1
 249                needed_score_gain = target_range[1] - predicted_value
 250                score_gain_bound = target_range[0] - predicted_value
 251
 252        # Step 2: Generate continuous and categorical options
 253        options = {}
 254
 255        # Generate a similarity threshold if it is not provided
 256        if sim_threshold is None:
 257            additive_ranges = []
 258
 259            for i in range(len(self.ebm.feature_names)):
 260                if self.ebm.feature_types[i] == "continuous":
 261                    cur_values = self.ebm.additive_terms_[i]
 262                    additive_ranges.append(np.max(cur_values) - np.min(cur_values))
 263
 264            sim_threshold = np.mean(additive_ranges) * sim_threshold_factor
 265
 266        # To make it faster to solve the MILP problem, we can decrease the
 267        # number of variables by filtering out unhelpful and redundant options
 268        #
 269        # (1) Unhelpful options: options that move the score to an undesirable
 270        # direction. For example, if we want to flip 0 to 1, options that decrease
 271        # the score are unhelpful.
 272        #
 273        # (2) Redundant options: for a set of options that give similar score
 274        # gains (bounded by a parameter epsilon), we only need to include one
 275        # option that has the lowest distance. This is only relevant for
 276        # continuous variables. Users can set the parameter epsilon. The default
 277        # should be relatively small, otherwise we might miss the optimal solution.
 278
 279        # Step 2.1: Find all good options from continuous and categorical features
 280        for cur_feature_id in range(len(self.ebm.feature_names)):
 281
 282            cur_feature_name = self.ebm.feature_names[cur_feature_id]
 283            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 284            cur_feature_index = self.ebm.feature_groups_[cur_feature_id][0]
 285
 286            if cur_feature_type == "interaction":
 287                continue
 288
 289            elif cur_feature_type == "continuous":
 290                # The parameter epsilon controls the threshold of how we determine
 291                # "similar" options for continuous variables
 292                epsilon = sim_threshold
 293
 294                cur_feature_score = cur_scores[cur_feature_name]
 295                cur_feature_value = float(cur_example[0][cur_feature_id])
 296
 297                # Users can require the continuous feature to have integer values
 298                # For example, age, FICO score, and number of accounts
 299                need_to_be_int = False
 300                if (
 301                    continuous_integer_features
 302                    and cur_feature_name in continuous_integer_features
 303                ):
 304                    need_to_be_int = True
 305
 306                cur_cont_options = self.generate_cont_options(
 307                    cf_direction,
 308                    cur_feature_index,
 309                    cur_feature_name,
 310                    cur_feature_value,
 311                    cur_feature_score,
 312                    self.cont_mads,
 313                    cur_example[0],
 314                    score_gain_bound,
 315                    epsilon,
 316                    need_to_be_int,
 317                )
 318
 319                options[cur_feature_name] = cur_cont_options
 320
 321            elif cur_feature_type == "categorical":
 322                cur_feature_score = cur_scores[cur_feature_name]
 323                cur_feature_value = str(cur_example[0][cur_feature_id])
 324                cur_cat_distance = self.cat_distances[cur_feature_name]
 325
 326                cur_cat_options = self.generate_cat_options(
 327                    cf_direction,
 328                    cur_feature_index,
 329                    cur_feature_value,
 330                    cur_feature_score,
 331                    cur_cat_distance,
 332                    cur_example[0],
 333                    score_gain_bound,
 334                )
 335
 336                options[cur_feature_name] = cur_cat_options
 337
 338        # Step 2.2: Filter out undesired options (based on feature_ranges)
 339        if feature_ranges is not None:
 340            for f_name in feature_ranges:
 341                cur_range = feature_ranges[f_name]
 342                f_index = self.ebm.feature_names.index(f_name)
 343                f_type = self.ebm.feature_types[f_index]
 344
 345                if f_type == "continuous":
 346                    # Delete options that target out-of-range values
 347                    for o in range(len(options[f_name]) - 1, -1, -1):
 348                        cur_target = options[f_name][o][0]
 349                        if cur_target < cur_range[0] or cur_target > cur_range[1]:
 350                            options[f_name].pop(o)
 351                elif f_type == "categorical":
 352                    for o in range(len(options[f_name]) - 1, -1, -1):
 353                        if options[f_name][o][0] not in cur_range:
 354                            options[f_name].pop(o)
 355
 356        # Step 2.3: Compute the interaction offsets for all possible options
 357        for cur_feature_id in range(len(self.ebm.feature_names)):
 358
 359            cur_feature_name = self.ebm.feature_names[cur_feature_id]
 360            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 361
 362            if cur_feature_type == "interaction":
 363
 364                cur_feature_index_1 = self.ebm.feature_groups_[cur_feature_id][0]
 365                cur_feature_index_2 = self.ebm.feature_groups_[cur_feature_id][1]
 366
 367                cur_feature_score = cur_scores[cur_feature_name]
 368                options[cur_feature_name] = self.generate_inter_options(
 369                    cur_feature_id,
 370                    cur_feature_index_1,
 371                    cur_feature_index_2,
 372                    cur_feature_score,
 373                    options,
 374                )
 375
 376        # Step 2.4: Rescale categorical distances so that they have the same mean
 377        # as continuous variables (default)
 378        if categorical_weight == "auto":
 379            cont_distances = []
 380            cat_distances = []
 381
 382            for f_name in options:
 383                f_index = self.ebm.feature_names.index(f_name)
 384                f_type = self.ebm.feature_types[f_index]
 385
 386                if f_type == "continuous":
 387                    for option in options[f_name]:
 388                        cont_distances.append(option[2])
 389                elif f_type == "categorical":
 390                    for option in options[f_name]:
 391                        cat_distances.append(option[2])
 392
 393            categorical_weight = np.mean(cont_distances) / np.mean(cat_distances)
 394
 395        for f_name in options:
 396            f_index = self.ebm.feature_names.index(f_name)
 397            f_type = self.ebm.feature_types[f_index]
 398
 399            if f_type == "categorical":
 400                for option in options[f_name]:
 401                    option[2] = option[2] * categorical_weight
 402
 403        # Step 3. Formulate the MILP model and solve it
 404
 405        # Find diverse solutions by iteratively muting variables used in previous optimal solutions
 406        solutions = []
 407        muted_variables = []
 408        is_successful = True
 409
 410        for _ in tqdm(range(total_cfs), disable=verbose == 0):
 411            model, variables = self.create_milp(
 412                cf_direction,
 413                needed_score_gain,
 414                features_to_vary,
 415                options,
 416                max_num_features_to_vary,
 417                muted_variables=muted_variables,
 418            )
 419
 420            model.solve(pulp.apis.PULP_CBC_CMD(msg=verbose > 0, warmStart=True))
 421
 422            if model.status != 1:
 423                is_successful = False
 424
 425            if verbose == 2:
 426                print("solver runs for {:.2f} seconds".format(model.solutionTime))
 427                print("status: {}".format(pulp.LpStatus[model.status]))
 428
 429            active_variables = []
 430
 431            # Print the optimal solution
 432            for key in variables:
 433                for x in variables[key]:
 434                    if x.varValue > 0:
 435                        active_variables.append(x)
 436
 437            if verbose == 2:
 438                print("\nFound solutions:")
 439                self.print_solution(cur_example, active_variables, options)
 440
 441            # Collect the current solution and mute the associated variables
 442            solutions.append([active_variables, pulp.value(model.objective)])
 443
 444            for var in active_variables:
 445                if " x " not in var.name:
 446                    muted_variables.append(var.name)
 447
 448        cfs = Counterfactuals(
 449            solutions, is_successful, model, variables, self.ebm, cur_example, options
 450        )
 451
 452        return cfs
 453
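A hedged sketch of constraining the search (the coach object and feature names are illustrative, carried over from the examples above). For a regressor, target_range must be supplied because there is no class boundary to cross:

cfs = coach.generate_cfs(
    cur_example=x_test[0],
    total_cfs=3,
    max_num_features_to_vary=2,
    feature_ranges={
        "loan_amnt": [1000, 10000],         # continuous: [min_value, max_value]
        "home_ownership": ["OWN", "RENT"],  # categorical: allowed levels
    },
    verbose=0,
)

# With an ExplainableBoostingRegressor, specify the desired prediction range:
# cfs = coach.generate_cfs(x_test[0], total_cfs=3, target_range=(15000, 20000))
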
 454    def generate_cont_options(
 455        self,
 456        cf_direction,
 457        cur_feature_index,
 458        cur_feature_name,
 459        cur_feature_value,
 460        cur_feature_score,
 461        cont_mads,
 462        cur_example,
 463        score_gain_bound=None,
 464        epsilon=0.005,
 465        need_to_be_int=False,
 466        skip_unhelpful=True,
 467    ):
 468        """
 469        Generate all alternative options for this continuous variable. This function
 470        filters out all options that are:
 471
 472        1. Not helpful for the counterfactual generation.
 473        2. Giving a similar score gain but requiring a larger distance.
 474
 475        Args:
 476            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
 477                (classification); `+1` if we need to increase the prediction,
 478                `-1` if decrease (regression).
 479            cur_feature_index (int): The index of the current continuous feature.
 480            cur_feature_name (str): Name of the current feature.
 481            cur_feature_value (float): The current feature value.
 482            cur_feature_score (float): The score for the current feature value.
 483            cont_mads (dict): A map of feature_name => MAD score.
 484            cur_example (list): Current sample values
 485            score_gain_bound (float): Bound of the score gain. We do not collect
 486                options that give `score_gain` > `score_gain_bound` (when
 487                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
 488                `cf_direction=-1`)
 489            epsilon (float): The threshold to determine if two options give similar
 490                score gains. Score gains $s_1$ and $s_2$ are similar if
 491                $|s_1 - s_2| <$ epsilon. Smaller epsilon significantly increases
 492                the time to solve the MILP. Large epsilon might filter out the
 493                optimal CF. Defaults to 0.005.
 494            need_to_be_int (bool): True if the target values for this continuous
 495                variable need to have integer values.
 496            skip_unhelpful (bool): True to skip options from main
 497                effects that give score gain in the opposite direction. It is rare
 498                that a positive score gain from a pair interaction outweighs the
 499                negative score gain from two main effects plus the distance penalty.
 500
 501        Returns:
 502            list: List of options [target, score_gain, distance, bin_index, inter_score_gains]
 503        """
 504
 505        # For each continuous feature, each bin is a variable
 506        # For each bin, we need to compute (1) score gain, (2) distance
 507        # (1) score gain is the difference between new bin and current bin
 508        # (2) distance is L1 distance divided by median absolute deviation (MAD)
 509
 510        # Get the additive scores of this feature
 511        additives = self.ebm.additive_terms_[cur_feature_index][1:]
 512
 513        # Get the bin edges of this feature
 514        bin_starts = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)[:-1]
 515
 516        # Create "options", each option is a tuple (target, score_gain, distance,
 517        # bin_index)
 518        cont_options = []
 519
 520        # Identify which bin this value falls into
 521        cur_bin_id = search_sorted_lower_index(bin_starts, cur_feature_value)
 522        assert additives[cur_bin_id] == cur_feature_score
 523
 524        # Identify interaction terms that we need to consider
 525        associated_interactions = []
 526
 527        for cur_feature_id in range(len(self.ebm.feature_names)):
 528            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 529            if cur_feature_type == "interaction":
 530
 531                indexes = self.ebm.feature_groups_[cur_feature_id]
 532
 533                if cur_feature_index in indexes:
 534                    feature_position = 0 if indexes[0] == cur_feature_index else 1
 535
 536                    other_position = 1 - feature_position
 537                    other_index = indexes[other_position]
 538                    other_type = self.ebm.feature_types[other_index]
 539
 540                    # Get the current additive scores and bin edges
 541                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 542
 543                    # Have to skip the max edge if it is continuous
 544                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
 545                        cur_feature_index
 546                    )[:-1]
 547
 548                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
 549                        other_index
 550                    )
 551                    if other_type == "continuous":
 552                        bin_starts_other = bin_starts_other[:-1]
 553
 554                    # Get the current interaction term score
 555                    other_bin = None
 556                    if other_type == "continuous":
 557                        other_bin = search_sorted_lower_index(
 558                            bin_starts_other, float(cur_example[other_index])
 559                        )
 560                    else:
 561                        other_bin = bin_starts_other.index(cur_example[other_index])
 562
 563                    feature_bin = search_sorted_lower_index(
 564                        bin_starts_feature, cur_feature_value
 565                    )
 566
 567                    feature_inter_score = 0
 568
 569                    if feature_position == 0:
 570                        feature_inter_score = inter_additives[feature_bin, other_bin]
 571                    else:
 572                        feature_inter_score = inter_additives[other_bin, feature_bin]
 573
 574                    # Extract the row or column where we fix the other feature and
 575                    # vary the current feature
 576                    feature_inter_bin_starts = bin_starts_feature
 577                    feature_inter_additives = []
 578
 579                    if feature_position == 0:
 580                        for i in range(len(inter_additives)):
 581                            feature_inter_additives.append(
 582                                inter_additives[i, other_bin]
 583                            )
 584                    else:
 585                        for i in range(len(inter_additives[0])):
 586                            feature_inter_additives.append(
 587                                inter_additives[other_bin, i]
 588                            )
 589
 590                    # Register this interaction term
 591                    associated_interactions.append(
 592                        {
 593                            "inter_index": indexes,
 594                            "cur_interaction_id": cur_feature_id,
 595                            "feature_inter_score": feature_inter_score,
 596                            "feature_inter_bin_starts": feature_inter_bin_starts,
 597                            "feature_inter_additives": feature_inter_additives,
 598                        }
 599                    )
 600
 601        for i in range(len(additives)):
 602            # Because of the special binning structure of EBM, the distance from
 603            # bins on the left to the current value is computed differently from
 604            # that of bins on the right
 605            #
 606            # For bins on the left, the raw distance is abs(bin_start[i + 1] - x)
 607            # For bins on the right, the raw distance is abs(bin_start[i] - x)
 608            target = cur_feature_value
 609            distance = 0
 610
 611            if i < cur_bin_id:
 612                # First consider whether the target needs to be an integer
 613                # If so, it is the closest integer to the right bin edge
 614                if need_to_be_int:
 615                    target = float(int(bin_starts[i + 1]))
 616                    if target == bin_starts[i + 1]:
 617                        target -= 1
 618
 619                    # Skip this option if it is not possible to find an int value
 620                    if target < bin_starts[i]:
 621                        continue
 622
 623                    distance = np.abs(target - cur_feature_value)
 624
 625                else:
 626                    target = bin_starts[i + 1]
 627                    distance = np.abs(target - cur_feature_value)
 628
 629                    # Subtract a very small value to make the target
 630                    # technically fall into the left bin
 631                    target -= 1e-4
 632
 633            elif i > cur_bin_id:
 634                # First consider whether the target needs to be an integer
 635                # If so, it is the closest integer to the left bin edge
 636                if need_to_be_int:
 637                    target = float(np.ceil(bin_starts[i]))
 638                    if target == bin_starts[i]:
 639                        target += 1
 640
 641                    # Skip this option if it is not possible to find an int value
 642                    if i + 1 < len(additives) and target >= bin_starts[i + 1]:
 643                        continue
 644
 645                    distance = np.abs(target - cur_feature_value)
 646
 647                else:
 648                    target = bin_starts[i]
 649                    distance = np.abs(target - cur_feature_value)
 650
 651            # Scale the distance based on the deviation of the feature (how changeable it is)
 652            if cont_mads[cur_feature_name] > 0:
 653                distance /= cont_mads[cur_feature_name]
 654
 655            # Compute score gain which has two parts:
 656            # (1) gain from the change of main effect
 657            # (2) gain from the change of interaction effect
 658
 659            # Main effect
 660            main_score_gain = additives[i] - cur_feature_score
 661
 662            # Interaction terms
 663            # A list to track all interaction score gain offsets
 664            # [[interaction id, interaction score gain]]
 665            inter_score_gain = 0
 666            inter_score_gains = []
 667
 668            for d in associated_interactions:
 669                inter_bin_id = search_sorted_lower_index(
 670                    d["feature_inter_bin_starts"], target
 671                )
 672                inter_score_gain += (
 673                    d["feature_inter_additives"][inter_bin_id]
 674                    - d["feature_inter_score"]
 675                )
 676                inter_score_gains.append(
 677                    [
 678                        d["cur_interaction_id"],
 679                        d["feature_inter_additives"][inter_bin_id]
 680                        - d["feature_inter_score"],
 681                    ]
 682                )
 683
 684            score_gain = main_score_gain + inter_score_gain
 685
 686            if cf_direction * score_gain <= 0 and skip_unhelpful:
 687                continue
 688
 689            # Filter out of bound options
 690            if score_gain_bound and skip_unhelpful:
 691                if cf_direction == 1 and score_gain > score_gain_bound:
 692                    continue
 693                if cf_direction == -1 and score_gain < score_gain_bound:
 694                    continue
 695
 696            cont_options.append([target, score_gain, distance, i, inter_score_gains])
 697
 698        # Now we can apply the second round of filtering to remove redundant options
 699        # Redundant options refer to bins that give similar score gain with larger distance
 700        cont_options = sorted(cont_options, key=lambda x: x[2])
 701
 702        start = 0
 703        while start < len(cont_options):
 704            for i in range(len(cont_options) - 1, start, -1):
 705                if np.abs(cont_options[i][1] - cont_options[start][1]) < epsilon:
 706                    cont_options.pop(i)
 707
 708            start += 1
 709
 710        return cont_options
 711
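For reference, each entry returned by this method is a five-element list; the values below are purely illustrative:

# [target, score_gain, distance, bin_index, inter_score_gains]
example_option = [
    705.0,         # target: the value to move this feature to
    0.35,          # score_gain: main-effect gain plus interaction offsets
    1.20,          # distance: |target - current value| / MAD of this feature
    8,             # bin_index: index of the target bin
    [[12, 0.02]],  # inter_score_gains: [interaction term id, score gain] pairs
]
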
 712    def generate_cat_options(
 713        self,
 714        cf_direction,
 715        cur_feature_index,
 716        cur_feature_value,
 717        cur_feature_score,
 718        cur_cat_distance,
 719        cur_example,
 720        score_gain_bound=None,
 721        skip_unhelpful=True,
 722    ):
 723        """
 724        Generate all alternative options for this categorical variable. This function
 725        filters out all options that are not helpful for the counterfactual
 726        generation.
 727
 728        Args:
 729            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
 730                (classification); `+1` if we need to increase the prediction,
 731                `-1` if decrease (regression).
 732            cur_feature_index (int): The index of the current categorical feature.
 733            cur_feature_value (str): The current feature value.
 734            cur_feature_score (float): The score for the current feature value.
 735            cur_cat_distance (dict): A map of feature_level => 1 - frequency.
 736            cur_example (list): Current sample values.
 737            score_gain_bound (float): Bound of the score gain. We do not collect
 738                options that give `score_gain` > `score_gain_bound` (when
 739                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
 740                `cf_direction=-1`)
 741            skip_unhelpful (bool): True to skip options from main
 742                effects that give score gain in the opposite direction. It is rare
 743                that a positive score gain from a pair interaction outweighs the
 744                negative score gain from two main effects plus the distance penalty.
 745
 746        Returns:
 747            list: List of options [target, score_gain, distance, bin_index, inter_score_gains].
 748        """
 749
 750        # Find other options for this categorical variable
 751        # For each option, we compute the (1) score gain, and (2) distance
 752        #
 753        # (1) Score gain is the same as continuous variables
 754        # (2) The distance is determined by 1 - the level frequency in the
 755        # training data. It implies that levels with high frequency are easier
 756        # to "move to"
 757
 758        # Get the additive scores of this feature
 759        additives = self.ebm.additive_terms_[cur_feature_index][1:]
 760
 761        # Get the levels of this feature
 762        levels = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)
 763
 764        # Create "options", each option is a tuple (target, score_gain, distance, bin_index)
 765        cat_options = []
 766
 767        # Identify interaction terms that we need to consider
 768        associated_interactions = []
 769
 770        for cur_feature_id in range(len(self.ebm.feature_names)):
 771            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 772            if cur_feature_type == "interaction":
 773
 774                indexes = self.ebm.feature_groups_[cur_feature_id]
 775
 776                if cur_feature_index in indexes:
 777                    feature_position = 0 if indexes[0] == cur_feature_index else 1
 778
 779                    other_position = 1 - feature_position
 780                    other_index = indexes[other_position]
 781                    other_type = self.ebm.feature_types[other_index]
 782                    other_name = self.ebm.feature_names[other_index]
 783
 784                    # Get the current additive scores and bin edges
 785                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 786
 787                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
 788                        cur_feature_index
 789                    )
 790                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
 791                        other_index
 792                    )
 793
 794                    # Have to skip the max edge if it is continuous
 795                    if other_type == "continuous":
 796                        bin_starts_other = bin_starts_other[:-1]
 797
 798                    # Get the current interaction term score
 799                    other_bin = None
 800                    if other_type == "continuous":
 801                        other_bin = search_sorted_lower_index(
 802                            bin_starts_other, float(cur_example[other_index])
 803                        )
 804                    else:
 805                        other_bin = bin_starts_other.index(cur_example[other_index])
 806
 807                    feature_bin = bin_starts_feature.index(cur_feature_value)
 808
 809                    feature_inter_score = 0
 810
 811                    if feature_position == 0:
 812                        feature_inter_score = inter_additives[feature_bin, other_bin]
 813                    else:
 814                        feature_inter_score = inter_additives[other_bin, feature_bin]
 815
 816                    # Extract the row or column where we fix the other features and
 817                    # vary the current feature
 818                    feature_inter_bin_starts = bin_starts_feature
 819                    feature_inter_additives = []
 820
 821                    if feature_position == 0:
 822                        for i in range(len(inter_additives)):
 823                            feature_inter_additives.append(
 824                                inter_additives[i, other_bin]
 825                            )
 826                    else:
 827                        for i in range(len(inter_additives[0])):
 828                            feature_inter_additives.append(
 829                                inter_additives[other_bin, i]
 830                            )
 831
 832                    # Register this interaction term
 833                    associated_interactions.append(
 834                        {
 835                            "inter_index": indexes,
 836                            "cur_interaction_id": cur_feature_id,
 837                            "feature_inter_score": feature_inter_score,
 838                            "feature_inter_bin_starts": feature_inter_bin_starts,
 839                            "feature_inter_additives": feature_inter_additives,
 840                        }
 841                    )
 842
 843        for i in range(len(additives)):
 844            if levels[i] != cur_feature_value:
 845                target = levels[i]
 846                distance = cur_cat_distance[target]
 847
 848                # Compute score gain which has two parts:
 849                # (1) gain from the change of main effect
 850                # (2) gain from the change of interaction effect
 851
 852                # Main effect
 853                main_score_gain = additives[i] - cur_feature_score
 854
 855                # Interaction terms
 856                # A list to track all interaction score gain offsets
 857                # [[interaction id, interaction score gain]]
 858                inter_score_gain = 0
 859                inter_score_gains = []
 860
 861                for d in associated_interactions:
 862                    inter_bin_id = d["feature_inter_bin_starts"].index(target)
 863                    inter_score_gain += (
 864                        d["feature_inter_additives"][inter_bin_id]
 865                        - d["feature_inter_score"]
 866                    )
 867                    inter_score_gains.append(
 868                        [
 869                            d["cur_interaction_id"],
 870                            d["feature_inter_additives"][inter_bin_id]
 871                            - d["feature_inter_score"],
 872                        ]
 873                    )
 874
 875                score_gain = main_score_gain + inter_score_gain
 876
 877                # Skip unhelpful options
 878                if cf_direction * score_gain <= 0 and skip_unhelpful:
 879                    continue
 880
 881                # Filter out of bound options
 882                if score_gain_bound and skip_unhelpful:
 883                    if cf_direction == 1 and score_gain > score_gain_bound:
 884                        continue
 885                    if cf_direction == -1 and score_gain < score_gain_bound:
 886                        continue
 887
 888                cat_options.append([target, score_gain, distance, i, inter_score_gains])
 889
 890        return cat_options
 891
 892    def generate_inter_options(
 893        self,
 894        cur_feature_id,
 895        cur_feature_index_1,
 896        cur_feature_index_2,
 897        cur_feature_score,
 898        options,
 899    ):
 900        """
 901        Generate all possible options for this interaction variable.
 902
 903        Interaction terms are interesting in this MILP. Each option counts as a
 904        variable, but each variable only affects the score gain, not the distance.
 905
 906        Note that in EBM, the bin definitions for interaction terms can be different
 907        from their definitions for individual continuous variables.
 908
 909        To model an interaction term, we can treat each of its options as a
 910        binary variable whose value is determined by the product of two main
 911        effect variables. Each interaction variable describes a combination of
 912        two main effect variables. Therefore, if continuous variable A has $x$
 913        possible options and another continuous variable B has $y$ possible
 914        options, we should add $x \\times y$ binary variables to offset their
 915        possible interaction effects.
 916
 917        Args:
 918            cur_feature_id (int): The id of this interaction effect.
 919            cur_feature_index_1 (int): The index of the first main effect.
 920            cur_feature_index_2 (int): The index of the second main effect.
 921            cur_feature_score (float): The score for the current feature value.
 922            options (dict): The current option list, feature_name ->
 923                [`target`, `score_gain`, `distance`, `bin_id`].
 924
 925        Returns:
 926            List of options [[target_1, target_2], score_gain, 0, [bin_1, bin_2], 0]
 927        """
 928
 929        # Get the sub-types for this interaction term
 930        cur_feature_type_1 = self.ebm.feature_types[cur_feature_index_1]
 931        cur_feature_type_2 = self.ebm.feature_types[cur_feature_index_2]
 932
 933        # Get the sub-names for this interaction term
 934        cur_feature_name_1 = self.ebm.feature_names[cur_feature_index_1]
 935        cur_feature_name_2 = self.ebm.feature_names[cur_feature_index_2]
 936
 937        # The first column and row are reserved for missing values (even with
 938        # categorical features)
 939        additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 940
 941        # Four possibilities here: cont x cont, cont x cat, cat x cont, cat x cat.
 942        # Each has a different way to lookup the bin table.
 943        inter_options = []
 944
 945        # Iterate through all possible combinations of options from these two
 946        # variables
 947        for opt_1 in options[cur_feature_name_1]:
 948            for opt_2 in options[cur_feature_name_2]:
 949
 950                bin_starts_1 = self.ebm.pair_preprocessor_._get_bin_labels(
 951                    cur_feature_index_1
 952                )
 953                bin_starts_2 = self.ebm.pair_preprocessor_._get_bin_labels(
 954                    cur_feature_index_2
 955                )
 956
 957                bin_1 = None
 958                bin_2 = None
 959
 960                if cur_feature_type_1 == "continuous":
 961                    if cur_feature_type_2 == "continuous":
 962                        # cont x cont
 963                        bin_starts_1 = bin_starts_1[:-1]
 964                        bin_starts_2 = bin_starts_2[:-1]
 965
 966                        # locate the bin for each option value
 967                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 968                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 969
 970                    else:
 971                        # cont x cat
 972                        bin_starts_1 = bin_starts_1[:-1]
 973
 974                        # locate the bin for each option value
 975                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 976                        bin_2 = bin_starts_2.index(opt_2[0])
 977
 978                else:
 979                    if cur_feature_type_2 == "continuous":
 980                        # cat x cont
 981                        bin_starts_2 = bin_starts_2[:-1]
 982
 983                        # locate the bin for each option value
 984                        bin_1 = bin_starts_1.index(opt_1[0])
 985                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 986
 987                    else:
 988                        # cat x cat
 989
 990                        # locate the bin for each option value
 991                        bin_1 = bin_starts_1.index(opt_1[0])
 992                        bin_2 = bin_starts_2.index(opt_2[0])
 993
 994                new_score = additives[bin_1, bin_2]
 995                score_gain = new_score - cur_feature_score
 996
 997                # The score gain on the interaction term needs to offset the interaction
 998                # score gain we have already counted on the main effect options. That
 999                # score is saved in the option tuple.
1000
1001                # We first need to find the common interaction id
1002                common_index = [-1, -1]
1003                for m in range(len(opt_1[4])):
1004                    for n in range(len(opt_2[4])):
1005                        if opt_1[4][m][0] == opt_2[4][n][0]:
1006                            common_index = [m, n]
1007                            break
1008
1009                    if common_index[0] != -1 and common_index[1] != -1:
1010                        break
1011
1012                score_gain -= opt_1[4][common_index[0]][1]
1013                score_gain -= opt_2[4][common_index[1]][1]
1014
1015                inter_options.append(
1016                    [[opt_1[0], opt_2[0]], score_gain, 0, [opt_1[3], opt_2[3]], 0]
1017                )
1018
1019        return inter_options
1020
1021    @staticmethod
1022    def create_milp(
1023        cf_direction,
1024        needed_score_gain,
1025        features_to_vary,
1026        options,
1027        max_num_features_to_vary=None,
1028        muted_variables=[],
1029    ):
1030        """
1031        Create a MILP to find counterfactuals (CF) using PuLP.
1032
1033        Args:
1034            cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification),
1035                +1 if we need to increase the prediction, -1 if decrease (regression).
1036            needed_score_gain (float): The score gain needed to achieve the CF goal.
1037            features_to_vary (list[str]): Feature names of features that the
1038                generated CF can change.
1039            options (dict): Possible options for each variable. Each option is a
1040                list [target, score_gain, distance, bin_index].
1041            max_num_features_to_vary (int, optional): Max number of features that the
1042                generated CF can change. If the value is `None`, the CFs can
1043                change any number of features.
1044            muted_variables (list[str], optional): Variables that this MILP should
1045                not use. This is useful to mute optimal variables so we can explore
1046                diverse solutions. This list should not include interaction variables.
1047
1048        Returns:
1049            A tuple (`model`, `variables`), where `model` is a pulp.LpProblem
1050            model that encodes the MILP problem, and `variables` is a dict of
1051            variables used in the `model`: `feature_name` => [`variables`].
1052        """
1053
1054        # Create a model (minimizing the distance)
1055        model = pulp.LpProblem("ebmCounterfactual", pulp.LpMinimize)
1056
1057        distance = 0
1058        score_gain = 0
1059
1060        muted_variables_set = set(muted_variables)
1061
1062        # Create variables
1063        variables = {}
1064        for f in features_to_vary:
1065            # Each variable encodes an option (0: not use this option,
1066            # 1: use this option)
1067            cur_variables = []
1068
1069            for option in options[f]:
1070                var_name = "{}:{}".format(f, option[3])
1071
1072                # Skip the muted variables
1073                if var_name in muted_variables_set:
1074                    continue
1075
1076                x = pulp.LpVariable(var_name, lowBound=0, upBound=1, cat="Binary")
1077                x.setInitialValue(0)
1078
1079                score_gain += option[1] * x
1080                distance += option[2] * x
1081
1082                cur_variables.append(x)
1083
1084            variables[f] = cur_variables
1085
1086            # A local constraint: we can select at most one option
1087            # per feature
1088            model += pulp.lpSum(cur_variables) <= 1
1089
1090        # Users can also set `max_num_features_to_vary` to control the total
1091        # number of features to vary
1092        if max_num_features_to_vary is not None:
1093            main_variables = []
1094            for f in variables:
1095                main_variables.extend(variables[f])
1096
1097            model += pulp.lpSum(main_variables) <= max_num_features_to_vary
1098
1099        # Create variables for interaction effects
1100        for opt_name in options:
1101            if " x " in opt_name:
1102                f1_name = re.sub(r"(.+)\sx\s.+", r"\1", opt_name)
1103                f2_name = re.sub(r".+\sx\s(.+)", r"\1", opt_name)
1104
1105                if f1_name in features_to_vary and f2_name in features_to_vary:
1106
1107                    # We need to include this interaction effect
1108                    cur_variables = []
1109
1110                    for option in options[opt_name]:
1111                        z = pulp.LpVariable(
1112                            "{}:{},{}".format(opt_name, option[3][0], option[3][1]),
1113                            lowBound=0,
1114                            upBound=1,
1115                            cat="Continuous",
1116                        )
1117                        z.setInitialValue(0)
1118
1119                        # Need to iterate through existing variables for f1 and f2 to find
1120                        # the corresponding variables
1121                        x_f1 = None
1122                        x_f2 = None
1123
1124                        # Skip if this interaction variable involves a muted main variable
1125                        x_f1_name = "{}:{}".format(f1_name, option[3][0])
1126                        x_f2_name = "{}:{}".format(f2_name, option[3][1])
1127
1128                        if (
1129                            x_f1_name in muted_variables_set
1130                            or x_f2_name in muted_variables_set
1131                        ):
1132                            continue
1133
1134                        for x in variables[f1_name]:
1135                            if x.name == x_f1_name:
1136                                x_f1 = x
1137                                break
1138
1139                        for x in variables[f2_name]:
1140                            if x.name == x_f2_name:
1141                                x_f2 = x
1142                                break
1143
1144                        assert x_f1 is not None and x_f2 is not None
1145
1146                        # variable z is actually the product of x_f1 and x_f2
1147                        # We can linearize it by 3 constraints
1148                        model += z <= x_f1
1149                        model += z <= x_f2
1150                        model += z >= x_f1 + x_f2 - 1
1151
1152                        cur_variables.append(z)
1153
1154                    variables[opt_name] = cur_variables
1155
1156        # Use constraint to express counterfactual
1157        if cf_direction == 1:
1158            model += score_gain >= needed_score_gain
1159        else:
1160            model += score_gain <= needed_score_gain
1161
1162        # We want to minimize the distance
1163        model += distance
1164
1165        return model, variables
1166
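The product constraint used above can be checked in isolation. This is a standalone PuLP sketch (not part of the module) of the three-constraint linearization that forces a continuous variable z to equal the product of two binary variables:

import pulp

model = pulp.LpProblem("linearizationDemo", pulp.LpMaximize)

x_1 = pulp.LpVariable("x_1", cat="Binary")
x_2 = pulp.LpVariable("x_2", cat="Binary")
z = pulp.LpVariable("z", lowBound=0, upBound=1, cat="Continuous")

# The three constraints make z behave like x_1 * x_2:
# z cannot exceed either factor, and it must be at least x_1 + x_2 - 1.
model += z <= x_1
model += z <= x_2
model += z >= x_1 + x_2 - 1

# Maximizing z pushes the solver to set x_1 = x_2 = 1, hence z = 1.
model += z
model.solve(pulp.PULP_CBC_CMD(msg=False))

print(x_1.varValue, x_2.varValue, z.varValue)  # 1.0 1.0 1.0
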
1167    def print_solution(self, cur_example, active_variables, options):
1168        """
1169        Print the optimal solution.
1170
1171        Args:
1172            cur_example (np.ndarray): the original data point.
1173            active_variables (list[variable]): binary variables with value 1.
1174            options (dict): all the possible options for all features.
1175        """
1176
1177        for var in active_variables:
1178            # Main-effect variables (interaction variables are handled below)
1179            if "_x_" not in var.name:
1180                f_name = re.sub(r"(.+):\d+", r"\1", var.name)
1181                bin_i = int(re.sub(r".+:(\d+)", r"\1", var.name))
1182
1183                # Find the original value
1184                org_value = cur_example[0][self.ebm.feature_names.index(f_name)]
1185
1186                # Find the target bin
1187                f_index = self.ebm.feature_names.index(f_name)
1188                f_type = self.ebm.feature_types[f_index]
1189
1190                if f_type == "continuous":
1191                    bin_starts = self.ebm.preprocessor_._get_bin_labels(f_index)[:-1]
1192
1193                    target_bin = "[{},".format(bin_starts[bin_i])
1194
1195                    if bin_i + 1 < len(bin_starts):
1196                        target_bin += " {})".format(bin_starts[bin_i + 1])
1197                    else:
1198                        target_bin += " inf)"
1199                else:
1200                    target_bin = ""
1201                    org_value = '"{}"'.format(org_value)
1202
1203                for option in options[f_name]:
1204                    if option[3] == bin_i:
1205                        print(
1206                            "Change <{}> from {} to {} {}".format(
1207                                f_name, org_value, option[0], target_bin
1208                            )
1209                        )
1210                        print(
1211                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1212                                option[1], option[2]
1213                            )
1214                        )
1215                        break
1216
1217            else:
1218                f_name = re.sub(r"(.+):.+", r"\1", var.name)
1219                f_name = f_name.replace("_x_", " x ")
1220                bin_0 = int(re.sub(r".+:(\d+),\d+", r"\1", var.name))
1221                bin_1 = int(re.sub(r".+:\d+,(\d+)", r"\1", var.name))
1222
1223                for option in options[f_name]:
1224                    if option[3][0] == bin_0 and option[3][1] == bin_1:
1225                        print("Trigger interaction term: <{}>".format(f_name))
1226                        print(
1227                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1228                                option[1], 0
1229                            )
1230                        )
1231                        break
1232        print()
1233
1234    @staticmethod
1235    def compute_mad(xs):
1236        """
1237        Compute the median absolute deviation of a continuous feature.
1238
1239        Args:
1240            xs (np.ndarray): A column of continuous values.
1241
1242        Returns:
1243            float: MAD value of xs.
1244        """
1245        xs_median = np.median(xs.astype(float))
1246        mad = np.median(np.abs(xs.astype(float) - xs_median))
1247        return mad
1248
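    # A minimal sketch of the computation (illustrative values, not from the
    # library): for the column [1, 2, 3, 4, 100], the median is 3, the absolute
    # deviations are [2, 1, 0, 1, 97], and their median (the MAD) is 1:
    #
    #   GAMCoach.compute_mad(np.array([1, 2, 3, 4, 100]))  # -> 1.0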
1249    @staticmethod
1250    def compute_frequency_distance(xs):
1251        """
1252        For categorical variables, we compute 1 - frequency as their distance. It implies
1253        that switching to a frequent value takes less effort.
1254
1255        Args:
1256            xs (np.ndarray): A column of categorical values.
1257
1258        Returns:
1259            dict: category level -> 1 - frequency.
1260        """
1261        counter = Counter(xs)
1262
1263        results = {}
1264
1265        for key in counter:
1266            results[key] = 1 - (counter[key] / len(xs))
1267
1268        return results
1269
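    # A minimal sketch (illustrative values): for the column
    # ['a', 'a', 'a', 'b'], the frequencies are {'a': 0.75, 'b': 0.25}, so the
    # distances are {'a': 0.25, 'b': 0.75}; moving to the frequent level 'a'
    # is cheaper than moving to the rare level 'b':
    #
    #   GAMCoach.compute_frequency_distance(np.array(['a', 'a', 'a', 'b']))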
1270    @staticmethod
1271    def compute_naive_cat_distance(xs):
1272        """
1273        Alternative to compute_frequency_distance() to compute the distance for
1274        categorical variables. The distance is 1 for different levels and 0 for
1275        the same level. Here we give every level distance 1, because same-level
1276        options are filtered out when we create categorical options for the
1277        optimization program.
1278
1279        Args:
1280            xs (np.ndarray): A column of categorical values.
1281
1282        Returns:
1283            dict: category level -> 1.
1284        """
1285        counter = Counter(xs)
1286        results = {}
1287
1288        for key in counter:
1289            results[key] = 1
1290
1291        return results
1292
1293
1294def search_sorted_lower_index(sorted_edges, value):
1295    """Binary search to locate the correct bin for continuous features."""
1296    left = 0
1297    right = len(sorted_edges) - 1
1298
1299    while right - left > 1:
1300        i = left + int((right - left) / 2)
1301
1302        if value > sorted_edges[i]:
1303            left = i
1304        elif value < sorted_edges[i]:
1305            right = i
1306        else:
1307            return i
1308
1309    # Handle out of bound issues
1310    if value >= sorted_edges[right]:
1311        return right
1312    if value < sorted_edges[left]:
1313        return left
1314
1315    return right - 1
1316
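# A minimal sketch (illustrative values): with bin starts [0, 10, 20, 30],
#
#   search_sorted_lower_index([0, 10, 20, 30], 15)  # -> 1, i.e., bin [10, 20)
#   search_sorted_lower_index([0, 10, 20, 30], 35)  # -> 3, the last bin [30, inf)
#
# Values smaller than the first edge are clamped into bin 0.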
1317
1318def sigmoid(x):
1319    """Sigmoid function."""
1320    return 1 / (1 + np.exp(-x))
1321
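# Quick sanity check: sigmoid(0) -> 0.5, and sigmoid(x) + sigmoid(-x) == 1 for any x.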
1322
1323def _resort_categorical_level(col_mapping):
1324    """
1325    Resort the levels in the categorical encoders if all levels can be converted
1326    to numbers (integer or float).
1327
1328    Args:
1329        col_mapping: the dictionary that maps level string to int
1330
1331    Returns:
1332        New col_mapping if all levels can be converted to numbers, otherwise
1333        the original col_mapping
1334    """
1335
1336    def is_number(string):
1337        try:
1338            float(string)
1339            return True
1340        except ValueError:
1341            return False
1342
1343    if all(map(is_number, col_mapping.keys())):
1344
1345        key_tuples = [(k, float(k)) for k in col_mapping.keys()]
1346        sorted_key_tuples = sorted(key_tuples, key=lambda x: x[1])
1347
1348        new_mapping = {}
1349        value = 1
1350
1351        for t in sorted_key_tuples:
1352            new_mapping[t[0]] = value
1353            value += 1
1354
1355        return new_mapping
1356
1357    else:
1358        return col_mapping
1359
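# A minimal sketch (illustrative values): when every level name parses as a
# number, the mapping is re-indexed in increasing numeric order starting from 1,
#
#   _resort_categorical_level({'10': 1, '2': 2, '5': 3})  # -> {'2': 1, '5': 2, '10': 3}
#
# while a mapping with a non-numeric level, e.g. {'low': 1, 'high': 2}, is
# returned unchanged.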
1360
1361def _init_feature_descriptions(ebm, label_encoder):
1362    # Initialize the feature description dictionary
1363    feature_descriptions = {}
1364
1365    for i in range(len(ebm.feature_names)):
1366        cur_name = ebm.feature_names[i]
1367        cur_type = ebm.feature_types[i]
1368
1369        # Use the feature name as the default display name
1370        if cur_type == "continuous":
1371            feature_descriptions[cur_name] = {
1372                "displayName": cur_name,
1373                "description": "",
1374            }
1375
1376        # For categorical features, we can also give display name and description
1377        # for different levels
1378        elif cur_type == "categorical":
1379
1380            level_descriptions = {}
1381
1382            for level in label_encoder[cur_name]:
1383                level_descriptions[level] = {
1384                    "displayName": label_encoder[cur_name][level],
1385                    "description": "",
1386                }
1387
1388            feature_descriptions[cur_name] = {
1389                "displayName": cur_name,
1390                "description": "",
1391                "levelDescription": level_descriptions,
1392            }
1393
1394        else:
1395            continue
1396
1397    return feature_descriptions
1398
1399
1400def _init_feature_configuration(ebm):
1401    # Initialize the feature configuration dictionary
1402    feature_configuration = {}
1403
1404    for i in range(len(ebm.feature_names)):
1405        cur_name = ebm.feature_names[i]
1406        cur_type = ebm.feature_types[i]
1407
1408        # Use the feature name as the default display name
1409        if cur_type == "continuous" or cur_type == "categorical":
1410            feature_configuration[cur_name] = {
1411                "difficulty": 3,
1412                "requiresInt": False,
1413                "requiresIncreasing": False,
1414                "requiresDecreasing": False,
1415                "usesTransform": None,
1416                "acceptableRange": None,
1417            }
1418        else:
1419            continue
1420
1421    return feature_configuration
1422
1423
1424def _get_kde_sample(xs, n_sample=200):
1425    """
1426    Compute kernel density estimation.
1427    """
1428    kernel = gaussian_kde(xs.astype(float))
1429
1430    sample_x = np.linspace(np.min(xs), np.max(xs), n_sample)
1431    sample_y = kernel(sample_x)
1432
1433    return sample_x, sample_y
1434
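# A minimal sketch (illustrative values): sample a 200-point density curve for a
# continuous column, as used for the density plots in the UI,
#
#   xs = np.random.normal(size=1000)
#   sample_x, sample_y = _get_kde_sample(xs)  # both arrays have length 200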
1435
1436def get_model_data(
1437    ebm,
1438    x_train,
1439    model_info,
1440    resort_categorical=False,
1441    feature_info=None,
1442    feature_level_info=None,
1443    feature_config=None,
1444):
1445    """
1446    Get the model data for GAM Coach.
1447    Args:
1448        ebm: Trained EBM model. ExplainableBoostingClassifier or
1449            ExplainableBoostingRegressor object.
1450        x_train: Training data. We use it to compute the median absolute deviation
1451            score for continuous features, and frequency scores for categorical
1452            features.
1453        model_info: Information about the model (class names, regression target
1454            name). For classification, the order of classes matters. It should
1455            be consistent with the class encoding index. For example, the first
1456            element should be the name for class 0.
1457            It has format:
1458            `{'classes': ['loan rejection', 'loan approval']}` or
1459            `{'regressionName': 'interest rate'}`
1460        resort_categorical: Whether to sort the levels in categorical variables
1461            by increasing order if all levels can be converted to numbers.
1462        feature_info: You can provide a dictionary to give a separate display
1463            name and optional description for each feature. By default, the
1464            display name is the same as the feature name, and the description
1465            is an empty string. `feature_info` can be partial (only including
1466            some features). It has format:
1467            `{'feature_name': ['display_name', 'description']}`
1468        feature_level_info: You can provide a dictionary to give separate display
1469            name and optional description for each level of categorical features.
1470            By default, the display name is the same as the level name, and the
1471            description is an empty string. `feature_level_info` can be partial
1472            (e.g., only including some levels from some categorical features).
1473            It has format:
1474            `{'feature_name': {level_id: ['display_name', 'description']}}`
1475        feature_config: You can provide a dictionary to configure the difficulty,
1476            integer requirement, and acceptable range of individual features.
1477            The difficulty is an integer between 1 and 6: 1 (very easy to change),
1478            2 (easy), 3 (default), 4 (hard), 5 (very hard), 6 (impossible to change).
1479            By default, difficulty is set to 3 for all features, requiresInt is
1480            False for continuous variables, and acceptableRange is None (search
1481            the full range).
1482            The dictionary property has the following format:
1483            `{'difficulty': 3, 'requiresInt': True, 'acceptableRange': None}`
1484    Returns:
1485        A Python dictionary of model data
1486    """
1487    ROUND = 6
1488
1489    # Main model info on each feature
1490    features = []
1491
1492    # Track the encoding of categorical feature levels
1493    labelEncoder = {}
1494
1495    # Track the score range
1496    score_range = [np.inf, -np.inf]
1497
1498    for i in tqdm(range(len(ebm.feature_names))):
1499        cur_feature = {}
1500        cur_feature["name"] = ebm.feature_names[i]
1501        cur_feature["type"] = ebm.feature_types[i]
1502        cur_feature["importance"] = ebm.feature_importances_[i]
1503
1504        # Handle interaction term differently from cont/cat
1505        if cur_feature["type"] == "interaction":
1506            cur_id = ebm.feature_groups_[i]
1507            cur_feature["id"] = list(cur_id)
1508
1509            # Info for each individual feature
1510            cur_feature["name1"] = ebm.feature_names[cur_id[0]]
1511            cur_feature["name2"] = ebm.feature_names[cur_id[1]]
1512
1513            cur_feature["type1"] = ebm.feature_types[cur_id[0]]
1514            cur_feature["type2"] = ebm.feature_types[cur_id[1]]
1515
1516            # Skip the first item from both dimensions
1517            cur_feature["additive"] = np.round(ebm.additive_terms_[i], ROUND)[
1518                1:, 1:
1519            ].tolist()
1520            cur_feature["error"] = np.round(ebm.term_standard_deviations_[i], ROUND)[
1521                1:, 1:
1522            ].tolist()
1523
1524            # Get the bin label info
1525            cur_feature["binLabel1"] = ebm.pair_preprocessor_._get_bin_labels(cur_id[0])
1526            cur_feature["binLabel2"] = ebm.pair_preprocessor_._get_bin_labels(cur_id[1])
1527
1528            # Encode categorical levels as integers
1529            if cur_feature["type1"] == "categorical":
1530                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[0]]
1531                cur_feature["binLabel1"] = list(
1532                    map(lambda x: level_str_to_int[x], cur_feature["binLabel1"])
1533                )
1534
1535            if cur_feature["type2"] == "categorical":
1536                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[1]]
1537                cur_feature["binLabel2"] = list(
1538                    map(lambda x: level_str_to_int[x], cur_feature["binLabel2"])
1539                )
1540
1541            # Get density info
1542            if cur_feature["type1"] == "categorical":
1543                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[0]]
1544                cur_feature["histEdge1"] = ebm.preprocessor_._get_hist_edges(cur_id[0])
1545                cur_feature["histEdge1"] = list(
1546                    map(lambda x: level_str_to_int[x], cur_feature["histEdge1"])
1547                )
1548                cur_feature["histCount1"] = np.round(
1549                    ebm.preprocessor_._get_hist_counts(cur_id[0]), ROUND
1550                ).tolist()
1551            else:
1552                # Use KDE to draw density plots for cont features
1553                edges, counts = _get_kde_sample(x_train[:, cur_id[0]])
1554                cur_feature["histEdge1"] = edges.tolist()
1555                cur_feature["histCount1"] = counts.tolist()
1556
1557            if cur_feature["type2"] == "categorical":
1558                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[1]]
1559                cur_feature["histEdge2"] = ebm.preprocessor_._get_hist_edges(cur_id[1])
1560                cur_feature["histEdge2"] = list(
1561                    map(lambda x: level_str_to_int[x], cur_feature["histEdge2"])
1562                )
1563                cur_feature["histCount2"] = np.round(
1564                    ebm.preprocessor_._get_hist_counts(cur_id[1]), ROUND
1565                ).tolist()
1566            else:
1567                # Use KDE to draw density plots for cont features
1568                edges, counts = _get_kde_sample(x_train[:, cur_id[1]])
1569                cur_feature["histEdge2"] = edges.tolist()
1570                cur_feature["histCount2"] = counts.tolist()
1571
1572        else:
1573            # Skip the first item (reserved for missing value)
1574            cur_feature["additive"] = np.round(ebm.additive_terms_[i], ROUND).tolist()[
1575                1:
1576            ]
1577            cur_feature["error"] = np.round(
1578                ebm.term_standard_deviations_[i], ROUND
1579            ).tolist()[1:]
1580            cur_feature["id"] = ebm.feature_groups_[i]
1581            cur_id = ebm.feature_groups_[i][0]
1582            cur_feature["count"] = ebm.preprocessor_.col_bin_counts_[cur_id].tolist()[
1583                1:
1584            ]
1585
1586            # Track the global score range
1587            score_range[0] = min(
1588                score_range[0],
1589                np.min(ebm.additive_terms_[i] - ebm.term_standard_deviations_[i]),
1590            )
1591            score_range[1] = max(
1592                score_range[1],
1593                np.max(ebm.additive_terms_[i] + ebm.term_standard_deviations_[i]),
1594            )
1595
1596            # Add the binning information for continuous features
1597            if cur_feature["type"] == "continuous":
1598                # Add the bin information
1599                cur_feature["binEdge"] = ebm.preprocessor_._get_bin_labels(cur_id)
1600
1601                # Use KDE to draw density plots for cont features
1602                edges, counts = _get_kde_sample(x_train[:, cur_id])
1603
1604                cur_feature["histEdge"] = edges.tolist()
1605                cur_feature["histCount"] = counts.tolist()
1606
1607            elif cur_feature["type"] == "categorical":
1608                # Get the level value mapping
1609                level_str_to_int = ebm.preprocessor_.col_mapping_[cur_id]
1610
1611                if resort_categorical:
1612                    level_str_to_int = _resort_categorical_level(level_str_to_int)
1613
1614                cur_feature["binLabel"] = list(
1615                    map(
1616                        lambda x: level_str_to_int[x],
1617                        ebm.preprocessor_._get_bin_labels(cur_id),
1618                    )
1619                )
1620
1621                # Add the hist information
1622                # For categorical data, the edges are strings
1623                cur_feature["histEdge"] = list(
1624                    map(
1625                        lambda x: level_str_to_int[x],
1626                        ebm.preprocessor_._get_hist_edges(cur_id),
1627                    )
1628                )
1629
1630                cur_feature["histCount"] = np.round(
1631                    ebm.preprocessor_._get_hist_counts(cur_id), ROUND
1632                ).tolist()
1633
1634                if resort_categorical:
1635                    cur_bin_info = list(
1636                        zip(
1637                            cur_feature["binLabel"],
1638                            cur_feature["additive"],
1639                            cur_feature["error"],
1640                            cur_feature["count"],
1641                        )
1642                    )
1643                    cur_bin_info = sorted(cur_bin_info, key=lambda x: x[0])
1644
1645                    cur_feature["binLabel"] = [k[0] for k in cur_bin_info]
1646                    cur_feature["additive"] = [k[1] for k in cur_bin_info]
1647                    cur_feature["error"] = [k[2] for k in cur_bin_info]
1648                    cur_feature["count"] = [k[3] for k in cur_bin_info]
1649
1650                    cur_hist_info = list(
1651                        zip(cur_feature["histEdge"], cur_feature["histCount"])
1652                    )
1653                    cur_hist_info = sorted(cur_hist_info, key=lambda x: x[0])
1654
1655                    cur_feature["histEdge"] = [k[0] for k in cur_hist_info]
1656                    cur_feature["histCount"] = [k[1] for k in cur_hist_info]
1657
1658                # Add the label encoding information
1659                labelEncoder[cur_feature["name"]] = {
1660                    i: s for s, i in level_str_to_int.items()
1661                }
1662
1663        features.append(cur_feature)
1664
1665    score_range = list(map(lambda x: round(x, ROUND), score_range))
1666
1667    feature_names = []
1668    feature_types = []
1669
1670    # Sample data does not record interaction features
1671    for i in range(len(ebm.feature_names)):
1672        if ebm.feature_types[i] != "interaction":
1673            feature_names.append(ebm.feature_names[i])
1674            feature_types.append(ebm.feature_types[i])
1675
1676    # Compute the MAD scores and frequencies
1677    ebm_cont_indexes = np.array(
1678        [i for i in range(len(feature_names)) if feature_types[i] == "continuous"]
1679    )
1680
1681    contMads = {}
1682
1683    for i in ebm_cont_indexes:
1684        contMads[ebm.feature_names[i]] = GAMCoach.compute_mad(x_train[:, i])
1685
1686    ebm_cat_indexes = np.array(
1687        [i for i in range(len(feature_names)) if feature_types[i] == "categorical"]
1688    )
1689
1690    catDistances = {}
1691
1692    for i in ebm_cat_indexes:
1693        catDistances[feature_names[i]] = GAMCoach.compute_frequency_distance(
1694            x_train[:, i]
1695        )
1696
1697    # Initialize a feature description dictionary (provide more information about
1698    # each feature in the UI)
1699    feature_descriptions = _init_feature_descriptions(ebm, labelEncoder)
1700
1701    # Overwrite some entries in the default feature_descriptions
1702    if feature_info:
1703        for feature in feature_info:
1704            feature_descriptions[feature]["displayName"] = feature_info[feature][0]
1705            feature_descriptions[feature]["description"] = feature_info[feature][1]
1706
1707    if feature_level_info:
1708        for feature in feature_level_info:
1709            for level in feature_level_info[feature]:
1710                display_name = feature_level_info[feature][level][0]
1711                description = feature_level_info[feature][level][1]
1712                feature_descriptions[feature]["levelDescription"][level][
1713                    "displayName"
1714                ] = display_name
1715                feature_descriptions[feature]["levelDescription"][level][
1716                    "description"
1717                ] = description
1718
1719    # Put descriptions under the 'features' key
1720    for feature in features:
1721        if feature["name"] in feature_descriptions:
1722            feature["description"] = feature_descriptions[feature["name"]]
1723
1724    # Set the feature configurations
1725    feature_configurations = _init_feature_configuration(ebm)
1726
1727    if feature_config:
1728        for feature in feature_config:
1729            cur_config = feature_config[feature]
1730            for k in [
1731                "requiresInt",
1732                "difficulty",
1733                "acceptableRange",
1734                "requiresIncreasing",
1735                "requiresDecreasing",
1736                "usesTransform",
1737            ]:
1738                if k in cur_config:
1739                    feature_configurations[feature][k] = cur_config[k]
1740
1741    # Attach the configuration to the feature field
1742    for feature in features:
1743        if feature["name"] in feature_configurations:
1744            feature["config"] = feature_configurations[feature["name"]]
1745
1746    data = {
1747        "intercept": ebm.intercept_[0] if hasattr(ebm, "classes_") else ebm.intercept_,
1748        "isClassifier": hasattr(ebm, "classes_"),
1749        "modelInfo": model_info,
1750        "features": features,
1751        "labelEncoder": labelEncoder,
1752        "scoreRange": score_range,
1753        "featureNames": feature_names,
1754        "featureTypes": feature_types,
1755        "contMads": contMads,
1756        "catDistances": catDistances,
1757    }
1758
1759    return data
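A minimal usage sketch for `get_model_data` (the trained `ebm`, the training matrix `x_train`, and the output filename below are placeholders, not part of this module): it bundles the model's bin/score tables and training-data statistics into one plain dictionary that can be serialized for the GAM Coach interface.

    import json

    model_data = get_model_data(
        ebm,      # a trained ExplainableBoostingClassifier
        x_train,  # the matrix used to train it
        {"classes": ["loan rejection", "loan approval"]},
    )

    with open("model_data.json", "w") as fp:  # hypothetical output path
        json.dump(model_data, fp)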
class GAMCoach:
  26class GAMCoach:
  27    """Main class for GAM Coach."""
  28
  29    def __init__(
  30        self,
  31        ebm: Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor],
  32        x_train: np.ndarray,
  33        cont_mads=None,
  34        cat_distances=None,
  35        adjust_cat_distance=True,
  36    ):
  37        """Initialize a GAMCoach object.
  38
  39        Args:
  40            ebm (Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor]):
  41                The trained EBM model. It can be either a classifier or a regressor.
  42            x_train (np.ndarray): The training data. It is used to compute the
  43                distance for different features.
  44            cont_mads (dict, optional): `feature_name` -> `median absolute
  45                deviation score`. If it is provided, it is used to overwrite the
  46                computed MADs for continuous variables. It is useful when you
  47                want to provide a custom normalization function to compute the
  48                distance between continuous features.
  49            cat_distances (dict, optional): `feature_name` -> {`level_name` -> `distance`}.
  50                Level distance of categorical variables. By default, the distance
  51                is computed by (1 - frequency(level)) for each level. It implies
  52                that it is easier to move to a more frequent level. If `cat_distances`
  53                is provided, it will overwrite the default distance for
  54                categorical variables.
  55            adjust_cat_distance (bool, optional): If true, we use (1 -
  56                frequency(level)) for each level. Otherwise, we give distance = 1
  57                for different levels and 0 for the same level.
  58        """
  59
  60        self.ebm: Union[
  61            ExplainableBoostingClassifier, ExplainableBoostingRegressor
  62        ] = ebm
  63        """The trained EBM model."""
  64
  65        self.x_train: np.ndarray = x_train
  66
  67        self.cont_mads: dict = cont_mads
  68        """Median absolute deviation (MAD) of continuous variables."""
  69
  70        self.cat_distances: dict = cat_distances
  71        """Level distance of categorical variables. By default, the distance is
  72        computed by $(1 - \\frac{\\text{count of} L_i}{\\text{count of all L}})$
  73        for one level $L_i$. It implies that it is easier to move to a more
  74        frequent level.
  75        """
  76
  77        self.adjust_cat_distance: bool = adjust_cat_distance
  78
  79        # If cont_mads is not given, we compute it from the training data
  80        if self.cont_mads is None:
  81            ebm_cont_indexes = np.array(
  82                [
  83                    i
  84                    for i in range(len(self.ebm.feature_names))
  85                    if self.ebm.feature_types[i] == "continuous"
  86                ]
  87            )
  88
  89            self.cont_mads = {}
  90
  91            for i in ebm_cont_indexes:
  92                self.cont_mads[ebm.feature_names[i]] = self.compute_mad(
  93                    self.x_train[:, i]
  94                )
  95
  96        # If cat_distance is not given, we compute it from the training data
  97        if self.cat_distances is None:
  98            ebm_cat_indexes = np.array(
  99                [
 100                    i
 101                    for i in range(len(self.ebm.feature_names))
 102                    if self.ebm.feature_types[i] == "categorical"
 103                ]
 104            )
 105
 106            self.cat_distances = {}
 107
 108            if self.adjust_cat_distance:
 109                for i in ebm_cat_indexes:
 110                    self.cat_distances[
 111                        self.ebm.feature_names[i]
 112                    ] = GAMCoach.compute_frequency_distance(self.x_train[:, i])
 113            else:
 114                for i in ebm_cat_indexes:
 115                    self.cat_distances[
 116                        self.ebm.feature_names[i]
 117                    ] = GAMCoach.compute_naive_cat_distance(self.x_train[:, i])
 118
 119        # Determine if the ebm is a classifier or a regressor
 120        self.is_classifier = isinstance(self.ebm.intercept_, np.ndarray)
 121        """True if the ebm model is a classifier, false if it is a regressor."""
 122
 123    def generate_cfs(
 124        self,
 125        cur_example: np.ndarray,
 126        total_cfs: int = 1,
 127        target_range: tuple = None,
 128        sim_threshold_factor: float = 0.005,
 129        sim_threshold: float = None,
 130        categorical_weight: Union[float, str] = "auto",
 131        features_to_vary: list = None,
 132        max_num_features_to_vary: int = None,
 133        feature_ranges: dict = None,
 134        continuous_integer_features: list = None,
 135        verbose: int = 1,
 136    ) -> Counterfactuals:
 137        """Generate counterfactual examples.
 138
 139        Use mixed-integer linear programming to generate optimal counterfactual
 140        examples for the given data point.
 141
 142        Args:
  143            cur_example (np.ndarray): The data point of interest. This function
  144                aims to find similar examples for which the model gives a
  145                different prediction.
  146            total_cfs (int, optional): The total number of counterfactuals to
  147                generate. Defaults to 1.
  148            target_range (tuple, optional): The targeted prediction range. This
 149                parameter is required if the EBM is a regressor.
 150            sim_threshold_factor (float, optional): A positive float to automatically
 151                generate a similarity threshold. This parameter has no effect if
 152                `sim_threshold` is provided. If `sim_threshold` is
 153                not provided, we compute `sim_threshold` as `sim_threshold_factor`
 154                * average additive score range of all continuous features. If
  155                `sim_threshold_factor` is too small, it takes longer to
 156                generate CFs. If `sim_threshold_factor` is too large, the
 157                algorithm might miss some optimal CFs.
 158            sim_threshold (float, optional): A positive float to determine how we
 159                decide if two bins of a continuous feature have similar scores.
 160                Two bins $b_1$ and $b_2$ are similar (the distant one will be
 161                removed) if $|b_1 - b_2| \\leq$ `sim_threshold`.
 162            categorical_weight (Union[float, str], optional): A positive float
 163                to scale the distances of options for categorical variables. Since
 164                we have very different distance functions for continuous and
 165                categorical features, we need to scale them so they are at a
 166                comparable range. To do that, we multiply the categorical feature's
 167                distances by `categorical_weight`. By default ('auto'), we scale
  168                the distances of categorical features so that they have the same
  169                mean distance as continuous features.
 170            features_to_vary ([str], optional): A list of feature names that
 171                the CFs can change. If it is `None`, this function will use all
 172                features.
 173            max_num_features_to_vary (int, optional): The max number of features
 174                that the CF can vary. Default is no maximum.
 175            feature_ranges (dict, optional): A dictionary to control the permitted
 176                ranges/values for continuous/categorical features. It maps
 177                `feature_name` -> [`min_value`, `max_value`] for continuous features,
 178                `feature_name` -> [`level1`, `level2`, ...] for categorical features.
 179            continuous_integer_features (list, optional): A list of names of
 180                continuous features that need to be integers (e.g., age, FICO score)
  181            verbose (int): 0: no output, 1: show progress bar, 2: show internal
 182                optimization details
 183
 184        Returns:
 185            Counterfactuals: The generated counterfactual examples with their
 186                associated distances and change information.
 187        """
 188
 189        # Transforming some parameters
 190        if len(cur_example.shape) == 1:
 191            cur_example = cur_example.reshape(1, -1)
 192
 193        if features_to_vary is None:
 194            features_to_vary = [
 195                self.ebm.feature_names[i]
 196                for i in range(len(self.ebm.feature_types))
 197                if self.ebm.feature_types[i] != "interaction"
 198            ]
 199
 200        # Step 1: Find the current score for each feature
 201        # This is done by ebm.explain_local()
 202        cur_scores = {}
 203
 204        if self.is_classifier:
 205            cur_scores["intercept"] = self.ebm.intercept_[0]
 206        else:
 207            cur_scores["intercept"] = self.ebm.intercept_
 208
 209        local_data = self.ebm.explain_local(cur_example)._internal_obj
 210
 211        for i in range(len(self.ebm.feature_names)):
 212            cur_feature_name = self.ebm.feature_names[i]
 213            cur_feature_type = self.ebm.feature_types[i]
 214
 215            cur_scores[cur_feature_name] = local_data["specific"][0]["scores"][i]
 216
 217        # Find the CF direction
 218
 219        # Binary classification
 220        # Predicted 0 => +1
 221        # Predicted 1 => -1
 222        if self.is_classifier:
 223            cf_direction = self.ebm.predict(cur_example)[0] * (-2) + 1
 224            total_score = np.sum([cur_scores[k] for k in cur_scores])
 225            needed_score_gain = -total_score
 226            score_gain_bound = None
 227
 228        else:
 229            # Regression
 230            # Increase +1
 231            # Decrease -1
 232            if target_range is None:
 233                raise ValueError(
 234                    "target_range cannot be None when the model is a regressor"
 235                )
 236
 237            predicted_value = self.ebm.predict(cur_example)[0]
 238            if (
 239                predicted_value >= target_range[0]
 240                and predicted_value <= target_range[1]
 241            ):
 242                raise ValueError("The target_range cannot cover the current prediction")
 243
 244            elif predicted_value < target_range[0]:
 245                cf_direction = 1
 246                needed_score_gain = target_range[0] - predicted_value
 247                score_gain_bound = target_range[1] - predicted_value
 248            else:
 249                cf_direction = -1
 250                needed_score_gain = target_range[1] - predicted_value
 251                score_gain_bound = target_range[0] - predicted_value
 252
 253        # Step 2: Generate continuous and categorical options
 254        options = {}
 255
 256        # Generate a similarity threshold if it is not provided
 257        if sim_threshold is None:
 258            additive_ranges = []
 259
 260            for i in range(len(self.ebm.feature_names)):
 261                if self.ebm.feature_types[i] == "continuous":
 262                    cur_values = self.ebm.additive_terms_[i]
 263                    additive_ranges.append(np.max(cur_values) - np.min(cur_values))
 264
 265            sim_threshold = np.mean(additive_ranges) * sim_threshold_factor
 266
 267        # To make it faster to solve the MILP problem, we can decrease the
 268        # number of variables by filtering out unhelpful and redundant options
 269        #
 270        # (1) Unhelpful options: options that move the score to an undesirable
 271        # direction. For example, if we want to flip 0 to 1, options that decrease
 272        # the score are unhelpful.
 273        #
 274        # (2) Redundant options: for a set of options that give similar score
  275        # gains (bounded by a parameter epsilon), we only need to include one
 276        # option that has the lowest distance. This is only relevant for
 277        # continuous variables. Users can set the parameter epsilon. The default
 278        # should be relatively small, otherwise we might miss the optimal solution.
 279
 280        # Step 2.1: Find all good options from continuous and categorical features
 281        for cur_feature_id in range(len(self.ebm.feature_names)):
 282
 283            cur_feature_name = self.ebm.feature_names[cur_feature_id]
 284            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 285            cur_feature_index = self.ebm.feature_groups_[cur_feature_id][0]
 286
 287            if cur_feature_type == "interaction":
 288                continue
 289
 290            elif cur_feature_type == "continuous":
 291                # The parameter epsilon controls the threshold of how we determine
 292                # "similar" options for continuous variables
 293                epsilon = sim_threshold
 294
 295                cur_feature_score = cur_scores[cur_feature_name]
 296                cur_feature_value = float(cur_example[0][cur_feature_id])
 297
 298                # Users can require the continuous feature to have integer values
 299                # For example, age, FICO score, and number of accounts
 300                need_to_be_int = False
 301                if (
 302                    continuous_integer_features
 303                    and cur_feature_name in continuous_integer_features
 304                ):
 305                    need_to_be_int = True
 306
 307                cur_cont_options = self.generate_cont_options(
 308                    cf_direction,
 309                    cur_feature_index,
 310                    cur_feature_name,
 311                    cur_feature_value,
 312                    cur_feature_score,
 313                    self.cont_mads,
 314                    cur_example[0],
 315                    score_gain_bound,
 316                    epsilon,
 317                    need_to_be_int,
 318                )
 319
 320                options[cur_feature_name] = cur_cont_options
 321
 322            elif cur_feature_type == "categorical":
 323                cur_feature_score = cur_scores[cur_feature_name]
 324                cur_feature_value = str(cur_example[0][cur_feature_id])
 325                cur_cat_distance = self.cat_distances[cur_feature_name]
 326
 327                cur_cat_options = self.generate_cat_options(
 328                    cf_direction,
 329                    cur_feature_index,
 330                    cur_feature_value,
 331                    cur_feature_score,
 332                    cur_cat_distance,
 333                    cur_example[0],
 334                    score_gain_bound,
 335                )
 336
 337                options[cur_feature_name] = cur_cat_options
 338
 339        # Step 2.2: Filter out undesired options (based on the feature_range)
 340        if feature_ranges is not None:
 341            for f_name in feature_ranges:
 342                cur_range = feature_ranges[f_name]
 343                f_index = self.ebm.feature_names.index(f_name)
 344                f_type = self.ebm.feature_types[f_index]
 345
 346                if f_type == "continuous":
 347                    # Delete options that use out-of-range options
 348                    for o in range(len(options[f_name]) - 1, -1, -1):
 349                        cur_target = options[f_name][o][0]
 350                        if cur_target < cur_range[0] or cur_target > cur_range[1]:
 351                            options[f_name].pop(o)
 352                elif f_type == "categorical":
 353                    for o in range(len(options[f_name]) - 1, -1, -1):
 354                        if options[f_name][o][0] not in cur_range:
 355                            options[f_name].pop(o)
 356
 357        # Step 2.3: Compute the interaction offsets for all possible options
 358        for cur_feature_id in range(len(self.ebm.feature_names)):
 359
 360            cur_feature_name = self.ebm.feature_names[cur_feature_id]
 361            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 362
 363            if cur_feature_type == "interaction":
 364
 365                cur_feature_index_1 = self.ebm.feature_groups_[cur_feature_id][0]
 366                cur_feature_index_2 = self.ebm.feature_groups_[cur_feature_id][1]
 367
 368                cur_feature_score = cur_scores[cur_feature_name]
 369                options[cur_feature_name] = self.generate_inter_options(
 370                    cur_feature_id,
 371                    cur_feature_index_1,
 372                    cur_feature_index_2,
 373                    cur_feature_score,
 374                    options,
 375                )
 376
 377        # Step 2.4: Rescale categorical distances so that they have the same mean
 378        # as continuous variables (default)
 379        if categorical_weight == "auto":
 380            cont_distances = []
 381            cat_distances = []
 382
 383            for f_name in options:
 384                f_index = self.ebm.feature_names.index(f_name)
 385                f_type = self.ebm.feature_types[f_index]
 386
 387                if f_type == "continuous":
 388                    for option in options[f_name]:
 389                        cont_distances.append(option[2])
 390                elif f_type == "categorical":
 391                    for option in options[f_name]:
 392                        cat_distances.append(option[2])
 393
 394            categorical_weight = np.mean(cont_distances) / np.mean(cat_distances)
 395
 396        for f_name in options:
 397            f_index = self.ebm.feature_names.index(f_name)
 398            f_type = self.ebm.feature_types[f_index]
 399
 400            if f_type == "categorical":
 401                for option in options[f_name]:
 402                    option[2] = option[2] * categorical_weight
 403
 404        # Step 3. Formulate the MILP model and solve it
 405
  406        # Find diverse solutions by cumulatively muting variables used in previous optimal solutions
 407        solutions = []
 408        muted_variables = []
 409        is_successful = True
 410
 411        for _ in tqdm(range(total_cfs), disable=verbose == 0):
 412            model, variables = self.create_milp(
 413                cf_direction,
 414                needed_score_gain,
 415                features_to_vary,
 416                options,
 417                max_num_features_to_vary,
 418                muted_variables=muted_variables,
 419            )
 420
 421            model.solve(pulp.apis.PULP_CBC_CMD(msg=verbose > 0, warmStart=True))
 422
 423            if model.status != 1:
 424                is_successful = False
 425
 426            if verbose == 2:
 427                print("solver runs for {:.2f} seconds".format(model.solutionTime))
 428                print("status: {}".format(pulp.LpStatus[model.status]))
 429
 430            active_variables = []
 431
 432            # Print the optimal solution
 433            for key in variables:
 434                for x in variables[key]:
 435                    if x.varValue > 0:
 436                        active_variables.append(x)
 437
 438            if verbose == 2:
 439                print("\nFound solutions:")
 440                self.print_solution(cur_example, active_variables, options)
 441
 442            # Collect the current solution and mute the associated variables
 443            solutions.append([active_variables, pulp.value(model.objective)])
 444
 445            for var in active_variables:
 446                if " x " not in var.name:
 447                    muted_variables.append(var.name)
 448
 449        cfs = Counterfactuals(
 450            solutions, is_successful, model, variables, self.ebm, cur_example, options
 451        )
 452
 453        return cfs
 454
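    # A minimal usage sketch (the feature names and ranges below are
    # hypothetical, not from the library):
    #
    #   coach = GAMCoach(ebm, x_train)
    #   cfs = coach.generate_cfs(
    #       cur_example=x_test[0],
    #       total_cfs=3,
    #       features_to_vary=["loan_amnt", "term"],       # hypothetical features
    #       feature_ranges={"loan_amnt": [1000, 20000]},  # hypothetical range
    #   )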
 455    def generate_cont_options(
 456        self,
 457        cf_direction,
 458        cur_feature_index,
 459        cur_feature_name,
 460        cur_feature_value,
 461        cur_feature_score,
 462        cont_mads,
 463        cur_example,
 464        score_gain_bound=None,
 465        epsilon=0.005,
 466        need_to_be_int=False,
 467        skip_unhelpful=True,
 468    ):
 469        """
 470        Generate all alternative options for this continuous variable. This function
 471        would filter out all options that are:
 472
 473        1. Not helpful for the counterfactual generation.
  474        2. Giving a similar score gain but requiring a larger distance.
 475
 476        Args:
 477            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
 478                (classification); `+1` if we need to increase the prediction,
 479                `-1` if decrease (regression).
 480            cur_feature_index (int): The index of the current continuous feature.
 481            cur_feature_name (str): Name of the current feature.
 482            cur_feature_value (float): The current feature value.
 483            cur_feature_score (float): The score for the current feature value.
 484            cont_mads (dict): A map of feature_name => MAD score.
 485            cur_example (list): Current sample values
 486            score_gain_bound (float): Bound of the score gain. We do not collect
 487                options that give `score_gain` > `score_gain_bound` (when
 488                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
 489                `cf_direction=-1`)
 490            epsilon (float): The threshold to determine if two options give similar
 491                score gains. Score gains $s_1$ and $s_2$ are similar if
 492                $|s_1 - s_2| <$ epsilon. Smaller epsilon significantly increases
 493                the time to solve the MILP. Large epsilon might filter out the
 494                optimal CF. Defaults to 0.005.
 495            need_to_be_int (bool): True if the target values for this continuous
 496                variable need to have integer values.
  497            skip_unhelpful (bool): True if options from main effects that give
  498                an opposite score gain should be skipped. It is rare for a positive
  499                score gain from a pair-interaction to outweigh the negative score
  500                gain from two main effects plus the distance penalty.
 501
 502        Returns:
  503            list: List of options [target, score_gain, distance, bin_index, inter_score_gains]
 504        """
 505
 506        # For each continuous feature, each bin is a variable
 507        # For each bin, we need to compute (1) score gain, (2) distance
 508        # (1) score gain is the difference between new bin and current bin
 509        # (2) distance is L1 distance divided by median absolute deviation (MAD)
 510
 511        # Get the additive scores of this feature
 512        additives = self.ebm.additive_terms_[cur_feature_index][1:]
 513
 514        # Get the bin edges of this feature
 515        bin_starts = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)[:-1]
 516
 517        # Create "options", each option is a tuple (target, score_gain, distance,
 518        # bin_index)
 519        cont_options = []
 520
 521        # Identify which bin this value falls into
 522        cur_bin_id = search_sorted_lower_index(bin_starts, cur_feature_value)
 523        assert additives[cur_bin_id] == cur_feature_score
 524
 525        # Identify interaction terms that we need to consider
 526        associated_interactions = []
 527
 528        for cur_feature_id in range(len(self.ebm.feature_names)):
 529            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 530            if cur_feature_type == "interaction":
 531
 532                indexes = self.ebm.feature_groups_[cur_feature_id]
 533
 534                if cur_feature_index in indexes:
 535                    feature_position = 0 if indexes[0] == cur_feature_index else 1
 536
 537                    other_position = 1 - feature_position
 538                    other_index = indexes[other_position]
 539                    other_type = self.ebm.feature_types[other_index]
 540
 541                    # Get the current additive scores and bin edges
 542                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 543
 544                    # Have to skip the max edge if it is continuous
 545                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
 546                        cur_feature_index
 547                    )[:-1]
 548
 549                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
 550                        other_index
 551                    )
 552                    if other_type == "continuous":
 553                        bin_starts_other = bin_starts_other[:-1]
 554
 555                    # Get the current interaction term score
 556                    other_bin = None
 557                    if other_type == "continuous":
 558                        other_bin = search_sorted_lower_index(
 559                            bin_starts_other, float(cur_example[other_index])
 560                        )
 561                    else:
 562                        other_bin = bin_starts_other.index(cur_example[other_index])
 563
 564                    feature_bin = search_sorted_lower_index(
 565                        bin_starts_feature, cur_feature_value
 566                    )
 567
 568                    feature_inter_score = 0
 569
 570                    if feature_position == 0:
 571                        feature_inter_score = inter_additives[feature_bin, other_bin]
 572                    else:
 573                        feature_inter_score = inter_additives[other_bin, feature_bin]
 574
 575                    # Extract the row or column where we fix the other feature and
 576                    # vary the current feature
 577                    feature_inter_bin_starts = bin_starts_feature
 578                    feature_inter_additives = []
 579
 580                    if feature_position == 0:
 581                        for i in range(len(inter_additives)):
 582                            feature_inter_additives.append(
 583                                inter_additives[i, other_bin]
 584                            )
 585                    else:
 586                        for i in range(len(inter_additives[0])):
 587                            feature_inter_additives.append(
 588                                inter_additives[other_bin, i]
 589                            )
 590
 591                    # Register this interaction term
 592                    associated_interactions.append(
 593                        {
 594                            "inter_index": indexes,
 595                            "cur_interaction_id": cur_feature_id,
 596                            "feature_inter_score": feature_inter_score,
 597                            "feature_inter_bin_starts": feature_inter_bin_starts,
 598                            "feature_inter_additives": feature_inter_additives,
 599                        }
 600                    )
 601
 602        for i in range(len(additives)):
 603            # Because of the special binning structure of EBM, the distance of
 604            # bins on the left to the current value is different from the bins
 605            # that are on the right
 606            #
 607            # For bins on the left, the raw distance is abs(bin_start[i + 1] - x)
 608            # For bins on the right, the raw distance is abs(bin_start[i] - x)
 609            target = cur_feature_value
 610            distance = 0
 611
 612            if i < cur_bin_id:
  613                # First we need to consider whether it needs to be an integer
 614                # If so, it would be the closest integer to the right point
 615                if need_to_be_int:
 616                    target = float(int(bin_starts[i + 1]))
 617                    if target == bin_starts[i + 1]:
 618                        target -= 1
 619
 620                    # Skip this option if it is not possible to find an int value
 621                    if target < bin_starts[i]:
 622                        continue
 623
 624                    distance = np.abs(target - cur_feature_value)
 625
 626                else:
 627                    target = bin_starts[i + 1]
 628                    distance = np.abs(target - cur_feature_value)
 629
  630                    # Subtract a very small value so that the target
  631                    # technically falls into the left bin
 632                    target -= 1e-4
 633
 634            elif i > cur_bin_id:
 635                # First need to consider if it should be an integer value
 636                # If so, it would be the closest integer to the left point
 637                if need_to_be_int:
 638                    target = float(np.ceil(bin_starts[i]))
 639                    if target == bin_starts[i]:
 640                        target += 1
 641
 642                    # Skip this option if it is not possible to find an int value
 643                    if i + 1 < len(additives) and target >= bin_starts[i + 1]:
 644                        continue
 645
 646                    distance = np.abs(target - cur_feature_value)
 647
 648                else:
 649                    target = bin_starts[i]
 650                    distance = np.abs(target - cur_feature_value)
 651
 652            # Scale the distance based on the deviation of the feature (how changeable it is)
 653            if cont_mads[cur_feature_name] > 0:
 654                distance /= cont_mads[cur_feature_name]
 655
 656            # Compute score gain which has two parts:
 657            # (1) gain from the change of main effect
 658            # (2) gain from the change of interaction effect
 659
 660            # Main effect
 661            main_score_gain = additives[i] - cur_feature_score
 662
 663            # Interaction terms
 664            # A list to track all interaction score gain offsets
 665            # [[interaction id, interaction score gain]]
 666            inter_score_gain = 0
 667            inter_score_gains = []
 668
 669            for d in associated_interactions:
 670                inter_bin_id = search_sorted_lower_index(
 671                    d["feature_inter_bin_starts"], target
 672                )
 673                inter_score_gain += (
 674                    d["feature_inter_additives"][inter_bin_id]
 675                    - d["feature_inter_score"]
 676                )
 677                inter_score_gains.append(
 678                    [
 679                        d["cur_interaction_id"],
 680                        d["feature_inter_additives"][inter_bin_id]
 681                        - d["feature_inter_score"],
 682                    ]
 683                )
 684
 685            score_gain = main_score_gain + inter_score_gain
 686
 687            if cf_direction * score_gain <= 0 and skip_unhelpful:
 688                continue
 689
 690            # Filter out of bound options
 691            if score_gain_bound and skip_unhelpful:
 692                if cf_direction == 1 and score_gain > score_gain_bound:
 693                    continue
 694                if cf_direction == -1 and score_gain < score_gain_bound:
 695                    continue
 696
 697            cont_options.append([target, score_gain, distance, i, inter_score_gains])
 698
 699        # Now we can apply the second round of filtering to remove redundant options
  700        # Redundant options refer to bins that give a similar score gain but require a larger distance
 701        cont_options = sorted(cont_options, key=lambda x: x[2])
 702
 703        start = 0
 704        while start < len(cont_options):
 705            for i in range(len(cont_options) - 1, start, -1):
 706                if np.abs(cont_options[i][1] - cont_options[start][1]) < epsilon:
 707                    cont_options.pop(i)
 708
 709            start += 1
 710
 711        return cont_options
 712
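    # Each collected option is a list of the form
    #   [target_value, score_gain, distance, bin_index, inter_score_gains]
    # For example (hypothetical numbers), [15000.0, 0.31, 0.8, 7, [[12, 0.02]]]
    # means: moving this feature to 15000 changes the model score by +0.31 at a
    # MAD-normalized distance of 0.8, lands in bin 7, and includes a +0.02
    # offset from the interaction term with id 12.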
 713    def generate_cat_options(
 714        self,
 715        cf_direction,
 716        cur_feature_index,
 717        cur_feature_value,
 718        cur_feature_score,
 719        cur_cat_distance,
 720        cur_example,
 721        score_gain_bound=None,
 722        skip_unhelpful=True,
 723    ):
 724        """
 725        Generate all alternative options for this categorical variable. This function
 726        would filter out all options that are not helpful for the counterfactual
 727        generation.
 728
 729        Args:
 730            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
 731                (classification); `+1` if we need to increase the prediction,
 732                `-1` if decrease (regression).
  733            cur_feature_index (int): The index of the current categorical feature.
  734            cur_feature_value (str): The current feature value (level name).
 735            cur_feature_score (float): The score for the current feature value.
 736            cur_cat_distance (dict): A map of feature_level => 1 - frequency.
 737            cur_example (list): Current sample values.
 738            score_gain_bound (float): Bound of the score gain. We do not collect
 739                options that give `score_gain` > `score_gain_bound` (when
 740                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
 741                `cf_direction=-1`)
  742            skip_unhelpful (bool): True if options from main effects that give
  743                an opposite score gain should be skipped. It is rare for a positive
  744                score gain from a pair-interaction to outweigh the negative score
  745                gain from two main effects plus the distance penalty.
 746
 747        Returns:
 748            list: List of option tuples (target, score_gain, distance, bin_index).
 749        """
 750
 751        # Find other options for this categorical variable
 752        # For each option, we compute the (1) score gain, and (2) distance
 753        #
 754        # (1) Score gain is the same as continuous variables
 755        # (2) The distance is determined by 1 - the level frequency in the
 756        # training data. It implies that levels with high frequency are easier
 757        # to "move to"
 758
 759        # Get the additive scores of this feature
 760        additives = self.ebm.additive_terms_[cur_feature_index][1:]
 761
 762        # Get the bin edges of this feature
 763        levels = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)
 764
 765        # Create "options", each option is a tuple (target, score_gain, distance, bin_index)
 766        cat_options = []
 767
 768        # Identify interaction terms that we need to consider
 769        associated_interactions = []
 770
 771        for cur_feature_id in range(len(self.ebm.feature_names)):
 772            cur_feature_type = self.ebm.feature_types[cur_feature_id]
 773            if cur_feature_type == "interaction":
 774
 775                indexes = self.ebm.feature_groups_[cur_feature_id]
 776
 777                if cur_feature_index in indexes:
 778                    feature_position = 0 if indexes[0] == cur_feature_index else 1
 779
 780                    other_position = 1 - feature_position
 781                    other_index = indexes[other_position]
 782                    other_type = self.ebm.feature_types[other_index]
 783                    other_name = self.ebm.feature_names[other_index]
 784
 785                    # Get the current additive scores and bin edges
 786                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 787
 788                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
 789                        cur_feature_index
 790                    )
 791                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
 792                        other_index
 793                    )
 794
 795                    # Have to skip the max edge if it is continuous
 796                    if other_type == "continuous":
 797                        bin_starts_other = bin_starts_other[:-1]
 798
 799                    # Get the current interaction term score
 800                    other_bin = None
 801                    if other_type == "continuous":
 802                        other_bin = search_sorted_lower_index(
 803                            bin_starts_other, float(cur_example[other_index])
 804                        )
 805                    else:
 806                        other_bin = bin_starts_other.index(cur_example[other_index])
 807
 808                    feature_bin = bin_starts_feature.index(cur_feature_value)
 809
 810                    feature_inter_score = 0
 811
 812                    if feature_position == 0:
 813                        feature_inter_score = inter_additives[feature_bin, other_bin]
 814                    else:
 815                        feature_inter_score = inter_additives[other_bin, feature_bin]
 816
 817                    # Extract the row or column where we fix the other features and
 818                    # vary the current feature
 819                    feature_inter_bin_starts = bin_starts_feature
 820                    feature_inter_additives = []
 821
 822                    if feature_position == 0:
 823                        for i in range(len(inter_additives)):
 824                            feature_inter_additives.append(
 825                                inter_additives[i, other_bin]
 826                            )
 827                    else:
 828                        for i in range(len(inter_additives[0])):
 829                            feature_inter_additives.append(
 830                                inter_additives[other_bin, i]
 831                            )
 832
 833                    # Register this interaction term
 834                    associated_interactions.append(
 835                        {
 836                            "inter_index": indexes,
 837                            "cur_interaction_id": cur_feature_id,
 838                            "feature_inter_score": feature_inter_score,
 839                            "feature_inter_bin_starts": feature_inter_bin_starts,
 840                            "feature_inter_additives": feature_inter_additives,
 841                        }
 842                    )
 843
 844        for i in range(len(additives)):
 845            if levels[i] != cur_feature_value:
 846                target = levels[i]
 847                distance = cur_cat_distance[target]
 848
 849                # Compute score gain which has two parts:
 850                # (1) gain from the change of main effect
 851                # (2) gain from the change of interaction effect
 852
 853                # Main effect
 854                main_score_gain = additives[i] - cur_feature_score
 855
 856                # Interaction terms
 857                # A list to track all interaction score gain offsets
 858                # [[interaction id, interaction score gain]]
 859                inter_score_gain = 0
 860                inter_score_gains = []
 861
 862                for d in associated_interactions:
 863                    inter_bin_id = d["feature_inter_bin_starts"].index(target)
 864                    inter_score_gain += (
 865                        d["feature_inter_additives"][inter_bin_id]
 866                        - d["feature_inter_score"]
 867                    )
 868                    inter_score_gains.append(
 869                        [
 870                            d["cur_interaction_id"],
 871                            d["feature_inter_additives"][inter_bin_id]
 872                            - d["feature_inter_score"],
 873                        ]
 874                    )
 875
 876                score_gain = main_score_gain + inter_score_gain
 877
 878                # Skip unhelpful options
 879                if cf_direction * score_gain <= 0 and skip_unhelpful:
 880                    continue
 881
 882                # Filter out of bound options
 883                if score_gain_bound and skip_unhelpful:
 884                    if cf_direction == 1 and score_gain > score_gain_bound:
 885                        continue
 886                    if cf_direction == -1 and score_gain < score_gain_bound:
 887                        continue
 888
 889                cat_options.append([target, score_gain, distance, i, inter_score_gains])
 890
 891        return cat_options
 892
 893    def generate_inter_options(
 894        self,
 895        cur_feature_id,
 896        cur_feature_index_1,
 897        cur_feature_index_2,
 898        cur_feature_score,
 899        options,
 900    ):
 901        """
 902        Generate all possible options for this interaction variable.
 903
 904        Interaction terms are interesting in this MILP. Each option counts as a
 905        variable, but each variable only affects the score gain, not the distance.
 906
 907        Note that in EBM, the bin definitions for interaction terms can be different
 908        from their definitions for individual continuous variables.
 909
 910        To model interaction terms, we can think of each option as a binary
 911        variable whose value is determined by the product of two main effect
 912        variables. Each interaction variable describes a combination of two
 913        main effect variables. Therefore, if continuous variable A has $x$
 914        possible options and another continuous variable B has $y$ possible
 915        options, we should add $x \\times y$ binary variables to offset their
 916        possible interaction effects.
 917
 918        Args:
 919            cur_feature_id (int): The id of this interaction effect.
 920            cur_feature_index_1 (int): The index of the first main effect.
 921            cur_feature_index_2 (int): The index of the second main effect.
 922            cur_feature_score (float): The score for the current feature value.
 923            options (dict): The current option list, feature_name ->
 924                [`target`, `score_gain`, `distance`, `bin_id`].
 925
 926        Returns:
 927            List of option tuples (target, score_gain, distance, bin_index)
 928        """
 929
 930        # Get the sub-types for this interaction term
 931        cur_feature_type_1 = self.ebm.feature_types[cur_feature_index_1]
 932        cur_feature_type_2 = self.ebm.feature_types[cur_feature_index_2]
 933
 934        # Get the sub-names for this interaction term
 935        cur_feature_name_1 = self.ebm.feature_names[cur_feature_index_1]
 936        cur_feature_name_2 = self.ebm.feature_names[cur_feature_index_2]
 937
 938        # The first column and row are reserved for missing values (even with
 939        # categorical features)
 940        additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 941
 942        # Four possibilities here: cont x cont, cont x cat, cat x cont, cat x cat.
 943        # Each has a different way to lookup the bin table.
 944        inter_options = []
 945
 946        # Iterate through all possible combinations of options from these two
 947        # variables
 948        for opt_1 in options[cur_feature_name_1]:
 949            for opt_2 in options[cur_feature_name_2]:
 950
 951                bin_starts_1 = self.ebm.pair_preprocessor_._get_bin_labels(
 952                    cur_feature_index_1
 953                )
 954                bin_starts_2 = self.ebm.pair_preprocessor_._get_bin_labels(
 955                    cur_feature_index_2
 956                )
 957
 958                bin_1 = None
 959                bin_2 = None
 960
 961                if cur_feature_type_1 == "continuous":
 962                    if cur_feature_type_2 == "continuous":
 963                        # cont x cont
 964                        bin_starts_1 = bin_starts_1[:-1]
 965                        bin_starts_2 = bin_starts_2[:-1]
 966
 967                        # locate the bin for each option value
 968                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 969                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 970
 971                    else:
 972                        # cont x cat
 973                        bin_starts_1 = bin_starts_1[:-1]
 974
 975                        # locate the bin for each option value
 976                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 977                        bin_2 = bin_starts_2.index(opt_2[0])
 978
 979                else:
 980                    if cur_feature_type_2 == "continuous":
 981                        # cat x cont
 982                        bin_starts_2 = bin_starts_2[:-1]
 983
 984                        # locate the bin for each option value
 985                        bin_1 = bin_starts_1.index(opt_1[0])
 986                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 987
 988                    else:
 989                        # cat x cat
 990
 991                        # locate the bin for each option value
 992                        bin_1 = bin_starts_1.index(opt_1[0])
 993                        bin_2 = bin_starts_2.index(opt_2[0])
 994
 995                new_score = additives[bin_1, bin_2]
 996                score_gain = new_score - cur_feature_score
 997
 998                # The score gain on the interaction term needs to offset the interaction
 999                # score gain we have already counted on the main effect options. That
1000                # score is saved in the option tuple.
1001
1002                # We first need to find the common interaction id
1003                common_index = [-1, -1]
1004                for m in range(len(opt_1[4])):
1005                    for n in range(len(opt_2[4])):
1006                        if opt_1[4][m][0] == opt_2[4][n][0]:
1007                            common_index = [m, n]
1008                            break
1009
1010                    if common_index[0] != -1 and common_index[1] != -1:
1011                        break
1012
1013                score_gain -= opt_1[4][common_index[0]][1]
1014                score_gain -= opt_2[4][common_index[1]][1]
1015
1016                inter_options.append(
1017                    [[opt_1[0], opt_2[0]], score_gain, 0, [opt_1[3], opt_2[3]], 0]
1018                )
1019
1020        return inter_options
1021
1022    @staticmethod
1023    def create_milp(
1024        cf_direction,
1025        needed_score_gain,
1026        features_to_vary,
1027        options,
1028        max_num_features_to_vary=None,
1029        muted_variables=[],
1030    ):
1031        """
1032        Create a MILP to find counterfactuals (CF) using PuLP.
1033
1034        Args:
1035            cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification),
1036                +1 if we need to increase the prediction, -1 if decrease (regression).
1037            needed_score_gain (float): The score gain needed to achieve the CF goal.
1038            features_to_vary (list[str]): Feature names of features that the
1039                generated CF can change.
1040            options (dict): Possible options for each variable. Each option is a
1041                list [target, score_gain, distance, bin_index].
1042            max_num_features_to_vary (int, optional): Max number of features that the
1043                generated CF can change. If the value is `None`, the CFs can
1044                change any number of features.
1045            muted_variables (list[str], optional): Variables that this MILP should
1046                not use. This is useful to mute optimal variables so we can explore
1047                diverse solutions. This list should not include interaction variables.
1048
1049        Returns:
1050            A tuple (`model`, `variables`), where `model` is a pulp.LpProblem
1051            model that encodes the MILP problem, and `variables` is a dict of
1052            variables used in the `model`: `feature_name` => [`variables`].
1053        """
1054
1055        # Create a model (minimizing the distance)
1056        model = pulp.LpProblem("ebmCounterfactual", pulp.LpMinimize)
1057
1058        distance = 0
1059        score_gain = 0
1060
1061        muted_variables_set = set(muted_variables)
1062
1063        # Create variables
1064        variables = {}
1065        for f in features_to_vary:
1066            # Each variable encodes an option (0: not use this option,
1067            # 1: use this option)
1068            cur_variables = []
1069
1070            for option in options[f]:
1071                var_name = "{}:{}".format(f, option[3])
1072
1073                # Skip the muted variables
1074                if var_name in muted_variables_set:
1075                    continue
1076
1077                x = pulp.LpVariable(var_name, lowBound=0, upBound=1, cat="Binary")
1078                x.setInitialValue(0)
1079
1080                score_gain += option[1] * x
1081                distance += option[2] * x
1082
1083                cur_variables.append(x)
1084
1085            variables[f] = cur_variables
1086
1087            # A local constraint is that we can select at most one option
1088            # from one feature
1089            model += pulp.lpSum(cur_variables) <= 1
1090
1091        # Users can also set `max_num_features_to_vary` to control the total
1092        # number of features to vary
1093        if max_num_features_to_vary is not None:
1094            main_variables = []
1095            for f in variables:
1096                main_variables.extend(variables[f])
1097
1098            model += pulp.lpSum(main_variables) <= max_num_features_to_vary
1099
1100        # Create variables for interaction effects
1101        for opt_name in options:
1102            if " x " in opt_name:
1103                f1_name = re.sub(r"(.+)\sx\s.+", r"\1", opt_name)
1104                f2_name = re.sub(r".+\sx\s(.+)", r"\1", opt_name)
1105
1106                if f1_name in features_to_vary and f2_name in features_to_vary:
1107
1108                    # We need to include this interaction effect
1109                    cur_variables = []
1110
1111                    for option in options[opt_name]:
1112                        z = pulp.LpVariable(
1113                            "{}:{},{}".format(opt_name, option[3][0], option[3][1]),
1114                            lowBound=0,
1115                            upBound=1,
1116                            cat="Continuous",
1117                        )
1118                        z.setInitialValue(0)
1119
1120                        # Need to iterate through existing variables for f1 and f2 to find
1121                        # the corresponding variables
1122                        x_f1 = None
1123                        x_f2 = None
1124
1125                        # Skip this interaction variable if it involves a muted main variable
1126                        x_f1_name = "{}:{}".format(f1_name, option[3][0])
1127                        x_f2_name = "{}:{}".format(f2_name, option[3][1])
1128
1129                        if (
1130                            x_f1_name in muted_variables_set
1131                            or x_f2_name in muted_variables_set
1132                        ):
1133                            continue
1134
1135                        for x in variables[f1_name]:
1136                            if x.name == x_f1_name:
1137                                x_f1 = x
1138                                break
1139
1140                        for x in variables[f2_name]:
1141                            if x.name == x_f2_name:
1142                                x_f2 = x
1143                                break
1144
1145                        assert x_f1 is not None and x_f2 is not None
1146
1147                        # variable z is actually the product of x_f1 and x_f2
1148                        # We can linearize it by 3 constraints
1149                        model += z <= x_f1
1150                        model += z <= x_f2
1151                        model += z >= x_f1 + x_f2 - 1
1152
1153                        cur_variables.append(z)
1154
1155                    variables[opt_name] = cur_variables
1156
1157        # Use constraint to express counterfactual
1158        if cf_direction == 1:
1159            model += score_gain >= needed_score_gain
1160        else:
1161            model += score_gain <= needed_score_gain
1162
1163        # We want to minimize the distance
1164        model += distance
1165
1166        return model, variables
1167
1168    def print_solution(self, cur_example, active_variables, options):
1169        """
1170        Print the optimal solution.
1171
1172        Args:
1173            cur_example (np.ndarray): the original data point.
1174            active_variables (list[variable]): binary variables with value 1.
1175            options (dict): all the possible options for all features.
1176        """
1177
1178        for var in active_variables:
1179            # Main-effect variables; interaction variables are handled in the else branch
1180            if "_x_" not in var.name:
1181                f_name = re.sub(r"(.+):\d+", r"\1", var.name)
1182                bin_i = int(re.sub(r".+:(\d+)", r"\1", var.name))
1183
1184                # Find the original value
1185                org_value = cur_example[0][self.ebm.feature_names.index(f_name)]
1186
1187                # Find the target bin
1188                f_index = self.ebm.feature_names.index(f_name)
1189                f_type = self.ebm.feature_types[f_index]
1190
1191                if f_type == "continuous":
1192                    bin_starts = self.ebm.preprocessor_._get_bin_labels(f_index)[:-1]
1193
1194                    target_bin = "[{},".format(bin_starts[bin_i])
1195
1196                    if bin_i + 1 < len(bin_starts):
1197                        target_bin += " {})".format(bin_starts[bin_i + 1])
1198                    else:
1199                        target_bin += " inf)"
1200                else:
1201                    target_bin = ""
1202                    org_value = '"{}"'.format(org_value)
1203
1204                for option in options[f_name]:
1205                    if option[3] == bin_i:
1206                        print(
1207                            "Change <{}> from {} to {} {}".format(
1208                                f_name, org_value, option[0], target_bin
1209                            )
1210                        )
1211                        print(
1212                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1213                                option[1], option[2]
1214                            )
1215                        )
1216                        break
1217
1218            else:
1219                f_name = re.sub(r"(.+):.+", r"\1", var.name)
1220                f_name = f_name.replace("_x_", " x ")
1221                bin_0 = int(re.sub(r".+:(\d+),\d+", r"\1", var.name))
1222                bin_1 = int(re.sub(r".+:\d+,(\d+)", r"\1", var.name))
1223
1224                for option in options[f_name]:
1225                    if option[3][0] == bin_0 and option[3][1] == bin_1:
1226                        print("Trigger interaction term: <{}>".format(f_name))
1227                        print(
1228                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1229                                option[1], 0
1230                            )
1231                        )
1232                        break
1233        print()
1234
1235    @staticmethod
1236    def compute_mad(xs):
1237        """
1238        Compute the median absolute deviation of a continuous feature.
1239
1240        Args:
1241            xs (np.ndarray): A column of continuous values.
1242
1243        Returns:
1244            float: MAD value of xs.
1245        """
1246        xs_median = np.median(xs.astype(float))
1247        mad = np.median(np.abs(xs.astype(float) - xs_median))
1248        return mad
1249
1250    @staticmethod
1251    def compute_frequency_distance(xs):
1252        """
1253        For categorical variables, we compute 1 - frequency as their distance. It implies
1254        that switching to a frequent value takes less effort.
1255
1256        Args:
1257            xs (np.ndarray): A column of categorical values.
1258
1259        Returns:
1260            dict: category level -> 1 - frequency.
1261        """
1262        counter = Counter(xs)
1263
1264        results = {}
1265
1266        for key in counter:
1267            results[key] = 1 - (counter[key] / len(xs))
1268
1269        return results
1270
1271    @staticmethod
1272    def compute_naive_cat_distance(xs):
1273        """
1274        Alternative to compute_frequency_distance() to compute distance for
1275        categorical variables. The distance is 1 for different levels and 0 for
1276        the same level. Here we give them all score 1, because same-level
1277        options will be filtered out when we create categorical options for the
1278        optimization program.
1279
1280        Args:
1281            xs (np.ndarray): A column of categorical values.
1282
1283        Returns:
1284            dict: category level -> 1.
1285        """
1286        counter = Counter(xs)
1287        results = {}
1288
1289        for key in counter:
1290            results[key] = 1
1291
1292        return results

Main class for GAM Coach.

GAMCoach( ebm: Union[interpret.glassbox.ebm.ebm.ExplainableBoostingClassifier, interpret.glassbox.ebm.ebm.ExplainableBoostingRegressor], x_train: numpy.ndarray, cont_mads=None, cat_distances=None, adjust_cat_distance=True)
 29    def __init__(
 30        self,
 31        ebm: Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor],
 32        x_train: np.ndarray,
 33        cont_mads=None,
 34        cat_distances=None,
 35        adjust_cat_distance=True,
 36    ):
 37        """Initialize a GAMCoach object.
 38
 39        Args:
 40            ebm (Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor]):
 41                The trained EBM model. It can be either a classifier or a regressor.
 42            x_train (np.ndarray): The training data. It is used to compute the
 43                distance for different features.
 44            cont_mads (dict, optional): `feature_name` -> `median absolute
 45                deviation score`. If it is provided, it is used to overwrite the
 46                computed MADs for continuous variables. It is useful when you
 47                want to provide a custom normalization function to compute the
 48                distance between continuous features.
 49            cat_distances (dict, optional): `feature_name` -> {`level_name` -> `distance`}.
 50                Level distance of categorical variables. By default, the distance
 51                is computed by (1 - frequency(level)) for each level. It implies
 52                that it is easier to move to a more frequent level. If `cat_distances`
 53                is provided, it will overwrite the default distance for
 54                categorical variables.
 55            adjust_cat_distance (bool, optional): If true, we use (1 -
 56                frequency(level)) for each level. Otherwise, we give distance = 1
 57                for different levels and 0 for the same level.
 58        """
 59
 60        self.ebm: Union[
 61            ExplainableBoostingClassifier, ExplainableBoostingRegressor
 62        ] = ebm
 63        """The trained EBM model."""
 64
 65        self.x_train: np.ndarray = x_train
 66
 67        self.cont_mads: dict = cont_mads
 68        """Median absolute deviation (MAD) of continuous variables."""
 69
 70        self.cat_distances: dict = cat_distances
 71        """Level distance of categorical variables. By default, the distance is
 72        computed by $(1 - \\frac{\\text{count of} L_i}{\\text{count of all L}})$
 73        for one level $L_i$. It implies that it is easier to move to a more
 74        frequent level.
 75        """
 76
 77        self.adjust_cat_distance: bool = adjust_cat_distance
 78
 79        # If cont_mads is not given, we compute it from the training data
 80        if self.cont_mads is None:
 81            ebm_cont_indexes = np.array(
 82                [
 83                    i
 84                    for i in range(len(self.ebm.feature_names))
 85                    if self.ebm.feature_types[i] == "continuous"
 86                ]
 87            )
 88
 89            self.cont_mads = {}
 90
 91            for i in ebm_cont_indexes:
 92                self.cont_mads[ebm.feature_names[i]] = self.compute_mad(
 93                    self.x_train[:, i]
 94                )
 95
 96        # If cat_distance is not given, we compute it from the training data
 97        if self.cat_distances is None:
 98            ebm_cat_indexes = np.array(
 99                [
100                    i
101                    for i in range(len(self.ebm.feature_names))
102                    if self.ebm.feature_types[i] == "categorical"
103                ]
104            )
105
106            self.cat_distances = {}
107
108            if self.adjust_cat_distance:
109                for i in ebm_cat_indexes:
110                    self.cat_distances[
111                        self.ebm.feature_names[i]
112                    ] = GAMCoach.compute_frequency_distance(self.x_train[:, i])
113            else:
114                for i in ebm_cat_indexes:
115                    self.cat_distances[
116                        self.ebm.feature_names[i]
117                    ] = GAMCoach.compute_naive_cat_distance(self.x_train[:, i])
118
119        # Determine if the ebm is a classifier or a regressor
120        self.is_classifier = isinstance(self.ebm.intercept_, np.ndarray)
121        """True if the ebm model is a classifier, false if it is a regressor."""

Initialize a GAMCoach object.

Args
  • ebm (Union[ExplainableBoostingClassifier, ExplainableBoostingRegressor]): The trained EBM model. It can be either a classifier or a regressor.
  • x_train (np.ndarray): The training data. It is used to compute the distance for different features.
  • cont_mads (dict, optional): feature_name -> median absolute deviation score. If it is provided, it is used to overwrite the computed MADs for continuous variables. It is useful when you want to provide a custom normalization function to compute the distance between continuous features.
  • cat_distances (dict, optional): feature_name -> {level_name -> distance}. Level distance of categorical variables. By default, the distance is computed by (1 - frequency(level)) for each level. It implies that it is easier to move to a more frequent level. If cat_distances is provided, it will overwrite the default distance for categorical variables.
  • adjust_cat_distance (bool, optional): If true, we use (1 - frequency(level)) for each level. Otherwise, we give distance = 1 for different levels and 0 for the same level.
ebm: Union[interpret.glassbox.ebm.ebm.ExplainableBoostingClassifier, interpret.glassbox.ebm.ebm.ExplainableBoostingRegressor]

The trained EBM model.

cont_mads: dict

Median absolute deviation (MAD) of continuous variables.

cat_distances: dict

Level distance of categorical variables. By default, the distance is computed by $(1 - \frac{\text{count of } L_i}{\text{count of all } L})$ for one level $L_i$. It implies that it is easier to move to a more frequent level.

is_classifier

True if the ebm model is a classifier, false if it is a regressor.
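
A minimal construction sketch (not part of the module source): it trains a toy EBM classifier on synthetic data and wraps it in a GAMCoach. The feature names, the synthetic data, and the top-level import path are illustrative assumptions.

    import numpy as np
    from interpret.glassbox import ExplainableBoostingClassifier

    from gamcoach import GAMCoach  # top-level import path assumed

    rng = np.random.default_rng(0)

    # Synthetic training data with two continuous features (illustrative only)
    x_train = rng.normal(size=(500, 2))
    y_train = (x_train[:, 0] + x_train[:, 1] > 0).astype(int)

    ebm = ExplainableBoostingClassifier(
        feature_names=["feature_a", "feature_b"],
        feature_types=["continuous", "continuous"],
    )
    ebm.fit(x_train, y_train)

    # MADs for continuous features (and level distances for categorical ones,
    # if any) are derived from x_train unless cont_mads / cat_distances are given
    coach = GAMCoach(ebm, x_train)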

def generate_cfs( self, cur_example: numpy.ndarray, total_cfs: int = 1, target_range: tuple = None, sim_threshold_factor: float = 0.005, sim_threshold: float = None, categorical_weight: Union[float, str] = 'auto', features_to_vary: list = None, max_num_features_to_vary: int = None, feature_ranges: dict = None, continuous_integer_features: list = None, verbose: int = 1) -> gamcoach.counterfactuals.Counterfactuals:
123    def generate_cfs(
124        self,
125        cur_example: np.ndarray,
126        total_cfs: int = 1,
127        target_range: tuple = None,
128        sim_threshold_factor: float = 0.005,
129        sim_threshold: float = None,
130        categorical_weight: Union[float, str] = "auto",
131        features_to_vary: list = None,
132        max_num_features_to_vary: int = None,
133        feature_ranges: dict = None,
134        continuous_integer_features: list = None,
135        verbose: int = 1,
136    ) -> Counterfactuals:
137        """Generate counterfactual examples.
138
139        Use mixed-integer linear programming to generate optimal counterfactual
140        examples for the given data point.
141
142        Args:
143            cur_example (np.ndarray): The data point of interest. This function
144                aims to find similar examples for which the model gives different
145                predictions.
146            total_cfs (int, optional): The total number of counterfactuals to
147                generate. Defaults to 1.
148            target_range (tuple, optional): The targeted prediction range. This
149                parameter is required if the EBM is a regressor.
150            sim_threshold_factor (float, optional): A positive float to automatically
151                generate a similarity threshold. This parameter has no effect if
152                `sim_threshold` is provided. If `sim_threshold` is
153                not provided, we compute `sim_threshold` as `sim_threshold_factor`
154                * average additive score range of all continuous features. If
155                `sim_threshold_factor` is too small, it takes longer time to
156                generate CFs. If `sim_threshold_factor` is too large, the
157                algorithm might miss some optimal CFs.
158            sim_threshold (float, optional): A positive float to determine how we
159                decide if two bins of a continuous feature have similar scores.
160                Two bins $b_1$ and $b_2$ are similar (the distant one will be
161                removed) if $|b_1 - b_2| \\leq$ `sim_threshold`.
162            categorical_weight (Union[float, str], optional): A positive float
163                to scale the distances of options for categorical variables. Since
164                we have very different distance functions for continuous and
165                categorical features, we need to scale them so they are at a
166                comparable range. To do that, we multiply the categorical feature's
167                distances by `categorical_weight`. By default ('auto'), we scale
168                the distances of categorical features so that they have the same
169                mean distance as continuous features.
170            features_to_vary ([str], optional): A list of feature names that
171                the CFs can change. If it is `None`, this function will use all
172                features.
173            max_num_features_to_vary (int, optional): The max number of features
174                that the CF can vary. Default is no maximum.
175            feature_ranges (dict, optional): A dictionary to control the permitted
176                ranges/values for continuous/categorical features. It maps
177                `feature_name` -> [`min_value`, `max_value`] for continuous features,
178                `feature_name` -> [`level1`, `level2`, ...] for categorical features.
179            continuous_integer_features (list, optional): A list of names of
180                continuous features that need to be integers (e.g., age, FICO score)
181            verbose (int): 0: no output, 1: show progress bar, 2: show internal
182                optimization details
183
184        Returns:
185            Counterfactuals: The generated counterfactual examples with their
186                associated distances and change information.
187        """
188
189        # Transforming some parameters
190        if len(cur_example.shape) == 1:
191            cur_example = cur_example.reshape(1, -1)
192
193        if features_to_vary is None:
194            features_to_vary = [
195                self.ebm.feature_names[i]
196                for i in range(len(self.ebm.feature_types))
197                if self.ebm.feature_types[i] != "interaction"
198            ]
199
200        # Step 1: Find the current score for each feature
201        # This is done by ebm.explain_local()
202        cur_scores = {}
203
204        if self.is_classifier:
205            cur_scores["intercept"] = self.ebm.intercept_[0]
206        else:
207            cur_scores["intercept"] = self.ebm.intercept_
208
209        local_data = self.ebm.explain_local(cur_example)._internal_obj
210
211        for i in range(len(self.ebm.feature_names)):
212            cur_feature_name = self.ebm.feature_names[i]
213            cur_feature_type = self.ebm.feature_types[i]
214
215            cur_scores[cur_feature_name] = local_data["specific"][0]["scores"][i]
216
217        # Find the CF direction
218
219        # Binary classification
220        # Predicted 0 => +1
221        # Predicted 1 => -1
222        if self.is_classifier:
223            cf_direction = self.ebm.predict(cur_example)[0] * (-2) + 1
224            total_score = np.sum([cur_scores[k] for k in cur_scores])
225            needed_score_gain = -total_score
226            score_gain_bound = None
227
228        else:
229            # Regression
230            # Increase +1
231            # Decrease -1
232            if target_range is None:
233                raise ValueError(
234                    "target_range cannot be None when the model is a regressor"
235                )
236
237            predicted_value = self.ebm.predict(cur_example)[0]
238            if (
239                predicted_value >= target_range[0]
240                and predicted_value <= target_range[1]
241            ):
242                raise ValueError("The target_range cannot cover the current prediction")
243
244            elif predicted_value < target_range[0]:
245                cf_direction = 1
246                needed_score_gain = target_range[0] - predicted_value
247                score_gain_bound = target_range[1] - predicted_value
248            else:
249                cf_direction = -1
250                needed_score_gain = target_range[1] - predicted_value
251                score_gain_bound = target_range[0] - predicted_value
252
253        # Step 2: Generate continuous and categorical options
254        options = {}
255
256        # Generate a similarity threshold if it is not provided
257        if sim_threshold is None:
258            additive_ranges = []
259
260            for i in range(len(self.ebm.feature_names)):
261                if self.ebm.feature_types[i] == "continuous":
262                    cur_values = self.ebm.additive_terms_[i]
263                    additive_ranges.append(np.max(cur_values) - np.min(cur_values))
264
265            sim_threshold = np.mean(additive_ranges) * sim_threshold_factor
266
267        # To make it faster to solve the MILP problem, we can decrease the
268        # number of variables by filtering out unhelpful and redundant options
269        #
270        # (1) Unhelpful options: options that move the score to an undesirable
271        # direction. For example, if we want to flip 0 to 1, options that decrease
272        # the score are unhelpful.
273        #
274        # (2) Redundant options: for a set of options that give similar score
275        # gains (bounded by a parameter epsilon), we only need to include one
276        # option that has the lowest distance. This is only relevant for
277        # continuous variables. Users can set the parameter epsilon. The default
278        # should be relatively small, otherwise we might miss the optimal solution.
279
280        # Step 2.1: Find all good options from continuous and categorical features
281        for cur_feature_id in range(len(self.ebm.feature_names)):
282
283            cur_feature_name = self.ebm.feature_names[cur_feature_id]
284            cur_feature_type = self.ebm.feature_types[cur_feature_id]
285            cur_feature_index = self.ebm.feature_groups_[cur_feature_id][0]
286
287            if cur_feature_type == "interaction":
288                continue
289
290            elif cur_feature_type == "continuous":
291                # The parameter epsilon controls the threshold of how we determine
292                # "similar" options for continuous variables
293                epsilon = sim_threshold
294
295                cur_feature_score = cur_scores[cur_feature_name]
296                cur_feature_value = float(cur_example[0][cur_feature_id])
297
298                # Users can require the continuous feature to have integer values
299                # For example, age, FICO score, and number of accounts
300                need_to_be_int = False
301                if (
302                    continuous_integer_features
303                    and cur_feature_name in continuous_integer_features
304                ):
305                    need_to_be_int = True
306
307                cur_cont_options = self.generate_cont_options(
308                    cf_direction,
309                    cur_feature_index,
310                    cur_feature_name,
311                    cur_feature_value,
312                    cur_feature_score,
313                    self.cont_mads,
314                    cur_example[0],
315                    score_gain_bound,
316                    epsilon,
317                    need_to_be_int,
318                )
319
320                options[cur_feature_name] = cur_cont_options
321
322            elif cur_feature_type == "categorical":
323                cur_feature_score = cur_scores[cur_feature_name]
324                cur_feature_value = str(cur_example[0][cur_feature_id])
325                cur_cat_distance = self.cat_distances[cur_feature_name]
326
327                cur_cat_options = self.generate_cat_options(
328                    cf_direction,
329                    cur_feature_index,
330                    cur_feature_value,
331                    cur_feature_score,
332                    cur_cat_distance,
333                    cur_example[0],
334                    score_gain_bound,
335                )
336
337                options[cur_feature_name] = cur_cat_options
338
339        # Step 2.2: Filter out undesired options (based on the feature_range)
340        if feature_ranges is not None:
341            for f_name in feature_ranges:
342                cur_range = feature_ranges[f_name]
343                f_index = self.ebm.feature_names.index(f_name)
344                f_type = self.ebm.feature_types[f_index]
345
346                if f_type == "continuous":
347                    # Delete options that use out-of-range options
348                    for o in range(len(options[f_name]) - 1, -1, -1):
349                        cur_target = options[f_name][o][0]
350                        if cur_target < cur_range[0] or cur_target > cur_range[1]:
351                            options[f_name].pop(o)
352                elif f_type == "categorical":
353                    for o in range(len(options[f_name]) - 1, -1, -1):
354                        if options[f_name][o][0] not in cur_range:
355                            options[f_name].pop(o)
356
357        # Step 2.3: Compute the interaction offsets for all possible options
358        for cur_feature_id in range(len(self.ebm.feature_names)):
359
360            cur_feature_name = self.ebm.feature_names[cur_feature_id]
361            cur_feature_type = self.ebm.feature_types[cur_feature_id]
362
363            if cur_feature_type == "interaction":
364
365                cur_feature_index_1 = self.ebm.feature_groups_[cur_feature_id][0]
366                cur_feature_index_2 = self.ebm.feature_groups_[cur_feature_id][1]
367
368                cur_feature_score = cur_scores[cur_feature_name]
369                options[cur_feature_name] = self.generate_inter_options(
370                    cur_feature_id,
371                    cur_feature_index_1,
372                    cur_feature_index_2,
373                    cur_feature_score,
374                    options,
375                )
376
377        # Step 2.4: Rescale categorical distances so that they have the same mean
378        # as continuous variables (default)
379        if categorical_weight == "auto":
380            cont_distances = []
381            cat_distances = []
382
383            for f_name in options:
384                f_index = self.ebm.feature_names.index(f_name)
385                f_type = self.ebm.feature_types[f_index]
386
387                if f_type == "continuous":
388                    for option in options[f_name]:
389                        cont_distances.append(option[2])
390                elif f_type == "categorical":
391                    for option in options[f_name]:
392                        cat_distances.append(option[2])
393
394            categorical_weight = np.mean(cont_distances) / np.mean(cat_distances)
395
396        for f_name in options:
397            f_index = self.ebm.feature_names.index(f_name)
398            f_type = self.ebm.feature_types[f_index]
399
400            if f_type == "categorical":
401                for option in options[f_name]:
402                    option[2] = option[2] * categorical_weight
403
404        # Step 3. Formulate the MILP model and solve it
405
406        # Find diverse solutions by accumulatively muting the optimal solutions
407        solutions = []
408        muted_variables = []
409        is_successful = True
410
411        for _ in tqdm(range(total_cfs), disable=verbose == 0):
412            model, variables = self.create_milp(
413                cf_direction,
414                needed_score_gain,
415                features_to_vary,
416                options,
417                max_num_features_to_vary,
418                muted_variables=muted_variables,
419            )
420
421            model.solve(pulp.apis.PULP_CBC_CMD(msg=verbose > 0, warmStart=True))
422
423            if model.status != 1:
424                is_successful = False
425
426            if verbose == 2:
427                print("solver runs for {:.2f} seconds".format(model.solutionTime))
428                print("status: {}".format(pulp.LpStatus[model.status]))
429
430            active_variables = []
431
432            # Print the optimal solution
433            for key in variables:
434                for x in variables[key]:
435                    if x.varValue > 0:
436                        active_variables.append(x)
437
438            if verbose == 2:
439                print("\nFound solutions:")
440                self.print_solution(cur_example, active_variables, options)
441
442            # Collect the current solution and mute the associated variables
443            solutions.append([active_variables, pulp.value(model.objective)])
444
445            for var in active_variables:
446                if " x " not in var.name:
447                    muted_variables.append(var.name)
448
449        cfs = Counterfactuals(
450            solutions, is_successful, model, variables, self.ebm, cur_example, options
451        )
452
453        return cfs

Generate counterfactual examples.

Use mixed-integer linear programming to generate optimal counterfactual examples for the given data point.

Args
  • cur_example (np.ndarray): The data point of interest. This function aims to find similar examples for which the model gives different predictions.
  • total_cfs (int, optional): The total number of counterfactuals to generate. Defaults to 1.
  • target_range (tuple, optional): The targeted prediction range. This parameter is required if the EBM is a regressor.
  • sim_threshold_factor (float, optional): A positive float to automatically generate a similarity threshold. This parameter has no effect if sim_threshold is provided. If sim_threshold is not provided, we compute sim_threshold as sim_threshold_factor * average additive score range of all continuous features. If sim_threshold_factor is too small, it takes longer time to generate CFs. If sim_threshold_factor is too large, the algorithm might miss some optimal CFs.
  • sim_threshold (float, optional): A positive float to determine how we decide if two bins of a continuous feature have similar scores. Two bins $b_1$ and $b_2$ are similar (the distant one will be removed) if $|b_1 - b_2| \leq$ sim_threshold.
  • categorical_weight (Union[float, str], optional): A positive float to scale the distances of options for categorical variables. Since we have very different distance functions for continuous and categorical features, we need to scale them so they are at a comparable range. To do that, we multiply the categorical feature's distances by categorical_weight. By default ('auto'), we scale the distances of categorical features so that they have the same mean distance as continuous features.
  • features_to_vary ([str], optional): A list of feature names that the CFs can change. If it is None, this function will use all features.
  • max_num_features_to_vary (int, optional): The max number of features that the CF can vary. Default is no maximum.
  • feature_ranges (dict, optional): A dictionary to control the permitted ranges/values for continuous/categorical features. It maps feature_name -> [min_value, max_value] for continuous features, feature_name -> [level1, level2, ...] for categorical features.
  • continuous_integer_features (list, optional): A list of names of continuous features that need to be integers (e.g., age, FICO score)
  • verbose (int): 0: no output, 1: show progress bar, 2: show internal optimization details
Returns

Counterfactuals: The generated counterfactual examples with their associated distances and change information.
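
A minimal usage sketch, continuing the construction example above; the feature names, permitted range, and number of counterfactuals are illustrative assumptions.

    # Ask for five diverse counterfactuals for one training example, letting
    # only "feature_a" change and keeping it inside an assumed permitted range
    cur_example = x_train[0]

    cfs = coach.generate_cfs(
        cur_example,
        total_cfs=5,
        features_to_vary=["feature_a"],
        feature_ranges={"feature_a": [-2.0, 2.0]},
        verbose=0,
    )

For a regressor, target_range is required (e.g., target_range=(low, high)), and it must not contain the current prediction.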

def generate_cont_options( self, cf_direction, cur_feature_index, cur_feature_name, cur_feature_value, cur_feature_score, cont_mads, cur_example, score_gain_bound=None, epsilon=0.005, need_to_be_int=False, skip_unhelpful=True):
455    def generate_cont_options(
456        self,
457        cf_direction,
458        cur_feature_index,
459        cur_feature_name,
460        cur_feature_value,
461        cur_feature_score,
462        cont_mads,
463        cur_example,
464        score_gain_bound=None,
465        epsilon=0.005,
466        need_to_be_int=False,
467        skip_unhelpful=True,
468    ):
469        """
470        Generate all alternative options for this continuous variable. This function
471        would filter out all options that are:
472
473        1. Not helpful for the counterfactual generation.
474        2. Giving a similar score gain but requiring a larger distance.
475
476        Args:
477            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
478                (classification); `+1` if we need to increase the prediction,
479                `-1` if decrease (regression).
480            cur_feature_index (int): The index of the current continuous feature.
481            cur_feature_name (str): Name of the current feature.
482            cur_feature_value (float): The current feature value.
483            cur_feature_score (float): The score for the current feature value.
484            cont_mads (dict): A map of feature_name => MAD score.
485            cur_example (list): Current sample values.
486            score_gain_bound (float): Bound of the score gain. We do not collect
487                options that give `score_gain` > `score_gain_bound` (when
488                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
489                `cf_direction=-1`)
490            epsilon (float): The threshold to determine if two options give similar
491                score gains. Score gains $s_1$ and $s_2$ are similar if
492                $|s_1 - s_2| <$ epsilon. Smaller epsilon significantly increases
493                the time to solve the MILP. Large epsilon might filter out the
494                optimal CF. Defaults to 0.005.
495            need_to_be_int (bool): True if the target values for this continuous
496                variable need to have integer values.
497            skip_unhelpful (bool): True to skip options from main
498                effects that give score gain in the opposite direction. It is
499                rare for a positive score gain from a pair interaction to outweigh
500                the negative score gain from two main effects plus the distance penalty.
501
502        Returns:
503            list: List of option tuples (target, score_gain, distance, bin_index, inter_score_gains)
504        """
505
506        # For each continuous feature, each bin is a variable
507        # For each bin, we need to compute (1) score gain, (2) distance
508        # (1) score gain is the difference between new bin and current bin
509        # (2) distance is L1 distance divided by median absolute deviation (MAD)
510
511        # Get the additive scores of this feature
512        additives = self.ebm.additive_terms_[cur_feature_index][1:]
513
514        # Get the bin edges of this feature
515        bin_starts = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)[:-1]
516
517        # Create "options", each option is a tuple (target, score_gain, distance,
518        # bin_index)
519        cont_options = []
520
521        # Identify which bin this value falls into
522        cur_bin_id = search_sorted_lower_index(bin_starts, cur_feature_value)
523        assert additives[cur_bin_id] == cur_feature_score
524
525        # Identify interaction terms that we need to consider
526        associated_interactions = []
527
528        for cur_feature_id in range(len(self.ebm.feature_names)):
529            cur_feature_type = self.ebm.feature_types[cur_feature_id]
530            if cur_feature_type == "interaction":
531
532                indexes = self.ebm.feature_groups_[cur_feature_id]
533
534                if cur_feature_index in indexes:
535                    feature_position = 0 if indexes[0] == cur_feature_index else 1
536
537                    other_position = 1 - feature_position
538                    other_index = indexes[other_position]
539                    other_type = self.ebm.feature_types[other_index]
540
541                    # Get the current additive scores and bin edges
542                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
543
544                    # Have to skip the max edge if it is continuous
545                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
546                        cur_feature_index
547                    )[:-1]
548
549                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
550                        other_index
551                    )
552                    if other_type == "continuous":
553                        bin_starts_other = bin_starts_other[:-1]
554
555                    # Get the current interaction term score
556                    other_bin = None
557                    if other_type == "continuous":
558                        other_bin = search_sorted_lower_index(
559                            bin_starts_other, float(cur_example[other_index])
560                        )
561                    else:
562                        other_bin = bin_starts_other.index(cur_example[other_index])
563
564                    feature_bin = search_sorted_lower_index(
565                        bin_starts_feature, cur_feature_value
566                    )
567
568                    feature_inter_score = 0
569
570                    if feature_position == 0:
571                        feature_inter_score = inter_additives[feature_bin, other_bin]
572                    else:
573                        feature_inter_score = inter_additives[other_bin, feature_bin]
574
575                    # Extract the row or column where we fix the other feature and
576                    # vary the current feature
577                    feature_inter_bin_starts = bin_starts_feature
578                    feature_inter_additives = []
579
580                    if feature_position == 0:
581                        for i in range(len(inter_additives)):
582                            feature_inter_additives.append(
583                                inter_additives[i, other_bin]
584                            )
585                    else:
586                        for i in range(len(inter_additives[0])):
587                            feature_inter_additives.append(
588                                inter_additives[other_bin, i]
589                            )
590
591                    # Register this interaction term
592                    associated_interactions.append(
593                        {
594                            "inter_index": indexes,
595                            "cur_interaction_id": cur_feature_id,
596                            "feature_inter_score": feature_inter_score,
597                            "feature_inter_bin_starts": feature_inter_bin_starts,
598                            "feature_inter_additives": feature_inter_additives,
599                        }
600                    )
601
602        for i in range(len(additives)):
603            # Because of the special binning structure of EBM, the distance of
604            # bins on the left to the current value is different from the bins
605            # that are on the right
606            #
607            # For bins on the left, the raw distance is abs(bin_start[i + 1] - x)
608            # For bins on the right, the raw distance is abs(bin_start[i] - x)
609            target = cur_feature_value
610            distance = 0
611
612            if i < cur_bin_id:
613                # First need to consider if it is need to be an integer
614                # If so, it would be the closest integer to the right point
615                if need_to_be_int:
616                    target = float(int(bin_starts[i + 1]))
617                    if target == bin_starts[i + 1]:
618                        target -= 1
619
620                    # Skip this option if it is not possible to find an int value
621                    if target < bin_starts[i]:
622                        continue
623
624                    distance = np.abs(target - cur_feature_value)
625
626                else:
627                    target = bin_starts[i + 1]
628                    distance = np.abs(target - cur_feature_value)
629
630                    # Subtract a very small value to make the target
631                    # technically fall into the left bin
632                    target -= 1e-4
633
634            elif i > cur_bin_id:
635                # First need to consider if it should be an integer value
636                # If so, it would be the closest integer to the left point
637                if need_to_be_int:
638                    target = float(np.ceil(bin_starts[i]))
639                    if target == bin_starts[i]:
640                        target += 1
641
642                    # Skip this option if it is not possible to find an int value
643                    if i + 1 < len(additives) and target >= bin_starts[i + 1]:
644                        continue
645
646                    distance = np.abs(target - cur_feature_value)
647
648                else:
649                    target = bin_starts[i]
650                    distance = np.abs(target - cur_feature_value)
651
652            # Scale the distance based on the deviation of the feature (how changeable it is)
653            if cont_mads[cur_feature_name] > 0:
654                distance /= cont_mads[cur_feature_name]
655
656            # Compute score gain which has two parts:
657            # (1) gain from the change of main effect
658            # (2) gain from the change of interaction effect
659
660            # Main effect
661            main_score_gain = additives[i] - cur_feature_score
662
663            # Interaction terms
664            # A list to track all interaction score gain offsets
665            # [[interaction id, interaction score gain]]
666            inter_score_gain = 0
667            inter_score_gains = []
668
669            for d in associated_interactions:
670                inter_bin_id = search_sorted_lower_index(
671                    d["feature_inter_bin_starts"], target
672                )
673                inter_score_gain += (
674                    d["feature_inter_additives"][inter_bin_id]
675                    - d["feature_inter_score"]
676                )
677                inter_score_gains.append(
678                    [
679                        d["cur_interaction_id"],
680                        d["feature_inter_additives"][inter_bin_id]
681                        - d["feature_inter_score"],
682                    ]
683                )
684
685            score_gain = main_score_gain + inter_score_gain
686
687            if cf_direction * score_gain <= 0 and skip_unhelpful:
688                continue
689
690            # Filter out of bound options
691            if score_gain_bound and skip_unhelpful:
692                if cf_direction == 1 and score_gain > score_gain_bound:
693                    continue
694                if cf_direction == -1 and score_gain < score_gain_bound:
695                    continue
696
697            cont_options.append([target, score_gain, distance, i, inter_score_gains])
698
699        # Now we can apply the second round of filtering to remove redundant options
700        # Redundant options refer to bins that give similar score gain with larger distance
701        cont_options = sorted(cont_options, key=lambda x: x[2])
702
703        start = 0
704        while start < len(cont_options):
705            for i in range(len(cont_options) - 1, start, -1):
706                if np.abs(cont_options[i][1] - cont_options[start][1]) < epsilon:
707                    cont_options.pop(i)
708
709            start += 1
710
711        return cont_options

Generate all alternative options for this continuous variable. This function filters out all options that:

  1. Are not helpful for the counterfactual generation.
  2. Give a similar score gain but require a larger distance.

Args
  • cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification); +1 if we need to increase the prediction, -1 if decrease (regression).
  • cur_feature_index (int): The index of the current continuous feature.
  • cur_feature_name (str): Name of the current feature.
  • cur_feature_value (float): The current feature value.
  • cur_feature_score (float): The score for the current feature value.
  • cont_mads (dict): A map of feature_name => MAD score.
  • cur_example (list): Current sample values
  • score_gain_bound (float): Bound of the score gain. We do not collect options that give score_gain > score_gain_bound (when cf_direction=1), or score_gain < score_gain_bound (when cf_direction=-1)
  • epsilon (float): The threshold to determine if two options give similar score gains. Score gains $s_1$ and $s_2$ are similar if $|s_1 - s_2| <$ epsilon. Smaller epsilon significantly increases the time to solve the MILP. Large epsilon might filter out the optimal CF. Defaults to 0.005.
  • need_to_be_int (bool): True if the target values for this continuous variable need to have integer values.
  • skip_unhelpful (bool): True if options from main effects that give an opposite score gain should be skipped. It is rare that a positive score gain from a pair interaction outweighs the negative score gain from two main effects plus the distance penalty.
Returns

list: List of option tuples (target, score gain, distance, bin_index)
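
The second filtering round described above can be illustrated in isolation. The sketch below uses made-up option tuples (not taken from a real model): options are sorted by distance, then any farther option whose score gain is within epsilon of an already-kept closer option is dropped, mirroring the loop at the end of this method.

    import numpy as np

    # Hypothetical option tuples: [target, score_gain, distance, bin_index, inter_score_gains]
    options = [
        [5.0, 0.400, 1.2, 3, []],
        [4.0, 0.150, 0.5, 2, []],
        [6.0, 0.403, 2.0, 4, []],  # nearly the same gain as the first option, but farther
    ]
    epsilon = 0.005

    # Sort by distance, then drop any farther option whose score gain is
    # within epsilon of an already-kept closer option
    options = sorted(options, key=lambda x: x[2])
    start = 0
    while start < len(options):
        for i in range(len(options) - 1, start, -1):
            if np.abs(options[i][1] - options[start][1]) < epsilon:
                options.pop(i)
        start += 1

    print(len(options))  # 2: the redundant 2.0-distance option is removed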

def generate_cat_options( self, cf_direction, cur_feature_index, cur_feature_value, cur_feature_score, cur_cat_distance, cur_example, score_gain_bound=None, skip_unhelpful=True):
713    def generate_cat_options(
714        self,
715        cf_direction,
716        cur_feature_index,
717        cur_feature_value,
718        cur_feature_score,
719        cur_cat_distance,
720        cur_example,
721        score_gain_bound=None,
722        skip_unhelpful=True,
723    ):
724        """
725        Generate all alternative options for this categorical variable. This function
726        would filter out all options that are not helpful for the counterfactual
727        generation.
728
729        Args:
730            cf_direction (int): Integer `+1` if 0 => 1, `-1` if 1 => 0
731                (classification); `+1` if we need to increase the prediction,
732                `-1` if decrease (regression).
733            cur_feature_index (int): The index of the current categorical feature.
734            cur_feature_value (float): The current feature value.
735            cur_feature_score (float): The score for the current feature value.
736            cur_cat_distance (dict): A map of feature_level => 1 - frequency.
737            cur_example (list): Current sample values.
738            score_gain_bound (float): Bound of the score gain. We do not collect
739                options that give `score_gain` > `score_gain_bound` (when
740                `cf_direction=1`), or `score_gain` < `score_gain_bound` (when
741                `cf_direction=-1`)
742            skip_unhelpful (bool): True if options from main effects that give
743                an opposite score gain should be skipped. It is rare that a
744                positive score gain from a pair interaction outweighs the
745                negative score gain from two main effects plus the distance penalty.
746
747        Returns:
748            list: List of option tuples (target, score_gain, distance, bin_index).
749        """
750
751        # Find other options for this categorical variable
752        # For each option, we compute the (1) score gain, and (2) distance
753        #
754        # (1) Score gain is the same as continuous variables
755        # (2) The distance is determined by 1 - the level frequency in the
756        # training data. It implies that levels with high frequency are easier
757        # to "move to"
758
759        # Get the additive scores of this feature
760        additives = self.ebm.additive_terms_[cur_feature_index][1:]
761
762        # Get the bin edges of this feature
763        levels = self.ebm.preprocessor_._get_bin_labels(cur_feature_index)
764
765        # Create "options", each option is a tuple (target, score_gain, distance, bin_index)
766        cat_options = []
767
768        # Identify interaction terms that we need to consider
769        associated_interactions = []
770
771        for cur_feature_id in range(len(self.ebm.feature_names)):
772            cur_feature_type = self.ebm.feature_types[cur_feature_id]
773            if cur_feature_type == "interaction":
774
775                indexes = self.ebm.feature_groups_[cur_feature_id]
776
777                if cur_feature_index in indexes:
778                    feature_position = 0 if indexes[0] == cur_feature_index else 1
779
780                    other_position = 1 - feature_position
781                    other_index = indexes[other_position]
782                    other_type = self.ebm.feature_types[other_index]
783                    other_name = self.ebm.feature_names[other_index]
784
785                    # Get the current additive scores and bin edges
786                    inter_additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
787
788                    bin_starts_feature = self.ebm.pair_preprocessor_._get_bin_labels(
789                        cur_feature_index
790                    )
791                    bin_starts_other = self.ebm.pair_preprocessor_._get_bin_labels(
792                        other_index
793                    )
794
795                    # Have to skip the max edge if it is continuous
796                    if other_type == "continuous":
797                        bin_starts_other = bin_starts_other[:-1]
798
799                    # Get the current interaction term score
800                    other_bin = None
801                    if other_type == "continuous":
802                        other_bin = search_sorted_lower_index(
803                            bin_starts_other, float(cur_example[other_index])
804                        )
805                    else:
806                        other_bin = bin_starts_other.index(cur_example[other_index])
807
808                    feature_bin = bin_starts_feature.index(cur_feature_value)
809
810                    feature_inter_score = 0
811
812                    if feature_position == 0:
813                        feature_inter_score = inter_additives[feature_bin, other_bin]
814                    else:
815                        feature_inter_score = inter_additives[other_bin, feature_bin]
816
817                    # Extract the row or column where we fix the other features and
818                    # vary the current feature
819                    feature_inter_bin_starts = bin_starts_feature
820                    feature_inter_additives = []
821
822                    if feature_position == 0:
823                        for i in range(len(inter_additives)):
824                            feature_inter_additives.append(
825                                inter_additives[i, other_bin]
826                            )
827                    else:
828                        for i in range(len(inter_additives[0])):
829                            feature_inter_additives.append(
830                                inter_additives[other_bin, i]
831                            )
832
833                    # Register this interaction term
834                    associated_interactions.append(
835                        {
836                            "inter_index": indexes,
837                            "cur_interaction_id": cur_feature_id,
838                            "feature_inter_score": feature_inter_score,
839                            "feature_inter_bin_starts": feature_inter_bin_starts,
840                            "feature_inter_additives": feature_inter_additives,
841                        }
842                    )
843
844        for i in range(len(additives)):
845            if levels[i] != cur_feature_value:
846                target = levels[i]
847                distance = cur_cat_distance[target]
848
849                # Compute score gain which has two parts:
850                # (1) gain from the change of main effect
851                # (2) gain from the change of interaction effect
852
853                # Main effect
854                main_score_gain = additives[i] - cur_feature_score
855
856                # Interaction terms
857                # A list to track all interaction score gain offsets
858                # [[interaction id, interaction score gain]]
859                inter_score_gain = 0
860                inter_score_gains = []
861
862                for d in associated_interactions:
863                    inter_bin_id = d["feature_inter_bin_starts"].index(target)
864                    inter_score_gain += (
865                        d["feature_inter_additives"][inter_bin_id]
866                        - d["feature_inter_score"]
867                    )
868                    inter_score_gains.append(
869                        [
870                            d["cur_interaction_id"],
871                            d["feature_inter_additives"][inter_bin_id]
872                            - d["feature_inter_score"],
873                        ]
874                    )
875
876                score_gain = main_score_gain + inter_score_gain
877
878                # Skip unhelpful options
879                if cf_direction * score_gain <= 0 and skip_unhelpful:
880                    continue
881
882                # Filter out of bound options
883                if score_gain_bound and skip_unhelpful:
884                    if cf_direction == 1 and score_gain > score_gain_bound:
885                        continue
886                    if cf_direction == -1 and score_gain < score_gain_bound:
887                        continue
888
889                cat_options.append([target, score_gain, distance, i, inter_score_gains])
890
891        return cat_options

Generate all alternative options for this categorical variable. This function would filter out all options that are not helpful for the counterfactual generation.

Args
  • cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification); +1 if we need to increase the prediction, -1 if decrease (regression).
  • cur_feature_index (int): The index of the current categorical feature.
  • cur_feature_value (float): The current feature value.
  • cur_feature_score (float): The score for the current feature value.
  • cur_cat_distance (dict): A map of feature_level => 1 - frequency.
  • cur_example (list): Current sample values.
  • score_gain_bound (float): Bound of the score gain. We do not collect options that give score_gain > score_gain_bound (when cf_direction=1), or score_gain < score_gain_bound (when cf_direction=-1)
  • skip_unhelpful (bool): True if options from main effects that give an opposite score gain should be skipped. It is rare that a positive score gain from a pair interaction outweighs the negative score gain from two main effects plus the distance penalty.
Returns

list: List of option tuples (target, score_gain, distance, bin_index).

def generate_inter_options( self, cur_feature_id, cur_feature_index_1, cur_feature_index_2, cur_feature_score, options):
 893    def generate_inter_options(
 894        self,
 895        cur_feature_id,
 896        cur_feature_index_1,
 897        cur_feature_index_2,
 898        cur_feature_score,
 899        options,
 900    ):
 901        """
 902        Generate all possible options for this interaction variable.
 903
 904        Interaction terms are interesting in this MILP. Each option counts as a
 905        variable, but each variable only affects the score gain, not the distance.
 906
 907        Note that in EBM, the bin definitions for interaction terms can be different
 908        from their definitions for individual continuous variables.
 909
 910        To model interaction terms, we can think of each interaction term as a
 911        binary variable whose value is determined by the product of two main
 912        effect variables. Each interaction variable describes a combination of
 913        two main effect options. Therefore, if continuous variable A has $x$
 914        possible options and another continuous variable B has $y$ possible
 915        options, then we should add $x \\times y$ binary variables to offset
 916        their possible interaction effects.
 917
 918        Args:
 919            cur_feature_id (int): The id of this interaction effect.
 920            cur_feature_index_1 (int): The index of the first main effect.
 921            cur_feature_index_2 (int): The index of the second main effect.
 922            cur_feature_score (float): The score for the current feature value.
 923            options (dict): The current option list, feature_name ->
 924                [`target`, `score_gain`, `distance`, `bin_id`].
 925
 926        Returns:
 927            List of option tuples (target, score_gain, distance, bin_index)
 928        """
 929
 930        # Get the sub-types for this interaction term
 931        cur_feature_type_1 = self.ebm.feature_types[cur_feature_index_1]
 932        cur_feature_type_2 = self.ebm.feature_types[cur_feature_index_2]
 933
 934        # Get the sub-names for this interaction term
 935        cur_feature_name_1 = self.ebm.feature_names[cur_feature_index_1]
 936        cur_feature_name_2 = self.ebm.feature_names[cur_feature_index_2]
 937
 938        # The first column and row are reserved for missing values (even with
 939        # categorical features)
 940        additives = self.ebm.additive_terms_[cur_feature_id][1:, 1:]
 941
 942        # Four possibilities here: cont x cont, cont x cat, cat x cont, cat x cat.
 943        # Each has a different way to lookup the bin table.
 944        inter_options = []
 945
 946        # Iterate through all possible combinations of options from these two
 947        # variables
 948        for opt_1 in options[cur_feature_name_1]:
 949            for opt_2 in options[cur_feature_name_2]:
 950
 951                bin_starts_1 = self.ebm.pair_preprocessor_._get_bin_labels(
 952                    cur_feature_index_1
 953                )
 954                bin_starts_2 = self.ebm.pair_preprocessor_._get_bin_labels(
 955                    cur_feature_index_2
 956                )
 957
 958                bin_1 = None
 959                bin_2 = None
 960
 961                if cur_feature_type_1 == "continuous":
 962                    if cur_feature_type_2 == "continuous":
 963                        # cont x cont
 964                        bin_starts_1 = bin_starts_1[:-1]
 965                        bin_starts_2 = bin_starts_2[:-1]
 966
 967                        # locate the bin for each option value
 968                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 969                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 970
 971                    else:
 972                        # cont x cat
 973                        bin_starts_1 = bin_starts_1[:-1]
 974
 975                        # locate the bin for each option value
 976                        bin_1 = search_sorted_lower_index(bin_starts_1, opt_1[0])
 977                        bin_2 = bin_starts_2.index(opt_2[0])
 978
 979                else:
 980                    if cur_feature_type_2 == "continuous":
 981                        # cat x cont
 982                        bin_starts_2 = bin_starts_2[:-1]
 983
 984                        # locate the bin for each option value
 985                        bin_1 = bin_starts_1.index(opt_1[0])
 986                        bin_2 = search_sorted_lower_index(bin_starts_2, opt_2[0])
 987
 988                    else:
 989                        # cat x cat
 990
 991                        # locate the bin for each option value
 992                        bin_1 = bin_starts_1.index(opt_1[0])
 993                        bin_2 = bin_starts_2.index(opt_2[0])
 994
 995                new_score = additives[bin_1, bin_2]
 996                score_gain = new_score - cur_feature_score
 997
 998                # The score gain on the interaction term needs to offset the interaction
 999                # score gain we have already counted on the main effect options. That
1000                # score is saved in the option tuple.
1001
1002                # We first need to find the common interaction id
1003                common_index = [-1, -1]
1004                for m in range(len(opt_1[4])):
1005                    for n in range(len(opt_2[4])):
1006                        if opt_1[4][m][0] == opt_2[4][n][0]:
1007                            common_index = [m, n]
1008                            break
1009
1010                    if common_index[0] != -1 and common_index[1] != -1:
1011                        break
1012
1013                score_gain -= opt_1[4][common_index[0]][1]
1014                score_gain -= opt_2[4][common_index[1]][1]
1015
1016                inter_options.append(
1017                    [[opt_1[0], opt_2[0]], score_gain, 0, [opt_1[3], opt_2[3]], 0]
1018                )
1019
1020        return inter_options

Generate all possible options for this interaction variable.

Interaction terms are interesting in this MILP. Each option counts as a variable, but each variable only affects the score gain, not the distance.

Note that in EBM, the bin definitions for interaction terms can be different from their definitions for individual continuous variables.

To model interaction terms, we can think of each interaction term as a binary variable whose value is determined by the product of two main effect variables. Each interaction variable describes a combination of two main effect options. Therefore, if continuous variable A has $x$ possible options and another continuous variable B has $y$ possible options, then we should add $x \times y$ binary variables to offset their possible interaction effects.

Args
  • cur_feature_id (int): The id of this interaction effect.
  • cur_feature_index_1 (int): The index of the first main effect.
  • cur_feature_index_2 (int): The index of the second main effect.
  • cur_feature_score (float): The score for the current feature value.
  • options (dict): The current option list, feature_name -> [target, score_gain, distance, bin_id].
Returns

List of option tuples (target, score_gain, distance, bin_index)
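
As a hypothetical illustration of the $x \times y$ enumeration: if one feature currently has three surviving main-effect options and the other has two, six interaction options are created. Each records only a score-gain offset (the real value would come from the pair additive table; a 0.0 placeholder is used here), a zero distance, and the pair of bin indices.

    # Hypothetical main effect options: [target, score_gain, distance, bin_index, inter_score_gains]
    options_a = [[30.0, 0.2, 0.5, 1, []], [40.0, 0.5, 1.1, 2, []], [55.0, 0.9, 2.3, 4, []]]
    options_b = [[1000.0, 0.3, 0.8, 0, []], [2000.0, 0.7, 1.6, 3, []]]

    inter_options = []
    for opt_a in options_a:
        for opt_b in options_b:
            # In the real method the score gain is looked up in the pair additive
            # table; 0.0 stands in for it here
            inter_options.append(
                [[opt_a[0], opt_b[0]], 0.0, 0, [opt_a[3], opt_b[3]], 0]
            )

    print(len(inter_options))  # 3 x 2 = 6 interaction variables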

@staticmethod
def create_milp( cf_direction, needed_score_gain, features_to_vary, options, max_num_features_to_vary=None, muted_variables=[]):
1022    @staticmethod
1023    def create_milp(
1024        cf_direction,
1025        needed_score_gain,
1026        features_to_vary,
1027        options,
1028        max_num_features_to_vary=None,
1029        muted_variables=[],
1030    ):
1031        """
1032        Create a MILP to find counterfactuals (CF) using PuLP.
1033
1034        Args:
1035            cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification),
1036                +1 if we need to increase the prediction, -1 if decrease (regression).
1037            needed_score_gain (float): The score gain needed to achieve the CF goal.
1038            features_to_vary (list[str]): Feature names of features that the
1039                generated CF can change.
1040            options (dict): Possible options for each variable. Each option is a
1041                list [target, score_gain, distance, bin_index].
1042            max_num_features_to_vary (int, optional): Max number of features that the
1043                generated CF can change. If the value is `None`, the CFs can
1044                change any number of features.
1045            muted_variables (list[str], optional): Variables that this MILP should
1046                not use. This is useful to mute optimal variables so we can explore
1047                diverse solutions. This list should not include interaction variables.
1048
1049        Returns:
1050            A tuple (`model`, `variables`), where `model` is a pulp.LpProblem
1051            model that encodes the MILP problem, and `variables` is a dict of
1052            variables used in the `model`: `feature_name` => [`variables`].
1053        """
1054
1055        # Create a model (minimizing the distance)
1056        model = pulp.LpProblem("ebmCounterfactual", pulp.LpMinimize)
1057
1058        distance = 0
1059        score_gain = 0
1060
1061        muted_variables_set = set(muted_variables)
1062
1063        # Create variables
1064        variables = {}
1065        for f in features_to_vary:
1066            # Each variable encodes an option (0: not use this option,
1067            # 1: use this option)
1068            cur_variables = []
1069
1070            for option in options[f]:
1071                var_name = "{}:{}".format(f, option[3])
1072
1073                # Skip the muted variables
1074                if var_name in muted_variables_set:
1075                    continue
1076
1077                x = pulp.LpVariable(var_name, lowBound=0, upBound=1, cat="Binary")
1078                x.setInitialValue(0)
1079
1080                score_gain += option[1] * x
1081                distance += option[2] * x
1082
1083                cur_variables.append(x)
1084
1085            variables[f] = cur_variables
1086
1087            # A local constraint is that we can select at most one option from
1088            # each feature
1089            model += pulp.lpSum(cur_variables) <= 1
1090
1091        # Users can also set `max_num_features_to_vary` to control the total
1092        # number of features to vary
1093        if max_num_features_to_vary is not None:
1094            main_variables = []
1095            for f in variables:
1096                main_variables.extend(variables[f])
1097
1098            model += pulp.lpSum(main_variables) <= max_num_features_to_vary
1099
1100        # Create variables for interaction effects
1101        for opt_name in options:
1102            if " x " in opt_name:
1103                f1_name = re.sub(r"(.+)\sx\s.+", r"\1", opt_name)
1104                f2_name = re.sub(r".+\sx\s(.+)", r"\1", opt_name)
1105
1106                if f1_name in features_to_vary and f2_name in features_to_vary:
1107
1108                    # We need to include this interaction effect
1109                    cur_variables = []
1110
1111                    for option in options[opt_name]:
1112                        z = pulp.LpVariable(
1113                            "{}:{},{}".format(opt_name, option[3][0], option[3][1]),
1114                            lowBound=0,
1115                            upBound=1,
1116                            cat="Continuous",
1117                        )
1118                        z.setInitialValue(0)
1119
1120                        # Need to iterate through existing variables for f1 and f2 to find
1121                        # the corresponding variables
1122                        x_f1 = None
1123                        x_f2 = None
1124
1125                        # Skip this interaction variable if it involves a muted main variable
1126                        x_f1_name = "{}:{}".format(f1_name, option[3][0])
1127                        x_f2_name = "{}:{}".format(f2_name, option[3][1])
1128
1129                        if (
1130                            x_f1_name in muted_variables_set
1131                            or x_f2_name in muted_variables_set
1132                        ):
1133                            continue
1134
1135                        for x in variables[f1_name]:
1136                            if x.name == x_f1_name:
1137                                x_f1 = x
1138                                break
1139
1140                        for x in variables[f2_name]:
1141                            if x.name == x_f2_name:
1142                                x_f2 = x
1143                                break
1144
1145                        assert x_f1 is not None and x_f2 is not None
1146
1147                        # variable z is actually the product of x_f1 and x_f2
1148                        # We can linearize it by 3 constraints
1149                        model += z <= x_f1
1150                        model += z <= x_f2
1151                        model += z >= x_f1 + x_f2 - 1
1152
1153                        cur_variables.append(z)
1154
1155                    variables[opt_name] = cur_variables
1156
1157        # Use constraint to express counterfactual
1158        if cf_direction == 1:
1159            model += score_gain >= needed_score_gain
1160        else:
1161            model += score_gain <= needed_score_gain
1162
1163        # We want to minimize the distance
1164        model += distance
1165
1166        return model, variables

Create a MILP to find counterfactuals (CF) using PuLP.

Args
  • cf_direction (int): Integer +1 if 0 => 1, -1 if 1 => 0 (classification), +1 if we need to increase the prediction, -1 if decrease (regression).
  • needed_score_gain (float): The score gain needed to achieve the CF goal.
  • features_to_vary (list[str]): Feature names of features that the generated CF can change.
  • options (dict): Possible options for each variable. Each option is a list [target, score_gain, distance, bin_index].
  • max_num_features_to_vary (int, optional): Max number of features that the generated CF can change. If the value is None, the CFs can change any number of features.
  • muted_variables (list[str], optional): Variables that this MILP should not use. This is useful to mute optimal variables so we can explore diverse solutions. This list should not include interaction variables.
Returns

A tuple (model, variables), where model is a pulp.LpProblem model that encodes the MILP problem, and variables is a dict of variables used in the model: feature_name => [variables].
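
For orientation, here is a minimal standalone PuLP sketch (not part of the module, with made-up features, gains, and distances) of the structure this method encodes: binary option variables, an at-most-one-option constraint per feature, a required total score gain, and a minimized distance objective. It assumes PuLP's bundled CBC solver is available.

    import pulp

    # Two hypothetical features, each with two options: (score_gain, distance)
    feature_options = {
        "income": [(0.4, 1.0), (0.9, 2.5)],
        "loan_amount": [(0.3, 0.6), (0.6, 1.8)],
    }
    needed_score_gain = 1.0  # made-up gain required to flip the prediction

    model = pulp.LpProblem("toyCounterfactual", pulp.LpMinimize)
    score_gain, distance, variables = 0, 0, {}

    for name, opts in feature_options.items():
        cur_variables = []
        for i, (gain, dist) in enumerate(opts):
            x = pulp.LpVariable("{}:{}".format(name, i), cat="Binary")
            score_gain += gain * x
            distance += dist * x
            cur_variables.append(x)
        variables[name] = cur_variables
        model += pulp.lpSum(cur_variables) <= 1  # at most one option per feature

    model += score_gain >= needed_score_gain  # counterfactual constraint (cf_direction = 1)
    model += distance  # objective: minimize total distance

    model.solve(pulp.PULP_CBC_CMD(msg=False))
    print([x.name for vs in variables.values() for x in vs if x.varValue == 1])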

def print_solution(self, cur_example, active_variables, options):
1168    def print_solution(self, cur_example, active_variables, options):
1169        """
1170        Print the optimal solution.
1171
1172        Args:
1173            cur_example (np.ndarray): the original data point.
1174            active_variables (list[variable]): binary variables with value 1.
1175            options (dict): all the possible options for all features.
1176        """
1177
1178        for var in active_variables:
1179            # Main effect variables (interaction vars are handled in the else branch)
1180            if "_x_" not in var.name:
1181                f_name = re.sub(r"(.+):\d+", r"\1", var.name)
1182                bin_i = int(re.sub(r".+:(\d+)", r"\1", var.name))
1183
1184                # Find the original value
1185                org_value = cur_example[0][self.ebm.feature_names.index(f_name)]
1186
1187                # Find the target bin
1188                f_index = self.ebm.feature_names.index(f_name)
1189                f_type = self.ebm.feature_types[f_index]
1190
1191                if f_type == "continuous":
1192                    bin_starts = self.ebm.preprocessor_._get_bin_labels(f_index)[:-1]
1193
1194                    target_bin = "[{},".format(bin_starts[bin_i])
1195
1196                    if bin_i + 1 < len(bin_starts):
1197                        target_bin += " {})".format(bin_starts[bin_i + 1])
1198                    else:
1199                        target_bin += " inf)"
1200                else:
1201                    target_bin = ""
1202                    org_value = '"{}"'.format(org_value)
1203
1204                for option in options[f_name]:
1205                    if option[3] == bin_i:
1206                        print(
1207                            "Change <{}> from {} to {} {}".format(
1208                                f_name, org_value, option[0], target_bin
1209                            )
1210                        )
1211                        print(
1212                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1213                                option[1], option[2]
1214                            )
1215                        )
1216                        break
1217
1218            else:
1219                f_name = re.sub(r"(.+):.+", r"\1", var.name)
1220                f_name = f_name.replace("_x_", " x ")
1221                bin_0 = int(re.sub(r".+:(\d+),\d+", r"\1", var.name))
1222                bin_1 = int(re.sub(r".+:\d+,(\d+)", r"\1", var.name))
1223
1224                for option in options[f_name]:
1225                    if option[3][0] == bin_0 and option[3][1] == bin_1:
1226                        print("Trigger interaction term: <{}>".format(f_name))
1227                        print(
1228                            "\t* score gain: {:.4f}\n\t* distance cost: {:.4f}".format(
1229                                option[1], 0
1230                            )
1231                        )
1232                        break
1233        print()

Print the optimal solution.

Args
  • cur_example (np.ndarray): the original data point.
  • active_variables (list[variable]): binary variables with value 1.
  • options (dict): all the possible options for all features.
@staticmethod
def compute_mad(xs):
1235    @staticmethod
1236    def compute_mad(xs):
1237        """
1238        Compute the median absolute deviation of a continuous feature.
1239
1240        Args:
1241            xs (np.ndarray): A column of continuous values.
1242
1243        Returns:
1244            float: MAD value of xs.
1245        """
1246        xs_median = np.median(xs.astype(float))
1247        mad = np.median(np.abs(xs.astype(float) - xs_median))
1248        return mad

Compute the median absolute deviation of a continuous feature.

Args
  • xs (np.ndarray): A column of continuous values.
Returns

float: MAD value of xs.
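
A quick usage sketch with made-up income values:

    import numpy as np
    from gamcoach.gamcoach import GAMCoach

    incomes = np.array([30000, 42000, 55000, 61000, 80000])
    print(GAMCoach.compute_mad(incomes))  # 13000.0 = median(|x - median(x)|)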

@staticmethod
def compute_frequency_distance(xs):
1250    @staticmethod
1251    def compute_frequency_distance(xs):
1252        """
1253        For categorical variables, we compute 1 - frequency as their distance. It implies
1254        that switching to a frequent value takes less effort.
1255
1256        Args:
1257            xs (np.ndarray): A column of categorical values.
1258
1259        Returns:
1260            dict: category level -> 1 - frequency.
1261        """
1262        counter = Counter(xs)
1263
1264        results = {}
1265
1266        for key in counter:
1267            results[key] = 1 - (counter[key] / len(xs))
1268
1269        return results

For categorical variables, we compute 1 - frequency as their distance. It implies that switching to a frequent value takes less effort.

Args
  • xs (np.ndarray): A column of categorical values.
Returns

dict: category level -> 1 - frequency.

@staticmethod
def compute_naive_cat_distance(xs):
1271    @staticmethod
1272    def compute_naive_cat_distance(xs):
1273        """
1274        Alternative to compute_frequency_distance() to compute distance for
1275        categorical variables. The distance is 1 for different levels and 0 for
1276        the same level. Here we give them all score 1, because same-level
1277        options will be filtered out when we create categorical options for the
1278        optimization program.
1279
1280        Args:
1281            xs (np.ndarray): A column of categorical values.
1282
1283        Returns:
1284            dict: category level -> 1.
1285        """
1286        counter = Counter(xs)
1287        results = {}
1288
1289        for key in counter:
1290            results[key] = 1
1291
1292        return results

Alternative to compute_frequency_distance() to compute distance for categorical variables. The distance is 1 for different levels and 0 for the same level. Here we give them all score 1, because same-level options will be filtered out when we create categorical options for the optimization program.

Args
  • xs (np.ndarray): A column of categorical values.
Returns

dict: category level -> 1.

def search_sorted_lower_index(sorted_edges, value):
1295def search_sorted_lower_index(sorted_edges, value):
1296    """Binary search to locate the correct bin for continuous features."""
1297    left = 0
1298    right = len(sorted_edges) - 1
1299
1300    while right - left > 1:
1301        i = left + int((right - left) / 2)
1302
1303        if value > sorted_edges[i]:
1304            left = i
1305        elif value < sorted_edges[i]:
1306            right = i
1307        else:
1308            return i
1309
1310    # Handle out of bound issues
1311    if value >= sorted_edges[right]:
1312        return right
1313    if value < sorted_edges[left]:
1314        return left
1315
1316    return right - 1

Binary search to locate the correct bin for continuous features.
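
A few hypothetical calls showing how values map to bin indices (the edges below are made up):

    from gamcoach.gamcoach import search_sorted_lower_index

    bin_starts = [0.0, 10.0, 25.0, 50.0]

    print(search_sorted_lower_index(bin_starts, 12.0))  # 1 -> value falls in bin [10, 25)
    print(search_sorted_lower_index(bin_starts, 50.0))  # 3 -> last bin
    print(search_sorted_lower_index(bin_starts, -5.0))  # 0 -> clamped to the first bin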

def sigmoid(x):
1319def sigmoid(x):
1320    """Sigmoid function."""
1321    return 1 / (1 + np.exp(-x))

Sigmoid function.

def get_model_data( ebm, x_train, model_info, resort_categorical=False, feature_info=None, feature_level_info=None, feature_config=None):
1437def get_model_data(
1438    ebm,
1439    x_train,
1440    model_info,
1441    resort_categorical=False,
1442    feature_info=None,
1443    feature_level_info=None,
1444    feature_config=None,
1445):
1446    """
1447    Get the model data for GAM Coach.
1448    Args:
1449        ebm: Trained EBM model. ExplainableBoostingClassifier or
1450            ExplainableBoostingRegressor object.
1451        x_train: Training data. We use it to compute the median absolute deviation
1452            score for continuous features, and frequency scores for categorical
1453            features.
1454        model_info: Information about the model (class names, regression target
1455            name). For classification, the order of classes matters. It should
1456            be consistent with the class encoding index. For example, the first
1457            element should be the name for class 0.
1458            It has format:
1459            `{'classes': ['loan rejection', 'loan approval']}` or
1460            `{'regressionName': 'interest rate'}`
1461        resort_categorical: Whether to sort the levels in categorical variable
1462        resort_categorical: Whether to sort the levels in a categorical variable
1463        feature_info: You can provide a dictionary to give a separate display
1464            name and optional description for each feature. By default, the
1465            display name is the same as the feature name, and the description
1466            is an empty string. `feature_info` can be partial (only including
1467            some features). It has format:
1468            `{'feature_name': ['display_name', 'description']}`
1469        feature_level_info: You can provide a dictionary to give separate display
1470            name and optional description for each level of categorical features.
1471            By default, the display name is the same as the level name, and the
1472            description is an empty string. `feature_level_info` can be partial
1473            (e.g., only including some levels from some categorical features).
1474            It has format:
1475            `{'feature_name': {level_id: ['display_name', 'description']}}`
1476        feature_config: You can provide a dictionary to configure the difficulty,
1477            integer requirement, and acceptable range of individual features.
1478            The difficulty is an integer between 1 and 6: 1 (very easy to change),
1479            2 (easy), 3 (default), 4 (hard), 5 (very hard), 6 (impossible to change).
1480            By default, difficulty is set to 3 for all features, requiresInt is
1481            False for continuous variables, and acceptableRange is None (search
1482            the full range).
1483            The dictionary property has the following format:
1484            `{'difficulty': 3, 'requiresInt': True, 'acceptableRange': None}`
1485    Returns:
1486        A Python dictionary of model data
1487    """
1488    ROUND = 6
1489
1490    # Main model info on each feature
1491    features = []
1492
1493    # Track the encoding of categorical feature levels
1494    labelEncoder = {}
1495
1496    # Track the score range
1497    score_range = [np.inf, -np.inf]
1498
1499    for i in tqdm(range(len(ebm.feature_names))):
1500        cur_feature = {}
1501        cur_feature["name"] = ebm.feature_names[i]
1502        cur_feature["type"] = ebm.feature_types[i]
1503        cur_feature["importance"] = ebm.feature_importances_[i]
1504
1505        # Handle interaction term differently from cont/cat
1506        if cur_feature["type"] == "interaction":
1507            cur_id = ebm.feature_groups_[i]
1508            cur_feature["id"] = list(cur_id)
1509
1510            # Info for each individual feature
1511            cur_feature["name1"] = ebm.feature_names[cur_id[0]]
1512            cur_feature["name2"] = ebm.feature_names[cur_id[1]]
1513
1514            cur_feature["type1"] = ebm.feature_types[cur_id[0]]
1515            cur_feature["type2"] = ebm.feature_types[cur_id[1]]
1516
1517            # Skip the first item from both dimensions
1518            cur_feature["additive"] = np.round(ebm.additive_terms_[i], ROUND)[
1519                1:, 1:
1520            ].tolist()
1521            cur_feature["error"] = np.round(ebm.term_standard_deviations_[i], ROUND)[
1522                1:, 1:
1523            ].tolist()
1524
1525            # Get the bin label info
1526            cur_feature["binLabel1"] = ebm.pair_preprocessor_._get_bin_labels(cur_id[0])
1527            cur_feature["binLabel2"] = ebm.pair_preprocessor_._get_bin_labels(cur_id[1])
1528
1529            # Encode categorical levels as integers
1530            if cur_feature["type1"] == "categorical":
1531                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[0]]
1532                cur_feature["binLabel1"] = list(
1533                    map(lambda x: level_str_to_int[x], cur_feature["binLabel1"])
1534                )
1535
1536            if cur_feature["type2"] == "categorical":
1537                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[1]]
1538                cur_feature["binLabel2"] = list(
1539                    map(lambda x: level_str_to_int[x], cur_feature["binLabel2"])
1540                )
1541
1542            # Get density info
1543            if cur_feature["type1"] == "categorical":
1544                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[0]]
1545                cur_feature["histEdge1"] = ebm.preprocessor_._get_hist_edges(cur_id[0])
1546                cur_feature["histEdge1"] = list(
1547                    map(lambda x: level_str_to_int[x], cur_feature["histEdge1"])
1548                )
1549                cur_feature["histCount1"] = np.round(
1550                    ebm.preprocessor_._get_hist_counts(cur_id[0]), ROUND
1551                ).tolist()
1552            else:
1553                # Use KDE to draw density plots for cont features
1554                edges, counts = _get_kde_sample(x_train[:, cur_id[0]])
1555                cur_feature["histEdge1"] = edges.tolist()
1556                cur_feature["histCount1"] = counts.tolist()
1557
1558            if cur_feature["type2"] == "categorical":
1559                level_str_to_int = ebm.pair_preprocessor_.col_mapping_[cur_id[1]]
1560                cur_feature["histEdge2"] = ebm.preprocessor_._get_hist_edges(cur_id[1])
1561                cur_feature["histEdge2"] = list(
1562                    map(lambda x: level_str_to_int[x], cur_feature["histEdge2"])
1563                )
1564                cur_feature["histCount2"] = np.round(
1565                    ebm.preprocessor_._get_hist_counts(cur_id[1]), ROUND
1566                ).tolist()
1567            else:
1568                # Use KDE to draw density plots for cont features
1569                edges, counts = _get_kde_sample(x_train[:, cur_id[1]])
1570                cur_feature["histEdge2"] = edges.tolist()
1571                cur_feature["histCount2"] = counts.tolist()
1572
1573        else:
1574            # Skip the first item (reserved for missing value)
1575            cur_feature["additive"] = np.round(ebm.additive_terms_[i], ROUND).tolist()[
1576                1:
1577            ]
1578            cur_feature["error"] = np.round(
1579                ebm.term_standard_deviations_[i], ROUND
1580            ).tolist()[1:]
1581            cur_feature["id"] = ebm.feature_groups_[i]
1582            cur_id = ebm.feature_groups_[i][0]
1583            cur_feature["count"] = ebm.preprocessor_.col_bin_counts_[cur_id].tolist()[
1584                1:
1585            ]
1586
1587            # Track the global score range
1588            score_range[0] = min(
1589                score_range[0],
1590                np.min(ebm.additive_terms_[i] - ebm.term_standard_deviations_[i]),
1591            )
1592            score_range[1] = max(
1593                score_range[1],
1594                np.max(ebm.additive_terms_[i] + ebm.term_standard_deviations_[i]),
1595            )
1596
1597            # Add the binning information for continuous features
1598            if cur_feature["type"] == "continuous":
1599                # Add the bin information
1600                cur_feature["binEdge"] = ebm.preprocessor_._get_bin_labels(cur_id)
1601
1602                # Use KDE to draw density plots for cont features
1603                edges, counts = _get_kde_sample(x_train[:, cur_id])
1604
1605                cur_feature["histEdge"] = edges.tolist()
1606                cur_feature["histCount"] = counts.tolist()
1607
1608            elif cur_feature["type"] == "categorical":
1609                # Get the level value mapping
1610                level_str_to_int = ebm.preprocessor_.col_mapping_[cur_id]
1611
1612                if resort_categorical:
1613                    level_str_to_int = _resort_categorical_level(level_str_to_int)
1614
1615                cur_feature["binLabel"] = list(
1616                    map(
1617                        lambda x: level_str_to_int[x],
1618                        ebm.preprocessor_._get_bin_labels(cur_id),
1619                    )
1620                )
1621
1622                # Add the hist information
1623                # For categorical data, the edges are strings
1624                cur_feature["histEdge"] = list(
1625                    map(
1626                        lambda x: level_str_to_int[x],
1627                        ebm.preprocessor_._get_hist_edges(cur_id),
1628                    )
1629                )
1630
1631                cur_feature["histCount"] = np.round(
1632                    ebm.preprocessor_._get_hist_counts(cur_id), ROUND
1633                ).tolist()
1634
1635                if resort_categorical:
1636                    cur_bin_info = list(
1637                        zip(
1638                            cur_feature["binLabel"],
1639                            cur_feature["additive"],
1640                            cur_feature["error"],
1641                            cur_feature["count"],
1642                        )
1643                    )
1644                    cur_bin_info = sorted(cur_bin_info, key=lambda x: x[0])
1645
1646                    cur_feature["binLabel"] = [k[0] for k in cur_bin_info]
1647                    cur_feature["additive"] = [k[1] for k in cur_bin_info]
1648                    cur_feature["error"] = [k[2] for k in cur_bin_info]
1649                    cur_feature["count"] = [k[3] for k in cur_bin_info]
1650
1651                    cur_hist_info = list(
1652                        zip(cur_feature["histEdge"], cur_feature["histCount"])
1653                    )
1654                    cur_hist_info = sorted(cur_hist_info, key=lambda x: x[0])
1655
1656                    cur_feature["histEdge"] = [k[0] for k in cur_hist_info]
1657                    cur_feature["histCount"] = [k[1] for k in cur_hist_info]
1658
1659                # Add the label encoding information
1660                labelEncoder[cur_feature["name"]] = {
1661                    i: s for s, i in level_str_to_int.items()
1662                }
1663
1664        features.append(cur_feature)
1665
1666    score_range = list(map(lambda x: round(x, ROUND), score_range))
1667
1668    feature_names = []
1669    feature_types = []
1670
1671    # Sample data does not record interaction features
1672    for i in range(len(ebm.feature_names)):
1673        if ebm.feature_types[i] != "interaction":
1674            feature_names.append(ebm.feature_names[i])
1675            feature_types.append(ebm.feature_types[i])
1676
1677    # Compute the MAD scores and frequencies
1678    ebm_cont_indexes = np.array(
1679        [i for i in range(len(feature_names)) if feature_types[i] == "continuous"]
1680    )
1681
1682    contMads = {}
1683
1684    for i in ebm_cont_indexes:
1685        contMads[ebm.feature_names[i]] = GAMCoach.compute_mad(x_train[:, i])
1686
1687    ebm_cat_indexes = np.array(
1688        [i for i in range(len(feature_names)) if feature_types[i] == "categorical"]
1689    )
1690
1691    catDistances = {}
1692
1693    for i in ebm_cat_indexes:
1694        catDistances[feature_names[i]] = GAMCoach.compute_frequency_distance(
1695            x_train[:, i]
1696        )
1697
1698    # Initialize a feature description dictionary (provide more information about
1699    # each feature in the UI)
1700    feature_descriptions = _init_feature_descriptions(ebm, labelEncoder)
1701
1702    # Overwrite some entries in the default feature_descriptions
1703    if feature_info:
1704        for feature in feature_info:
1705            feature_descriptions[feature]["displayName"] = feature_info[feature][0]
1706            feature_descriptions[feature]["description"] = feature_info[feature][1]
1707
1708    if feature_level_info:
1709        for feature in feature_level_info:
1710            for level in feature_level_info[feature]:
1711                display_name = feature_level_info[feature][level][0]
1712                description = feature_level_info[feature][level][1]
1713                feature_descriptions[feature]["levelDescription"][level][
1714                    "displayName"
1715                ] = display_name
1716                feature_descriptions[feature]["levelDescription"][level][
1717                    "description"
1718                ] = description
1719
1720    # Put descriptions under the 'features' key
1721    for feature in features:
1722        if feature["name"] in feature_descriptions:
1723            feature["description"] = feature_descriptions[feature["name"]]
1724
1725    # Set the feature configurations
1726    feature_configurations = _init_feature_configuration(ebm)
1727
1728    if feature_config:
1729        for feature in feature_config:
1730            cur_config = feature_config[feature]
1731            for k in [
1732                "requiresInt",
1733                "difficulty",
1734                "acceptableRange",
1735                "requiresIncreasing",
1736                "requiresDecreasing",
1737                "usesTransform",
1738            ]:
1739                if k in cur_config:
1740                    feature_configurations[feature][k] = cur_config[k]
1741
1742    # Attach the configuration to the feature field
1743    for feature in features:
1744        if feature["name"] in feature_configurations:
1745            feature["config"] = feature_configurations[feature["name"]]
1746
1747    data = {
1748        "intercept": ebm.intercept_[0] if hasattr(ebm, "classes_") else ebm.intercept_,
1749        "isClassifier": hasattr(ebm, "classes_"),
1750        "modelInfo": model_info,
1751        "features": features,
1752        "labelEncoder": labelEncoder,
1753        "scoreRange": score_range,
1754        "featureNames": feature_names,
1755        "featureTypes": feature_types,
1756        "contMads": contMads,
1757        "catDistances": catDistances,
1758    }
1759
1760    return data

Get the model data for GAM Coach.

Args
  • ebm: Trained EBM model. ExplainableBoostingClassifier or ExplainableBoostingRegressor object.
  • x_train: Training data. We use it to compute the median absolute deviation score for continuous features, and frequency scores for categorical features.
  • model_info: Information about the model (class names, regression target name). For classification, the order of classes matters. It should be consistent with the class encoding index. For example, the first element should be the name for class 0. It has format: {'classes': ['loan rejection', 'loan approval']} or {'regressionName': 'interest rate'}
  • resort_categorical: Whether to sort the levels in a categorical variable by increasing order if all levels can be converted to numbers.
  • feature_info: You can provide a dictionary to give a separate display name and optional description for each feature. By default, the display name is the same as the feature name, and the description is an empty string. feature_info can be partial (only including some features). It has format: {'feature_name': ['display_name', 'description']}
  • feature_level_info: You can provide a dictionary to give separate display name and optional description for each level of categorical features. By default, the display name is the same as the level name, and the description is an empty string. feature_level_info can be partial (e.g., only including some levels from some categorical features). It has format: {'feature_name': {level_id: ['display_name', 'description']}}
  • feature_config: You can provide a dictionary to configure the difficulty, integer requirement, and acceptable range of individual features. The difficulty is an integer between 1 and 6: 1 (very easy to change), 2 (easy), 3 (default), 4 (hard), 5 (very hard), 6 (impossible to change). By default, difficulty is set to 3 for all features, requiresInt is False for continuous variables, and acceptableRange is None (search the full range). The dictionary property has the following format: {'difficulty': 3, 'requiresInt': True, 'acceptableRange': None}
Returns

A Python dictionary of model data
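
A hedged end-to-end sketch: `ebm`, `x_train`, and the feature name below are hypothetical placeholders rather than objects from this documentation, and the output is simply serialized to JSON so it can be loaded by a GAM Coach interface.

    import json
    from gamcoach.gamcoach import get_model_data

    # `ebm` is a trained ExplainableBoostingClassifier and `x_train` its training
    # matrix; both are assumed to exist already (placeholders in this sketch)
    model_info = {"classes": ["loan rejection", "loan approval"]}

    feature_info = {
        # "fico_score" is a hypothetical feature name for illustration
        "fico_score": ["FICO Score", "Credit score reported by the bureau"],
    }
    feature_config = {
        "fico_score": {"difficulty": 4, "requiresInt": True},
    }

    model_data = get_model_data(
        ebm,
        x_train,
        model_info,
        feature_info=feature_info,
        feature_config=feature_config,
    )

    # Serialize the model data for the interface
    with open("model_data.json", "w") as fp:
        json.dump(model_data, fp)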