Skip to content

SSVC CSV Analyzer

This module provides a script for analyzing an SSVC tree csv file.

usage: csv_analyzer.py [-h] [--outcol OUTCOL] [--permutation] csvfile

Analyze an SSVC tree csv file

positional arguments:
  csvfile          the csv file to analyze

options:
  -h, --help       show this help message and exit
  --outcol OUTCOL  the name of the outcome column
  --permutation    use permutation importance instead of drop column importance
Example

Given a test.csv file like this:

row,Exploitation,Exposure,Automatable,Human Impact,Priority
1,none,small,no,low,defer
2,none,small,no,medium,defer
3,none,small,no,high,scheduled
...
Analyze the csv file:
$ python csv_analyzer.py test.csv

Feature Importance after Dropping Each Feature in test.csv
         feature  feature_importance
0  exploitation_            0.347222
1  human_impact_            0.291667
2   automatable_            0.180556
3      exposure_            0.166667

Higher values imply more important features.

_clean_table(df)

Clean up a dataframe, normalizing column names and dropping columns we don't need

Parameters:

Name Type Description Default
df DataFrame

the dataframe to clean

required

Returns:

Type Description
DataFrame

the cleaned dataframe

Source code in src/ssvc/csv_analyzer.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def _clean_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the column names of a dataframe and remove columns we don't need

    Args:
        df: the dataframe to clean

    Returns:
        the cleaned dataframe, with every column name passed through
        `_col_norm` and the "row" column removed when it is present
    """
    # columns we don't need, spelled the way they look after normalization
    unwanted = ["row"]
    # rename first so the drop list is matched against normalized names
    renamed = df.rename(columns=_col_norm)
    # errors="ignore" keeps this a no-op when an unwanted column is absent
    return renamed.drop(columns=unwanted, errors="ignore")

_col_norm(c)

Normalize a column name

Parameters:

Name Type Description Default
c str

the column name to normalize

required

Returns:

Type Description
str

the normalized column name

Source code in src/ssvc/csv_analyzer.py
71
72
73
74
75
76
77
78
79
80
81
82
83
def _col_norm(c: str) -> str:
    """
    Normalize a column name

    Args:
        c: the column name to normalize

    Returns:
        the normalized column name
    """
    new_col = re.sub("[^0-9a-zA-Z]+", "_", c)
    new_col = new_col.lower()
    return new_col

_imp_df(column_names, importances)

Create a dataframe of feature importances

Parameters:

Name Type Description Default
column_names list

the names of the columns

required
importances list

the feature importances

required

Returns:

Type Description
DataFrame

a dataframe of feature importances

Source code in src/ssvc/csv_analyzer.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def _imp_df(column_names: list, importances: list) -> pd.DataFrame:
    """
    Create a dataframe of feature importances

    Args:
        column_names: the names of the columns
        importances: the feature importances

    Returns:
        a dataframe of feature importances
    """
    df = (
        pd.DataFrame(
            {"feature": column_names, "feature_importance": importances}
        )
        .sort_values("feature_importance", ascending=False)
        .reset_index(drop=True)
    )
    return df

_prepare_data(df, target, permute=False)

Clean a dataframe, split it into features and target, and encode each feature column as integer codes

Parameters:

Name Type Description Default
df DataFrame

the dataframe to analyze

required
target str

the name of the target column to analyze against

required
permute bool

use permutation importance instead of drop column importance

False

Returns:

Type Description
(DataFrame, DataFrame)

a tuple of (the encoded feature dataframe, the target column)

Source code in src/ssvc/csv_analyzer.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
def _prepare_data(
    df: pd.DataFrame, target: str, permute: bool = False
) -> (pd.DataFrame, pd.DataFrame):
    """
    Clean a dataframe and encode its feature columns as integer codes

    The dataframe is cleaned with `_clean_table`, split into features and
    target with `_split_data`, and each feature column `<name>` is turned into
    a column `<name>_` of integer codes.

    Args:
        df: the dataframe to analyze
        target: the name of the target column to analyze against, as it is
            spelled after the column names have been normalized
        permute: not used by this function; kept in the signature so existing
            callers that pass it keep working

    Returns:
        a tuple of (the encoded feature dataframe, the target column)

    Raises:
        KeyError: if the target column is not found in the cleaned dataframe
    """

    df = _clean_table(df)
    # check for target column
    if target not in df.columns:
        raise KeyError(f"Column '{target}' not found in {list(df.columns)}")

    X, y = _split_data(df, target)
    # turn features into ordinals
    # this assumes that every column is an ordinal label
    # and that the ordinals are sorted in ascending order
    encoded = {}
    for c in X.columns:
        # codes are assigned in order of first appearance in the column
        mapper = {v: k for k, v in enumerate(X[c].unique())}
        # map() does a direct per-value lookup; unlike replace() it does not
        # depend on dtype downcasting to yield integers and cannot confuse a
        # freshly assigned code with an original value
        encoded[f"{c}_"] = X[c].map(mapper)
    # build a fresh frame instead of assigning into X, which is a slice of df
    X2 = pd.DataFrame(encoded, index=X.index)

    return X2, y

_split_data(df, target)

Split a dataframe into features and target

Parameters:

Name Type Description Default
df DataFrame

the dataframe to split

required
target str

the name of the target column

required

Returns:

Type Description
(DataFrame, DataFrame)

a tuple of (features, target)

Source code in src/ssvc/csv_analyzer.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def _split_data(df: pd.DataFrame, target: str) -> (pd.DataFrame, pd.DataFrame):
    """
    Split a dataframe into features and target

    Args:
        df: the dataframe to split
        target: the name of the target column

    Returns:
        a tuple of (features, target)
    """

    # construct feature list
    features = [c for c in df.columns if c != target]
    y = df[target]
    X = df[features]
    return X, y

drop_col_feature_importance(df, target)

Compute feature importance using drop column feature importance

Parameters:

Name Type Description Default
df DataFrame

the dataframe to analyze

required
target str

the name of the target column to analyze against

required

Returns:

Type Description
DataFrame

a dataframe of feature importances

Source code in src/ssvc/csv_analyzer.py
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
def drop_col_feature_importance(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """
    Compute drop column feature importance for a dataframe

    Args:
        df: the dataframe to analyze
        target: the name of the target column to analyze against

    Returns:
        a dataframe of feature importances
    """
    features, labels = _prepare_data(df, target)
    # fixed random_state so the tree is the same from run to run
    classifier = DecisionTreeClassifier(criterion="entropy", random_state=99)
    return _drop_col_feat_imp(classifier, features, labels)

permute_feature_importance(df, target)

Compute feature importance using permutation feature importance

Parameters:

Name Type Description Default
df DataFrame

the dataframe to analyze

required
target str

the name of the target column to analyze against

required

Returns:

Type Description
DataFrame

a dataframe of feature importances

Source code in src/ssvc/csv_analyzer.py
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
def permute_feature_importance(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """
    Compute permutation feature importance for a dataframe

    Args:
        df: the dataframe to analyze
        target: the name of the target column to analyze against

    Returns:
        a dataframe of feature importances
    """
    features, labels = _prepare_data(df, target)
    # fixed random_state so the tree is the same from run to run
    classifier = DecisionTreeClassifier(criterion="entropy", random_state=99)
    return _perm_feat_imp(classifier, features, labels)