Commit 49e94e6d authored by Laurens D'hooge's avatar Laurens D'hooge
Browse files

CTU-13 HFBR code

parent 59bcc83b
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import tree
from scipy.stats import friedmanchisquare, wilcoxon
from statsmodels.stats.multitest import fdrcorrection
# Hierarchical Feature Block Ranking (HFBR) for the CTU-13 botnet subsets.
# For every cleaned subset (scenarios 1-13 plus the combined "all" set):
#   1. load the parquet file and binarize the label (botnet flow vs. rest),
#   2. drop features flagged by the earlier VIF analysis, deduplicate rows,
#   3. undersample the majority class so both classes are balanced,
#   4. robust-scale the continuous features ((x - median) / IQR),
#   5. run nested CV (RandomizedSearchCV inside cross_validate) with an
#      ExtraTreesClassifier over a range of train/test splits,
#   6. rank features by averaged importances and compare consecutive pairs
#      with one-sided Wilcoxon tests, FDR-corrected,
#   7. dump all results to one JSON file per subset.
df = None
d = "../../ctu-13/clean"
ds = [f"{d}/{i}" for i in range(1, 14)]
ds.append(f"{d}/all")
for d in ds:
    # FIX: comprehension variable renamed from `d` to `fname` so it no longer
    # shadows the directory variable (the original worked, but only by accident
    # of comprehension scoping, and was very confusing to read).
    for subset in [fname for fname in os.listdir(d) if fname.endswith('.parquet')]:
        full_path = os.path.join(d, subset)
        print(f"Hierarchical Feature Block Selection calculations for {full_path}")
        df = pd.read_parquet(full_path)
        df = df.drop(labels=['state'], axis=1)
        # Binary target: 1.0 for flows whose label starts with 'flow=From-Botnet'.
        df['label'] = df['label'].astype('object')
        df['label'] = df['label'].str.startswith('flow=From-Botnet', na=False)
        df['label'] = df['label'].astype(dtype=np.float32, copy=False)
        # VIF drop: remove the multicollinear features selected by the VIF script.
        # FIX: context manager so the file handle is closed (original leaked it).
        with open(f'./VIF-calculations/{os.path.splitext(subset)[0]}-VIF-removal.txt', 'r') as vif_file:
            vif_labels = vif_file.read().rstrip().split(',')
        print(vif_labels)
        df.drop(labels=vif_labels, axis=1, inplace=True)
        print(df.shape)
        df = df.astype(dtype='float32')
        print(df.dtypes)
        print(df.shape)
        df.drop_duplicates(subset=None, keep='first', inplace=True)
        print(df.shape)
        # Balancing df: value_counts() sorts descending, so iloc[0] is the
        # majority-class count and iloc[-1] the minority-class count; sample
        # (m - n) majority rows without replacement and drop them.
        col = df.columns[-1]
        vc = df[col].value_counts()
        n = vc.iloc[-1]
        m = vc.iloc[0]
        initial_cut = df.loc[df[col] == vc.index[0]].sample(n=int(m - n), replace=False)
        df = df.drop(index=initial_cut.index)
        print("SHAPE AFTER BALANCING")
        print(df.shape)
        print(df[col].value_counts())
        # One-hot/categorical indicator columns and the label must not be
        # robust-scaled; everything else is a scaling candidate.
        ctu_scalable = list(df.columns)
        for omit_scale_ctu in ('proto_arp', 'proto_esp', 'proto_gre', 'proto_icmp', 'proto_igmp', 'proto_ipnip', 'proto_ipv6', 'proto_ipv6-icmp', 'proto_ipx/spx', 'proto_llc', 'proto_pim', 'proto_rarp', 'proto_rsvp', 'proto_rtcp', 'proto_rtp', 'proto_udt', 'proto_unas', 'dir_<-', 'dir_<->', 'dir_<?', 'dir_<?>', 'dir_?>', 'dir_who', 'state', 'label'):
            try:
                ctu_scalable.remove(omit_scale_ctu)
            except ValueError:
                print(f"{omit_scale_ctu} already not part of df, skipping")
        # Drop features whose variance collapsed to zero after balancing.
        zero_variance_after_balance = []
        for c in df.columns[:-1]:
            if df[c].var() == 0.0:
                df = df.drop(labels=[c], axis=1)
                zero_variance_after_balance.append(c)
                # FIX: also forget the column for the scaling pass below; the
                # original left it in ctu_scalable, so df[c] in the next loop
                # would raise KeyError for any zero-variance column.
                if c in ctu_scalable:
                    ctu_scalable.remove(c)
                print(f"Removed {c} because after balancing, its variance is 0.0")
        # Robust scaling: (x - median) / IQR, skipped when the IQR is zero.
        for c in ctu_scalable:
            qs = df[c].quantile(q=[0.25, 0.50, 0.75], interpolation='linear')
            if qs[0.75] - qs[0.25] == 0.0:
                continue
            else:
                df[c] = (df[c] - qs[0.50]) / \
                    (qs[0.75] - qs[0.25])
        remaining_columns = list(df.columns)
        remaining_columns.remove('label')
        remaining_columns_count = len(remaining_columns)
        label_loc = df.columns.get_loc('label')
        print(df.head(2))
        array = df.values
        X = np.delete(array, label_loc, 1)
        y = array[:, label_loc]
        nested_score_mean = None
        nested_score_std = None
        # Random-search space for the ExtraTrees hyper-parameters.
        parameter_grid = {
            "n_estimators": [100, 250, 500, 1000],
            "criterion": ["gini", "entropy"],
            "max_depth": [3, 6, 9, 12, 15],
            "min_samples_split": [2],
            "min_samples_leaf": [0.0001, 0.0005, 0.001, 0.005, 0.01],
            "min_weight_fraction_leaf": [0.],
            "max_features": ["sqrt"],
            "max_leaf_nodes": [None],
            "min_impurity_decrease": [0.],
            "class_weight": [None, "balanced"],
            "ccp_alpha": [0.0],
            "max_samples": [None]
        }
        # Paired train/test fractions: each experiment trains on a small slice
        # and evaluates on the (large) complement.
        train_sizes = [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.125, 0.15]
        test_sizes = [0.95, 0.94, 0.93, 0.92, 0.91, 0.90, 0.875, 0.85]
        inner_cv_splits = 4
        outer_cv_splits = 10
        result_dict = {}
        result_dict["dataset"] = "ctu13"
        result_dict["subset"] = os.path.splitext(subset)[0]
        result_dict["atk_types"] = "botnet"
        result_dict["test_sizes"] = test_sizes
        result_dict["remaining_features"] = remaining_columns
        result_dict["test_scores"] = []
        result_dict["test_score_avg"] = []
        result_dict["test_score_std"] = []
        result_dict["stack_rows"] = []
        result_dict["wilcox_descending_feature_pairs"] = []
        result_dict["wilcox_p_array"] = []
        result_dict["wilcox_p_reject_non_corrected"] = []
        result_dict["wilcox_p_array_corrected"] = []
        result_dict["wilcox_p_reject_corrected"] = []
        result_dict["zero_variance_after_balance"] = zero_variance_after_balance
        for ts_index, ts in enumerate(test_sizes):
            print(f"train / validate on {X.shape[0]*train_sizes[ts_index]}")
            print(f"test on {X.shape[0]*test_sizes[ts_index]}")
            # FIX: use inner_cv_splits instead of a hard-coded 4 so the
            # variable declared above actually controls the inner CV.
            inner_sss = StratifiedShuffleSplit(n_splits=inner_cv_splits, test_size=0.25)
            outer_sss = StratifiedShuffleSplit(n_splits=outer_cv_splits, test_size=ts)
            est = ExtraTreesClassifier(n_jobs=os.cpu_count())
            param_cv = RandomizedSearchCV(estimator=est, param_distributions=parameter_grid,
                                          cv=inner_sss, n_iter=50, scoring="balanced_accuracy", n_jobs=os.cpu_count(), verbose=0)
            # Nested CV: hyper-parameter search in the inner loop, unbiased
            # scoring in the outer loop.
            nested_score = cross_validate(param_cv, X=X, y=y, cv=outer_sss, n_jobs=os.cpu_count(),
                                          scoring='balanced_accuracy', return_train_score=True, return_estimator=True, verbose=1)
            nested_score_mean = nested_score["test_score"].mean()
            nested_score_std = nested_score["test_score"].std()
            print("Average balanced accuracy scores from the 10 outer validation folds")
            print(nested_score_mean, " +- ", nested_score_std)
            # Stack the per-fold feature importances: one row per feature after
            # the transpose, one column per outer fold.
            stack_raw = [nested_score["estimator"][i].best_estimator_.feature_importances_ for i in range(outer_cv_splits)]
            stack = np.array(stack_raw)
            print("stack shape", stack.shape)
            print(stack)
            stack = stack.transpose()
            print("transposed stack shape", stack.shape)
            print(stack)
            stack_rows = np.vsplit(stack, stack.shape[0])
            print("vsplit stacks", len(stack_rows))
            print(stack_rows)
            # Omnibus test: do the per-feature importance distributions differ?
            friedman, p_friedman = friedmanchisquare(*stack_rows)
            print('stat=%.3f, p=%.6f' % (friedman, p_friedman))
            if p_friedman > 0.05:
                print('Probably the same distribution')
            else:
                print('Probably different distributions')
            # Average importance of each feature across the outer folds.
            av = np.zeros(remaining_columns_count)
            for i in range(outer_cv_splits):
                av = av + \
                    nested_score["estimator"][i].best_estimator_.feature_importances_
            av = av / outer_cv_splits
            wilcox_p_array = np.array([])
            wilcox_p_reject = np.array([])
            av_sorted = np.sort(av)
            wilcox_descending_feature_pairs = np.array([])
            # Walk the importance ranking from highest to lowest and test each
            # consecutive pair with a one-sided Wilcoxon signed-rank test
            # (H1: the higher-ranked feature's importances are greater).
            # Pairs with tied average importances (index lookup not unique)
            # are skipped.
            for i in range(1, remaining_columns_count):
                high_index = np.where(av == av_sorted[-i])[0]
                low_index = np.where(av == av_sorted[-(i+1)])[0]
                if len(high_index) == 1 and len(low_index) == 1:
                    print("high feature avg contrib", remaining_columns[high_index[0]], ": ", av[high_index[0]])
                    print("low feature avg contrib", remaining_columns[low_index[0]], ": ", av[low_index[0]])
                    wilcox_descending_feature_pairs = np.append(wilcox_descending_feature_pairs, np.array([remaining_columns[high_index[0]], remaining_columns[low_index[0]]]))
                    wilcox, p_wilcox = wilcoxon(stack_rows[high_index[0]].flatten(), stack_rows[low_index[0]].flatten(), alternative='greater')
                    wilcox_p_array = np.append(wilcox_p_array, p_wilcox)
                    print('1-SIDE wilcox stat=%.3f, p=%.6f' % (wilcox, p_wilcox))
                    if p_wilcox > 0.05:
                        print("don't reject h0")
                        wilcox_p_reject = np.append(wilcox_p_reject, False)
                    else:
                        print("reject h0")
                        wilcox_p_reject = np.append(wilcox_p_reject, True)
                else:
                    print("average feature importances at low indices", av[low_index])
                    print("average feature importances at high indices", av[high_index])
            # Benjamini-Hochberg FDR correction over all pairwise p-values.
            rejected, pvalue_corrected = fdrcorrection(pvals=wilcox_p_array, alpha=0.05, method='indep')
            print(len(wilcox_p_array), wilcox_p_array)
            print(len(wilcox_p_reject), wilcox_p_reject)
            print(len(rejected), rejected)
            print(len(pvalue_corrected), pvalue_corrected)
            result_dict["test_scores"].append(nested_score["test_score"].tolist())
            result_dict["test_score_avg"].append(nested_score_mean)
            result_dict["test_score_std"].append(nested_score_std)
            result_dict["stack_rows"].append([x.tolist() for x in stack_rows])
            result_dict["wilcox_descending_feature_pairs"].append(wilcox_descending_feature_pairs.tolist())
            result_dict["wilcox_p_array"].append(wilcox_p_array.tolist())
            result_dict["wilcox_p_reject_non_corrected"].append(wilcox_p_reject.tolist())
            result_dict["wilcox_p_array_corrected"].append(pvalue_corrected.tolist())
            result_dict["wilcox_p_reject_corrected"].append(rejected.tolist())
        # Persist everything for this subset in one JSON document.
        with open(f"./HFBR-calculations/{os.path.splitext(subset)[0]}-HFBR-calculations.json", "w", encoding='utf-8') as outfile:
            json.dump(result_dict, outfile, indent=4)
import cuml
import cudf
import cupy
from cuml.model_selection import train_test_split
from cuml.linear_model import LinearRegression as cuLR
import os
# Variance Inflation Factor (VIF) feature elimination for CTU-13, on GPU.
# For every cleaned subset: regress each feature on all the others with a
# cuML linear model, compute VIF = 1 / (1 - R^2), and iteratively drop the
# feature with the largest VIF until none exceeds the cutoff.  Removed
# feature names are appended to a per-subset text file consumed by the
# HFBR script.
df = None
d = "../../ctu-13/clean"
ds = [f"{d}/{i}" for i in range(1, 14)]
ds.append(f"{d}/all")
for d in ds:
    # Comprehension variable renamed so it does not shadow the directory `d`.
    for subset in [fname for fname in os.listdir(d) if fname.endswith('.parquet')]:
        full_path = os.path.join(d, subset)
        print(f"Variance Inflation Factor calculations for {full_path}")
        df = cudf.read_parquet(full_path, ignore_index=True)
        df = df.drop(labels=['label', 'state'], axis=1)
        # Standardize every non-constant column (zero mean, unit variance);
        # constant columns are left untouched to avoid division by zero.
        for c in df.columns:
            sig = df[c].std()
            if sig == 0.0:
                continue
            else:
                df[c] = (df[c] - df[c].mean()) / sig
        df = df.astype(dtype=cupy.float32, copy=False)
        cutoff = 5.0  # conventional VIF threshold
        removed_features = set()
        complete = False
        while not complete:
            # FIX: seed the running maximum from `cutoff` instead of a
            # duplicated literal 5.0, so the threshold lives in one place.
            biggest_vif = cutoff
            biggest_vif_feature = None
            dropped_one_early = False
            for i, col_under_test in enumerate(df.columns):
                # Regress the candidate on all other features.  train_size=1.0
                # keeps all rows; the split is only a convenient way to obtain
                # aligned X/y cudf objects.
                X_reg_train, _, y_reg_train, _ = train_test_split(df.drop(columns=[col_under_test]),
                                                                  df[col_under_test],
                                                                  train_size=1.0)
                cuml_reg_model = cuLR(fit_intercept=True, normalize=False, algorithm='eig')
                trained_LR = cuml_reg_model.fit(X_reg_train, y_reg_train)
                cu_preds = trained_LR.predict(X_reg_train)
                cu_r2 = cuml.metrics.r2_score(y_reg_train, cu_preds)
                if cu_r2 == 1.0:
                    # Perfect collinearity -> infinite VIF: drop immediately
                    # and restart the scan over the reduced feature set.
                    df = df.drop(labels=[col_under_test], axis=1)
                    dropped_one_early = True
                    removed_features.add(col_under_test)
                    print('dropped one early', col_under_test, 'vif = +inf')
                    break
                elif 1 / (1-cu_r2) > biggest_vif:
                    biggest_vif = 1 / (1-cu_r2)
                    biggest_vif_feature = col_under_test
            print('biggest vif feature', biggest_vif_feature, biggest_vif)
            if dropped_one_early:
                continue
            elif biggest_vif_feature is not None:
                # FIX: record the feature only at the moment it is actually
                # dropped.  The original added biggest_vif_feature to the set
                # unconditionally before this branch, which (a) inserted None
                # on the terminating pass (patched later by remove(None)) and
                # (b) when a perfectly-collinear column triggered the early
                # drop AFTER a high-VIF candidate had been found, recorded
                # that candidate as removed even though it was never dropped
                # from df in that pass.
                removed_features.add(biggest_vif_feature)
                df = df.drop(labels=[biggest_vif_feature], axis=1)
            else:
                # No feature above the cutoff remains: elimination finished.
                complete = True
        print(df.shape)
        print(df.columns)
        print("Removed")
        print(removed_features)
        # None can no longer end up in the set; discard defensively anyway
        # (discard, unlike remove, does not raise when absent).
        removed_features.discard(None)
        # Append mode preserves results from earlier runs in the same file.
        with open(f'VIF-calculations/{os.path.splitext(subset)[0]}-VIF-removal.txt', 'a') as fd:
            # FIX: write via a new name instead of rebinding the set variable
            # to a string, which made the type of removed_features ambiguous.
            removed_line = ','.join(removed_features) + '\n'
            fd.write(removed_line)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment