Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Laurens D'hooge
clean-ids-collection
Commits
b11b7fc2
Commit
b11b7fc2
authored
Oct 13, 2021
by
Laurens D'hooge
Browse files
UNSW-NB15 HFBR code
parent
ce020c14
Changes
5
Hide whitespace changes
Inline
Side-by-side
unsw-nb15/HFBRs-feature-selection/HFBR-calculation-unswnb15.py
0 → 100644
View file @
b11b7fc2
import
pandas
as
pd
import
numpy
as
np
import
json
import
os
from
sklearn.model_selection
import
StratifiedShuffleSplit
,
cross_val_score
,
learning_curve
,
cross_validate
,
RandomizedSearchCV
,
cross_val_predict
from
sklearn.ensemble
import
ExtraTreesClassifier
from
scipy.stats
import
friedmanchisquare
,
wilcoxon
from
statsmodels.stats.multitest
import
fdrcorrection
# HFBR feature-selection experiment for UNSW-NB15.
#
# For each cleaned parquet subset this script:
#   1. drops label-like / id-like columns and encodes categoricals,
#   2. removes the features flagged by the earlier VIF pass,
#   3. balances the two classes by down-sampling the majority class,
#   4. robust-scales the continuous features (median / IQR),
#   5. runs nested CV (RandomizedSearchCV inside StratifiedShuffleSplit
#      outer folds) with an ExtraTreesClassifier over a range of test sizes,
#   6. compares consecutively ranked feature importances with one-sided
#      Wilcoxon tests (FDR-corrected) after a global Friedman test,
#   7. dumps all results to ./HFBR-calculations/<subset>-HFBR-calculations.json.
df = None
ds = ['../../unsw-nb15/clean',
      '../../unsw-nb15/clean/designated-train-test-sets']
for d in ds:
    # Every parquet file in the directory is one dataset subset.
    # (fix: comprehension variable renamed so it no longer shadows `d`.)
    for subset in [fname for fname in os.listdir(d) if fname.endswith('.parquet')]:
        full_path = os.path.join(d, subset)
        print(f"Hierarchical Feature Block Selection calculations for {full_path}")
        df = pd.read_parquet(full_path)
        attack_types = list(df['attack_cat'].value_counts().index)
        # 'attack_cat' (multi-class label) and 'dsport' must not leak into the
        # binary classification task.
        df = df.drop(labels=['attack_cat', 'dsport'], axis=1)
        df['label'] = df['label'].astype(np.float32)
        df.rename(columns={'label': 'Label'}, inplace=True)
        # Categorical dtypes -> integer codes so the whole frame can be float32.
        df['proto'] = df['proto'].cat.codes
        df['state'] = df['state'].cat.codes
        df['service'] = df['service'].cat.codes
        # Timestamps -> POSIX seconds.
        df['stime'] = (df['stime'] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
        df['ltime'] = (df['ltime'] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
        print(df.dtypes)
        # VIF drop: remove the multicollinear features recorded by the VIF pass.
        # (fix: context manager so the file handle is closed deterministically.)
        with open(f'./VIF-calculations/{os.path.splitext(subset)[0]}-VIF-removal.txt', 'r') as vif_fd:
            vif_labels = vif_fd.read().rstrip().split(',')
        print(vif_labels)
        df.drop(labels=vif_labels, axis=1, inplace=True)
        print(df.shape)
        df = df.astype(dtype='float32')
        print(df.dtypes)
        print(df.shape)
        df.drop_duplicates(subset=None, keep='first', inplace=True)
        print(df.shape)
        # Balancing df: down-sample the majority class to the minority count.
        col = df.columns[-1]  # label column ('Label' is last after the drops)
        vc = df[col].value_counts()
        n = vc.iloc[-1]  # minority-class count
        m = vc.iloc[0]   # majority-class count
        print("N AND M")
        print(n, m)
        # Skip subsets that are already balanced or too small to be useful.
        if m == n or n < 200 or m < 200:
            continue
        initial_cut = df.loc[df[col] == vc.index[0]].sample(n=int(m - n), replace=False)
        df = df.drop(index=initial_cut.index)
        print("SHAPE AFTER BALANCING")
        print(df.shape)
        print(df[col].value_counts())
        # Robust-scale everything except categorical codes, binary flags,
        # timestamps and the label.
        unsw_scalable = list(df.columns)
        for omit_scale_unsw in ('proto', 'state', 'service', 'stime', 'ltime',
                                'is_sm_ips_ports', 'is_ftp_login', 'Label'):
            try:
                unsw_scalable.remove(omit_scale_unsw)
            except ValueError:
                print(f"{omit_scale_unsw} already not part of df, skipping")
        # Drop features whose variance collapsed to zero after balancing.
        zero_variance_after_balance = []
        for c in df.columns[:-1]:
            if df[c].var() == 0.0:
                df = df.drop(labels=[c], axis=1)
                zero_variance_after_balance.append(c)
                try:
                    unsw_scalable.remove(c)
                except ValueError:
                    pass
                print(f"Removed {c} because after balancing, its variance is 0.0")
        # Robust scaling: (x - median) / IQR, skipped when the IQR is zero.
        for c in unsw_scalable:
            qs = df[c].quantile(q=[0.25, 0.50, 0.75], interpolation='linear')
            if qs[0.75] - qs[0.25] == 0.0:
                continue
            df[c] = (df[c] - qs[0.50]) / (qs[0.75] - qs[0.25])
        remaining_columns = list(df.columns)
        remaining_columns.remove('Label')
        remaining_columns_count = len(remaining_columns)
        label_loc = df.columns.get_loc('Label')
        print(df.head(2))
        array = df.values
        X = np.delete(array, label_loc, 1)
        y = array[:, label_loc]
        nested_score_mean = None
        nested_score_std = None
        # Hyper-parameter search space for the ExtraTrees model.
        parameter_grid = {
            "n_estimators": [100, 250, 500, 1000],
            "criterion": ["gini", "entropy"],
            "max_depth": [3, 6, 9, 12, 15],
            "min_samples_split": [2],
            "min_samples_leaf": [0.0001, 0.0005, 0.001, 0.005, 0.01],
            "min_weight_fraction_leaf": [0.],
            "max_features": ["sqrt"],
            "max_leaf_nodes": [None],
            "min_impurity_decrease": [0.],
            "class_weight": [None, "balanced"],
            "ccp_alpha": [0.0],
            "max_samples": [None],
        }
        # train_sizes[i] + test_sizes[i] == 1.0 for every i.
        train_sizes = [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.125, 0.15]
        test_sizes = [0.95, 0.94, 0.93, 0.92, 0.91, 0.90, 0.875, 0.85]
        inner_cv_splits = 4
        outer_cv_splits = 10
        result_dict = {}
        result_dict["dataset"] = "unswnb15"
        result_dict["subset"] = os.path.splitext(subset)[0]
        result_dict["atk_types"] = attack_types
        result_dict["test_sizes"] = test_sizes
        result_dict["remaining_features"] = remaining_columns
        result_dict["test_scores"] = []
        result_dict["test_score_avg"] = []
        result_dict["test_score_std"] = []
        result_dict["stack_rows"] = []
        result_dict["wilcox_descending_feature_pairs"] = []
        result_dict["wilcox_p_array"] = []
        result_dict["wilcox_p_reject_non_corrected"] = []
        result_dict["wilcox_p_array_corrected"] = []
        result_dict["wilcox_p_reject_corrected"] = []
        result_dict["zero_variance_after_balance"] = zero_variance_after_balance
        for ts_index, ts in enumerate(test_sizes):
            print(f"train / validate on {X.shape[0] * train_sizes[ts_index]}")
            print(f"test on {X.shape[0] * test_sizes[ts_index]}")
            # fix: use the inner_cv_splits constant instead of hard-coding 4.
            inner_sss = StratifiedShuffleSplit(n_splits=inner_cv_splits, test_size=0.25)
            outer_sss = StratifiedShuffleSplit(n_splits=outer_cv_splits, test_size=ts)
            est = ExtraTreesClassifier(n_jobs=os.cpu_count())
            param_cv = RandomizedSearchCV(estimator=est,
                                          param_distributions=parameter_grid,
                                          cv=inner_sss,
                                          n_iter=50,
                                          scoring="balanced_accuracy",
                                          n_jobs=os.cpu_count(),
                                          verbose=0)
            # Nested CV: the randomized search is refit inside every outer fold.
            nested_score = cross_validate(param_cv, X=X, y=y, cv=outer_sss,
                                          n_jobs=os.cpu_count(),
                                          scoring='balanced_accuracy',
                                          return_train_score=True,
                                          return_estimator=True,
                                          verbose=1)
            nested_score_mean = nested_score["test_score"].mean()
            nested_score_std = nested_score["test_score"].std()
            print("Average balanced accuracy scores from the 10 outer validation folds")
            print(nested_score_mean, " +- ", nested_score_std)
            # One importance vector per outer fold; transpose so each row of
            # `stack` holds one feature's importances across the folds.
            stack_raw = [nested_score["estimator"][i].best_estimator_.feature_importances_
                         for i in range(outer_cv_splits)]
            stack = np.array(stack_raw)
            print("stack shape", stack.shape)
            print(stack)
            stack = stack.transpose()
            print("transposed stack shape", stack.shape)
            print(stack)
            # One (1, outer_cv_splits) array per feature.
            stack_rows = np.vsplit(stack, stack.shape[0])
            print("vsplit stacks", len(stack_rows))
            print(stack_rows)
            # Global test: do the per-feature importance distributions differ?
            friedman, p_friedman = friedmanchisquare(*stack_rows)
            print('stat=%.3f, p=%.6f' % (friedman, p_friedman))
            if p_friedman > 0.05:
                print('Probably the same distribution')
            else:
                print('Probably different distributions')
            # Average importance per feature across the outer folds.
            av = np.zeros(remaining_columns_count)
            for i in range(outer_cv_splits):
                av = av + nested_score["estimator"][i].best_estimator_.feature_importances_
            av = av / outer_cv_splits
            wilcox_p_array = np.array([])
            wilcox_p_reject = np.array([])
            av_sorted = np.sort(av)
            wilcox_descending_feature_pairs = np.array([])
            # Walk the importance ranking from highest to lowest and test each
            # adjacent pair with a one-sided Wilcoxon (H1: higher > lower).
            for i in range(1, remaining_columns_count):
                high_index = np.where(av == av_sorted[-i])[0]
                low_index = np.where(av == av_sorted[-(i + 1)])[0]
                # Tied averages make the np.where lookup ambiguous; only
                # uniquely matched pairs are tested.
                if len(high_index) == 1 and len(low_index) == 1:
                    print("high feature avg contrib", remaining_columns[high_index[0]], ": ", av[high_index[0]])
                    print("low feature avg contrib", remaining_columns[low_index[0]], ": ", av[low_index[0]])
                    wilcox_descending_feature_pairs = np.append(
                        wilcox_descending_feature_pairs,
                        np.array([remaining_columns[high_index[0]],
                                  remaining_columns[low_index[0]]]))
                    wilcox, p_wilcox = wilcoxon(stack_rows[high_index[0]].flatten(),
                                                stack_rows[low_index[0]].flatten(),
                                                alternative='greater')
                    wilcox_p_array = np.append(wilcox_p_array, p_wilcox)
                    print('1-SIDE wilcox stat=%.3f, p=%.6f' % (wilcox, p_wilcox))
                    if p_wilcox > 0.05:
                        print("don't reject h0")
                        wilcox_p_reject = np.append(wilcox_p_reject, False)
                    else:
                        print("reject h0")
                        wilcox_p_reject = np.append(wilcox_p_reject, True)
                else:
                    print("average feature importances at low indices", av[low_index])
                    print("average feature importances at high indices", av[high_index])
            # Benjamini-Hochberg FDR correction over the pairwise p-values.
            rejected, pvalue_corrected = fdrcorrection(pvals=wilcox_p_array,
                                                       alpha=0.05, method='indep')
            print(len(wilcox_p_array), wilcox_p_array)
            print(len(wilcox_p_reject), wilcox_p_reject)
            print(len(rejected), rejected)
            print(len(pvalue_corrected), pvalue_corrected)
            result_dict["test_scores"].append(nested_score["test_score"].tolist())
            result_dict["test_score_avg"].append(nested_score_mean)
            result_dict["test_score_std"].append(nested_score_std)
            result_dict["stack_rows"].append([x.tolist() for x in stack_rows])
            result_dict["wilcox_descending_feature_pairs"].append(wilcox_descending_feature_pairs.tolist())
            result_dict["wilcox_p_array"].append(wilcox_p_array.tolist())
            result_dict["wilcox_p_reject_non_corrected"].append(wilcox_p_reject.tolist())
            result_dict["wilcox_p_array_corrected"].append(pvalue_corrected.tolist())
            result_dict["wilcox_p_reject_corrected"].append(rejected.tolist())
            # Checkpoint after every test size: mode "w" rewrites the file with
            # the accumulated results, so a crash loses at most one iteration.
            with open(f"./HFBR-calculations/{os.path.splitext(subset)[0]}-HFBR-calculations.json",
                      "w", encoding='utf-8') as outfile:
                json.dump(result_dict, outfile, indent=4)
unsw-nb15/HFBRs-feature-selection/VIF-calculations/UNSW-NB15-VIF-removal.txt
0 → 100644
View file @
b11b7fc2
dloss,stime,ct_src_dport_ltm,is_ftp_login,tcprtt,sintpkt,spkts,dpkts,dwin,sloss
unsw-nb15/HFBRs-feature-selection/VIF-calculations/UNSW_NB15_testing-set-VIF-removal.txt
0 → 100644
View file @
b11b7fc2
dloss,ct_dst_src_ltm,ct_src_dport_ltm,is_ftp_login,tcprtt,sinpkt,spkts,dpkts,dwin,ct_srv_dst,sloss
unsw-nb15/HFBRs-feature-selection/VIF-calculations/UNSW_NB15_training-set-VIF-removal.txt
0 → 100644
View file @
b11b7fc2
dloss,ct_dst_src_ltm,ct_src_dport_ltm,tcprtt,sinpkt,ct_ftp_cmd,dpkts,spkts,dwin,ct_srv_dst,sloss
unsw-nb15/HFBRs-feature-selection/VIF-selection-unswnb15.py
0 → 100644
View file @
b11b7fc2
import
cuml
import
cudf
import
cupy
from
cuml.model_selection
import
train_test_split
from
cuml.linear_model
import
LinearRegression
as
cuLR
import
os
# VIF-based multicollinearity pruning for UNSW-NB15 (RAPIDS / GPU).
#
# For each cleaned parquet subset: standardize the numeric features, then
# iteratively regress every feature on all the others with a cuML linear
# model and drop the feature with the largest Variance Inflation Factor
# (VIF = 1 / (1 - R^2)) until every remaining VIF is <= the cutoff.
# The removed feature names are appended, comma-separated, to
# VIF-calculations/<subset>-VIF-removal.txt.
df = None
ds = ["../../unsw-nb15/clean",
      "../../unsw-nb15/clean/designated-train-test-sets"]
for d in ds:
    # (fix: comprehension variable renamed so it no longer shadows `d`.)
    for subset in [fname for fname in os.listdir(d) if fname.endswith('.parquet')]:
        full_path = os.path.join(d, subset)
        print(f"Variance Inflation Factor calculations for {full_path}")
        df = cudf.read_parquet(full_path, ignore_index=True)
        # Labels play no part in the collinearity analysis.
        df = df.drop(labels=['attack_cat', 'label'], axis=1)
        print(df.dtypes)
        print(df['proto'].value_counts())
        print(df['state'].value_counts())
        print(df['service'].value_counts())
        # Categorical features are excluded from the (linear) VIF analysis.
        df = df.drop(labels=['proto', 'state', 'service'], axis=1)
        # Z-score standardization; constant columns are left untouched.
        for c in df.columns:
            sig = df[c].std()
            if sig == 0.0:
                continue
            df[c] = (df[c] - df[c].mean()) / sig
        df = df.astype(dtype=cupy.float32, copy=False)
        cutoff = 5.0  # conventional VIF acceptance threshold
        removed_features = set()
        complete = False
        while not complete:
            # fix: seed the running maximum from `cutoff` instead of repeating
            # the literal 5.0, so the threshold lives in one place.
            biggest_vif = cutoff
            biggest_vif_feature = None
            dropped_one_early = False
            for i, col_under_test in enumerate(df.columns):
                # train_size=1.0: no held-out split is wanted; the regression
                # only measures in-sample R^2 of the feature vs. its peers.
                X_reg_train, _, y_reg_train, _ = train_test_split(
                    df.drop(columns=[col_under_test]),
                    df[col_under_test],
                    train_size=1.0)
                cuml_reg_model = cuLR(fit_intercept=True, normalize=False,
                                      algorithm='eig')
                trained_LR = cuml_reg_model.fit(X_reg_train, y_reg_train)
                cu_preds = trained_LR.predict(X_reg_train)
                cu_r2 = cuml.metrics.r2_score(y_reg_train, cu_preds)
                if cu_r2 == 1.0:
                    # Perfect fit -> infinite VIF: drop immediately and restart
                    # the scan on the reduced frame.
                    df = df.drop(labels=[col_under_test], axis=1)
                    dropped_one_early = True
                    removed_features.add(col_under_test)
                    print('dropped one early', col_under_test, 'vif = +inf')
                    break
                elif 1 / (1 - cu_r2) > biggest_vif:
                    biggest_vif = 1 / (1 - cu_r2)
                    biggest_vif_feature = col_under_test
            # May add None when no feature exceeded the cutoff; discarded below.
            # NOTE(review): after an early (+inf) drop this can also record a
            # stale biggest_vif_feature that is NOT dropped from df this round
            # (the `continue` below skips the drop) — confirm that is intended.
            removed_features.add(biggest_vif_feature)
            print('biggest vif feature', biggest_vif_feature, biggest_vif)
            if dropped_one_early:
                continue
            elif biggest_vif_feature is not None:
                # fix: identity comparison (`is not None`) instead of `!= None`.
                df = df.drop(labels=[biggest_vif_feature], axis=1)
            else:
                # No remaining feature has VIF above the cutoff: done.
                complete = True
        print(df.shape)
        print(df.columns)
        print("Removed")
        print(removed_features)
        # fix: discard() instead of remove() so the cleanup cannot raise
        # KeyError if None was never recorded.
        removed_features.discard(None)
        with open(f'VIF-calculations/{os.path.splitext(subset)[0]}-VIF-removal.txt', 'a') as fd:
            fd.write(','.join(removed_features) + '\n')
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment