Laurens D'hooge / clean-ids-collection / Commits

Commit 8c6fde9d
Authored Oct 13, 2021 by Laurens D'hooge

ISCX-IDS2012 HFBR code

Parent: 49e94e6d
Changes: 15 files
iscx-ids2012/HFBRs-feature-selection/HFBR-calculation-iscxids2012.py (new file, mode 100644)
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import tree
from scipy.stats import friedmanchisquare, wilcoxon
from statsmodels.stats.multitest import fdrcorrection
df = None
d = "../../iscx-ids2012/clean"
atk_types = {
    'TestbedMonJun14Flows.parquet': [],
    'TestbedThuJun17-3Flows.parquet': [],
    'TestbedWedJun16-2Flows.parquet': [],
    'TestbedSunJun13Flows.parquet': [],
    'TestbedWedJun16-1Flows.parquet': [],
    'TestbedSatJun12Flows.parquet': [],
    'TestbedThuJun17-2Flows.parquet': [],
    'TestbedTueJun15-3Flows.parquet': [],
    'TestbedWedJun16-3Flows.parquet': [],
    'TestbedTueJun15-2Flows.parquet': [],
    'TestbedTueJun15-1Flows.parquet': [],
    'TestbedThuJun17-1Flows.parquet': [],
    'iscx-ids2012.parquet': []
}
for subset in [f for f in os.listdir(d) if f.endswith('.parquet')]:
    full_path = os.path.join(d, subset)
    print(f"Hierarchical Feature Block Selection calculations for {full_path}")
    df = pd.read_parquet(full_path)
    # Binarize the label: every non-"Normal" tag becomes 1.0 (attack), the rest 0.0
    tag_idx = df.columns.size - 1
    df['tag'] = df['tag'].astype('object')
    attacks = df.loc[df['tag'] != "Normal"].index
    df.iloc[attacks, tag_idx] = 1.0
    df.iloc[df.index.difference(attacks), tag_idx] = 0.0
    df['tag'] = df['tag'].astype(dtype=np.float32, copy=False)
    print(df['tag'].value_counts())
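    # Aside: the binarization above can also be written as one vectorized
    # expression; a minimal sketch on a toy Series (hypothetical names,
    # illustrative values, not the subset data):
    _toy_tags = pd.Series(["Normal", "Infiltration", "Normal"])
    _toy_binary = (_toy_tags != "Normal").astype(np.float32)  # -> [0.0, 1.0, 0.0]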
    # VIF drop: remove the multicollinear features identified by the
    # VIF-selection script (VIF-selection-iscxids2012.py, below)
    vif_labels = open(
        f'./VIF-calculations/{os.path.splitext(subset)[0]}-VIF-removal.txt',
        'r').read().rstrip().split(',')
    print(vif_labels)
    df.drop(labels=vif_labels, axis=1, inplace=True)
    print(df.shape)
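    # Aside: the one-shot open().read() above leaves closing the file handle
    # to the garbage collector; a minimal context-manager sketch that closes
    # it deterministically (same path, hypothetical variable names):
    _vif_path = f'./VIF-calculations/{os.path.splitext(subset)[0]}-VIF-removal.txt'
    with open(_vif_path, 'r') as _fh:
        _vif_labels = _fh.read().rstrip().split(',')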
    df = df.astype(dtype='float32')
    print(df.dtypes)
    print(df.shape)
    df.drop_duplicates(subset=None, keep='first', inplace=True)
    print(df.shape)
    # Balancing df: randomly undersample the majority class down to the
    # minority-class count
    col = df.columns[-1]
    cols = df.columns[:-1]
    vc = df[col].value_counts()
    n = vc.iloc[-1]  # minority-class count
    m = vc.iloc[0]   # majority-class count
    initial_cut = df.loc[df[col] == vc.index[0]].sample(
        n=int(m - n), replace=False)
    df = df.drop(index=initial_cut.index)
    print("SHAPE AFTER BALANCING")
    print(df.shape)
    print(df[col].value_counts())
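    # Aside: the undersampling above keeps every minority row and drops a
    # random sample of (majority - minority) majority rows; a minimal sketch
    # on a toy frame (hypothetical names, illustrative values only):
    _toy = pd.DataFrame({'x': range(6), 'tag': [0., 0., 0., 0., 1., 1.]})
    _vc = _toy['tag'].value_counts()
    _cut = _toy.loc[_toy['tag'] == _vc.index[0]].sample(
        n=int(_vc.iloc[0] - _vc.iloc[-1]), replace=False)
    _balanced = _toy.drop(index=_cut.index)  # two rows per class remain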
    # Exclude one-hot / binary columns (and the label) from scaling
    iscx_scalable = list(df.columns)
    for omit_scale_iscx in (
            'app_name_AOL-ICQ', 'app_name_Anet', 'app_name_Authentication',
            'app_name_BGP', 'app_name_BitTorrent', 'app_name_Blubster',
            'app_name_Citrix', 'app_name_Common-P2P-Port', 'app_name_Common-Ports',
            'app_name_DNS', 'app_name_DNS-Port', 'app_name_FTP',
            'app_name_Filenet', 'app_name_Flowgen', 'app_name_Gnutella',
            'app_name_Google', 'app_name_Groove', 'app_name_GuptaSQLBase',
            'app_name_H.323', 'app_name_HTTPImageTransfer', 'app_name_HTTPWeb',
            'app_name_Hosts2-Ns', 'app_name_Hotline', 'app_name_ICMP',
            'app_name_IGMP', 'app_name_IMAP', 'app_name_IPSec', 'app_name_IPX',
            'app_name_IRC', 'app_name_Ingres', 'app_name_Intellex',
            'app_name_Kazaa', 'app_name_LDAP', 'app_name_MDQS', 'app_name_MGCP',
            'app_name_MS-SQL', 'app_name_MSMQ', 'app_name_MSN', 'app_name_MSN-Zone',
            'app_name_MSTerminalServices', 'app_name_ManagementServices',
            'app_name_MicrosoftMediaServer', 'app_name_Misc-DB',
            'app_name_Misc-Mail-Port', 'app_name_Misc-Ports', 'app_name_MiscApp',
            'app_name_MiscApplication', 'app_name_NETBEUI', 'app_name_NFS',
            'app_name_NNTPNews', 'app_name_NTP', 'app_name_Nessus',
            'app_name_NetBIOS-IP', 'app_name_Network-Config-Ports',
            'app_name_NortonAntiVirus', 'app_name_NortonGhost', 'app_name_OpenNap',
            'app_name_OpenWindows', 'app_name_Oracle', 'app_name_PCAnywhere',
            'app_name_POP', 'app_name_POP-port', 'app_name_PPTP',
            'app_name_PeerEnabler', 'app_name_PostgreSQL', 'app_name_Printer',
            'app_name_RPC', 'app_name_RTSP', 'app_name_Real', 'app_name_SAP',
            'app_name_SIP', 'app_name_SMS', 'app_name_SMTP', 'app_name_SNA',
            'app_name_SNMP-Ports', 'app_name_SSDP', 'app_name_SSH',
            'app_name_SSL-Shell', 'app_name_SecureWeb', 'app_name_Squid',
            'app_name_StreamingAudio', 'app_name_SunRPC', 'app_name_TFTP',
            'app_name_Tacacs', 'app_name_Telnet', 'app_name_Timbuktu',
            'app_name_TimeServer', 'app_name_Unknown_TCP', 'app_name_Unknown_UDP',
            'app_name_UpdateDaemon', 'app_name_VNC', 'app_name_Web-Port',
            'app_name_WebFileTransfer', 'app_name_WebMediaAudio',
            'app_name_WebMediaDocuments', 'app_name_WebMediaVideo',
            'app_name_Webmin', 'app_name_WindowsFileSharing', 'app_name_XFER',
            'app_name_XWindows', 'app_name_Yahoo', 'app_name_dsp3270',
            'app_name_giop-ssl', 'app_name_iChat', 'app_name_rexec',
            'app_name_rlogin', 'app_name_rsh',
            'direction_L2L', 'direction_L2R', 'direction_R2L', 'direction_R2R',
            'source_tcp_flags_f', 'source_tcp_flags_s', 'source_tcp_flags_r',
            'source_tcp_flags_p', 'source_tcp_flags_a',
            'destination_tcp_flags_f', 'destination_tcp_flags_s',
            'destination_tcp_flags_r', 'destination_tcp_flags_p',
            'destination_tcp_flags_a',
            'proto_icmp_ip', 'proto_igmp', 'proto_ip', 'proto_ipv6icmp',
            'proto_tcp_ip', 'proto_udp_ip', 'tag'):
        try:
            iscx_scalable.remove(omit_scale_iscx)
        except ValueError:
            print(f"{omit_scale_iscx} already not part of df, skipping")
    # Drop features whose variance collapsed to zero after balancing
    zero_variance_after_balance = []
    for c in df.columns[:-1]:
        if df[c].var() == 0.0:
            df = df.drop(labels=[c], axis=1)
            zero_variance_after_balance.append(c)
            print(f"Removed {c} because after balancing, its variance is 0.0")
    # Robust scaling: center on the median and scale by the IQR, skipping
    # columns whose IQR is zero
    for c in iscx_scalable:
        qs = df[c].quantile(q=[0.25, 0.50, 0.75], interpolation='linear')
        if qs[0.75] - qs[0.25] == 0.0:
            continue
        else:
            df[c] = (df[c] - qs[0.50]) / \
                (qs[0.75] - qs[0.25])
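    # Aside: this is the same (x - median) / IQR transform that
    # sklearn.preprocessing.RobustScaler applies; a minimal sketch on toy
    # data (hypothetical names, illustrative values only):
    from sklearn.preprocessing import RobustScaler
    _toy_X = np.array([[1.0], [2.0], [4.0], [100.0]])
    _toy_scaled = RobustScaler(quantile_range=(25.0, 75.0)).fit_transform(_toy_X)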
    remaining_columns = list(df.columns)
    remaining_columns.remove('tag')
    remaining_columns_count = len(remaining_columns)
    label_loc = df.columns.get_loc('tag')
    print(df.head(2))
    # Split into feature matrix X and label vector y
    array = df.values
    X = np.delete(array, label_loc, 1)
    y = array[:, label_loc]
    nested_score_mean = None
    nested_score_std = None
    # Hyperparameter search space for the ExtraTreesClassifier
    parameter_grid = {
        "n_estimators": [100, 250, 500, 1000],
        "criterion": ["gini", "entropy"],
        "max_depth": [3, 6, 9, 12, 15],
        "min_samples_split": [2],
        "min_samples_leaf": [0.0001, 0.0005, 0.001, 0.005, 0.01],
        "min_weight_fraction_leaf": [0.],
        "max_features": ["sqrt"],
        "max_leaf_nodes": [None],
        "min_impurity_decrease": [0.],
        "class_weight": [None, "balanced"],
        "ccp_alpha": [0.0],
        "max_samples": [None]
    }
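    # Aside: RandomizedSearchCV samples uniformly from lists like the ones
    # above; continuous scipy.stats distributions are also accepted. A
    # minimal sketch of that alternative (hypothetical grid, not the one
    # used here):
    from scipy.stats import randint, uniform
    _alt_grid = {"n_estimators": randint(100, 1001),       # uniform over [100, 1000]
                 "min_samples_leaf": uniform(1e-4, 1e-2)}  # uniform over [1e-4, 1e-4 + 1e-2]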
    train_sizes = [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.125, 0.15]
    test_sizes = [0.95, 0.94, 0.93, 0.92, 0.91, 0.90, 0.875, 0.85]
    inner_cv_splits = 4
    outer_cv_splits = 10
    result_dict = {}
    result_dict["dataset"] = "iscxids2012"
    result_dict["subset"] = os.path.splitext(subset)[0]
    result_dict["atk_types"] = []
    result_dict["test_sizes"] = test_sizes
    result_dict["remaining_features"] = remaining_columns
    result_dict["test_scores"] = []
    result_dict["test_score_avg"] = []
    result_dict["test_score_std"] = []
    result_dict["stack_rows"] = []
    result_dict["wilcox_descending_feature_pairs"] = []
    result_dict["wilcox_p_array"] = []
    result_dict["wilcox_p_reject_non_corrected"] = []
    result_dict["wilcox_p_array_corrected"] = []
    result_dict["wilcox_p_reject_corrected"] = []
    result_dict["zero_variance_after_balance"] = zero_variance_after_balance
    for ts_index, ts in enumerate(test_sizes):
        print(f"train / validate on {X.shape[0] * train_sizes[ts_index]}")
        print(f"test on {X.shape[0] * test_sizes[ts_index]}")
        # Nested cross-validation: the inner split tunes hyperparameters,
        # the outer split estimates generalization performance
        inner_sss = StratifiedShuffleSplit(n_splits=4, test_size=0.25)
        outer_sss = StratifiedShuffleSplit(n_splits=outer_cv_splits, test_size=ts)
        est = ExtraTreesClassifier(n_jobs=os.cpu_count())
        param_cv = RandomizedSearchCV(
            estimator=est,
            param_distributions=parameter_grid,
            cv=inner_sss,
            n_iter=50,
            scoring="balanced_accuracy",
            n_jobs=os.cpu_count(),
            verbose=0)
        nested_score = cross_validate(
            param_cv,
            X=X,
            y=y,
            cv=outer_sss,
            n_jobs=os.cpu_count(),
            scoring='balanced_accuracy',
            return_train_score=True,
            return_estimator=True,
            verbose=1)
        nested_score_mean = nested_score["test_score"].mean()
        nested_score_std = nested_score["test_score"].std()
        print("Average balanced accuracy scores from the 10 outer validation folds")
        print(nested_score_mean, " +- ", nested_score_std)
        # Collect the per-fold feature importances of the tuned estimators
        stack_raw = [nested_score["estimator"][i].best_estimator_.feature_importances_
                     for i in range(outer_cv_splits)]
        stack = np.array(stack_raw)
        print("stack shape", stack.shape)
        print(stack)
        stack = stack.transpose()
        print("transposed stack shape", stack.shape)
        print(stack)
        stack_rows = np.vsplit(stack, stack.shape[0])
        print("vsplit stacks", len(stack_rows))
        print(stack_rows)
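        # Aside: np.vsplit here yields one (1, outer_cv_splits) array per
        # feature, i.e. the importance values each feature received across
        # the outer folds; a minimal sketch on a toy array (illustrative):
        _toy_stack = np.arange(6).reshape(3, 2)  # 3 features x 2 folds
        _toy_rows = np.vsplit(_toy_stack, 3)     # [[[0, 1]], [[2, 3]], [[4, 5]]]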
        # Friedman test: do the features' importance distributions differ at all?
        friedman, p_friedman = friedmanchisquare(*stack_rows)
        print('stat=%.3f, p=%.6f' % (friedman, p_friedman))
        if p_friedman > 0.05:
            print('Probably the same distribution')
        else:
            print('Probably different distributions')
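        # Aside: friedmanchisquare takes one equal-length sample per group
        # (at least three groups) and tests the null hypothesis that all
        # groups share a distribution; a minimal sketch on toy samples
        # (illustrative values only):
        _f_stat, _f_p = friedmanchisquare([1, 2, 3, 4], [2, 3, 4, 5], [1, 3, 2, 6])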
        # Average feature importance across the outer folds
        av = np.zeros(remaining_columns_count)
        for i in range(outer_cv_splits):
            av = av + \
                nested_score["estimator"][i].best_estimator_.feature_importances_
        av = av / outer_cv_splits
        wilcox_p_array = np.array([])
        wilcox_p_reject = np.array([])
        av_sorted = np.sort(av)
        wilcox_descending_feature_pairs = np.array([])
        # Walk the features in descending order of average importance and
        # test each adjacent pair with a one-sided Wilcoxon signed-rank test
        for i in range(1, remaining_columns_count):
            high_index = np.where(av == av_sorted[-i])[0]
            low_index = np.where(av == av_sorted[-(i + 1)])[0]
            if len(high_index) == 1 and len(low_index) == 1:
                # print("high feature under consideration:", remaining_columns[high_index[0]], "importance_avg", av[high_index[0]], "importances", stack_rows[high_index[0]])
                # print("low feature under consideration:", remaining_columns[low_index[0]], "importance_avg", av[low_index[0]], "importances", stack_rows[low_index[0]])
                print("high feature avg contrib", remaining_columns[high_index[0]], ": ", av[high_index[0]])
                print("low feature avg contrib", remaining_columns[low_index[0]], ": ", av[low_index[0]])
                wilcox_descending_feature_pairs = np.append(
                    wilcox_descending_feature_pairs,
                    np.array([remaining_columns[high_index[0]],
                              remaining_columns[low_index[0]]]))
                wilcox, p_wilcox = wilcoxon(
                    stack_rows[high_index[0]].flatten(),
                    stack_rows[low_index[0]].flatten(),
                    alternative='greater')
                wilcox_p_array = np.append(wilcox_p_array, p_wilcox)
                print('1-SIDE wilcox stat=%.3f, p=%.6f' % (wilcox, p_wilcox))
                if p_wilcox > 0.05:
                    print("don't reject h0")
                    wilcox_p_reject = np.append(wilcox_p_reject, False)
                else:
                    print("reject h0")
                    wilcox_p_reject = np.append(wilcox_p_reject, True)
            else:
                print("average feature importances at low indices", av[low_index])
                print("average feature importances at high indices", av[high_index])
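        # Aside: with alternative='greater', wilcoxon tests the one-sided
        # hypothesis that the paired differences x - y are shifted above
        # zero; a minimal sketch on toy paired samples (illustrative only):
        _w_stat, _w_p = wilcoxon([0.5, 0.61, 0.72, 0.83, 0.94],
                                 [0.1, 0.2, 0.3, 0.4, 0.5],
                                 alternative='greater')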
        # Benjamini-Hochberg correction of the pairwise Wilcoxon p-values
        rejected, pvalue_corrected = fdrcorrection(
            pvals=wilcox_p_array, alpha=0.05, method='indep')
        print(len(wilcox_p_array), wilcox_p_array)
        print(len(wilcox_p_reject), wilcox_p_reject)
        print(len(rejected), rejected)
        print(len(pvalue_corrected), pvalue_corrected)
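        # Aside: fdrcorrection with method='indep' is the Benjamini-Hochberg
        # procedure; it returns a reject mask and the adjusted p-values. A
        # minimal sketch on toy p-values (illustrative values only):
        _rej, _p_adj = fdrcorrection(pvals=np.array([0.001, 0.02, 0.04, 0.30]),
                                     alpha=0.05, method='indep')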
        result_dict["test_scores"].append(nested_score["test_score"].tolist())
        result_dict["test_score_avg"].append(nested_score_mean)
        result_dict["test_score_std"].append(nested_score_std)
        result_dict["stack_rows"].append([x.tolist() for x in stack_rows])
        result_dict["wilcox_descending_feature_pairs"].append(
            wilcox_descending_feature_pairs.tolist())
        result_dict["wilcox_p_array"].append(wilcox_p_array.tolist())
        result_dict["wilcox_p_reject_non_corrected"].append(wilcox_p_reject.tolist())
        result_dict["wilcox_p_array_corrected"].append(pvalue_corrected.tolist())
        result_dict["wilcox_p_reject_corrected"].append(rejected.tolist())
    # for i in range(outer_cv_splits):
    #     plt.plot(-np.sort(-nested_score["estimator"][i].best_estimator_.feature_importances_), linewidth=0.25)
    # plt.plot(-np.sort(-av), 'b-')
    # tree.plot_tree(nested_score["estimator"][0].best_estimator_.estimators_[0])
    # plt.show()
    with open(f"./HFBR-calculations/{os.path.splitext(subset)[0]}-HFBR-calculations.json",
              "w", encoding='utf-8') as outfile:
        json.dump(result_dict, outfile, indent=4)
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedMonJun14Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,app_name_HTTPWeb,source_tcp_flags_a,destination_tcp_flags_p,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedSatJun12Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,source_tcp_flags_a,destination_tcp_flags_p,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_HTTPImageTransfer,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedSunJun13Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,source_tcp_flags_a,destination_tcp_flags_p,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_HTTPImageTransfer,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedThuJun17-1Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,source_tcp_flags_a,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_HTTPImageTransfer,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedThuJun17-2Flows-VIF-removal.txt (new file, mode 100644)
source_tcp_flags_a,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,direction_R2L,app_name_HTTPImageTransfer,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedThuJun17-3Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,app_name_HTTPWeb,source_tcp_flags_a,destination_tcp_flags_p,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedTueJun15-1Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,app_name_HTTPWeb,source_tcp_flags_a,destination_tcp_flags_p,source_tcp_flags_p,proto_udp_ip,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedTueJun15-2Flows-VIF-removal.txt (new file, mode 100644)
app_name_HTTPWeb,source_tcp_flags_a,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,app_name_Unknown_UDP,direction_L2L,destination_tcp_flags_a,app_name_HTTPImageTransfer,app_name_ICMP,total_source_bytes,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedTueJun15-3Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,app_name_HTTPWeb,source_tcp_flags_a,destination_tcp_flags_p,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,proto_icmp_ip,direction_L2L,destination_tcp_flags_a,app_name_HTTPImageTransfer,app_name_ICMP,total_source_bytes,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedWedJun16-1Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,source_tcp_flags_a,destination_tcp_flags_p,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_HTTPImageTransfer,direction_L2R,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedWedJun16-2Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,source_tcp_flags_a,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_HTTPImageTransfer,app_name_ICMP,total_source_bytes,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/TestbedWedJun16-3Flows-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,source_tcp_flags_a,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,direction_L2L,destination_tcp_flags_a,app_name_HTTPImageTransfer,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-calculations/iscx-ids2012-VIF-removal.txt (new file, mode 100644)
app_name_IGMP,app_name_HTTPWeb,source_tcp_flags_a,source_tcp_flags_p,proto_udp_ip,app_name_DNS,total_destination_packets,destination_tcp_flags_s,destination_tcp_flags_a,direction_L2R,app_name_ICMP,proto_tcp_ip,total_source_packets
iscx-ids2012/HFBRs-feature-selection/VIF-selection-iscxids2012.py (new file, mode 100644)
import cuml
import cudf
import cupy
from cuml.model_selection import train_test_split
from cuml.linear_model import LinearRegression as cuLR
import os

df = None
d = "../../iscx-ids2012/clean"
for subset in [f for f in os.listdir(d) if f.endswith('.parquet')]:
    full_path = os.path.join(d, subset)
    print(f"Variance Inflation Factor calculations for {full_path}")
    df = cudf.read_parquet(full_path, ignore_index=True)
    df = df.drop(labels=['tag'], axis=1)
    # Standardize each column to zero mean and unit variance, skipping
    # constant columns
    for c in df.columns:
        sig = df[c].std()
        if sig == 0.0:
            continue
        else:
            df[c] = (df[c] - df[c].mean()) / sig
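    # Aside: the loop above is a plain z-score; a minimal vectorized sketch
    # on a toy frame, assuming cudf's pandas-like column broadcasting
    # (hypothetical names, illustrative values only):
    _toy = cudf.DataFrame({'a': [1.0, 2.0, 3.0]})
    _toy_std = (_toy - _toy.mean()) / _toy.std()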
    df = df.astype(dtype=cupy.float32, copy=False)
    cutoff = 5.0  # conventional VIF threshold
    removed_features = set()
    complete = False
    # Iteratively drop the feature with the largest Variance Inflation
    # Factor until every remaining feature has VIF <= 5
    while not complete:
        biggest_vif = 5.0
        biggest_vif_feature = None
        dropped_one_early = False
        for i, col_under_test in enumerate(df.columns):
            # Regress the candidate column on all other columns; its VIF is
            # 1 / (1 - R^2) of that regression
            X_reg_train, _, y_reg_train, _ = train_test_split(
                df.drop(columns=[col_under_test]),
                df[col_under_test],
                train_size=1.0)
            cuml_reg_model = cuLR(fit_intercept=True, normalize=False, algorithm='eig')
            trained_LR = cuml_reg_model.fit(X_reg_train, y_reg_train)
            cu_preds = trained_LR.predict(X_reg_train)
            cu_r2 = cuml.metrics.r2_score(y_reg_train, cu_preds)
            if cu_r2 == 1.0:
                # A perfect fit means infinite VIF: drop immediately
                df = df.drop(labels=[col_under_test], axis=1)
                dropped_one_early = True
                removed_features.add(col_under_test)
                print('dropped one early', col_under_test, 'vif = +inf')
                break
            elif 1 / (1 - cu_r2) > biggest_vif:
                biggest_vif = 1 / (1 - cu_r2)
                biggest_vif_feature = col_under_test
            # print(f"{i}.{col_under_test} cuml's r2 score : ", cu_r2)
        removed_features.add(biggest_vif_feature)
        print('biggest vif feature', biggest_vif_feature, biggest_vif)
        if dropped_one_early:
            continue
        elif biggest_vif_feature is not None:
            df = df.drop(labels=[biggest_vif_feature], axis=1)
        else:
            complete = True
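    # Aside: VIF_j = 1 / (1 - R_j^2), where R_j^2 is from regressing feature
    # j on all other features. statsmodels ships the same computation; a
    # minimal CPU sketch (hypothetical names, random data, and it assumes a
    # constant column is added so the regressions get a proper intercept):
    import numpy as _np
    import statsmodels.api as _sm
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    _X = _sm.add_constant(_np.random.rand(100, 3))
    _vifs = [variance_inflation_factor(_X, _j) for _j in range(1, _X.shape[1])]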
    print(df.shape)
    print(df.columns)
    print("Removed")
    print(removed_features)
    # The terminating iteration adds a None placeholder; discard it before writing
    removed_features.remove(None)
    with open(f'VIF-calculations/{os.path.splitext(subset)[0]}-VIF-removal.txt',
              'a') as fd:
        removed_features = ','.join(removed_features) + '\n'
        fd.write(removed_features)