Laurens D'hooge / clean-ids-collection / Commits

Commit 67e5fea3, authored Apr 10, 2022 by Laurens D'hooge
updated preproc code for dirty-metadata versions
parent 1ee471eb
Changes: 7 files
cic_data_cleaning.py
@@ -13,11 +13,11 @@ random.seed(0)
 drop_columns = [
     # Dataset Specific Information
     "Flow ID",
-    "Source IP", "Src IP",
-    "Source Port", "Src Port",
-    "Destination IP", "Dst IP",
-    "Destination Port", "Dst Port",
-    "Protocol", "Timestamp",
+    # "Source IP", "Src IP",
+    # "Source Port", "Src Port",
+    # "Destination IP", "Dst IP",
+    # "Destination Port", "Dst Port",
+    # "Protocol", "Timestamp",
     # Features Without Observed Variance
     "Bwd PSH Flags",
     "Fwd URG Flags",
@@ -68,6 +68,12 @@ drop_columns = [
 # ]
 mapper = {
+    # Only needed for dirty-with-metadata
+    'Src IP': 'Source IP',
+    'Dst IP': 'Destination IP',
+    'Src Port': 'Source Port',
+    'Dst Port': 'Destination Port',
     'Tot Fwd Pkts': 'Total Fwd Packets',
     'Tot Bwd Pkts': 'Total Backward Packets',
     'TotLen Fwd Pkts': 'Fwd Packets Length Total',
@@ -125,7 +131,15 @@ mapper = {
     'Fwd Act Data Pkts': 'Fwd Act Data Packets',
     'act_data_pkt_fwd': 'Fwd Act Data Packets',
     'Fwd Seg Size Min': 'Fwd Seg Size Min',
-    'min_seg_size_forward': 'Fwd Seg Size Min'
+    'min_seg_size_forward': 'Fwd Seg Size Min',
+    # 'Active Mean': 'Active Mean',
+    # 'Active Std': 'Active Std',
+    # 'Active Max': 'Active Max',
+    # 'Active Min': 'Active Min',
+    # 'Idle Mean': 'Idle Mean',
+    # 'Idle Std': 'Idle Std',
+    # 'Idle Max': 'Idle Max',
+    # 'Idle Min': 'Idle Min',
 }
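The mapper above normalizes the short CICFlowMeter column names to their long equivalents. How the dict is applied is not visible in this excerpt, but the intended effect is an ordinary DataFrame.rename; a minimal sketch with a hypothetical one-row frame and two of the entries:

    import pandas as pd

    # hypothetical frame using the short column names
    df = pd.DataFrame({'Tot Fwd Pkts': [3], 'Tot Bwd Pkts': [2]})
    mapper = {'Tot Fwd Pkts': 'Total Fwd Packets', 'Tot Bwd Pkts': 'Total Backward Packets'}
    df = df.rename(columns=mapper)   # columns missing from the dict are left untouched
    print(df.columns.tolist())       # ['Total Fwd Packets', 'Total Backward Packets']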
@@ -157,9 +171,9 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
         df[int_col] = df[int_col].apply(pd.to_numeric, errors='coerce', downcast='integer')
         float_col = df.select_dtypes(include='float').columns
         df[float_col] = df[float_col].apply(pd.to_numeric, errors='coerce', downcast='float')
-        obj_col = df.select_dtypes(include='object').columns
-        print(f'Columns with dtype == object: {obj_col}')
-        df[obj_col] = df[obj_col].apply(pd.to_numeric, errors='coerce')
+        # obj_col = df.select_dtypes(include='object').columns
+        # print(f'Columns with dtype == object: {obj_col}')
+        # df[obj_col] = df[obj_col].apply(pd.to_numeric, errors='coerce')
         # Drop rows with invalid data
         df.replace([np.inf, -np.inf], np.nan, inplace=True)
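As background, pd.to_numeric with errors='coerce' turns unparsable entries into NaN, and downcast shrinks columns to the smallest dtype that can hold the values; a small illustration with made-up data:

    import pandas as pd

    clean = pd.Series(['1', '2', '3'])
    print(pd.to_numeric(clean, errors='coerce', downcast='integer').dtype)   # int8

    dirty = pd.Series(['1', '2', 'oops'])
    print(pd.to_numeric(dirty, errors='coerce', downcast='integer').dtype)   # float64: the coerced NaN blocks the integer downcast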
@@ -167,8 +181,7 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
         df.dropna(inplace=True)
         # Drop duplicate rows
-        df.drop_duplicates(inplace=True, subset=df.columns.difference(['Label', 'Timestamp']))
+        df.drop_duplicates(inplace=True, subset=df.columns.difference(['Label']))
         print(df["Label"].value_counts())
         print(f"shape: {df.shape}\n")
@@ -181,18 +194,18 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
         # Save to file
         if 'feather' in filetypes:
             f = file.replace('.csv', '.feather')
-            df.to_feather(f'{dataset}/clean/{f}')
+            df.to_feather(f'{dataset}/dirty-with-metadata/{f}')
         if 'parquet' in filetypes:
             f = file.replace('.csv', '.parquet')
-            df.to_parquet(f'{dataset}/clean/{f}', index=False)
+            df.to_parquet(f'{dataset}/dirty-with-metadata/{f}', index=False)
         if 'csv' in filetypes:
-            df.to_csv(f'{dataset}/clean/{file}', sep=',', index=False, encoding='utf-8')
+            df.to_csv(f'{dataset}/dirty-with-metadata/{file}', sep=',', index=False, encoding='utf-8')


 def aggregate_data(dataset, save=True, filetype='feather'):
     # Will search for all files in the 'clean' directory of the correct filetype and aggregate them
     all_data = pd.DataFrame()
-    for file in glob.glob(f'{dataset}/clean/*.{filetype}'):
+    for file in glob.glob(f'{dataset}/dirty-with-metadata/*.{filetype}'):
         print(file)
         df = pd.DataFrame()
         if filetype == 'feather':
@@ -216,33 +229,33 @@ def aggregate_data(dataset, save=True, filetype='feather'):
     malicious = all_data[all_data.Label != 'Benign'].reset_index(drop=True)
     benign = all_data[all_data.Label == 'Benign'].reset_index(drop=True)
     if filetype == 'feather':
-        all_data.to_feather(f'{dataset}/clean/all_data.feather')
-        malicious.to_feather(f'{dataset}/clean/all_malicious.feather')
-        benign.to_feather(f'{dataset}/clean/all_benign.feather')
+        all_data.to_feather(f'{dataset}/dirty-with-metadata/all_data.feather')
+        malicious.to_feather(f'{dataset}/dirty-with-metadata/all_malicious.feather')
+        benign.to_feather(f'{dataset}/dirty-with-metadata/all_benign.feather')
     if filetype == 'parquet':
-        all_data.to_parquet(f'{dataset}/clean/all_data.parquet', index=False)
+        all_data.to_parquet(f'{dataset}/dirty-with-metadata/all_data.parquet', index=False)
-        malicious.to_parquet(f'{dataset}/clean/all_malicious.parquet', index=False)
+        malicious.to_parquet(f'{dataset}/dirty-with-metadata/all_malicious.parquet', index=False)
-        benign.to_parquet(f'{dataset}/clean/all_benign.parquet', index=False)
+        benign.to_parquet(f'{dataset}/dirty-with-metadata/all_benign.parquet', index=False)


 def check_dims_type(dataset):
-    for file in os.listdir(f'{dataset}/clean'):
+    for file in os.listdir(f'{dataset}/dirty-with-metadata'):
         print(f"------- {file} -------")
         if file.endswith('.csv'):
-            df = pd.read_csv(f'{dataset}/clean/{file}')
+            df = pd.read_csv(f'{dataset}/dirty-with-metadata/{file}')
             print(df.shape)
             print(df.head(1))
             # [print(i) for i in df.dtypes]
         elif file.endswith('.parquet'):
-            df = pd.read_parquet(f'{dataset}/clean/{file}')
+            df = pd.read_parquet(f'{dataset}/dirty-with-metadata/{file}')
             print(df.shape)
             print(df.head(1))
             # [print(i) for i in df.dtypes]
         elif file.endswith('.feather'):
-            df = pd.read_feather(f'{dataset}/clean/{file}')
+            df = pd.read_feather(f'{dataset}/dirty-with-metadata/{file}')
             print(df.shape)
             print(df.head(1))
             # [print(i) for i in df.dtypes]
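A note on the reset_index(drop=True) calls before the feather writes above: pandas' Feather writer only accepts a default RangeIndex, so the boolean-filtered frames have to be reindexed first. A tiny sketch of the failure mode, assuming pyarrow is installed:

    import pandas as pd

    df = pd.DataFrame({'Label': ['Benign', 'Bot', 'Bot']})
    malicious = df[df.Label != 'Benign']              # index is now [1, 2], not a default RangeIndex
    # malicious.to_feather('all_malicious.feather')   # raises ValueError because of the non-default index
    malicious.reset_index(drop=True).to_feather('all_malicious.feather')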
cidds_data_cleaning.py
@@ -26,7 +26,7 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
                            'attackdescription': 'attack_description'}, inplace=True)
-        df = df.drop(labels=['date_first_seen', 'src_ip_addr', 'src_pt', 'dst_ip_addr', 'dst_pt', 'attack_description'], axis=1)
+        # df = df.drop(labels=['date_first_seen', 'src_ip_addr', 'src_pt', 'dst_ip_addr', 'dst_pt', 'attack_description'], axis=1)
         proto_idx = df.columns.get_loc('proto')
         df['proto'] = df['proto'].str.strip()
@@ -95,12 +95,12 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
         # Save to file
         if 'feather' in filetypes:
             f = file.replace('.csv', '.feather')
-            df.to_feather(f'{dataset.replace("original", "clean")}/{f}')
+            df.to_feather(f'{dataset.replace("original", "dirty-with-metadata")}/{f}')
         if 'parquet' in filetypes:
             f = file.replace('.csv', '.parquet')
-            df.to_parquet(f'{dataset.replace("original", "clean")}/{f}', index=False)
+            df.to_parquet(f'{dataset.replace("original", "dirty-with-metadata")}/{f}', index=False)
         if 'csv' in filetypes:
-            df.to_csv(f'{dataset.replace("original", "clean")}/{file}', sep=',', index=False, encoding='utf-8')
+            df.to_csv(f'{dataset.replace("original", "dirty-with-metadata")}/{file}', sep=',', index=False, encoding='utf-8')


 def check_dims_type(dataset):
     for file in listdir(f'{dataset}'):
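These writers derive the output directory from the input path with a plain string replace, so the target folder has to exist beforehand (the comments in the __main__ block below describe the same convention for 'clean'). A small worked example, with a hypothetical input file name:

    dataset = 'cidds-001/original/traffic/ExternalServer'
    file = 'week1.csv'   # hypothetical input file name
    target = f'{dataset.replace("original", "dirty-with-metadata")}/{file.replace(".csv", ".parquet")}'
    print(target)        # cidds-001/dirty-with-metadata/traffic/ExternalServer/week1.parquet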
@@ -124,9 +124,9 @@ def check_dims_type(dataset):
 if __name__ == "__main__":
     # Adjust for cleaning the correct dataset into the desired format
     # Needs directory with dataset name containing empty dir 'clean' and dir 'original' containing the CSVs
-    # clean_dataset('cidds-001/original/traffic/ExternalServer', filetypes=['feather', 'parquet', 'csv'])
-    # clean_dataset('cidds-001/original/traffic/OpenStack', filetypes=['feather', 'parquet', 'csv'])
-    # clean_dataset('cidds-002/original/traffic', filetypes=['feather', 'parquet', 'csv'])
-    check_dims_type('cidds-001/clean/traffic/ExternalServer')
-    check_dims_type('cidds-001/clean/traffic/OpenStack')
-    check_dims_type('cidds-002/clean/traffic')
+    clean_dataset('cidds-001/original/traffic/ExternalServer', filetypes=['feather', 'parquet'])
+    clean_dataset('cidds-001/original/traffic/OpenStack', filetypes=['feather', 'parquet'])
+    clean_dataset('cidds-002/original/traffic', filetypes=['feather', 'parquet'])
+    # check_dims_type('cidds-001/clean/traffic/ExternalServer')
+    # check_dims_type('cidds-001/clean/traffic/OpenStack')
+    # check_dims_type('cidds-002/clean/traffic')
\ No newline at end of file
ctu_data_cleaning.py
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 100)
import numpy as np
from os import listdir
@@ -14,8 +14,19 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
         df = df.rename(str.lower, axis='columns')
         df = df.rename(str.strip, axis='columns')
-        df = df.drop(labels=['starttime', 'srcaddr', 'sport', 'dstaddr', 'dport'], axis=1)
+        # df = df.drop(labels=['starttime', 'srcaddr', 'sport', 'dstaddr', 'dport'], axis=1)
+        df = df.drop(labels=['starttime'], axis=1)
+        df.drop(index=df[df['sport'].isna()].index, inplace=True)
+        matching = df['sport'].str.startswith('0x')
+        df.loc[matching, 'sport'] = df.loc[matching, 'sport'].apply(int, base=16)
+        df['sport'] = df['sport'].astype('int32')
+        df.drop(index=df[df['dport'].isna()].index, inplace=True)
+        matching = df['dport'].str.startswith('0x')
+        df.loc[matching, 'dport'] = df.loc[matching, 'dport'].apply(int, base=16)
+        df['dport'] = df['dport'].astype('int32')
         df.rename(columns={'totpkts': 'tot_pkts', 'totbytes': 'tot_bytes',
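The startswith('0x') handling exists because some port fields in these CSVs appear as hexadecimal strings, which int(x, base=16) converts before the int32 cast; a short sketch with made-up values:

    import pandas as pd

    sport = pd.Series(['443', '0x000b', '80'])
    is_hex = sport.str.startswith('0x')
    sport.loc[is_hex] = sport.loc[is_hex].apply(int, base=16)   # '0x000b' -> 11
    sport = sport.astype('int32')
    print(sport.tolist())                                       # [443, 11, 80]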
@@ -58,12 +69,12 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
         # Save to file
         if 'feather' in filetypes:
             f = file.replace('.csv', '.feather')
-            df.to_feather(f'{dataset.replace("original", "clean")}/{f}')
+            df.to_feather(f'{dataset.replace("original", "dirty-with-metadata")}/{f}')
         if 'parquet' in filetypes:
             f = file.replace('.csv', '.parquet')
-            df.to_parquet(f'{dataset.replace("original", "clean")}/{f}', index=False)
+            df.to_parquet(f'{dataset.replace("original", "dirty-with-metadata")}/{f}', index=False)
         if 'csv' in filetypes:
-            df.to_csv(f'{dataset.replace("original", "clean")}/{file}', sep=',', index=False, encoding='utf-8')
+            df.to_csv(f'{dataset.replace("original", "dirty-with-metadata")}/{file}', sep=',', index=False, encoding='utf-8')


 def check_dims_type(dataset):
     for file in listdir(f'{dataset}'):
@@ -88,7 +99,7 @@ if __name__ == "__main__":
     # Adjust for cleaning the correct dataset into the desired format
     # Needs directory with dataset name containing empty dir 'clean' and dir 'original' containing the CSVs
     for i in range(1, 14):
-        clean_dataset(f'ctu-13/original/{i}', filetypes=['feather', 'parquet', 'csv'])
-    clean_dataset(f'ctu-13/original/all', filetypes=['feather', 'parquet', 'csv'])
+        clean_dataset(f'ctu-13/original/{i}', filetypes=['feather', 'parquet'])
+    clean_dataset(f'ctu-13/original/all', filetypes=['feather', 'parquet'])
     # check_dims_type('ctu-13/clean/all')
\ No newline at end of file
iscxids_data_cleaning.py
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 50)
import numpy as np
import csv
from os import listdir
@@ -28,27 +28,28 @@ header = ['app_name',
           'tag']


 def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
-    for file in [x for x in listdir(f'{dataset}') if (x.endswith('.csv') and x in ('iscx-ids2012.csv', 'TestbedWedJun16-3Flows.csv'))]:
+    for file in [x for x in listdir(f'{dataset}') if x.endswith('.csv')]:
         print(f"------- {file} -------")
         df = pd.read_csv(f'{dataset}/{file}', low_memory=True, encoding='utf-8', quotechar="'", quoting=csv.QUOTE_ALL, skiprows=0)
-        for c in ['source_payload_as_base64', 'source_payload_as_UTF', 'destination_payload_as_base64', 'destination_payload_as_UTF', 'start_date_time', 'stop_date_time', 'source', 'destination', 'source_port', 'destination_port']:
+        for c in ['source_payload_as_base64', 'source_payload_as_UTF', 'destination_payload_as_base64', 'destination_payload_as_UTF', 'start_date_time', 'stop_date_time']:  # , 'source', 'destination', 'source_port', 'destination_port']:
             try:
                 df = df.drop(labels=[c], axis=1)
             except:
                 pass
         df = df.rename(str.lower, axis='columns')
         df = df.rename(str.strip, axis='columns')
+        df['source_port'] = df['source_port'].astype('int32')
+        df['destination_port'] = df['destination_port'].astype('int32')
         app_idx = df.columns.get_loc('app_name')
         app_dir_data = pd.get_dummies(df['app_name'], prefix='app_name', dtype=np.int8)
         for i, c in enumerate(app_dir_data.columns):
             df.insert(loc=app_idx+i, column=c, value=app_dir_data[c])
-        df = df.drop(labels=['app_name'], axis=1)
         print(df.shape)
+        df = df.drop(labels=['app_name'], axis=1)
         for c in ['source_tcp_flags_description', 'destination_tcp_flags_description']:
             df[c] = df[c].str.strip()
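The get_dummies/insert/drop pattern used for app_name here (and for the protocol, service and flag columns in the other cleaning scripts) one-hot encodes a categorical column while keeping the indicator columns at the original column position; a compact sketch with a hypothetical two-row frame:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'app_name': ['HTTPWeb', 'DNS'], 'total_bytes': [10, 20]})
    app_idx = df.columns.get_loc('app_name')
    dummies = pd.get_dummies(df['app_name'], prefix='app_name', dtype=np.int8)
    for i, c in enumerate(dummies.columns):
        df.insert(loc=app_idx + i, column=c, value=dummies[c])
    df = df.drop(labels=['app_name'], axis=1)
    print(df.columns.tolist())   # ['app_name_DNS', 'app_name_HTTPWeb', 'total_bytes']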
@@ -99,15 +100,13 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
             df.insert(loc=proto_idx+i, column=c, value=ohe_proto_data[c])
         df = df.drop(labels=['protocol_name'], axis=1)
         print(df.head(1))
-        for c in df.columns:
-            print('---', c, '---')
-            print(df[c].value_counts())
+        # for c in df.columns:
+        #     print('---', c, '---')
+        #     print(df[c].value_counts())
         # Drop rows with infinite values
         df.replace([np.inf, -np.inf], np.nan, inplace=True)
-        print("N/A rows after preproc", df.isna().any(axis=1).sum())
-        print(df.isna().sum())
+        print("N/A rows after preproc", df.isna().any(axis=1).sum())
         print(df.shape)
         df.dropna(inplace=True)
@@ -117,8 +116,8 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
         df.drop_duplicates(inplace=True)
         print(df.shape)
-        for c in df.columns[:-1]:
-            df[c] = pd.to_numeric(df[c].values, errors='raise', downcast='integer')
+        # for c in df.columns[:-1]:
+        #     df[c] = pd.to_numeric(df[c].values, errors='raise', downcast='integer')
         # Reset index
         df.reset_index(inplace=True, drop=True)
@@ -126,12 +125,12 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
         # Save to file
         if 'feather' in filetypes:
             f = file.replace('.csv', '.feather')
-            df.to_feather(f'{dataset.replace("original", "clean")}/{f}')
+            df.to_feather(f'{dataset.replace("original", "dirty-with-metadata")}/{f}')
         if 'parquet' in filetypes:
             f = file.replace('.csv', '.parquet')
-            df.to_parquet(f'{dataset.replace("original", "clean")}/{f}', index=False)
+            df.to_parquet(f'{dataset.replace("original", "dirty-with-metadata")}/{f}', index=False)
         if 'csv' in filetypes:
-            df.to_csv(f'{dataset.replace("original", "clean")}/{file}', sep=',', index=False, encoding='utf-8')
+            df.to_csv(f'{dataset.replace("original", "dirty-with-metadata")}/{file}', sep=',', index=False, encoding='utf-8')


 def check_dims_type(dataset):
     for file in [x for x in listdir(f'{dataset}')]:
@@ -155,9 +154,5 @@ def check_dims_type(dataset):
 if __name__ == "__main__":
     # Adjust for cleaning the correct dataset into the desired format
     # Needs directory with dataset name containing empty dir 'clean' and dir 'original' containing the CSVs
-    # clean_dataset(f'iscx-ids2012/original', filetypes=['feather', 'parquet', 'csv'])
-    # check_dims_type('iscx-ids2012/clean')
-    total = pd.read_parquet('iscx-ids2012/clean/iscx-ids2012.parquet')
-    print(total.shape)
-    print(total.dtypes)
-    print(total['tag'].value_counts())
+    clean_dataset(f'iscx-ids2012/original', filetypes=['feather', 'parquet'])
+    # check_dims_type('iscx-ids2012/clean')
\ No newline at end of file
nslkdd_data_cleaning.py
@@ -19,27 +19,27 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
     df = df.rename(str.lower, axis='columns')
     df = df.rename(str.strip, axis='columns')
-    df = df.drop(labels=['classnum'], axis=1)
+    df = df.drop(labels=['classnum', 'service'], axis=1)
-    proto_idx = df.columns.get_loc('protocol_type')
-    ohe_proto_data = pd.get_dummies(df['protocol_type'], prefix='proto', dtype=np.int8)
-    for i, c in enumerate(ohe_proto_data.columns):
-        df.insert(loc=proto_idx+i, column=c, value=ohe_proto_data[c])
-    df = df.drop(labels=['protocol_type'], axis=1)
+    # proto_idx = df.columns.get_loc('protocol_type')
+    # ohe_proto_data = pd.get_dummies(df['protocol_type'], prefix='proto', dtype=np.int8)
+    # for i, c in enumerate(ohe_proto_data.columns):
+    #     df.insert(loc=proto_idx+i, column=c, value=ohe_proto_data[c])
+    # df = df.drop(labels=['protocol_type'], axis=1)
-    srv_idx = df.columns.get_loc('service')
-    ohe_srv_data = pd.get_dummies(df['service'], prefix='service', dtype=np.int8)
+    # srv_idx = df.columns.get_loc('service')
+    # ohe_srv_data = pd.get_dummies(df['service'], prefix='service', dtype=np.int8)
-    for i, c in enumerate(ohe_srv_data.columns):
-        df.insert(loc=srv_idx+i, column=c, value=ohe_srv_data[c])
-    df = df.drop(labels=['service'], axis=1)
+    # for i, c in enumerate(ohe_srv_data.columns):
+    #     df.insert(loc=srv_idx+i, column=c, value=ohe_srv_data[c])
+    # df = df.drop(labels=['service'], axis=1)
-    flag_idx = df.columns.get_loc('flag')
-    ohe_flag_data = pd.get_dummies(df['flag'], prefix='flag', dtype=np.int8)
+    # flag_idx = df.columns.get_loc('flag')
+    # ohe_flag_data = pd.get_dummies(df['flag'], prefix='flag', dtype=np.int8)
-    for i, c in enumerate(ohe_flag_data.columns):
-        df.insert(loc=flag_idx+i, column=c, value=ohe_flag_data[c])
-    df = df.drop(labels=['flag'], axis=1)
+    # for i, c in enumerate(ohe_flag_data.columns):
+    #     df.insert(loc=flag_idx+i, column=c, value=ohe_flag_data[c])
+    # df = df.drop(labels=['flag'], axis=1)
     # Drop rows with infinite values
     df.replace([np.inf, -np.inf], np.nan, inplace=True)
@@ -52,7 +52,7 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
     print("Dropping duplicate rows")
     df.drop_duplicates(inplace=True)
     print(df.shape)
     print(df.shape)
     df_train = df.loc[df['subset'] == 'train']
     df_train.drop(labels=['subset'], axis=1, inplace=True)
@@ -82,7 +82,7 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
     if 'csv' in filetypes:
         f1 = file_train.replace('original', 'clean')
         df_train.to_csv(f1, sep=',', index=False, encoding='utf-8')
         f2 = file_test.replace('original', 'clean')
         df_test.to_csv(f2, sep=',', index=False, encoding='utf-8')


 def check_dims_type(dataset):
@@ -108,5 +108,5 @@ if __name__ == "__main__":
     # Adjust for cleaning the correct dataset into the desired format
     # Needs directory with dataset name containing empty dir 'clean' and dir 'original' containing the CSVs
     clean_dataset(f'nsl-kdd/original', filetypes=['feather', 'parquet', 'csv'])
-    check_dims_type('nsl-kdd/clean')
+    # check_dims_type('nsl-kdd/clean')
\ No newline at end of file
nslkdd_data_cleaning_with_metadata.py (new file, 0 → 100644)
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from os import listdir


def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
    file_train = dataset + '/KDDTrain.csv'
    file_test = dataset + '/KDDTest.csv'
    print(f'Temporarily merging {file_train} and {file_test} to end up with a consistent set of features')
    df_train = pd.read_csv(f'{file_train}', low_memory=True, encoding='utf-8')
    df_train['subset'] = 'train'
    df_test = pd.read_csv(f'{file_test}', low_memory=True, encoding='utf-8')
    df_test['subset'] = 'test'
    df = pd.concat(objs=[df_train, df_test], ignore_index=True, copy=False, sort=False)
    df = df.rename(str.lower, axis='columns')
    df = df.rename(str.strip, axis='columns')
    df = df.drop(labels=['classnum'], axis=1)
    # proto_idx = df.columns.get_loc('protocol_type')
    # ohe_proto_data = pd.get_dummies(df['protocol_type'], prefix='proto', dtype=np.int8)
    # for i, c in enumerate(ohe_proto_data.columns):
    #     df.insert(loc=proto_idx+i, column=c, value=ohe_proto_data[c])
    # df = df.drop(labels=['protocol_type'], axis=1)
    # srv_idx = df.columns.get_loc('service')
    # ohe_srv_data = pd.get_dummies(df['service'], prefix='service', dtype=np.int8)
    # for i, c in enumerate(ohe_srv_data.columns):
    #     df.insert(loc=srv_idx+i, column=c, value=ohe_srv_data[c])
    # df = df.drop(labels=['service'], axis=1)
    # flag_idx = df.columns.get_loc('flag')
    # ohe_flag_data = pd.get_dummies(df['flag'], prefix='flag', dtype=np.int8)
    # for i, c in enumerate(ohe_flag_data.columns):
    #     df.insert(loc=flag_idx+i, column=c, value=ohe_flag_data[c])
    # df = df.drop(labels=['flag'], axis=1)
    # Drop rows with infinite values
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    print("N/A rows after preproc", df.isna().any(axis=1).sum())
    print(df.shape)
    df.dropna(inplace=True)
    print(df.shape)
    print("Dropping duplicate rows")
    df.drop_duplicates(inplace=True)
    print(df.shape)
    print(df.shape)
    df_train = df.loc[df['subset'] == 'train']
    df_train.drop(labels=['subset'], axis=1, inplace=True)
    df_train.reset_index(inplace=True, drop=True)
    print(df_train.shape)
    df_test = df.loc[df['subset'] == 'test']
    df_test.drop(labels=['subset'], axis=1, inplace=True)
    df_test.reset_index(inplace=True, drop=True)
    print(df_test.shape)
    print(df_train.columns)
    print(df_test.columns)
    # Save to file
    if 'feather' in filetypes:
        f1 = file_train.replace('.csv', '.feather').replace('original', 'dirty-with-metadata')
        df_train.to_feather(f1)
        f2 = file_test.replace('.csv', '.feather').replace('original', 'dirty-with-metadata')
        df_test.to_feather(f2)
    if 'parquet' in filetypes:
        f1 = file_train.replace('.csv', '.parquet').replace('original', 'dirty-with-metadata')
        df_train.to_parquet(f1)
        f2 = file_test.replace('.csv', '.parquet').replace('original', 'dirty-with-metadata')
        df_test.to_parquet(f2)
    if 'csv' in filetypes:
        f1 = file_train.replace('original', 'dirty-with-metadata')
        df_train.to_csv(f1, sep=',', index=False, encoding='utf-8')
        f2 = file_test.replace('original', 'dirty-with-metadata')
        df_test.to_csv(f2, sep=',', index=False, encoding='utf-8')


def check_dims_type(dataset):
    for file in [x for x in listdir(f'{dataset}') if x.endswith('.csv')]:
        print(f"------- {file} -------")
        if file.endswith('.csv'):
            df = pd.read_csv(f'{dataset}/{file}')
            print(df.shape)
            print(df.head(1))
            print(df.dtypes)
        elif file.endswith('.parquet'):
            df = pd.read_parquet(f'{dataset}/{file}')
            print(df.shape)
            print(df.head(1))
            print(df.dtypes)
        elif file.endswith('.feather'):
            df = pd.read_feather(f'{dataset}/{file}')
            print(df.shape)
            print(df.head(1))
            print(df.dtypes)


if __name__ == "__main__":
    # Adjust for cleaning the correct dataset into the desired format
    # Needs directory with dataset name containing empty dir 'clean' and dir 'original' containing the CSVs
    clean_dataset(f'nsl-kdd/original', filetypes=['feather', 'parquet', 'csv'])
    # check_dims_type('nsl-kdd/clean')
\ No newline at end of file
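Assuming an nsl-kdd/dirty-with-metadata directory has been created next to nsl-kdd/original (mirroring the 'clean' convention described in the comments), the artifacts this script writes can then be loaded roughly like this:

    import pandas as pd

    train = pd.read_feather('nsl-kdd/dirty-with-metadata/KDDTrain.feather')
    test = pd.read_parquet('nsl-kdd/dirty-with-metadata/KDDTest.parquet')
    print(train.shape, test.shape)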
unsw_data_cleaning.py
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', None)
import numpy as np
import random
import os
import glob
import time
random.seed(0)
@@ -64,7 +62,7 @@ column_names = [
     'label'
 ]

-drop_columns = ['srcip', 'sport', 'dstip']
+drop_columns = ['srcip', 'sport', 'dstip', 'dsport']

 datetime_dyptes = {
     'stime': 'datetime64',
@@ -72,23 +70,37 @@ datetime_dyptes = {
 }


 def read_all_data():
-    data_1 = pd.read_csv('unswnb15/original/UNSW-NB15_1.csv', header=None, names=column_names, low_memory=False)
-    data_2 = pd.read_csv('unswnb15/original/UNSW-NB15_2.csv', header=None, names=column_names, low_memory=False)
-    data_3 = pd.read_csv('unswnb15/original/UNSW-NB15_3.csv', header=None, names=column_names, low_memory=False)
-    data_4 = pd.read_csv('unswnb15/original/UNSW-NB15_4.csv', header=None, names=column_names, low_memory=False)
+    data_1 = pd.read_csv('unsw-nb15/original/UNSW_NB15_1.csv', header=None, names=column_names, low_memory=False)
+    data_2 = pd.read_csv('unsw-nb15/original/UNSW_NB15_2.csv', header=None, names=column_names, low_memory=False)
+    data_3 = pd.read_csv('unsw-nb15/original/UNSW_NB15_3.csv', header=None, names=column_names, low_memory=False)
+    data_4 = pd.read_csv('unsw-nb15/original/UNSW_NB15_4.csv', header=None, names=column_names, low_memory=False)
     data = pd.concat(objs=[data_1, data_2, data_3, data_4], ignore_index=True, copy=False)
     return data


 def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
     df = read_all_data()
     print(df['label'].value_counts())
     print("Shape:", df.shape)
     # Dataset-specific quirks
     df['ct_ftp_cmd'].replace(' ', '0', inplace=True)
     df['ct_ftp_cmd'] = df['ct_ftp_cmd'].astype('int')
     df['ct_flw_http_mthd'].fillna(value=0, inplace=True)
-    df.drop(columns=drop_columns, inplace=True, errors="ignore")
+    # df.drop(columns=drop_columns, inplace=True, errors="ignore")
+    df['srcip'] = df['srcip'].astype('category')
+    df = df.drop(index=df.loc[df['sport'] == '0x000b'].index)
+    df = df.drop(index=df.loc[df['sport'] == '0x000c'].index)
+    df = df.drop(index=df.loc[df['sport'] == '0xc0a8'].index)
+    df = df.drop(index=df.loc[df['sport'] == '-'].index)
+    df['sport'] = df['sport'].astype('int64')
+    df['dstip'] = df['sport'].astype('category')
+    df = df.drop(index=df.loc[df['dsport'] == '0xc0a8'].index)
+    df = df.drop(index=df.loc[df['dsport'] == '-'].index)
+    df = df.drop(index=df.loc[df['dsport'] == '0x20205321'].index)
+    df = df.drop(index=df.loc[df['dsport'] == '0xcc09'].index)
+    df['dsport'] = df['dsport'].astype('int64')
     df['stime'] = pd.to_datetime(df['stime'], unit='s', errors='coerce')
     df['ltime'] = pd.to_datetime(df['ltime'], unit='s', errors='coerce')
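The stime/ltime conversion above treats the UNSW-NB15 timestamps as Unix epoch seconds; a tiny illustration with an arbitrary value:

    import pandas as pd

    stime = pd.Series([1421927414])
    print(pd.to_datetime(stime, unit='s', errors='coerce'))
    # 0   2015-01-22 11:50:14
    # dtype: datetime64[ns]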
@@ -109,9 +121,10 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
     df['is_sm_ips_ports'] = df['is_sm_ips_ports'].astype('bool')