Commit 67e5fea3 authored by Laurens D'hooge

updated preproc code for dirty-with-metadata versions

parent 1ee471eb
@@ -13,11 +13,11 @@ random.seed(0)
drop_columns = [
# Dataset Specific Information
"Flow ID",
"Source IP", "Src IP",
"Source Port", "Src Port",
"Destination IP", "Dst IP",
"Destination Port", "Dst Port",
"Protocol", "Timestamp",
# "Source IP", "Src IP",
# "Source Port", "Src Port",
# "Destination IP", "Dst IP",
# "Destination Port", "Dst Port",
# "Protocol", "Timestamp",
# Features Without Observed Variance
"Bwd PSH Flags",
"Fwd URG Flags",
@@ -68,6 +68,12 @@ drop_columns = [
# ]
mapper = {
# Only needed for dirty-with-metadata
'Src IP': 'Source IP',
'Dst IP': 'Destination IP',
'Src Port': 'Source Port',
'Dst Port': 'Destination Port',
'Tot Fwd Pkts': 'Total Fwd Packets',
'Tot Bwd Pkts': 'Total Backward Packets',
'TotLen Fwd Pkts': 'Fwd Packets Length Total',
@@ -125,7 +131,15 @@ mapper = {
'Fwd Act Data Pkts': 'Fwd Act Data Packets',
'act_data_pkt_fwd': 'Fwd Act Data Packets',
'Fwd Seg Size Min': 'Fwd Seg Size Min',
'min_seg_size_forward': 'Fwd Seg Size Min',
# 'Active Mean': 'Active Mean',
# 'Active Std': 'Active Std',
# 'Active Max': 'Active Max',
# 'Active Min': 'Active Min',
# 'Idle Mean': 'Idle Mean',
# 'Idle Std': 'Idle Std',
# 'Idle Max': 'Idle Max',
# 'Idle Min': 'Idle Min',
}
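# A minimal sketch (toy frame, hypothetical values) of how a rename mapper like
# the one above harmonizes the short CICFlowMeter headers to their long forms;
# mapper keys that are absent from the frame are simply ignored by pandas.
import pandas as pd
_toy = pd.DataFrame({'Src IP': ['10.0.0.1'], 'Tot Fwd Pkts': [3]})
_toy = _toy.rename(columns={'Src IP': 'Source IP', 'Tot Fwd Pkts': 'Total Fwd Packets'})
print(_toy.columns.tolist())  # ['Source IP', 'Total Fwd Packets']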
@@ -157,9 +171,9 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
df[int_col] = df[int_col].apply(pd.to_numeric, errors='coerce', downcast='integer')
float_col = df.select_dtypes(include='float').columns
df[float_col] = df[float_col].apply(pd.to_numeric, errors='coerce', downcast='float')
obj_col = df.select_dtypes(include='object').columns
print(f'Columns with dtype == object: {obj_col}')
df[obj_col] = df[obj_col].apply(pd.to_numeric, errors='coerce')
# obj_col = df.select_dtypes(include='object').columns
# print(f'Columns with dtype == object: {obj_col}')
# df[obj_col] = df[obj_col].apply(pd.to_numeric, errors='coerce')
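# A minimal sketch (hypothetical toy data) of the downcasting above: with
# downcast='integer', pd.to_numeric shrinks a column to the smallest integer
# dtype that holds its values; the object-column coercion stays commented out
# so string metadata such as IPs survives in the dirty-with-metadata variant.
import pandas as pd
_s = pd.Series([1, 2, 300])  # defaults to int64
print(pd.to_numeric(_s, downcast='integer').dtype)  # int16, since 300 > int8 max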
# Drop rows with invalid data
df.replace([np.inf, -np.inf], np.nan, inplace=True)
@@ -167,8 +181,7 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
df.dropna(inplace=True)
# Drop duplicate rows
df.drop_duplicates(
inplace=True, subset=df.columns.difference(['Label', 'Timestamp']))
df.drop_duplicates(inplace=True, subset=df.columns.difference(['Label']))
print(df["Label"].value_counts())
print(f"shape: {df.shape}\n")
@@ -181,18 +194,18 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
# Save to file
if 'feather' in filetypes:
f = file.replace('.csv', '.feather')
df.to_feather(f'{dataset}/clean/{f}')
df.to_feather(f'{dataset}/dirty-with-metadata/{f}')
if 'parquet' in filetypes:
f = file.replace('.csv', '.parquet')
df.to_parquet(f'{dataset}/clean/{f}', index=False)
df.to_parquet(f'{dataset}/dirty-with-metadata/{f}', index=False)
if 'csv' in filetypes:
df.to_csv(f'{dataset}/clean/{file}', sep=',', index=False, encoding='utf-8')
df.to_csv(f'{dataset}/dirty-with-metadata/{file}', sep=',', index=False, encoding='utf-8')
def aggregate_data(dataset, save=True, filetype='feather'):
# Will search for all files in the 'dirty-with-metadata' directory of the correct filetype and aggregate them
all_data = pd.DataFrame()
for file in glob.glob(f'{dataset}/clean/*.{filetype}'):
for file in glob.glob(f'{dataset}/dirty-with-metadata/*.{filetype}'):
print(file)
df = pd.DataFrame()
if filetype == 'feather':
@@ -216,33 +229,33 @@ def aggregate_data(dataset, save=True, filetype='feather'):
malicious = all_data[all_data.Label != 'Benign'].reset_index(drop=True)
benign = all_data[all_data.Label == 'Benign'].reset_index(drop=True)
if filetype == 'feather':
all_data.to_feather(f'{dataset}/clean/all_data.feather')
malicious.to_feather(f'{dataset}/clean/all_malicious.feather')
benign.to_feather(f'{dataset}/clean/all_benign.feather')
all_data.to_feather(f'{dataset}/dirty-with-metadata/all_data.feather')
malicious.to_feather(f'{dataset}/dirty-with-metadata/all_malicious.feather')
benign.to_feather(f'{dataset}/dirty-with-metadata/all_benign.feather')
if filetype == 'parquet':
all_data.to_parquet(
f'{dataset}/clean/all_data.parquet', index=False)
f'{dataset}/dirty-with-metadata/all_data.parquet', index=False)
malicious.to_parquet(
f'{dataset}/clean/all_malicious.parquet', index=False)
f'{dataset}/dirty-with-metadata/all_malicious.parquet', index=False)
benign.to_parquet(
f'{dataset}/clean/all_benign.parquet', index=False)
f'{dataset}/dirty-with-metadata/all_benign.parquet', index=False)
def check_dims_type(dataset):
for file in os.listdir(f'{dataset}/clean'):
for file in os.listdir(f'{dataset}/dirty-with-metadata'):
print(f"------- {file} -------")
if file.endswith('.csv'):
df = pd.read_csv(f'{dataset}/clean/{file}')
df = pd.read_csv(f'{dataset}/dirty-with-metadata/{file}')
print(df.shape)
print(df.head(1))
# [print(i) for i in df.dtypes]
elif file.endswith('.parquet'):
df = pd.read_parquet(f'{dataset}/clean/{file}')
df = pd.read_parquet(f'{dataset}/dirty-with-metadata/{file}')
print(df.shape)
print(df.head(1))
# [print(i) for i in df.dtypes]
elif file.endswith('.feather'):
df = pd.read_feather(f'{dataset}/clean/{file}')
df = pd.read_feather(f'{dataset}/dirty-with-metadata/{file}')
print(df.shape)
print(df.head(1))
# [print(i) for i in df.dtypes]
@@ -26,7 +26,7 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
'attackdescription': 'attack_description'
}, inplace=True)
df = df.drop(labels=['date_first_seen', 'src_ip_addr', 'src_pt', 'dst_ip_addr', 'dst_pt', 'attack_description'], axis=1)
# df = df.drop(labels=['date_first_seen', 'src_ip_addr', 'src_pt', 'dst_ip_addr', 'dst_pt', 'attack_description'], axis=1)
proto_idx = df.columns.get_loc('proto')
df['proto'] = df['proto'].str.strip()
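# A minimal sketch (toy values) of why the strip above matters: the proto field
# arrives space-padded, and without stripping, 'TCP' and 'TCP  ' would later
# become two distinct one-hot columns.
import pandas as pd
_proto = pd.Series(['TCP  ', 'UDP  ', 'TCP  '])
print(pd.get_dummies(_proto.str.strip()).columns.tolist())  # ['TCP', 'UDP']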
@@ -95,12 +95,12 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
# Save to file
if 'feather' in filetypes:
f = file.replace('.csv', '.feather')
df.to_feather(f'{dataset.replace("original", "clean")}/{f}')
df.to_feather(f'{dataset.replace("original", "dirty-with-metadata")}/{f}')
if 'parquet' in filetypes:
f = file.replace('.csv', '.parquet')
df.to_parquet(f'{dataset.replace("original", "clean")}/{f}', index=False)
df.to_parquet(f'{dataset.replace("original", "dirty-with-metadata")}/{f}', index=False)
if 'csv' in filetypes:
df.to_csv(f'{dataset.replace("original", "clean")}/{file}', sep=',', index=False, encoding='utf-8')
df.to_csv(f'{dataset.replace("original", "dirty-with-metadata")}/{file}', sep=',', index=False, encoding='utf-8')
def check_dims_type(dataset):
for file in listdir(f'{dataset}'):
@@ -124,9 +124,9 @@ def check_dims_type(dataset):
if __name__ == "__main__":
# Adjust for cleaning the correct dataset into the desired format
# Needs directory with dataset name containing empty dir 'dirty-with-metadata' and dir 'original' containing the CSVs
# clean_dataset('cidds-001/original/traffic/ExternalServer', filetypes=['feather', 'parquet', 'csv'])
# clean_dataset('cidds-001/original/traffic/OpenStack', filetypes=['feather', 'parquet', 'csv'])
# clean_dataset('cidds-002/original/traffic', filetypes=['feather', 'parquet', 'csv'])
check_dims_type('cidds-001/clean/traffic/ExternalServer')
check_dims_type('cidds-001/clean/traffic/OpenStack')
check_dims_type('cidds-002/clean/traffic')
\ No newline at end of file
clean_dataset('cidds-001/original/traffic/ExternalServer', filetypes=['feather', 'parquet'])
clean_dataset('cidds-001/original/traffic/OpenStack', filetypes=['feather', 'parquet'])
clean_dataset('cidds-002/original/traffic', filetypes=['feather', 'parquet'])
# check_dims_type('cidds-001/clean/traffic/ExternalServer')
# check_dims_type('cidds-001/clean/traffic/OpenStack')
# check_dims_type('cidds-002/clean/traffic')
\ No newline at end of file
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 100)
import numpy as np
from os import listdir
@@ -14,8 +14,19 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
df = df.rename(str.lower, axis='columns')
df = df.rename(str.strip, axis='columns')
df = df.drop(labels=['starttime', 'srcaddr', 'sport', 'dstaddr', 'dport'], axis=1)
# df = df.drop(labels=['starttime', 'srcaddr', 'sport', 'dstaddr', 'dport'], axis=1)
df = df.drop(labels=['starttime'], axis=1)
df.drop(index=df[df['sport'].isna()].index, inplace=True)
matching = df['sport'].str.startswith('0x')
df.loc[matching, 'sport'] = df.loc[matching, 'sport'].apply(int, base=16)
df['sport'] = df['sport'].astype('int32')
df.drop(index=df[df['dport'].isna()].index, inplace=True)
matching = df['dport'].str.startswith('0x')
df.loc[matching, 'dport'] = df.loc[matching, 'dport'].apply(int, base=16)
df['dport'] = df['dport'].astype('int32')
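# A minimal sketch (toy values) of the port fix above: some CTU-13 ports are
# hex strings, so rows starting with '0x' are parsed with base 16 before the
# whole column is cast to a plain integer dtype.
import pandas as pd
_p = pd.Series(['443', '0x00cc'])
_hex = _p.str.startswith('0x')
_p.loc[_hex] = _p.loc[_hex].apply(int, base=16)
print(_p.astype('int32').tolist())  # [443, 204]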
df.rename(columns={
'totpkts': 'tot_pkts',
'totbytes': 'tot_bytes',
@@ -58,12 +69,12 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
# Save to file
if 'feather' in filetypes:
f = file.replace('.csv', '.feather')
df.to_feather(f'{dataset.replace("original", "clean")}/{f}')
df.to_feather(f'{dataset.replace("original", "dirty-with-metadata")}/{f}')
if 'parquet' in filetypes:
f = file.replace('.csv', '.parquet')
df.to_parquet(f'{dataset.replace("original", "clean")}/{f}', index=False)
df.to_parquet(f'{dataset.replace("original", "dirty-with-metadata")}/{f}', index=False)
if 'csv' in filetypes:
df.to_csv(f'{dataset.replace("original", "clean")}/{file}', sep=',', index=False, encoding='utf-8')
df.to_csv(f'{dataset.replace("original", "dirty-with-metadata")}/{file}', sep=',', index=False, encoding='utf-8')
def check_dims_type(dataset):
for file in listdir(f'{dataset}'):
@@ -88,7 +99,7 @@ if __name__ == "__main__":
# Adjust for cleaning the correct dataset into the desired format
# Needs directory with dataset name containing empty dir 'dirty-with-metadata' and dir 'original' containing the CSVs
for i in range(1, 14):
clean_dataset(f'ctu-13/original/{i}', filetypes=['feather', 'parquet', 'csv'])
clean_dataset(f'ctu-13/original/all', filetypes=['feather', 'parquet', 'csv'])
clean_dataset(f'ctu-13/original/{i}', filetypes=['feather', 'parquet'])
clean_dataset(f'ctu-13/original/all', filetypes=['feather', 'parquet'])
# check_dims_type('ctu-13/clean/all')
\ No newline at end of file
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 50)
import numpy as np
import csv
from os import listdir
@@ -28,27 +28,28 @@ header = ['app_name',
'tag']
def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
for file in [x for x in listdir(f'{dataset}') if (x.endswith('.csv') and x in ('iscx-ids2012.csv', 'TestbedWedJun16-3Flows.csv'))]:
for file in [x for x in listdir(f'{dataset}') if x.endswith('.csv')]:
print(f"------- {file} -------")
df = pd.read_csv(f'{dataset}/{file}', low_memory=True, encoding='utf-8', quotechar="'", quoting=csv.QUOTE_ALL, skiprows=0)
for c in ['source_payload_as_base64', 'source_payload_as_UTF', 'destination_payload_as_base64', 'destination_payload_as_UTF', 'start_date_time', 'stop_date_time', 'source', 'destination', 'source_port', 'destination_port']:
for c in ['source_payload_as_base64', 'source_payload_as_UTF', 'destination_payload_as_base64', 'destination_payload_as_UTF', 'start_date_time', 'stop_date_time']: #, 'source', 'destination', 'source_port', 'destination_port']:
try:
df = df.drop(labels=[c], axis=1)
except KeyError:
pass  # column already absent in this capture file
df = df.rename(str.lower, axis='columns')
df = df.rename(str.strip, axis='columns')
df['source_port'] = df['source_port'].astype('int32')
df['destination_port'] = df['destination_port'].astype('int32')
app_idx = df.columns.get_loc('app_name')
app_dir_data = pd.get_dummies(df['app_name'], prefix='app_name', dtype=np.int8)
for i, c in enumerate(app_dir_data.columns):
df.insert(loc=app_idx+i, column=c, value=app_dir_data[c])
df = df.drop(labels=['app_name'], axis=1)
print(df.shape)
df = df.drop(labels=['app_name'], axis=1)
for c in ['source_tcp_flags_description', 'destination_tcp_flags_description']:
df[c] = df[c].str.strip()
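# A minimal sketch (toy frame) of the app_name encoding above: get_dummies
# expands the categorical column and df.insert places each indicator at the
# original position, so the one-hot block replaces 'app_name' in situ instead
# of being appended at the end of the frame.
import numpy as np
import pandas as pd
_toy = pd.DataFrame({'x': [0, 1], 'app_name': ['dns', 'http'], 'y': [2, 3]})
_idx = _toy.columns.get_loc('app_name')
_ohe = pd.get_dummies(_toy['app_name'], prefix='app_name', dtype=np.int8)
for _i, _c in enumerate(_ohe.columns):
    _toy.insert(loc=_idx + _i, column=_c, value=_ohe[_c])
_toy = _toy.drop(labels=['app_name'], axis=1)
print(_toy.columns.tolist())  # ['x', 'app_name_dns', 'app_name_http', 'y']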
@@ -99,15 +100,13 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
df.insert(loc=proto_idx+i, column=c, value=ohe_proto_data[c])
df = df.drop(labels=['protocol_name'], axis=1)
print(df.head(1))
for c in df.columns:
print('---', c, '---')
print(df[c].value_counts())
# for c in df.columns:
# print('---', c, '---')
# print(df[c].value_counts())
# Drop rows with infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("N/A rows after preproc", df.isna().any(axis=1).sum())
print(df.isna().sum())
print("N/A rows after preproc", df.isna().any(axis=1).sum())
print(df.shape)
df.dropna(inplace=True)
@@ -117,8 +116,8 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
df.drop_duplicates(inplace=True)
print(df.shape)
for c in df.columns[:-1]:
df[c] = pd.to_numeric(df[c].values, errors='raise', downcast='integer')
# for c in df.columns[:-1]:
# df[c] = pd.to_numeric(df[c].values, errors='raise', downcast='integer')
# Reset index
df.reset_index(inplace=True, drop=True)
@@ -126,12 +125,12 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
# Save to file
if 'feather' in filetypes:
f = file.replace('.csv', '.feather')
df.to_feather(f'{dataset.replace("original", "clean")}/{f}')
df.to_feather(f'{dataset.replace("original", "dirty-with-metadata")}/{f}')
if 'parquet' in filetypes:
f = file.replace('.csv', '.parquet')
df.to_parquet(f'{dataset.replace("original", "clean")}/{f}', index=False)
df.to_parquet(f'{dataset.replace("original", "dirty-with-metadata")}/{f}', index=False)
if 'csv' in filetypes:
df.to_csv(f'{dataset.replace("original", "clean")}/{file}', sep=',', index=False, encoding='utf-8')
df.to_csv(f'{dataset.replace("original", "dirty-with-metadata")}/{file}', sep=',', index=False, encoding='utf-8')
def check_dims_type(dataset):
for file in [x for x in listdir(f'{dataset}')]:
@@ -155,9 +154,5 @@ def check_dims_type(dataset):
if __name__ == "__main__":
# Adjust for cleaning the correct dataset into the desired format
# Needs directory with dataset name containing empty dir 'dirty-with-metadata' and dir 'original' containing the CSVs
# clean_dataset(f'iscx-ids2012/original', filetypes=['feather', 'parquet', 'csv'])
# check_dims_type('iscx-ids2012/clean')
total = pd.read_parquet('iscx-ids2012/clean/iscx-ids2012.parquet')
print(total.shape)
print(total.dtypes)
print(total['tag'].value_counts())
\ No newline at end of file
clean_dataset(f'iscx-ids2012/original', filetypes=['feather', 'parquet'])
# check_dims_type('iscx-ids2012/clean')
\ No newline at end of file
@@ -19,27 +19,27 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
df = df.rename(str.lower, axis='columns')
df = df.rename(str.strip, axis='columns')
df = df.drop(labels=['classnum'], axis=1)
df = df.drop(labels=['classnum', 'service'], axis=1)
proto_idx = df.columns.get_loc('protocol_type')
ohe_proto_data = pd.get_dummies(df['protocol_type'], prefix='proto', dtype=np.int8)
for i, c in enumerate(ohe_proto_data.columns):
df.insert(loc=proto_idx+i, column=c, value=ohe_proto_data[c])
df = df.drop(labels=['protocol_type'], axis=1)
# proto_idx = df.columns.get_loc('protocol_type')
# ohe_proto_data = pd.get_dummies(df['protocol_type'], prefix='proto', dtype=np.int8)
# for i, c in enumerate(ohe_proto_data.columns):
# df.insert(loc=proto_idx+i, column=c, value=ohe_proto_data[c])
# df = df.drop(labels=['protocol_type'], axis=1)
srv_idx = df.columns.get_loc('service')
ohe_srv_data = pd.get_dummies(df['service'], prefix='service', dtype=np.int8)
# srv_idx = df.columns.get_loc('service')
# ohe_srv_data = pd.get_dummies(df['service'], prefix='service', dtype=np.int8)
for i, c in enumerate(ohe_srv_data.columns):
df.insert(loc=srv_idx+i, column=c, value=ohe_srv_data[c])
df = df.drop(labels=['service'], axis=1)
# for i, c in enumerate(ohe_srv_data.columns):
# df.insert(loc=srv_idx+i, column=c, value=ohe_srv_data[c])
# df = df.drop(labels=['service'], axis=1)
flag_idx = df.columns.get_loc('flag')
ohe_flag_data = pd.get_dummies(df['flag'], prefix='flag', dtype=np.int8)
# flag_idx = df.columns.get_loc('flag')
# ohe_flag_data = pd.get_dummies(df['flag'], prefix='flag', dtype=np.int8)
for i, c in enumerate(ohe_flag_data.columns):
df.insert(loc=flag_idx+i, column=c, value=ohe_flag_data[c])
df = df.drop(labels=['flag'], axis=1)
# for i, c in enumerate(ohe_flag_data.columns):
# df.insert(loc=flag_idx+i, column=c, value=ohe_flag_data[c])
# df = df.drop(labels=['flag'], axis=1)
# Drop rows with infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
@@ -52,7 +52,7 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
print("Dropping duplicate rows")
df.drop_duplicates(inplace=True)
print(df.shape)
df_train = df.loc[df['subset'] == 'train']
df_train.drop(labels=['subset'], axis=1, inplace=True)
@@ -82,7 +82,7 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
if 'csv' in filetypes:
f1 = file_train.replace('original', 'clean')
df_train.to_csv(f1, sep=',', index=False, encoding='utf-8')
f2 = file_test.replace('original', 'clean')
df_test.to_csv(f2, sep=',', index=False, encoding='utf-8')
def check_dims_type(dataset):
@@ -108,5 +108,5 @@ if __name__ == "__main__":
# Adjust for cleaning the correct dataset into the desired format
# Needs directory with dataset name containing empty dir 'clean' and dir 'original' containing the CSVs
clean_dataset(f'nsl-kdd/original', filetypes=['feather', 'parquet', 'csv'])
check_dims_type('nsl-kdd/clean')
# check_dims_type('nsl-kdd/clean')
\ No newline at end of file
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from os import listdir
def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
file_train = dataset+'/KDDTrain.csv'
file_test = dataset+'/KDDTest.csv'
print(f'Temporarily merging {file_train} and {file_test} to end up with a consistent set of features')
df_train = pd.read_csv(f'{file_train}', low_memory=True, encoding='utf-8')
df_train['subset'] = 'train'
df_test = pd.read_csv(f'{file_test}', low_memory=True, encoding='utf-8')
df_test['subset'] = 'test'
df = pd.concat(objs=[df_train, df_test], ignore_index=True, copy=False, sort=False)
df = df.rename(str.lower, axis='columns')
df = df.rename(str.strip, axis='columns')
df = df.drop(labels=['classnum'], axis=1)
# proto_idx = df.columns.get_loc('protocol_type')
# ohe_proto_data = pd.get_dummies(df['protocol_type'], prefix='proto', dtype=np.int8)
# for i, c in enumerate(ohe_proto_data.columns):
# df.insert(loc=proto_idx+i, column=c, value=ohe_proto_data[c])
# df = df.drop(labels=['protocol_type'], axis=1)
# srv_idx = df.columns.get_loc('service')
# ohe_srv_data = pd.get_dummies(df['service'], prefix='service', dtype=np.int8)
# for i, c in enumerate(ohe_srv_data.columns):
# df.insert(loc=srv_idx+i, column=c, value=ohe_srv_data[c])
# df = df.drop(labels=['service'], axis=1)
# flag_idx = df.columns.get_loc('flag')
# ohe_flag_data = pd.get_dummies(df['flag'], prefix='flag', dtype=np.int8)
# for i, c in enumerate(ohe_flag_data.columns):
# df.insert(loc=flag_idx+i, column=c, value=ohe_flag_data[c])
# df = df.drop(labels=['flag'], axis=1)
# Drop rows with infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("N/A rows after preproc", df.isna().any(axis=1).sum())
print(df.shape)
df.dropna(inplace=True)
print(df.shape)
print("Dropping duplicate rows")
df.drop_duplicates(inplace=True)
print(df.shape)
df_train = df.loc[df['subset'] == 'train']
df_train.drop(labels=['subset'], axis=1, inplace=True)
df_train.reset_index(inplace=True, drop=True)
print(df_train.shape)
df_test = df.loc[df['subset'] == 'test']
df_test.drop(labels=['subset'], axis=1, inplace=True)
df_test.reset_index(inplace=True, drop=True)
print(df_test.shape)
print(df_train.columns)
print(df_test.columns)
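# A minimal sketch (toy frames) of the pattern used above: tag each subset,
# concatenate so cleaning and encoding see one consistent schema, then split
# on the tag so train and test end up with identical columns.
import pandas as pd
_tr = pd.DataFrame({'f': [1, 2]})
_tr['subset'] = 'train'
_te = pd.DataFrame({'f': [3]})
_te['subset'] = 'test'
_all = pd.concat(objs=[_tr, _te], ignore_index=True)
# ... shared preprocessing would run here ...
_tr2 = _all[_all['subset'] == 'train'].drop(columns='subset').reset_index(drop=True)
_te2 = _all[_all['subset'] == 'test'].drop(columns='subset').reset_index(drop=True)
print(_tr2.shape, _te2.shape)  # (2, 1) (1, 1)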
# Save to file
if 'feather' in filetypes:
f1 = file_train.replace('.csv', '.feather').replace('original', 'dirty-with-metadata')
df_train.to_feather(f1)
f2 = file_test.replace('.csv', '.feather').replace('original', 'dirty-with-metadata')
df_test.to_feather(f2)
if 'parquet' in filetypes:
f1 = file_train.replace('.csv', '.parquet').replace('original', 'dirty-with-metadata')
df_train.to_parquet(f1)
f2 = file_test.replace('.csv', '.parquet').replace('original', 'dirty-with-metadata')
df_test.to_parquet(f2)
if 'csv' in filetypes:
f1 = file_train.replace('original', 'dirty-with-metadata')
df_train.to_csv(f1, sep=',', index=False, encoding='utf-8')
f2 = file_test.replace('original', 'dirty-with-metadata')
df_test.to_csv(f2, sep=',', index=False, encoding='utf-8')
def check_dims_type(dataset):
for file in [x for x in listdir(f'{dataset}') if x.endswith('.csv')]:
print(f"------- {file} -------")
if file.endswith('.csv'):
df = pd.read_csv(f'{dataset}/{file}')
print(df.shape)
print(df.head(1))
print(df.dtypes)
elif file.endswith('.parquet'):
df = pd.read_parquet(f'{dataset}/{file}')
print(df.shape)
print(df.head(1))
print(df.dtypes)
elif file.endswith('.feather'):
df = pd.read_feather(f'{dataset}/{file}')
print(df.shape)
print(df.head(1))
print(df.dtypes)
if __name__ == "__main__":
# Adjust for cleaning the correct dataset into the desired format
# Needs directory with dataset name containing empty dir 'dirty-with-metadata' and dir 'original' containing the CSVs
clean_dataset(f'nsl-kdd/original', filetypes=['feather', 'parquet', 'csv'])
# check_dims_type('nsl-kdd/clean')
\ No newline at end of file
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', None)
import numpy as np
import random
import os
import glob
import time
random.seed(0)
@@ -64,7 +62,7 @@ column_names = [
'label'
]
drop_columns = ['srcip', 'sport', 'dstip']
drop_columns = ['srcip', 'sport', 'dstip', 'dsport']
datetime_dtypes = {
'stime': 'datetime64',
@@ -72,23 +70,37 @@ }
}
def read_all_data():
data_1 = pd.read_csv('unswnb15/original/UNSW-NB15_1.csv', header=None, names=column_names, low_memory=False)
data_2 = pd.read_csv('unswnb15/original/UNSW-NB15_2.csv', header=None, names=column_names, low_memory=False)
data_3 = pd.read_csv('unswnb15/original/UNSW-NB15_3.csv', header=None, names=column_names, low_memory=False)
data_4 = pd.read_csv('unswnb15/original/UNSW-NB15_4.csv', header=None, names=column_names, low_memory=False)
data_1 = pd.read_csv('unsw-nb15/original/UNSW_NB15_1.csv', header=None, names=column_names, low_memory=False)
data_2 = pd.read_csv('unsw-nb15/original/UNSW_NB15_2.csv', header=None, names=column_names, low_memory=False)
data_3 = pd.read_csv('unsw-nb15/original/UNSW_NB15_3.csv', header=None, names=column_names, low_memory=False)
data_4 = pd.read_csv('unsw-nb15/original/UNSW_NB15_4.csv', header=None, names=column_names, low_memory=False)
data = pd.concat(objs=[data_1, data_2, data_3, data_4], ignore_index=True, copy=False)
return data
def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
df = read_all_data()
print(df['label'].value_counts())
print("Shape:", df.shape)
print("Shape:", df.shape)
# Dataset-specific quirks
df['ct_ftp_cmd'].replace(' ', '0', inplace=True)
df['ct_ftp_cmd'] = df['ct_ftp_cmd'].astype('int')
df['ct_flw_http_mthd'].fillna(value=0, inplace=True)
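# A minimal sketch (toy values) of the quirk handled above: missing ct_ftp_cmd
# counts appear as a lone space, which must become '0' before the integer cast
# can succeed.
import pandas as pd
_cmd = pd.Series(['1', ' ', '2'])
print(_cmd.replace(' ', '0').astype('int').tolist())  # [1, 0, 2]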
df.drop(columns=drop_columns, inplace=True, errors="ignore")
# df.drop(columns=drop_columns, inplace=True, errors="ignore")
df['srcip'] = df['srcip'].astype('category')
df = df.drop(index=df.loc[df['sport'] == '0x000b'].index)
df = df.drop(index=df.loc[df['sport'] == '0x000c'].index)
df = df.drop(index=df.loc[df['sport'] == '0xc0a8'].index)
df = df.drop(index=df.loc[df['sport'] == '-'].index)
df['sport'] = df['sport'].astype('int64')
df['dstip'] = df['dstip'].astype('category')
df = df.drop(index=df.loc[df['dsport'] == '0xc0a8'].index)
df = df.drop(index=df.loc[df['dsport'] == '-'].index)
df = df.drop(index=df.loc[df['dsport'] == '0x20205321'].index)
df = df.drop(index=df.loc[df['dsport'] == '0xcc09'].index)
df['dsport'] = df['dsport'].astype('int64')
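# A hedged, more compact equivalent of the row drops above (a sketch, not the
# author's code): collect the malformed tokens once and filter both port
# columns with isin before the int64 casts.
import pandas as pd
_bad = {'-', '0x000b', '0x000c', '0xc0a8', '0x20205321', '0xcc09'}
_toy = pd.DataFrame({'sport': ['80', '-'], 'dsport': ['443', '0xc0a8']})
_toy = _toy[~_toy['sport'].isin(_bad) & ~_toy['dsport'].isin(_bad)]
print(_toy['sport'].astype('int64').tolist())  # [80]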
df['stime'] = pd.to_datetime(df['stime'], unit='s', errors='coerce')
df['ltime'] = pd.to_datetime(df['ltime'], unit='s', errors='coerce')
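# A minimal sketch (toy value) of the conversion above: stime/ltime hold Unix
# epoch seconds, unit='s' decodes them, and errors='coerce' turns anything
# unparseable into NaT, which the later NaN drop removes.
import pandas as pd
print(pd.to_datetime(pd.Series([1424219007]), unit='s', errors='coerce')[0])
# 2015-02-18 00:23:27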
@@ -109,9 +121,10 @@ def clean_dataset(dataset, filetypes=['feather', 'parquet', 'csv']):
df['is_sm_ips_ports'] = df['is_sm_ips_ports'].astype('bool')