Commit 5892ba57 authored by Laurens D'hooge's avatar Laurens D'hooge
Browse files

included nsl-kdd

parent e1629723
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#! /usr/bin/env python3
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from os import listdir
def _replace_with_one_hot(df, column, prefix):
    """Return ``df`` with ``column`` swapped for its one-hot indicator columns.

    The indicator columns are ``int8``, named ``<prefix>_<value>``, and are
    placed where ``column`` used to be, so the order of the remaining
    features is preserved.
    """
    position = df.columns.get_loc(column)
    indicators = pd.get_dummies(df[column], prefix=prefix, dtype=np.int8)
    # A single concat instead of one DataFrame.insert per indicator column.
    return pd.concat(
        [df.iloc[:, :position], indicators, df.iloc[:, position + 1:]],
        axis=1,
    )


def _clean_output_path(source_path, extension):
    """Map a source CSV path to the path of its cleaned counterpart.

    ``'.csv'`` is replaced by ``extension`` and ``'original'`` by ``'clean'``.

    Raises:
        ValueError: if the derived path equals ``source_path``, which would
            make the cleaned output overwrite the original file.
    """
    target = source_path.replace('.csv', extension).replace('original', 'clean')
    if target == source_path:
        raise ValueError(
            f"Refusing to overwrite {source_path}: the dataset path must "
            f"contain 'original' so the output can be written to 'clean'"
        )
    return target


def clean_dataset(dataset, filetypes=('feather', 'parquet', 'csv')):
    """Clean the KDDTrain/KDDTest CSVs found in ``dataset`` and save them.

    Both files are read and temporarily concatenated so that one-hot
    encoding yields the same feature columns for the train and test split.
    The combined frame is then processed as follows:

    * column names are lower-cased and stripped of surrounding whitespace;
    * the ``classnum`` column is dropped;
    * ``protocol_type``, ``service`` and ``flag`` are one-hot encoded in
      place (prefixes ``proto``, ``service`` and ``flag``);
    * rows containing infinite or missing values are dropped;
    * duplicate rows are dropped.

    Finally the frame is split back into train and test and each part is
    written once per requested file type.  Output paths are derived from the
    input paths by replacing ``'original'`` with ``'clean'`` (and the
    extension where applicable).

    Args:
        dataset: Directory holding ``KDDTrain.csv`` and ``KDDTest.csv``.
        filetypes: Collection of output formats; any of ``'feather'``,
            ``'parquet'`` and ``'csv'``.  Other entries are ignored.

    Raises:
        ValueError: if a requested output path would coincide with its input
            file (i.e. ``dataset`` contains no ``'original'`` and ``'csv'``
            output is requested).
        KeyError: if an expected column (e.g. ``classnum``) is missing.
    """
    file_train = dataset + '/KDDTrain.csv'
    file_test = dataset + '/KDDTest.csv'

    # Resolve all output paths first so a bad directory layout fails before
    # any data is read or written.
    outputs = {
        fmt: (
            _clean_output_path(file_train, extension),
            _clean_output_path(file_test, extension),
        )
        for fmt, extension in (
            ('feather', '.feather'),
            ('parquet', '.parquet'),
            ('csv', '.csv'),
        )
        if fmt in filetypes
    }

    print(f'Temporarily merging {file_train} and {file_test} to end up with a consistent set of features')
    df_train = pd.read_csv(f'{file_train}', low_memory=True, encoding='utf-8')
    df_train['subset'] = 'train'
    df_test = pd.read_csv(f'{file_test}', low_memory=True, encoding='utf-8')
    df_test['subset'] = 'test'
    df = pd.concat(objs=[df_train, df_test], ignore_index=True, sort=False)

    df = df.rename(str.lower, axis='columns')
    df = df.rename(str.strip, axis='columns')
    df = df.drop(labels=['classnum'], axis=1)

    df = _replace_with_one_hot(df, 'protocol_type', 'proto')
    df = _replace_with_one_hot(df, 'service', 'service')
    df = _replace_with_one_hot(df, 'flag', 'flag')

    # Treat infinite values as missing so one dropna removes both.
    df = df.replace([np.inf, -np.inf], np.nan)
    print("N/A rows after preproc", df.isna().any(axis=1).sum())
    print(df.shape)
    df = df.dropna()
    print(df.shape)

    print("Dropping duplicate rows")
    df = df.drop_duplicates()
    print(df.shape)

    # Split back into the two subsets.  Building new frames (instead of
    # mutating a slice in place) avoids pandas' chained-assignment warning.
    df_train = (
        df.loc[df['subset'] == 'train']
        .drop(labels=['subset'], axis=1)
        .reset_index(drop=True)
    )
    print(df_train.shape)
    df_test = (
        df.loc[df['subset'] == 'test']
        .drop(labels=['subset'], axis=1)
        .reset_index(drop=True)
    )
    print(df_test.shape)

    print(df_train.columns)
    print(df_test.columns)

    # Save to file
    if 'feather' in outputs:
        train_path, test_path = outputs['feather']
        df_train.to_feather(train_path)
        df_test.to_feather(test_path)
    if 'parquet' in outputs:
        train_path, test_path = outputs['parquet']
        df_train.to_parquet(train_path)
        df_test.to_parquet(test_path)
    if 'csv' in outputs:
        train_path, test_path = outputs['csv']
        df_train.to_csv(train_path, sep=',', index=False, encoding='utf-8')
        df_test.to_csv(test_path, sep=',', index=False, encoding='utf-8')
def check_dims_type(dataset):
    """Print shape, first row and dtypes of every data file in ``dataset``.

    Files ending in ``.csv``, ``.parquet`` or ``.feather`` are loaded with
    the matching pandas reader; files with any other extension are skipped.
    Files are visited in sorted name order so the output is reproducible.

    Args:
        dataset: Directory whose files should be inspected.
    """
    # Extension -> pandas reader.  The previous implementation filtered the
    # listing down to '.csv' only, which left its parquet and feather
    # branches unreachable.
    readers = {
        '.csv': pd.read_csv,
        '.parquet': pd.read_parquet,
        '.feather': pd.read_feather,
    }
    for name in sorted(listdir(f'{dataset}')):
        reader = next(
            (fn for extension, fn in readers.items() if name.endswith(extension)),
            None,
        )
        if reader is None:
            continue
        print(f"------- {name} -------")
        df = reader(f'{dataset}/{name}')
        print(df.shape)
        print(df.head(1))
        print(df.dtypes)
if __name__ == "__main__":
    # Point these at the dataset to clean. The dataset directory is expected
    # to hold an 'original' sub-directory with the source CSVs and an empty
    # 'clean' sub-directory that receives the output files.
    raw_dir = 'nsl-kdd/original'
    clean_dir = 'nsl-kdd/clean'
    wanted_formats = ['feather', 'parquet', 'csv']

    clean_dataset(raw_dir, filetypes=wanted_formats)
    check_dims_type(clean_dir)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment