diff --git a/0-base-images/python3-base/Dockerfile b/0-base-images/python3-base/Dockerfile index 92624e4..4f7111f 100644 --- a/0-base-images/python3-base/Dockerfile +++ b/0-base-images/python3-base/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7.9-slim-buster +FROM python:3.10-slim LABEL maintainer="sebastian.schmidl@hpi.de" diff --git a/0-base-images/python3-base/requirements.txt b/0-base-images/python3-base/requirements.txt index ee4661f..182d331 100644 --- a/0-base-images/python3-base/requirements.txt +++ b/0-base-images/python3-base/requirements.txt @@ -1,5 +1,5 @@ -numpy==1.20.0 -pandas==1.2.1 -matplotlib==3.3.4 -scipy==1.6.0 -scikit-learn==0.24.1 +numpy>=1.20.0 +pandas>=1.2.1 +matplotlib>=3.3.4 +scipy>=1.6.0 +scikit-learn>=0.24.1 diff --git a/0-base-images/python3-torch/Dockerfile b/0-base-images/python3-torch/Dockerfile index 1d412be..56f6a8a 100644 --- a/0-base-images/python3-torch/Dockerfile +++ b/0-base-images/python3-torch/Dockerfile @@ -1,5 +1,5 @@ -FROM registry.gitlab.hpi.de/akita/i/python3-base +FROM registry.gitlab.hpi.de/akita/i/python3-base:0.2.6 LABEL maintainer="phillip.wenig@hpi.de" -RUN pip install --no-cache-dir torch==1.7.1 +RUN pip install --no-cache-dir torch==1.13.1 diff --git a/README.md b/README.md index 092bb8a..78c48bf 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ The namespace prefix (repository) for the built Docker images is `registry.gitla | [fft](./fft) | `registry.gitlab.hpi.de/akita/i/fft` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | unsupervised | univariate | | [generic_rf](./generic_rf) | `registry.gitlab.hpi.de/akita/i/generic_rf` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | semi-supervised | univariate | | [generic_xgb](./generic_xgb) | `registry.gitlab.hpi.de/akita/i/generic_xgb` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | semi-supervised | univariate | +| [gdn](./gdn) | `registry.gitlab.hpi.de/akita/i/gdn` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | semi-supervised | multivariate | | [grammarviz3](./grammarviz3) | `registry.gitlab.hpi.de/akita/i/grammarviz3` | Java| [`registry.gitlab.hpi.de/akita/i/java-base`](./0-base-images/java-base) | unsupervised | univariate | | [grammarviz3_multi](./grammarviz3_multi) | `registry.gitlab.hpi.de/akita/i/grammarviz3_multi` | Java| [`registry.gitlab.hpi.de/akita/i/java-base`](./0-base-images/java-base) | unsupervised | multivariate | | [hbos](./hbos) | `registry.gitlab.hpi.de/akita/i/hbos` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/pyod`](./0-base-images/pyod) -> [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | unsupervised | multivariate | @@ -178,11 +179,9 @@ Follow the below steps to test your algorithm using Docker (examples assume that docker run --rm \ -v $(pwd)/1-data:/data:ro \ -v $(pwd)/2-results:/results:rw \ - # -e LOCAL_UID= \ - # -e LOCAL_GID= \ - registry.gitlab.hpi.de/akita/i/:latest execute-algorithm '{ + registry.gitlab.hpi.de/akita/i/gdn:0.2.6 execute-algorithm '{ "executionType": "train", - "dataInput": "/data/dataset.csv", + "dataInput": "/data/multi-dataset.csv", "dataOutput": "/results/anomaly_scores.ts", "modelInput": "/results/model.pkl", "modelOutput": "/results/model.pkl", diff --git a/gdn/Dockerfile b/gdn/Dockerfile new file mode 100644 index 0000000..d77ac9a --- /dev/null +++ b/gdn/Dockerfile @@ -0,0 +1,18 @@ +FROM 
registry.gitlab.hpi.de/akita/i/python3-torch:0.2.6 + +LABEL maintainer="2er0@dbaumi.at" + +ENV ALGORITHM_MAIN="/app/algorithm.py" + +# install algorithm dependencies +COPY requirements.txt /app/ +RUN apt-get update; \ + apt-get install -y gcc g++ python3-dev; \ + apt-get clean; \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +RUN pip install -r /app/requirements.txt + +COPY algorithm.py /app/ +COPY GDN /app/GDN +# fixing six.py dataloader issue +COPY GDN/dataloader_fix.py /usr/local/lib/python3.10/site-packages/torch_geometric/data/dataloader.py diff --git a/gdn/GDN/LICENSE b/gdn/GDN/LICENSE new file mode 100644 index 0000000..956d782 --- /dev/null +++ b/gdn/GDN/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 d-ailin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/gdn/GDN/README.md b/gdn/GDN/README.md new file mode 100644 index 0000000..7465300 --- /dev/null +++ b/gdn/GDN/README.md @@ -0,0 +1,79 @@ +# GDN + +Code implementation for : [Graph Neural Network-Based Anomaly Detection in Multivariate Time Series(AAAI'21)](https://arxiv.org/pdf/2106.06947.pdf) + + +# Installation +### Requirements +* Python >= 3.6 +* cuda == 10.2 +* [Pytorch==1.5.1](https://pytorch.org/) +* [PyG: torch-geometric==1.5.0](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) + +### Install packages +``` + # run after installing correct Pytorch package + bash install.sh +``` + +### Quick Start +Run to check if the environment is ready +``` + bash run.sh cpu msl + # or with gpu + bash run.sh msl # e.g. bash run.sh 1 msl +``` + + +# Usage +We use part of msl dataset(refer to [telemanom](https://github.com/khundman/telemanom)) as demo example. + +## Data Preparation +``` +# put your dataset under data/ directory with the same structure shown in the data/msl/ + +data + |-msl + | |-list.txt # the feature names, one feature per line + | |-train.csv # training data + | |-test.csv # test data + |-your_dataset + | |-list.txt + | |-train.csv + | |-test.csv + | ... + +``` + +### Notices: +* The first column in .csv will be regarded as index column. +* The column sequence in .csv don't need to match the sequence in list.txt, we will rearrange the data columns according to the sequence in list.txt. +* test.csv should have a column named "attack" which contains ground truth label(0/1) of being attacked or not(0: normal, 1: attacked) + +## Run +``` + # using gpu + bash run.sh + + # or using cpu + bash run.sh cpu +``` +You can change running parameters in the run.sh. 
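+
+For reference, these are the default hyperparameters set at the top of run.sh (taken verbatim from the script in this patch); edit them there before launching:
+```
+BATCH_SIZE=32      # mini-batch size
+SLIDE_WIN=5        # length of the sliding window
+SLIDE_STRIDE=1     # stride of the sliding window
+EPOCH=30           # number of training epochs
+dim=64             # embedding dimension
+topk=5             # number of neighbours kept in the learned graph
+val_ratio=0.2      # fraction of the training data used for validation
+```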
+ +# Others +SWaT and WADI datasets can be requested from [iTrust](https://itrust.sutd.edu.sg/) + + +# Citation +If you find this repo or our work useful for your research, please consider citing the paper +``` +@inproceedings{deng2021graph, + title={Graph neural network-based anomaly detection in multivariate time series}, + author={Deng, Ailin and Hooi, Bryan}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={35}, + number={5}, + pages={4027--4035}, + year={2021} +} +``` diff --git a/gdn/GDN/__init__.py b/gdn/GDN/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gdn/GDN/dataloader_fix.py b/gdn/GDN/dataloader_fix.py new file mode 100644 index 0000000..6fad4a1 --- /dev/null +++ b/gdn/GDN/dataloader_fix.py @@ -0,0 +1,118 @@ +# fixed version of original dataloader in torch_geometric/data/dataloader.py +# last import guarantees overload of original version +import torch.utils.data +from torch.utils.data.dataloader import default_collate + +from torch_geometric.data import Data, Batch +from torch._six import string_classes + + +int_classes = (bool, int) + +# NOTE: This overrides the default dataloader from torch_geometric to fix an issue +class Collater(object): + def __init__(self, follow_batch): + self.follow_batch = follow_batch + + def collate(self, batch): + elem = batch[0] + if isinstance(elem, Data): + return Batch.from_data_list(batch, self.follow_batch) + elif isinstance(elem, torch.Tensor): + return default_collate(batch) + elif isinstance(elem, float): + return torch.tensor(batch, dtype=torch.float) + elif isinstance(elem, int_classes): + return torch.tensor(batch) + elif isinstance(elem, string_classes): + return batch + elif isinstance(elem, container_abcs.Mapping): + return {key: self.collate([d[key] for d in batch]) for key in elem} + elif isinstance(elem, tuple) and hasattr(elem, '_fields'): + return type(elem)(*(self.collate(s) for s in zip(*batch))) + elif isinstance(elem, container_abcs.Sequence): + return [self.collate(s) for s in zip(*batch)] + + raise TypeError('DataLoader found invalid type: {}'.format(type(elem))) + + def __call__(self, batch): + return self.collate(batch) + + +class DataLoader(torch.utils.data.DataLoader): + r"""Data loader which merges data objects from a + :class:`torch_geometric.data.dataset` to a mini-batch. + + Args: + dataset (Dataset): The dataset from which to load the data. + batch_size (int, optional): How many samples per batch to load. + (default: :obj:`1`) + shuffle (bool, optional): If set to :obj:`True`, the data will be + reshuffled at every epoch. (default: :obj:`False`) + follow_batch (list or tuple, optional): Creates assignment batch + vectors for each key in the list. (default: :obj:`[]`) + """ + + def __init__(self, dataset, batch_size=1, shuffle=False, follow_batch=[], + **kwargs): + super(DataLoader, + self).__init__(dataset, batch_size, shuffle, + collate_fn=Collater(follow_batch), **kwargs) + + +class DataListLoader(torch.utils.data.DataLoader): + r"""Data loader which merges data objects from a + :class:`torch_geometric.data.dataset` to a python list. + + .. note:: + + This data loader should be used for multi-gpu support via + :class:`torch_geometric.nn.DataParallel`. + + Args: + dataset (Dataset): The dataset from which to load the data. + batch_size (int, optional): How many samples per batch to load. 
+ (default: :obj:`1`) + shuffle (bool, optional): If set to :obj:`True`, the data will be + reshuffled at every epoch (default: :obj:`False`) + """ + + def __init__(self, dataset, batch_size=1, shuffle=False, **kwargs): + super(DataListLoader, self).__init__( + dataset, batch_size, shuffle, + collate_fn=lambda data_list: data_list, **kwargs) + + +class DenseCollater(object): + def collate(self, data_list): + batch = Batch() + for key in data_list[0].keys: + batch[key] = default_collate([d[key] for d in data_list]) + return batch + + def __call__(self, batch): + return self.collate(batch) + + +class DenseDataLoader(torch.utils.data.DataLoader): + r"""Data loader which merges data objects from a + :class:`torch_geometric.data.dataset` to a mini-batch. + + .. note:: + + To make use of this data loader, all graphs in the dataset needs to + have the same shape for each its attributes. + Therefore, this data loader should only be used when working with + *dense* adjacency matrices. + + Args: + dataset (Dataset): The dataset from which to load the data. + batch_size (int, optional): How many samples per batch to load. + (default: :obj:`1`) + shuffle (bool, optional): If set to :obj:`True`, the data will be + reshuffled at every epoch (default: :obj:`False`) + """ + + def __init__(self, dataset, batch_size=1, shuffle=False, **kwargs): + super(DenseDataLoader, self).__init__( + dataset, batch_size, shuffle, collate_fn=DenseCollater(), **kwargs) diff --git a/gdn/GDN/datasets/TimeDataset.py b/gdn/GDN/datasets/TimeDataset.py new file mode 100644 index 0000000..8eb0b4c --- /dev/null +++ b/gdn/GDN/datasets/TimeDataset.py @@ -0,0 +1,78 @@ +import torch +from torch.utils.data import Dataset, DataLoader + +import torch.nn.functional as F +from sklearn.preprocessing import MinMaxScaler, StandardScaler +import numpy as np + + +class TimeDataset(Dataset): + def __init__(self, raw_data, edge_index, mode='train', config = None): + self.raw_data = raw_data + + self.config = config + self.edge_index = edge_index + self.mode = mode + + x_data = raw_data[:-1] + labels = raw_data[-1] + + + data = x_data + + # to tensor + data = torch.tensor(data).double() + labels = torch.tensor(labels).double() + + self.x, self.y, self.labels = self.process(data, labels) + + def __len__(self): + return len(self.x) + + + def process(self, data, labels): + x_arr, y_arr = [], [] + labels_arr = [] + + slide_win, slide_stride = [self.config[k] for k + in ['slide_win', 'slide_stride'] + ] + is_train = self.mode == 'train' + + node_num, total_time_len = data.shape + + rang = range(slide_win, total_time_len, slide_stride) if is_train else range(slide_win, total_time_len) + + for i in rang: + + ft = data[:, i-slide_win:i] + tar = data[:, i] + + x_arr.append(ft) + y_arr.append(tar) + + labels_arr.append(labels[i]) + + + x = torch.stack(x_arr).contiguous() + y = torch.stack(y_arr).contiguous() + + labels = torch.Tensor(labels_arr).contiguous() + + return x, y, labels + + def __getitem__(self, idx): + + feature = self.x[idx].double() + y = self.y[idx].double() + + edge_index = self.edge_index.long() + + label = self.labels[idx].double() + + return feature, y, label, edge_index + + + + + diff --git a/gdn/GDN/evaluate.py b/gdn/GDN/evaluate.py new file mode 100644 index 0000000..ac71928 --- /dev/null +++ b/gdn/GDN/evaluate.py @@ -0,0 +1,173 @@ +from GDN.util.data import * +import numpy as np +from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score + + +def get_full_err_scores(test_result, val_result): + np_test_result = 
np.array(test_result) + np_val_result = np.array(val_result) + + all_scores = None + all_normals = None + feature_num = np_test_result.shape[-1] + + labels = np_test_result[2, :, 0].tolist() + + for i in range(feature_num): + test_re_list = np_test_result[:2, :, i] + val_re_list = np_val_result[:2, :, i] + + scores = get_err_scores(test_re_list, val_re_list) + normal_dist = get_err_scores(val_re_list, val_re_list) + + if all_scores is None: + all_scores = scores + all_normals = normal_dist + else: + all_scores = np.vstack(( + all_scores, + scores + )) + all_normals = np.vstack(( + all_normals, + normal_dist + )) + + return all_scores, all_normals + + +def get_final_err_scores(test_result, val_result): + full_scores, all_normals = get_full_err_scores(test_result, val_result, return_normal_scores=True) + + all_scores = np.max(full_scores, axis=0) + + return all_scores + + +def get_err_scores(test_res, val_res): + test_predict, test_gt = test_res + val_predict, val_gt = val_res + + n_err_mid, n_err_iqr = get_err_median_and_iqr(test_predict, test_gt) + + test_delta = np.abs(np.subtract( + np.array(test_predict).astype(np.float64), + np.array(test_gt).astype(np.float64) + )) + epsilon = 1e-2 + + err_scores = (test_delta - n_err_mid) / (np.abs(n_err_iqr) + epsilon) + + smoothed_err_scores = np.zeros(err_scores.shape) + before_num = 3 + for i in range(before_num, len(err_scores)): + smoothed_err_scores[i] = np.mean(err_scores[i - before_num:i + 1]) + + return smoothed_err_scores + + +def get_loss(predict, gt): + return eval_mseloss(predict, gt) + + +def get_f1_scores(total_err_scores, gt_labels, topk=1): + print('total_err_scores', total_err_scores.shape) + # remove the highest and lowest score at each timestep + total_features = total_err_scores.shape[0] + + # topk_indices = np.argpartition(total_err_scores, range(total_features-1-topk, total_features-1), axis=0)[-topk-1:-1] + topk_indices = np.argpartition(total_err_scores, range(total_features - topk - 1, total_features), axis=0)[-topk:] + + topk_indices = np.transpose(topk_indices) + + total_topk_err_scores = [] + topk_err_score_map = [] + # topk_anomaly_sensors = [] + + for i, indexs in enumerate(topk_indices): + sum_score = sum( + score for k, score in enumerate(sorted([total_err_scores[index, i] for j, index in enumerate(indexs)]))) + + total_topk_err_scores.append(sum_score) + + final_topk_fmeas = eval_scores(total_topk_err_scores, gt_labels, 400) + + return final_topk_fmeas + + +def get_val_performance_data(total_err_scores, normal_scores, gt_labels, topk=1): + total_features = total_err_scores.shape[0] + + topk_indices = np.argpartition(total_err_scores, range(total_features - topk - 1, total_features), axis=0)[-topk:] + + total_topk_err_scores = [] + topk_err_score_map = [] + + total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) + + thresold = np.max(normal_scores) + + pred_labels = np.zeros(len(total_topk_err_scores)) + pred_labels[total_topk_err_scores > thresold] = 1 + + for i in range(len(pred_labels)): + pred_labels[i] = int(pred_labels[i]) + gt_labels[i] = int(gt_labels[i]) + + pre = precision_score(gt_labels, pred_labels) + rec = recall_score(gt_labels, pred_labels) + + f1 = f1_score(gt_labels, pred_labels) + + auc_score = roc_auc_score(gt_labels, total_topk_err_scores) + + return f1, pre, rec, auc_score, thresold + + +def get_best_performance_data(total_err_scores, gt_labels, topk=1): + total_features = total_err_scores.shape[0] + + # topk_indices = 
np.argpartition(total_err_scores, range(total_features-1-topk, total_features-1), axis=0)[-topk-1:-1] + topk_indices = np.argpartition(total_err_scores, range(total_features - topk - 1, total_features), axis=0)[-topk:] + + total_topk_err_scores = [] + topk_err_score_map = [] + + total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) + + final_topk_fmeas, thresolds = eval_scores(total_topk_err_scores, gt_labels, 400, return_thresold=True) + + th_i = final_topk_fmeas.index(max(final_topk_fmeas)) + thresold = thresolds[th_i] + + pred_labels = np.zeros(len(total_topk_err_scores)) + pred_labels[total_topk_err_scores > thresold] = 1 + + for i in range(len(pred_labels)): + pred_labels[i] = int(pred_labels[i]) + gt_labels[i] = int(gt_labels[i]) + + pre = precision_score(gt_labels, pred_labels) + rec = recall_score(gt_labels, pred_labels) + + auc_score = roc_auc_score(gt_labels, total_topk_err_scores) + + return max(final_topk_fmeas), pre, rec, auc_score, thresold + + +def get_best_performance_data_sequence(total_err_scores, gt_labels, topk=1): + total_features = total_err_scores.shape[0] + + topk_indices = np.argpartition(total_err_scores, range(total_features - topk - 1, total_features), axis=0)[-topk:] + + total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) + + final_topk_fmeas, thresolds = eval_scores(total_topk_err_scores, gt_labels, 400, return_threshold=True) + + th_i = final_topk_fmeas.index(max(final_topk_fmeas)) + thresold = thresolds[th_i] + + pred_labels = np.zeros(len(total_topk_err_scores)) + pred_labels[total_topk_err_scores > thresold] = 1 + + return max(final_topk_fmeas), total_topk_err_scores, pred_labels, thresold, gt_labels diff --git a/gdn/GDN/install.sh b/gdn/GDN/install.sh new file mode 100644 index 0000000..5b4f4e7 --- /dev/null +++ b/gdn/GDN/install.sh @@ -0,0 +1,5 @@ +pip install --no-index torch-scatter -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html +pip install --no-index torch-sparse -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html +pip install --no-index torch-cluster -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html +pip install --no-index torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html +pip install torch-geometric==1.5.0 \ No newline at end of file diff --git a/gdn/GDN/main.py b/gdn/GDN/main.py new file mode 100644 index 0000000..e234d4a --- /dev/null +++ b/gdn/GDN/main.py @@ -0,0 +1,269 @@ +# -*- coding: utf-8 -*- +import pickle as pkl +from typing import List, Any + +import os +from pathlib import Path +import random +from datetime import datetime + +import numpy as np +import torch +from torch.utils.data import DataLoader, Subset + +from GDN.util.env import get_device, set_device +from GDN.util.preprocess import build_loc_net, construct_data +from GDN.util.net_struct import get_feature_map, get_fc_graph_struc +from GDN.evaluate import get_full_err_scores, get_best_performance_data, get_val_performance_data, \ + get_best_performance_data_sequence + +from GDN.datasets.TimeDataset import TimeDataset + +from GDN.models.GDN import GDNModule + +from GDN.train import train +from GDN.test import test + +import GDN.dataloader_fix + + +def GDNtrain(train_config: dict, env_config: dict) -> None: + feature_map = get_feature_map(env_config["dataset"]) + fc_struc = get_fc_graph_struc(env_config["dataset"]) + + set_device(env_config["device"] + if "device" in env_config else + "cpu") + device = get_device() + + fc_edge_index = 
build_loc_net(fc_struc, list(env_config["dataset"].columns), feature_map=feature_map) + fc_edge_index = torch.tensor(fc_edge_index, dtype=torch.long) + + train_ts = env_config["dataset"] + train_dataset_indata = construct_data(train_ts, feature_map, labels=0) + + cfg = { + 'slide_win': train_config['slide_win'], + 'slide_stride': train_config['slide_stride'], + } + + train_dataset = TimeDataset(train_dataset_indata, fc_edge_index, mode='train', config=cfg) + train_dataloader, val_dataloader = get_loaders(train_dataset, train_config['seed'], train_config['batch'], + val_ratio=train_config['val_ratio']) + full_train_dataloader = DataLoader(train_dataset, batch_size=train_config['batch'], + shuffle=False, num_workers=0) + + edge_index_sets = [] + edge_index_sets.append(fc_edge_index) + + model = create_gdn_model(train_config, edge_index_sets, + feature_map, device) + + save_path = get_save_path(env_config["modelOutput"]) + + train_log = train(model, save_path[0], + config=train_config, + train_dataloader=train_dataloader, + val_dataloader=val_dataloader, + feature_map=feature_map, + test_dataloader=None, + test_dataset=None, + train_dataset=train_dataset, + dataset_name=env_config['dataset'] + ) + + _, train_result = test(model, full_train_dataloader) + _, val_result = test(model, val_dataloader) + top1_best_info = get_score(train_result, val_result) + + save_result_output(top1_best_info[4], env_config) + + save_config_element(save_path, + train_config, feature_map, fc_edge_index, val_result) + + +def GDNtest(env_config: dict) -> None: + elements = load_config_element(env_config["modelInput"]) + train_config, feature_map, fc_edge_index, val_result = elements[0], elements[1], elements[2], elements[3] + + set_device(env_config["device"] + if "device" in env_config else + "cpu") + device = get_device() + + edge_index_sets = [] + edge_index_sets.append(fc_edge_index) + + test_ts = env_config["dataset"] + test_dataset_indata = construct_data(test_ts, feature_map, + labels=0) # test_ts["is_anomaly"].tolist()) + + cfg = { + 'slide_win': train_config['slide_win'], + 'slide_stride': train_config['slide_stride'], + } + + test_dataset = TimeDataset(test_dataset_indata, fc_edge_index, mode='test', config=cfg) + + test_dataloader = DataLoader(test_dataset, batch_size=train_config['batch'], + shuffle=False, num_workers=0) + + model_load_path = get_save_path(env_config["modelInput"])[0] + + model = create_gdn_model(train_config, edge_index_sets, + feature_map, device) + + model.load_state_dict(torch.load(model_load_path)) + best_model = model.to(device) + + _, test_result = test(best_model, test_dataloader) + top1_best_info = get_score(test_result, val_result) + + save_result_output(top1_best_info[4], env_config) + + +def create_gdn_model(train_config, + edge_index_sets, feature_map, + device) -> GDN: + return GDNModule(edge_index_sets, len(feature_map), + dim=train_config['dim'], + input_dim=train_config['slide_win'], + out_layer_num=train_config['out_layer_num'], + out_layer_inter_dim=train_config['out_layer_inter_dim'], + topk=train_config['topk'] + ).to(device) + + +def get_loaders(train_dataset, seed, batch, val_ratio=0.1): + dataset_len = int(len(train_dataset)) + train_use_len = int(dataset_len * (1 - val_ratio)) + val_use_len = int(dataset_len * val_ratio) + val_start_index = random.randrange(train_use_len) + indices = torch.arange(dataset_len) + + train_sub_indices = torch.cat([indices[:val_start_index], indices[val_start_index + val_use_len:]]) + train_subset = Subset(train_dataset, 
train_sub_indices) + + val_sub_indices = indices[val_start_index:val_start_index + val_use_len] + val_subset = Subset(train_dataset, val_sub_indices) + + train_dataloader = DataLoader(train_subset, batch_size=batch, + shuffle=True) + + val_dataloader = DataLoader(val_subset, batch_size=batch, + shuffle=False) + + return train_dataloader, val_dataloader + + +def get_score(full_result, val_result): + np_result = np.array(full_result) + + labels = np_result[2, :, 0].tolist() + + scores, _ = get_full_err_scores(full_result, val_result) + + top1_info = get_best_performance_data_sequence(scores, labels, topk=1) + + return top1_info + + +def get_save_path(model_path: str): + base_dir = os.path.dirname(model_path) + + paths = [ + f'{model_path}', + f'{base_dir}/train_config.pkl', + f'{base_dir}/feature_map.pkl', + f'{base_dir}/fc_edge_index.pkl', + f'{base_dir}/val_result.pkl' + ] + + for path in paths: + dirname = os.path.dirname(path) + Path(dirname).mkdir(parents=True, exist_ok=True) + + return paths + + +def save_config_element(paths, train_config, feature_map, fc_edge_index, val_result) -> None: + for p, e in zip(paths[1:], + [train_config, feature_map, fc_edge_index, val_result]): + with open(p, 'wb') as file: + pkl.dump(e, file, protocol=pkl.HIGHEST_PROTOCOL) + + +def load_config_element(model_path) -> List[Any]: + paths = get_save_path(model_path) + elements = [] + for p in paths[1:]: + if not Path(p).exists(): + raise FileNotFoundError("Base element not found in required path." + "Run training first", p) + with open(p, 'rb') as file: + elements.append(pkl.load(file)) + return elements + + +def save_result_output(result, env_config) -> None: + path = env_config["dataOutput"] + np_result = np.array(result) + np.savetxt(path, np_result, delimiter=",") + +# if __name__ == "__main__": +# parser = argparse.ArgumentParser() +# +# parser.add_argument('-batch', help='batch size', type=int, default=128) +# parser.add_argument('-epoch', help='train epoch', type=int, default=100) +# parser.add_argument('-slide_win', help='slide_win', type=int, default=15) +# parser.add_argument('-dim', help='dimension', type=int, default=64) +# parser.add_argument('-slide_stride', help='slide_stride', type=int, default=5) +# parser.add_argument('-save_path_pattern', help='save path pattern', type=str, default='') +# parser.add_argument('-dataset', help='wadi / swat', type=str, default='wadi') +# parser.add_argument('-device', help='cuda / cpu', type=str, default='cuda') +# parser.add_argument('-random_seed', help='random seed', type=int, default=0) +# parser.add_argument('-comment', help='experiment comment', type=str, default='') +# parser.add_argument('-out_layer_num', help='outlayer num', type=int, default=1) +# parser.add_argument('-out_layer_inter_dim', help='out_layer_inter_dim', type=int, default=256) +# parser.add_argument('-decay', help='decay', type=float, default=0) +# parser.add_argument('-val_ratio', help='val ratio', type=float, default=0.1) +# parser.add_argument('-topk', help='topk num', type=int, default=20) +# parser.add_argument('-report', help='best / val', type=str, default='best') +# parser.add_argument('-load_model_path', help='trained model path', type=str, default='') +# +# args = parser.parse_args() +# +# random.seed(args.random_seed) +# np.random.seed(args.random_seed) +# torch.manual_seed(args.random_seed) +# torch.cuda.manual_seed(args.random_seed) +# torch.cuda.manual_seed_all(args.random_seed) +# torch.backends.cudnn.benchmark = False +# torch.backends.cudnn.deterministic = True +# 
os.environ['PYTHONHASHSEED'] = str(args.random_seed) +# +# train_config = { +# 'batch': args.batch, +# 'epoch': args.epoch, +# 'slide_win': args.slide_win, +# 'dim': args.dim, +# 'slide_stride': args.slide_stride, +# 'comment': args.comment, +# 'seed': args.random_seed, +# 'out_layer_num': args.out_layer_num, +# 'out_layer_inter_dim': args.out_layer_inter_dim, +# 'decay': args.decay, +# 'val_ratio': args.val_ratio, +# 'topk': args.topk, +# } +# +# env_config = { +# 'save_path': args.save_path_pattern, +# 'dataset': args.dataset, +# 'report': args.report, +# 'device': args.device, +# 'load_model_path': args.load_model_path +# } +# +# main = GDNMain(train_config, env_config, debug=False) +# main.run() diff --git a/gdn/GDN/models/GDN.py b/gdn/GDN/models/GDN.py new file mode 100644 index 0000000..0f97514 --- /dev/null +++ b/gdn/GDN/models/GDN.py @@ -0,0 +1,188 @@ +import numpy as np +import torch +import matplotlib.pyplot as plt +import torch.nn as nn +import time +from GDN.util.time import * +from GDN.util.env import * +from torch_geometric.nn import GCNConv, GATConv, EdgeConv +import math +import torch.nn.functional as F + +from .graph_layer import GraphLayer + + +def get_batch_edge_index(org_edge_index, batch_num, node_num): + # org_edge_index:(2, edge_num) + edge_index = org_edge_index.clone().detach() + edge_num = org_edge_index.shape[1] + batch_edge_index = edge_index.repeat(1,batch_num).contiguous() + + for i in range(batch_num): + batch_edge_index[:, i*edge_num:(i+1)*edge_num] += i*node_num + + return batch_edge_index.long() + + +class OutLayer(nn.Module): + def __init__(self, in_num, node_num, layer_num, inter_num = 512): + super(OutLayer, self).__init__() + + modules = [] + + for i in range(layer_num): + # last layer, output shape:1 + if i == layer_num-1: + modules.append(nn.Linear( in_num if layer_num == 1 else inter_num, 1)) + else: + layer_in_num = in_num if i == 0 else inter_num + modules.append(nn.Linear( layer_in_num, inter_num )) + modules.append(nn.BatchNorm1d(inter_num)) + modules.append(nn.ReLU()) + + self.mlp = nn.ModuleList(modules) + + def forward(self, x): + out = x + + for mod in self.mlp: + if isinstance(mod, nn.BatchNorm1d): + out = out.permute(0,2,1) + out = mod(out) + out = out.permute(0,2,1) + else: + out = mod(out) + + return out + + + +class GNNLayer(nn.Module): + def __init__(self, in_channel, out_channel, inter_dim=0, heads=1, node_num=100): + super(GNNLayer, self).__init__() + + + self.gnn = GraphLayer(in_channel, out_channel, inter_dim=inter_dim, heads=heads, concat=False) + + self.bn = nn.BatchNorm1d(out_channel) + self.relu = nn.ReLU() + self.leaky_relu = nn.LeakyReLU() + + def forward(self, x, edge_index, embedding=None, node_num=0): + + out, (new_edge_index, att_weight) = self.gnn(x, edge_index, embedding, return_attention_weights=True) + self.att_weight_1 = att_weight + self.edge_index_1 = new_edge_index + + out = self.bn(out) + + return self.relu(out) + + +class GDNModule(nn.Module): + def __init__(self, edge_index_sets, node_num, dim=64, out_layer_inter_dim=256, input_dim=10, out_layer_num=1, topk=20): + + super(GDNModule, self).__init__() + + self.edge_index_sets = edge_index_sets + + device = get_device() + + edge_index = edge_index_sets[0] + + + embed_dim = dim + self.embedding = nn.Embedding(node_num, embed_dim) + self.bn_outlayer_in = nn.BatchNorm1d(embed_dim) + + + edge_set_num = len(edge_index_sets) + self.gnn_layers = nn.ModuleList([ + GNNLayer(input_dim, dim, inter_dim=dim+embed_dim, heads=1) for i in range(edge_set_num) + ]) + + + 
self.node_embedding = None + self.topk = topk + self.learned_graph = None + + self.out_layer = OutLayer(dim*edge_set_num, node_num, out_layer_num, inter_num = out_layer_inter_dim) + + self.cache_edge_index_sets = [None] * edge_set_num + self.cache_embed_index = None + + self.dp = nn.Dropout(0.2) + + self.init_params() + + def init_params(self): + nn.init.kaiming_uniform_(self.embedding.weight, a=math.sqrt(5)) + + + def forward(self, data, org_edge_index): + + x = data.clone().detach() + edge_index_sets = self.edge_index_sets + + device = data.device + + batch_num, node_num, all_feature = x.shape + x = x.view(-1, all_feature).contiguous() + + + gcn_outs = [] + for i, edge_index in enumerate(edge_index_sets): + edge_num = edge_index.shape[1] + cache_edge_index = self.cache_edge_index_sets[i] + + if cache_edge_index is None or cache_edge_index.shape[1] != edge_num*batch_num: + self.cache_edge_index_sets[i] = get_batch_edge_index(edge_index, batch_num, node_num).to(device) + + batch_edge_index = self.cache_edge_index_sets[i] + + all_embeddings = self.embedding(torch.arange(node_num).to(device)) + + weights_arr = all_embeddings.detach().clone() + all_embeddings = all_embeddings.repeat(batch_num, 1) + + weights = weights_arr.view(node_num, -1) + + cos_ji_mat = torch.matmul(weights, weights.T) + normed_mat = torch.matmul(weights.norm(dim=-1).view(-1,1), weights.norm(dim=-1).view(1,-1)) + cos_ji_mat = cos_ji_mat / normed_mat + + dim = weights.shape[-1] + topk_num = self.topk + + topk_indices_ji = torch.topk(cos_ji_mat, topk_num, dim=-1)[1] + + self.learned_graph = topk_indices_ji + + gated_i = torch.arange(0, node_num).T.unsqueeze(1).repeat(1, topk_num).flatten().to(device).unsqueeze(0) + gated_j = topk_indices_ji.flatten().unsqueeze(0) + gated_edge_index = torch.cat((gated_j, gated_i), dim=0) + + batch_gated_edge_index = get_batch_edge_index(gated_edge_index, batch_num, node_num).to(device) + gcn_out = self.gnn_layers[i](x, batch_gated_edge_index, node_num=node_num*batch_num, embedding=all_embeddings) + + + gcn_outs.append(gcn_out) + + x = torch.cat(gcn_outs, dim=1) + x = x.view(batch_num, node_num, -1) + + + indexes = torch.arange(0,node_num).to(device) + out = torch.mul(x, self.embedding(indexes)) + + out = out.permute(0,2,1) + out = F.relu(self.bn_outlayer_in(out)) + out = out.permute(0,2,1) + + out = self.dp(out) + out = self.out_layer(out) + out = out.view(-1, node_num) + + + return out + \ No newline at end of file diff --git a/gdn/GDN/models/graph_layer.py b/gdn/GDN/models/graph_layer.py new file mode 100644 index 0000000..77d9db2 --- /dev/null +++ b/gdn/GDN/models/graph_layer.py @@ -0,0 +1,124 @@ +import torch +from torch.nn import Parameter, Linear, Sequential, BatchNorm1d, ReLU +import torch.nn.functional as F +from torch_geometric.nn.conv import MessagePassing +from torch_geometric.utils import remove_self_loops, add_self_loops, softmax + +from torch_geometric.nn.inits import glorot, zeros +import time +import math + +class GraphLayer(MessagePassing): + def __init__(self, in_channels, out_channels, heads=1, concat=True, + negative_slope=0.2, dropout=0, bias=True, inter_dim=-1,**kwargs): + super(GraphLayer, self).__init__(aggr='add', **kwargs) + + self.in_channels = in_channels + self.out_channels = out_channels + self.heads = heads + self.concat = concat + self.negative_slope = negative_slope + self.dropout = dropout + + self.__alpha__ = None + + self.lin = Linear(in_channels, heads * out_channels, bias=False) + + self.att_i = Parameter(torch.Tensor(1, heads, out_channels)) + 
self.att_j = Parameter(torch.Tensor(1, heads, out_channels)) + self.att_em_i = Parameter(torch.Tensor(1, heads, out_channels)) + self.att_em_j = Parameter(torch.Tensor(1, heads, out_channels)) + + if bias and concat: + self.bias = Parameter(torch.Tensor(heads * out_channels)) + elif bias and not concat: + self.bias = Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self): + glorot(self.lin.weight) + glorot(self.att_i) + glorot(self.att_j) + + zeros(self.att_em_i) + zeros(self.att_em_j) + + zeros(self.bias) + + + + def forward(self, x, edge_index, embedding, return_attention_weights=False): + """""" + if torch.is_tensor(x): + x = self.lin(x) + x = (x, x) + else: + x = (self.lin(x[0]), self.lin(x[1])) + + edge_index, _ = remove_self_loops(edge_index) + edge_index, _ = add_self_loops(edge_index, + num_nodes=x[1].size(self.node_dim)) + + out = self.propagate(edge_index, x=x, embedding=embedding, edges=edge_index, + return_attention_weights=return_attention_weights) + + if self.concat: + out = out.view(-1, self.heads * self.out_channels) + else: + out = out.mean(dim=1) + + if self.bias is not None: + out = out + self.bias + + if return_attention_weights: + alpha, self.__alpha__ = self.__alpha__, None + return out, (edge_index, alpha) + else: + return out + + def message(self, x_i, x_j, edge_index_i, size_i, + embedding, + edges, + return_attention_weights): + + x_i = x_i.view(-1, self.heads, self.out_channels) + x_j = x_j.view(-1, self.heads, self.out_channels) + + if embedding is not None: + embedding_i, embedding_j = embedding[edge_index_i], embedding[edges[0]] + embedding_i = embedding_i.unsqueeze(1).repeat(1,self.heads,1) + embedding_j = embedding_j.unsqueeze(1).repeat(1,self.heads,1) + + key_i = torch.cat((x_i, embedding_i), dim=-1) + key_j = torch.cat((x_j, embedding_j), dim=-1) + + + + cat_att_i = torch.cat((self.att_i, self.att_em_i), dim=-1) + cat_att_j = torch.cat((self.att_j, self.att_em_j), dim=-1) + + alpha = (key_i * cat_att_i).sum(-1) + (key_j * cat_att_j).sum(-1) + + + alpha = alpha.view(-1, self.heads, 1) + + + alpha = F.leaky_relu(alpha, self.negative_slope) + alpha = softmax(alpha, edge_index_i, size_i) + + if return_attention_weights: + self.__alpha__ = alpha + + alpha = F.dropout(alpha, p=self.dropout, training=self.training) + + return x_j * alpha.view(-1, self.heads, 1) + + + + def __repr__(self): + return '{}({}, {}, heads={})'.format(self.__class__.__name__, + self.in_channels, + self.out_channels, self.heads) diff --git a/gdn/GDN/run.sh b/gdn/GDN/run.sh new file mode 100644 index 0000000..ca6ba59 --- /dev/null +++ b/gdn/GDN/run.sh @@ -0,0 +1,59 @@ +gpu_n=$1 +DATASET=$2 + +seed=5 +BATCH_SIZE=32 +SLIDE_WIN=5 +dim=64 +out_layer_num=1 +SLIDE_STRIDE=1 +topk=5 +out_layer_inter_dim=128 +val_ratio=0.2 +decay=0 + + +path_pattern="${DATASET}" +COMMENT="${DATASET}" + +EPOCH=30 +report='best' + +if [[ "$gpu_n" == "cpu" ]]; then + python main.py \ + -dataset $DATASET \ + -save_path_pattern $path_pattern \ + -slide_stride $SLIDE_STRIDE \ + -slide_win $SLIDE_WIN \ + -batch $BATCH_SIZE \ + -epoch $EPOCH \ + -comment $COMMENT \ + -random_seed $seed \ + -decay $decay \ + -dim $dim \ + -out_layer_num $out_layer_num \ + -out_layer_inter_dim $out_layer_inter_dim \ + -decay $decay \ + -val_ratio $val_ratio \ + -report $report \ + -topk $topk \ + -device 'cpu' +else + CUDA_VISIBLE_DEVICES=$gpu_n python main.py \ + -dataset $DATASET \ + -save_path_pattern $path_pattern \ + -slide_stride $SLIDE_STRIDE \ + 
-slide_win $SLIDE_WIN \ + -batch $BATCH_SIZE \ + -epoch $EPOCH \ + -comment $COMMENT \ + -random_seed $seed \ + -decay $decay \ + -dim $dim \ + -out_layer_num $out_layer_num \ + -out_layer_inter_dim $out_layer_inter_dim \ + -decay $decay \ + -val_ratio $val_ratio \ + -report $report \ + -topk $topk +fi \ No newline at end of file diff --git a/gdn/GDN/test.py b/gdn/GDN/test.py new file mode 100644 index 0000000..80ee248 --- /dev/null +++ b/gdn/GDN/test.py @@ -0,0 +1,62 @@ +import torch.nn as nn +import time +from GDN.util.time import * +from GDN.util.env import * + + +def test(model, dataloader): + # test + loss_func = nn.MSELoss(reduction='mean') + device = get_device() + + test_loss_list = [] + now = time.time() + + test_predicted_list = [] + test_ground_list = [] + test_labels_list = [] + + t_test_predicted_list = [] + t_test_ground_list = [] + t_test_labels_list = [] + + test_len = len(dataloader) + + model.eval() + + i = 0 + acu_loss = 0 + for x, y, labels, edge_index in dataloader: + x, y, labels, edge_index = [item.to(device).float() for item in [x, y, labels, edge_index]] + + with torch.no_grad(): + predicted = model(x, edge_index).float().to(device) + + loss = loss_func(predicted, y) + + labels = labels.unsqueeze(1).repeat(1, predicted.shape[1]) + + if len(t_test_predicted_list) <= 0: + t_test_predicted_list = predicted + t_test_ground_list = y + t_test_labels_list = labels + else: + t_test_predicted_list = torch.cat((t_test_predicted_list, predicted), dim=0) + t_test_ground_list = torch.cat((t_test_ground_list, y), dim=0) + t_test_labels_list = torch.cat((t_test_labels_list, labels), dim=0) + + test_loss_list.append(loss.item()) + acu_loss += loss.item() + + i += 1 + + if i % 10000 == 1 and i > 1: + print(timeSincePlus(now, i / test_len)) + + test_predicted_list = t_test_predicted_list.tolist() + test_ground_list = t_test_ground_list.tolist() + test_labels_list = t_test_labels_list.tolist() + + avg_loss = sum(test_loss_list) / len(test_loss_list) + + return avg_loss, [test_predicted_list, test_ground_list, test_labels_list] diff --git a/gdn/GDN/train.py b/gdn/GDN/train.py new file mode 100644 index 0000000..e47c0c2 --- /dev/null +++ b/gdn/GDN/train.py @@ -0,0 +1,96 @@ +import time + +from GDN.test import test +from GDN.util.time import * +from GDN.util.env import * + +import torch +import torch.nn.functional as F + + +def loss_func(y_pred, y_true): + loss = F.mse_loss(y_pred, y_true, reduction='mean') + + return loss + + +def train(model=None, save_path='', config={}, train_dataloader=None, val_dataloader=None, feature_map={}, + test_dataloader=None, test_dataset=None, dataset_name='swat', train_dataset=None): + seed = config['seed'] + + optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=config['decay']) + + now = time.time() + + train_loss_list = [] + cmp_loss_list = [] + + device = get_device() + + acu_loss = 0 + min_loss = 1e+8 + min_f1 = 0 + min_pre = 0 + best_prec = 0 + + i = 0 + epoch = config['epoch'] + early_stop_win = 15 + + model.train() + + log_interval = 1000 + stop_improve_count = 0 + + dataloader = train_dataloader + + for i_epoch in range(epoch): + + acu_loss = 0 + model.train() + + for x, labels, attack_labels, edge_index in dataloader: + _start = time.time() + + x, labels, edge_index = [item.float().to(device) for item in [x, labels, edge_index]] + + optimizer.zero_grad() + out = model(x, edge_index).float().to(device) + loss = loss_func(out, labels) + + loss.backward() + optimizer.step() + + train_loss_list.append(loss.item()) + acu_loss += 
loss.item() + + i += 1 + + # each epoch + print('epoch ({} / {}) (Loss:{:.8f}, ACU_loss:{:.8f})'.format( + i_epoch, epoch, + acu_loss / len(dataloader), acu_loss), flush=True + ) + + # use val dataset to judge + if val_dataloader is not None: + + val_loss, val_result = test(model, val_dataloader) + + if val_loss < min_loss: + torch.save(model.state_dict(), save_path) + + min_loss = val_loss + stop_improve_count = 0 + else: + stop_improve_count += 1 + + if stop_improve_count >= early_stop_win: + break + + else: + if acu_loss < min_loss: + torch.save(model.state_dict(), save_path) + min_loss = acu_loss + + return train_loss_list diff --git a/gdn/GDN/util/data.py b/gdn/GDN/util/data.py new file mode 100644 index 0000000..b6518d9 --- /dev/null +++ b/gdn/GDN/util/data.py @@ -0,0 +1,125 @@ +# util functions about data + +from scipy.stats import rankdata, iqr, trim_mean +from sklearn.metrics import f1_score, mean_squared_error +import numpy as np +from numpy import percentile + + +def get_attack_interval(attack): + heads = [] + tails = [] + for i in range(len(attack)): + if attack[i] == 1: + if attack[i - 1] == 0: + heads.append(i) + + if i < len(attack) - 1 and attack[i + 1] == 0: + tails.append(i) + elif i == len(attack) - 1: + tails.append(i) + res = [] + for i in range(len(heads)): + res.append((heads[i], tails[i])) + # print(heads, tails) + return res + + +# calculate F1 scores +def eval_scores(scores, true_scores, th_steps, return_threshold=False): + padding_list = [0] * (len(true_scores) - len(scores)) + # print(padding_list) + + if len(padding_list) > 0: + scores = padding_list + scores + + scores_sorted = rankdata(scores, method='ordinal') + th_steps = th_steps + # th_steps = 500 + th_vals = np.array(range(th_steps)) * 1.0 / th_steps + fmeas = [None] * th_steps + thresholds = [None] * th_steps + for i in range(th_steps): + cur_pred = scores_sorted > th_vals[i] * len(scores) + + fmeas[i] = f1_score(true_scores, cur_pred) + + score_index = scores_sorted.tolist().index(int(th_vals[i] * len(scores) + 1)) + thresholds[i] = scores[score_index] + + if return_threshold: + return fmeas, thresholds + return fmeas + + +def eval_mseloss(predicted, ground_truth): + ground_truth_list = np.array(ground_truth) + predicted_list = np.array(predicted) + + # mask = (ground_truth_list == 0) | (predicted_list == 0) + + # ground_truth_list = ground_truth_list[~mask] + # predicted_list = predicted_list[~mask] + + # neg_mask = predicted_list < 0 + # predicted_list[neg_mask] = 0 + + # err = np.abs(predicted_list / ground_truth_list - 1) + # acc = (1 - np.mean(err)) + + # return loss + loss = mean_squared_error(predicted_list, ground_truth_list) + + return loss + + +def get_err_median_and_iqr(predicted, groundtruth): + np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) + + err_median = np.median(np_arr) + err_iqr = iqr(np_arr) + + return err_median, err_iqr + + +def get_err_median_and_quantile(predicted, groundtruth, percentage): + np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) + + err_median = np.median(np_arr) + # err_iqr = iqr(np_arr) + err_delta = percentile(np_arr, int(percentage * 100)) - percentile(np_arr, int((1 - percentage) * 100)) + + return err_median, err_delta + + +def get_err_mean_and_quantile(predicted, groundtruth, percentage): + np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) + + err_median = trim_mean(np_arr, percentage) + # err_iqr = iqr(np_arr) + err_delta = percentile(np_arr, int(percentage * 100)) - percentile(np_arr, 
int((1 - percentage) * 100)) + + return err_median, err_delta + + +def get_err_mean_and_std(predicted, groundtruth): + np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) + + err_mean = np.mean(np_arr) + err_std = np.std(np_arr) + + return err_mean, err_std + + +def get_f1_score(scores, gt, contamination): + padding_list = [0] * (len(gt) - len(scores)) + # print(padding_list) + + threshold = percentile(scores, 100 * (1 - contamination)) + + if len(padding_list) > 0: + scores = padding_list + scores + + pred_labels = (scores > threshold).astype('int').ravel() + + return f1_score(gt, pred_labels) diff --git a/gdn/GDN/util/env.py b/gdn/GDN/util/env.py new file mode 100644 index 0000000..a89d37d --- /dev/null +++ b/gdn/GDN/util/env.py @@ -0,0 +1,15 @@ +import torch +import numpy as np + +_device = None + +def get_device(): + # return torch.device('cuda' if torch.cuda.is_available() else 'cpu') + return _device + +def set_device(dev): + global _device + _device = dev + +def init_work(worker_id, seed): + np.random.seed(seed + worker_id) diff --git a/gdn/GDN/util/iostream.py b/gdn/GDN/util/iostream.py new file mode 100644 index 0000000..e072789 --- /dev/null +++ b/gdn/GDN/util/iostream.py @@ -0,0 +1,106 @@ +from util.data import get_attack_interval +import time +from datetime import datetime +from pytz import utc, timezone +from util.time import timestamp2str +import json +import argparse +import numpy as np + +def printsep(): + print('='*40+'\n') + +def save_attack_infos(f1_scores, total_err_scores, labels, names, save_path, dataset, config): + slide_win=config['slide_win'] + down_len=config['down_len'] + + + if dataset == 'wadi' or dataset == 'wadi2': + s = '09/10/2017 18:00:00' + elif dataset == 'swat': + s = '28/12/2015 10:00:00' + start_s = int(time.mktime(datetime.strptime(s, "%d/%m/%Y %H:%M:%S").timetuple())) + cst8 = timezone('Asia/Shanghai') + fmt = '%m/%d %H:%M:%S' + + attack_inters = get_attack_interval(labels) + + save_infos = { + 'total_best_f1_score': f1_scores[0], + 'total_best_f1_score_topk': f1_scores[1], + 'total_best_f1_score_all': f1_scores[2], + 'attacks': [] + } + + indices_map = names + + indices = np.argmax(total_err_scores, axis=0).tolist() + anomaly_sensors = [ indices_map[index] for index in indices ] + + topk = 5 + topk_indices = np.argpartition(total_err_scores, -topk, axis=0)[-topk:] + topk_indices = np.transpose(topk_indices) + + topk_anomaly_sensors = [] + topk_err_score_map=[] + for i, indexs in enumerate(topk_indices): + # print(indexs) + topk_anomaly_sensors.append([indices_map[index] for index in indexs]) + + item = {} + for sensor, index in zip(topk_anomaly_sensors[i],indexs): + if sensor not in item: + item[sensor] = total_err_scores[index, i] + + topk_err_score_map.append(item) + + + for head, end in attack_inters: + attack_infos = {} + topk_attack_infos = {} + + head_t = timestamp2str(start_s+(head+slide_win)*down_len, fmt, cst8) + end_t = timestamp2str(start_s+(end+slide_win)*down_len, fmt, cst8) + # head_t = datetime.fromtimestamp(start_s+head).astimezone(cst8).strftime(fmt) + # end_t = datetime.fromtimestamp(start_s+end).astimezone(cst8).strftime(fmt) + + # print(f'\nattack from {head_t} to {end_t}:') + + for i in range(head, end): + # t = datetime.fromtimestamp(start_s+i).astimezone(cst8).strftime(fmt) + t = timestamp2str(start_s+(i+slide_win)*down_len, fmt, cst8) + max_sensor = anomaly_sensors[i] + topk_sensors = topk_anomaly_sensors[i] + + if max_sensor not in attack_infos: + attack_infos[max_sensor] = 0 + attack_infos[max_sensor] 
+= 1 + + # for anomaly_sensor in topk_sensors: + # if anomaly_sensor not in topk_attack_infos: + # topk_attack_infos[anomaly_sensor] = 0 + # topk_attack_infos[anomaly_sensor] += 1 + + for anomaly_sensor in topk_sensors: + if anomaly_sensor not in topk_attack_infos: + topk_attack_infos[anomaly_sensor] = 0 + topk_attack_infos[anomaly_sensor] += topk_err_score_map[i][anomaly_sensor] + + + # print('-------------------------------') + # print(f'total top 5 attack sensors from {head_t} to {end_t}:') + sorted_attack_infos = {k: v for k, v in sorted(attack_infos.items(), reverse=True, key=lambda item: item[1])} + sorted_topk_attack_infos = {k: v for k, v in sorted(topk_attack_infos.items(), reverse=True, key=lambda item: item[1])} + # for key, count in sorted_attack_infos.items()[:5]: + # print(key, count) + + save_infos['attacks'].append({ + 'start': head_t, + 'end': end_t, + 'sensors': list(sorted_attack_infos), + 'topk_sensors': list(sorted_topk_attack_infos), + 'topk_scores': list(sorted_topk_attack_infos.values()) + }) + + with open(save_path, 'w+') as outfile: + json.dump(save_infos, outfile, indent=4) diff --git a/gdn/GDN/util/net_struct.py b/gdn/GDN/util/net_struct.py new file mode 100644 index 0000000..8ae2513 --- /dev/null +++ b/gdn/GDN/util/net_struct.py @@ -0,0 +1,56 @@ +import pandas as pd + + +def get_feature_map(dataset: pd.DataFrame): + feature_list = [] + for ft in dataset.filter(like="value").columns.values.tolist(): + feature_list.append(ft.strip()) + + return feature_list + + +# graph is 'fully-connect' +def get_fc_graph_struc(dataset: pd.DataFrame): + struc_map = {} + feature_list = [] + for ft in dataset.filter(like="value").columns.values.tolist(): + feature_list.append(ft.strip()) + + for ft in feature_list: + if ft not in struc_map: + struc_map[ft] = [] + + for other_ft in feature_list: + if other_ft is not ft: + struc_map[ft].append(other_ft) + + return struc_map + +def get_prior_graph_struc(dataset): + feature_file = open(f'./data/{dataset}/features.txt', 'r') + + struc_map = {} + feature_list = [] + for ft in feature_file: + feature_list.append(ft.strip()) + + for ft in feature_list: + if ft not in struc_map: + struc_map[ft] = [] + for other_ft in feature_list: + if dataset == 'wadi' or dataset == 'wadi2': + # same group, 1_xxx, 2A_xxx, 2_xxx + if other_ft is not ft and other_ft[0] == ft[0]: + struc_map[ft].append(other_ft) + elif dataset == 'swat': + # FIT101, PV101 + if other_ft is not ft and other_ft[-3] == ft[-3]: + struc_map[ft].append(other_ft) + + + return struc_map + + +if __name__ == '__main__': + get_graph_struc() + \ No newline at end of file diff --git a/gdn/GDN/util/preprocess.py b/gdn/GDN/util/preprocess.py new file mode 100644 index 0000000..2d43f5e --- /dev/null +++ b/gdn/GDN/util/preprocess.py @@ -0,0 +1,116 @@ +# preprocess data +import numpy as np +import re + + +def get_most_common_features(target, all_features, max = 3, min = 3): + res = [] + main_keys = target.split('_') + + for feature in all_features: + if target == feature: + continue + + f_keys = feature.split('_') + common_key_num = len(list(set(f_keys) & set(main_keys))) + + if common_key_num >= min and common_key_num <= max: + res.append(feature) + + return res + +def build_net(target, all_features): + # get edge_indexes, and index_feature_map + main_keys = target.split('_') + edge_indexes = [ + [], + [] + ] + index_feature_map = [target] + + # find closest features(nodes): + parent_list = [target] + graph_map = {} + depth = 2 + + for i in range(depth): + for feature in parent_list: + 
children = get_most_common_features(feature, all_features) + + if feature not in graph_map: + graph_map[feature] = [] + + # exclude parent + pure_children = [] + for child in children: + if child not in graph_map: + pure_children.append(child) + + graph_map[feature] = pure_children + + if feature not in index_feature_map: + index_feature_map.append(feature) + p_index = index_feature_map.index(feature) + for child in pure_children: + if child not in index_feature_map: + index_feature_map.append(child) + c_index = index_feature_map.index(child) + + edge_indexes[1].append(p_index) + edge_indexes[0].append(c_index) + + parent_list = pure_children + + return edge_indexes, index_feature_map + + +def construct_data(data, feature_map, labels=0): + res = [] + + for feature in feature_map: + if feature in data.columns: + res.append(data.loc[:, feature].values.tolist()) + else: + print(feature, 'not exist in data') + # append labels as last + sample_n = len(res[0]) + + if type(labels) == int: + res.append([labels]*sample_n) + elif len(labels) == sample_n: + res.append(labels) + + return res + +def build_loc_net(struc, all_features, feature_map=[]): + + index_feature_map = feature_map + edge_indexes = [ + [], + [] + ] + for node_name, node_list in struc.items(): + if node_name not in all_features: + continue + + if node_name not in index_feature_map: + index_feature_map.append(node_name) + + p_index = index_feature_map.index(node_name) + for child in node_list: + if child not in all_features: + continue + + if child not in index_feature_map: + print(f'error: {child} not in index_feature_map') + # index_feature_map.append(child) + + c_index = index_feature_map.index(child) + # edge_indexes[0].append(p_index) + # edge_indexes[1].append(c_index) + edge_indexes[0].append(c_index) + edge_indexes[1].append(p_index) + + + + return edge_indexes \ No newline at end of file diff --git a/gdn/GDN/util/time.py b/gdn/GDN/util/time.py new file mode 100644 index 0000000..cafe271 --- /dev/null +++ b/gdn/GDN/util/time.py @@ -0,0 +1,28 @@ +import time +import math +from datetime import datetime +from pytz import utc, timezone + +def asMinutes(s): + m = math.floor(s / 60) + s -= m * 60 + return '%dm %ds' % (m, s) + + +def timeSincePlus(since, percent): + now = time.time() + s = now - since + es = s / (percent) + rs = es - s + return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) + + +def timeSince(since): + now = time.time() + s = now - since + m = math.floor(s / 60) + s -= m * 60 + return '%dm %ds' % (m, s) + +def timestamp2str(sec, fmt, tz): + return datetime.fromtimestamp(sec).astimezone(tz).strftime(fmt) diff --git a/gdn/LICENSE b/gdn/LICENSE new file mode 100644 index 0000000..89d9369 --- /dev/null +++ b/gdn/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021-2023 d-ailin and Sebastian Schmidl + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/gdn/README.md b/gdn/README.md new file mode 100644 index 0000000..c037090 --- /dev/null +++ b/gdn/README.md @@ -0,0 +1,19 @@ +# GDN + +||| +| :--- | :--- | +| Citekey | DengEtAl2021Graph | +| Source Code | [https://github.com/d-ailin/GDN](https://github.com/d-ailin/GDN) | +| Learning type | semi-supervised | +| Input dimensionality | multivariate | +||| + +## Dependencies + +- python 3 +- numpy +- pandas +- pytorch +- pytorch-geometric +- torch-scatter, +- torch-sparse, \ No newline at end of file diff --git a/gdn/__init__.py b/gdn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gdn/algorithm.py b/gdn/algorithm.py new file mode 100644 index 0000000..19b15fb --- /dev/null +++ b/gdn/algorithm.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +import argparse +import json +import sys +import numpy as np +import pandas as pd +import torch + +from dataclasses import dataclass + +from GDN.main import GDNtrain, GDNtest + + +@dataclass +class CustomParameters: + window_size: int = 15 + stride: int = 5 + latent_size: int = 64 + n_out_layers: int = 1 + out_layer_dimensionality: int = 1 + topk: int = 20 + epochs: int = 10 + batch_size: int = 128 + split: float = 0.9 + learning_rate_decay: float = 0.001 + random_state: int = 42 + device: str = 'cpu' + + +class AlgorithmArgs(argparse.Namespace): + @property + def ts(self) -> pd.DataFrame: + ds = self.df.iloc[:, 1:-1] + if ds.shape[1] == 1: + raise RuntimeError("No multivariate time series provided as dataset") + return ds + + @property + def tsa(self) -> pd.DataFrame: + return self.df.iloc[:, 1:] + + @property + def df(self) -> pd.DataFrame: + return pd.read_csv(self.dataInput) + + @staticmethod + def from_sys_args() -> 'AlgorithmArgs': + args: dict = json.loads(sys.argv[1]) + custom_parameter_keys = dir(CustomParameters()) + filtered_parameters = dict( + filter(lambda x: x[0] in custom_parameter_keys, args.get("customParameters", {}).items())) + args["customParameters"] = CustomParameters(**filtered_parameters) + return AlgorithmArgs(**args) + + +def train(args: AlgorithmArgs): + ts = args.ts + + train_config = { + "batch": args.customParameters.batch_size, + "epoch": args.customParameters.epochs, + "slide_win": args.customParameters.window_size, + "dim": args.customParameters.latent_size, + "slide_stride": args.customParameters.stride, + "comment": "TimeEval execution", + "seed": args.customParameters.random_state, + "out_layer_num": args.customParameters.n_out_layers, + "out_layer_inter_dim": args.customParameters.out_layer_dimensionality, + "decay": args.customParameters.learning_rate_decay, + "val_ratio": args.customParameters.split, + "topk": ts.shape[1] if args.customParameters.topk > ts.shape[1] else args.customParameters.topk + } + + # load data + env_config = { + "dataset": ts, + "dataOutput": args.dataOutput, + "modelOutput": args.modelOutput, + "device": args.customParameters.device + } + + GDNtrain(train_config, env_config) + + +def execute(args: AlgorithmArgs): + ts = args.ts + + env_config 
= { + "dataset": ts, + "dataOutput": args.dataOutput, + "modelInput": args.modelInput, + "device": args.customParameters.device + } + + GDNtest(env_config) + + +def set_random_state(config: AlgorithmArgs) -> None: + seed = config.customParameters.random_state + import random + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Wrong number of arguments specified; expected a single json-string!") + exit(1) + + args = AlgorithmArgs.from_sys_args() + set_random_state(args) + print(f"AlgorithmArgs: {args}") + + if args.executionType == "train": + train(args) + elif args.executionType == "execute": + execute(args) + else: + raise ValueError(f"Unknown execution type '{args.executionType}'; expected either 'train' or 'execute'!") diff --git a/gdn/manifest.json b/gdn/manifest.json new file mode 100644 index 0000000..d061cec --- /dev/null +++ b/gdn/manifest.json @@ -0,0 +1,98 @@ +{ + "title": "GDN", + "description": "Implementation of https://doi.org/10.1609/aaai.v35i5.16523", + "inputDimensionality": "multivariate", + "version": "0.1", + "authors": "Ailin Deng, Bryan Hooi", + "language": "Python", + "type": "Detector", + "mainFile": "algorithm.py", + "learningType": "semi-supervised", + "trainingStep": { + "parameters": [ + { + "name": "window_size", + "type": "int", + "defaultValue": 15, + "optional": "true", + "description": "Size of the sliding windows" + }, + { + "name": "stride", + "type": "int", + "defaultValue": 5, + "optional": "true", + "description": "Stride for the sliding windows" + }, + { + "name": "latent_size", + "type": "int", + "defaultValue": 64, + "optional": "true", + "description": "Dimensionality of the latent embedding space" + }, + { + "name": "n_out_layers", + "type": "int", + "defaultValue": 1, + "optional": "true", + "description": "Number of output layers" + }, + { + "name": "out_layer_dimensionality", + "type": "int", + "defaultValue": 1, + "optional": "true", + "description": "Interim dimensionality of the output layers." + }, + { + "name": "epochs", + "type": "int", + "defaultValue": 1, + "optional": "true", + "description": "Number of training iterations over entire dataset; recommended value: 100" + }, + { + "name": "batch_size", + "type": "int", + "defaultValue": 128, + "optional": "true", + "description": "Number of instances trained at the same time" + }, + { + "name": "split", + "type": "float", + "defaultValue": 0.9, + "optional": "true", + "description": "Train-validation split" + }, + { + "name": "learning_rate_decay", + "type": "float", + "defaultValue": 0.001, + "optional": "true", + "description": "Learning rate decay for Adam optimizer" + }, + { + "name": "random_state", + "type": "int", + "defaultValue": 42, + "optional": "true", + "description": "Seed for the random number generator" + } + ], + "modelInput": "none" + }, + "executionStep": { + "parameters": [ + { + "name": "random_state", + "type": "int", + "defaultValue": 42, + "optional": "true", + "description": "Seed for the random number generator" + } + ], + "modelInput": "required" + } +} diff --git a/gdn/requirements.txt b/gdn/requirements.txt new file mode 100644 index 0000000..66fc7d7 --- /dev/null +++ b/gdn/requirements.txt @@ -0,0 +1,5 @@ +numpy>=1.19.5 +pandas>=1.2.1 +torch-geometric==1.5.0 +torch-scatter==2.1.1 +torch-sparse==0.6.17
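
For a quick manual check of the new `gdn` image outside of the TimeEval pipeline, it can be invoked the same way as in the README example above. The snippet below is a minimal sketch, not a tested recipe: it assumes the image has been built and tagged as `registry.gitlab.hpi.de/akita/i/gdn:0.2.6`, that a multivariate dataset is available at `1-data/multi-dataset.csv`, and it only passes the default values of `CustomParameters` from `algorithm.py`.

```
# 1) train: fit GDN on the dataset and store the model artifacts under /results
docker run --rm \
  -v $(pwd)/1-data:/data:ro \
  -v $(pwd)/2-results:/results:rw \
  registry.gitlab.hpi.de/akita/i/gdn:0.2.6 execute-algorithm '{
    "executionType": "train",
    "dataInput": "/data/multi-dataset.csv",
    "dataOutput": "/results/anomaly_scores.ts",
    "modelInput": "/results/model.pkl",
    "modelOutput": "/results/model.pkl",
    "customParameters": { "epochs": 10, "window_size": 15, "batch_size": 128 }
  }'

# 2) execute: score the dataset with the trained model; anomaly scores are written to dataOutput
docker run --rm \
  -v $(pwd)/1-data:/data:ro \
  -v $(pwd)/2-results:/results:rw \
  registry.gitlab.hpi.de/akita/i/gdn:0.2.6 execute-algorithm '{
    "executionType": "execute",
    "dataInput": "/data/multi-dataset.csv",
    "dataOutput": "/results/anomaly_scores.ts",
    "modelInput": "/results/model.pkl",
    "modelOutput": "/results/model.pkl",
    "customParameters": {}
  }'
```

Note that `algorithm.py` rejects univariate input (it raises "No multivariate time series provided as dataset"), so the dataset used here must contain more than one value channel.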