From 9d2c1f1cbaebb1847f10b54a2c709bb9e5bdb8af Mon Sep 17 00:00:00 2001 From: dlagul Date: Fri, 23 Oct 2020 16:51:20 +0800 Subject: [PATCH] Add files via upload --- sdfvae/evaluation.py | 272 ++++++++++++++++++++++++++++ sdfvae/get_interval_anomaly.py | 67 +++++++ sdfvae/logger.py | 93 ++++++++++ sdfvae/model.py | 312 +++++++++++++++++++++++++++++++++ sdfvae/scripts.txt | 53 ++++++ sdfvae/tester.py | 201 +++++++++++++++++++++ sdfvae/trainer.py | 208 ++++++++++++++++++++++ sdfvae/util.py | 16 ++ 8 files changed, 1222 insertions(+) create mode 100644 sdfvae/evaluation.py create mode 100644 sdfvae/get_interval_anomaly.py create mode 100644 sdfvae/logger.py create mode 100644 sdfvae/model.py create mode 100644 sdfvae/scripts.txt create mode 100644 sdfvae/tester.py create mode 100644 sdfvae/trainer.py create mode 100644 sdfvae/util.py diff --git a/sdfvae/evaluation.py b/sdfvae/evaluation.py new file mode 100644 index 0000000..1d47212 --- /dev/null +++ b/sdfvae/evaluation.py @@ -0,0 +1,272 @@ +import argparse +import os +import numpy as np +from get_interval_anomaly import IntervalAnomaly +import time +from logger import Logger + +class Evaluator(): + def __init__(self, anomaly_score_label_file, th_range = [-10,10], th_step = 1, log_path='', log_file=''): + self.anomaly_score_label_file = anomaly_score_label_file + self.th_range = th_range + self.th_step = th_step + self.log_path = log_path + self.log_file = log_file + self.label = [] + self.eval_metrics = {} + self.best_eval_metrics = '' + self.f1_best = 0 + self.pr_auc = 0 + self.ground_truth_anomaly_intervals = [] + self.detected_anomaly_intervals = [] + self.reconstructed_anomaly_intervals = [] + self.detected_result = [] + self.timestamp_detected_result = [] + self.logger = Logger(self.log_path, self.log_file) + + def get_ground_truth_anomaly_intervals(self, timestamp_anomalyscore_label): + ground_truth_anomaly_intervals = [] + IA_ground_truth = IntervalAnomaly() + for i in range(len(timestamp_anomalyscore_label[0])): + if timestamp_anomalyscore_label[2][i] == "Anomaly": + isAnomaly = True + else: + isAnomaly = False + IA_ground_truth.IntervalAnomalyDetect(timestamp_anomalyscore_label[0][i], isAnomaly, ground_truth_anomaly_intervals) + del IA_ground_truth + return ground_truth_anomaly_intervals + + def get_detected_anomaly_intervals(self, th, timestamp_anomalyscore_label): + anomaly_detected = [] + detected_anomaly_intervals = [] + for idx in range(len(timestamp_anomalyscore_label[0])): + if float(timestamp_anomalyscore_label[1][idx]) <= th: + anomaly_detected.append(True) + else: + anomaly_detected.append(False) + IA_detected = IntervalAnomaly() + for k in range(len(anomaly_detected)): + IA_detected.IntervalAnomalyDetect(timestamp_anomalyscore_label[0][k],anomaly_detected[k],detected_anomaly_intervals) + del IA_detected + return detected_anomaly_intervals + + def get_reconstruct_detected_anomaly_intervals(self, timestamp_anomalyscore_label, + ground_truth_anomaly_intervals, detected_anomaly_intervals): + detected_anomaly_intervals_tmp = detected_anomaly_intervals + for i in range(len(timestamp_anomalyscore_label[0])): + current_timestamp = int(time.mktime(time.strptime(timestamp_anomalyscore_label[0][i], "%Y%m%d%H%M%S"))) + for j in range(len(ground_truth_anomaly_intervals)): + start_gt = int(time.mktime(time.strptime(ground_truth_anomaly_intervals[j][0], "%Y%m%d%H%M%S"))) + end_gt = int(time.mktime(time.strptime(ground_truth_anomaly_intervals[j][1], "%Y%m%d%H%M%S"))) + if current_timestamp >= start_gt and current_timestamp <= end_gt: + for 
k in range(len(detected_anomaly_intervals)): + start_dt = int(time.mktime(time.strptime(detected_anomaly_intervals[k][0], "%Y%m%d%H%M%S"))) + end_dt = int(time.mktime(time.strptime(detected_anomaly_intervals[k][1], "%Y%m%d%H%M%S"))) + if current_timestamp >= start_dt and current_timestamp <= end_dt: + detected_anomaly_intervals_tmp[k] = ground_truth_anomaly_intervals[j] + return detected_anomaly_intervals_tmp + + def get_detected_result(self, reconstruct_detected_anomaly_intervals,timestamp_anomalyscore_label): + timestamp_detected_result = [] + detected_result = [] + for i in range(len(timestamp_anomalyscore_label[0])): + current_timestamp = int(time.mktime(time.strptime(timestamp_anomalyscore_label[0][i], "%Y%m%d%H%M%S"))) + flag = False + for j in range(len(reconstruct_detected_anomaly_intervals)): + start_gt = int(time.mktime(time.strptime(reconstruct_detected_anomaly_intervals[j][0], "%Y%m%d%H%M%S"))) + end_gt = int(time.mktime(time.strptime(reconstruct_detected_anomaly_intervals[j][1], "%Y%m%d%H%M%S"))) + if current_timestamp >= start_gt and current_timestamp <= end_gt: + flag = True + break + timestamp_detected_result.append(str(timestamp_anomalyscore_label[0][i])+","+str(flag)) + detected_result.append(flag) + return timestamp_detected_result,detected_result + + def get_metrics(self, label, detected_result): + assert len(label) == len(detected_result) + TP = 0 + FP = 0 + TN = 0 + FN = 0 + for i in range(len(label)): + if label[i]==True and detected_result[i] == True: + TP = TP+1 + elif label[i]==True and detected_result[i] == False: + FN = FN+1 + elif label[i]==False and detected_result[i] == False: + TN = TN+1 + elif label[i]==False and detected_result[i] == True: + FP = FP+1 + if TP+FP-0 != 0: + precision = TP/(TP+FP) + else: + precision = 0 + if TP+FN-0 != 0: + recall = TP/(TP+FN) + tpr = TP/(TP+FN) + fnr = FN/(TP+FN) + else: + recall = 0 + fnr = 0 + tpr = 0 + if FP+TN-0 != 0: + tnr = TN/(FP+TN) + fpr = FP/(FP+TN) + else: + tnr = 0 + fpr = 0 + if precision+recall-0 != 0: + f1 = 2*precision*recall/(precision+recall) + else: + f1 = 0 + return TP, FN, TN, FP, precision, recall, f1, fpr, tpr + + def get_label(self,timestamp_anomalyscore_label): + label = [] + for idx in range(len(timestamp_anomalyscore_label[2])): + if timestamp_anomalyscore_label[2][idx] == "Anomaly": + label.append(True) + else: + label.append(False) + return label + + + def perform_evaluating(self): + timestamp_anomalyscore_label1 = np.loadtxt(self.anomaly_score_label_file, delimiter=',', dtype=bytes, unpack=False).astype(str) + timestamp_anomalyscore_label2 = timestamp_anomalyscore_label1.tolist() + timestamp_anomalyscore_label2.sort() + timestamp_anomalyscore_label3 = [[],[],[]] + for i in range(len(timestamp_anomalyscore_label2)): + timestamp_anomalyscore_label3[0].append(timestamp_anomalyscore_label2[i][0]) + timestamp_anomalyscore_label3[1].append(timestamp_anomalyscore_label2[i][1]) + timestamp_anomalyscore_label3[2].append(timestamp_anomalyscore_label2[i][2]) + timestamp_anomalyscore_label = np.array(timestamp_anomalyscore_label3) + anomaly_score_min = np.min(timestamp_anomalyscore_label[1].astype(float),axis=0) + anomaly_score_max = np.max(timestamp_anomalyscore_label[1].astype(float),axis=0) + if self.th_range[1] >= anomaly_score_max: + if self.th_range[0] >= anomaly_score_min: + threshold_candidates = [t for t in np.arange(self.th_range[0],anomaly_score_max,self.th_step)] + else: + threshold_candidates = [t for t in np.arange(anomaly_score_min,anomaly_score_max,self.th_step)] + else: + if 
self.th_range[0] >= anomaly_score_min: + threshold_candidates = [t for t in np.arange(self.th_range[0],self.th_range[1],self.th_step)] + else: + threshold_candidates = [t for t in np.arange(anomaly_score_min,self.th_range[1],self.th_step)] + self.ground_truth_anomaly_intervals = self.get_ground_truth_anomaly_intervals(timestamp_anomalyscore_label) + fscore = {} + for th in threshold_candidates: + self.detected_anomaly_intervals = self.get_detected_anomaly_intervals(th, timestamp_anomalyscore_label) + self.reconstruct_detected_anomaly_intervals = self.get_reconstruct_detected_anomaly_intervals(timestamp_anomalyscore_label, + self.ground_truth_anomaly_intervals, self.detected_anomaly_intervals) + self.timestamp_detected_result,self.detected_result = self.get_detected_result(self.reconstruct_detected_anomaly_intervals, + timestamp_anomalyscore_label) + self.label = self.get_label(timestamp_anomalyscore_label) + TP, FN, TN, FP, precision, recall, f1, fpr, tpr = self.get_metrics(self.label, self.detected_result) + fscore[f1] = "th:{}, p:{}, r:{}, f1score:{}, TP:{}, FN:{}, TN:{}, FP:{}, FPR:{}, TPR:{}".format( + th, precision, recall, f1,TP, FN, TN, FP,fpr, tpr) + self.eval_metrics['Th'] = th + self.eval_metrics['P'] = precision + self.eval_metrics['R'] = recall + self.eval_metrics['F1score'] = f1 + self.eval_metrics['TP'] = TP + self.eval_metrics['FN']= FN + self.eval_metrics['TN'] = TN + self.eval_metrics['FP'] = FP + self.eval_metrics['Fpr'] = fpr + self.eval_metrics['Tpr'] = tpr + self.logger.log_evaluator(self.eval_metrics) + # If the recall has been reached to 1.0, we break the loop, due to the best f1-score has been achieved + # Since as the threshold increases, recall remains unchanged (1.0), while precision decreases and thus f1-score decreases + if float(recall) < 1.0: + continue + elif float(recall) == 1.0: + break + + fscore_sorted_by_key = sorted(fscore.items(), key=lambda d:d[0], reverse = True) + self.f1_best = fscore_sorted_by_key[0][0] + self.best_eval_metrics = fscore_sorted_by_key[0][1] + self.logger.log_evaluator_re("f1-best is: {}".format(fscore_sorted_by_key[0][0])) + self.logger.log_evaluator_re("details: {}".format(fscore_sorted_by_key[0][1])) + + +def main(): + + parser = argparse.ArgumentParser() + # GPU option + parser.add_argument('--gpu_id', type=int, default=0) + # Dataset options + parser.add_argument('--dataset_path', type=str, default='') + parser.add_argument('--data_nums', type=int, default=0) + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--num_workers', type=int, default=4) + parser.add_argument('--T', type=int, default=20) + parser.add_argument('--win_size', type=int, default=36) + parser.add_argument('--l', type=int, default=10) + parser.add_argument('--n', type=int, default=24) + + # Model options + parser.add_argument('--s_dims', type=int, default=8) + parser.add_argument('--d_dims', type=int, default=10) + parser.add_argument('--conv_dims', type=int, default=100) + parser.add_argument('--hidden_dims', type=int, default=40) + parser.add_argument('--enc_dec', type=str, default='CNN') + + # Training options + parser.add_argument('--learning_rate', type=float, default=0.0002) + parser.add_argument('--epochs', type=int, default=50) + parser.add_argument('--start_epoch', type=int, default=0) + parser.add_argument('--checkpoints_path', type=str, default='') + parser.add_argument('--checkpoints_interval', type=int, default=10) + parser.add_argument('--log_path', type=str, default='log_evaluator') + 
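    # Editor's note (assumption, not part of the original patch): the nested
    # range checks in perform_evaluating() above boil down to clipping the
    # user-supplied sweep range to the observed anomaly-score range, i.e.
    #   lo = max(th_range[0], anomaly_score_min)
    #   hi = min(th_range[1], anomaly_score_max)
    #   threshold_candidates = np.arange(lo, hi, th_step)
    # A timestamp is then flagged as anomalous when its score (the per-window
    # log-likelihood written by tester.py) is <= the current threshold.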
parser.add_argument('--log_file', type=str, default='') + + parser.add_argument('--llh_path', type=str, default='log_tester') + parser.add_argument('--llh_file', type=str, default='') + + parser.add_argument('--th_min', type=float, default=-50) + parser.add_argument('--th_max', type=float, default=10) + parser.add_argument('--th_step', type=float, default=0.2) + + args = parser.parse_args() + + if args.llh_file == '': + args.ll_file = 'sdim{}_ddim{}_cdim{}_hdim{}_winsize{}_T{}_l{}_epochs{}_loss.txt'.format( + args.s_dims, + args.d_dims, + args.conv_dims, + args.hidden_dims, + args.win_size, + args.T, + args.l, + args.start_epoch) + + if args.log_file == '': + args.log_file = 'sdim{}_ddim{}_cdim{}_hdim{}_winsize{}_T{}_l{}_epochs{}_eval_records'.format( + args.s_dims, + args.d_dims, + args.conv_dims, + args.hidden_dims, + args.win_size, + args.T, + args.l, + args.start_epoch) + + if not os.path.exists(os.path.join(args.llh_path,args.llh_file)): + raise ValueError('Unknown anomaly score label file: {}'.format(args.llh_path)) + + if not os.path.exists(args.log_path): + os.makedirs(args.log_path) + + anomaly_score_label_file = os.path.join(args.llh_path,args.ll_file) + evaluator = Evaluator(anomaly_score_label_file, + th_range = [args.th_min,args.th_max], + th_step = args.th_step, + log_path = args.log_path, + log_file = args.log_file) + + evaluator.perform_evaluating() + +if __name__ == '__main__': + main() + diff --git a/sdfvae/get_interval_anomaly.py b/sdfvae/get_interval_anomaly.py new file mode 100644 index 0000000..153515c --- /dev/null +++ b/sdfvae/get_interval_anomaly.py @@ -0,0 +1,67 @@ +import time, datetime +import os +class IntervalAnomaly: + def __init__(self): + self.isLastIntervalAnomaly = False + self.timeWindow = [] + self.anomalyWindow = [] + self.anomalyIntervalStart = "" + # self.anomalyIntervalEnd = "20190101000001" + self.anomalyIntervalEnd = "20180101000001" + self.windowsize = 3 + self.IntervalAnomalyThreshold = 1 + self.IntervalMergeThreshold = 300 + + def IntervalAnomalyDetect(self, realtime, isAnomaly, anomaly_intervals): + ''' + params: + realtime: string, as "20180101000001" total 14 bits + isAnomaly: bool, True (represents Anomaly) or False (represents not Anomaly) + return: + None + ''' + if len(self.timeWindow) < self.windowsize - 1: + self.timeWindow.append(realtime) + self.anomalyWindow.append(isAnomaly) + return + elif len(self.timeWindow) == self.windowsize - 1: + self.timeWindow.append(realtime) + self.anomalyWindow.append(isAnomaly) + else: + self.timeWindow.pop(0) + self.timeWindow.append(realtime) + self.anomalyWindow.pop(0) + self.anomalyWindow.append(isAnomaly) + + anomalyCount = 0 + for i in range(self.windowsize): + if self.anomalyWindow[i]: + anomalyCount += 1 + if anomalyCount >= self.IntervalAnomalyThreshold: + if self.isLastIntervalAnomaly == False: + current_array = time.strptime(self.timeWindow[0], "%Y%m%d%H%M%S") + last_array = time.strptime(self.anomalyIntervalEnd, "%Y%m%d%H%M%S") + current_timestamp = int(time.mktime(current_array)) + last_timestamp = int(time.mktime(last_array)) + if (current_timestamp - last_timestamp) <= self.IntervalMergeThreshold: + first_anomaly_idx = self.anomalyWindow.index(True) + last_anomaly_idx = -self.anomalyWindow[::-1].index(True)-1 + self.anomalyIntervalEnd = self.timeWindow[self.windowsize + last_anomaly_idx] + anomaly_intervals.pop(-1) + anomaly_intervals.append([self.anomalyIntervalStart,self.anomalyIntervalEnd]) + else: + first_anomaly_idx = self.anomalyWindow.index(True) + last_anomaly_idx = 
-self.anomalyWindow[::-1].index(True)-1 + self.anomalyIntervalStart = self.timeWindow[first_anomaly_idx] + self.anomalyIntervalEnd = self.timeWindow[self.windowsize + last_anomaly_idx] + anomaly_intervals.append([self.anomalyIntervalStart, self.anomalyIntervalEnd]) + self.isLastIntervalAnomaly = True + else: + first_anomaly_idx = self.anomalyWindow.index(True) + last_anomaly_idx = -self.anomalyWindow[::-1].index(True)-1 + self.anomalyIntervalEnd = self.timeWindow[self.windowsize + last_anomaly_idx] + anomaly_intervals.pop(-1) + anomaly_intervals.append([self.anomalyIntervalStart,self.anomalyIntervalEnd]) + else: + if self.isLastIntervalAnomaly: + self.isLastIntervalAnomaly = False diff --git a/sdfvae/logger.py b/sdfvae/logger.py new file mode 100644 index 0000000..5b51782 --- /dev/null +++ b/sdfvae/logger.py @@ -0,0 +1,93 @@ +import os +import sys + +import matplotlib.pyplot as plt +import numpy as np + + +class Logger(): + def __init__(self, out, name='loss', xlabel='epoch'): + self.out = out + self.name = name + self.xlabel = xlabel + self.txt_file = os.path.join(out, name + '.txt') + self.plot_file = os.path.join(out, name + '.png') + + def log_trainer(self, epoch, states, t=None): + self._print_trainer(epoch, states, t) + self._plot(epoch, states) + def log_tester(self, epoch, states, t=None): + self._print_tester(epoch, states, t) + + def log_evaluator(self, states): + self._print_eval(states) + + def log_evaluator_re(self, message): + self._print_eval_result(message) + + def _print_trainer(self, epoch, states, t=None): + if t is not None: + if self.xlabel == 'epoch': + message = '(eps: %d, time: %.5f) ' % (epoch, t) + else: + message = '(%s: %d, time: %.5f) ' % (self.xlabel, epoch, t) + else: + if self.xlabel == 'epoch': + message = '(eps: %d) ' % (epoch) + else: + message = '(%s: %d) ' % (self.xlabel, epoch) + for k, v in states.items(): + message += '%s: %.5f ' % (k, v) + + with open(self.txt_file, "a") as f: + f.write('%s\n' % message) + + def _print_tester(self, epoch, states, t=None): + message = '{},{},{}'.format(states['Last_timestamp'], + states['Llh_Lt'], + states['IA']) + with open(self.txt_file, "a") as f: + f.write('%s\n' % message) + + def _print_eval(self, states): + + message = 'th:{}, p:{}, r:{}, f1score:{}, TP:{}, FN:{}, TN:{}, FP:{}, FPR:{}, TPR:{}'.format( + states['Th'], + states['P'], + states['R'], + states['F1score'], + states['TP'], + states['FN'], + states['TN'], + states['FP'], + states['Fpr'], + states['Tpr']) + + with open(self.txt_file, "a") as f: + f.write('%s\n' % message) + + def _print_eval_result(self, message): + with open(self.txt_file, "a") as f: + f.write('%s\n' % message) + + def _plot(self, epoch, states): + if not hasattr(self, 'plot_data'): + self.plot_data = {'X': [], 'Y': [], 'legend': list(states.keys())} + self.plot_data['X'].append(epoch) + self.plot_data['Y'].append( + [states[k] for k in self.plot_data['legend']]) + + fig = plt.figure() + ax = fig.add_subplot(111) + ax.grid() + for i, k in enumerate(self.plot_data['legend']): + ax.plot(np.array(self.plot_data['X']), + np.array(self.plot_data['Y'])[:, i], + label=k) + ax.set_xlabel(self.xlabel) + ax.set_ylabel(self.name) + l = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
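        # Editor's usage sketch (assumption, mirrors trainer.py below; the file
        # name is only an example): each log_trainer() call appends one line to
        # <out>/<name>.txt and refreshes the <out>/<name>.png curves of every
        # logged scalar, e.g.
        #   logger = Logger('log_trainer', 'sdim8_ddim10_cdim100_hdim40_loss')
        #   logger.log_trainer(epoch + 1, {'Avg_loss': meanloss, 'Llh': meanllh})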
+ fig.savefig(self.plot_file, + bbox_extra_artists=(l, ), + bbox_inches='tight') + plt.close() diff --git a/sdfvae/model.py b/sdfvae/model.py new file mode 100644 index 0000000..cd01cd7 --- /dev/null +++ b/sdfvae/model.py @@ -0,0 +1,312 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class ConvUnit(nn.Module): + def __init__(self, in_channels, out_channels, kernel, stride=1, padding=0, nonlinearity=nn.LeakyReLU(0.2)): + super(ConvUnit, self).__init__() + self.model = nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel, stride, padding), nonlinearity) + def forward(self, x): + return self.model(x) + +class ConvUnitTranspose(nn.Module): + def __init__(self, in_channels, out_channels, kernel, stride=1, padding=0, out_padding=0, nonlinearity=nn.LeakyReLU(0.2)): + super(ConvUnitTranspose, self).__init__() + self.model = nn.Sequential( + nn.ConvTranspose2d(in_channels, out_channels, kernel, stride, padding), nonlinearity) + def forward(self, x): + return self.model(x) + +class LinearUnit(nn.Module): + def __init__(self, in_features, out_features, nonlinearity=nn.LeakyReLU(0.2)): + super(LinearUnit, self).__init__() + self.model = nn.Sequential( + nn.Linear(in_features, out_features),nonlinearity) + def forward(self, x): + return self.model(x) + + +class SDFVAE(nn.Module): + def __init__(self, s_dim=8, d_dim=10, conv_dim=100, hidden_dim=40, + T=20, w=36, n=24, enc_dec='CNN', nonlinearity=None, device=torch.device('cuda:0')): + super(SDFVAE, self).__init__() + + self.s_dim = s_dim + self.d_dim = d_dim + self.conv_dim = conv_dim + self.hidden_dim = hidden_dim + self.T = T + self.w = w + self.n = n + self.enc_dec = enc_dec + self.device = device + self.nonlinearity = nn.LeakyReLU(0.2) if nonlinearity is None else nonlinearity + self.dec_init_dim = self.s_dim+self.d_dim+self.hidden_dim + + self.d_lstm_prior = nn.LSTMCell(self.hidden_dim, self.hidden_dim) + self.d_mean_prior = nn.Linear(self.hidden_dim, self.d_dim) + self.d_logvar_prior = nn.Linear(self.hidden_dim, self.d_dim) + self.phi_d_prior = nn.Sequential( + nn.Linear(self.d_dim, self.hidden_dim), + self.nonlinearity) + + self.enc_d_prior = nn.Sequential( + nn.Linear(self.hidden_dim, self.hidden_dim), + self.nonlinearity, + nn.Linear(self.hidden_dim, self.hidden_dim)) + + # Bidirectional LSTM with option bidirectional=True + self.s_lstm = nn.LSTM(self.conv_dim, self.hidden_dim, + 1, batch_first=True, bidirectional=True) + + self.s_mean = LinearUnit(self.hidden_dim*2, self.s_dim) + self.s_logvar = LinearUnit(self.hidden_dim*2, self.s_dim) + + self.phi_conv = nn.Sequential( + nn.Linear(self.conv_dim, self.hidden_dim), + self.nonlinearity, + nn.Linear(self.hidden_dim, self.hidden_dim), + self.nonlinearity) + + self.phi_d = nn.Sequential( + nn.Linear(self.d_dim, self.hidden_dim), + self.nonlinearity) + + self.enc_d = nn.Sequential( + nn.Linear(2*self.hidden_dim, self.hidden_dim), + self.nonlinearity, + nn.Linear(self.hidden_dim, self.hidden_dim), + self.nonlinearity) + + self.d_mean = nn.Linear(self.hidden_dim, self.d_dim) + self.d_logvar = nn.Linear(self.hidden_dim, self.d_dim) + + self.d_rnn = nn.LSTMCell(2*self.hidden_dim, self.hidden_dim, bias=True) + + # set up the kernel_size, stride, padding of CNN with respect to different n and w + if self.enc_dec == 'CNN': + if self.n == 16 or self.n == 24: + k0_0,s0_0,p0_0=2,2,0 + k0_1,s0_1,p0_1=2,2,0 + k0_2,s0_2,p0_2=2,2,0 + sd_0=int(self.n/(k0_0*k0_1*k0_2)) + elif self.n == 38: + k0_0,s0_0,p0_0=2,2,1 + k0_1,s0_1,p0_1=2,2,0 + k0_2,s0_2,p0_2=2,2,0 + 
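                # Editor's note (assumption): each ConvUnit maps an axis of length L to
                # floor((L + 2*padding - kernel)/stride) + 1, so the three layers configured
                # above shrink the KPI axis 38 -> 20 -> 10 -> 5; sd_0 below evaluates to the
                # same value, 5.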
sd_0=int(((s0_0*p0_0)+self.n)/(k0_0*k0_1*k0_2)) + elif self.n == 48: + k0_0,s0_0,p0_0=4,4,0 + k0_1,s0_1,p0_1=2,2,0 + k0_2,s0_2,p0_2=2,2,0 + sd_0=int(self.n/(k0_0*k0_1*k0_2)) + else: + raise ValueError('Invalid Kpi numbers: {}, choose from the candidate set [16,24,38,48].'.format(self.n)) + + if self.w == 36: + k1_0,s1_0,p1_0=3,3,0 + k1_1,s1_1,p1_1=2,2,0 + k1_2,s1_2,p1_2=2,2,0 + sd_1=int(self.w/(k1_0*k1_1*k1_2)) + elif self.w == 144: + k1_0,s1_0,p1_0=4,4,0 + k1_1,s1_1,p1_1=4,4,0 + k1_2,s1_2,p1_2=3,3,0 + sd_1=int(self.w/(k1_0*k1_1*k1_2)) + elif self.w == 288: + k1_0,s1_0,p1_0=8,8,0 + k1_1,s1_1,p1_1=4,4,0 + k1_2,s1_2,p1_2=3,3,0 + sd_1=int(self.w/(k1_0*k1_1*k1_2)) + else: + raise ValueError('Invalid window size: {}, choose from the set [36,144,288]'.format(self.w)) + + self.krl = [[k0_0,k1_0],[k0_1,k1_1],[k0_2,k1_2]] + self.srd =[[s0_0,s1_0],[s0_1,s1_1],[s0_2,s1_2]] + self.pd = [[p0_0,p1_0],[p0_1,p1_1],[p0_2,p1_2]] + self.cd = [64,sd_0,sd_1] + + self.conv = nn.Sequential( + ConvUnit(1, 8, kernel=(self.krl[0][0],self.krl[0][1]), + stride=(self.srd[0][0],self.srd[0][1]), + padding=(self.pd[0][0],self.pd[0][1])), + ConvUnit(8, 32, kernel=(self.krl[1][0],self.krl[1][1]), + stride=(self.srd[1][0],self.srd[1][1]), + padding=(self.pd[1][0],self.pd[1][1])), + ConvUnit(32, 64, kernel=(self.krl[2][0],self.krl[2][1]), + stride=(self.srd[2][0],self.srd[2][1]), + padding=(self.pd[2][0],self.pd[2][1])) + ) + + self.conv_fc = nn.Sequential( + LinearUnit(self.cd[0]*self.cd[1]*self.cd[2], self.conv_dim*2), + LinearUnit(self.conv_dim*2, self.conv_dim)) + + self.deconv_fc_mu = nn.Sequential( + LinearUnit(self.dec_init_dim, self.conv_dim*2), + LinearUnit(self.conv_dim*2, self.cd[0]*self.cd[1]*self.cd[2])) + self.deconv_mu = nn.Sequential( + ConvUnitTranspose(64, 32, kernel=(self.krl[2][0],self.krl[2][1]), + stride=(self.srd[2][0],self.srd[2][1]), + padding=(self.pd[2][0],self.pd[2][1])), + ConvUnitTranspose(32, 8, kernel=(self.krl[1][0],self.krl[1][1]), + stride=(self.srd[1][0],self.srd[1][1]), + padding=(self.pd[1][0],self.pd[1][1])), + ConvUnitTranspose(8, 1, kernel=(self.krl[0][0],self.krl[0][1]), + stride=(self.srd[0][0],self.srd[0][1]), + padding=(self.pd[0][0],self.pd[0][1]), + nonlinearity=nn.Tanh()) + ) + self.deconv_fc_logvar = nn.Sequential( + LinearUnit(self.dec_init_dim, self.conv_dim*2), + LinearUnit(self.conv_dim*2, self.cd[0]*self.cd[1]*self.cd[2])) + self.deconv_logvar = nn.Sequential( + ConvUnitTranspose(64, 32, kernel=(self.krl[2][0],self.krl[2][1]), + stride=(self.srd[2][0],self.srd[2][1]), + padding=(self.pd[2][0],self.pd[2][1])), + ConvUnitTranspose(32, 8, kernel=(self.krl[1][0],self.krl[1][1]), + stride=(self.srd[1][0],self.srd[1][1]), + padding=(self.pd[1][0],self.pd[1][1])), + ConvUnitTranspose(8, 1, kernel=(self.krl[0][0],self.krl[0][1]), + stride=(self.srd[0][0],self.srd[0][1]), + padding=(self.pd[0][0],self.pd[0][1]), + nonlinearity=nn.Tanh()) + ) + + else: + raise ValueError('Unknown encoder and decoder: {}'.format(self.enc_dec)) + + for m in self.modules(): + if isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d) or isinstance(m, nn.Linear): + nn.init.kaiming_normal_(m.weight) + + + def sample_d_lstmcell(self, batch_size, random_sampling=True): + d_out = None + d_means = None + d_logvars = None + + d_t = torch.zeros(batch_size, self.d_dim, device=self.device) + d_mean_t = torch.zeros(batch_size, self.d_dim, device=self.device) + d_logvar_t = torch.zeros(batch_size, self.d_dim, device=self.device) + h_t = torch.zeros(batch_size, self.hidden_dim, 
device=self.device) + c_t = torch.zeros(batch_size, self.hidden_dim, device=self.device) + + for _ in range(self.T): + enc_d_t = self.enc_d_prior(h_t) + d_mean_t = self.d_mean_prior(enc_d_t) + d_logvar_t = self.d_logvar_prior(enc_d_t) + d_t = self.reparameterize(d_mean_t, d_logvar_t, random_sampling) + phi_d_t = self.phi_d_prior(d_t) + h_t, c_t = self.d_lstm_prior(phi_d_t, (h_t, c_t)) + if d_out is None: + d_out = d_t.unsqueeze(1) + d_means = d_mean_t.unsqueeze(1) + d_logvars = d_logvar_t.unsqueeze(1) + else: + d_out = torch.cat((d_out, d_t.unsqueeze(1)), dim=1) + d_means = torch.cat((d_means, d_mean_t.unsqueeze(1)), dim=1) + d_logvars = torch.cat((d_logvars, d_logvar_t.unsqueeze(1)), dim=1) + return d_means, d_logvars, d_out + + + + def encode_frames(self, x): + if self.enc_dec == 'CNN': + x = x.view(-1, 1, self.n, self.w) + x = self.conv(x) + x = x.view(-1, self.cd[0]*self.cd[1]*self.cd[2]) + x = self.conv_fc(x) + x = x.view(-1, self.T, self.conv_dim) + else: + raise ValueError('Unknown encoder and decoder: {}'.format(self.enc_dec)) + return x + + def decode_frames_mu(self, sdh): + if self.enc_dec == 'CNN': + x = self.deconv_fc_mu(sdh) + x = x.view(-1, self.cd[0], self.cd[1], self.cd[2]) + x = self.deconv_mu(x) + x = x.view(-1, self.T, 1, self.n, self.w) + else: + raise ValueError('Unknown encoder and decoder: {}'.format(self.enc_dec)) + return x + + + def decode_frames_logvar(self, sdh): + if self.enc_dec == 'CNN': + x = self.deconv_fc_logvar(sdh) + x = x.view(-1, self.cd[0], self.cd[1], self.cd[2]) + x = self.deconv_logvar(x) + x = x.view(-1, self.T, 1, self.n, self.w) + else: + raise ValueError('Unknown encoder and decoder: {}'.format(self.enc_dec)) + return x + + def reparameterize(self, mean, logvar, random_sampling=True): + if random_sampling is True: + eps = torch.randn_like(logvar) + std = torch.exp(0.5*logvar) + z = mean + eps*std + return z + else: + return mean + + def encode_s(self, x): + lstm_out, _ = self.s_lstm(x) + backward = lstm_out[:, 0, self.hidden_dim:self.hidden_dim*2] + frontal = lstm_out[:, 1, 0:self.hidden_dim] + lstm_out = torch.cat((frontal, backward),dim=1) + mean = self.s_mean(lstm_out) + logvar = self.s_logvar(lstm_out) + s = self.reparameterize(mean, logvar, self.training) + return mean, logvar, s + + + def encode_d(self, batch_size, x): + d_out = None + d_means = None + d_logvars = None + h_out = None + + d_t = torch.zeros(batch_size, self.d_dim, device=self.device) + d_mean_t = torch.zeros(batch_size, self.d_dim, device=self.device) + d_logvar_t = torch.zeros(batch_size, self.d_dim, device=self.device) + + h_t = torch.zeros(batch_size, self.hidden_dim, device=self.device) + c_t = torch.zeros(batch_size, self.hidden_dim, device=self.device) + + for t in range(self.T): + phi_conv_t = self.phi_conv(x[:,t,:]) + enc_d_t = self.enc_d(torch.cat([phi_conv_t, h_t], 1)) + d_mean_t = self.d_mean(enc_d_t) + d_logvar_t = self.d_logvar(enc_d_t) + d_t = self.reparameterize(d_mean_t, d_logvar_t, self.training) + phi_d_t = self.phi_d(d_t) + if d_out is None: + d_out = d_t.unsqueeze(1) + d_means = d_mean_t.unsqueeze(1) + d_logvars = d_logvar_t.unsqueeze(1) + h_out = h_t.unsqueeze(1) + else: + d_out = torch.cat((d_out, d_t.unsqueeze(1)), dim=1) + d_means = torch.cat((d_means, d_mean_t.unsqueeze(1)), dim=1) + d_logvars = torch.cat((d_logvars, d_logvar_t.unsqueeze(1)), dim=1) + h_out = torch.cat((h_out, h_t.unsqueeze(1)), dim=1) + h_t, c_t = self.d_rnn(torch.cat([phi_conv_t, phi_d_t], 1), (h_t, c_t)) + return d_means, d_logvars, d_out, h_out + + def forward(self, x): + x = 
x.float() + d_mean_prior, d_logvar_prior, _ = self.sample_d_lstmcell(x.size(0), random_sampling = self.training) + x_hat = self.encode_frames(x) + d_mean, d_logvar, d, h = self.encode_d(x.size(0), x_hat) + s_mean, s_logvar, s = self.encode_s(x_hat) + s_expand = s.unsqueeze(1).expand(-1, self.T, self.s_dim) + ds = torch.cat((d, s_expand), dim=2) + dsh = torch.cat((ds, h), dim=2) + recon_x_mu = self.decode_frames_mu(dsh) + recon_x_logvar = self.decode_frames_logvar(dsh) + return s_mean, s_logvar, s, d_mean, d_logvar, d, d_mean_prior, d_logvar_prior, recon_x_mu, recon_x_logvar + diff --git a/sdfvae/scripts.txt b/sdfvae/scripts.txt new file mode 100644 index 0000000..7dead05 --- /dev/null +++ b/sdfvae/scripts.txt @@ -0,0 +1,53 @@ +# Scripts + +# 1. Data Preprocessing + +# OPTIONS +# raw_data_file: KPI data file +# label_file: The corresponding ground-truth file +# train_data_path: The path of the preprocessed training set +# test_data_path: The path of the preprocessed testing set +# test_start_time: The data later than this timestamp is considered as the testing data + +# VoD1 +python data_preprocess.py --raw_data_file data/vod1-data.csv --label_file data/vod1-label.csv --train_data_path data_processed/vod1-train --test_data_path data_processed/vod1-test --test_start_time 20181107000000 + +# Live +python data_preprocess.py --raw_data_file data/live-data.csv --label_file data/live-label.csv --train_data_path data_processed/live-train --test_data_path data_processed/live-test --test_start_time 20181120121500 + +# Machine-1-1 +python data_preprocess.py --raw_data_file data/machine-1-1-data.csv --label_file data/machine-1-1-label.csv --train_data_path data_processed/machine-1-1-train --test_data_path data_processed/machine-1-1-test --test_start_time 20190923005800 + +# Machine-1-5 +python data_preprocess.py --raw_data_file data/machine-1-5-data.csv --label_file data/machine-1-5-label.csv --train_data_path data_processed/machine-1-5-train --test_data_path data_processed/machine-1-5-test --test_start_time 20190919162400 + + +# 2. 
Training, Testing and Evaluation +# OPTIONS +# dataset_path: The path of the processed training or testing dataset +# data_nums: The size of training or testing dataset +# checkpoints_path: The path to store the trained models +# n: The number of KPIs +# start_epoch: The model with corresponding training epochs will be resumed for testing, default is 30 +# Set it to 40 please, if you want to test the model which trained 40 epochs +# llh_path: The path of log-likelihood (anomaly score) file output by testing + +# VoD1 +python trainer.py --dataset_path ../data_preprocess/data_processed/vod1-train --data_nums 10430 --gpu_id 0 --log_path log_trainer/vod1 --checkpoints_path model/vod1 --n 24 +nohup python tester.py --dataset_path ../data_preprocess/data_processed/vod1-test --data_nums 11690 --gpu_id 0 --log_path log_tester/vod1 --checkpoints_path model/vod1 --n 24 --start_epoch 30 2>&1 & +nohup python evaluation.py --llh_path log_tester/vod1 --log_path log_evaluator/vod1 --n 24 --start_epoch 30 2>&1 & + +# Live +python trainer.py --dataset_path ../data_preprocess/data_processed/live-train --data_nums 7582 --gpu_id 0 --log_path log_trainer/live --checkpoints_path model/live --n 48 +nohup python tester.py --dataset_path ../data_preprocess/data_processed/live-test --data_nums 7800 --gpu_id 0 --log_path log_tester/live --checkpoints_path model/live --n 48 --start_epoch 30 2>&1 & +nohup python evaluation.py --llh_path log_tester/live --log_path log_evaluator/live --n 48 --start_epoch 30 2>&1 & + +# Machine-1-1 +python trainer.py --dataset_path ../data_preprocess/data_processed/machine-1-1-train --data_nums 28253 --gpu_id 0 --log_path log_trainer/machine-1-1 --checkpoints_path model/machine-1-1 --n 38 +nohup python tester.py --dataset_path ../data_preprocess/data_processed/machine-1-1-test --data_nums 28469 --gpu_id 0 --log_path log_tester/machine-1-1 --checkpoints_path model/machine-1-1 --n 38 --start_epoch 30 2>&1 & +nohup python evaluation.py --llh_path log_tester/machine-1-1 --log_path log_evaluator/machine-1-1 --n 38 --start_epoch 30 2>&1 & + +# Machine-1-5 +python trainer.py --dataset_path ../data_preprocess/data_processed/machine-1-5-train --data_nums 23480 --gpu_id 0 --log_path log_trainer/machine-1-5 --checkpoints_path model/machine-1-5 --n 38 +nohup python tester.py --dataset_path ../data_preprocess/data_processed/machine-1-5-test --data_nums 23695 --gpu_id 0 --log_path log_tester/machine-1-5 --checkpoints_path model/machine-1-5 --n 38 --start_epoch 30 2>&1 & +nohup python evaluation.py --llh_path log_tester/machine-1-5 --log_path log_evaluator/machine-1-5 --n 38 --start_epoch 30 2>&1 & diff --git a/sdfvae/tester.py b/sdfvae/tester.py new file mode 100644 index 0000000..bad34d2 --- /dev/null +++ b/sdfvae/tester.py @@ -0,0 +1,201 @@ +import torch +import os +import argparse +import torch.nn.functional as F +import torch.optim as optim +import numpy as np +import math +from model import * +from tqdm import * +from util import KpiReader +from logger import Logger + +class Tester(object): + def __init__(self, model, device, test, testloader, log_path='log_tester', log_file='loss', learning_rate=0.0002, checkpoints=None): + self.model = model + self.model.to(device) + self.device = device + self.test = test + self.testloader = testloader + self.log_path = log_path + self.log_file = log_file + self.learning_rate = learning_rate + self.checkpoints = checkpoints + self.start_epoch = 0 + self.optimizer = optim.Adam(self.model.parameters(), self.learning_rate) + self.epoch_losses = [] + self.logger = 
Logger(self.log_path, self.log_file) + self.loss = {} + + + def load_checkpoint(self, start_ep): + try: + print ("Loading Chechpoint from ' {} '".format(self.checkpoints+'_epochs{}.pth'.format(start_ep))) + checkpoint = torch.load(self.checkpoints+'_epochs{}.pth'.format(start_ep)) + self.start_epoch = checkpoint['epoch'] + self.model.load_state_dict(checkpoint['state_dict']) + self.optimizer.load_state_dict(checkpoint['optimizer']) + self.epoch_losses = checkpoint['losses'] + print ("Resuming Training From Epoch {}".format(self.start_epoch)) + except: + print ("No Checkpoint Exists At '{}', Starting Fresh Training".format(self.checkpoints+'_epochs{}.pth'.format(start_ep))) + self.start_epoch = 0 + + def model_test(self): + self.model.eval() + for i, dataitem in enumerate(self.testloader,1): + timestamps,labels,data = dataitem + data = data.to(self.device) + s_mean, s_logvar, s, d_post_mean, d_post_logvar, d, d_prior_mean, d_prior_logvar, recon_x_mu, recon_x_logvar = self.forward_test(data) + avg_loss, llh, kld_s, kld_d = self.loss_fn(data, recon_x_mu, recon_x_logvar, s_mean, s_logvar, + d_post_mean, d_post_logvar, d_prior_mean, d_prior_logvar) + last_timestamp = timestamps[-1,-1,-1,-1] + label_last_timestamp_tensor = labels[-1,-1,-1,-1] + + anomaly_index = (label_last_timestamp_tensor.numpy() == 1) + anomaly_nums = len(label_last_timestamp_tensor.numpy()[anomaly_index]) + if anomaly_nums >= 1: + isanomaly = "Anomaly" + else: + isanomaly = "Normaly" + llh_last_timestamp = self.loglikelihood_last_timestamp(data[-1,-1,-1,:,-1], + recon_x_mu[-1,-1,-1,:,-1], + recon_x_logvar[-1,-1,-1,:,-1]) + + self.loss['Last_timestamp'] = last_timestamp.item() + self.loss['Avg_loss'] = avg_loss.item() + self.loss['Llh'] = llh.item() + self.loss['KL_s'] = kld_s.item() + self.loss['KL_d'] = kld_d.item() + self.loss['Llh_Lt'] = llh_last_timestamp.item() + self.loss['IA'] = isanomaly + self.logger.log_tester(self.start_epoch, self.loss) + print ("Testing is complete!") + + def forward_test(self, data): + with torch.no_grad(): + s_mean, s_logvar, s, d_post_mean, d_post_logvar, d, d_prior_mean, d_prior_logvar, recon_x_mu, recon_x_logvar= self.model(data) + return s_mean, s_logvar, s, d_post_mean, d_post_logvar, d, d_prior_mean, d_prior_logvar, recon_x_mu, recon_x_logvar + + + + def loss_fn(self, original_seq, recon_seq_mu, recon_seq_logvar, s_mean, s_logvar, d_post_mean, d_post_logvar, d_prior_mean, d_prior_logvar): + batch_size = original_seq.size(0) + loglikelihood = -0.5 * torch.sum(torch.pow(((original_seq.float()-recon_seq_mu.float())/torch.exp(recon_seq_logvar.float())), 2) + + 2 * recon_seq_logvar.float() + + np.log(np.pi*2)) + kld_s = -0.5 * torch.sum(1 + s_logvar - torch.pow(s_mean, 2) - torch.exp(s_logvar)) + d_post_var = torch.exp(d_post_logvar) + d_prior_var = torch.exp(d_prior_logvar) + kld_d = 0.5 * torch.sum(d_prior_logvar - d_post_logvar + + ((d_post_var + torch.pow(d_post_mean - d_prior_mean, 2)) / d_prior_var) + - 1) + return (-loglikelihood + kld_s + kld_d)/batch_size, loglikelihood/batch_size, kld_s/batch_size, kld_d/batch_size + + + def loglikelihood_last_timestamp(self, x, recon_x_mu, recon_x_logvar): + llh = -0.5 * torch.sum(torch.pow(((x.float()-recon_x_mu.float())/torch.exp(recon_x_logvar.float())), 2) + + 2 * recon_x_logvar.float() + + np.log(np.pi*2)) + return llh + + +def main(): + parser = argparse.ArgumentParser() + # GPU option + parser.add_argument('--gpu_id', type=int, default=1) + # Dataset options + parser.add_argument('--dataset_path', type=str, default='') + 
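    # Editor's note (assumption, inferred from model_test() and Logger.log_tester()
    # above): the tester appends one CSV line per sliding window,
    #   <last_timestamp>,<log-likelihood of the last timestamp>,<"Anomaly" | "Normaly">
    # and this file is what evaluation.py later reads as anomaly_score_label_file.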
parser.add_argument('--data_nums', type=int, default=0) + parser.add_argument('--batch_size', type=int, default=1) + parser.add_argument('--num_workers', type=int, default=4) + parser.add_argument('--T', type=int, default=20) + parser.add_argument('--win_size', type=int, default=36) + parser.add_argument('--l', type=int, default=10) + parser.add_argument('--n', type=int, default=24) + + # Model options + parser.add_argument('--s_dims', type=int, default=8) + parser.add_argument('--d_dims', type=int, default=10) + parser.add_argument('--conv_dims', type=int, default=100) + parser.add_argument('--hidden_dims', type=int, default=40) + parser.add_argument('--enc_dec', type=str, default='CNN') + + # Training and testing options + parser.add_argument('--learning_rate', type=float, default=0.0002) + parser.add_argument('--epochs', type=int, default=50) + parser.add_argument('--start_epoch', type=int, default=30) + parser.add_argument('--checkpoints_path', type=str, default='model') + parser.add_argument('--checkpoints_file', type=str, default='') + parser.add_argument('--checkpoints_interval', type=int, default=10) + parser.add_argument('--log_path', type=str, default='log_tester') + parser.add_argument('--log_file', type=str, default='') + + args = parser.parse_args() + + # Set up GPU + if torch.cuda.is_available() and args.gpu_id >= 0: + device = torch.device('cuda:%d' % args.gpu_id) + else: + device = torch.device('cpu') + + if not os.path.exists(args.log_path): + os.makedirs(args.log_path) + + if not os.path.exists(args.dataset_path): + raise ValueError('Unknown dataset path: {}'.format(args.dataset_path)) + + if not os.path.exists(args.checkpoints_path): + raise ValueError('Unknown checkpoints path: {}'.format(checkpoints_path)) + + if args.data_nums == 0: + raise ValueError('Wrong data numbers: {}'.format(args.data_nums)) + + if args.checkpoints_file == '': + args.checkpoints_file = 'sdim{}_ddim{}_cdim{}_hdim{}_winsize{}_T{}_l{}'.format( + args.s_dims, + args.d_dims, + args.conv_dims, + args.hidden_dims, + args.win_size, + args.T, + args.l) + if args.log_file == '': + args.log_file = 'sdim{}_ddim{}_cdim{}_hdim{}_winsize{}_T{}_l{}_epochs{}_loss'.format( + args.s_dims, + args.d_dims, + args.conv_dims, + args.hidden_dims, + args.win_size, + args.T, + args.l, + args.start_epoch) + + kpi_value_test = KpiReader(args.dataset_path, args.data_nums) + + test_loader = torch.utils.data.DataLoader(kpi_value_test, + batch_size = args.batch_size, + shuffle = False, + num_workers = args.num_workers) + sdfvae = SDFVAE(s_dim = args.s_dims, + d_dim = args.d_dims, + conv_dim = args.conv_dims, + hidden_dim = args.hidden_dims, + T = args.T, + w = args.win_size, + n = args.n, + enc_dec = args.enc_dec, + device = device) + + tester = Tester(sdfvae, device, kpi_value_test, test_loader, + log_path = args.log_path, + log_file = args.log_file, + learning_rate = args.learning_rate, + checkpoints = os.path.join(args.checkpoints_path,args.checkpoints_file)) + + tester.load_checkpoint(args.start_epoch) + + tester.model_test() + +if __name__ == '__main__': + main() diff --git a/sdfvae/trainer.py b/sdfvae/trainer.py new file mode 100644 index 0000000..27edade --- /dev/null +++ b/sdfvae/trainer.py @@ -0,0 +1,208 @@ +import torch +import os +import time +import argparse +import torch.nn.functional as F +import torch.optim as optim +import numpy as np +import math +from model import * +from tqdm import * +from util import KpiReader +from logger import Logger + +class Trainer(object): + def __init__(self, model, train, 
trainloader, log_path='log_trainer', + log_file='loss', epochs=50, batch_size=64, learning_rate=0.001, + checkpoints='kpi_model.path', checkpoints_interval = 10, device=torch.device('cuda:0')): + self.trainloader = trainloader + self.train = train + self.log_path = log_path + self.log_file = log_file + self.start_epoch = 0 + self.epochs = epochs + self.device = device + self.batch_size = batch_size + self.model = model + self.model.to(device) + self.learning_rate = learning_rate + self.checkpoints = checkpoints + self.checkpoints_interval = checkpoints_interval + self.optimizer = optim.Adam(self.model.parameters(), self.learning_rate) + self.epoch_losses = [] + self.loss = {} + self.logger = Logger(self.log_path, self.log_file) + + def save_checkpoint(self, epoch): + torch.save({'epoch': epoch + 1, + 'state_dict': self.model.state_dict(), + 'optimizer': self.optimizer.state_dict(), + 'losses': self.epoch_losses}, + self.checkpoints + '_epochs{}.pth'.format(epoch+1)) + + def load_checkpoint(self, start_ep): + try: + print ("Loading Chechpoint from ' {} '".format(self.checkpoints+'_epochs{}.pth'.format(start_ep))) + checkpoint = torch.load(self.checkpoints+'_epochs{}.pth'.format(start_ep)) + self.start_epoch = checkpoint['epoch'] + self.model.load_state_dict(checkpoint['state_dict']) + self.optimizer.load_state_dict(checkpoint['optimizer']) + self.epoch_losses = checkpoint['losses'] + print ("Resuming Training From Epoch {}".format(self.start_epoch)) + except: + print ("No Checkpoint Exists At '{}', Starting Fresh Training".format(self.checkpoints)) + self.start_epoch = 0 + + def loss_fn(self, original_seq, recon_seq_mu, recon_seq_logvar, s_mean, + s_logvar, d_post_mean, d_post_logvar, d_prior_mean, d_prior_logvar): + batch_size = original_seq.size(0) + loglikelihood = -0.5 * torch.sum(torch.pow(((original_seq.float()-recon_seq_mu.float())/torch.exp(recon_seq_logvar.float())), 2) + + 2 * recon_seq_logvar.float() + + np.log(np.pi*2)) + kld_s = -0.5 * torch.sum(1 + s_logvar - torch.pow(s_mean, 2) - torch.exp(s_logvar)) + + d_post_var = torch.exp(d_post_logvar) + d_prior_var = torch.exp(d_prior_logvar) + kld_d = 0.5 * torch.sum(d_prior_logvar - d_post_logvar + + ((d_post_var + torch.pow(d_post_mean - d_prior_mean, 2)) / d_prior_var) + - 1) + return (-loglikelihood + kld_s + kld_d)/batch_size, loglikelihood/batch_size, kld_s/batch_size, kld_d/batch_size + + def train_model(self): + self.model.train() + for epoch in range(self.start_epoch, self.epochs): + losses = [] + llhs = [] + kld_ss = [] + kld_ds = [] + print ("Running Epoch : {}".format(epoch+1)) + for i, dataitem in tqdm(enumerate(self.trainloader,1)): + _,_,data = dataitem + data = data.to(self.device) + self.optimizer.zero_grad() + s_mean, s_logvar, s, d_post_mean, d_post_logvar, d, d_prior_mean, d_prior_logvar, recon_x_mu, recon_x_logvar = self.model(data) + loss, llh, kld_s, kld_d = self.loss_fn(data, recon_x_mu, recon_x_logvar, s_mean, s_logvar, + d_post_mean, d_post_logvar, d_prior_mean, d_prior_logvar) + loss.backward() + self.optimizer.step() + losses.append(loss.item()) + llhs.append(llh.item()) + kld_ss.append(kld_s.item()) + kld_ds.append(kld_d.item()) + meanloss = np.mean(losses) + meanllh = np.mean(llhs) + means = np.mean(kld_ss) + meand = np.mean(kld_ds) + self.epoch_losses.append(meanloss) + print ("Epoch {} : Average Loss: {} Loglikelihood: {} KL of s: {} KL of d: {}".format(epoch+1, meanloss, meanllh, means, meand)) + self.loss['Epoch'] = epoch+1 + self.loss['Avg_loss'] = meanloss + self.loss['Llh'] = meanllh + 
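            # Editor's note (assumption): the scalars logged per epoch are the
            # batch-averaged ELBO terms returned by loss_fn(), with
            #   Avg_loss = -Llh + KL_s + KL_d
            #   Llh  : Gaussian log-likelihood of the reconstruction
            #   KL_s : KL( q(s|x) || N(0, I) )          (static representation)
            #   KL_d : KL( q(d_t|x) || p(d_t|d_<t) )    (dynamic representation, LSTM prior)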
self.loss['KL_s'] = means + self.loss['KL_d'] = meand + self.logger.log_trainer(epoch+1, self.loss) + if (self.checkpoints_interval > 0 + and (epoch+1) % self.checkpoints_interval == 0): + self.save_checkpoint(epoch) + print ("Training is complete!") + + +def main(): + parser = argparse.ArgumentParser() + # GPU option + parser.add_argument('--gpu_id', type=int, default=0) + # Dataset options + parser.add_argument('--dataset_path', type=str, default='') + parser.add_argument('--data_nums', type=int, default=0) + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--num_workers', type=int, default=4) + parser.add_argument('--T', type=int, default=20) + parser.add_argument('--win_size', type=int, default=36) + parser.add_argument('--l', type=int, default=10) + parser.add_argument('--n', type=int, default=24) + + # Model options + parser.add_argument('--s_dims', type=int, default=8) + parser.add_argument('--d_dims', type=int, default=10) + parser.add_argument('--conv_dims', type=int, default=100) + parser.add_argument('--hidden_dims', type=int, default=40) + parser.add_argument('--enc_dec', type=str, default='CNN') + + # Training options + parser.add_argument('--learning_rate', type=float, default=0.0002) + parser.add_argument('--epochs', type=int, default=50) + parser.add_argument('--start_epoch', type=int, default=0) + parser.add_argument('--checkpoints_path', type=str, default='model') + parser.add_argument('--checkpoints_file', type=str, default='') + parser.add_argument('--checkpoints_interval', type=int, default=10) + parser.add_argument('--log_path', type=str, default='log_trainer') + parser.add_argument('--log_file', type=str, default='') + + args = parser.parse_args() + + # Set up GPU + if torch.cuda.is_available() and args.gpu_id >= 0: + device = torch.device('cuda:%d' % args.gpu_id) + else: + device = torch.device('cpu') + + if not os.path.exists(args.dataset_path): + raise ValueError('Unknown dataset path: {}'.format(args.dataset_path)) + + if args.data_nums == 0: + raise ValueError('Wrong data numbers: {}'.format(args.data_nums)) + + if not os.path.exists(args.log_path): + os.makedirs(args.log_path) + + if not os.path.exists(args.checkpoints_path): + os.makedirs(args.checkpoints_path) + if args.checkpoints_file == '': + args.checkpoints_file = 'sdim{}_ddim{}_cdim{}_hdim{}_winsize{}_T{}_l{}'.format( + args.s_dims, + args.d_dims, + args.conv_dims, + args.hidden_dims, + args.win_size, + args.T,args.l) + if args.log_file == '': + args.log_file = 'sdim{}_ddim{}_cdim{}_hdim{}_winsize{}_T{}_l{}_loss'.format( + args.s_dims, + args.d_dims, + args.conv_dims, + args.hidden_dims, + args.win_size, + args.T,args.l) + + kpi_value_train = KpiReader(args.dataset_path, args.data_nums) + + train_loader = torch.utils.data.DataLoader(kpi_value_train, + batch_size = args.batch_size, + shuffle = True, + num_workers = args.num_workers) + + sdfvae = SDFVAE(s_dim = args.s_dims, + d_dim = args.d_dims, + conv_dim = args.conv_dims, + hidden_dim = args.hidden_dims, + T = args.T, + w = args.win_size, + n = args.n, + enc_dec = args.enc_dec, + device = device) + + trainer = Trainer(sdfvae, kpi_value_train, train_loader, + log_path = args.log_path, + log_file = args.log_file, + batch_size = args.batch_size, + epochs = args.epochs, + learning_rate = args.learning_rate, + checkpoints = os.path.join(args.checkpoints_path,args.checkpoints_file), + checkpoints_interval = args.checkpoints_interval, + device = device) + + trainer.load_checkpoint(args.start_epoch) + trainer.train_model() + +if 
__name__ == '__main__':
+    main()
diff --git a/sdfvae/util.py b/sdfvae/util.py
new file mode 100644
index 0000000..22e0dea
--- /dev/null
+++ b/sdfvae/util.py
@@ -0,0 +1,16 @@
+import torch
+import os
+import torch.utils.data as data
+
+class KpiReader(data.Dataset):
+    def __init__(self, path, size):
+        self.path = path
+        self.length = size
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, idx):
+        item = torch.load(self.path+'/%d.seq' % (idx+1))
+        return item['ts'], item['label'], item['value']
+
-- 
GitLab
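A minimal usage sketch for KpiReader follows (editor's assumption, not part of the patch: the tensor shapes and the '%d.seq' naming are inferred from util.py and from how trainer.py and tester.py index the batches; the demo directory name is arbitrary):

import os
import torch
from torch.utils.data import DataLoader
from util import KpiReader

# Write a few synthetic preprocessed windows the way KpiReader expects them:
# files named 1.seq, 2.seq, ... each holding a dict with 'ts', 'label', 'value'.
T, n, w = 20, 24, 36                      # sequence length, number of KPIs, window size
demo_dir = 'data_processed/demo'
os.makedirs(demo_dir, exist_ok=True)
for i in range(1, 5):
    torch.save({'ts':    torch.zeros(T, 1, w, dtype=torch.long),
                'label': torch.zeros(T, 1, w, dtype=torch.long),
                'value': torch.randn(T, 1, n, w)},
               '%s/%d.seq' % (demo_dir, i))

# Load them back through the Dataset/DataLoader pair used by trainer.py and tester.py.
loader = DataLoader(KpiReader(demo_dir, 4), batch_size=2, shuffle=False)
for ts, label, value in loader:
    print(ts.shape, label.shape, value.shape)   # e.g. (2, 20, 1, 36) ... (2, 20, 1, 24, 36)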