Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import numpy as np
import argparse
import pandas as pd
import os, sys
import math
import string
import torch
import time
from sklearn import preprocessing
def _to_epoch_seconds(stamp):
    """Convert a ``%Y%m%d%H%M%S`` timestamp (int or str) to local epoch seconds."""
    return int(time.mktime(time.strptime(str(stamp), '%Y%m%d%H%M%S')))


def _cut_windows(series, starts, win_size):
    """Stack the ``win_size``-wide column slices of *series* that begin at *starts*."""
    # np.array on an empty list yields a length-0 array, so callers can rely
    # on ``len(result) == 0`` when no window fits.
    return np.array([series[:, start:start + win_size] for start in starts])


def data_preprocess(raw_data_file, label_file, train_data_path, test_data_path,
                    test_time, win_size=36, l=10, T=20, fr=(-1, 1)):
    """Cut a multivariate series into windowed sequences and save them to disk.

    The raw CSV (first row is a header) is loaded, transposed, min-max scaled
    to ``fr`` and transposed back; windows are then cut along axis 1 of the
    scaled array.  The label CSV (no header) supplies timestamps in row 0 and
    labels in row 1, both sliced with the same column indices as the data.

    For each offset ``j`` in ``range(l)``, windows of width ``win_size`` start
    at ``j, j + l, j + 2*l, ...``.  Every run of ``T`` consecutive windows of
    one offset becomes a sample: a dict with keys ``'ts'``, ``'label'``
    (NumPy slices) and ``'value'`` (a tensor with an extra axis inserted at
    position 1).  A sample whose last timestamp is earlier than ``test_time``
    is written to ``train_data_path``, otherwise to ``test_data_path``, as
    ``<running id>.seq`` via ``torch.save``.

    Args:
        raw_data_file: Path of the CSV holding the raw values.
        label_file: Path of the CSV holding timestamps (row 0) and labels (row 1).
        train_data_path: Directory for training samples (created if missing).
        test_data_path: Directory for test samples (created if missing).
        test_time: Train/test split point, formatted as ``%Y%m%d%H%M%S``.
        win_size: Number of time steps per window.
        l: Stride between consecutive windows of one offset; also the number
            of offsets processed.
        T: Number of consecutive windows per saved sample.
        fr: ``feature_range`` passed to ``MinMaxScaler``.

    Raises:
        ValueError: If an input file does not exist, if ``win_size``, ``l`` or
            ``T`` is not positive, or if ``test_time`` does not match
            ``%Y%m%d%H%M%S``.
    """
    if not os.path.exists(raw_data_file):
        raise ValueError('Unknown input data file: {}'.format(raw_data_file))
    if not os.path.exists(label_file):
        raise ValueError('Unknown input label file: {}'.format(label_file))

    # Reject sizes that would otherwise yield no output at all (l <= 0) or an
    # opaque IndexError deep inside the save loop (win_size/T <= 0).
    for name, value in (('win_size', win_size), ('l', l), ('T', T)):
        if value <= 0:
            raise ValueError(
                '{} must be a positive integer, got {}'.format(name, value))

    # Parse the split point before any heavy work so that a malformed value
    # fails immediately instead of after loading and windowing everything.
    test_time_stamp = _to_epoch_seconds(test_time)

    # get raw data and the corresponding labels
    raw_data = np.array(pd.read_csv(raw_data_file, header=0), dtype=np.float64).T
    # data normalization, default is [-1,1]
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=fr)
    scaled_data = min_max_scaler.fit_transform(raw_data).T

    raw_ts_label = np.array(pd.read_csv(label_file, header=None), dtype=np.int64)
    # Keep a leading axis of length 1 so both rows slice like scaled_data.
    # Integer indexing (not 0:1 / 1:2) still raises if a row is missing.
    raw_ts = raw_ts_label[0][np.newaxis, :]
    raw_label = raw_ts_label[1][np.newaxis, :]

    last_start = scaled_data.shape[1] - win_size
    rectangle_samples = []
    rectangle_labels = []
    rectangle_tss = []
    for offset in range(l):
        starts = [base + offset
                  for base in range(0, last_start, l)
                  if base + offset <= last_start]
        rectangle_samples.append(_cut_windows(scaled_data, starts, win_size))
        rectangle_labels.append(_cut_windows(raw_label, starts, win_size))
        rectangle_tss.append(_cut_windows(raw_ts, starts, win_size))

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(train_data_path, exist_ok=True)
    os.makedirs(test_data_path, exist_ok=True)

    train_sample_id = 1
    test_sample_id = 1
    for samples, labels, stamps in zip(rectangle_samples, rectangle_labels,
                                       rectangle_tss):
        for data_id in range(T, len(samples)):
            kpi_ts = stamps[data_id - T:data_id]
            data = {'ts': kpi_ts,
                    'label': labels[data_id - T:data_id],
                    'value': torch.tensor(samples[data_id - T:data_id]).unsqueeze(1)}
            # The sample is dated by the final timestamp of its final window.
            cur_time_stamp = _to_epoch_seconds(kpi_ts[-1][-1][-1])
            if cur_time_stamp < test_time_stamp:
                path_temp = os.path.join(train_data_path, str(train_sample_id))
                torch.save(data, path_temp + '.seq')
                train_sample_id += 1
            else:
                path_temp = os.path.join(test_data_path, str(test_sample_id))
                torch.save(data, path_temp + '.seq')
                test_sample_id += 1
def main():
    """Read the command-line options and hand them to ``data_preprocess``."""
    cli = argparse.ArgumentParser()

    # File, directory and split-time options: strings defaulting to ''.
    for flag in ('--raw_data_file',
                 '--label_file',
                 '--train_data_path',
                 '--test_data_path',
                 '--test_start_time'):
        cli.add_argument(flag, type=str, default='')

    # Windowing options: integers with their respective defaults.
    for flag, fallback in (('--T', 20), ('--win_size', 36), ('--l', 10)):
        cli.add_argument(flag, type=int, default=fallback)

    opts = cli.parse_args()
    window_options = {'win_size': opts.win_size, 'l': opts.l, 'T': opts.T}
    data_preprocess(
        opts.raw_data_file,
        opts.label_file,
        opts.train_data_path,
        opts.test_data_path,
        opts.test_start_time,
        **window_options
    )


# Run the preprocessing only when executed as a script, not on import.
if __name__ == '__main__':
    main()