Concept
몇가지 개념
정리 및 코드를 보고 진행합니다.
Outlier removal workflow
차분(Difference):
sort 후 각 항목의 차이 값
차분 값의 배열을 만들 수 있음.
사분위수(Quartile):
데이터를 가장 작은 수부터 가장 큰 수까지 크기가 커지는 순서대로 정렬하였을 때,
1/4, 2/4, 3/4 위치에 있는 수를 말한다.
각각 1사분위수, 2사분위수, 3사분위수라고 한다.
1/4의 위치란 전체 데이터의 수가 만약 100개이면 25번째 순서, 즉 하위 25%를 말한다.
따라서 2사분위수는 중앙값과 같다.
때로는 위치를 1/100 단위로 나눈 백분위수(percentile)을 사용하기도 한다. 1사분위수는 25% 백분위수와 같다.
np.percentile(x, 0) # 최소값
np.percentile(x, 25) # 1사분위 수
np.percentile(x, 50) # 2사분위 수
np.percentile(x, 75) # 3사분위 수
np.percentile(x, 100) # 최대값
아래는 위에 설명한 개념을 이용한 예제 프로그램입니다.
Outlier removal using difference series
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
# Set lower and upper bound of a sequence
# Percentile -> Difference -> x sigma
import numpy as np
import pandas as pd
# NumPy helpers used below:
#   numpy.min / numpy.max        smallest / largest value.
#   numpy.argmin / numpy.argmax  index at which the smallest / largest value sits.
#   numpy.arange([start,] stop[, step], dtype=None)
#                                evenly spaced values over the half-open
#                                interval [start, stop), returned as an array.
#   numpy.mean                   arithmetic mean.
#   numpy.diff(arr[, n])         n-th order difference.


class Self:
    """Derive outlier cut-offs for a series from its percentile differences.

    The series is reduced to ``scale + 1`` evenly spaced percentiles, the
    percentiles are differenced once, and the first unusually large gap found
    while walking outwards from the median marks the bound on each side.
    """

    def __init__(self):
        # Series used by calc_bound_from_dist() when none is passed explicitly.
        self.series = np.array([])

    def setSeries(self, arr):
        """Store *arr* (any array-like) as the default series."""
        self.series = np.array(arr)

    @staticmethod
    def _diff_threshold(diff, nonzero, start, stop, cut_ratio):
        """Return mean + cut_ratio * std of the non-zero diffs in [start, stop)."""
        part = diff[start:stop][nonzero[start:stop]]
        if part.size == 0:
            # Nothing to estimate from: an unreachable threshold makes the
            # caller fall back to its default index instead of averaging an
            # empty slice (NaN plus a RuntimeWarning).
            return np.inf
        return np.mean(part) + np.std(part) * cut_ratio

    def calc_bound_from_dist(self, series=None, scale=1000, cut_pt=10, cut_ratio=2):
        """Return ``(upper_bound, lower_bound)`` for *series*.

        Parameters
        ----------
        series : array-like, optional
            One-dimensional data (the original notes describe it as positive
            real values). Defaults to the series stored with ``setSeries``.
        scale : int
            Number of percentile steps; the resolution is ``100 / scale``
            percent.
        cut_pt : float
            Percentage trimmed from each tail before the mean/std of the
            differences is estimated (at least one step is always trimmed).
        cut_ratio : float
            Number of standard deviations above the mean difference at which
            a gap counts as a jump.

        Raises
        ------
        ValueError
            If the series is empty.
        """
        if series is None:
            # A default of ``self.series`` in the signature cannot work: it
            # would be evaluated once, at definition time, where ``self``
            # does not exist.
            series = self.series
        series = np.asarray(series, dtype=float)
        if series.size == 0:
            raise ValueError("series must contain at least one value")

        unit = 100 / scale
        center = int(scale / 2)
        cut_point = int(cut_pt / unit) if cut_pt / unit > 1 else 1

        # linspace guarantees exactly scale + 1 points ending at 100; a float
        # step in arange may produce one point too many or a value above 100.
        percentiles = np.percentile(series, np.linspace(0, 100, scale + 1))

        # First-order differencing: e(n + 1) - e(n).
        diff = np.diff(percentiles)
        # Ignore flat stretches when estimating the typical gap size.
        diff_nnz_idx = np.abs(diff) > 1e-6

        # --- Upper bound: walk from the median towards the maximum. ---
        upper_bound_diff = self._diff_threshold(
            diff, diff_nnz_idx, center, scale - cut_point + 1, cut_ratio
        )
        upper_offset = int(np.argmax(diff[center:] > upper_bound_diff))
        # argmax returns 0 both for "no jump" and "jump at the median itself";
        # both cases use the default index, as in the original logic.
        if upper_offset == 0:
            upper_bound_idx = scale - 1
        else:
            upper_bound_idx = center + upper_offset
        upper_bound = percentiles[upper_bound_idx]

        # --- Lower bound: walk from the median towards the minimum. ---
        lower_bound_diff = self._diff_threshold(
            diff, diff_nnz_idx, cut_point, center + 1, cut_ratio
        )
        lower_offset = int(np.argmax(diff[center::-1] > lower_bound_diff))
        if lower_offset == 0 or lower_offset == center:
            lower_bound_idx = 1
        else:
            lower_bound_idx = center - lower_offset
        # NOTE(review): this picks the percentile below the detected gap,
        # whereas the upper side picks the one before it - confirm the
        # asymmetry is intended.
        lower_bound = percentiles[lower_bound_idx]

        return upper_bound, lower_bound
|
boxcox transformation
복잡한 지수함수의 승수의 해를 뉴턴 메소드로 구하고 이를 정규분포로 변환합니다.
무작위 값으로 boxcox 값이 얼마나 정규분포로 잘 바꿔주는지
box-cox-transformation-using-python에 한눈에 보기 쉬운 예제가 있습니다.
아래 예제는 실제 측정값을 예제로 하는 예시입니다.
C.Doom의 Cygnus 방향으로 47 개의 별을 포함하는 성단 CYG OB1의 Hertzsprung-Russell Diagram 실측 데이터입니다.
첫 번째 변수는 별 표면에서 유효 온도의 로그 (log.Te)이고 두 번째 변수는 빛 강도의 로그(log.light)입니다.
아래의 예제에서는 log.Te 만 데이터로 사용합니다.
원본 데이터
원본 데이터 csv
예제를 돌리기 위해서 아나콘다 프레임워크를 설치하면 편리합니다.
아래는 예제에서는 scipy.stats.boxcox 라이브러리를 사용합니다.
scipy.stats.boxcox(x, lmbda=None, alpha=None)[source]
Return a dataset transformed by a Box-Cox power transformation.
Parameters
xndarray
Input array. Must be positive 1-dimensional. Must not be constant.
lmbda{None, scalar}, optional
If lmbda is not None, do the transformation for that value.
If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.
alpha{None, float}, optional
If alpha is not None, return the 100 * (1-alpha)% confidence interval for lmbda as the third output argument. Must be between 0.0 and 1.0.
Returns
boxcoxndarray
Box-Cox power transformed array.
maxlogfloat, optional
If the lmbda parameter is None, the second returned argument is the lambda that maximizes the log-likelihood function.
(min_ci, max_ci)tuple of float, optional
If lmbda parameter is None and alpha is not None, this returned tuple of floats represents the minimum and maximum confidence limits given alpha.
비정상 확률 과정을 정상 확률 과정으로 변환하기
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import boxcox
import seaborn as sns
import dautil as dl
import pandas_datareader as pd
import numpy as np
# import csv
#from IPython.display import HTML
# Box-Cox demo: transform the log.Te column of the starsCYG data set, dump the
# input and output to text files, and compare Q-Q plots / distributions
# before and after the transformation.

# Load the data and render the Box-Cox formula.
context = dl.nb.Context('normalizing_boxcox')
lr = dl.nb.LatexRenderer(chapter=4, start=3, context=context)
lr.render(r'y_i^{(\lambda)} = \begin{cases} \dfrac{y_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt] \ln{(y_i)} & \text{if } \lambda = 0, \end{cases} ')
starsCYG = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data
whichData = 'log.Te'

# boxcox() requires strictly positive, 1-dimensional input. With lmbda=None
# it returns the transformed data and the lambda that maximizes the
# log-likelihood.
transformed, lamda = boxcox(starsCYG[whichData])

print("1. input data : ")
print(starsCYG)
print(type(starsCYG))
np.savetxt('./input_data.txt'
           , (starsCYG)
           , header='--input data start--'
           , footer='--input data end--'
           , fmt='%1.2f')
print("\n\n")

print("2. input data [%s] : " % whichData)
print(starsCYG[whichData])
print(type(starsCYG[whichData]))
# Save only the selected column here; the original wrote the whole frame a
# second time, making this file a duplicate of input_data.txt.
np.savetxt('./input_data_x.txt'
           , (starsCYG[whichData])
           , header='--input data start--'
           , footer='--input data end--'
           , fmt='%1.2f')
print("\n\n")

print("3. transformed output : ")
print(transformed)
print(type(transformed))
print("max lamda value : ")
print(lamda)
print(type(lamda))
print("\n\n")
# NOTE(review): absolute Windows path, unlike the relative paths above - the
# directory must already exist; consider './output_data_x.txt'.
np.savetxt('D:/PythonProject/output_data_x.txt'
           , (transformed)
           , header='--output data start--'
           , footer='--output data end--'
           , fmt='%1.2f')

#region Plot
# Display the Q-Q plots (top row) and the distributions (bottom row),
# original data first, transformed data second.
sp = dl.plotting.Subplotter(2, 2, context)
sp.label()
sm.qqplot(starsCYG[whichData], fit=True, line='s', ax=sp.ax)
sp.label(advance=True)
sm.qqplot(transformed, fit=True, line='s', ax=sp.ax)
sp.label(advance=True)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot(..., kde=True) is the suggested replacement but scales the y-axis
# differently, so it is left unchanged here.
sns.distplot(starsCYG[whichData], ax=sp.ax)
sp.label(advance=True)
sns.distplot(transformed, ax=sp.ax)
plt.tight_layout()
plt.show()
#endregion Plot
|
다음은 postgres 에 적재하는 예시를 보겠습니다.
Postgres 연동은 Citus membership-manager.py 를 참고했습니다.
Boxcox transformation batch process example on postgres
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
import datetime
import csv
import numpy as np
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import os
import sys
import psycopg2
from datetime import datetime as dt
USAGE = "usage: <script> idValue timeValue cdate sigma"


def _manual_inv_boxcox(y, lmbda):
    """Invert the Box-Cox transform by hand, as a cross-check of inv_boxcox."""
    if lmbda == 0:
        return np.exp(y)
    # Inverse of (x ** lmbda - 1) / lmbda; the original omitted the "+ 1".
    return (y * lmbda + 1) ** (1 / lmbda)


def _sql_number(value):
    """Return *value* as a plain float, or None (SQL NULL) when not finite.

    psycopg2 adapts Python floats and None out of the box, but not numpy
    scalars, so the conversion is done here.
    """
    value = float(value)
    return value if np.isfinite(value) else None


def compute_thresholds(series, sigma):
    """Return ``(lower, upper)`` thresholds for *series* at *sigma* std-devs.

    The series is Box-Cox transformed, the band ``mean +/- sigma * std`` is
    taken in the transformed space and mapped back with ``inv_boxcox``. When
    the transformation fails (for example non-positive or constant data) or a
    mapped bound is not finite, the plain ``mean +/- sigma * std`` of the
    untransformed series is used for that bound instead.

    NOTE(review): the original listing assigned the untransformed bounds
    unconditionally after the NaN/inf checks, which discarded the Box-Cox
    result every time; its indentation was lost, so the intended scope of
    those two assignments should be confirmed.

    An empty series yields ``(nan, nan)``.
    """
    series = np.asarray(series, dtype=float)
    if series.size == 0:
        return float("nan"), float("nan")

    orgAvg = np.mean(series)
    orgStd = np.std(series)
    orgUp = orgAvg + (orgStd * sigma)
    orgDown = orgAvg - (orgStd * sigma)

    try:
        transformed, lamda = boxcox(series)
        print("transformed output : ", transformed)
        print("max lamda value : ", lamda)
        print("transformed: ", *transformed[:2])

        stddev = np.std(transformed)
        avg = np.mean(transformed)
        # Upper / lower boundary in the transformed space.
        up = avg + (stddev * sigma)
        down = avg - (stddev * sigma)
        # Map the boundaries back to the original scale.
        invup = inv_boxcox(up, lamda)
        invdown = inv_boxcox(down, lamda)
        with np.errstate(all="ignore"):
            invup2 = _manual_inv_boxcox(up, lamda)
            invdown2 = _manual_inv_boxcox(down, lamda)

        values = {
            "avg": avg, "stddev": stddev, "orgAvg": orgAvg, "orgStd": orgStd,
            "sigma": sigma, "up": up, "down": down, "orgUp": orgUp,
            "orgDown": orgDown, "invup": invup, "invdown": invdown,
            "invup2": invup2, "invdown2": invdown2,
        }
        # The templates were previously printed verbatim because no mapping
        # was ever applied to them.
        for template in (
            "avg value : %(avg)s",
            "std value : %(stddev)s",
            "orgAvg value : %(orgAvg)s",
            "orgStd value : %(orgStd)s",
            "sigma value : %(sigma)s",
            "up bodundary output : %(up)s",
            "down boundary output : %(down)s",
            "orgUp output : %(orgUp)s",
            "orgDown output : %(orgDown)s",
            "invup output : %(invup)s",
            "invdown output : %(invdown)s",
            "invup2 output : %(invup2)s",
            "invdown2 output : %(invdown2)s",
        ):
            print(template % values)
    except Exception as exc:  # best effort: fall back to the raw statistics
        print('except', repr(exc))
        print(orgAvg)
        print(orgStd)
        return float(orgDown), float(orgUp)

    # isfinite also catches -inf, which the string comparison missed.
    if not np.isfinite(invup):
        invup = orgUp
    if not np.isfinite(invdown):
        invdown = orgDown
    return float(invdown), float(invup)


def main(argv):
    """Read one series from Postgres, compute its thresholds and store them.

    *argv* follows ``sys.argv``: idValue, timeValue, cdate, sigma.
    """
    print("WORKFLOW START [BOXCOX] :", dt.isoformat(dt.utcnow()))
    if len(argv) < 5:
        sys.exit(USAGE)

    keys = {"idValue": argv[1], "timeValue": argv[2], "cdate": argv[3]}
    sigma = float(argv[4])
    print("idValue : %(idValue)s" % keys)
    print("timeValue : %(timeValue)s" % keys)
    print("cdate : %(cdate)s" % keys)
    print("sigma : %(sigma)s" % {"sigma": sigma})

    # SECURITY(review): credentials are hard-coded in the source; move them
    # to the environment or a service file before using this outside a demo.
    connParam = "dbname = 'db' user = 'user' host = '192.168.10.1' password = 'paranlee'"
    conn = psycopg2.connect(connParam)
    try:
        cur = conn.cursor()
        try:
            sql = """SELECT value
FROM ts_bg_alarm_outlier_threshold_data
WHERE cdate = %(cdate)s
AND idValue = %(idValue)s
AND timeValue = %(timeValue)s ;"""
            # The named placeholders are only bound when a mapping is passed.
            cur.execute(sql, keys)
            result = [r[0] for r in cur.fetchall()]
            inputSeries = np.array(result, dtype=float)
            print("inputSeries: ", inputSeries)

            invdown, invup = compute_thresholds(inputSeries, sigma)

            # Threshold update; non-finite bounds are stored as NULL.
            params = dict(keys, invdown=_sql_number(invdown), invup=_sql_number(invup))
            sql = """UPDATE
outlier_threshold_table
SET threshold_min = %(invdown)s,
threshold_max = %(invup)s
WHERE 1=1
AND cdate = %(cdate)s
AND idValue = %(idValue)s
AND timeValue = %(timeValue)s ;"""
            cur.execute(sql, params)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()
    print("WORKFLOW END [BOXCOX] :", dt.isoformat(dt.utcnow()))


if __name__ == "__main__":
    main(sys.argv)
|
PG 에는 내장 함수에 Boxcox transformation 이 없고,
부동소수점에 대한 예외처리가 좀 더 편리하게 할 수 있어,
PL/SQL 프로시저로 구현하지 않고, Python3 로 구현한 배치 프로그램 예시를 구현해보았습니다.
Summary
최종적으로 데이터 적재를 위해서는
- 이상점 추출 및 제외하기
- 정규분포 변환 후 표준편차로 적재할 데이터의 범위 설정, 역변환한 값을 적재하기
크게 2가지 워크플로우로 이루어지는 것을 확인했습니다.