-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimnormtolint.py
378 lines (330 loc) · 16.1 KB
/
simnormtolint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
import numpy as np
import scipy.stats
import scipy.integrate as integrate
import scipy.optimize as opt
import pandas as pd
def Kfactor(n, f = None, alpha = 0.05, P = 0.99, side = 1, method = 'HE', m=50):
K=None
if f == None:
f = n-1
if (len((n,)*1)) != len((f,)*1) and (len((f,)*1) > 1):
return 'Length of \'f\' needs to match length of \'n\'!'
if (side != 1) and (side != 2):
return 'Must specify one sided or two sided procedure'
if side ==1:
zp = scipy.stats.norm.ppf(P)
ncp = np.sqrt(n)*zp
ta = scipy.stats.nct.ppf(1-alpha,df = f, nc=ncp) #students t noncentralized
K = ta/np.sqrt(n)
else:
def Ktemp(n, f, alpha, P, method, m):
chia = scipy.stats.chi2.ppf(alpha, df = f)
k2 = np.sqrt(f*scipy.stats.ncx2.ppf(P,df=1,nc=(1/n))/chia) #noncentralized chi 2 (ncx2))
if method == 'HE':
def TEMP4(n, f, P, alpha):
chia = scipy.stats.chi2.ppf(alpha, df = f)
zp = scipy.stats.norm.ppf((1+P)/2)
za = scipy.stats.norm.ppf((2-alpha)/2)
dfcut = n**2*(1+(1/za**2))
V = 1 + (za**2)/n + ((3-zp**2)*za**4)/(6*n**2)
K1 = (zp * np.sqrt(V * (1 + (n * V/(2 * f)) * (1 + 1/za**2))))
G = (f-2-chia)/(2*(n+1)**2)
K2 = (zp * np.sqrt(((f * (1 + 1/n))/(chia)) * (1 + G)))
if f > dfcut:
K = K1
else:
K = K2
if K == np.nan or K == None:
K = 0
return K
#TEMP5 = np.vectorize(TEMP4())
K = TEMP4(n, f, P, alpha)
return K
elif method == 'HE2':
zp = scipy.stats.norm.ppf((1+P)/2)
K = zp * np.sqrt((1+1/n)*f/chia)
return K
elif method == 'WBE':
r = 0.5
delta = 1
while abs(delta) > 0.00000001:
Pnew = scipy.stats.norm.cdf(1/np.sqrt(n)+r) - scipy.stats.norm.cdf(1/np.sqrt(n)-r)
delta = Pnew-P
diff = scipy.stats.norm.pdf(1/np.sqrt(n)+r) + scipy.stats.norm.pdf(1/np.sqrt(n)-r)
r = r-delta/diff
K = r*np.sqrt(f/chia)
return K
elif method == 'ELL':
if f < n**2:
print("Warning Message:\nThe ellison method should only be used for f appreciably larger than n^2")
r = 0.5
delta = 1
zp = scipy.stats.norm.ppf((1+P)/2)
while abs(delta) > 0.00000001:
Pnew = scipy.stats.norm.cdf(zp/np.sqrt(n)+r) - scipy.stats.norm.cdf(zp/np.sqrt(n)-r)
delta = Pnew - P
diff = scipy.stats.norm.pdf(zp/np.sqrt(n)+r) + scipy.stats.norm.pdf(zp/np.sqrt(n)-r)
r = r-delta/diff
K = r*np.sqrt(f/chia)
return K
elif method == 'KM':
K = k2
return K
elif method == 'OCT':
delta = np.sqrt(n)*scipy.stats.norm.ppf((1+P)/2)
def Fun1(z,P,ke,n,f1,delta):
return (2 * scipy.stats.norm.cdf(-delta + (ke * np.sqrt(n * z))/(np.sqrt(f1))) - 1) * scipy.stats.chi2.pdf(z,f1)
def Fun2(ke, P, n, f1, alpha, m, delta):
return integrate.quad(Fun1,a = f1 * delta**2/(ke**2 * n), b = np.inf, args=(P,ke,n,f1,delta),limit = m)
def Fun3(ke,P,n,f1,alpha,m,delta):
f = Fun2(ke = ke, P = P, n = n, f1 = f1, alpha = alpha, m = m, delta = delta)
return abs(f[0] - (1-alpha))
K = opt.minimize(fun=Fun3, x0=k2,args=(P,n,f,alpha,m,delta), method = 'L-BFGS-B')['x']
return float(K)
elif method == 'EXACT':
def fun1(z,df1,P,X,n):
k = (scipy.stats.chi2.sf(df1*scipy.stats.ncx2.ppf(P,1,z**2)/X**2,df=df1)*np.exp(-0.5*n*z**2))
return k
def fun2(X,df1,P,n,alpha,m):
return integrate.quad(fun1,a =0, b = 5, args=(df1,P,X,n),limit=m)
def fun3(X,df1,P,n,alpha,m):
return np.sqrt(2*n/np.pi)*fun2(X,df1,P,n,alpha,m)[0]-(1-alpha)
K = opt.brentq(f=fun3,a=0,b=k2+(1000)/n, args=(f,P,n,alpha,m))
return K
K = Ktemp(n=n,f=f,alpha=alpha,P=P,method=method,m=m)
return K
def length(x):
if type(x) == float or type(x) == int or type(x) == np.int32 or type(x) == np.float64 or type(x) == np.float32 or type(x) == np.int64:
return 1
return len(x)
def Kfactorsim(n, l = None, alpha = 0.05, P = 0.99, side = 1, method = 'EXACT', m = 50):
'''
Estimating K-factors for Simultaneous Tolerance Intervals Based on Normality
Description
Estimates k-factors for simultaneous tolerance intervals based on normality.
K.factor.sim(n, l = None, alpha = 0.05, P = 0.99, side = 1,
method = ["EXACT", "BONF"], m = 50)
Parameters
----------
n : int or list
If method = "EXACT", this is the sample size of each of the l groups.
If method = "BONF", then n can be a vector of different sample sizes
for the l groups.
l : int, optional
The number of normal populations for which the k-factors will be
constructed simultaneously. If NULL, then it is taken to be the length
of n. The default is None.
alpha : float, optional
The level chosen such that 1-alpha is the confidence level. The
default is 0.05.
P : float, optional
The proportion of the population to be covered by the tolerance
interval. The default is 0.99.
side : 1 or 2, optional
Whether a k-factor for a 1-sided or 2-sided tolerance interval is
required (determined by side = 1 or side = 2, respectively).
method : string, optional
The method for calculating the k-factors. "EXACT" is an exact method
that can be used when all l groups have the same sample size. "BONF"
is an approximate method using the Bonferroni inequality, which can be
used when the l groups have different sample sizes. The default is
'EXACT'.
m : int, optional
The maximum number of subintervals to be used in the integrate
function. This is necessary only for method = "EXACT". The larger the
number, the more accurate the solution. Too low of a value can result
in an error. A large value can also cause the function to be slow for
method = "EXACT". The default is 50.
Returns
-------
K.factor returns the k-factor for simultaneous tolerance intervals based
on normality with the arguments specified above.
Note
-----
For larger combinations of n and l when side = 2 and method = "EXACT", the
calculation can be slow. For larger sample sizes when side = "BONF", there
may be some accuracy issues with the 1-sided calculation since it depends
on the noncentral t-distribution. The code is primarily intended to be
used for moderate values of the noncentrality parameter. It will not be
highly accurate, especially in the tails, for large values.
References
----------
Derek S. Young (2010). tolerance: An R Package for Estimating Tolerance
Intervals. Journal of Statistical Software, 36(5), 1-39.
URL http://www.jstatsoft.org/v36/i05/.
Krishnamoorthy, K. and Mathew, T. (2009), Statistical Tolerance Regions:
Theory, Applications, and Computation, Wiley.
Mee, R. W. (1990), Simultaneous Tolerance Intervals for Normal Populations
with Common Variance, Technometrics, 32, 83-92.
Examples
--------
Kfactorsim(n=4, alpha = 0.05, P = 0.99, side = 1, m = 50,method = 'BONF')
'''
if side != 1 and side != 2:
return 'Must specify a one-sided or two-sided procedure!'
if method.upper() == "EXACT":
if l == None:
l = 1
if length(n) != 1:
print('Sample size n cannot be a list of sample sizes.')
df = n*l-l
chia = scipy.stats.chi2.ppf(alpha,df)
k2 = np.sqrt(df*scipy.stats.chi2.ppf(P,1,1/n)/chia)
if side == 1:
def Ksimfun(K,n,P,alpha,l,m):
def fun_temp(z,K,n,P,l):
df = n*l-l
zp = scipy.stats.norm.ppf(P)
inside = np.sqrt(n) * (K*np.sqrt(z)/np.sqrt(df)-zp)
return scipy.stats.chi2.pdf(z,df)*scipy.stats.norm.cdf(inside)**l
return integrate.quad(fun_temp, a = 0, b = n*l*10, args = (K,n,P,l),limit = m)[0] - (1-alpha)
K = opt.brentq(Ksimfun, a=0, b = k2+100, args=(n,P,alpha,l,m), xtol=np.finfo(float).eps**0.5)
else:
def Ksimfun(K,n,P,alpha,l,m):
def fun_temp(z,K,n,P,l):
df = n*l-l
P1 = scipy.stats.chi2.sf(df*scipy.stats.ncx2.ppf(P,df=1,nc=z**2/n)/(K**2),df) #cdf lower tail = False
P2 = (2*scipy.stats.norm.cdf(z)-1)**(l-1)
return scipy.stats.norm.pdf(z)*P1*P2
return 2*l*integrate.quad(fun_temp, a = 0, b = n*l*10, args = (K,n,P,l), limit = m)[0] - (1-alpha)
K = opt.brentq(Ksimfun, a = 0, b = k2+100, args=(n,P,alpha,l,m), xtol=np.finfo(float).eps**0.5)
else:
if length(n) > 1 or l == None:
l = length(n)
if side == 1:
if length(n) != 1:
K = []
for i in range(len(n)):
K.append(Kfactor(n[i],f=sum(n)-l,alpha=alpha/l,P=P,side = 1))
else:
K = Kfactor(n,f=n-l,alpha=alpha/l,P=P,side = 1)
else:
if length(n) != 1:
K = []
for i in range(len(n)):
K.append(Kfactor(n[i],f=sum(n)-l,alpha=alpha/l,P=P,side=2,method="EXACT",m=m))
else:
K = Kfactor(n,f=n-l,alpha=alpha/l,P=P,side=2,method="EXACT",m=m)
return K
def simnormtolint(x, alpha = 0.05, P = 0.99, side = 1, method = 'EXACT', m = 50, lognorm = False):
'''
Simultaneous Normal (or Log-Normal) Tolerance Intervals
Description
Provides simultaneous 1-sided or 2-sided tolerance intervals for data
distributed according to either a normal distribution or log-normal
distribution.
simnormtol.int(x, alpha = 0.05, P = 0.99, side = 1,
method = ["EXACT", "BONF"], m = 50, log.norm = FALSE)
Parameters
----------
x: matrix or list
Either a matrix or list of vectors of the data. If a matrix, then the
columns are the samples from the different normal (or log-normal)
populations. If method = "EXACT", then x must be a matrix.
alpha: float, optional
The level chosen such that 1-alpha is the confidence level.
The default is 0.05.
P: float, optional
The proportion of the population to be covered by this tolerance
interval. The default is 0.99.
side: 1 or 2, optional
Whether a 1-sided or 2-sided tolerance interval is required
(determined by side = 1 or side = 2, respectively). The default is 1.
method: string, optional
The method for calculating the k-factors. "EXACT" is an exact method
that can be used when all l groups have the same sample size. "BONF"
is an approximate method using the Bonferroni inequality, which can be
used when the l groups have different sample sizes. The default is
"EXACT".
m: int, optional
The maximum number of subintervals to be used in the integrate
function. This is necessary only for method = "EXACT". The larger the
number, the more accurate the solution. Too low of a value can result
in an error. A large value can also cause the function to be slow for
method = "EXACT". The default is 50.
lognorm: bool, optional
If True, then the data is considered to be from a log-normal
distribution, in which case the output gives tolerance intervals for
the log-normal distribution. The default is False.
Details
Recall that if the random variable X is distributed according to a
log-normal distribution, then the random variable Y = ln(X) is distributed
according to a normal distribution.
Returns
-------
simnormtolint returns a data frame with items:
alpha:
The specified significance level.
P:
The proportion of the population covered by this tolerance interval.
mean:
The sample mean.
1-sided.lower:
The 1-sided lower tolerance bound. This is given only if side = 1.
1-sided.upper:
The 1-sided upper tolerance bound. This is given only if side = 1.
2-sided.lower:
The 2-sided lower tolerance bound. This is given only if side = 2.
2-sided.upper:
The 2-sided upper tolerance bound. This is given only if side = 2.
References
----------
Derek S. Young (2010). tolerance: An R Package for Estimating Tolerance
Intervals. Journal of Statistical Software, 36(5), 1-39.
URL http://www.jstatsoft.org/v36/i05/.
Krishnamoorthy, K. and Mathew, T. (2009), Statistical Tolerance Regions:
Theory, Applications, and Computation, Wiley.
Mee, R. W. (1990), Simultaneous Tolerance Intervals for Normal Populations
with Common Variance, Technometrics, 32, 83-92.
Examples
--------
## 95%/95% simultaneous 2-sided normal tolerance intervals for ONE sample
of unequal variances and means.
x=[]
x.extend(np.random.normal(size=20))
x.extend(np.random.normal(size=10,loc=1))
x.extend(np.random.normal(size = 12,loc=1,scale=2))
simnormtolint(x,method = 'BONF',side = 2)
'''
x = np.array(x)
if x.ndim != 2 and x.ndim != 1:
return "The data must be a matrix or a list."
if lognorm:
x = np.log(x)
if x.ndim == 2:
xbar = np.mean(x,axis = 0)
s2bar = np.std(x,axis = 0, ddof=1)**2
n = len(x)
l = len(x[0])
sp = np.sqrt(sum((n-1)*s2bar)/(n*l-l))
else:
if method == 'EXACT':
return 'The EXACT procedure is only available for equal sample sizes.'
xbar = np.mean(x)
s2bar = np.std(x,ddof = 1)**2
n = length(x)
l = length(x[0])
sp = np.sqrt(((n-1)*s2bar)/(n-l))
K = Kfactorsim(n=n,l=l,alpha=alpha,P=P,side=side,method=method,m=m)
lower = xbar - sp*K
upper = xbar + sp*K
if lognorm:
lower = np.exp(lower)
upper = np.exp(upper)
xbar = np.exp(xbar)
if side == 1 and x.ndim == 2:
temp = pd.DataFrame({'alpha':[alpha], 'P':[P], 'xbar':xbar[0], '1-sided,lower':lower[0],'1-sided.upper':upper[0]})
for i in range(1,len(xbar)):
temp.loc[len(temp.index)] = [alpha, P, xbar[i],lower[i],upper[i]]
return temp
elif side == 2 and x.ndim == 2:
temp = pd.DataFrame({'alpha':[alpha], 'P':[P], 'xbar':xbar[0], '2-sided,lower':lower[0],'2-sided.upper':upper[0]})
for i in range(1,len(xbar)):
temp.loc[len(temp.index)] = [alpha, P, xbar[i],lower[i],upper[i]]
return temp
elif side == 1 and x.ndim == 1:
return pd.DataFrame({'alpha':[alpha], 'P':[P], 'xbar':xbar, '1-sided,lower':lower,'1-sided.upper':upper})
else:
return pd.DataFrame({'alpha':[alpha], 'P':[P], 'xbar':xbar, '2-sided,lower':lower,'2-sided.upper':upper})
# x = np.array([[1,2,2,3],[3,3,4,5],[10,10,20,25]]).T
# print(simnormtolint(x,method = 'EXACT',side = 1))