Added percentile

master
Per.Andreas.Brodtkorb 14 years ago
parent 9b9a7dda38
commit 69fcf175f6

@ -292,7 +292,7 @@ class TKDE(_KDE):
------- -------
N = 20 N = 20
data = np.random.rayleigh(1, size=(N,)) data = np.random.rayleigh(1, size=(N,))
>>> data = array([ 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, >>> data = np.array([ 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887,
... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326,
... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487,
... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452])
@ -511,7 +511,7 @@ class KDE(_KDE):
------- -------
N = 20 N = 20
data = np.random.rayleigh(1, size=(N,)) data = np.random.rayleigh(1, size=(N,))
>>> data = array([ 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, >>> data = np.array([ 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887,
... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326,
... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487,
... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452])
@ -864,7 +864,7 @@ class Kernel(object):
-------- --------
N = 20 N = 20
data = np.random.rayleigh(1, size=(N,)) data = np.random.rayleigh(1, size=(N,))
>>> data = array([ 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, >>> data = np.array([ 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887,
... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326,
... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487,
... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452])
@ -1825,14 +1825,11 @@ def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), x1=None, x2=None):
Thus QL is questionable''' Thus QL is questionable'''
warnings.warn(msg) warnings.warn(msg)
ind, = np.where(np.diff(np.r_[Fi, 1]) > 0) # make sure Fi is strictly increasing by not considering duplicate values ind, = np.where(np.diff(np.r_[Fi, 1]) > 0) # make sure Fi is strictly increasing by not considering duplicate values
ui = tranproc(Fi[ind], fi[ind], p2) # calculating the inverse of Fi to find the index ui = tranproc(Fi[ind], fi[ind], p2) # calculating the inverse of Fi to find the index
# to the desired quantile level # to the desired quantile level
#ui=smooth(Fi(ind),fi(ind),1,p2(:),1) % alternative # ui=smooth(Fi(ind),fi(ind),1,p2(:),1) % alternative
#res=ui-ui2 # res=ui-ui2
if np.any(ui >= max(pdf.ravel())): if np.any(ui >= max(pdf.ravel())):
warnings.warn('The lowest percent level is too close to 0%') warnings.warn('The lowest percent level is too close to 0%')
@ -1843,11 +1840,241 @@ def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), x1=None, x2=None):
warnings.warn(msg) warnings.warn(msg)
ui[ui < 0] = 0.0 ui[ui < 0] = 0.0
return ui return ui
#def qlevels2(r, p=(10,30,50,70,90, 95, 99, 99.9), method=1):
# '''
# QLEVELS2 Calculates quantile levels which encloses P% of data
#
# CALL: [ql PL] = qlevels2(data,PL,method);
#
# ql = the discrete quantile levels, size Np x D
# data = data matrix, size N x D (D = # of dimensions)
# PL = percent level vector, length Np (default [10:20:90 95 99 99.9])
# method = 1 Interpolation so that F(X_(k)) == (k-0.5)/n. (default)
# 2 Interpolation so that F(X_(k)) == k/(n+1).
# 3 Based on the empirical distribution.
#
# QLEVELS2 sort the columns of data in ascending order and find the
# quantile levels for each column which encloses P% of the data.
#
# Examples : % Finding quantile levels enclosing P% of data:
# xs = rndnorm(0,1,100000,1);
# qls = qlevels2(pdfnorm(xs),[10:20:90 95 99 99.9]);
# % compared with the exact values
# ql = pdfnorm(invnorm((100-[10:20:90 95 99 99.9])/200));
#
# % Finding the median of xs:
# ql = qlevels2(xs,50);
#
# See also qlevels
# '''
# [n, d]=size(r);
# p = np.atleast_1d(p)
# if np.any((p<0) | (100<p)):
# ValueError('PL must satisfy 0 <= PL <= 100')
#
# if (n==1) && (d>1)
# r=r(:);
# n=d;
# d=1;
# end
# if d>1
# if min(size(p)) > 1
# error('Not both matrix r and matrix p input')
# end
# q = zeros(length(p),d);
# else
# q = zeros(size(p));
# end
# p = 1-p(:)/100;
# x = sort(r);
#
#
# if method == 3
# qq1 = x(ceil(max(1,p*n)),:);
# qq2 = x(floor(min(p*n+1,n)),:);
# qq = (qq1+qq2)/2;
# else
# x = [x(1,:); x; x(n,:)];
# if method == 2
# % This method is from Hjort's "Computer
# % intensive statistical methods" page 102
# i = p*(n+1)+1;
# else % Metod 1
# i = p*n+1.5;
# end
# iu = ceil(i);
# il = floor(i);
# d1 = (i-il)*ones(1,d);
# qq = x(il,:).*(1-d1)+x(iu,:).*d1;
# end
#
# q(:) = qq;
#
# return
def _compute_qth_percentile(sorted, q, axis, out, method):
if not np.isscalar(q):
p = [_compute_qth_percentile(sorted, qi, axis, None, method)
for qi in q]
if out is not None:
out.flat = p
return p
q = q / 100.0
if (q < 0) or (q > 1):
raise ValueError, "percentile must be in the range [0,100]"
indexer = [slice(None)] * sorted.ndim
Nx = sorted.shape[axis]
if method==1:
index = q*(Nx-1) # p(k) = k/n
elif method==2:
index = q*(Nx-1) + 0.5 # p(k) = (k-0.5)/n.
elif method==3:
index = q*Nx # p(k) = k/(n+1)
elif method==4:
index = q*(Nx-2) + 1 # p(k) = (k-1)/(n-1)
elif method==5:
index = q*(Nx-2./3) + 1./3 #p(k) = (k-1/3)/(n+1/3)
i = int(index)
if i == index:
indexer[axis] = slice(i, i+1)
weights = np.array(1)
sumval = 1.0
else:
indexer[axis] = slice(i, i+2)
j = i + 1
weights = np.array([(j - index), (index - i)],float)
wshape = [1]*sorted.ndim
wshape[axis] = 2
weights.shape = wshape
sumval = weights.sum()
# Use add.reduce in both cases to coerce data type as well as
# check and use out array.
return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval
def percentile(a, q, axis=None, out=None, overwrite_input=False, method=1):
"""
Compute the qth percentile of the data along the specified axis.
Returns the qth percentile of the array elements.
Parameters
----------
a : array_like
Input array or object that can be converted to an array.
q : float in range of [0,100] (or sequence of floats)
percentile to compute which must be between 0 and 100 inclusive
axis : {None, int}, optional
Axis along which the percentiles are computed. The default (axis=None)
is to compute the median along a flattened version of the array.
out : ndarray, optional
Alternative output array in which to place the result. It must
have the same shape and buffer length as the expected output,
but the type (of the output) will be cast if necessary.
method : scalar integer
defining the interpolation method. Valid options are
1 : p(k) = k/n. That is, linear interpolation of the empirical cdf.
2 : p(k) = (k-0.5)/n. That is a piecewise linear function where
the knots are the values midway through the steps of the
empirical cdf. This is popular amongst hydrologists. (default)
PRCTILE also uses this formula.
3 : p(k) = k/(n+1). Thus p(k) = E[F(x[k])].
This is used by Minitab and by SPSS.
4 : p(k) = (k-1)/(n-1). In this case, p(k) = mode[F(x[k])].
This is used by S.
5 : p(k) = (k-1/3)/(n+1/3). Then p(k) =~ median[F(x[k])].
The resulting quantile estimates are approximately
median-unbiased regardless of the distribution of x.
6 : p(k) = (k-3/8)/(n+1/4). The resulting quantile estimates are
approximately unbiased for the expected order statistics
if x is normally distributed.
overwrite_input : {False, True}, optional
If True, then allow use of memory of input array (a) for
calculations. The input array will be modified by the call to
median. This will save memory when you do not need to preserve
the contents of the input array. Treat the input as undefined,
but it will probably be fully or partially sorted. Default is
False. Note that, if `overwrite_input` is True and the input
is not already an ndarray, an error will be raised.
Returns
-------
pcntile : ndarray
A new array holding the result (unless `out` is specified, in
which case that array is returned instead). If the input contains
integers, or floats of smaller precision than 64, then the output
data-type is float64. Otherwise, the output data-type is the same
as that of the input.
See Also
--------
mean, median
Notes
-----
Given a vector V of length N, the qth percentile of V is the qth ranked
value in a sorted copy of V. A weighted average of the two nearest neighbors
is used if the normalized ranking does not match q exactly.
The same as the median if q is 0.5; the same as the min if q is 0;
and the same as the max if q is 1
Examples
--------
>>> a = np.array([[10, 7, 4], [3, 2, 1]])
>>> a
array([[10, 7, 4],
[ 3, 2, 1]])
>>> np.percentile(a, 50)
3.5
>>> np.percentile(a, 50, axis=0)
array([ 6.5, 4.5, 2.5])
>>> np.percentile(a, 50, axis=1)
array([ 7., 2.])
>>> m = np.percentile(a, 50, axis=0)
>>> out = np.zeros_like(m)
>>> np.percentile(a, 50, axis=0, out=m)
array([ 6.5, 4.5, 2.5])
>>> m
array([ 6.5, 4.5, 2.5])
>>> b = a.copy()
>>> np.percentile(b, 50, axis=1, overwrite_input=True)
array([ 7., 2.])
>>> assert not np.all(a==b)
>>> b = a.copy()
>>> np.percentile(b, 50, axis=None, overwrite_input=True)
3.5
>>> assert not np.all(a==b)
"""
a = np.asarray(a)
if q == 0:
return a.min(axis=axis, out=out)
elif q == 100:
return a.max(axis=axis, out=out)
if overwrite_input:
if axis is None:
sorted = a.ravel()
sorted.sort()
else:
a.sort(axis=axis)
sorted = a
else:
sorted = np.sort(a, axis=axis)
if axis is None:
axis = 0
return _compute_qth_percentile(sorted, q, axis, out, method)
def iqrange(data, axis=None): def iqrange(data, axis=None):
''' '''

Loading…
Cancel
Save