From 0bfe623f5cf1362e665e8eca2bc42e0f48cc287b Mon Sep 17 00:00:00 2001 From: "Per.Andreas.Brodtkorb" Date: Wed, 5 Mar 2014 01:55:42 +0000 Subject: [PATCH] Fixed more bugs in distributions.py --- pywafo/src/wafo/misc.py | 767 +- pywafo/src/wafo/objects.py | 5751 +++++------ pywafo/src/wafo/source/rind2007/krobovmod.mod | 34 +- pywafo/src/wafo/source/rind2007/rcrudemod.mod | 20 +- pywafo/src/wafo/source/rind2007/rind71mod.mod | 32 +- pywafo/src/wafo/source/rind2007/rindmod.mod | 16 +- pywafo/src/wafo/source/rind2007/swapmod.mod | 16 +- pywafo/src/wafo/stats/_binned_statistic.py | 402 + pywafo/src/wafo/stats/_constants.py | 24 + pywafo/src/wafo/stats/_continuous_distns.py | 4654 +++++++++ pywafo/src/wafo/stats/_discrete_distns.py | 1552 +-- .../src/wafo/stats/_distn_infrastructure.py | 2 +- pywafo/src/wafo/stats/kde_test.py | 15 + pywafo/src/wafo/stats/morestats.py | 3902 ++++---- pywafo/src/wafo/stats/stats.py | 8724 +++++++++-------- pywafo/src/wafo/stats/tests/__init__.py | 0 pywafo/src/wafo/stats/tests/common_tests.py | 310 +- .../wafo/stats/tests/test_binned_statistic.py | 2 +- .../wafo/stats/tests/test_continuous_basic.py | 244 +- .../wafo/stats/tests/test_discrete_basic.py | 47 +- .../wafo/stats/tests/test_distributions.py | 3796 +++---- pywafo/src/wafo/stats/tests/test_fit.py | 28 +- pywafo/src/wafo/stats/tests/test_morestats.py | 1596 +-- .../src/wafo/stats/tests/test_multivariate.py | 10 +- pywafo/src/wafo/stats/tests/test_rank.py | 2 +- .../stats/tests/test_tukeylambda_stats.py | 4 +- pywafo/src/wafo/test/test_misc.py | 863 +- 27 files changed, 19283 insertions(+), 13530 deletions(-) create mode 100644 pywafo/src/wafo/stats/_binned_statistic.py create mode 100644 pywafo/src/wafo/stats/_constants.py create mode 100644 pywafo/src/wafo/stats/_continuous_distns.py create mode 100644 pywafo/src/wafo/stats/kde_test.py create mode 100644 pywafo/src/wafo/stats/tests/__init__.py diff --git a/pywafo/src/wafo/misc.py b/pywafo/src/wafo/misc.py index 3c1f57c..d4f6f55 100644 --- a/pywafo/src/wafo/misc.py +++ b/pywafo/src/wafo/misc.py @@ -7,12 +7,13 @@ import sys import fractions import numpy as np from numpy import ( - abs, amax, any, logical_and, arange, linspace, atleast_1d, # atleast_2d, - array, asarray, broadcast_arrays, ceil, floor, frexp, hypot, + meshgrid, + abs, amax, any, logical_and, arange, linspace, atleast_1d, + array, asarray, ceil, floor, frexp, hypot, sqrt, arctan2, sin, cos, exp, log, mod, diff, empty_like, finfo, inf, pi, interp, isnan, isscalar, zeros, ones, linalg, r_, sign, unique, hstack, vstack, nonzero, where, extract) -from scipy.special import gammaln +from scipy.special import gammaln, gamma, psi from scipy.integrate import trapz, simps import warnings from plotbackend import plotbackend @@ -24,7 +25,8 @@ try: except: clib = None floatinfo = finfo(float) - +_TINY = np.finfo(float).tiny +_EPS = np.finfo(float).eps __all__ = [ 'is_numlike', 'JITImport', 'DotDict', 'Bunch', 'printf', 'sub_dict_select', @@ -1693,6 +1695,600 @@ def gravity(phi=45): 0.0000059 * sin(2 * phir) ** 2.) +def dea3(v0, v1, v2): + ''' + Extrapolate a slowly convergent sequence + + Parameters + ---------- + v0, v1, v2 : array-like + 3 values of a convergent sequence to extrapolate + + Returns + ------- + result : array-like + extrapolated value + abserr : array-like + absolute error estimate + + Description + ----------- + DEA3 attempts to extrapolate nonlinearly to a better estimate + of the sequence's limiting value, thus improving the rate of + convergence. 
The routine is based on the epsilon algorithm of + P. Wynn, see [1]_. + + Example + ------- + # integrate sin(x) from 0 to pi/2 + + >>> import numpy as np + >>> import numdifftools as nd + >>> Ei= np.zeros(3) + >>> linfun = lambda k : np.linspace(0,np.pi/2.,2.**(k+5)+1) + >>> for k in np.arange(3): + ... x = linfun(k) + ... Ei[k] = np.trapz(np.sin(x),x) + >>> [En, err] = nd.dea3(Ei[0], Ei[1], Ei[2]) + >>> truErr = Ei-1. + >>> (truErr, err, En) + (array([ -2.00805680e-04, -5.01999079e-05, -1.25498825e-05]), + array([ 0.00020081]), array([ 1.])) + + See also + -------- + dea + + Reference + --------- + .. [1] C. Brezinski (1977) + "Acceleration de la convergence en analyse numerique", + "Lecture Notes in Math.", vol. 584, + Springer-Verlag, New York, 1977. + ''' + E0, E1, E2 = np.atleast_1d(v0, v1, v2) + abs = np.abs # @ReservedAssignment + max = np.maximum # @ReservedAssignment + delta2, delta1 = E2 - E1, E1 - E0 + err2, err1 = abs(delta2), abs(delta1) + tol2, tol1 = max(abs(E2), abs(E1)) * _EPS, max(abs(E1), abs(E0)) * _EPS + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") # ignore division by zero and overflow + ss = 1.0 / delta2 - 1.0 / delta1 + smallE2 = (abs(ss * E1) <= 1.0e-3).ravel() + + result = 1.0 * E2 + abserr = err1 + err2 + E2 * _EPS * 10.0 + converged = (err1 <= tol1) & (err2 <= tol2).ravel() | smallE2 + k4, = (1 - converged).nonzero() + if k4.size > 0: + result[k4] = E1[k4] + 1.0 / ss[k4] + abserr[k4] = err1[k4] + err2[k4] + abs(result[k4] - E2[k4]) + return result, abserr + + +def hyp2f1_taylor(a, b, c, z, tol=1e-13, itermax=500): + a, b, c, z = np.broadcast_arrays(*np.atleast_1d(a, b, c, z)) + shape = a.shape + ak, bk, ck, zk = [d.ravel() for d in (a, b, c, z)] + ajm1 = np.ones(ak.shape) + bjm2 = 0.5 * np.ones(ak.shape) + bjm1 = np.ones(ak.shape) + hout = np.zeros(ak.shape) + k0 = np.arange(len(ak)) + for j in range(0, itermax): + aj = ajm1 * (ak + j) * (bk + j) / (ck + j) * zk / (j + 1) + bj = bjm1 + aj + h, err = dea3(bjm2, bjm1, bj) + k = np.flatnonzero(err > tol * np.abs(h)) + hout[k0] = h + if len(k) == 0: + break + k0 = k0[k] + ak, bk, ck, zk = ak[k], bk[k], ck[k], zk[k] + ajm1 = aj[k] + bjm2 = bjm1[k] + bjm1 = bj[k] + else: + warnings.warn(('Reached %d limit! \n' + + '#%d values did not converge! Max error=%g') % + (j, len(k), np.max(err))) + return hout.reshape(shape) + + +def hyp2f1(a, b, c, z, rho=0.5): + e1 = gammaln(a) + e2 = gammaln(b) + e3 = gammaln(c) + e4 = gammaln(b - a) + e5 = gammaln(a - b) + + e6 = gammaln(c - a) + e7 = gammaln(c - b) + e8 = gammaln(c - a - b) + e9 = gammaln(a + b - c) + _cmab = c-a-b + #~(np.round(cmab) == cmab & cmab <= 0) + if abs(z) <= rho: + h = hyp2f1_taylor(a, b, c, z, 1e-15) + elif abs(1 - z) <= rho: # % Require that |arg(1-z)| 10: + break + xjm2 = xjm1 + xjm1 = xj + else: + warnings.warn('Reached %d limit' % j) + return h + + +def hygfz(A, B, C, Z): + ''' Return hypergeometric function for a complex argument, F(a,b,c,z) + + Parameters + ---------- + a, b, c: + parameters where c <> 0,-1,-2,... 
+ z :--- Complex argument + ''' + X = np.real(Z) + Y = np.imag(Z) + EPS = 1.0e-15 + L0 = C == np.round(C) and C < 0.0e0 + L1 = abs(1.0 - X) < EPS and Y == 0.0 and C - A - B <= 0.0 + L2 = abs(Z + 1.0) < EPS and abs(C - A + B - 1.0) < EPS + L3 = A == np.round(A) and A < 0.0 + L4 = B == np.round(B) and B < 0.0 + L5 = C - A == np.round(C - A) and C - A <= 0.0 + L6 = C - B == np.round(C - B) and C - B <= 0.0 + AA = A + BB = B + A0 = abs(Z) + if (A0 > 0.95): + EPS = 1.0e-8 + PI = 3.141592653589793 + EL = .5772156649015329 + if (L0 or L1): + # 'The hypergeometric series is divergent' + return np.inf + + NM = 0 + if (A0 == 0.0 or A == 0.0 or B == 0.0): + ZHF = 1.0 + elif (Z == 1.0 and C - A - B > 0.0): + GC = gamma(C) + GCAB = gamma(C - A - B) + GCA = gamma(C - A) + GCB = gamma(C - B) + ZHF = GC * GCAB / (GCA * GCB) + elif L2: + G0 = sqrt(PI) * 2.0 ** (-A) + G1 = gamma(C) + G2 = gamma(1.0 + A / 2.0 - B) + G3 = gamma(0.5 + 0.5 * A) + ZHF = G0 * G1 / (G2 * G3) + elif L3 or L4: + if (L3): + NM = int(np.round(abs(A))) + if (L4): + NM = int(np.round(abs(B))) + ZHF = 1.0 + ZR = 1.0 + for K in range(NM): + ZR = ZR * (A + K) * (B + K) / ((K + 1.) * (C + K)) * Z + ZHF = ZHF + ZR + elif L5 or L6: + if (L5): + NM = np.round(abs(C - A)) + if (L6): + NM = np.round(abs(C - B)) + ZHF = 1.0 + 0j + ZR = 1.0 + 0j + for K in range(NM): + ZR *= (C - A + K) * (C - B + K) / ((K + 1.) * (C + K)) * Z + ZHF = ZHF + ZR + ZHF = (1.0 - Z) ** (C - A - B) * ZHF + elif (A0 <= 1.0): + if (X < 0.0): + Z1 = Z / (Z - 1.0) + if (C > A and B < A and B > 0.0): + A = BB + B = AA + + ZC0 = 1.0 / ((1.0 - Z) ** A) + ZHF = 1.0 + 0j + ZR0 = 1.0 + 0j + ZW = 0 + for K in range(500): + ZR0 *= (A + K) * (C - B + K) / ((K + 1.0) * (C + K)) * Z1 + ZHF += ZR0 + if (abs(ZHF - ZW) < abs(ZHF) * EPS): + break + ZW = ZHF + ZHF = ZC0 * ZHF + elif (A0 >= 0.90): + ZW = 0.0 + GM = 0.0 + MCAB = np.round(C - A - B) + if (abs(C - A - B - MCAB) < EPS): + M = int(np.round(C - A - B)) + GA = gamma(A) + GB = gamma(B) + GC = gamma(C) + GAM = gamma(A + M) + GBM = gamma(B + M) + PA = psi(A) + PB = psi(B) + if (M != 0): + GM = 1.0 + for j in range(1, abs(M)): + GM *= j + RM = 1.0 + for j in range(1, abs(M) + 1): # DO 35 J=1,abs(M) + RM *= j + ZF0 = 1.0 + ZR0 = 1.0 + ZR1 = 1.0 + SP0 = 0.0 + SP = 0.0 + if (M >= 0): + ZC0 = GM * GC / (GAM * GBM) + ZC1 = -GC * (Z - 1.0) ** M / (GA * GB * RM) + for K in range(1, M): + ZR0 = ZR0 * \ + (A + K - 1.) * (B + K - 1.) / \ + (K * (K - M)) * (1. - Z) + ZF0 = ZF0 + ZR0 + for K in range(M): + SP0 = SP0 + 1.0 / \ + (A + K) + 1.0 / (B + K) - 1. / (K + 1.) 
+ ZF1 = PA + PB + SP0 + 2.0 * EL + np.log(1.0 - Z) + for K in range(1, 501): + SP = SP + \ + (1.0 - A) / (K * (A + K - 1.0)) + ( + 1.0 - B) / (K * (B + K - 1.0)) + SM = 0.0 + for J in range(1, M): + SM += (1.0 - A) / ( + (J + K) * (A + J + K - 1.0)) + 1.0 / (B + J + K - 1.0) + + ZP = PA + PB + 2.0 * EL + SP + SM + np.log(1.0 - Z) + ZR1 = ZR1 * \ + (A + M + K - 1.0) * (B + M + K - 1.0) / ( + K * (M + K)) * (1.0 - Z) + ZF1 = ZF1 + ZR1 * ZP + if (abs(ZF1 - ZW) < abs(ZF1) * EPS): + break + ZW = ZF1 + ZHF = ZF0 * ZC0 + ZF1 * ZC1 + elif (M < 0): + M = -M + ZC0 = GM * GC / (GA * GB * (1.0 - Z) ** M) + ZC1 = -(-1) ** M * GC / (GAM * GBM * RM) + for K in range(1, M): + ZR0 = ZR0 * \ + (A - M + K - 1.0) * (B - M + K - 1.0) / ( + K * (K - M)) * (1.0 - Z) + ZF0 = ZF0 + ZR0 + for K in range(1, M + 1): + SP0 = SP0 + 1.0 / K + ZF1 = PA + PB - SP0 + 2.0 * EL + np.log(1.0 - Z) + for K in range(1, 501): + SP = SP + \ + (1.0 - A) / (K * (A + K - 1.0)) + ( + 1.0 - B) / (K * (B + K - 1.0)) + SM = 0.0 + for J in range(1, M + 1): + SM = SM + 1.0 / (J + K) + ZP = PA + PB + 2.0 * EL + SP - SM + np.log(1.0 - Z) + ZR1 = ZR1 * \ + (A + K - 1.) * (B + K - 1.) / \ + (K * (M + K)) * (1. - Z) + ZF1 = ZF1 + ZR1 * ZP + if (abs(ZF1 - ZW) < abs(ZF1) * EPS): + break + ZW = ZF1 + ZHF = ZF0 * ZC0 + ZF1 * ZC1 + else: + GA = gamma(A) + GB = gamma(B) + GC = gamma(C) + GCA = gamma(C - A) + GCB = gamma(C - B) + GCAB = gamma(C - A - B) + GABC = gamma(A + B - C) + ZC0 = GC * GCAB / (GCA * GCB) + ZC1 = GC * GABC / (GA * GB) * (1.0 - Z) ** (C - A - B) + ZHF = 0 + 0j + ZR0 = ZC0 + ZR1 = ZC1 + for K in range(1, 501): + ZR0 = ZR0 * \ + (A + K - 1.) * (B + K - 1.) / \ + (K * (A + B - C + K)) * (1. - Z) + ZR1 = ZR1 * \ + (C - A + K - 1.0) * (C - B + K - 1.0) / ( + K * (C - A - B + K)) * (1.0 - Z) + ZHF = ZHF + ZR0 + ZR1 + if (abs(ZHF - ZW) < abs(ZHF) * EPS): + break + ZW = ZHF + ZHF = ZHF + ZC0 + ZC1 + else: + ZW = 0.0 + Z00 = 1.0 #+ 0j + if (C - A < A and C - B < B): + Z00 = (1.0 - Z) ** (C - A - B) + A = C - A + B = C - B + ZHF = 1.0 + ZR = 1.0 + for K in range(1, 501): + ZR = ZR * \ + (A + K - 1.0) * (B + K - 1.0) / (K * (C + K - 1.0)) * Z + ZHF = ZHF + ZR + if (abs(ZHF - ZW) <= abs(ZHF) * EPS): + break + ZW = ZHF + ZHF = Z00 * ZHF + elif (A0 > 1.0): + MAB = np.round(A - B) + if (abs(A - B - MAB) < EPS and A0 <= 1.1): + B = B + EPS + if (abs(A - B - MAB) > EPS): + GA = gamma(A) + GB = gamma(B) + GC = gamma(C) + GAB = gamma(A - B) + GBA = gamma(B - A) + GCA = gamma(C - A) + GCB = gamma(C - B) + ZC0 = GC * GBA / (GCA * GB * (-Z) ** A) + ZC1 = GC * GAB / (GCB * GA * (-Z) ** B) + ZR0 = ZC0 + ZR1 = ZC1 + ZHF = 0.0 + 0j + for K in range(1, 501): + ZR0 = ZR0 * (A + K - 1.0) * (A - C + K) / ((A - B + K) * K * Z) + ZR1 = ZR1 * (B + K - 1.0) * (B - C + K) / ((B - A + K) * K * Z) + ZHF = ZHF + ZR0 + ZR1 + if (abs((ZHF - ZW) / ZHF) <= EPS): + break + ZW = ZHF + ZHF = ZHF + ZC0 + ZC1 + else: + if (A - B < 0.0): + A = BB + B = AA + CA = C - A + CB = C - B + NCA = np.round(CA) + NCB = np.round(CB) + if (abs(CA - NCA) < EPS or abs(CB - NCB) < EPS): + C = C + EPS + GA = gamma(A) + GC = gamma(C) + GCB = gamma(C - B) + PA = psi(A) + PCA = psi(C - A) + PAC = psi(A - C) + MAB = np.round(A - B + EPS) + ZC0 = GC / (GA * (-Z) ** B) + GM = gamma(A - B) + ZF0 = GM / GCB * ZC0 + ZR = ZC0 + for K in range(1, MAB): + ZR = ZR * (B + K - 1.0) / (K * Z) + T0 = A - B - K + G0 = gamma(T0) + GCBK = gamma(C - B - K) + ZF0 = ZF0 + ZR * G0 / GCBK + if (MAB == 0): + ZF0 = 0.0 + 0j + ZC1 = GC / (GA * GCB * (-Z) ** A) + SP = -2.0 * EL - PA - PCA + for J in range(1, MAB + 1): + 
SP = SP + 1.0 / J + ZP0 = SP + np.log(-Z) + SQ = 1.0 + for J in range(1, MAB + 1): + SQ = SQ * (B + J - 1.0) * (B - C + J) / J + ZF1 = (SQ * ZP0) * ZC1 + ZR = ZC1 + RK1 = 1.0 + SJ1 = 0.0 + W0 = 0.0 + for K in range(1, 10001): + ZR = ZR / Z + RK1 = RK1 * (B + K - 1.0) * (B - C + K) / (K * K) + RK2 = RK1 + for J in range(K + 1, K + MAB + 1): + RK2 = RK2 * (B + J - 1.0) * (B - C + J) / J + SJ1 = SJ1 + \ + (A - 1.0) / (K * (A + K - 1.0)) + \ + (A - C - 1.0) / (K * (A - C + K - 1.0)) + SJ2 = SJ1 + for J in range(K + 1, K + MAB + 1): + SJ2 = SJ2 + 1.0 / J + ZP = -2.0 * EL - PA - PAC + SJ2 - 1.0 / \ + (K + A - C) - PI / np.tan(PI * (K + A - C)) + np.log(-Z) + ZF1 = ZF1 + RK2 * ZR * ZP + WS = abs(ZF1) + if (abs((WS - W0) / WS) < EPS): + break + W0 = WS + ZHF = ZF0 + ZF1 + A = AA + B = BB + if (K > 150): + warnings.warn('Warning! You should check the accuracy') + return ZHF + +# def hypgf(a, b, c, x, abseps=0, releps=1e-13, kmax=10000): +# '''HYPGF Hypergeometric function F(a,b,c,x) +# +# CALL: [y ,abserr] = hypgf(a,b,c,x,abseps,releps) +# +# y = F(a,b,c,x) +# abserr = absolute error estimate +# a,b,c,x = input parameters +# abseps = requested absolute error +# releps = requested relative error +# +# HYPGF calculates one solution to Gauss's hypergeometric differential +# equation: +# +# x*(1-x)Y''(x)+[c-(a+b+1)*x]*Y'(x)-a*b*Y(x) = 0 +# where +# F(a,b,c,x) = Y1(x) = 1 + a*b*x/c + a*(a+1)*b*(b+1)*x^2/(c*(c+1))+.... +# +# +# Many elementary functions are special cases of F(a,b,c,x): +# 1/(1-x) = F(1,1,1,x) = F(1,b,b,x) = F(a,1,a,x) +# (1+x)^n = F(-n,b,b,-x) +# atan(x) = x*F(.5,1,1.5,-x^2) +# asin(x) = x*F(.5,.5,1.5,x^2) +# log(x) = x*F(1,1,2,-x) +# log(1+x)-log(1-x) = 2*x*F(.5,1,1.5,x^2) +# +# NOTE: only real x, abs(x) < 1 and c~=0,-1,-2,... are allowed. +# +# Examples: +# x = linspace(-.99,.99)'; +# [Sn1,err1] = hypgf(1,1,1,x) +# plot(x,abs(Sn1-1./(1-x)),'b',x,err1,'r'),set(gca,'yscale','log') +# [Sn2,err2] = hypgf(.5,.5,1.5,x.^2); +# plot(x,abs(x.*Sn2-asin(x)),'b',x,abs(x.*err2),'r'),set(gca,'yscale','log') +# +# +# Reference: +# --------- +# Kreyszig, Erwin (1988) +# Advanced engineering mathematics +# John Wiley & Sons, sixth edition, pp 204. +# ''' +# csize = common_shape(x, a, b, c) +# kmin = 2 +# fsum = np.zeros(csize) +# delta = np.zeros(csize) +# err = np.zeros(csize) +# +# ok = ~((np.round(c) == c & c <= 0) | np.abs(x) > 1) +# if np.any(~ok): +# warnings.warn('HYPGF', 'Illegal input: c = 0,-1,-2,... or abs(x)>1') +# fsum[~ok] = np.NaN +# err[~ok] = np.NaN +# +# k0=find(ok & abs(x)==1); +# if any(k0) +# cmab = c(k0)-a(k0)-b(k0); +# fsum(k0) = exp(gammaln(c(k0))+gammaln(cmab)-... 
+# gammaln(c(k0)-a(k0))-gammaln(c(k0)-b(k0))); +# err(k0) = eps; +# k00 = find(real(cmab)<=0); +# if any(k00) +# err(k0(k00)) = nan; +# fsum(k0(k00)) = nan; +# end +# end +# k=find(ok & abs(x)<1); +# if any(k), +# delta(k) = ones(size(k)); +# fsum(k) = delta(k); +# +# k1 = k; +# E = cell(1,3); +# E{3} = fsum(k); +# converge = 'n'; +# for ix=0:Kmax-1, +# delta(k1) = delta(k1).*((a(k1)+ix)./(ix+1)).*((b(k1)+ix)./(c(k1)+ ix)).*x(k1); +# fsum(k1) = fsum(k1)+delta(k1); +# +# E(1:2) = E(2:3); +# E{3} = fsum(k1); +# +# if ix>Kmin +# if useDEA, +# [Sn, err(k1)] = dea3(E{:}); +# k00 = find((abs(err(k1))) <= max(absEps,abs(relEps.*fsum(k1)))); +# if any(k00) +# fsum(k1(k00)) = Sn(k00); +# end +# if (ix==Kmax-1) +# fsum(k1) = Sn; +# end +# k0 = (find((abs(err(k1))) > max(absEps,abs(relEps.*fsum(k1))))); +# if any(k0),% compute more terms +# %nk=length(k0);%# of values we have to compute again +# E{2} = E{2}(k0); +# E{3} = E{3}(k0); +# else +# converge='y'; +# break; +# end +# else +# err(k1) = 10*abs(delta(k1)); +# k0 = (find((abs(err(k1))) > max(absEps,abs(relEps.* ... +# fsum(k1))))); +# if any(k0),% compute more terms +# %nk=length(k0);%# of values we have to compute again +# else +# converge='y'; +# break; +# end +# end +# k1 = k1(k0); +# end +# end +# if ~strncmpi(converge,'y',1) +# disp(sprintf('#%d values did not converge',length(k1))) +# end +# end +# %ix +# return + + def nextpow2(x): ''' Return next higher power of 2 @@ -1761,8 +2357,6 @@ def _discretize_linear(fun, a, b, tol=0.005, n=5): ''' Automatic discretization of function, linear gridding ''' - tiny = floatinfo.tiny - x = linspace(a, b, n) y = fun(x) @@ -1777,7 +2371,7 @@ def _discretize_linear(fun, a, b, tol=0.005, n=5): x = linspace(a, b, n) y = fun(x) y00 = interp(x, x0, y0) - err = 0.5 * amax(abs((y00 - y) / (abs(y00 + y) + tiny))) + err = 0.5 * amax(abs((y00 - y) / (abs(y00 + y) + _TINY))) return x, y @@ -1785,7 +2379,6 @@ def _discretize_adaptive(fun, a, b, tol=0.005, n=5): ''' Automatic discretization of function, adaptive gridding. ''' - tiny = floatinfo.tiny n += (mod(n, 2) == 0) # make sure n is odd x = linspace(a, b, n) fx = fun(x) @@ -1807,7 +2400,7 @@ def _discretize_adaptive(fun, a, b, tol=0.005, n=5): fy = fun(y) fy0 = interp(y, x, fx) - erri = 0.5 * (abs((fy0 - fy) / (abs(fy0 + fy) + tiny))) + erri = 0.5 * (abs((fy0 - fy) / (abs(fy0 + fy) + _TINY))) err = erri.max() @@ -1867,125 +2460,6 @@ def cart2polar(x, y, z=None): return t, r, z -def meshgrid(*xi, **kwargs): - """ - Return coordinate matrices from one or more coordinate vectors. - - Make N-D coordinate arrays for vectorized evaluations of - N-D scalar/vector fields over N-D grids, given - one-dimensional coordinate arrays x1, x2,..., xn. - - Parameters - ---------- - x1, x2,..., xn : array_like - 1-D arrays representing the coordinates of a grid. - indexing : 'xy' or 'ij' (optional) - cartesian ('xy', default) or matrix ('ij') indexing of output - sparse : True or False (default) (optional) - If True a sparse grid is returned in order to conserve memory. - copy : True (default) or False (optional) - If False a view into the original arrays are returned in order to - conserve memory - - Returns - ------- - X1, X2,..., XN : ndarray - For vectors `x1`, `x2`,..., 'xn' with lengths ``Ni=len(xi)`` , - return ``(N1, N2, N3,...Nn)`` shaped arrays if indexing='ij' - or ``(N2, N1, N3,...Nn)`` shaped arrays if indexing='xy' - with the elements of `xi` repeated to fill the matrix along - the first dimension for `x1`, the second for `x2` and so on. 
- - See Also - -------- - index_tricks.mgrid : Construct a multi-dimensional "meshgrid" - using indexing notation. - index_tricks.ogrid : Construct an open multi-dimensional "meshgrid" - using indexing notation. - - Examples - -------- - >>> x = np.linspace(0,1,3) # coordinates along x axis - >>> y = np.linspace(0,1,2) # coordinates along y axis - >>> xv, yv = meshgrid(x,y) # extend x and y for a 2D xy grid - >>> xv - array([[ 0. , 0.5, 1. ], - [ 0. , 0.5, 1. ]]) - >>> yv - array([[ 0., 0., 0.], - [ 1., 1., 1.]]) - >>> xv, yv = meshgrid(x,y, sparse=True) # make sparse output arrays - >>> xv - array([[ 0. , 0.5, 1. ]]) - >>> yv - array([[ 0.], - [ 1.]]) - - >>> meshgrid(x,y,sparse=True,indexing='ij') # change to matrix indexing - [array([[ 0. ], - [ 0.5], - [ 1. ]]), array([[ 0., 1.]])] - >>> meshgrid(x,y,indexing='ij') - [array([[ 0. , 0. ], - [ 0.5, 0.5], - [ 1. , 1. ]]), array([[ 0., 1.], - [ 0., 1.], - [ 0., 1.]])] - - >>> meshgrid(0,1,5) # just a 3D point - [array([[[0]]]), array([[[1]]]), array([[[5]]])] - >>> map(np.squeeze,meshgrid(0,1,5)) # just a 3D point - [array(0), array(1), array(5)] - >>> meshgrid(3) - array([3]) - >>> meshgrid(y) # 1D grid y is just returned - array([ 0., 1.]) - - `meshgrid` is very useful to evaluate functions on a grid. - - >>> x = np.arange(-5, 5, 0.1) - >>> y = np.arange(-5, 5, 0.1) - >>> xx, yy = meshgrid(x, y, sparse=True) - >>> z = np.sin(xx**2+yy**2)/(xx**2+yy**2) - """ - copy_ = kwargs.get('copy', True) - args = atleast_1d(*xi) - if not isinstance(args, list): - if args.size > 0: - return args.copy() if copy_ else args - else: - raise TypeError('meshgrid() take 1 or more arguments (0 given)') - - sparse = kwargs.get('sparse', False) - indexing = kwargs.get('indexing', 'xy') # 'ij' - - ndim = len(args) - s0 = (1,) * ndim - output = [x.reshape(s0[:i] + (-1,) + s0[i + 1::]) - for i, x in enumerate(args)] - - shape = [x.size for x in output] - - if indexing == 'xy': - # switch first and second axis - output[0].shape = (1, -1) + (1,) * (ndim - 2) - output[1].shape = (-1, 1) + (1,) * (ndim - 2) - shape[0], shape[1] = shape[1], shape[0] - - if sparse: - if copy_: - return [x.copy() for x in output] - else: - return output - else: - # Return the full N-D matrix (not only the 1-D vector) - if copy_: - mult_fact = ones(shape, dtype=int) - return [x * mult_fact for x in output] - else: - return broadcast_arrays(*output) - - def ndgrid(*args, **kwargs): """ Same as calling meshgrid with indexing='ij' (see meshgrid for @@ -2059,8 +2533,7 @@ def trangood(x, f, min_n=None, min_x=None, max_x=None, max_n=inf): xn = xo[-1] x0 = xo[0] L = float(xn - x0) - eps = floatinfo.eps - if ((nf < min_n) or (max_n < nf) or any(abs(ddx) > 10 * eps * (L))): + if ((nf < min_n) or (max_n < nf) or any(abs(ddx) > 10 * _EPS * (L))): # % pab 07.01.2001: Always choose the stepsize df so that # % it is an exactly representable number. # % This is important when calculating numerical derivatives and is @@ -2140,8 +2613,6 @@ def tranproc(x, f, x0, *xi): -------- trangood. 
""" - - eps = floatinfo.eps xo, fo, x0 = atleast_1d(x, f, x0) xi = atleast_1d(*xi) if not isinstance(xi, list): @@ -2165,7 +2636,7 @@ def tranproc(x, f, x0, *xi): if N > 0: y = [y0] hn = xo[1] - xo[0] - if hn ** N < sqrt(eps): + if hn ** N < sqrt(_EPS): msg = ('Numerical problems may occur for the derivatives in ' + 'tranproc.\nThe sampling of the transformation may be too small.') warnings.warn(msg) @@ -2602,5 +3073,33 @@ def test_docstrings(): import doctest doctest.testmod() + +def test_hyp2f1(): + # 1/(1-x) = F(1,1,1,x) = F(1,b,b,x) = F(a,1,a,x) +# (1+x)^n = F(-n,b,b,-x) +# atan(x) = x*F(.5,1,1.5,-x^2) +# asin(x) = x*F(.5,.5,1.5,x^2) +# log(x) = x*F(1,1,2,-x) +# log(1+x)-log(1-x) = 2*x*F(.5,1,1.5,x^2) + x = linspace(0., .7, 20) + y = hyp2f1_taylor(-1, -4, 1, .9) + y2 = hygfz(-1, -4, 1, .9) + y3 = hygfz(5, -300, 10, 0.5) + y4 = hyp2f1_taylor(5, -300, 10, 0.5) + #y = hyp2f1(0.1, 0.2, 0.3, 0.5) + #y = hyp2f1(1, 1.5, 3, -4 +3j) + #y = hyp2f1(5, 7.5, 2.5, 5) +# fun = lambda x : 1./(1-x) +# x = .99 +# y = hyp2f1(1,1,1,x) +# print(y-fun(x)) +# + plt = plotbackend + plt.interactive(False) + plt.semilogy(x, np.abs(y- 1. / (1 - x)) + 1e-20, 'r') + plt.show() + + if __name__ == "__main__": - test_docstrings() + #test_docstrings() + test_hyp2f1() \ No newline at end of file diff --git a/pywafo/src/wafo/objects.py b/pywafo/src/wafo/objects.py index 7177eab..b3b73e5 100644 --- a/pywafo/src/wafo/objects.py +++ b/pywafo/src/wafo/objects.py @@ -1,2851 +1,2900 @@ - - -# Name: module1 -# Purpose: -# -# Author: pab -# -# Created: 16.09.2008 -# Copyright: (c) pab 2008 -# Licence: - -#!/usr/bin/env python - - -from __future__ import division -from wafo.transform.core import TrData -from wafo.transform.models import TrHermite, TrOchi, TrLinear -from wafo.stats import edf, distributions -from wafo.misc import (nextpow2, findtp, findrfc, findtc, findcross, - ecross, JITImport, DotDict, gravity, findrfc_astm) -from wafodata import PlotData -from wafo.interpolate import SmoothSpline -from scipy.interpolate.interpolate import interp1d -from scipy.integrate.quadrature import cumtrapz #@UnresolvedImport -from scipy.special import ndtr as cdfnorm, ndtri as invnorm - -import warnings -import numpy as np - -from numpy import (inf, pi, zeros, ones, sqrt, where, log, exp, cos, sin, arcsin, mod, interp, #@UnresolvedImport - linspace, arange, sort, all, abs, vstack, hstack, atleast_1d, sign, expm1, #@UnresolvedImport - finfo, polyfit, r_, nonzero, cumsum, ravel, size, isnan, nan, ceil, diff, array) #@UnresolvedImport -from numpy.fft import fft -from numpy.random import randn -from scipy.integrate import trapz -from wafo.interpolate import stineman_interp -from matplotlib.mlab import psd, detrend_mean -import scipy.signal - - -from plotbackend import plotbackend -import matplotlib -from scipy.stats.stats import skew, kurtosis -from scipy.signal.windows import parzen -from scipy import special - - -floatinfo = finfo(float) -matplotlib.interactive(True) -_wafocov = JITImport('wafo.covariance') -_wafospec = JITImport('wafo.spectrum') - -__all__ = ['TimeSeries', 'LevelCrossings', 'CyclePairs', 'TurningPoints', - 'sensortypeid', 'sensortype'] - -def _invchi2(q, df): - return special.chdtri(df, q) - -class LevelCrossings(PlotData): - ''' - Container class for Level crossing data objects in WAFO - - Member variables - ---------------- - data : array-like - number of upcrossings or upcrossingintensity - args : array-like - crossing levels - - Examples - -------- - >>> import wafo.data - >>> import wafo.objects as wo - >>> x = 
wafo.data.sea() - >>> ts = wo.mat2timeseries(x) - - >>> tp = ts.turning_points() - >>> mm = tp.cycle_pairs() - - >>> lc = mm.level_crossings() - >>> h2 = lc.plot() - ''' - def __init__(self, *args, **kwds): - options = dict(title='Level crossing spectrum', - xlab='Levels', ylab='Count', - plotmethod='semilogy', - plot_args=['b'], - plot_args_children=['r--']) - options.update(**kwds) - super(LevelCrossings, self).__init__(*args, **options) - self.intensity = kwds.get('intensity', False) - self.sigma = kwds.get('sigma', None) - self.mean = kwds.get('mean', None) - #self.setplotter(plotmethod='step') - - icmax = self.data.argmax() - if self.data != None: - if self.sigma is None or self.mean is None: - logcros = where(self.data == 0.0, inf, -log(self.data)) - logcmin = logcros[icmax] - logcros = sqrt(2 * abs(logcros - logcmin)) - logcros[0:icmax + 1] = 2 * logcros[icmax] - logcros[0:icmax + 1] - ncr = 10 - p = polyfit(self.args[ncr:-ncr], logcros[ncr:-ncr], 1) #least square fit - if self.sigma is None: - self.sigma = 1.0 / p[0] #estimated standard deviation of x - if self.mean is None: - self.mean = -p[1] / p[0] #self.args[icmax] - cmax = self.data[icmax] - x = (self.args - self.mean) / self.sigma - y = cmax * exp(-x ** 2 / 2.0) - self.children = [PlotData(y, self.args)] - - def extrapolate(self, u_min=None, u_max=None, method='ml', dist='genpar', plotflag=0): - ''' - Returns an extrapolated level crossing spectrum - - Parameters - ----------- - u_min, u_max : real scalars - extrapolate below u_min and above u_max. - method : string - describing the method of estimation. Options are: - 'ml' : Maximum Likelihood method (default) - 'mps': Maximum Product Spacing method - dist : string - defining distribution function. Options are: - genpareto : Generalized Pareto distribution (GPD) - expon : Exponential distribution (GPD with k=0) - rayleigh : truncated Rayleigh distribution - plotflag : scalar integer - 1: Diagnostic plots. (default) - 0: Don't plot diagnostic plots. - - Returns - ------- - lc : LevelCrossing object - with the estimated level crossing spectrum - Est = Estimated parameters. [struct array] - - Extrapolates the level crossing spectrum (LC) for high and for low levels. - The tails of the LC is fitted to a survival function of a GPD. - H(x) = (1-k*x/s)^(1/k) (GPD) - The use of GPD is motivated by POT methods in extreme value theory. - For k=0 the GPD is the exponential distribution - H(x) = exp(-x/s), k=0 (expon) - The tails with the survival function of a truncated Rayleigh distribution. - H(x) = exp(-((x+x0).^2-x0^2)/s^2) (rayleigh) - where x0 is the distance from the truncation level to where the LC has its maximum. - The method 'gpd' uses the GPD. We recommend the use of 'gpd,ml'. - The method 'exp' uses the Exp. - The method 'ray' uses Ray, and should be used if the load is a Gaussian process. - - Example - ------- - >>> import wafo.data - >>> import wafo.objects as wo - >>> x = wafo.data.sea() - >>> ts = wo.mat2timeseries(x) - - >>> tp = ts.turning_points() - >>> mm = tp.cycle_pairs() - >>> lc = mm.level_crossings() - - >>> s = x[:,1].std() - >>> lc_gpd = lc.extrapolate(-2*s, 2*s) - >>> lc_exp = lc.extrapolate(-2*s, 2*s, dist='expon') - >>> lc_ray = lc.extrapolate(-2*s, 2*s, dist='rayleigh') - - lc.plot() - lc_gpd.plot() - lc_exp.plot() - lc_ray.plot() - - - See also - -------- - cmat2extralc, rfmextrapolate, lc2rfmextreme, extralc, fitgenpar - - References - ---------- - Johannesson, P., and Thomas, J-.J. (2000): - Extrapolation of Rainflow Matrices. 
- Preprint 2000:82, Mathematical statistics, Chalmers, pp. 18. - ''' - - - i_max = self.data.argmax() - c_max = self.data[i_max] - # Maximum of lc - lc_max = self.args[i_max] - - if u_min is None or u_max is None: - fraction = sqrt(c_max) - i = np.flatnonzero(self.data > fraction) - if u_min is None : - u_min = self.args[i.min()] - if u_max is None: - u_max = self.args[i.max()] - lcf, lcx = self.data, self.args - # Extrapolate LC for high levels - [lc_High, phat_high] = self._extrapolate(lcx, lcf, u_max, u_max - lc_max, method, dist); -# -# # Extrapolate LC for low levels - [lcEst1, phat_low] = self._extrapolate(-lcx[::-1], lcf[::-1], -u_min, lc_max - u_min, method, dist) - lc_Low = lcEst1[::-1, :] #[-lcEst1[::-1, 0], lcEst1[::-1, 1::]] - lc_Low[:,0] *= -1 -# Est.Low = Est1; -# - if plotflag: - plotbackend.semilogx(lcf, lcx, - lc_High[:, 1], lc_High[:, 0], - lc_Low[:, 1], lc_Low[:, 0]) - i_mask = (u_min u - lcx1, lcf1 = lcx[Iu], lcf[Iu] - lcf2, lcx2 = self._make_increasing(lcf1[::-1], lcx1[::-1]) - - nim1 = 0 - x = [] - for xk, ni in zip(lcx2.tolist(),lcf2.tolist()): - x.append(ones(ni - nim1) * xk) - nim1 = ni - - - x = np.hstack(x) - u - - df = 0.01 - xF = np.arange(0.0, 4 + df / 2, df) - lcu = np.interp(u, lcx, lcf) + 1 - # Estimate tail - if dist.startswith('gen'): - genpareto = distributions.genpareto - phat = genpareto.fit2(x, floc=0, method=method) - SF = phat.sf(xF) - - covar = phat.par_cov[::2, ::2] - # Calculate 90 # confidence region, an ellipse, for (k,s) - D, B = np.linalg.eig(covar); - b = phat.par[::2] - if b[0] > 0: - phat.upperlimit = u + b[1] / b[0] - - r = sqrt(-2 * log(1 - 90 / 100)) # 90 # confidence sphere - Nc = 16 + 1 - ang = linspace(0, 2 * pi, Nc) - c0 = np.vstack((r * sqrt(D[0]) * sin(ang), r * sqrt(D[1]) * cos(ang))) # 90# Circle - # plot(c0(1,:),c0(2,:)) - - c1 = np.dot(B, c0) + b[:,None] #* ones((1, len(c0))) # Transform to ellipse for (k,s) - # plot(c1(1,:),c1(2,:)), hold on - - # Calculate conf.int for lcu - # Assumtion: lcu is Poisson distributed - # Poissin distr. approximated by normal when calculating conf. int. - dXX = 1.64 * sqrt(lcu) # 90 # quantile for lcu - - lcEstCu = zeros((len(xF), Nc)) - lcEstCl = zeros((len(xF), Nc)) - for i in range(Nc): - k = c1[0, i] - s = c1[1, i] - SF2 = genpareto.sf(xF, k, scale=s) - lcEstCu[:, i] = (lcu + dXX) * (SF2) - lcEstCl[:, i] = (lcu - dXX) * (SF2) - #end - - lcEst = np.vstack((xF + u, lcu * (SF), - lcEstCl.min(axis=1), lcEstCu.max(axis=1))).T - elif dist.startswith('exp'): - expon = distributions.expon - phat = expon.fit2(x, floc=0, method=method) - SF = phat.sf(xF) - lcEst = np.vstack((xF + u, lcu * (SF))).T - - elif dist.startswith('ray') or dist.startswith('trun'): - phat = distributions.truncrayleigh.fit2(x, floc=0, method=method) - SF = phat.sf(xF) -# if False: -# n = len(x) -# Sx = sum((x + offset) ** 2 - offset ** 2) -# s = sqrt(Sx / n); # Shape parameter -# F = -np.expm1(-((xF + offset) ** 2 - offset ** 2) / s ** 2) - lcEst = np.vstack((xF + u, lcu * (SF))).T - else: - raise ValueError() - - return lcEst, phat - ## End extrapolate - - def _make_increasing(self, f, t=None): - # Makes the signal f strictly increasing. - - n = len(f) - if t is None: - t = np.arange(n) - ff = [f[0], ] - tt = [t[0], ] - - for i in xrange(1, n): - if f[i] > ff[-1]: - ff.append(f[i]) - tt.append(t[i]) - - return np.asarray(ff), np.asarray(tt) - - def sim(self, ns, alpha): - """ - Simulates process with given irregularity factor and crossing spectrum - - Parameters - ---------- - ns : scalar, integer - number of sample points. 
- alpha : real scalar - irregularity factor, 0>> import wafo.spectrum.models as sm - >>> from wafo.objects import mat2timeseries - >>> Sj = sm.Jonswap(Hm0=7) - >>> S = Sj.tospecdata() #Make spectrum object from numerical values - >>> alpha = S.characteristic('alpha')[0] - >>> n = 10000 - >>> xs = S.sim(ns=n) - >>> ts = mat2timeseries(xs) - >>> tp = ts.turning_points() - >>> mm = tp.cycle_pairs() - >>> lc = mm.level_crossings() - - >>> xs2 = lc.sim(n,alpha) - >>> ts2 = mat2timeseries(xs2) - >>> Se = ts2.tospecdata(L=324) - - >>> alpha2 = Se.characteristic('alpha')[0] - >>> np.round(alpha2*10) - array([ 7.]) - >>> np.abs(alpha-alpha2)<0.03 - array([ True], dtype=bool) - - >>> h0 = S.plot('b') - >>> h1 = Se.plot('r') - - >>> lc2 = ts2.turning_points().cycle_pairs().level_crossings() - - >>> import pylab as plt - >>> h = plt.subplot(211) - >>> h2 = lc2.plot() - >>> h = plt.subplot(212) - >>> h0 = lc.plot() - - """ - - # TODO: add a good example - f = linspace(0, 0.49999, 1000) - rho_st = 2. * sin(f * pi) ** 2 - 1. - tmp = alpha * arcsin(sqrt((1. + rho_st) / 2)) - tmp = sin(tmp) ** 2 - a2 = (tmp - rho_st) / (1 - tmp) - y = vstack((a2 + rho_st, 1 - a2)).min(axis=0) - maxidx = y.argmax() - #[maximum,maxidx]=max(y) - - rho_st = rho_st[maxidx] - a2 = a2[maxidx] - a1 = 2. * rho_st + a2 - 1. - r0 = 1. - r1 = -a1 / (1. + a2) - r2 = (a1 ** 2 - a2 - a2 ** 2) / (1 + a2) - sigma2 = r0 + a1 * r1 + a2 * r2 - #randn = np.random.randn - e = randn(ns) * sqrt(sigma2) - e[:2] = 0.0 - L0 = randn(1) - L0 = hstack((L0, r1 * L0 + sqrt(1 - r2 ** 2) * randn(1))) - #%Simulate the process, starting in L0 - lfilter = scipy.signal.lfilter - z0 = lfilter([1, a1, a2], ones(1), L0) - L, unused_zf = lfilter(ones(1), [1, a1, a2], e, axis=0, zi=z0) - - epsilon = 1.01 - min_L = min(L) - max_L = max(L) - maxi = max(abs(r_[min_L, max_L])) * epsilon - mini = -maxi - - u = linspace(mini, maxi, 101) - G = cdfnorm(u) #(1 + erf(u / sqrt(2))) / 2 - G = G * (1 - G) - - x = linspace(0, r1, 100) - factor1 = 1. / sqrt(1 - x ** 2) - factor2 = 1. / (1 + x) - integral = zeros(u.shape, dtype=float) - for i in range(len(integral)): - y = factor1 * exp(-u[i] * u[i] * factor2) - integral[i] = trapz(y, x) - #end - G = G - integral / (2 * pi) - G = G / max(G) - - Z = ((u >= 0) * 2 - 1) * sqrt(-2 * log(G)) - - sumcr = trapz(self.data, self.args) - lc = self.data / sumcr - lc1 = self.args - mcr = trapz(lc1 * lc, lc1) if self.mean is None else self.mean - if self.sigma is None: - scr = trapz(lc1 ** 2 * lc, lc1) - scr = sqrt(scr - mcr ** 2) - else: - scr = self.sigma - lc2 = LevelCrossings(lc, lc1, mean=mcr, sigma=scr, intensity=True) - - g = lc2.trdata()[0] - - - f = g.gauss2dat(Z) - G = TrData(f, u) - - process = G.dat2gauss(L) - return np.vstack((arange(len(process)), process)).T - - -## -## -## %Check the result without reference to getrfc: -## LCe = dat2lc(process) -## max(lc(:,2)) -## max(LCe(:,2)) -## -## clf -## plot(lc(:,1),lc(:,2)/max(lc(:,2))) -## hold on -## plot(LCe(:,1),LCe(:,2)/max(LCe(:,2)),'-.') -## title('Relative crossing intensity') -## -## %% Plot made by the function funplot_4, JE 970707 -## %param = [min(process(:,2)) max(process(:,2)) 100] -## %plot(lc(:,1),lc(:,2)/max(lc(:,2))) -## %hold on -## %plot(levels(param),mu/max(mu),'--') -## %hold off -## %title('Crossing intensity') -## %watstamp -## -## % Temporarily -## %funplot_4(lc,param,mu) - - - def trdata(self, mean=None, sigma=None, **options): - ''' - Estimate transformation, g, from observed crossing intensity, version2. 
- - Assumption: a Gaussian process, Y, is related to the - non-Gaussian process, X, by Y = g(X). - - Parameters - ---------- - mean, sigma : real scalars - mean and standard deviation of the process - **options : - csm, gsm : real scalars - defines the smoothing of the crossing intensity and the transformation g. - Valid values must be 0<=csm,gsm<=1. (default csm = 0.9 gsm=0.05) - Smaller values gives smoother functions. - param : - vector which defines the region of variation of the data X. - (default [-5, 5, 513]). - monitor : bool - if true monitor development of estimation - linextrap : bool - if true use a smoothing spline with a constraint on the ends to - ensure linear extrapolation outside the range of the data. (default) - otherwise use a regular smoothing spline - cvar, gvar : real scalars - Variances for the crossing intensity and the empirical transformation, g. (default 1) - ne : scalar integer - Number of extremes (maxima & minima) to remove from the estimation - of the transformation. This makes the estimation more robust against - outliers. (default 7) - ntr : scalar integer - Maximum length of empirical crossing intensity. The empirical - crossing intensity is interpolated linearly before smoothing if the - length exceeds ntr. A reasonable NTR (eg. 1000) will significantly - speed up the estimation for long time series without loosing any accuracy. - NTR should be chosen greater than PARAM(3). (default inf) - - Returns - ------- - gs, ge : TrData objects - smoothed and empirical estimate of the transformation g. - - - Notes - ----- - The empirical crossing intensity is usually very irregular. - More than one local maximum of the empirical crossing intensity - may cause poor fit of the transformation. In such case one - should use a smaller value of GSM or set a larger variance for GVAR. - If X(t) is likely to cross levels higher than 5 standard deviations - then the vector param has to be modified. For example if X(t) is - unlikely to cross a level of 7 standard deviations one can use - param = [-7 7 513]. - - Example - ------- - >>> import wafo.spectrum.models as sm - >>> import wafo.transform.models as tm - >>> from wafo.objects import mat2timeseries - >>> Hs = 7.0 - >>> Sj = sm.Jonswap(Hm0=Hs) - >>> S = Sj.tospecdata() #Make spectrum object from numerical values - >>> S.tr = tm.TrOchi(mean=0, skew=0.16, kurt=0, sigma=Hs/4, ysigma=Hs/4) - >>> xs = S.sim(ns=2**16, iseed=10) - >>> ts = mat2timeseries(xs) - >>> tp = ts.turning_points() - >>> mm = tp.cycle_pairs() - >>> lc = mm.level_crossings() - >>> g0, g0emp = lc.trdata(monitor=True) # Monitor the development - >>> g1, g1emp = lc.trdata(gvar=0.5 ) # Equal weight on all points - >>> g2, g2emp = lc.trdata(gvar=[3.5, 0.5, 3.5]) # Less weight on the ends - >>> int(S.tr.dist2gauss()*100) - 141 - >>> int(g0emp.dist2gauss()*100) - 380995 - >>> int(g0.dist2gauss()*100) - 143 - >>> int(g1.dist2gauss()*100) - 162 - >>> int(g2.dist2gauss()*100) - 120 - - g0.plot() # Check the fit. - - See also - troptset, dat2tr, trplot, findcross, smooth - - NB! the transformated data will be N(0,1) - - Reference - --------- - Rychlik , I., Johannesson, P., and Leadbetter, M.R. 
(1997) - "Modelling and statistical analysis of ocean wavedata - using a transformed Gaussian process", - Marine structures, Design, Construction and Safety, - Vol 10, pp 13--47 - ''' - - if mean is None: - mean = self.mean - if sigma is None: - sigma = self.sigma - - opt = DotDict(chkder=True, plotflag=False, csm=0.9, gsm=.05, - param=(-5, 5, 513), delay=2, linextrap=True, ntr=10000, ne=7, gvar=1) - opt.update(options) - param = opt.param - Ne = opt.ne - - ncr = len(self.data) - if ncr > opt.ntr and opt.ntr > 0: - x0 = linspace(self.args[Ne], self.args[-1 - Ne], opt.ntr) - lc1, lc2 = x0, interp(x0, self.args, self.data) - Ne = 0 - Ner = opt.ne - ncr = opt.ntr - else: - Ner = 0 - lc1, lc2 = self.args, self.data - ng = len(atleast_1d(opt.gvar)) - if ng == 1: - gvar = opt.gvar * ones(ncr) - else: - gvar = interp1d(linspace(0, 1, ng) , opt.gvar, kind='linear')(linspace(0, 1, ncr)) - - - uu = linspace(*param) - g1 = sigma * uu + mean - - if Ner > 0: # Compute correction factors - cor1 = trapz(lc2[0:Ner + 1], lc1[0:Ner + 1]) - cor2 = trapz(lc2[-Ner - 1::], lc1[-Ner - 1::]) - else: - cor1 = 0 - cor2 = 0 - - lc22 = hstack((0, cumtrapz(lc2, lc1) + cor1)) - - if self.intensity: - lc22 = (lc22 + 0.5 / ncr) / (lc22[-1] + cor2 + 1. / ncr) - else: - lc22 = (lc22 + 0.5) / (lc22[-1] + cor2 + 1) - - - lc11 = (lc1 - mean) / sigma - - lc22 = invnorm(lc22) #- ymean - - g2 = TrData(lc22.copy(), lc1.copy(), mean=mean, sigma=sigma) - g2.setplotter('step') - # NB! the smooth function does not always extrapolate well outside the edges - # causing poor estimate of g - # We may alleviate this problem by: forcing the extrapolation - # to be linear outside the edges or choosing a lower value for csm2. - - inds = slice(Ne, ncr - Ne) # indices to points we are smoothing over - slc22 = SmoothSpline(lc11[inds], lc22[inds], opt.gsm, opt.linextrap, gvar[inds])(uu) - - g = TrData(slc22.copy(), g1.copy(), mean=mean, sigma=sigma) - - if opt.chkder: - for ix in range(5): - dy = diff(g.data) - if any(dy <= 0): - warnings.warn( - ''' The empirical crossing spectrum is not sufficiently smoothed. - The estimated transfer function, g, is not a strictly increasing function. 
- ''') - eps = finfo(float).eps - dy[dy > 0] = eps - gvar = -(hstack((dy, 0)) + hstack((0, dy))) / 2 + eps - g.data = SmoothSpline(g.args, g.data, 1, opt.linextrap, ix * gvar)(g.args) - else: - break - - if opt.plotflag > 0: - g.plot() - g2.plot() - - return g, g2 -def test_levelcrossings_extrapolate(): - import wafo.data - #import wafo.objects as wo - x = wafo.data.sea() - ts = mat2timeseries(x) - - tp = ts.turning_points() - mm = tp.cycle_pairs() - lc = mm.level_crossings() - - s = x[:,1].std() - lc_gpd = lc.extrapolate(-2*s, 2*s, dist='rayleigh') #@UnusedVariable - -class CyclePairs(PlotData): - ''' - Container class for Cycle Pairs data objects in WAFO - - Member variables - ---------------- - data : array_like - args : vector for 1D - - Examples - -------- - >>> import wafo.data - >>> import wafo.objects as wo - >>> x = wafo.data.sea() - >>> ts = wo.mat2timeseries(x) - - >>> tp = ts.turning_points() - >>> mm = tp.cycle_pairs() - >>> h1 = mm.plot(marker='x') - ''' - def __init__(self, *args, **kwds): - self.kind = kwds.pop('kind', 'min2max') - self.sigma = kwds.pop('sigma', None) - self.mean = kwds.pop('mean', None) - self.time = kwds.pop('time', 1) - - options = dict(title=self.kind + ' cycle pairs', - xlab='min', ylab='max', - plot_args=['b.']) - options.update(**kwds) - super(CyclePairs, self).__init__(*args, **options) - - def amplitudes(self): - return (self.data - self.args) / 2. - - def damage(self, beta, K=1): - """ - Calculates the total Palmgren-Miner damage of cycle pairs. - - Parameters - ---------- - beta : array-like, size m - Beta-values, material parameter. - K : scalar, optional - K-value, material parameter. - - Returns - ------- - D : ndarray, size m - Damage. - - Notes - ----- - The damage is calculated according to - D[i] = sum ( K * a**beta[i] ), with a = (max-min)/2 - - Examples - -------- - >>> import wafo - >>> from matplotlib import pyplot as plt - >>> ts = wafo.objects.mat2timeseries(wafo.data.sea()) - >>> tp = ts.turning_points() - >>> mm = tp.cycle_pairs() - >>> h = mm.plot(marker='.') - >>> bv = range(3,9) - >>> D = mm.damage(beta=bv) - >>> D - array([ 138.5238799 , 117.56050788, 108.99265423, 107.86681126, - 112.3791076 , 122.08375071]) - >>> h = plt.plot(bv,D,'x-') - - See also - -------- - SurvivalCycleCount - """ - amp = abs(self.amplitudes()) - return atleast_1d([K * np.sum(amp ** betai) for betai in beta]) - - def level_crossings(self, kind='uM', intensity=False): - """ Return level crossing spectrum from a cycle count. - - Parameters - ---------- - kind : int or string - defining crossing type, options are - 0,'u' : only upcrossings. - 1,'uM' : upcrossings and maxima (default). - 2,'umM': upcrossings, minima, and maxima. - 3,'um' : upcrossings and minima. - intensity : bool - True if level crossing intensity spectrum - False if level crossing count spectrum - Return - ------ - lc : level crossing object - with levels and number of upcrossings. - - - Calculates the number of upcrossings from a cycle pairs, e.g. - min2Max cycles or rainflow cycles. 
- - Example: - -------- - >>> import wafo - >>> ts = wafo.objects.mat2timeseries(wafo.data.sea()) - >>> tp = ts.turning_points() - >>> mm = tp.cycle_pairs() - >>> h = mm.plot(marker='.') - >>> lc = mm.level_crossings() - >>> h2 = lc.plot() - - See also - -------- - TurningPoints - LevelCrossings - """ - - if isinstance(kind, str): - t = dict(u=0, uM=1, umM=2, um=3) - defnr = t.get(kind, 1) - else: - defnr = kind - - if ((defnr < 0) or (defnr > 3)): - raise ValueError('kind must be one of (1,2,3,4).') - - index, = nonzero(self.args <= self.data) - if index.size == 0: - index, = nonzero(self.args >= self.data) - M = self.args[index] - m = self.data[index] - else: - m = self.args[index] - M = self.data[index] - -#if isempty(index) -# error('Error in input cc.') -#end - ncc = len(m) - - minima = vstack((m, ones(ncc), zeros(ncc), ones(ncc))) - maxima = vstack((M, -ones(ncc), ones(ncc), zeros(ncc))) - - extremes = hstack((maxima, minima)) - index = extremes[0].argsort() - extremes = extremes[:, index] - - ii = 0 - n = extremes.shape[1] - extr = zeros((4, n)) - extr[:, 0] = extremes[:, 0] - for i in xrange(1, n): - if extremes[0, i] == extr[0, ii]: - extr[1:4, ii] = extr[1:4, ii] + extremes[1:4, i] - else: - ii += 1 - extr[:, ii] = extremes[:, i] - - #[xx nx]=max(extr(:,1)) - nx = extr[0].argmax() + 1 - levels = extr[0, 0:nx] - if defnr == 2: ## This are upcrossings + maxima - dcount = cumsum(extr[1, 0:nx]) + extr[2, 0:nx] - extr[3, 0:nx] - elif defnr == 4: # # This are upcrossings + minima - dcount = cumsum(extr[1, 0:nx]) - dcount[nx - 1] = dcount[nx - 2] - elif defnr == 1: ## This are only upcrossings - dcount = cumsum(extr[1, 0:nx]) - extr[3, 0:nx] - elif defnr == 3: ## This are upcrossings + minima + maxima - dcount = cumsum(extr[1, 0:nx]) + extr[2, 0:nx] - ylab = 'Count' - if intensity: - dcount = dcount / self.time - ylab = 'Intensity [count/sec]' - return LevelCrossings(dcount, levels, mean=self.mean, sigma=self.sigma, ylab=ylab, intensity=intensity) - -class TurningPoints(PlotData): - ''' - Container class for Turning Points data objects in WAFO - - Member variables - ---------------- - data : array_like - args : vector for 1D - - Examples - -------- - >>> import wafo.data - >>> import wafo.objects as wo - >>> x = wafo.data.sea() - >>> ts = wo.mat2timeseries(x) - - >>> tp = ts.turning_points() - >>> h1 = tp.plot(marker='x') - ''' - def __init__(self, *args, **kwds): - self.name_ = kwds.pop('name', 'WAFO TurningPoints Object') - self.sigma = kwds.pop('sigma', None) - self.mean = kwds.pop('mean', None) - - options = dict(title='Turning points') - #plot_args=['b.']) - options.update(**kwds) - super(TurningPoints, self).__init__(*args, **options) - - if not any(self.args): - n = len(self.data) - self.args = range(0, n) - else: - self.args = ravel(self.args) - self.data = ravel(self.data) - - def rainflow_filter(self, h=0.0, method='clib'): - ''' - Return rainflow filtered turning points (tp). - - Parameters - ---------- - h : scalar - a threshold - if h<=0, then tp is a sequence of turning points (default) - if h>0, then all rainflow cycles with height smaller than - h are removed. - - Returns - ------- - tp : TurningPoints object - with times and turning points. 
- - Example: - >>> import wafo.data - >>> x = wafo.data.sea() - >>> x1 = x[:200,:] - >>> ts1 = mat2timeseries(x1) - >>> tp = ts1.turning_points(wavetype='Mw') - >>> tph = tp.rainflow_filter(h=0.3) - >>> hs = ts1.plot() - >>> hp = tp.plot('ro') - >>> hph = tph.plot('k.') - - See also - --------- - findcross, - findrfc - findtp - ''' - ind = findrfc(self.data, max(h, 0.0), method) - try: - t = self.args[ind] - except: - t = ind - mean = self.mean - sigma = self.sigma - return TurningPoints(self.data[ind], t, mean=mean, sigma=sigma) - - def cycle_pairs(self, h=0, kind='min2max', method='clib'): - """ Return min2Max or Max2min cycle pairs from turning points - - Parameters - ---------- - kind : string - type of cycles to return options are 'min2max' or 'max2min' - method : string - specify which library to use - 'clib' for wafo's c_library - 'None' for wafo's Python functions - - Return - ------ - mm : cycles object - with min2Max or Max2min cycle pairs. - - Example - ------- - >>> import wafo - >>> x = wafo.data.sea() - >>> ts = wafo.objects.mat2timeseries(x) - >>> tp = ts.turning_points() - >>> mM = tp.cycle_pairs() - >>> h = mM.plot(marker='x') - - - See also - -------- - TurningPoints - SurvivalCycleCount - """ - - if h > 0: - ind = findrfc(self.data, h, method=method) - data = self.data[ind] - else: - data = self.data - if data[0] > data[1]: - im = 1 - iM = 0 - else: - im = 0 - iM = 1 - - # Extract min-max and max-min cycle pairs - #n = len(self.data) - if kind.lower().startswith('min2max'): - m = data[im:-1:2] - M = data[im + 1::2] - else: - kind = 'max2min' - M = data[iM:-1:2] - m = data[iM + 1::2] - - time = self.args[-1] - self.args[0] - - return CyclePairs(M, m, kind=kind, mean=self.mean, sigma=self.sigma, - time=time) - - def cycle_astm(self): - """ - Rainflow counted cycles according to Nieslony's ASTM implementation - - Parameters - ---------- - - Returns - ------- - sig_rfc : array-like - array of shape (n,3) with: - sig_rfc[:,0] Cycles amplitude - sig_rfc[:,1] Cycles mean value - sig_rfc[:,2] Cycle type, half (=0.5) or full (=1.0) - - References - ---------- - Adam Nieslony, "Determination of fragments of multiaxial service loading - strongly influencing the fatigue of machine components", - Mechanical Systems and Signal Processing 23, no. 8 (2009): 2712-2721. - - and is based on the following standard: - ASTM E 1049-85 (Reapproved 1997), Standard practices for cycle counting - in fatigue analysis, in: Annual Book of ASTM Standards, - vol. 03.01, ASTM, Philadelphia, 1999, pp. 710-718. - - Copyright (c) 1999-2002 by Adam Nieslony - Ported to Python by David Verelst - - Example - ------- - >>> import wafo - >>> x = wafo.data.sea() - >>> sig_ts = wafo.objects.mat2timeseries(x) - >>> sig_tp = sig_ts.turning_points(h=0, wavetype='astm') - >>> sig_cp = sig_tp.cycle_astm() - """ - - # output of Nieslony's algorithm is organised differently with - # respect to wafo's approach - # TODO: integrate ASTM method into the CyclyPairs class? - return findrfc_astm(self.data) - - -def mat2timeseries(x): - """ - Convert 2D arrays to TimeSeries object - assuming 1st column is time and the remaining columns contain data. - """ - return TimeSeries(x[:, 1::], x[:, 0].ravel()) - -class TimeSeries(PlotData): - ''' - Container class for 1D TimeSeries data objects in WAFO - - Member variables - ---------------- - data : array_like - args : vector for 1D, list of vectors for 2D, 3D, ... 
- - sensortypes : list of integers or strings - sensor type for time series (default ['n'] : Surface elevation) - see sensortype for more options - position : vector of size 3 - instrument position relative to the coordinate system - - Examples - -------- - >>> import wafo.data - >>> import wafo.objects as wo - >>> x = wafo.data.sea() - >>> ts = wo.mat2timeseries(x) - >>> rf = ts.tocovdata(lag=150) - >>> h = rf.plot() - - >>> S = ts.tospecdata() - The default L is set to 325 - - >>> tp = ts.turning_points() - >>> mm = tp.cycle_pairs() - >>> h1 = mm.plot(marker='x') - - >>> lc = mm.level_crossings() - >>> h2 = lc.plot() - - ''' - def __init__(self, *args, **kwds): - self.name_ = kwds.pop('name', 'WAFO TimeSeries Object') - self.sensortypes = kwds.pop('sensortypes', ['n', ]) - self.position = kwds.pop('position', [zeros(3), ]) - - super(TimeSeries, self).__init__(*args, **kwds) - - if not any(self.args): - n = len(self.data) - self.args = range(0, n) - - - def sampling_period(self): - ''' - Returns sampling interval - - Returns - ------- - dt : scalar - sampling interval, unit: - [s] if lagtype=='t' - [m] otherwise - - See also - ''' - dt1 = self.args[1] - self.args[0] - n = size(self.args) - 1 - t = self.args[-1] - self.args[0] - dt = t / n - if abs(dt - dt1) > 1e-10: - warnings.warn('Data is not uniformly sampled!') - return dt - - def tocovdata(self, lag=None, flag='biased', norm=False, dt=None): - ''' - Return auto covariance function from data. - - Parameters - ---------- - lag : scalar, int - maximum time-lag for which the ACF is estimated. (Default lag=n-1) - flag : string, 'biased' or 'unbiased' - If 'unbiased' scales the raw correlation by 1/(n-abs(k)), - where k is the index into the result, otherwise scales the raw - cross-correlation by 1/n. (default) - norm : bool - True if normalize output to one - dt : scalar - time-step between data points (default see sampling_period). - - Return - ------- - R : CovData1D object - with attributes: - data : ACF vector length L+1 - args : time lags length L+1 - sigma : estimated large lag standard deviation of the estimate - assuming x is a Gaussian process: - if R(k)=0 for all lags k>q then an approximation - of the variance for large samples due to Bartlett - var(R(k))=1/N*(R(0)^2+2*R(1)^2+2*R(2)^2+ ..+2*R(q)^2) - for k>q and where N=length(x). Special case is - white noise where it equals R(0)^2/N for k>0 - norm : bool - If false indicating that R is not normalized - - Example: - -------- - >>> import wafo.data - >>> import wafo.objects as wo - >>> x = wafo.data.sea() - >>> ts = wo.mat2timeseries(x) - >>> acf = ts.tocovdata(150) - >>> h = acf.plot() - ''' - n = len(self.data) - if not lag: - lag = n - 1 - - x = self.data.flatten() - indnan = isnan(x) - if any(indnan): - x = x - x[1 - indnan].mean() # remove the mean pab 09.10.2000 - #indnan = find(indnan) - Ncens = n - sum(indnan) - x[indnan] = 0. # pab 09.10.2000 much faster for censored samples - else: - indnan = None - Ncens = n - x = x - x.mean() - - #fft = np.fft.fft - nfft = 2 ** nextpow2(n) - Rper = abs(fft(x, nfft)) ** 2 / Ncens # Raw periodogram - - R = np.real(fft(Rper)) / nfft # %ifft=fft/nfft since Rper is real! - lags = range(0, lag + 1) - if flag.startswith('unbiased'): - # unbiased result, i.e. divide by n-abs(lag) - R = R[lags] * Ncens / arange(Ncens, Ncens - lag, -1) - #else % biased result, i.e. 
divide by n - # r=r(1:L+1)*Ncens/Ncens - - c0 = R[0] - if norm: - R = R / c0 - r0 = R[0] - if dt is None: - dt = self.sampling_period() - t = linspace(0, lag * dt, lag + 1) - #cumsum = np.cumsum - acf = _wafocov.CovData1D(R[lags], t) - acf.sigma = sqrt(r_[ 0, r0 ** 2 , r0 ** 2 + 2 * cumsum(R[1:] ** 2)] / Ncens) - acf.children = [PlotData(-2. * acf.sigma[lags], t), PlotData(2. * acf.sigma[lags], t)] - acf.plot_args_children = ['r:'] - acf.norm = norm - return acf - - def _specdata(self, L=None, tr=None, method='cov', detrend=detrend_mean, window=parzen, noverlap=0, pad_to=None): - """ - Obsolete: Delete? - Return power spectral density by Welches average periodogram method. - - Parameters - ---------- - NFFT : int, scalar - if len(data) < NFFT, it will be zero padded to `NFFT` - before estimation. Must be even; a power 2 is most efficient. - detrend : function - window : vector of length NFFT or function - To create window vectors see numpy.blackman, numpy.hamming, - numpy.bartlett, scipy.signal, scipy.signal.get_window etc. - noverlap : scalar int - gives the length of the overlap between segments. - - Returns - ------- - S : SpecData1D - Power Spectral Density - - Notes - ----- - The data vector is divided into NFFT length segments. Each segment - is detrended by function detrend and windowed by function window. - noverlap gives the length of the overlap between segments. The - absolute(fft(segment))**2 of each segment are averaged to compute Pxx, - with a scaling to correct for power loss due to windowing. - - Reference - --------- - Bendat & Piersol (1986) Random Data: Analysis and Measurement - Procedures, John Wiley & Sons - """ - dt = self.sampling_period() - #fs = 1. / (2 * dt) - yy = self.data.ravel() if tr is None else tr.dat2gauss(self.data.ravel()) - yy = detrend(yy) if hasattr(detrend, '__call__') else yy - - S, f = psd(yy, Fs=1. / dt, NFFT=L, detrend=detrend, window=window, - noverlap=noverlap, pad_to=pad_to, scale_by_freq=True) - fact = 2.0 * pi - w = fact * f - return _wafospec.SpecData1D(S / fact, w) - def tospecdata(self, L=None, tr=None, method='cov', detrend=detrend_mean, window=parzen, noverlap=0, ftype='w', alpha=None): - ''' - Estimate one-sided spectral density from data. - - Parameters - ---------- - L : scalar integer - maximum lag size of the window function. As L decreases the estimate - becomes smoother and Bw increases. If we want to resolve peaks in - S which is Bf (Hz or rad/sec) apart then Bw < Bf. If no value is given the - lag size is set to be the lag where the auto correlation is less than - 2 standard deviations. (maximum 300) - tr : transformation object - the transformation assuming that x is a sample of a transformed - Gaussian process. If g is None then x is a sample of a Gaussian process (Default) - method : string - defining estimation method. Options are - 'cov' : Frequency smoothing using a parzen window function - on the estimated autocovariance function. (default) - 'psd' : Welch's averaged periodogram method with no overlapping batches - detrend : function - defining detrending performed on the signal before estimation. - (default detrend_mean) - window : vector of length NFFT or function - To create window vectors see numpy.blackman, numpy.hamming, - numpy.bartlett, scipy.signal, scipy.signal.get_window etc. - noverlap : scalar int - gives the length of the overlap between segments. 
- ftype : character - defining frequency type: 'w' or 'f' (default 'w') - - Returns - --------- - spec : SpecData1D object - - - Example - ------- - x = load('sea.dat'); - S = dat2spec(x); - specplot(S) - - See also - -------- - dat2tr, dat2cov - - - References: - ----------- - Georg Lindgren and Holger Rootzen (1986) - "Stationara stokastiska processer", pp 173--176. - - Gareth Janacek and Louise Swift (1993) - "TIME SERIES forecasting, simulation, applications", - pp 75--76 and 261--268 - - Emanuel Parzen (1962), - "Stochastic Processes", HOLDEN-DAY, - pp 66--103 - ''' - - #% Initialize constants - #%~~~~~~~~~~~~~~~~~~~~~ - nugget = 1e-12 - rate = 2; #% interpolationrate for frequency - - wdef = 1; #% 1=parzen window 2=hanning window, 3= bartlett window - - dt = self.sampling_period() - #yy = self.data if tr is None else tr.dat2gauss(self.data) - yy = self.data.ravel() if tr is None else tr.dat2gauss(self.data.ravel()) - yy = detrend(yy) if hasattr(detrend, '__call__') else yy - n = len(yy) - L = min(L, n); - - max_L = min(300, n); #% maximum lag if L is undetermined - estimate_L = L is None - if estimate_L: - L = min(n - 2, int(4. / 3 * max_L + 0.5)) - - if method == 'cov' or estimate_L: - tsy = TimeSeries(yy, self.args) - R = tsy.tocovdata() - if estimate_L: - #finding where ACF is less than 2 st. deviations. - L = max_L + 2 - (np.abs(R.data[max_L::-1]) > 2 * R.sigma[max_L::-1]).argmax() # a better L value - if wdef == 1: # modify L so that hanning and Parzen give appr. the same result - L = min(int(4 * L / 3), n - 2) - print('The default L is set to %d' % L) - try: - win = window(2 * L - 1) - wname = window.__name__ - if wname == 'parzen': - v = int(3.71 * n / L) # degrees of freedom used in chi^2 distribution - Be = 2 * pi * 1.33 / (L * dt) # % bandwidth (rad/sec) - elif wname == 'hanning': - v = int(2.67 * n / L); # degrees of freedom used in chi^2 distribution - Be = 2 * pi / (L * dt); # % bandwidth (rad/sec) - elif wname == 'bartlett': - v = int(3 * n / L); # degrees of freedom used in chi^2 distribution - Be = 2 * pi * 1.33 / (L * dt); # bandwidth (rad/sec) - except: - wname = None - win = window - v = None - Be = None - - if method == 'psd': - nfft = 2 ** nextpow2(L) - pad_to = rate * nfft # Interpolate the spectrum with rate - S, f = psd(yy, Fs=1. / dt, NFFT=nfft, detrend=detrend, window=window(nfft), - noverlap=noverlap, pad_to=pad_to, scale_by_freq=True) - fact = 2.0 * pi - w = fact * f - spec = _wafospec.SpecData1D(S / fact, w) - else :# cov method - # add a nugget effect to ensure that round off errors - # do not result in negative spectral estimates - - R.data[:L] = R.data[:L] * win[L - 1::] - R.data[L] = 0.0 - R.data = R.data[:L + 1] - R.args = R.args[:L + 1] - #R.plot() - #R.show() - spec = R.tospecdata(rate=rate, nugget=nugget) - - spec.Bw = Be - if ftype == 'f': - spec.Bw = Be / (2 * pi) # bandwidth in Hz - - if alpha is not None : - #% Confidence interval constants - spec.CI = [v / _invchi2(1 - alpha / 2 , v), v / _invchi2(alpha / 2 , v)]; - - spec.tr = tr - spec.L = L - spec.norm = False - spec.note = 'method=%s' % method -# S = createspec('freq',ftype); -# S.tr = g; -# S.note = ['dat2spec(',inputname(1),'), Method = ' method ]; -# S.norm = 0; % not normalized -# S.L = L; -# S.S = zeros(nf+1,m-1); - return spec - - - - - def _trdata_cdf(self, **options): - ''' - Estimate transformation, g, from observed marginal CDF. - Assumption: a Gaussian process, Y, is related to the - non-Gaussian process, X, by Y = g(X). 
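The core idea behind _trdata_cdf can be sketched without the WAFO machinery: the empirical transformation is the inverse Gaussian CDF applied to the empirical marginal CDF, g_emp(x) = Phi^-1(F_n(x)). The smoothing spline, the ntr interpolation and the monotonicity check are omitted below, and the helper name and the plotting-position CDF are illustrative assumptions, not part of this patch.

    import numpy as np
    from scipy.stats import norm

    def empirical_transform(x, ne=7):
        # g_emp(x) = Phi^-1(F_n(x)) evaluated at the sorted observations;
        # a plotting-position estimate keeps F_n strictly inside (0, 1).
        xs = np.sort(np.asarray(x, dtype=float))
        n = len(xs)
        Fn = (np.arange(1, n + 1) - 0.5) / n
        keep = slice(ne, n - ne)   # drop the ne most extreme points at each end
        return xs[keep], norm.ppf(Fn[keep])

    # synthetic, mildly skewed sample (a stand-in for a transformed Gaussian record)
    x = np.random.RandomState(0).gamma(2.0, size=1000)
    u, g_emp = empirical_transform(x)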
- Parameters - ---------- - options = options structure defining how the smoothing is done. - (See troptset for default values) - Returns - ------- - tr, tr_emp = smoothed and empirical estimate of the transformation g. - - The empirical CDF is usually very irregular. More than one local - maximum of the empirical CDF may cause poor fit of the transformation. - In such case one should use a smaller value of GSM or set a larger - variance for GVAR. If X(t) is likely to cross levels higher than 5 - standard deviations then the vector param has to be modified. For - example if X(t) is unlikely to cross a level of 7 standard deviations - one can use param = [-7 7 513]. - - ''' - - mean = self.data.mean() - sigma = self.data.std() - cdf = edf(self.data.ravel()) - - opt = DotDict(chkder=True, plotflag=False, gsm=0.05, param=[-5, 5, 513], - delay=2, linextrap=True, ntr=1000, ne=7, gvar=1) - opt.update(options) - Ne = opt.ne - nd = len(cdf.data) - if nd > opt.ntr and opt.ntr > 0: - x0 = linspace(cdf.args[Ne], cdf.args[nd - 1 - Ne], opt.ntr) - cdf.data = interp(x0, cdf.args, cdf.data) - cdf.args = x0 - Ne = 0 - uu = linspace(*opt.param) - - ncr = len(cdf.data); - ng = len(np.atleast_1d(opt.gvar)) - if ng == 1: - gvar = opt.gvar * ones(ncr) - else: - opt.gvar = np.atleast_1d(opt.gvar) - gvar = interp(linspace(0, 1, ncr), linspace(0, 1, ng), opt.gvar.ravel()) - - - ind = np.flatnonzero(diff(cdf.args) > 0) # remove equal points - nd = len(ind) - ind1 = ind[Ne:nd - Ne] - tmp = invnorm(cdf.data[ind]) - - x = sigma * uu + mean - pp_tr = SmoothSpline(cdf.args[ind1], tmp[Ne:nd - Ne], p=opt.gsm, lin_extrap=opt.linextrap, var=gvar[ind1]) - #g(:,2) = smooth(Fx(ind1,1),tmp(Ne+1:end-Ne),opt.gsm,g(:,1),def,gvar); - tr = TrData(pp_tr(x) , x, mean=mean, sigma=sigma) - tr_emp = TrData(tmp, cdf.args[ind], mean=mean, sigma=sigma) - tr_emp.setplotter('step') - - if opt.chkder: - for ix in xrange(5): - dy = diff(tr.data) - if (dy <= 0).any(): - dy[dy > 0] = floatinfo.eps - gvar = -(np.hstack((dy, 0)) + np.hstack((0, dy))) / 2 + floatinfo.eps - pp_tr = SmoothSpline(cdf.args[ind1], tmp[Ne:nd - Ne], p=1, lin_extrap=opt.linextrap, var=ix * gvar) - tr = TrData(pp_tr(x) , x, mean=mean, sigma=sigma) - else: - break - else: - msg = '''The empirical distribution is not sufficiently smoothed. - The estimated transfer function, g, is not - a strictly increasing function.''' - warnings.warn(msg) - - if opt.plotflag > 0: - tr.plot() - tr_emp.plot() - return tr, tr_emp - - def trdata(self, method='nonlinear', **options): - ''' - Estimate transformation, g, from data. - - Parameters - ---------- - method : string - 'nonlinear' : transform based on smoothed crossing intensity (default) - 'mnonlinear': transform based on smoothed marginal distribution - 'hermite' : transform based on cubic Hermite polynomial - 'ochi' : transform based on exponential function - 'linear' : identity. - - options : keyword with the following fields: - csm,gsm - defines the smoothing of the logarithm of crossing intensity - and the transformation g, respectively. Valid values must - be 0<=csm,gsm<=1. (default csm=0.9, gsm=0.05) - Smaller values gives smoother functions. - param - vector which defines the region of variation of the data x. - (default see lc2tr). - plotflag - 0 no plotting (Default) - 1 plots empirical and smoothed g(u) and the theoretical for - a Gaussian model. 
- 2 monitor the development of the estimation - linextrap - 0 use a regular smoothing spline - 1 use a smoothing spline with a constraint on the ends to - ensure linear extrapolation outside the range of the data. - (default) - gvar - Variances for the empirical transformation, g. (default 1) - ne - Number of extremes (maxima & minima) to remove from the - estimation of the transformation. This makes the - estimation more robust against outliers. (default 7) - ntr - Maximum length of empirical crossing intensity or CDF. - The empirical crossing intensity or CDF is interpolated - linearly before smoothing if their lengths exceeds Ntr. - A reasonable NTR will significantly speed up the - estimation for long time series without loosing any - accuracy. NTR should be chosen greater than - PARAM(3). (default 1000) - - Returns - ------- - tr, tr_emp : TrData objects - with the smoothed and empirical transformation, respectively. - - - TRDATA estimates the transformation in a transformed Gaussian model. - Assumption: a Gaussian process, Y, is related to the - non-Gaussian process, X, by Y = g(X). - - The empirical crossing intensity is usually very irregular. - More than one local maximum of the empirical crossing intensity - may cause poor fit of the transformation. In such case one - should use a smaller value of CSM. In order to check the effect - of smoothing it is recomended to also plot g and g2 in the same plot or - plot the smoothed g against an interpolated version of g (when CSM=GSM=1). - If x is likely to cross levels higher than 5 standard deviations - then the vector param has to be modified. For example if x is - unlikely to cross a level of 7 standard deviations one can use - PARAM=[-7 7 513]. - - Example - ------- - >>> import wafo.spectrum.models as sm - >>> import wafo.transform.models as tm - >>> from wafo.objects import mat2timeseries - >>> Hs = 7.0 - >>> Sj = sm.Jonswap(Hm0=Hs) - >>> S = Sj.tospecdata() #Make spectrum object from numerical values - >>> S.tr = tm.TrOchi(mean=0, skew=0.16, kurt=0, sigma=Hs/4, ysigma=Hs/4) - >>> xs = S.sim(ns=2**16, iseed=10) - >>> ts = mat2timeseries(xs) - >>> g0, g0emp = ts.trdata(monitor=True) # Monitor the development - >>> g1, g1emp = ts.trdata(method='m', gvar=0.5 ) # Equal weight on all points - >>> g2, g2emp = ts.trdata(method='n', gvar=[3.5, 0.5, 3.5]) # Less weight on the ends - >>> int(S.tr.dist2gauss()*100) - 141 - >>> int(g0emp.dist2gauss()*100) - 217949 - >>> int(g0.dist2gauss()*100) - 93 - >>> int(g1.dist2gauss()*100) - 66 - >>> int(g2.dist2gauss()*100) - 84 - - See also - -------- - LevelCrossings.trdata - wafo.transform.models - - References - ---------- - Rychlik, I. , Johannesson, P and Leadbetter, M. R. (1997) - "Modelling and statistical analysis of ocean wavedata using - transformed Gaussian process." - Marine structures, Design, Construction and Safety, Vol. 10, No. 1, pp 13--47 - - - Brodtkorb, P, Myrhaug, D, and Rue, H (1999) - "Joint distribution of wave height and crest velocity from - reconstructed data" - in Proceedings of 9th ISOPE Conference, Vol III, pp 66-73 - ''' - - - #opt = troptset('plotflag','off','csm',.95,'gsm',.05,.... - # 'param',[-5 5 513],'delay',2,'linextrap','on','ne',7,... 
- # 'cvar',1,'gvar',1,'multip',0); - - opt = DotDict(chkder=True, plotflag=False, csm=.95, gsm=.05, - param=[-5, 5, 513], delay=2, ntr=1000, linextrap=True, ne=7, cvar=1, gvar=1, - multip=False, crossdef='uM') - opt.update(**options) - - ma = self.data.mean() - sa = self.data.std() - - if method.startswith('lin'): - return TrLinear(mean=ma, sigma=sa) - - if method[0] == 'n': - tp = self.turning_points() - mM = tp.cycle_pairs() - lc = mM.level_crossings(opt.crossdef) - return lc.trdata(mean=ma, sigma=sa, **opt) - elif method[0] == 'm': - return self._trdata_cdf(**opt) - elif method[0] == 'h': - ga1 = skew(self.data) - ga2 = kurtosis(self.data, fisher=True) #kurt(xx(n+1:end))-3; - up = min(4 * (4 * ga1 / 3) ** 2, 13) - lo = (ga1 ** 2) * 3 / 2; - kurt1 = min(up, max(ga2, lo)) + 3 - return TrHermite(mean=ma, var=sa ** 2, skew=ga1, kurt=kurt1) - elif method[0] == 'o': - ga1 = skew(self.data) - return TrOchi(mean=ma, var=sa ** 2, skew=ga1) - - def turning_points(self, h=0.0, wavetype=None): - ''' - Return turning points (tp) from data, optionally rainflowfiltered. - - Parameters - ---------- - h : scalar - a threshold - if h<=0, then tp is a sequence of turning points (default) - if h>0, then all rainflow cycles with height smaller than - h are removed. - - wavetype : string - defines the type of wave. Possible options are - 'astm' 'mw' 'Mw' or 'none'. - If None all rainflow filtered min and max - will be returned, otherwise only the rainflow filtered - min and max, which define a wave according to the - wave definition, will be returned. - 'astm' forces to have the first data point of the load history as - the first turning point. To be used in combination with - TurningPoints.cycle_astm() - - Returns - ------- - tp : TurningPoints object - with times and turning points. - - Example: - >>> import wafo.data - >>> x = wafo.data.sea() - >>> x1 = x[:200,:] - >>> ts1 = mat2timeseries(x1) - >>> tp = ts1.turning_points(wavetype='Mw') - >>> tph = ts1.turning_points(h=0.3,wavetype='Mw') - >>> hs = ts1.plot() - >>> hp = tp.plot('ro') - >>> hph = tph.plot('k.') - - See also - --------- - findcross, - findrfc - findtp - ''' - ind = findtp(self.data, max(h, 0.0), wavetype) - try: - t = self.args[ind] - except: - t = ind - mean = self.data.mean() - sigma = self.data.std() - return TurningPoints(self.data[ind], t, mean=mean, sigma=sigma) - - def trough_crest(self, v=None, wavetype=None): - """ - Return trough and crest turning points - - Parameters - ----------- - v : scalar - reference level (default v = mean of x). - - wavetype : string - defines the type of wave. Possible options are - 'dw', 'uw', 'tw', 'cw' or None. - If None indices to all troughs and crests will be returned, - otherwise only the paired ones will be returned - according to the wavedefinition. - - Returns - -------- - tc : TurningPoints object - with trough and crest turningpoints - """ - ind = findtc(self.data, v, wavetype)[0] - try: - t = self.args[ind] - except: - t = ind - mean = self.data.mean() - sigma = self.data.std() - return TurningPoints(self.data[ind], t, mean=mean, sigma=sigma) - def wave_parameters(self, rate=1): - ''' - Returns several wave parameters from data. - - Parameters - ---------- - rate : scalar integer - interpolation rate. Interpolates with spline if greater than one. 
- - Returns - ------- - parameters : dict - wave parameters such as - Ac, At : Crest and trough amplitude, respectively - Tcf, Tcb : Crest front and crest (rear) back period, respectively - Hu, Hd : zero-up-crossing and zero-downcrossing wave height, respectively. - Tu, Td : zero-up-crossing and zero-downcrossing wave period, respectively. - - The definition of g, Ac,At, Tcf, etc. are given in gravity and - wafo.definitions. - - Example - ------- - >>> import wafo.data as wd - >>> import wafo.objects as wo - >>> x = wd.sea() - >>> ts = wo.mat2timeseries(x) - >>> wp = ts.wave_parameters() - >>> for name in ['Ac', 'At', 'Hu', 'Hd', 'Tu', 'Td', 'Tcf', 'Tcb']: - ... print('%s' % name, wp[name][:2]) - ('Ac', array([ 0.25950546, 0.34950546])) - ('At', array([ 0.16049454, 0.43049454])) - ('Hu', array([ 0.69, 0.86])) - ('Hd', array([ 0.42, 0.78])) - ('Tu', array([ 6.10295202, 3.36978685])) - ('Td', array([ 3.84377468, 6.35707656])) - ('Tcf', array([ 0.42656819, 0.57361617])) - ('Tcb', array([ 0.93355982, 1.04063638])) - - >>> import pylab as plt - >>> h = plt.plot(wp['Td'],wp['Hd'],'.') - >>> h = plt.xlabel('Td [s]') - >>> h = plt.ylabel('Hd [m]') - - - See also - -------- - wafo.definitions - ''' - dT = self.sampling_period() - if rate > 1: - dT = dT / rate - t0, tn = self.args[0], self.args[-1] - n = len(self.args) - ti = linspace(t0, tn, int(rate * n)) - xi = interp1d(self.args , self.data.ravel(), kind='cubic')(ti) - - else: - ti, xi = self.args, self.data.ravel() - - tc_ind, z_ind = findtc(xi, v=0, kind='tw') - tc_a = xi[tc_ind] - tc_t = ti[tc_ind] - Ac = tc_a[1::2] # crest amplitude - At = -tc_a[0::2] # trough amplitude - Hu = Ac + At[1:] - Hd = Ac + At[:-1] - tu = ecross(ti, xi, z_ind[1::2], v=0) - Tu = diff(tu)# Period zero-upcrossing waves - td = ecross(ti, xi , z_ind[::2], v=0) - Td = diff(td)# Period zero-downcrossing waves - Tcf = tc_t[1::2] - tu[:-1] - Tcf[(Tcf == 0)] = dT # avoiding division by zero - Tcb = td[1:] - tc_t[1::2] - Tcb[(Tcb == 0)] = dT; #% avoiding division by zero - return dict(Ac=Ac, At=At, Hu=Hu, Hd=Hd, Tu=Tu, Td=Td, Tcf=Tcf, Tcb=Tcb) - - def wave_height_steepness(self, method=1, rate=1, g=None): - ''' - Returns waveheights and steepnesses from data. - - Parameters - ---------- - rate : scalar integer - interpolation rate. Interpolates with spline if greater than one. - - method : scalar integer - 0 max(Vcf, Vcb) and corresponding wave height Hd or Hu in H - 1 crest front (rise) speed (Vcf) in S and wave height Hd in H. (default) - -1 crest back (fall) speed (Vcb) in S and waveheight Hu in H. - 2 crest front steepness in S and the wave height Hd in H. - -2 crest back steepness in S and the wave height Hu in H. - 3 total wave steepness in S and the wave height Hd in H - for zero-downcrossing waves. - -3 total wave steepness in S and the wave height Hu in H. - for zero-upcrossing waves. - Returns - ------- - S, H = Steepness and the corresponding wave height according to method - - - The parameters are calculated as follows: - Crest front speed (velocity) = Vcf = Ac/Tcf - Crest back speed (velocity) = Vcb = Ac/Tcb - Crest front steepness = 2*pi*Ac./Td/Tcf/g - Crest back steepness = 2*pi*Ac./Tu/Tcb/g - Total wave steepness (zero-downcrossing wave) = 2*pi*Hd./Td.^2/g - Total wave steepness (zero-upcrossing wave) = 2*pi*Hu./Tu.^2/g - - The definition of g, Ac,At, Tcf, etc. are given in gravity and - wafo.definitions. 
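For a quick feel of the steepness definitions listed above, the same formulas can be evaluated directly with NumPy. The numbers below are illustrative crest amplitudes, periods and heights, not values taken from sea.dat.

    import numpy as np

    Ac = np.array([0.26, 0.35])    # crest amplitudes [m] (illustrative)
    Tcf = np.array([0.43, 0.57])   # crest front periods [s]
    Td = np.array([3.84, 6.36])    # zero-downcrossing periods [s]
    Hd = np.array([0.69, 0.86])    # zero-downcrossing wave heights [m]
    g = 9.81

    Vcf = Ac / Tcf                          # crest front speed, method=1
    Scf = 2 * np.pi * Ac / (Td * Tcf * g)   # crest front steepness, method=2
    Std = 2 * np.pi * Hd / (Td ** 2 * g)    # total steepness (down-crossing), method=3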
- - Example - ------- - >>> import wafo.data as wd - >>> import wafo.objects as wo - >>> x = wd.sea() - >>> ts = wo.mat2timeseries(x) - >>> for i in xrange(-3,4): - ... S, H = ts.wave_height_steepness(method=i) - ... print(S[:2],H[:2]) - (array([ 0.01186982, 0.04852534]), array([ 0.69, 0.86])) - (array([ 0.02918363, 0.06385979]), array([ 0.69, 0.86])) - (array([ 0.27797411, 0.33585743]), array([ 0.69, 0.86])) - (array([ 0.60835634, 0.60930197]), array([ 0.42, 0.78])) - (array([ 0.60835634, 0.60930197]), array([ 0.42, 0.78])) - (array([ 0.10140867, 0.06141156]), array([ 0.42, 0.78])) - (array([ 0.01821413, 0.01236672]), array([ 0.42, 0.78])) - - >>> import pylab as plt - >>> h = plt.plot(S,H,'.') - >>> h = plt.xlabel('S') - >>> h = plt.ylabel('Hd [m]') - - - See also - -------- - wafo.definitions - ''' - - dT = self.sampling_period() - if g is None: - g = gravity() #% acceleration of gravity - - if rate > 1: - dT = dT / rate - t0, tn = self.args[0], self.args[-1] - n = len(self.args) - ti = linspace(t0, tn, int(rate * n)) - xi = interp1d(self.args , self.data.ravel(), kind='cubic')(ti) - - else: - ti, xi = self.args, self.data.ravel() - - tc_ind, z_ind = findtc(xi, v=0, kind='tw') - tc_a = xi[tc_ind] - tc_t = ti[tc_ind] - Ac = tc_a[1::2] # crest amplitude - At = -tc_a[0::2] # trough amplitude - - if (0 <= method and method <= 2): - # time between zero-upcrossing and crest [s] - tu = ecross(ti, xi, z_ind[1:-1:2], v=0) - Tcf = tc_t[1::2] - tu - Tcf[(Tcf == 0)] = dT # avoiding division by zero - if (0 >= method and method >= -2): - # time between crest and zero-downcrossing [s] - td = ecross(ti, xi, z_ind[2::2], v=0) - Tcb = td - tc_t[1::2] - Tcb[(Tcb == 0)] = dT; #% avoiding division by zero - - if method == 0: - # max(Vcf, Vcr) and the corresponding wave height Hd or Hu in H - Hu = Ac + At[1:] - Hd = Ac + At[:-1] - T = np.where(Tcf < Tcb, Tcf, Tcb) - S = Ac / T - H = np.where(Tcf < Tcb, Hd, Hu) - elif method == 1: # extracting crest front velocity [m/s] and - # Zero-downcrossing wave height [m] - H = Ac + At[:-1] # Hd - S = Ac / Tcf - elif method == -1: # extracting crest rear velocity [m/s] and - # Zero-upcrossing wave height [m] - H = Ac + At[1:] #Hu - S = Ac / Tcb - elif method == 2: #crest front steepness in S and the wave height Hd in H. - H = Ac + At[:-1] #Hd - Td = diff(ecross(ti, xi, z_ind[::2], v=0)) - S = 2 * pi * Ac / Td / Tcf / g - elif method == -2: # crest back steepness in S and the wave height Hu in H. - H = Ac + At[1:] - Tu = diff(ecross(ti, xi, z_ind[1::2], v=0)) - S = 2 * pi * Ac / Tu / Tcb / g - elif method == 3: # total steepness in S and the wave height Hd in H - # for zero-doewncrossing waves. - H = Ac + At[:-1] - Td = diff(ecross(ti, xi , z_ind[::2], v=0))# Period zero-downcrossing waves - S = 2 * pi * H / Td ** 2 / g - elif method == -3: # total steepness in S and the wave height Hu in H for - # zero-upcrossing waves. - H = Ac + At[1:] - Tu = diff(ecross(ti, xi, z_ind[1::2], v=0))# Period zero-upcrossing waves - S = 2 * pi * H / Tu ** 2 / g - - return S, H - def wave_periods(self, vh=None, pdef='d2d', wdef=None, index=None, rate=1): - """ - Return sequence of wave periods/lengths from data. - - Parameters - ---------- - vh : scalar - reference level ( default v=mean(x(:,2)) ) or - rainflow filtering height (default h=0) - pdef : string - defining type of waveperiod (wavelength) returned: - Level v separated 't2c', 'c2t', 't2t' or 'c2c' -waveperiod. - Level v 'd2d', 'u2u', 'd2u' or 'u2d' -waveperiod. 
-            Rain flow filtered (with height greater than h)
-            'm2M', 'M2m', 'm2m' or 'M2M' -waveperiod.
-            Explanation of the abbreviations:
-            M=Max, m=min, d=down-crossing, u=up-crossing,
-            t=trough and c=crest.
-            Thus 'd2d' means the period from a down-crossing to the
-            next down-crossing and 'u2c' means the period from an
-            up-crossing to the following crest.
-        wdef : string
-            defining type of wave. Possible options are
-            'mw', 'Mw', 'dw', 'uw', 'tw', 'cw' or None.
-            If wdef is None all troughs and crests will be used,
-            otherwise only the troughs and crests which define a
-            wave according to the wave definition are used.
-
-        index : vector
-            index sequence of one of the following:
-            - level v-crossings (indices to "du" are required to
-              calculate 'd2d', 'd2u', 'u2d' or 'u2u' waveperiods)
-            - level v separated trough and crest turning points
-              (indices to 'tc' are required to calculate
-              't2t', 't2c', 'c2t' or 'c2c' waveperiods)
-            - level v crossings and level v separated trough and
-              crest turning points (indices to "dutc" are
-              required to calculate 't2u', 'u2c', 'c2d' or 'd2t'
-              waveperiods)
-            - rainflow filtered turning points with minimum rfc height h
-              (indices to "mMtc" are required to calculate
-              'm2m', 'm2M', 'M2m' or 'M2M' waveperiods)
-
-        rate : scalar
-            interpolation rate. If rate is larger than one, then x is
-            interpolated before extracting T.
-
-        Returns
-        -------
-        T : vector
-            sequence of waveperiods (or wavelengths).
-        index : vector
-            of indices
-
-        Example
-        -------
-        Histogram of crest2crest waveperiods
-        >>> import wafo.data as wd
-        >>> import wafo.objects as wo
-        >>> import pylab as plb
-        >>> x = wd.sea()
-        >>> ts = wo.mat2timeseries(x[0:400,:])
-        >>> T, ix = ts.wave_periods(vh=0.0,pdef='c2c')
-        >>> h = plb.hist(T)
-
-        See also
-        --------
-        findtp,
-        findtc,
-        findcross, perioddef
-        """
-
-##% This is a more flexible version than the dat2hwa or tp2wa routines.
-##% There is a secret option: if pdef='all' the function returns
-##% all the waveperiods 'd2t', 't2u', 'u2c' and 'c2d' in sequence.
-##% It is up to the user to extract the right waveperiods.
-##% If the first is a down-crossing then the first is a 'd2t' waveperiod.
-##% If the first is an up-crossing then the first is a 'u2c' waveperiod.
-##% -##% Example: -##% [T ind]=dat2wa(x,0,'all') %returns all waveperiods -##% nn = length(T) -##% % want to extract all t2u waveperiods -##% if x(ind(1),2)>0 % if first is down-crossing -##% Tt2u=T(2:4:nn) -##% else % first is up-crossing -##% Tt2u=T(4:4:nn) -##% end - - if rate > 1: #% interpolate with spline - n = ceil(self.data.size * rate) - ti = linspace(self.args[0], self.args[-1], n) - x = stineman_interp(ti, self.args, self.data.ravel()) - else: - x = self.data - ti = self.args - - - if vh is None: - if pdef[0] in ('m', 'M'): - vh = 0 - print(' The minimum rfc height, h, is set to: %g' % vh) - else: - vh = x.mean() - print(' The level l is set to: %g' % vh) - - - if index is None: - if pdef in ('m2m', 'm2M', 'M2m', 'M2M'): - index = findtp(x, vh, wdef) - elif pdef in ('u2u', 'u2d', 'd2u', 'd2d'): - index = findcross(x, vh, wdef) - elif pdef in ('t2t', 't2c', 'c2t', 'c2c'): - index = findtc(x, vh, wdef)[0] - elif pdef in ('d2t', 't2u', 'u2c', 'c2d', 'all'): - index, v_ind = findtc(x, vh, wdef) - index = sort(r_[index, v_ind]) #% sorting crossings and tp in sequence - else: - raise ValueError('Unknown pdef option!') - - if (x[index[0]] > x[index[1]]): #% if first is down-crossing or max - if pdef in ('d2t', 'M2m', 'c2t', 'd2u' , 'M2M', 'c2c', 'd2d', 'all'): - start = 1 - elif pdef in ('t2u', 'm2M', 't2c', 'u2d' , 'm2m', 't2t', 'u2u'): - start = 2 - elif pdef in ('u2c'): - start = 3 - elif pdef in ('c2d'): - start = 4 - else: - raise ValueError('Unknown pdef option!') - # else first is up-crossing or min - elif pdef in ('all', 'u2c', 'm2M', 't2c', 'u2d', 'm2m', 't2t', 'u2u'): - start = 0 - elif pdef in ('c2d', 'M2m', 'c2t', 'd2u', 'M2M', 'c2c', 'd2d'): - start = 1 - elif pdef in ('d2t'): - start = 2 - elif pdef in ('t2u'): - start = 3 - else: - raise ValueError('Unknown pdef option!') - - # determine the steps between wanted periods - if pdef in ('d2t', 't2u', 'u2c', 'c2d'): - step = 4 - elif pdef in ('all'): - step = 1 #% secret option! - else: - step = 2 - - #% determine the distance between min2min, t2t etc.. - if pdef in ('m2m', 't2t', 'u2u', 'M2M', 'c2c', 'd2d'): - dist = 2 - else: - dist = 1 - - nn = len(index) - #% New call: (pab 28.06.2001) - if pdef[0] in ('u', 'd'): - t0 = ecross(ti, x, index[start:(nn - dist):step], vh) - else: # % min, Max, trough, crest or all crossings wanted - t0 = x[index[start:(nn - dist):step]] - - if pdef[2] in ('u', 'd'): - t1 = ecross(ti, x, index[(start + dist):nn:step], vh) - else: # % min, Max, trough, crest or all crossings wanted - t1 = x[index[(start + dist):nn:step]] - - T = t1 - t0 - return T, index - - def reconstruct(self): - # TODO: finish reconstruct - pass - def plot_wave(self, sym1='k.', ts=None, sym2='k+', nfig=None, nsub=None, - sigma=None, vfact=3): - ''' - Plots the surface elevation of timeseries. - - Parameters - ---------- - sym1, sym2 : string - plot symbol and color for data and ts, respectively - (see PLOT) (default 'k.' and 'k+') - ts : TimeSeries or TurningPoints object - to overplot data. default zero-separated troughs and crests. - nsub : scalar integer - Number of subplots in each figure. By default nsub is such that - there are about 20 mean down crossing waves in each subplot. - If nfig is not given and nsub is larger than 6 then nsub is - changed to nsub=min(6,ceil(nsub/nfig)) - nfig : scalar integer - Number of figures. By default nfig=ceil(Nsub/6). - sigma : real scalar - standard deviation of data. 
- vfact : real scalar - how large in stdev the vertical scale should be (default 3) - - - Example - ------- - Plot x1 with red lines and mark troughs and crests with blue circles. - >>> import wafo - >>> x = wafo.data.sea() - >>> ts150 = wafo.objects.mat2timeseries(x[:150,:]) - >>> h = ts150.plot_wave('r-', sym2='bo') - - See also - -------- - findtc, plot - ''' - - nw = 20 - tn = self.args - xn = self.data.ravel() - indmiss = isnan(xn) # indices to missing points - indg = where(1 - indmiss)[0] - if ts is None: - tc_ix = findtc(xn[indg], 0, 'tw')[0] - xn2 = xn[tc_ix] - tn2 = tn[tc_ix] - else: - xn2 = ts.data - tn2 = ts.args - - if sigma is None: - sigma = xn[indg].std() - - if nsub is None: - nsub = int(len(xn2) / (2 * nw)) + 1 # about Nw mdc waves in each plot - if nfig is None: - nfig = int(ceil(nsub / 6)) - nsub = min(6, int(ceil(nsub / nfig))) - - n = len(xn) - Ns = int(n / (nfig * nsub)) - ind = r_[0:Ns] - if all(xn >= 0): - vscale = [0, 2 * sigma * vfact] #@UnusedVariable - else: - vscale = array([-1, 1]) * vfact * sigma #@UnusedVariable - - - XlblTxt = 'Time [sec]' - dT = 1 - timespan = tn[ind[-1]] - tn[ind[0]] - if abs(timespan) > 18000: # more than 5 hours - dT = 1 / (60 * 60) - XlblTxt = 'Time (hours)' - elif abs(timespan) > 300:# more than 5 minutes - dT = 1 / 60 - XlblTxt = 'Time (minutes)' - - if np.max(abs(xn[indg])) > 5 * sigma: - XlblTxt = XlblTxt + ' (Spurious data since max > 5 std.)' - - plot = plotbackend.plot - subplot = plotbackend.subplot - figs = [] - for unused_iz in xrange(nfig): - figs.append(plotbackend.figure()) - plotbackend.title('Surface elevation from mean water level (MWL).') - for ix in xrange(nsub): - if nsub > 1: - subplot(nsub, 1, ix) - - h_scale = array([tn[ind[0]], tn[ind[-1]]]) - ind2 = where((h_scale[0] <= tn2) & (tn2 <= h_scale[1]))[0] - plot(tn[ind] * dT, xn[ind], sym1) - if len(ind2) > 0: - plot(tn2[ind2] * dT, xn2[ind2], sym2) - plot(h_scale * dT, [0, 0], 'k-') - #plotbackend.axis([h_scale*dT, v_scale]) - - for iy in [-2, 2]: - plot(h_scale * dT, iy * sigma * ones(2), ':') - - ind = ind + Ns - #end - plotbackend.xlabel(XlblTxt) - - return figs - - - def plot_sp_wave(self, wave_idx_, *args, **kwds): - """ - Plot specified wave(s) from timeseries - - Parameters - ---------- - wave_idx : integer vector - of indices to waves we want to plot, i.e., wave numbers. - tz_idx : integer vector - of indices to the beginning, middle and end of - defining wave, i.e. for zero-downcrossing waves, indices to - zerocrossings (default trough2trough wave) - - Examples - -------- - Plot waves nr. 6,7,8 and waves nr. 
12,13,...,17 - >>> import wafo - >>> x = wafo.data.sea() - >>> ts = wafo.objects.mat2timeseries(x[0:500,...]) - >>> h = ts.plot_sp_wave(np.r_[6:9,12:18]) - - See also - -------- - plot_wave, findtc - """ - wave_idx = atleast_1d(wave_idx_).flatten() - tz_idx = kwds.pop('tz_idx', None) - if tz_idx is None: - unused_tc_ind, tz_idx = findtc(self.data, 0, 'tw') # finding trough to trough waves - - dw = nonzero(abs(diff(wave_idx)) > 1)[0] - Nsub = dw.size + 1 - Nwp = zeros(Nsub, dtype=int) - if Nsub > 1: - dw = dw + 1 - Nwp[Nsub - 1] = wave_idx[-1] - wave_idx[dw[-1]] + 1 - wave_idx[dw[-1] + 1:] = -2 - for ix in range(Nsub - 2, 1, -2): - Nwp[ix] = wave_idx[dw[ix] - 1] - wave_idx[dw[ix - 1]] + 1 # # of waves pr subplot - wave_idx[dw[ix - 1] + 1:dw[ix]] = -2 - - Nwp[0] = wave_idx[dw[0] - 1] - wave_idx[0] + 1 - wave_idx[1:dw[0]] = -2 - wave_idx = wave_idx[wave_idx > -1] - else: - Nwp[0] = wave_idx[-1] - wave_idx[0] + 1 - #end - - Nsub = min(6, Nsub) - Nfig = int(ceil(Nsub / 6)) - Nsub = min(6, int(ceil(Nsub / Nfig))) - figs = [] - for unused_iy in range(Nfig): - figs.append(plotbackend.figure()) - for ix in range(Nsub): - plotbackend.subplot(Nsub, 1, mod(ix, Nsub) + 1) - ind = r_[tz_idx[2 * wave_idx[ix] - 1]:tz_idx[2 * wave_idx[ix] + 2 * Nwp[ix] - 1]] - ## indices to wave - plotbackend.plot(self.args[ind], self.data[ind], *args, **kwds) - plotbackend.hold('on') - xi = [self.args[ind[0]], self.args[ind[-1]]] - plotbackend.plot(xi, [0, 0]) - - if Nwp[ix] == 1: - plotbackend.ylabel('Wave %d' % wave_idx[ix]) - else: - plotbackend.ylabel('Wave %d - %d' % (wave_idx[ix], wave_idx[ix] + Nwp[ix] - 1)) - - plotbackend.xlabel('Time [sec]') - #wafostamp - return figs - -#def hyperbolic_ratio(a, b, sa, sb): -# ''' -# Return ratio of hyperbolic functions -# to allow extreme variations of arguments. -# -# Parameters -# ---------- -# a, b : array-like -# arguments vectors of the same size -# sa, sb : scalar integers -# defining the hyperbolic function used, i.e., f(x,1)=cosh(x), f(x,-1)=sinh(x) -# -# Returns -# ------- -# r : ndarray -# f(a,sa)/f(b,sb), ratio of hyperbolic functions of same -# size as a and b -# Examples -# -------- -# >>> x = [-2,0,2] -# >>> hyperbolic_ratio(x,1,1,1) # gives r=cosh(x)/cosh(1) -# array([ 2.438107 , 0.64805427, 2.438107 ]) -# >>> hyperbolic_ratio(x,1,1,-1) # gives r=cosh(x)/sinh(1) -# array([ 3.20132052, 0.85091813, 3.20132052]) -# >>> hyperbolic_ratio(x,1,-1,1) # gives r=sinh(x)/cosh(1) -# array([-2.35040239, 0. , 2.35040239]) -# >>> hyperbolic_ratio(x,1,-1,-1) # gives r=sinh(x)/sinh(1) -# array([-3.08616127, 0. 
, 3.08616127]) -# >>> hyperbolic_ratio(1,x,1,1) # gives r=cosh(1)/cosh(x) -# array([ 0.41015427, 1.54308063, 0.41015427]) -# >>> hyperbolic_ratio(1,x,1,-1) # gives r=cosh(1)/sinh(x) -# array([-0.42545906, inf, 0.42545906]) -# >>> hyperbolic_ratio(1,x,-1,1) # gives r=sinh(1)/cosh(x) -# array([ 0.3123711 , 1.17520119, 0.3123711 ]) -# >>> hyperbolic_ratio(1,x,-1,-1) # gives r=sinh(1)/sinh(x) -# array([-0.32402714, inf, 0.32402714]) -# -# See also -# -------- -# tran -# ''' -# ak, bk, sak, sbk = np.atleast_1d(a, b, sign(sa), sign(sb)) -# # old call -# #return exp(ak-bk)*(1+sak*exp(-2*ak))/(1+sbk*exp(-2*bk)) -# # TODO: Does not always handle division by zero correctly -# -# signRatio = np.where(sak * ak < 0, sak, 1) -# signRatio = np.where(sbk * bk < 0, sbk * signRatio, signRatio) -# -# bk = np.abs(bk) -# ak = np.abs(ak) -# -# num = np.where(sak < 0, expm1(-2 * ak), 1 + exp(-2 * ak)) -# den = np.where(sbk < 0, expm1(-2 * bk), 1 + exp(-2 * bk)) -# iden = np.ones(den.shape) * inf -# ind = np.flatnonzero(den != 0) -# iden.flat[ind] = 1.0 / den[ind] -# val = np.where(num == den, 1, num * iden) -# return signRatio * exp(ak - bk) * val #((sak+exp(-2*ak))/(sbk+exp(-2*bk))) -# -#def sensor_typeid(*sensortypes): -# ''' Return ID for sensortype name -# -# Parameter -# --------- -# sensortypes : list of strings defining the sensortype -# -# Returns -# ------- -# sensorids : list of integers defining the sensortype -# -# Valid senor-ids and -types for time series are as follows: -# 0, 'n' : Surface elevation (n=Eta) -# 1, 'n_t' : Vertical surface velocity -# 2, 'n_tt' : Vertical surface acceleration -# 3, 'n_x' : Surface slope in x-direction -# 4, 'n_y' : Surface slope in y-direction -# 5, 'n_xx' : Surface curvature in x-direction -# 6, 'n_yy' : Surface curvature in y-direction -# 7, 'n_xy' : Surface curvature in xy-direction -# 8, 'P' : Pressure fluctuation about static MWL pressure -# 9, 'U' : Water particle velocity in x-direction -# 10, 'V' : Water particle velocity in y-direction -# 11, 'W' : Water particle velocity in z-direction -# 12, 'U_t' : Water particle acceleration in x-direction -# 13, 'V_t' : Water particle acceleration in y-direction -# 14, 'W_t' : Water particle acceleration in z-direction -# 15, 'X_p' : Water particle displacement in x-direction from its mean position -# 16, 'Y_p' : Water particle displacement in y-direction from its mean position -# 17, 'Z_p' : Water particle displacement in z-direction from its mean position -# -# Example: -# >>> sensor_typeid('W','v') -# [11, 10] -# >>> sensor_typeid('rubbish') -# [nan] -# -# See also -# -------- -# sensor_type -# ''' -# -# sensorid_table = dict(n=0, n_t=1, n_tt=2, n_x=3, n_y=4, n_xx=5, -# n_yy=6, n_xy=7, p=8, u=9, v=10, w=11, u_t=12, -# v_t=13, w_t=14, x_p=15, y_p=16, z_p=17) -# try: -# return [sensorid_table.get(name.lower(), nan) for name in sensortypes] -# except: -# raise ValueError('Input must be a string!') -# -# -# -#def sensor_type(*sensorids): -# ''' -# Return sensortype name -# -# Parameter -# --------- -# sensorids : vector or list of integers defining the sensortype -# -# Returns -# ------- -# sensornames : tuple of strings defining the sensortype -# Valid senor-ids and -types for time series are as follows: -# 0, 'n' : Surface elevation (n=Eta) -# 1, 'n_t' : Vertical surface velocity -# 2, 'n_tt' : Vertical surface acceleration -# 3, 'n_x' : Surface slope in x-direction -# 4, 'n_y' : Surface slope in y-direction -# 5, 'n_xx' : Surface curvature in x-direction -# 6, 'n_yy' : Surface curvature in y-direction -# 7, 'n_xy' : 
Surface curvature in xy-direction -# 8, 'P' : Pressure fluctuation about static MWL pressure -# 9, 'U' : Water particle velocity in x-direction -# 10, 'V' : Water particle velocity in y-direction -# 11, 'W' : Water particle velocity in z-direction -# 12, 'U_t' : Water particle acceleration in x-direction -# 13, 'V_t' : Water particle acceleration in y-direction -# 14, 'W_t' : Water particle acceleration in z-direction -# 15, 'X_p' : Water particle displacement in x-direction from its mean position -# 16, 'Y_p' : Water particle displacement in y-direction from its mean position -# 17, 'Z_p' : Water particle displacement in z-direction from its mean position -# -# Example: -# >>> sensor_type(range(3)) -# ('n', 'n_t', 'n_tt') -# -# See also -# -------- -# sensor_typeid, tran -# ''' -# valid_names = ('n', 'n_t', 'n_tt', 'n_x', 'n_y', 'n_xx', 'n_yy', 'n_xy', -# 'p', 'u', 'v', 'w', 'u_t', 'v_t', 'w_t', 'x_p', 'y_p', 'z_p', -# nan) -# ids = atleast_1d(*sensorids) -# if isinstance(ids, list): -# ids = hstack(ids) -# n = len(valid_names) - 1 -# ids = where(((ids < 0) | (n < ids)), n , ids) -# return tuple(valid_names[i] for i in ids) -# -#class TransferFunction(object): -# ''' -# Class for computing transfer functions based on linear wave theory -# of the system with input surface elevation, -# eta(x0,y0,t) = exp(i*(kx*x0+ky*y0-w*t)), -# and output Y determined by sensortype and position of sensor. -# -# Member methods -# -------------- -# tran(w, theta, kw) -# -# Hw = a function of frequency only (not direction) size 1 x Nf -# Gwt = a function of frequency and direction size Nt x Nf -# w = vector of angular frequencies in Rad/sec. Length Nf -# theta = vector of directions in radians Length Nt (default 0) -# ( theta = 0 -> positive x axis theta = pi/2 -> positive y axis) -# Member variables -# ---------------- -# pos : [x,y,z] -# vector giving coordinate position relative to [x0 y0 z0] (default [0,0,0]) -# sensortype = string -# defining the sensortype or transfer function in output. 
-# 0, 'n' : Surface elevation (n=Eta) (default) -# 1, 'n_t' : Vertical surface velocity -# 2, 'n_tt' : Vertical surface acceleration -# 3, 'n_x' : Surface slope in x-direction -# 4, 'n_y' : Surface slope in y-direction -# 5, 'n_xx' : Surface curvature in x-direction -# 6, 'n_yy' : Surface curvature in y-direction -# 7, 'n_xy' : Surface curvature in xy-direction -# 8, 'P' : Pressure fluctuation about static MWL pressure -# 9, 'U' : Water particle velocity in x-direction -# 10, 'V' : Water particle velocity in y-direction -# 11, 'W' : Water particle velocity in z-direction -# 12, 'U_t' : Water particle acceleration in x-direction -# 13, 'V_t' : Water particle acceleration in y-direction -# 14, 'W_t' : Water particle acceleration in z-direction -# 15, 'X_p' : Water particle displacement in x-direction from its mean position -# 16, 'Y_p' : Water particle displacement in y-direction from its mean position -# 17, 'Z_p' : Water particle displacement in z-direction from its mean position -# h : real scalar -# water depth (default inf) -# g : real scalar -# acceleration of gravity (default 9.81 m/s**2) -# rho : real scalar -# water density (default 1028 kg/m**3) -# bet : 1 or -1 -# 1, theta given in terms of directions toward which waves travel (default) -# -1, theta given in terms of directions from which waves come -# igam : 1,2 or 3 -# 1, if z is measured positive upward from mean water level (default) -# 2, if z is measured positive downward from mean water level -# 3, if z is measured positive upward from sea floor -# thetax, thetay : real scalars -# angle in degrees clockwise from true north to positive x-axis and -# positive y-axis, respectively. (default theatx=90, thetay=0) -# -# Example -# ------- -# >>> import pylab as plt -# >>> N=50; f0=0.1; th0=0; h=50; w0 = 2*pi*f0 -# >>> t = np.linspace(0,15,N) -# >>> eta0 = np.exp(-1j*w0*t) -# >>> stypes = ['n', 'n_x', 'n_y']; -# >>> tf = TransferFunction(pos=(0, 0, 0), h=50) -# >>> vals = [] -# >>> fh = plt.plot(t, eta0.real, 'r.') -# >>> plt.hold(True) -# >>> for i,stype in enumerate(stypes): -# ... tf.sensortype = stype -# ... Hw, Gwt = tf.tran(w0,th0) -# ... vals.append((Hw*Gwt*eta0).real.ravel()) -# ... vals[i] -# ... fh = plt.plot(t, vals[i]) -# >>> plt.show() -# -# -# See also -# -------- -# dat2dspec, sensor_type, sensor_typeid -# -# Reference -# --------- -# Young I.R. 
(1994) -# "On the measurement of directional spectra", -# Applied Ocean Research, Vol 16, pp 283-294 -# ''' -# def __init__(self, pos=(0, 0, 0), sensortype='n', h=inf, g=9.81, rho=1028, -# bet=1, igam=1, thetax=90, thetay=0): -# self.pos = pos -# self.sensortype = sensortype if isinstance(sensortype, str) else sensor_type(sensortype) -# self.h = h -# self.g = g -# self.rho = rho -# self.bet = bet -# self.igam = igam -# self.thetax = thetax -# self.thetay = thetay -# self._tran_dict = dict(n=self._n, n_t=self._n_t, n_tt=self._n_tt, -# n_x=self._n_x, n_y=self._n_y, n_xx=self._n_xx, -# n_yy=self._n_yy, n_xy=self._n_xy, -# P=self._p, p=self._p, -# U=self._u, u=self._u, -# V=self._v, v=self._v, -# W=self._w, w=self._w, -# U_t=self._u_t, u_t=self._u_t, -# V_t=self._v_t, v_t=self._v_t, -# W_t=self._w_t, w_t=self._w_t, -# X_p=self._x_p, x_p=self._x_p, -# Y_p=self._y_p, y_p=self._y_p, -# Z_p=self._z_p, z_p=self._z_p) -# -# def tran(self, w, theta=0, kw=None): -# ''' -# Return transfer functions based on linear wave theory -# of the system with input surface elevation, -# eta(x0,y0,t) = exp(i*(kx*x0+ky*y0-w*t)), -# and output, -# Y = Hw*Gwt*eta, determined by sensortype and position of sensor. -# -# Parameters -# ---------- -# w : array-like -# vector of angular frequencies in Rad/sec. Length Nf -# theta : array-like -# vector of directions in radians Length Nt (default 0) -# ( theta = 0 -> positive x axis theta = pi/2 -> positive y axis) -# kw : array-like -# vector of wave numbers corresponding to angular frequencies, w. Length Nf -# (default calculated with w2k) -# -# Returns -# ------- -# Hw = transfer function of frequency only (not direction) size 1 x Nf -# Gwt = transfer function of frequency and direction size Nt x Nf -# -# The complete transfer function Hwt = Hw*Gwt is a function of -# w (columns) and theta (rows) size Nt x Nf -# ''' -# if kw is None: -# kw, unusedkw2 = w2k(w, 0, self.h) #wave number as function of angular frequency -# -# w, theta, kw = np.atleast_1d(w, theta, kw) -# # make sure they have the correct orientation -# theta.shape = (-1, 1) -# kw.shape = (-1,) -# w.shape = (-1,) -# -# tran_fun = self._tran_dict[self.sensortype] -# Hw, Gwt = tran_fun(w, theta, kw) -# -# # New call to avoid singularities. pab 07.11.2000 -# # Set Hw to 0 for expressions w*hyperbolic_ratio(z*k,h*k,1,-1)= 0*inf -# ind = np.flatnonzero(1 - np.isfinite(Hw)) -# Hw.flat[ind] = 0 -# -# sgn = np.sign(Hw); -# k0 = np.flatnonzero(sgn < 0) -# if len(k0): # make sure Hw>=0 ie. transfer negative signs to Gwt -# Gwt[:, k0] = -Gwt[:, k0] -# Hw[:, k0] = -Hw[:, k0] -# -# if self.igam == 2: -# #pab 09 Oct.2002: bug fix -# # Changing igam by 2 should affect the directional result in the same way that changing eta by -eta! 
-# Gwt = -Gwt -# return Hw, Gwt -# __call__ = tran -##---Private member methods -# def _get_ee_cthxy(self, theta, kw): -# # convert from angle in degrees to radians -# bet = self.bet -# thxr = self.thetax * pi / 180 -# thyr = self.thetay * pi / 180 -# -# cthx = bet * cos(theta - thxr + pi / 2) -# #cthy = cos(theta-thyr-pi/2) -# cthy = bet * sin(theta - thyr) -# -# # Compute location complex exponential -# x, y, unused_z = list(self.pos) -# ee = exp((1j * (x * cthx + y * cthy)) * kw) # exp(i*k(w)*(x*cos(theta)+y*sin(theta)) size Nt X Nf -# return ee, cthx, cthy -# -# def _get_zk(self, kw): -# h = self.h -# z = self.pos[2] -# if self.igam == 1: -# zk = kw * (h + z) # z measured positive upward from mean water level (default) -# elif self.igam == 2: -# zk = kw * (h - z) # z measured positive downward from mean water level -# else: -# zk = kw * z # z measured positive upward from sea floor -# return zk -# -# #--- Surface elevation --- -# def _n(self, w, theta, kw): -# '''n = Eta = wave profile -# ''' -# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# return np.ones_like(w), ee -# -# #---- Vertical surface velocity and acceleration----- -# def _n_t(self, w, theta, kw): -# ''' n_t = Eta_t ''' -# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# return w, -1j * ee; -# def _n_tt(self, w, theta, kw): -# '''n_tt = Eta_tt''' -# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# return w ** 2, -ee -# -# #--- Surface slopes --- -# def _n_x(self, w, theta, kw): -# ''' n_x = Eta_x = x-slope''' -# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# return kw, 1j * cthx * ee -# def _n_y(self, w, theta, kw): -# ''' n_y = Eta_y = y-slope''' -# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) -# return kw, 1j * cthy * ee -# -# #--- Surface curvatures --- -# def _n_xx(self, w, theta, kw): -# ''' n_xx = Eta_xx = Surface curvature (x-dir)''' -# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# return kw ** 2, -(cthx ** 2) * ee -# def _n_yy(self, w, theta, kw): -# ''' n_yy = Eta_yy = Surface curvature (y-dir)''' -# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) -# return kw ** 2, -cthy ** 2 * ee -# def _n_xy(self, w, theta, kw): -# ''' n_xy = Eta_xy = Surface curvature (xy-dir)''' -# ee, cthx, cthy = self._get_ee_cthxy(theta, kw) -# return kw ** 2, -cthx * cthy * ee -# -# #--- Pressure--- -# def _p(self, w, theta, kw): -# ''' pressure fluctuations''' -# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return self.rho * self.g * hyperbolic_ratio(zk, hk, 1, 1), ee #hyperbolic_ratio = cosh(zk)/cosh(hk) -# -# #---- Water particle velocities --- -# def _u(self, w, theta, kw): -# ''' U = x-velocity''' -# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return w * hyperbolic_ratio(zk, hk, 1, -1), cthx * ee# w*cosh(zk)/sinh(hk), cos(theta)*ee -# def _v(self, w, theta, kw): -# '''V = y-velocity''' -# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return w * hyperbolic_ratio(zk, hk, 1, -1), cthy * ee # w*cosh(zk)/sinh(hk), sin(theta)*ee -# def _w(self, w, theta, kw): -# ''' W = z-velocity''' -# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return w * hyperbolic_ratio(zk, hk, -1, -1), -1j * ee # w*sinh(zk)/sinh(hk), -? 
-# -# #---- Water particle acceleration --- -# def _u_t(self, w, theta, kw): -# ''' U_t = x-acceleration''' -# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return (w ** 2) * hyperbolic_ratio(zk, hk, 1, -1), -1j * cthx * ee # w^2*cosh(zk)/sinh(hk), ? -# -# def _v_t(self, w, theta, kw): -# ''' V_t = y-acceleration''' -# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return (w ** 2) * hyperbolic_ratio(zk, hk, 1, -1), -1j * cthy * ee # w^2*cosh(zk)/sinh(hk), ? -# def _w_t(self, w, theta, kw): -# ''' W_t = z-acceleration''' -# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return (w ** 2) * hyperbolic_ratio(zk, hk, -1, -1), -ee # w*sinh(zk)/sinh(hk), ? -# -# #---- Water particle displacement --- -# def _x_p(self, w, theta, kw): -# ''' X_p = x-displacement''' -# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return hyperbolic_ratio(zk, hk, 1, -1), 1j * cthx * ee # cosh(zk)./sinh(hk), ? -# def _y_p(self, w, theta, kw): -# ''' Y_p = y-displacement''' -# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return hyperbolic_ratio(zk, hk, 1, -1), 1j * cthy * ee # cosh(zk)./sinh(hk), ? -# def _z_p(self, w, theta, kw): -# ''' Z_p = z-displacement''' -# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) -# hk = kw * self.h -# zk = self._get_zk(kw) -# return hyperbolic_ratio(zk, hk, -1, -1), ee # sinh(zk)./sinh(hk), ee -# -#def wave_pressure(z, Hm0, h=10000, g=9.81, rho=1028): -# ''' -# Calculate pressure amplitude due to water waves. -# -# Parameters -# ---------- -# z : array-like -# depth where pressure is calculated [m] -# Hm0 : array-like -# significant wave height (same as the average of the 1/3'rd highest -# waves in a seastate. [m] -# h : real scalar -# waterdepth (default 10000 [m]) -# g : real scalar -# acceleration of gravity (default 9.81 m/s**2) -# rho : real scalar -# water density (default 1028 kg/m**3) -# -# -# Returns -# ------- -# p : ndarray -# pressure amplitude due to water waves at water depth z. [Pa] -# -# PRESSURE calculate pressure amplitude due to water waves according to -# linear theory. 
-# -# Example -# ----- -# >>> import pylab as plt -# >>> z = -np.linspace(10,20) -# >>> fh = plt.plot(z, wave_pressure(z, Hm0=1, h=20)) -# >>> plt.show() -# -# See also -# -------- -# w2k -# -# -# u = psweep.Fn*sqrt(mgf.length*9.81) -# z = -10; h = inf; -# Hm0 = 1.5;Tp = 4*sqrt(Hm0); -# S = jonswap([],[Hm0,Tp]); -# Hw = tran(S.w,0,[0 0 -z],'P',h) -# Sm = S; -# Sm.S = Hw.'.*S.S; -# x1 = spec2sdat(Sm,1000); -# pwave = pressure(z,Hm0,h) -# -# plot(psweep.x{1}/u, psweep.f) -# hold on -# plot(x1(1:100,1)-30,x1(1:100,2),'r') -# ''' -# -# -# # Assume seastate with jonswap spectrum: -# -# Tp = 4 * np.sqrt(Hm0) -# gam = jonswap_peakfact(Hm0, Tp) -# Tm02 = Tp / (1.30301 - 0.01698 * gam + 0.12102 / gam) -# w = 2 * np.pi / Tm02 -# kw, unused_kw2 = w2k(w, 0, h) -# -# hk = kw * h -# zk1 = kw * z -# zk = hk + zk1 # z measured positive upward from mean water level (default) -# #zk = hk-zk1; % z measured positive downward from mean water level -# #zk1 = -zk1; -# #zk = zk1; % z measured positive upward from sea floor -# -# # cosh(zk)/cosh(hk) approx exp(zk) for large h -# # hyperbolic_ratio(zk,hk,1,1) = cosh(zk)/cosh(hk) -# # pr = np.where(np.pi < hk, np.exp(zk1), hyperbolic_ratio(zk, hk, 1, 1)) -# pr = hyperbolic_ratio(zk, hk, 1, 1) -# pressure = (rho * g * Hm0 / 2) * pr -# -## pos = [np.zeros_like(z),np.zeros_like(z),z] -## tf = TransferFunction(pos=pos, sensortype='p', h=h, rho=rho, g=g) -## Hw, Gwt = tf.tran(w,0) -## pressure2 = np.abs(Hw) * Hm0 / 2 -# -# return pressure - -def main(): - import wafo - ts = wafo.objects.mat2timeseries(wafo.data.sea()) - tp = ts.turning_points() - mm = tp.cycle_pairs() - lc = mm.level_crossings() - lc.plot() - T = ts.wave_periods(vh=0.0, pdef='c2c') #@UnusedVariable - - - - #main() - import wafo.spectrum.models as sm - Sj = sm.Jonswap() - S = Sj.tospecdata() - - R = S.tocovdata() - x = R.sim(ns=1000, dt=0.2) #@UnusedVariable - S.characteristic(['hm0', 'tm02']) - ns = 1000 - dt = .2 - x1 = S.sim(ns, dt=dt) - - ts = TimeSeries(x1[:, 1], x1[:, 0]) - tp = ts.turning_points(0.0) - - x = np.arange(-2, 2, 0.2) - - # Plot 2 objects in one call - d2 = PlotData(np.sin(x), x, xlab='x', ylab='sin', title='sinus') - - - d0 = d2.copy() - d0.data = d0.data * 0.9 - d1 = d2.copy() - d1.data = d1.data * 1.2 - d1.children = [d0] - d2.children = [d1] - - d2.plot() - print 'Done' - -def test_docstrings(): - import doctest - doctest.testmod() - -if __name__ == '__main__': - test_docstrings() -# test_levelcrossings_extrapolate() -# if True: #False : # - -# import doctest -# doctest.testmod() -# else: -# main() - + + +# Name: module1 +# Purpose: +# +# Author: pab +# +# Created: 16.09.2008 +# Copyright: (c) pab 2008 +# Licence: + +#!/usr/bin/env python + + +from __future__ import division +from wafo.transform.core import TrData +from wafo.transform.models import TrHermite, TrOchi, TrLinear +from wafo.stats import edf, distributions +from wafo.misc import (nextpow2, findtp, findrfc, findtc, findcross, + ecross, JITImport, DotDict, gravity, findrfc_astm) +from wafodata import PlotData +from wafo.interpolate import SmoothSpline +from scipy.interpolate.interpolate import interp1d +from scipy.integrate.quadrature import cumtrapz # @UnresolvedImport +from scipy.special import ndtr as cdfnorm, ndtri as invnorm + +import warnings +import numpy as np + +from numpy import (inf, pi, zeros, ones, sqrt, where, log, exp, cos, sin, arcsin, mod, interp, # @UnresolvedImport + #@UnresolvedImport + linspace, arange, sort, all, abs, vstack, hstack, atleast_1d, sign, expm1, + finfo, polyfit, r_, nonzero, cumsum, 
ravel, size, isnan, nan, ceil, diff, array) # @UnresolvedImport +from numpy.fft import fft +from numpy.random import randn +from scipy.integrate import trapz +from wafo.interpolate import stineman_interp +from matplotlib.mlab import psd, detrend_mean +import scipy.signal + + +from plotbackend import plotbackend +import matplotlib +from scipy.stats.stats import skew, kurtosis +from scipy.signal.windows import parzen +from scipy import special + + +floatinfo = finfo(float) +matplotlib.interactive(True) +_wafocov = JITImport('wafo.covariance') +_wafospec = JITImport('wafo.spectrum') + +__all__ = ['TimeSeries', 'LevelCrossings', 'CyclePairs', 'TurningPoints', + 'sensortypeid', 'sensortype'] + + +def _invchi2(q, df): + return special.chdtri(df, q) + + +class LevelCrossings(PlotData): + + ''' + Container class for Level crossing data objects in WAFO + + Member variables + ---------------- + data : array-like + number of upcrossings or upcrossingintensity + args : array-like + crossing levels + + Examples + -------- + >>> import wafo.data + >>> import wafo.objects as wo + >>> x = wafo.data.sea() + >>> ts = wo.mat2timeseries(x) + + >>> tp = ts.turning_points() + >>> mm = tp.cycle_pairs() + + >>> lc = mm.level_crossings() + >>> h2 = lc.plot() + ''' + + def __init__(self, *args, **kwds): + options = dict(title='Level crossing spectrum', + xlab='Levels', ylab='Count', + plotmethod='semilogy', + plot_args=['b'], + plot_args_children=['r--']) + options.update(**kwds) + super(LevelCrossings, self).__init__(*args, **options) + self.intensity = kwds.get('intensity', False) + self.sigma = kwds.get('sigma', None) + self.mean = kwds.get('mean', None) + # self.setplotter(plotmethod='step') + + icmax = self.data.argmax() + if self.data != None: + if self.sigma is None or self.mean is None: + logcros = where(self.data == 0.0, inf, -log(self.data)) + logcmin = logcros[icmax] + logcros = sqrt(2 * abs(logcros - logcmin)) + logcros[0:icmax + 1] = 2 * logcros[ + icmax] - logcros[0:icmax + 1] + ncr = 10 + #least square fit + p = polyfit(self.args[ncr:-ncr], logcros[ncr:-ncr], 1) + if self.sigma is None: + # estimated standard deviation of x + self.sigma = 1.0 / p[0] + if self.mean is None: + self.mean = -p[1] / p[0] # self.args[icmax] + cmax = self.data[icmax] + x = (self.args - self.mean) / self.sigma + y = cmax * exp(-x ** 2 / 2.0) + self.children = [PlotData(y, self.args)] + + def extrapolate(self, u_min=None, u_max=None, method='ml', dist='genpar', plotflag=0): + ''' + Returns an extrapolated level crossing spectrum + + Parameters + ----------- + u_min, u_max : real scalars + extrapolate below u_min and above u_max. + method : string + describing the method of estimation. Options are: + 'ml' : Maximum Likelihood method (default) + 'mps': Maximum Product Spacing method + dist : string + defining distribution function. Options are: + genpareto : Generalized Pareto distribution (GPD) + expon : Exponential distribution (GPD with k=0) + rayleigh : truncated Rayleigh distribution + plotflag : scalar integer + 1: Diagnostic plots. (default) + 0: Don't plot diagnostic plots. + + Returns + ------- + lc : LevelCrossing object + with the estimated level crossing spectrum + Est = Estimated parameters. [struct array] + + Extrapolates the level crossing spectrum (LC) for high and for low levels. + The tails of the LC is fitted to a survival function of a GPD. + H(x) = (1-k*x/s)^(1/k) (GPD) + The use of GPD is motivated by POT methods in extreme value theory. 
+ For k=0 the GPD is the exponential distribution + H(x) = exp(-x/s), k=0 (expon) + The tails with the survival function of a truncated Rayleigh distribution. + H(x) = exp(-((x+x0).^2-x0^2)/s^2) (rayleigh) + where x0 is the distance from the truncation level to where the LC has its maximum. + The method 'gpd' uses the GPD. We recommend the use of 'gpd,ml'. + The method 'exp' uses the Exp. + The method 'ray' uses Ray, and should be used if the load is a Gaussian process. + + Example + ------- + >>> import wafo.data + >>> import wafo.objects as wo + >>> x = wafo.data.sea() + >>> ts = wo.mat2timeseries(x) + + >>> tp = ts.turning_points() + >>> mm = tp.cycle_pairs() + >>> lc = mm.level_crossings() + + >>> s = x[:,1].std() + >>> lc_gpd = lc.extrapolate(-2*s, 2*s) + >>> lc_exp = lc.extrapolate(-2*s, 2*s, dist='expon') + >>> lc_ray = lc.extrapolate(-2*s, 2*s, dist='rayleigh') + + lc.plot() + lc_gpd.plot() + lc_exp.plot() + lc_ray.plot() + + + See also + -------- + cmat2extralc, rfmextrapolate, lc2rfmextreme, extralc, fitgenpar + + References + ---------- + Johannesson, P., and Thomas, J-.J. (2000): + Extrapolation of Rainflow Matrices. + Preprint 2000:82, Mathematical statistics, Chalmers, pp. 18. + ''' + + i_max = self.data.argmax() + c_max = self.data[i_max] + # Maximum of lc + lc_max = self.args[i_max] + + if u_min is None or u_max is None: + fraction = sqrt(c_max) + i = np.flatnonzero(self.data > fraction) + if u_min is None: + u_min = self.args[i.min()] + if u_max is None: + u_max = self.args[i.max()] + lcf, lcx = self.data, self.args + # Extrapolate LC for high levels + [lc_High, phat_high] = self._extrapolate( + lcx, lcf, u_max, u_max - lc_max, method, dist) +# +# Extrapolate LC for low levels + [lcEst1, phat_low] = self._extrapolate( + -lcx[::-1], lcf[::-1], -u_min, lc_max - u_min, method, dist) + lc_Low = lcEst1[::-1, :] # [-lcEst1[::-1, 0], lcEst1[::-1, 1::]] + lc_Low[:, 0] *= -1 +# Est.Low = Est1; +# + if plotflag: + plotbackend.semilogx(lcf, lcx, + lc_High[:, 1], lc_High[:, 0], + lc_Low[:, 1], lc_Low[:, 0]) + i_mask = (u_min < lcx) & (lcx < u_max) + f = np.hstack((lc_Low[:, 1], lcf[i_mask], lc_High[:, 1])) + x = np.hstack((lc_Low[:, 0], lcx[i_mask], lc_High[:, 0])) + lc_out = LevelCrossings(f, x, sigma=self.sigma, mean=self.mean) + lc_out.phat_high = phat_high + lc_out.phat_low = phat_low + return lc_out + # + + def _extrapolate(self, lcx, lcf, u, offset, method, dist): + # Extrapolate the level crossing spectra for high levels + + method = method.lower() + dist = dist.lower() + + # Excedences over level u + Iu = lcx > u + lcx1, lcf1 = lcx[Iu], lcf[Iu] + lcf2, lcx2 = self._make_increasing(lcf1[::-1], lcx1[::-1]) + + nim1 = 0 + x = [] + for xk, ni in zip(lcx2.tolist(), lcf2.tolist()): + x.append(ones(ni - nim1) * xk) + nim1 = ni + + x = np.hstack(x) - u + + df = 0.01 + xF = np.arange(0.0, 4 + df / 2, df) + lcu = np.interp(u, lcx, lcf) + 1 + # Estimate tail + if dist.startswith('gen'): + genpareto = distributions.genpareto + phat = genpareto.fit2(x, floc=0, method=method) + SF = phat.sf(xF) + + covar = phat.par_cov[::2, ::2] + # Calculate 90 # confidence region, an ellipse, for (k,s) + D, B = np.linalg.eig(covar) + b = phat.par[::2] + if b[0] > 0: + phat.upperlimit = u + b[1] / b[0] + + r = sqrt(-2 * log(1 - 90 / 100)) # 90 # confidence sphere + Nc = 16 + 1 + ang = linspace(0, 2 * pi, Nc) + # 90# Circle + c0 = np.vstack( + (r * sqrt(D[0]) * sin(ang), r * sqrt(D[1]) * cos(ang))) + # plot(c0(1,:),c0(2,:)) + + #* ones((1, len(c0))) # Transform to ellipse for (k,s) + c1 = np.dot(B, c0) + 
b[:, None] + # plot(c1(1,:),c1(2,:)), hold on + + # Calculate conf.int for lcu + # Assumtion: lcu is Poisson distributed + # Poissin distr. approximated by normal when calculating conf. int. + dXX = 1.64 * sqrt(lcu) # 90 # quantile for lcu + + lcEstCu = zeros((len(xF), Nc)) + lcEstCl = zeros((len(xF), Nc)) + for i in range(Nc): + k = c1[0, i] + s = c1[1, i] + SF2 = genpareto.sf(xF, k, scale=s) + lcEstCu[:, i] = (lcu + dXX) * (SF2) + lcEstCl[:, i] = (lcu - dXX) * (SF2) + # end + + lcEst = np.vstack((xF + u, lcu * (SF), + lcEstCl.min(axis=1), lcEstCu.max(axis=1))).T + elif dist.startswith('exp'): + expon = distributions.expon + phat = expon.fit2(x, floc=0, method=method) + SF = phat.sf(xF) + lcEst = np.vstack((xF + u, lcu * (SF))).T + + elif dist.startswith('ray') or dist.startswith('trun'): + phat = distributions.truncrayleigh.fit2(x, floc=0, method=method) + SF = phat.sf(xF) +# if False: +# n = len(x) +# Sx = sum((x + offset) ** 2 - offset ** 2) +# s = sqrt(Sx / n); # Shape parameter +# F = -np.expm1(-((xF + offset) ** 2 - offset ** 2) / s ** 2) + lcEst = np.vstack((xF + u, lcu * (SF))).T + else: + raise ValueError() + + return lcEst, phat + # End extrapolate + + def _make_increasing(self, f, t=None): + # Makes the signal f strictly increasing. + + n = len(f) + if t is None: + t = np.arange(n) + ff = [f[0], ] + tt = [t[0], ] + + for i in xrange(1, n): + if f[i] > ff[-1]: + ff.append(f[i]) + tt.append(t[i]) + + return np.asarray(ff), np.asarray(tt) + + def sim(self, ns, alpha): + """ + Simulates process with given irregularity factor and crossing spectrum + + Parameters + ---------- + ns : scalar, integer + number of sample points. + alpha : real scalar + irregularity factor, 0>> import wafo.spectrum.models as sm + >>> from wafo.objects import mat2timeseries + >>> Sj = sm.Jonswap(Hm0=7) + >>> S = Sj.tospecdata() #Make spectrum object from numerical values + >>> alpha = S.characteristic('alpha')[0] + >>> n = 10000 + >>> xs = S.sim(ns=n) + >>> ts = mat2timeseries(xs) + >>> tp = ts.turning_points() + >>> mm = tp.cycle_pairs() + >>> lc = mm.level_crossings() + + >>> xs2 = lc.sim(n,alpha) + >>> ts2 = mat2timeseries(xs2) + >>> Se = ts2.tospecdata(L=324) + + >>> alpha2 = Se.characteristic('alpha')[0] + >>> np.round(alpha2*10) + array([ 7.]) + >>> np.abs(alpha-alpha2)<0.03 + array([ True], dtype=bool) + + >>> h0 = S.plot('b') + >>> h1 = Se.plot('r') + + >>> lc2 = ts2.turning_points().cycle_pairs().level_crossings() + + >>> import pylab as plt + >>> h = plt.subplot(211) + >>> h2 = lc2.plot() + >>> h = plt.subplot(212) + >>> h0 = lc.plot() + + """ + + # TODO: add a good example + f = linspace(0, 0.49999, 1000) + rho_st = 2. * sin(f * pi) ** 2 - 1. + tmp = alpha * arcsin(sqrt((1. + rho_st) / 2)) + tmp = sin(tmp) ** 2 + a2 = (tmp - rho_st) / (1 - tmp) + y = vstack((a2 + rho_st, 1 - a2)).min(axis=0) + maxidx = y.argmax() + #[maximum,maxidx]=max(y) + + rho_st = rho_st[maxidx] + a2 = a2[maxidx] + a1 = 2. * rho_st + a2 - 1. + r0 = 1. + r1 = -a1 / (1. 
+ a2) + r2 = (a1 ** 2 - a2 - a2 ** 2) / (1 + a2) + sigma2 = r0 + a1 * r1 + a2 * r2 + #randn = np.random.randn + e = randn(ns) * sqrt(sigma2) + e[:2] = 0.0 + L0 = randn(1) + L0 = hstack((L0, r1 * L0 + sqrt(1 - r2 ** 2) * randn(1))) + #%Simulate the process, starting in L0 + lfilter = scipy.signal.lfilter + z0 = lfilter([1, a1, a2], ones(1), L0) + L, unused_zf = lfilter(ones(1), [1, a1, a2], e, axis=0, zi=z0) + + epsilon = 1.01 + min_L = min(L) + max_L = max(L) + maxi = max(abs(r_[min_L, max_L])) * epsilon + mini = -maxi + + u = linspace(mini, maxi, 101) + G = cdfnorm(u) # (1 + erf(u / sqrt(2))) / 2 + G = G * (1 - G) + + x = linspace(0, r1, 100) + factor1 = 1. / sqrt(1 - x ** 2) + factor2 = 1. / (1 + x) + integral = zeros(u.shape, dtype=float) + for i in range(len(integral)): + y = factor1 * exp(-u[i] * u[i] * factor2) + integral[i] = trapz(y, x) + # end + G = G - integral / (2 * pi) + G = G / max(G) + + Z = ((u >= 0) * 2 - 1) * sqrt(-2 * log(G)) + + sumcr = trapz(self.data, self.args) + lc = self.data / sumcr + lc1 = self.args + mcr = trapz(lc1 * lc, lc1) if self.mean is None else self.mean + if self.sigma is None: + scr = trapz(lc1 ** 2 * lc, lc1) + scr = sqrt(scr - mcr ** 2) + else: + scr = self.sigma + lc2 = LevelCrossings(lc, lc1, mean=mcr, sigma=scr, intensity=True) + + g = lc2.trdata()[0] + + f = g.gauss2dat(Z) + G = TrData(f, u) + + process = G.dat2gauss(L) + return np.vstack((arange(len(process)), process)).T + +# +# +# %Check the result without reference to getrfc: +## LCe = dat2lc(process) +# max(lc(:,2)) +# max(LCe(:,2)) +# +# clf +# plot(lc(:,1),lc(:,2)/max(lc(:,2))) +# hold on +# plot(LCe(:,1),LCe(:,2)/max(LCe(:,2)),'-.') +## title('Relative crossing intensity') +# +# %% Plot made by the function funplot_4, JE 970707 +# %param = [min(process(:,2)) max(process(:,2)) 100] +# %plot(lc(:,1),lc(:,2)/max(lc(:,2))) +# %hold on +# %plot(levels(param),mu/max(mu),'--') +# %hold off +# %title('Crossing intensity') +# %watstamp +# +# % Temporarily +# %funplot_4(lc,param,mu) + def trdata(self, mean=None, sigma=None, **options): + ''' + Estimate transformation, g, from observed crossing intensity, version2. + + Assumption: a Gaussian process, Y, is related to the + non-Gaussian process, X, by Y = g(X). + + Parameters + ---------- + mean, sigma : real scalars + mean and standard deviation of the process + **options : + csm, gsm : real scalars + defines the smoothing of the crossing intensity and the transformation g. + Valid values must be 0<=csm,gsm<=1. (default csm = 0.9 gsm=0.05) + Smaller values gives smoother functions. + param : + vector which defines the region of variation of the data X. + (default [-5, 5, 513]). + monitor : bool + if true monitor development of estimation + linextrap : bool + if true use a smoothing spline with a constraint on the ends to + ensure linear extrapolation outside the range of the data. (default) + otherwise use a regular smoothing spline + cvar, gvar : real scalars + Variances for the crossing intensity and the empirical transformation, g. (default 1) + ne : scalar integer + Number of extremes (maxima & minima) to remove from the estimation + of the transformation. This makes the estimation more robust against + outliers. (default 7) + ntr : scalar integer + Maximum length of empirical crossing intensity. The empirical + crossing intensity is interpolated linearly before smoothing if the + length exceeds ntr. A reasonable NTR (eg. 1000) will significantly + speed up the estimation for long time series without loosing any accuracy. 
+ NTR should be chosen greater than PARAM(3). (default inf) + + Returns + ------- + gs, ge : TrData objects + smoothed and empirical estimate of the transformation g. + + + Notes + ----- + The empirical crossing intensity is usually very irregular. + More than one local maximum of the empirical crossing intensity + may cause poor fit of the transformation. In such case one + should use a smaller value of GSM or set a larger variance for GVAR. + If X(t) is likely to cross levels higher than 5 standard deviations + then the vector param has to be modified. For example if X(t) is + unlikely to cross a level of 7 standard deviations one can use + param = [-7 7 513]. + + Example + ------- + >>> import wafo.spectrum.models as sm + >>> import wafo.transform.models as tm + >>> from wafo.objects import mat2timeseries + >>> Hs = 7.0 + >>> Sj = sm.Jonswap(Hm0=Hs) + >>> S = Sj.tospecdata() #Make spectrum object from numerical values + >>> S.tr = tm.TrOchi(mean=0, skew=0.16, kurt=0, sigma=Hs/4, ysigma=Hs/4) + >>> xs = S.sim(ns=2**16, iseed=10) + >>> ts = mat2timeseries(xs) + >>> tp = ts.turning_points() + >>> mm = tp.cycle_pairs() + >>> lc = mm.level_crossings() + >>> g0, g0emp = lc.trdata(monitor=True) # Monitor the development + >>> g1, g1emp = lc.trdata(gvar=0.5 ) # Equal weight on all points + >>> g2, g2emp = lc.trdata(gvar=[3.5, 0.5, 3.5]) # Less weight on the ends + >>> int(S.tr.dist2gauss()*100) + 141 + >>> int(g0emp.dist2gauss()*100) + 380995 + >>> int(g0.dist2gauss()*100) + 143 + >>> int(g1.dist2gauss()*100) + 162 + >>> int(g2.dist2gauss()*100) + 120 + + g0.plot() # Check the fit. + + See also + troptset, dat2tr, trplot, findcross, smooth + + NB! the transformated data will be N(0,1) + + Reference + --------- + Rychlik , I., Johannesson, P., and Leadbetter, M.R. (1997) + "Modelling and statistical analysis of ocean wavedata + using a transformed Gaussian process", + Marine structures, Design, Construction and Safety, + Vol 10, pp 13--47 + ''' + + if mean is None: + mean = self.mean + if sigma is None: + sigma = self.sigma + + opt = DotDict(chkder=True, plotflag=False, csm=0.9, gsm=.05, + param=(-5, 5, 513), delay=2, linextrap=True, ntr=10000, ne=7, gvar=1) + opt.update(options) + param = opt.param + Ne = opt.ne + + ncr = len(self.data) + if ncr > opt.ntr and opt.ntr > 0: + x0 = linspace(self.args[Ne], self.args[-1 - Ne], opt.ntr) + lc1, lc2 = x0, interp(x0, self.args, self.data) + Ne = 0 + Ner = opt.ne + ncr = opt.ntr + else: + Ner = 0 + lc1, lc2 = self.args, self.data + ng = len(atleast_1d(opt.gvar)) + if ng == 1: + gvar = opt.gvar * ones(ncr) + else: + gvar = interp1d(linspace(0, 1, ng), opt.gvar, kind='linear')( + linspace(0, 1, ncr)) + + uu = linspace(*param) + g1 = sigma * uu + mean + + if Ner > 0: # Compute correction factors + cor1 = trapz(lc2[0:Ner + 1], lc1[0:Ner + 1]) + cor2 = trapz(lc2[-Ner - 1::], lc1[-Ner - 1::]) + else: + cor1 = 0 + cor2 = 0 + + lc22 = hstack((0, cumtrapz(lc2, lc1) + cor1)) + + if self.intensity: + lc22 = (lc22 + 0.5 / ncr) / (lc22[-1] + cor2 + 1. / ncr) + else: + lc22 = (lc22 + 0.5) / (lc22[-1] + cor2 + 1) + + lc11 = (lc1 - mean) / sigma + + lc22 = invnorm(lc22) # - ymean + + g2 = TrData(lc22.copy(), lc1.copy(), mean=mean, sigma=sigma) + g2.setplotter('step') + # NB! the smooth function does not always extrapolate well outside the edges + # causing poor estimate of g + # We may alleviate this problem by: forcing the extrapolation + # to be linear outside the edges or choosing a lower value for csm2. 
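+        # (This is what opt.linextrap below is for: it is passed on to
+        #  SmoothSpline so that the fitted spline is constrained to
+        #  extrapolate linearly outside the range of the data.)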
+ + inds = slice(Ne, ncr - Ne) # indices to points we are smoothing over + slc22 = SmoothSpline( + lc11[inds], lc22[inds], opt.gsm, opt.linextrap, gvar[inds])(uu) + + g = TrData(slc22.copy(), g1.copy(), mean=mean, sigma=sigma) + + if opt.chkder: + for ix in range(5): + dy = diff(g.data) + if any(dy <= 0): + warnings.warn( + ''' The empirical crossing spectrum is not sufficiently smoothed. + The estimated transfer function, g, is not a strictly increasing function. + ''') + eps = finfo(float).eps + dy[dy > 0] = eps + gvar = -(hstack((dy, 0)) + hstack((0, dy))) / 2 + eps + g.data = SmoothSpline( + g.args, g.data, 1, opt.linextrap, ix * gvar)(g.args) + else: + break + + if opt.plotflag > 0: + g.plot() + g2.plot() + + return g, g2 + + +def test_levelcrossings_extrapolate(): + import wafo.data + #import wafo.objects as wo + x = wafo.data.sea() + ts = mat2timeseries(x) + + tp = ts.turning_points() + mm = tp.cycle_pairs() + lc = mm.level_crossings() + + s = x[:, 1].std() + lc_gpd = lc.extrapolate(-2 * s, 2 * s, dist='rayleigh') # @UnusedVariable + + +class CyclePairs(PlotData): + + ''' + Container class for Cycle Pairs data objects in WAFO + + Member variables + ---------------- + data : array_like + args : vector for 1D + + Examples + -------- + >>> import wafo.data + >>> import wafo.objects as wo + >>> x = wafo.data.sea() + >>> ts = wo.mat2timeseries(x) + + >>> tp = ts.turning_points() + >>> mm = tp.cycle_pairs() + >>> h1 = mm.plot(marker='x') + ''' + + def __init__(self, *args, **kwds): + self.kind = kwds.pop('kind', 'min2max') + self.sigma = kwds.pop('sigma', None) + self.mean = kwds.pop('mean', None) + self.time = kwds.pop('time', 1) + + options = dict(title=self.kind + ' cycle pairs', + xlab='min', ylab='max', + plot_args=['b.']) + options.update(**kwds) + super(CyclePairs, self).__init__(*args, **options) + + def amplitudes(self): + return (self.data - self.args) / 2. + + def damage(self, beta, K=1): + """ + Calculates the total Palmgren-Miner damage of cycle pairs. + + Parameters + ---------- + beta : array-like, size m + Beta-values, material parameter. + K : scalar, optional + K-value, material parameter. + + Returns + ------- + D : ndarray, size m + Damage. + + Notes + ----- + The damage is calculated according to + D[i] = sum ( K * a**beta[i] ), with a = (max-min)/2 + + Examples + -------- + >>> import wafo + >>> from matplotlib import pyplot as plt + >>> ts = wafo.objects.mat2timeseries(wafo.data.sea()) + >>> tp = ts.turning_points() + >>> mm = tp.cycle_pairs() + >>> h = mm.plot(marker='.') + >>> bv = range(3,9) + >>> D = mm.damage(beta=bv) + >>> D + array([ 138.5238799 , 117.56050788, 108.99265423, 107.86681126, + 112.3791076 , 122.08375071]) + >>> h = plt.plot(bv,D,'x-') + + See also + -------- + SurvivalCycleCount + """ + amp = abs(self.amplitudes()) + return atleast_1d([K * np.sum(amp ** betai) for betai in beta]) + + def level_crossings(self, kind='uM', intensity=False): + """ Return level crossing spectrum from a cycle count. + + Parameters + ---------- + kind : int or string + defining crossing type, options are + 0,'u' : only upcrossings. + 1,'uM' : upcrossings and maxima (default). + 2,'umM': upcrossings, minima, and maxima. + 3,'um' : upcrossings and minima. + intensity : bool + True if level crossing intensity spectrum + False if level crossing count spectrum + Return + ------ + lc : level crossing object + with levels and number of upcrossings. + + + Calculates the number of upcrossings from a cycle pairs, e.g. + min2Max cycles or rainflow cycles. 
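+
+        The underlying idea: a cycle pair with minimum m and maximum M
+        contributes one upcrossing to every level u with m < u <= M. A
+        brute-force count for a single level (a simplified sketch, not the
+        algorithm used below):
+
+        >>> import numpy as np
+        >>> m_, M_ = np.array([-1.0, -0.5]), np.array([1.0, 0.0])
+        >>> int(np.sum((m_ < 0.25) & (0.25 <= M_)))
+        1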
+ + Example: + -------- + >>> import wafo + >>> ts = wafo.objects.mat2timeseries(wafo.data.sea()) + >>> tp = ts.turning_points() + >>> mm = tp.cycle_pairs() + >>> h = mm.plot(marker='.') + >>> lc = mm.level_crossings() + >>> h2 = lc.plot() + + See also + -------- + TurningPoints + LevelCrossings + """ + + if isinstance(kind, str): + t = dict(u=0, uM=1, umM=2, um=3) + defnr = t.get(kind, 1) + else: + defnr = kind + + if ((defnr < 0) or (defnr > 3)): + raise ValueError('kind must be one of (1,2,3,4).') + + index, = nonzero(self.args <= self.data) + if index.size == 0: + index, = nonzero(self.args >= self.data) + M = self.args[index] + m = self.data[index] + else: + m = self.args[index] + M = self.data[index] + +# if isempty(index) +# error('Error in input cc.') +# end + ncc = len(m) + + minima = vstack((m, ones(ncc), zeros(ncc), ones(ncc))) + maxima = vstack((M, -ones(ncc), ones(ncc), zeros(ncc))) + + extremes = hstack((maxima, minima)) + index = extremes[0].argsort() + extremes = extremes[:, index] + + ii = 0 + n = extremes.shape[1] + extr = zeros((4, n)) + extr[:, 0] = extremes[:, 0] + for i in xrange(1, n): + if extremes[0, i] == extr[0, ii]: + extr[1:4, ii] = extr[1:4, ii] + extremes[1:4, i] + else: + ii += 1 + extr[:, ii] = extremes[:, i] + + #[xx nx]=max(extr(:,1)) + nx = extr[0].argmax() + 1 + levels = extr[0, 0:nx] + if defnr == 2: # This are upcrossings + maxima + dcount = cumsum(extr[1, 0:nx]) + extr[2, 0:nx] - extr[3, 0:nx] + elif defnr == 4: # This are upcrossings + minima + dcount = cumsum(extr[1, 0:nx]) + dcount[nx - 1] = dcount[nx - 2] + elif defnr == 1: # This are only upcrossings + dcount = cumsum(extr[1, 0:nx]) - extr[3, 0:nx] + elif defnr == 3: # This are upcrossings + minima + maxima + dcount = cumsum(extr[1, 0:nx]) + extr[2, 0:nx] + ylab = 'Count' + if intensity: + dcount = dcount / self.time + ylab = 'Intensity [count/sec]' + return LevelCrossings(dcount, levels, mean=self.mean, sigma=self.sigma, ylab=ylab, intensity=intensity) + + +class TurningPoints(PlotData): + + ''' + Container class for Turning Points data objects in WAFO + + Member variables + ---------------- + data : array_like + args : vector for 1D + + Examples + -------- + >>> import wafo.data + >>> import wafo.objects as wo + >>> x = wafo.data.sea() + >>> ts = wo.mat2timeseries(x) + + >>> tp = ts.turning_points() + >>> h1 = tp.plot(marker='x') + ''' + + def __init__(self, *args, **kwds): + self.name_ = kwds.pop('name', 'WAFO TurningPoints Object') + self.sigma = kwds.pop('sigma', None) + self.mean = kwds.pop('mean', None) + + options = dict(title='Turning points') + # plot_args=['b.']) + options.update(**kwds) + super(TurningPoints, self).__init__(*args, **options) + + if not any(self.args): + n = len(self.data) + self.args = range(0, n) + else: + self.args = ravel(self.args) + self.data = ravel(self.data) + + def rainflow_filter(self, h=0.0, method='clib'): + ''' + Return rainflow filtered turning points (tp). + + Parameters + ---------- + h : scalar + a threshold + if h<=0, then tp is a sequence of turning points (default) + if h>0, then all rainflow cycles with height smaller than + h are removed. + + Returns + ------- + tp : TurningPoints object + with times and turning points. 
+ + Example: + >>> import wafo.data + >>> x = wafo.data.sea() + >>> x1 = x[:200,:] + >>> ts1 = mat2timeseries(x1) + >>> tp = ts1.turning_points(wavetype='Mw') + >>> tph = tp.rainflow_filter(h=0.3) + >>> hs = ts1.plot() + >>> hp = tp.plot('ro') + >>> hph = tph.plot('k.') + + See also + --------- + findcross, + findrfc + findtp + ''' + ind = findrfc(self.data, max(h, 0.0), method) + try: + t = self.args[ind] + except: + t = ind + mean = self.mean + sigma = self.sigma + return TurningPoints(self.data[ind], t, mean=mean, sigma=sigma) + + def cycle_pairs(self, h=0, kind='min2max', method='clib'): + """ Return min2Max or Max2min cycle pairs from turning points + + Parameters + ---------- + kind : string + type of cycles to return options are 'min2max' or 'max2min' + method : string + specify which library to use + 'clib' for wafo's c_library + 'None' for wafo's Python functions + + Return + ------ + mm : cycles object + with min2Max or Max2min cycle pairs. + + Example + ------- + >>> import wafo + >>> x = wafo.data.sea() + >>> ts = wafo.objects.mat2timeseries(x) + >>> tp = ts.turning_points() + >>> mM = tp.cycle_pairs() + >>> h = mM.plot(marker='x') + + + See also + -------- + TurningPoints + SurvivalCycleCount + """ + + if h > 0: + ind = findrfc(self.data, h, method=method) + data = self.data[ind] + else: + data = self.data + if data[0] > data[1]: + im = 1 + iM = 0 + else: + im = 0 + iM = 1 + + # Extract min-max and max-min cycle pairs + #n = len(self.data) + if kind.lower().startswith('min2max'): + m = data[im:-1:2] + M = data[im + 1::2] + else: + kind = 'max2min' + M = data[iM:-1:2] + m = data[iM + 1::2] + + time = self.args[-1] - self.args[0] + + return CyclePairs(M, m, kind=kind, mean=self.mean, sigma=self.sigma, + time=time) + + def cycle_astm(self): + """ + Rainflow counted cycles according to Nieslony's ASTM implementation + + Parameters + ---------- + + Returns + ------- + sig_rfc : array-like + array of shape (n,3) with: + sig_rfc[:,0] Cycles amplitude + sig_rfc[:,1] Cycles mean value + sig_rfc[:,2] Cycle type, half (=0.5) or full (=1.0) + + References + ---------- + Adam Nieslony, "Determination of fragments of multiaxial service loading + strongly influencing the fatigue of machine components", + Mechanical Systems and Signal Processing 23, no. 8 (2009): 2712-2721. + + and is based on the following standard: + ASTM E 1049-85 (Reapproved 1997), Standard practices for cycle counting + in fatigue analysis, in: Annual Book of ASTM Standards, + vol. 03.01, ASTM, Philadelphia, 1999, pp. 710-718. + + Copyright (c) 1999-2002 by Adam Nieslony + Ported to Python by David Verelst + + Example + ------- + >>> import wafo + >>> x = wafo.data.sea() + >>> sig_ts = wafo.objects.mat2timeseries(x) + >>> sig_tp = sig_ts.turning_points(h=0, wavetype='astm') + >>> sig_cp = sig_tp.cycle_astm() + """ + + # output of Nieslony's algorithm is organised differently with + # respect to wafo's approach + # TODO: integrate ASTM method into the CyclyPairs class? + return findrfc_astm(self.data) + + +def mat2timeseries(x): + """ + Convert 2D arrays to TimeSeries object + assuming 1st column is time and the remaining columns contain data. + """ + return TimeSeries(x[:, 1::], x[:, 0].ravel()) + + +class TimeSeries(PlotData): + + ''' + Container class for 1D TimeSeries data objects in WAFO + + Member variables + ---------------- + data : array_like + args : vector for 1D, list of vectors for 2D, 3D, ... 
+ + sensortypes : list of integers or strings + sensor type for time series (default ['n'] : Surface elevation) + see sensortype for more options + position : vector of size 3 + instrument position relative to the coordinate system + + Examples + -------- + >>> import wafo.data + >>> import wafo.objects as wo + >>> x = wafo.data.sea() + >>> ts = wo.mat2timeseries(x) + >>> rf = ts.tocovdata(lag=150) + >>> h = rf.plot() + + >>> S = ts.tospecdata() + The default L is set to 325 + + >>> tp = ts.turning_points() + >>> mm = tp.cycle_pairs() + >>> h1 = mm.plot(marker='x') + + >>> lc = mm.level_crossings() + >>> h2 = lc.plot() + + ''' + + def __init__(self, *args, **kwds): + self.name_ = kwds.pop('name', 'WAFO TimeSeries Object') + self.sensortypes = kwds.pop('sensortypes', ['n', ]) + self.position = kwds.pop('position', [zeros(3), ]) + + super(TimeSeries, self).__init__(*args, **kwds) + + if not any(self.args): + n = len(self.data) + self.args = range(0, n) + + def sampling_period(self): + ''' + Returns sampling interval + + Returns + ------- + dt : scalar + sampling interval, unit: + [s] if lagtype=='t' + [m] otherwise + + See also + ''' + dt1 = self.args[1] - self.args[0] + n = size(self.args) - 1 + t = self.args[-1] - self.args[0] + dt = t / n + if abs(dt - dt1) > 1e-10: + warnings.warn('Data is not uniformly sampled!') + return dt + + def tocovdata(self, lag=None, flag='biased', norm=False, dt=None): + ''' + Return auto covariance function from data. + + Parameters + ---------- + lag : scalar, int + maximum time-lag for which the ACF is estimated. (Default lag=n-1) + flag : string, 'biased' or 'unbiased' + If 'unbiased' scales the raw correlation by 1/(n-abs(k)), + where k is the index into the result, otherwise scales the raw + cross-correlation by 1/n. (default) + norm : bool + True if normalize output to one + dt : scalar + time-step between data points (default see sampling_period). + + Return + ------- + R : CovData1D object + with attributes: + data : ACF vector length L+1 + args : time lags length L+1 + sigma : estimated large lag standard deviation of the estimate + assuming x is a Gaussian process: + if R(k)=0 for all lags k>q then an approximation + of the variance for large samples due to Bartlett + var(R(k))=1/N*(R(0)^2+2*R(1)^2+2*R(2)^2+ ..+2*R(q)^2) + for k>q and where N=length(x). Special case is + white noise where it equals R(0)^2/N for k>0 + norm : bool + If false indicating that R is not normalized + + Example: + -------- + >>> import wafo.data + >>> import wafo.objects as wo + >>> x = wafo.data.sea() + >>> ts = wo.mat2timeseries(x) + >>> acf = ts.tocovdata(150) + >>> h = acf.plot() + ''' + n = len(self.data) + if not lag: + lag = n - 1 + + x = self.data.flatten() + indnan = isnan(x) + if any(indnan): + x = x - x[1 - indnan].mean() # remove the mean pab 09.10.2000 + #indnan = find(indnan) + Ncens = n - sum(indnan) + x[indnan] = 0. # pab 09.10.2000 much faster for censored samples + else: + indnan = None + Ncens = n + x = x - x.mean() + + #fft = np.fft.fft + nfft = 2 ** nextpow2(n) + Rper = abs(fft(x, nfft)) ** 2 / Ncens # Raw periodogram + + R = np.real(fft(Rper)) / nfft # %ifft=fft/nfft since Rper is real! + lags = range(0, lag + 1) + if flag.startswith('unbiased'): + # unbiased result, i.e. divide by n-abs(lag) + R = R[lags] * Ncens / arange(Ncens, Ncens - lag, -1) + # else % biased result, i.e. 
divide by n + # r=r(1:L+1)*Ncens/Ncens + + c0 = R[0] + if norm: + R = R / c0 + r0 = R[0] + if dt is None: + dt = self.sampling_period() + t = linspace(0, lag * dt, lag + 1) + #cumsum = np.cumsum + acf = _wafocov.CovData1D(R[lags], t) + acf.sigma = sqrt( + r_[0, r0 ** 2, r0 ** 2 + 2 * cumsum(R[1:] ** 2)] / Ncens) + acf.children = [ + PlotData(-2. * acf.sigma[lags], t), PlotData(2. * acf.sigma[lags], t)] + acf.plot_args_children = ['r:'] + acf.norm = norm + return acf + + def _specdata(self, L=None, tr=None, method='cov', detrend=detrend_mean, window=parzen, noverlap=0, pad_to=None): + """ + Obsolete: Delete? + Return power spectral density by Welches average periodogram method. + + Parameters + ---------- + NFFT : int, scalar + if len(data) < NFFT, it will be zero padded to `NFFT` + before estimation. Must be even; a power 2 is most efficient. + detrend : function + window : vector of length NFFT or function + To create window vectors see numpy.blackman, numpy.hamming, + numpy.bartlett, scipy.signal, scipy.signal.get_window etc. + noverlap : scalar int + gives the length of the overlap between segments. + + Returns + ------- + S : SpecData1D + Power Spectral Density + + Notes + ----- + The data vector is divided into NFFT length segments. Each segment + is detrended by function detrend and windowed by function window. + noverlap gives the length of the overlap between segments. The + absolute(fft(segment))**2 of each segment are averaged to compute Pxx, + with a scaling to correct for power loss due to windowing. + + Reference + --------- + Bendat & Piersol (1986) Random Data: Analysis and Measurement + Procedures, John Wiley & Sons + """ + dt = self.sampling_period() + #fs = 1. / (2 * dt) + yy = self.data.ravel() if tr is None else tr.dat2gauss( + self.data.ravel()) + yy = detrend(yy) if hasattr(detrend, '__call__') else yy + + S, f = psd(yy, Fs=1. / dt, NFFT=L, detrend=detrend, window=window, + noverlap=noverlap, pad_to=pad_to, scale_by_freq=True) + fact = 2.0 * pi + w = fact * f + return _wafospec.SpecData1D(S / fact, w) + + def tospecdata(self, L=None, tr=None, method='cov', detrend=detrend_mean, window=parzen, noverlap=0, ftype='w', alpha=None): + ''' + Estimate one-sided spectral density from data. + + Parameters + ---------- + L : scalar integer + maximum lag size of the window function. As L decreases the estimate + becomes smoother and Bw increases. If we want to resolve peaks in + S which is Bf (Hz or rad/sec) apart then Bw < Bf. If no value is given the + lag size is set to be the lag where the auto correlation is less than + 2 standard deviations. (maximum 300) + tr : transformation object + the transformation assuming that x is a sample of a transformed + Gaussian process. If g is None then x is a sample of a Gaussian process (Default) + method : string + defining estimation method. Options are + 'cov' : Frequency smoothing using a parzen window function + on the estimated autocovariance function. (default) + 'psd' : Welch's averaged periodogram method with no overlapping batches + detrend : function + defining detrending performed on the signal before estimation. + (default detrend_mean) + window : vector of length NFFT or function + To create window vectors see numpy.blackman, numpy.hamming, + numpy.bartlett, scipy.signal, scipy.signal.get_window etc. + noverlap : scalar int + gives the length of the overlap between segments. 
+ ftype : character + defining frequency type: 'w' or 'f' (default 'w') + + Returns + --------- + spec : SpecData1D object + + + Example + ------- + x = load('sea.dat'); + S = dat2spec(x); + specplot(S) + + See also + -------- + dat2tr, dat2cov + + + References: + ----------- + Georg Lindgren and Holger Rootzen (1986) + "Stationara stokastiska processer", pp 173--176. + + Gareth Janacek and Louise Swift (1993) + "TIME SERIES forecasting, simulation, applications", + pp 75--76 and 261--268 + + Emanuel Parzen (1962), + "Stochastic Processes", HOLDEN-DAY, + pp 66--103 + ''' + + #% Initialize constants + #%~~~~~~~~~~~~~~~~~~~~~ + nugget = 1e-12 + rate = 2 + # % interpolationrate for frequency + + wdef = 1 + # % 1=parzen window 2=hanning window, 3= bartlett window + + dt = self.sampling_period() + #yy = self.data if tr is None else tr.dat2gauss(self.data) + yy = self.data.ravel() if tr is None else tr.dat2gauss( + self.data.ravel()) + yy = detrend(yy) if hasattr(detrend, '__call__') else yy + n = len(yy) + L = min(L, n) + + max_L = min(300, n) + # % maximum lag if L is undetermined + estimate_L = L is None + if estimate_L: + L = min(n - 2, int(4. / 3 * max_L + 0.5)) + + if method == 'cov' or estimate_L: + tsy = TimeSeries(yy, self.args) + R = tsy.tocovdata() + if estimate_L: + # finding where ACF is less than 2 st. deviations. + # a better L value + L = max_L + 2 - \ + (np.abs(R.data[max_L::-1]) > 2 * R.sigma[ + max_L::-1]).argmax() + # modify L so that hanning and Parzen give appr. the same + # result + if wdef == 1: + L = min(int(4 * L / 3), n - 2) + print('The default L is set to %d' % L) + try: + win = window(2 * L - 1) + wname = window.__name__ + if wname == 'parzen': + # degrees of freedom used in chi^2 distribution + v = int(3.71 * n / L) + Be = 2 * pi * 1.33 / (L * dt) # % bandwidth (rad/sec) + elif wname == 'hanning': + # degrees of freedom used in chi^2 distribution + v = int(2.67 * n / L) + Be = 2 * pi / (L * dt) + # % bandwidth (rad/sec) + elif wname == 'bartlett': + # degrees of freedom used in chi^2 distribution + v = int(3 * n / L) + Be = 2 * pi * 1.33 / (L * dt) + # bandwidth (rad/sec) + except: + wname = None + win = window + v = None + Be = None + + if method == 'psd': + nfft = 2 ** nextpow2(L) + pad_to = rate * nfft # Interpolate the spectrum with rate + S, f = psd( + yy, Fs=1. / dt, NFFT=nfft, detrend=detrend, window=window(nfft), + noverlap=noverlap, pad_to=pad_to, scale_by_freq=True) + fact = 2.0 * pi + w = fact * f + spec = _wafospec.SpecData1D(S / fact, w) + else: # cov method + # add a nugget effect to ensure that round off errors + # do not result in negative spectral estimates + + R.data[:L] = R.data[:L] * win[L - 1::] + R.data[L] = 0.0 + R.data = R.data[:L + 1] + R.args = R.args[:L + 1] + # R.plot() + # R.show() + spec = R.tospecdata(rate=rate, nugget=nugget) + + spec.Bw = Be + if ftype == 'f': + spec.Bw = Be / (2 * pi) # bandwidth in Hz + + if alpha is not None: + #% Confidence interval constants + spec.CI = [ + v / _invchi2(1 - alpha / 2, v), v / _invchi2(alpha / 2, v)] + + spec.tr = tr + spec.L = L + spec.norm = False + spec.note = 'method=%s' % method +# S = createspec('freq',ftype); +# S.tr = g; +# S.note = ['dat2spec(',inputname(1),'), Method = ' method ]; +# S.norm = 0; % not normalized +# S.L = L; +# S.S = zeros(nf+1,m-1); + return spec + + def _trdata_cdf(self, **options): + ''' + Estimate transformation, g, from observed marginal CDF. + Assumption: a Gaussian process, Y, is related to the + non-Gaussian process, X, by Y = g(X). 
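+
+        In essence the empirical transformation is g(x) = invnorm(Fn(x)),
+        where Fn is the empirical CDF of the data; the result is then
+        smoothed with a spline. A one-line check of the inverse-normal step
+        (a sketch, using scipy.special.ndtri as invnorm):
+
+        >>> from scipy.special import ndtri
+        >>> float(ndtri(0.5)), round(float(ndtri(0.975)), 2)
+        (0.0, 1.96)
+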
+ Parameters + ---------- + options = options structure defining how the smoothing is done. + (See troptset for default values) + Returns + ------- + tr, tr_emp = smoothed and empirical estimate of the transformation g. + + The empirical CDF is usually very irregular. More than one local + maximum of the empirical CDF may cause poor fit of the transformation. + In such case one should use a smaller value of GSM or set a larger + variance for GVAR. If X(t) is likely to cross levels higher than 5 + standard deviations then the vector param has to be modified. For + example if X(t) is unlikely to cross a level of 7 standard deviations + one can use param = [-7 7 513]. + + ''' + + mean = self.data.mean() + sigma = self.data.std() + cdf = edf(self.data.ravel()) + + opt = DotDict( + chkder=True, plotflag=False, gsm=0.05, param=[-5, 5, 513], + delay=2, linextrap=True, ntr=1000, ne=7, gvar=1) + opt.update(options) + Ne = opt.ne + nd = len(cdf.data) + if nd > opt.ntr and opt.ntr > 0: + x0 = linspace(cdf.args[Ne], cdf.args[nd - 1 - Ne], opt.ntr) + cdf.data = interp(x0, cdf.args, cdf.data) + cdf.args = x0 + Ne = 0 + uu = linspace(*opt.param) + + ncr = len(cdf.data) + ng = len(np.atleast_1d(opt.gvar)) + if ng == 1: + gvar = opt.gvar * ones(ncr) + else: + opt.gvar = np.atleast_1d(opt.gvar) + gvar = interp( + linspace(0, 1, ncr), linspace(0, 1, ng), opt.gvar.ravel()) + + ind = np.flatnonzero(diff(cdf.args) > 0) # remove equal points + nd = len(ind) + ind1 = ind[Ne:nd - Ne] + tmp = invnorm(cdf.data[ind]) + + x = sigma * uu + mean + pp_tr = SmoothSpline( + cdf.args[ind1], tmp[Ne:nd - Ne], p=opt.gsm, lin_extrap=opt.linextrap, var=gvar[ind1]) + # g(:,2) = smooth(Fx(ind1,1),tmp(Ne+1:end-Ne),opt.gsm,g(:,1),def,gvar); + tr = TrData(pp_tr(x), x, mean=mean, sigma=sigma) + tr_emp = TrData(tmp, cdf.args[ind], mean=mean, sigma=sigma) + tr_emp.setplotter('step') + + if opt.chkder: + for ix in xrange(5): + dy = diff(tr.data) + if (dy <= 0).any(): + dy[dy > 0] = floatinfo.eps + gvar = - \ + (np.hstack((dy, 0)) + np.hstack((0, dy))) / \ + 2 + floatinfo.eps + pp_tr = SmoothSpline(cdf.args[ind1], tmp[ + Ne:nd - Ne], p=1, lin_extrap=opt.linextrap, var=ix * gvar) + tr = TrData(pp_tr(x), x, mean=mean, sigma=sigma) + else: + break + else: + msg = '''The empirical distribution is not sufficiently smoothed. + The estimated transfer function, g, is not + a strictly increasing function.''' + warnings.warn(msg) + + if opt.plotflag > 0: + tr.plot() + tr_emp.plot() + return tr, tr_emp + + def trdata(self, method='nonlinear', **options): + ''' + Estimate transformation, g, from data. + + Parameters + ---------- + method : string + 'nonlinear' : transform based on smoothed crossing intensity (default) + 'mnonlinear': transform based on smoothed marginal distribution + 'hermite' : transform based on cubic Hermite polynomial + 'ochi' : transform based on exponential function + 'linear' : identity. + + options : keyword with the following fields: + csm,gsm - defines the smoothing of the logarithm of crossing intensity + and the transformation g, respectively. Valid values must + be 0<=csm,gsm<=1. (default csm=0.9, gsm=0.05) + Smaller values gives smoother functions. + param - vector which defines the region of variation of the data x. + (default see lc2tr). + plotflag - 0 no plotting (Default) + 1 plots empirical and smoothed g(u) and the theoretical for + a Gaussian model. 
+ 2 monitor the development of the estimation + linextrap - 0 use a regular smoothing spline + 1 use a smoothing spline with a constraint on the ends to + ensure linear extrapolation outside the range of the data. + (default) + gvar - Variances for the empirical transformation, g. (default 1) + ne - Number of extremes (maxima & minima) to remove from the + estimation of the transformation. This makes the + estimation more robust against outliers. (default 7) + ntr - Maximum length of empirical crossing intensity or CDF. + The empirical crossing intensity or CDF is interpolated + linearly before smoothing if their lengths exceeds Ntr. + A reasonable NTR will significantly speed up the + estimation for long time series without loosing any + accuracy. NTR should be chosen greater than + PARAM(3). (default 1000) + + Returns + ------- + tr, tr_emp : TrData objects + with the smoothed and empirical transformation, respectively. + + + TRDATA estimates the transformation in a transformed Gaussian model. + Assumption: a Gaussian process, Y, is related to the + non-Gaussian process, X, by Y = g(X). + + The empirical crossing intensity is usually very irregular. + More than one local maximum of the empirical crossing intensity + may cause poor fit of the transformation. In such case one + should use a smaller value of CSM. In order to check the effect + of smoothing it is recomended to also plot g and g2 in the same plot or + plot the smoothed g against an interpolated version of g (when CSM=GSM=1). + If x is likely to cross levels higher than 5 standard deviations + then the vector param has to be modified. For example if x is + unlikely to cross a level of 7 standard deviations one can use + PARAM=[-7 7 513]. + + Example + ------- + >>> import wafo.spectrum.models as sm + >>> import wafo.transform.models as tm + >>> from wafo.objects import mat2timeseries + >>> Hs = 7.0 + >>> Sj = sm.Jonswap(Hm0=Hs) + >>> S = Sj.tospecdata() #Make spectrum object from numerical values + >>> S.tr = tm.TrOchi(mean=0, skew=0.16, kurt=0, sigma=Hs/4, ysigma=Hs/4) + >>> xs = S.sim(ns=2**16, iseed=10) + >>> ts = mat2timeseries(xs) + >>> g0, g0emp = ts.trdata(monitor=True) # Monitor the development + >>> g1, g1emp = ts.trdata(method='m', gvar=0.5 ) # Equal weight on all points + >>> g2, g2emp = ts.trdata(method='n', gvar=[3.5, 0.5, 3.5]) # Less weight on the ends + >>> int(S.tr.dist2gauss()*100) + 141 + >>> int(g0emp.dist2gauss()*100) + 217949 + >>> int(g0.dist2gauss()*100) + 93 + >>> int(g1.dist2gauss()*100) + 66 + >>> int(g2.dist2gauss()*100) + 84 + + See also + -------- + LevelCrossings.trdata + wafo.transform.models + + References + ---------- + Rychlik, I. , Johannesson, P and Leadbetter, M. R. (1997) + "Modelling and statistical analysis of ocean wavedata using + transformed Gaussian process." + Marine structures, Design, Construction and Safety, Vol. 10, No. 1, pp 13--47 + + + Brodtkorb, P, Myrhaug, D, and Rue, H (1999) + "Joint distribution of wave height and crest velocity from + reconstructed data" + in Proceedings of 9th ISOPE Conference, Vol III, pp 66-73 + ''' + + # opt = troptset('plotflag','off','csm',.95,'gsm',.05,.... + # 'param',[-5 5 513],'delay',2,'linextrap','on','ne',7,... 
+ # 'cvar',1,'gvar',1,'multip',0); + opt = DotDict(chkder=True, plotflag=False, csm=.95, gsm=.05, + param=[-5, 5, 513], delay=2, ntr=1000, linextrap=True, ne=7, cvar=1, gvar=1, + multip=False, crossdef='uM') + opt.update(**options) + + ma = self.data.mean() + sa = self.data.std() + + if method.startswith('lin'): + return TrLinear(mean=ma, sigma=sa) + + if method[0] == 'n': + tp = self.turning_points() + mM = tp.cycle_pairs() + lc = mM.level_crossings(opt.crossdef) + return lc.trdata(mean=ma, sigma=sa, **opt) + elif method[0] == 'm': + return self._trdata_cdf(**opt) + elif method[0] == 'h': + ga1 = skew(self.data) + ga2 = kurtosis(self.data, fisher=True) # kurt(xx(n+1:end))-3; + up = min(4 * (4 * ga1 / 3) ** 2, 13) + lo = (ga1 ** 2) * 3 / 2 + kurt1 = min(up, max(ga2, lo)) + 3 + return TrHermite(mean=ma, var=sa ** 2, skew=ga1, kurt=kurt1) + elif method[0] == 'o': + ga1 = skew(self.data) + return TrOchi(mean=ma, var=sa ** 2, skew=ga1) + + def turning_points(self, h=0.0, wavetype=None): + ''' + Return turning points (tp) from data, optionally rainflowfiltered. + + Parameters + ---------- + h : scalar + a threshold + if h<=0, then tp is a sequence of turning points (default) + if h>0, then all rainflow cycles with height smaller than + h are removed. + + wavetype : string + defines the type of wave. Possible options are + 'astm' 'mw' 'Mw' or 'none'. + If None all rainflow filtered min and max + will be returned, otherwise only the rainflow filtered + min and max, which define a wave according to the + wave definition, will be returned. + 'astm' forces to have the first data point of the load history as + the first turning point. To be used in combination with + TurningPoints.cycle_astm() + + Returns + ------- + tp : TurningPoints object + with times and turning points. + + Example: + >>> import wafo.data + >>> x = wafo.data.sea() + >>> x1 = x[:200,:] + >>> ts1 = mat2timeseries(x1) + >>> tp = ts1.turning_points(wavetype='Mw') + >>> tph = ts1.turning_points(h=0.3,wavetype='Mw') + >>> hs = ts1.plot() + >>> hp = tp.plot('ro') + >>> hph = tph.plot('k.') + + See also + --------- + findcross, + findrfc + findtp + ''' + ind = findtp(self.data, max(h, 0.0), wavetype) + try: + t = self.args[ind] + except: + t = ind + mean = self.data.mean() + sigma = self.data.std() + return TurningPoints(self.data[ind], t, mean=mean, sigma=sigma) + + def trough_crest(self, v=None, wavetype=None): + """ + Return trough and crest turning points + + Parameters + ----------- + v : scalar + reference level (default v = mean of x). + + wavetype : string + defines the type of wave. Possible options are + 'dw', 'uw', 'tw', 'cw' or None. + If None indices to all troughs and crests will be returned, + otherwise only the paired ones will be returned + according to the wavedefinition. + + Returns + -------- + tc : TurningPoints object + with trough and crest turningpoints + """ + ind = findtc(self.data, v, wavetype)[0] + try: + t = self.args[ind] + except: + t = ind + mean = self.data.mean() + sigma = self.data.std() + return TurningPoints(self.data[ind], t, mean=mean, sigma=sigma) + + def wave_parameters(self, rate=1): + ''' + Returns several wave parameters from data. + + Parameters + ---------- + rate : scalar integer + interpolation rate. Interpolates with spline if greater than one. 
+ + Returns + ------- + parameters : dict + wave parameters such as + Ac, At : Crest and trough amplitude, respectively + Tcf, Tcb : Crest front and crest (rear) back period, respectively + Hu, Hd : zero-up-crossing and zero-downcrossing wave height, respectively. + Tu, Td : zero-up-crossing and zero-downcrossing wave period, respectively. + + The definition of g, Ac,At, Tcf, etc. are given in gravity and + wafo.definitions. + + Example + ------- + >>> import wafo.data as wd + >>> import wafo.objects as wo + >>> x = wd.sea() + >>> ts = wo.mat2timeseries(x) + >>> wp = ts.wave_parameters() + >>> for name in ['Ac', 'At', 'Hu', 'Hd', 'Tu', 'Td', 'Tcf', 'Tcb']: + ... print('%s' % name, wp[name][:2]) + ('Ac', array([ 0.25950546, 0.34950546])) + ('At', array([ 0.16049454, 0.43049454])) + ('Hu', array([ 0.69, 0.86])) + ('Hd', array([ 0.42, 0.78])) + ('Tu', array([ 6.10295202, 3.36978685])) + ('Td', array([ 3.84377468, 6.35707656])) + ('Tcf', array([ 0.42656819, 0.57361617])) + ('Tcb', array([ 0.93355982, 1.04063638])) + + >>> import pylab as plt + >>> h = plt.plot(wp['Td'],wp['Hd'],'.') + >>> h = plt.xlabel('Td [s]') + >>> h = plt.ylabel('Hd [m]') + + + See also + -------- + wafo.definitions + ''' + dT = self.sampling_period() + if rate > 1: + dT = dT / rate + t0, tn = self.args[0], self.args[-1] + n = len(self.args) + ti = linspace(t0, tn, int(rate * n)) + xi = interp1d(self.args, self.data.ravel(), kind='cubic')(ti) + + else: + ti, xi = self.args, self.data.ravel() + + tc_ind, z_ind = findtc(xi, v=0, kind='tw') + tc_a = xi[tc_ind] + tc_t = ti[tc_ind] + Ac = tc_a[1::2] # crest amplitude + At = -tc_a[0::2] # trough amplitude + Hu = Ac + At[1:] + Hd = Ac + At[:-1] + tu = ecross(ti, xi, z_ind[1::2], v=0) + Tu = diff(tu) # Period zero-upcrossing waves + td = ecross(ti, xi, z_ind[::2], v=0) + Td = diff(td) # Period zero-downcrossing waves + Tcf = tc_t[1::2] - tu[:-1] + Tcf[(Tcf == 0)] = dT # avoiding division by zero + Tcb = td[1:] - tc_t[1::2] + Tcb[(Tcb == 0)] = dT + # % avoiding division by zero + return dict(Ac=Ac, At=At, Hu=Hu, Hd=Hd, Tu=Tu, Td=Td, Tcf=Tcf, Tcb=Tcb) + + def wave_height_steepness(self, method=1, rate=1, g=None): + ''' + Returns waveheights and steepnesses from data. + + Parameters + ---------- + rate : scalar integer + interpolation rate. Interpolates with spline if greater than one. + + method : scalar integer + 0 max(Vcf, Vcb) and corresponding wave height Hd or Hu in H + 1 crest front (rise) speed (Vcf) in S and wave height Hd in H. (default) + -1 crest back (fall) speed (Vcb) in S and waveheight Hu in H. + 2 crest front steepness in S and the wave height Hd in H. + -2 crest back steepness in S and the wave height Hu in H. + 3 total wave steepness in S and the wave height Hd in H + for zero-downcrossing waves. + -3 total wave steepness in S and the wave height Hu in H. + for zero-upcrossing waves. + Returns + ------- + S, H = Steepness and the corresponding wave height according to method + + + The parameters are calculated as follows: + Crest front speed (velocity) = Vcf = Ac/Tcf + Crest back speed (velocity) = Vcb = Ac/Tcb + Crest front steepness = 2*pi*Ac./Td/Tcf/g + Crest back steepness = 2*pi*Ac./Tu/Tcb/g + Total wave steepness (zero-downcrossing wave) = 2*pi*Hd./Td.^2/g + Total wave steepness (zero-upcrossing wave) = 2*pi*Hu./Tu.^2/g + + The definition of g, Ac,At, Tcf, etc. are given in gravity and + wafo.definitions. 
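+
+        For instance, the total steepness of a zero-downcrossing wave with
+        Hd = 2 m and Td = 8 s, taking g = 9.81 m/s**2, is (a hand check of
+        the formula above):
+
+        >>> import numpy as np
+        >>> round(2 * np.pi * 2.0 / 8.0 ** 2 / 9.81, 4)
+        0.02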
+ + Example + ------- + >>> import wafo.data as wd + >>> import wafo.objects as wo + >>> x = wd.sea() + >>> ts = wo.mat2timeseries(x) + >>> for i in xrange(-3,4): + ... S, H = ts.wave_height_steepness(method=i) + ... print(S[:2],H[:2]) + (array([ 0.01186982, 0.04852534]), array([ 0.69, 0.86])) + (array([ 0.02918363, 0.06385979]), array([ 0.69, 0.86])) + (array([ 0.27797411, 0.33585743]), array([ 0.69, 0.86])) + (array([ 0.60835634, 0.60930197]), array([ 0.42, 0.78])) + (array([ 0.60835634, 0.60930197]), array([ 0.42, 0.78])) + (array([ 0.10140867, 0.06141156]), array([ 0.42, 0.78])) + (array([ 0.01821413, 0.01236672]), array([ 0.42, 0.78])) + + >>> import pylab as plt + >>> h = plt.plot(S,H,'.') + >>> h = plt.xlabel('S') + >>> h = plt.ylabel('Hd [m]') + + + See also + -------- + wafo.definitions + ''' + + dT = self.sampling_period() + if g is None: + g = gravity() # % acceleration of gravity + + if rate > 1: + dT = dT / rate + t0, tn = self.args[0], self.args[-1] + n = len(self.args) + ti = linspace(t0, tn, int(rate * n)) + xi = interp1d(self.args, self.data.ravel(), kind='cubic')(ti) + + else: + ti, xi = self.args, self.data.ravel() + + tc_ind, z_ind = findtc(xi, v=0, kind='tw') + tc_a = xi[tc_ind] + tc_t = ti[tc_ind] + Ac = tc_a[1::2] # crest amplitude + At = -tc_a[0::2] # trough amplitude + + if (0 <= method and method <= 2): + # time between zero-upcrossing and crest [s] + tu = ecross(ti, xi, z_ind[1:-1:2], v=0) + Tcf = tc_t[1::2] - tu + Tcf[(Tcf == 0)] = dT # avoiding division by zero + if (0 >= method and method >= -2): + # time between crest and zero-downcrossing [s] + td = ecross(ti, xi, z_ind[2::2], v=0) + Tcb = td - tc_t[1::2] + Tcb[(Tcb == 0)] = dT + # % avoiding division by zero + + if method == 0: + # max(Vcf, Vcr) and the corresponding wave height Hd or Hu in H + Hu = Ac + At[1:] + Hd = Ac + At[:-1] + T = np.where(Tcf < Tcb, Tcf, Tcb) + S = Ac / T + H = np.where(Tcf < Tcb, Hd, Hu) + elif method == 1: # extracting crest front velocity [m/s] and + # Zero-downcrossing wave height [m] + H = Ac + At[:-1] # Hd + S = Ac / Tcf + elif method == -1: # extracting crest rear velocity [m/s] and + # Zero-upcrossing wave height [m] + H = Ac + At[1:] # Hu + S = Ac / Tcb + #crest front steepness in S and the wave height Hd in H. + elif method == 2: + H = Ac + At[:-1] # Hd + Td = diff(ecross(ti, xi, z_ind[::2], v=0)) + S = 2 * pi * Ac / Td / Tcf / g + # crest back steepness in S and the wave height Hu in H. + elif method == -2: + H = Ac + At[1:] + Tu = diff(ecross(ti, xi, z_ind[1::2], v=0)) + S = 2 * pi * Ac / Tu / Tcb / g + elif method == 3: # total steepness in S and the wave height Hd in H + # for zero-doewncrossing waves. + H = Ac + At[:-1] + # Period zero-downcrossing waves + Td = diff(ecross(ti, xi, z_ind[::2], v=0)) + S = 2 * pi * H / Td ** 2 / g + # total steepness in S and the wave height Hu in H for + elif method == -3: + # zero-upcrossing waves. + H = Ac + At[1:] + # Period zero-upcrossing waves + Tu = diff(ecross(ti, xi, z_ind[1::2], v=0)) + S = 2 * pi * H / Tu ** 2 / g + + return S, H + + def wave_periods(self, vh=None, pdef='d2d', wdef=None, index=None, rate=1): + """ + Return sequence of wave periods/lengths from data. + + Parameters + ---------- + vh : scalar + reference level ( default v=mean(x(:,2)) ) or + rainflow filtering height (default h=0) + pdef : string + defining type of waveperiod (wavelength) returned: + Level v separated 't2c', 'c2t', 't2t' or 'c2c' -waveperiod. + Level v 'd2d', 'u2u', 'd2u' or 'u2d' -waveperiod. 
+ Rain flow filtered (with height greater than h) + 'm2M', 'M2m', 'm2m' or 'M2M' -waveperiod. + Explanation to the abbreviations: + M=Max, m=min, d=down-crossing, u=up-crossing , + t=trough and c=crest. + Thus 'd2d' means period between a down-crossing to the + next down-crossing and 'u2c' means period between a + u-crossing to the following crest. + wdef : string + defining type of wave. Possible options are + 'mw','Mw','dw', 'uw', 'tw', 'cw' or None. + If wdef is None all troughs and crests will be used, + otherwise only the troughs and crests which define a + wave according to the wavedefinition are used. + + index : vector + index sequence of one of the following : + -level v-crossings (indices to "du" are required to + calculate 'd2d', 'd2u', 'u2d' or 'u2u' waveperiods) + -level v separated trough and crest turningpoints + (indices to 'tc' are required to calculate + 't2t', 't2c', 'c2t' or 'c2c' waveperiods) + -level v crossings and level v separated trough and + crest turningpoints (indices to "dutc" are + required to calculate t2u, u2c, c2d or d2t + waveperiods) + -rainflow filtered turningpoints with minimum rfc height h + (indices to "mMtc" are required to calculate + 'm2m', 'm2M', 'M2m' or 'M2M' waveperiods) + + rate : scalar + interpolation rate. If rate larger than one, then x is + interpolated before extrating T + + Returns + -------- + T : vector + sequence of waveperiods (or wavelengths). + index : vector + of indices + + + Example: + -------- + Histogram of crest2crest waveperiods + >>> import wafo.data as wd + >>> import wafo.objects as wo + >>> import pylab as plb + >>> x = wd.sea() + >>> ts = wo.mat2timeseries(x[0:400,:]) + >>> T, ix = ts.wave_periods(vh=0.0,pdef='c2c') + >>> h = plb.hist(T) + + See also: + -------- + findtp, + findtc, + findcross, perioddef + """ + +# % This is a more flexible version than the dat2hwa or tp2wa routines. +# % There is a secret option: if pdef='all' the function returns +# % all the waveperiods 'd2t', 't2u', 'u2c' and 'c2d' in sequence. +# % It is up to the user to extract the right waveperiods. +# % If the first is a down-crossing then the first is a 'd2t' waveperiod. +# % If the first is a up-crossing then the first is a 'u2c' waveperiod. 
+# % +# % Example: +# % [T ind]=dat2wa(x,0,'all') %returns all waveperiods +# % nn = length(T) +# % % want to extract all t2u waveperiods +# % if x(ind(1),2)>0 % if first is down-crossing +# % Tt2u=T(2:4:nn) +# % else % first is up-crossing +# % Tt2u=T(4:4:nn) +# % end + + if rate > 1: # % interpolate with spline + n = ceil(self.data.size * rate) + ti = linspace(self.args[0], self.args[-1], n) + x = stineman_interp(ti, self.args, self.data.ravel()) + else: + x = self.data + ti = self.args + + if vh is None: + if pdef[0] in ('m', 'M'): + vh = 0 + print(' The minimum rfc height, h, is set to: %g' % vh) + else: + vh = x.mean() + print(' The level l is set to: %g' % vh) + + if index is None: + if pdef in ('m2m', 'm2M', 'M2m', 'M2M'): + index = findtp(x, vh, wdef) + elif pdef in ('u2u', 'u2d', 'd2u', 'd2d'): + index = findcross(x, vh, wdef) + elif pdef in ('t2t', 't2c', 'c2t', 'c2c'): + index = findtc(x, vh, wdef)[0] + elif pdef in ('d2t', 't2u', 'u2c', 'c2d', 'all'): + index, v_ind = findtc(x, vh, wdef) + #% sorting crossings and tp in sequence + index = sort(r_[index, v_ind]) + else: + raise ValueError('Unknown pdef option!') + + if (x[index[0]] > x[index[1]]): # % if first is down-crossing or max + if pdef in ('d2t', 'M2m', 'c2t', 'd2u', 'M2M', 'c2c', 'd2d', 'all'): + start = 1 + elif pdef in ('t2u', 'm2M', 't2c', 'u2d', 'm2m', 't2t', 'u2u'): + start = 2 + elif pdef in ('u2c'): + start = 3 + elif pdef in ('c2d'): + start = 4 + else: + raise ValueError('Unknown pdef option!') + # else first is up-crossing or min + elif pdef in ('all', 'u2c', 'm2M', 't2c', 'u2d', 'm2m', 't2t', 'u2u'): + start = 0 + elif pdef in ('c2d', 'M2m', 'c2t', 'd2u', 'M2M', 'c2c', 'd2d'): + start = 1 + elif pdef in ('d2t'): + start = 2 + elif pdef in ('t2u'): + start = 3 + else: + raise ValueError('Unknown pdef option!') + + # determine the steps between wanted periods + if pdef in ('d2t', 't2u', 'u2c', 'c2d'): + step = 4 + elif pdef in ('all'): + step = 1 # % secret option! + else: + step = 2 + + #% determine the distance between min2min, t2t etc.. + if pdef in ('m2m', 't2t', 'u2u', 'M2M', 'c2c', 'd2d'): + dist = 2 + else: + dist = 1 + + nn = len(index) + #% New call: (pab 28.06.2001) + if pdef[0] in ('u', 'd'): + t0 = ecross(ti, x, index[start:(nn - dist):step], vh) + else: # % min, Max, trough, crest or all crossings wanted + t0 = x[index[start:(nn - dist):step]] + + if pdef[2] in ('u', 'd'): + t1 = ecross(ti, x, index[(start + dist):nn:step], vh) + else: # % min, Max, trough, crest or all crossings wanted + t1 = x[index[(start + dist):nn:step]] + + T = t1 - t0 + return T, index + + def reconstruct(self): + # TODO: finish reconstruct + pass + + def plot_wave(self, sym1='k.', ts=None, sym2='k+', nfig=None, nsub=None, + sigma=None, vfact=3): + ''' + Plots the surface elevation of timeseries. + + Parameters + ---------- + sym1, sym2 : string + plot symbol and color for data and ts, respectively + (see PLOT) (default 'k.' and 'k+') + ts : TimeSeries or TurningPoints object + to overplot data. default zero-separated troughs and crests. + nsub : scalar integer + Number of subplots in each figure. By default nsub is such that + there are about 20 mean down crossing waves in each subplot. + If nfig is not given and nsub is larger than 6 then nsub is + changed to nsub=min(6,ceil(nsub/nfig)) + nfig : scalar integer + Number of figures. By default nfig=ceil(Nsub/6). + sigma : real scalar + standard deviation of data. 
+ vfact : real scalar + how large in stdev the vertical scale should be (default 3) + + + Example + ------- + Plot x1 with red lines and mark troughs and crests with blue circles. + >>> import wafo + >>> x = wafo.data.sea() + >>> ts150 = wafo.objects.mat2timeseries(x[:150,:]) + >>> h = ts150.plot_wave('r-', sym2='bo') + + See also + -------- + findtc, plot + ''' + + nw = 20 + tn = self.args + xn = self.data.ravel() + indmiss = isnan(xn) # indices to missing points + indg = where(1 - indmiss)[0] + if ts is None: + tc_ix = findtc(xn[indg], 0, 'tw')[0] + xn2 = xn[tc_ix] + tn2 = tn[tc_ix] + else: + xn2 = ts.data + tn2 = ts.args + + if sigma is None: + sigma = xn[indg].std() + + if nsub is None: + # about Nw mdc waves in each plot + nsub = int(len(xn2) / (2 * nw)) + 1 + if nfig is None: + nfig = int(ceil(nsub / 6)) + nsub = min(6, int(ceil(nsub / nfig))) + + n = len(xn) + Ns = int(n / (nfig * nsub)) + ind = r_[0:Ns] + if all(xn >= 0): + vscale = [0, 2 * sigma * vfact] # @UnusedVariable + else: + vscale = array([-1, 1]) * vfact * sigma # @UnusedVariable + + XlblTxt = 'Time [sec]' + dT = 1 + timespan = tn[ind[-1]] - tn[ind[0]] + if abs(timespan) > 18000: # more than 5 hours + dT = 1 / (60 * 60) + XlblTxt = 'Time (hours)' + elif abs(timespan) > 300: # more than 5 minutes + dT = 1 / 60 + XlblTxt = 'Time (minutes)' + + if np.max(abs(xn[indg])) > 5 * sigma: + XlblTxt = XlblTxt + ' (Spurious data since max > 5 std.)' + + plot = plotbackend.plot + subplot = plotbackend.subplot + figs = [] + for unused_iz in xrange(nfig): + figs.append(plotbackend.figure()) + plotbackend.title('Surface elevation from mean water level (MWL).') + for ix in xrange(nsub): + if nsub > 1: + subplot(nsub, 1, ix) + + h_scale = array([tn[ind[0]], tn[ind[-1]]]) + ind2 = where((h_scale[0] <= tn2) & (tn2 <= h_scale[1]))[0] + plot(tn[ind] * dT, xn[ind], sym1) + if len(ind2) > 0: + plot(tn2[ind2] * dT, xn2[ind2], sym2) + plot(h_scale * dT, [0, 0], 'k-') + #plotbackend.axis([h_scale*dT, v_scale]) + + for iy in [-2, 2]: + plot(h_scale * dT, iy * sigma * ones(2), ':') + + ind = ind + Ns + # end + plotbackend.xlabel(XlblTxt) + + return figs + + def plot_sp_wave(self, wave_idx_, *args, **kwds): + """ + Plot specified wave(s) from timeseries + + Parameters + ---------- + wave_idx : integer vector + of indices to waves we want to plot, i.e., wave numbers. + tz_idx : integer vector + of indices to the beginning, middle and end of + defining wave, i.e. for zero-downcrossing waves, indices to + zerocrossings (default trough2trough wave) + + Examples + -------- + Plot waves nr. 6,7,8 and waves nr. 
12,13,...,17 + >>> import wafo + >>> x = wafo.data.sea() + >>> ts = wafo.objects.mat2timeseries(x[0:500,...]) + >>> h = ts.plot_sp_wave(np.r_[6:9,12:18]) + + See also + -------- + plot_wave, findtc + """ + wave_idx = atleast_1d(wave_idx_).flatten() + tz_idx = kwds.pop('tz_idx', None) + if tz_idx is None: + # finding trough to trough waves + unused_tc_ind, tz_idx = findtc(self.data, 0, 'tw') + + dw = nonzero(abs(diff(wave_idx)) > 1)[0] + Nsub = dw.size + 1 + Nwp = zeros(Nsub, dtype=int) + if Nsub > 1: + dw = dw + 1 + Nwp[Nsub - 1] = wave_idx[-1] - wave_idx[dw[-1]] + 1 + wave_idx[dw[-1] + 1:] = -2 + for ix in range(Nsub - 2, 1, -2): + # # of waves pr subplot + Nwp[ix] = wave_idx[dw[ix] - 1] - wave_idx[dw[ix - 1]] + 1 + wave_idx[dw[ix - 1] + 1:dw[ix]] = -2 + + Nwp[0] = wave_idx[dw[0] - 1] - wave_idx[0] + 1 + wave_idx[1:dw[0]] = -2 + wave_idx = wave_idx[wave_idx > -1] + else: + Nwp[0] = wave_idx[-1] - wave_idx[0] + 1 + # end + + Nsub = min(6, Nsub) + Nfig = int(ceil(Nsub / 6)) + Nsub = min(6, int(ceil(Nsub / Nfig))) + figs = [] + for unused_iy in range(Nfig): + figs.append(plotbackend.figure()) + for ix in range(Nsub): + plotbackend.subplot(Nsub, 1, mod(ix, Nsub) + 1) + ind = r_[tz_idx[2 * wave_idx[ix] - 1]:tz_idx[ + 2 * wave_idx[ix] + 2 * Nwp[ix] - 1]] + # indices to wave + plotbackend.plot(self.args[ind], self.data[ind], *args, **kwds) + plotbackend.hold('on') + xi = [self.args[ind[0]], self.args[ind[-1]]] + plotbackend.plot(xi, [0, 0]) + + if Nwp[ix] == 1: + plotbackend.ylabel('Wave %d' % wave_idx[ix]) + else: + plotbackend.ylabel( + 'Wave %d - %d' % (wave_idx[ix], wave_idx[ix] + Nwp[ix] - 1)) + + plotbackend.xlabel('Time [sec]') + # wafostamp + return figs + +# def hyperbolic_ratio(a, b, sa, sb): +# ''' +# Return ratio of hyperbolic functions +# to allow extreme variations of arguments. +# +# Parameters +# ---------- +# a, b : array-like +# arguments vectors of the same size +# sa, sb : scalar integers +# defining the hyperbolic function used, i.e., f(x,1)=cosh(x), f(x,-1)=sinh(x) +# +# Returns +# ------- +# r : ndarray +# f(a,sa)/f(b,sb), ratio of hyperbolic functions of same +# size as a and b +# Examples +# -------- +# >>> x = [-2,0,2] +# >>> hyperbolic_ratio(x,1,1,1) # gives r=cosh(x)/cosh(1) +# array([ 2.438107 , 0.64805427, 2.438107 ]) +# >>> hyperbolic_ratio(x,1,1,-1) # gives r=cosh(x)/sinh(1) +# array([ 3.20132052, 0.85091813, 3.20132052]) +# >>> hyperbolic_ratio(x,1,-1,1) # gives r=sinh(x)/cosh(1) +# array([-2.35040239, 0. , 2.35040239]) +# >>> hyperbolic_ratio(x,1,-1,-1) # gives r=sinh(x)/sinh(1) +# array([-3.08616127, 0. 
, 3.08616127]) +# >>> hyperbolic_ratio(1,x,1,1) # gives r=cosh(1)/cosh(x) +# array([ 0.41015427, 1.54308063, 0.41015427]) +# >>> hyperbolic_ratio(1,x,1,-1) # gives r=cosh(1)/sinh(x) +# array([-0.42545906, inf, 0.42545906]) +# >>> hyperbolic_ratio(1,x,-1,1) # gives r=sinh(1)/cosh(x) +# array([ 0.3123711 , 1.17520119, 0.3123711 ]) +# >>> hyperbolic_ratio(1,x,-1,-1) # gives r=sinh(1)/sinh(x) +# array([-0.32402714, inf, 0.32402714]) +# +# See also +# -------- +# tran +# ''' +# ak, bk, sak, sbk = np.atleast_1d(a, b, sign(sa), sign(sb)) +# old call +# return exp(ak-bk)*(1+sak*exp(-2*ak))/(1+sbk*exp(-2*bk)) +# TODO: Does not always handle division by zero correctly +# +# signRatio = np.where(sak * ak < 0, sak, 1) +# signRatio = np.where(sbk * bk < 0, sbk * signRatio, signRatio) +# +# bk = np.abs(bk) +# ak = np.abs(ak) +# +# num = np.where(sak < 0, expm1(-2 * ak), 1 + exp(-2 * ak)) +# den = np.where(sbk < 0, expm1(-2 * bk), 1 + exp(-2 * bk)) +# iden = np.ones(den.shape) * inf +# ind = np.flatnonzero(den != 0) +# iden.flat[ind] = 1.0 / den[ind] +# val = np.where(num == den, 1, num * iden) +# return signRatio * exp(ak - bk) * val #((sak+exp(-2*ak))/(sbk+exp(-2*bk))) +# +# def sensor_typeid(*sensortypes): +# ''' Return ID for sensortype name +# +# Parameter +# --------- +# sensortypes : list of strings defining the sensortype +# +# Returns +# ------- +# sensorids : list of integers defining the sensortype +# +# Valid senor-ids and -types for time series are as follows: +# 0, 'n' : Surface elevation (n=Eta) +# 1, 'n_t' : Vertical surface velocity +# 2, 'n_tt' : Vertical surface acceleration +# 3, 'n_x' : Surface slope in x-direction +# 4, 'n_y' : Surface slope in y-direction +# 5, 'n_xx' : Surface curvature in x-direction +# 6, 'n_yy' : Surface curvature in y-direction +# 7, 'n_xy' : Surface curvature in xy-direction +# 8, 'P' : Pressure fluctuation about static MWL pressure +# 9, 'U' : Water particle velocity in x-direction +# 10, 'V' : Water particle velocity in y-direction +# 11, 'W' : Water particle velocity in z-direction +# 12, 'U_t' : Water particle acceleration in x-direction +# 13, 'V_t' : Water particle acceleration in y-direction +# 14, 'W_t' : Water particle acceleration in z-direction +# 15, 'X_p' : Water particle displacement in x-direction from its mean position +# 16, 'Y_p' : Water particle displacement in y-direction from its mean position +# 17, 'Z_p' : Water particle displacement in z-direction from its mean position +# +# Example: +# >>> sensor_typeid('W','v') +# [11, 10] +# >>> sensor_typeid('rubbish') +# [nan] +# +# See also +# -------- +# sensor_type +# ''' +# +# sensorid_table = dict(n=0, n_t=1, n_tt=2, n_x=3, n_y=4, n_xx=5, +# n_yy=6, n_xy=7, p=8, u=9, v=10, w=11, u_t=12, +# v_t=13, w_t=14, x_p=15, y_p=16, z_p=17) +# try: +# return [sensorid_table.get(name.lower(), nan) for name in sensortypes] +# except: +# raise ValueError('Input must be a string!') +# +# +# +# def sensor_type(*sensorids): +# ''' +# Return sensortype name +# +# Parameter +# --------- +# sensorids : vector or list of integers defining the sensortype +# +# Returns +# ------- +# sensornames : tuple of strings defining the sensortype +# Valid senor-ids and -types for time series are as follows: +# 0, 'n' : Surface elevation (n=Eta) +# 1, 'n_t' : Vertical surface velocity +# 2, 'n_tt' : Vertical surface acceleration +# 3, 'n_x' : Surface slope in x-direction +# 4, 'n_y' : Surface slope in y-direction +# 5, 'n_xx' : Surface curvature in x-direction +# 6, 'n_yy' : Surface curvature in y-direction +# 7, 'n_xy' : 
Surface curvature in xy-direction +# 8, 'P' : Pressure fluctuation about static MWL pressure +# 9, 'U' : Water particle velocity in x-direction +# 10, 'V' : Water particle velocity in y-direction +# 11, 'W' : Water particle velocity in z-direction +# 12, 'U_t' : Water particle acceleration in x-direction +# 13, 'V_t' : Water particle acceleration in y-direction +# 14, 'W_t' : Water particle acceleration in z-direction +# 15, 'X_p' : Water particle displacement in x-direction from its mean position +# 16, 'Y_p' : Water particle displacement in y-direction from its mean position +# 17, 'Z_p' : Water particle displacement in z-direction from its mean position +# +# Example: +# >>> sensor_type(range(3)) +# ('n', 'n_t', 'n_tt') +# +# See also +# -------- +# sensor_typeid, tran +# ''' +# valid_names = ('n', 'n_t', 'n_tt', 'n_x', 'n_y', 'n_xx', 'n_yy', 'n_xy', +# 'p', 'u', 'v', 'w', 'u_t', 'v_t', 'w_t', 'x_p', 'y_p', 'z_p', +# nan) +# ids = atleast_1d(*sensorids) +# if isinstance(ids, list): +# ids = hstack(ids) +# n = len(valid_names) - 1 +# ids = where(((ids < 0) | (n < ids)), n , ids) +# return tuple(valid_names[i] for i in ids) +# +# class TransferFunction(object): +# ''' +# Class for computing transfer functions based on linear wave theory +# of the system with input surface elevation, +# eta(x0,y0,t) = exp(i*(kx*x0+ky*y0-w*t)), +# and output Y determined by sensortype and position of sensor. +# +# Member methods +# -------------- +# tran(w, theta, kw) +# +# Hw = a function of frequency only (not direction) size 1 x Nf +# Gwt = a function of frequency and direction size Nt x Nf +# w = vector of angular frequencies in Rad/sec. Length Nf +# theta = vector of directions in radians Length Nt (default 0) +# ( theta = 0 -> positive x axis theta = pi/2 -> positive y axis) +# Member variables +# ---------------- +# pos : [x,y,z] +# vector giving coordinate position relative to [x0 y0 z0] (default [0,0,0]) +# sensortype = string +# defining the sensortype or transfer function in output. 
+# 0, 'n' : Surface elevation (n=Eta) (default) +# 1, 'n_t' : Vertical surface velocity +# 2, 'n_tt' : Vertical surface acceleration +# 3, 'n_x' : Surface slope in x-direction +# 4, 'n_y' : Surface slope in y-direction +# 5, 'n_xx' : Surface curvature in x-direction +# 6, 'n_yy' : Surface curvature in y-direction +# 7, 'n_xy' : Surface curvature in xy-direction +# 8, 'P' : Pressure fluctuation about static MWL pressure +# 9, 'U' : Water particle velocity in x-direction +# 10, 'V' : Water particle velocity in y-direction +# 11, 'W' : Water particle velocity in z-direction +# 12, 'U_t' : Water particle acceleration in x-direction +# 13, 'V_t' : Water particle acceleration in y-direction +# 14, 'W_t' : Water particle acceleration in z-direction +# 15, 'X_p' : Water particle displacement in x-direction from its mean position +# 16, 'Y_p' : Water particle displacement in y-direction from its mean position +# 17, 'Z_p' : Water particle displacement in z-direction from its mean position +# h : real scalar +# water depth (default inf) +# g : real scalar +# acceleration of gravity (default 9.81 m/s**2) +# rho : real scalar +# water density (default 1028 kg/m**3) +# bet : 1 or -1 +# 1, theta given in terms of directions toward which waves travel (default) +# -1, theta given in terms of directions from which waves come +# igam : 1,2 or 3 +# 1, if z is measured positive upward from mean water level (default) +# 2, if z is measured positive downward from mean water level +# 3, if z is measured positive upward from sea floor +# thetax, thetay : real scalars +# angle in degrees clockwise from true north to positive x-axis and +# positive y-axis, respectively. (default theatx=90, thetay=0) +# +# Example +# ------- +# >>> import pylab as plt +# >>> N=50; f0=0.1; th0=0; h=50; w0 = 2*pi*f0 +# >>> t = np.linspace(0,15,N) +# >>> eta0 = np.exp(-1j*w0*t) +# >>> stypes = ['n', 'n_x', 'n_y']; +# >>> tf = TransferFunction(pos=(0, 0, 0), h=50) +# >>> vals = [] +# >>> fh = plt.plot(t, eta0.real, 'r.') +# >>> plt.hold(True) +# >>> for i,stype in enumerate(stypes): +# ... tf.sensortype = stype +# ... Hw, Gwt = tf.tran(w0,th0) +# ... vals.append((Hw*Gwt*eta0).real.ravel()) +# ... vals[i] +# ... fh = plt.plot(t, vals[i]) +# >>> plt.show() +# +# +# See also +# -------- +# dat2dspec, sensor_type, sensor_typeid +# +# Reference +# --------- +# Young I.R. 
(1994) +# "On the measurement of directional spectra", +# Applied Ocean Research, Vol 16, pp 283-294 +# ''' +# def __init__(self, pos=(0, 0, 0), sensortype='n', h=inf, g=9.81, rho=1028, +# bet=1, igam=1, thetax=90, thetay=0): +# self.pos = pos +# self.sensortype = sensortype if isinstance(sensortype, str) else sensor_type(sensortype) +# self.h = h +# self.g = g +# self.rho = rho +# self.bet = bet +# self.igam = igam +# self.thetax = thetax +# self.thetay = thetay +# self._tran_dict = dict(n=self._n, n_t=self._n_t, n_tt=self._n_tt, +# n_x=self._n_x, n_y=self._n_y, n_xx=self._n_xx, +# n_yy=self._n_yy, n_xy=self._n_xy, +# P=self._p, p=self._p, +# U=self._u, u=self._u, +# V=self._v, v=self._v, +# W=self._w, w=self._w, +# U_t=self._u_t, u_t=self._u_t, +# V_t=self._v_t, v_t=self._v_t, +# W_t=self._w_t, w_t=self._w_t, +# X_p=self._x_p, x_p=self._x_p, +# Y_p=self._y_p, y_p=self._y_p, +# Z_p=self._z_p, z_p=self._z_p) +# +# def tran(self, w, theta=0, kw=None): +# ''' +# Return transfer functions based on linear wave theory +# of the system with input surface elevation, +# eta(x0,y0,t) = exp(i*(kx*x0+ky*y0-w*t)), +# and output, +# Y = Hw*Gwt*eta, determined by sensortype and position of sensor. +# +# Parameters +# ---------- +# w : array-like +# vector of angular frequencies in Rad/sec. Length Nf +# theta : array-like +# vector of directions in radians Length Nt (default 0) +# ( theta = 0 -> positive x axis theta = pi/2 -> positive y axis) +# kw : array-like +# vector of wave numbers corresponding to angular frequencies, w. Length Nf +# (default calculated with w2k) +# +# Returns +# ------- +# Hw = transfer function of frequency only (not direction) size 1 x Nf +# Gwt = transfer function of frequency and direction size Nt x Nf +# +# The complete transfer function Hwt = Hw*Gwt is a function of +# w (columns) and theta (rows) size Nt x Nf +# ''' +# if kw is None: +# kw, unusedkw2 = w2k(w, 0, self.h) #wave number as function of angular frequency +# +# w, theta, kw = np.atleast_1d(w, theta, kw) +# make sure they have the correct orientation +# theta.shape = (-1, 1) +# kw.shape = (-1,) +# w.shape = (-1,) +# +# tran_fun = self._tran_dict[self.sensortype] +# Hw, Gwt = tran_fun(w, theta, kw) +# +# New call to avoid singularities. pab 07.11.2000 +# Set Hw to 0 for expressions w*hyperbolic_ratio(z*k,h*k,1,-1)= 0*inf +# ind = np.flatnonzero(1 - np.isfinite(Hw)) +# Hw.flat[ind] = 0 +# +# sgn = np.sign(Hw); +# k0 = np.flatnonzero(sgn < 0) +# if len(k0): # make sure Hw>=0 ie. transfer negative signs to Gwt +# Gwt[:, k0] = -Gwt[:, k0] +# Hw[:, k0] = -Hw[:, k0] +# +# if self.igam == 2: +# pab 09 Oct.2002: bug fix +# Changing igam by 2 should affect the directional result in the same way that changing eta by -eta! 
+# Gwt = -Gwt +# return Hw, Gwt +# __call__ = tran +# ---Private member methods +# def _get_ee_cthxy(self, theta, kw): +# convert from angle in degrees to radians +# bet = self.bet +# thxr = self.thetax * pi / 180 +# thyr = self.thetay * pi / 180 +# +# cthx = bet * cos(theta - thxr + pi / 2) +# cthy = cos(theta-thyr-pi/2) +# cthy = bet * sin(theta - thyr) +# +# Compute location complex exponential +# x, y, unused_z = list(self.pos) +# ee = exp((1j * (x * cthx + y * cthy)) * kw) # exp(i*k(w)*(x*cos(theta)+y*sin(theta)) size Nt X Nf +# return ee, cthx, cthy +# +# def _get_zk(self, kw): +# h = self.h +# z = self.pos[2] +# if self.igam == 1: +# zk = kw * (h + z) # z measured positive upward from mean water level (default) +# elif self.igam == 2: +# zk = kw * (h - z) # z measured positive downward from mean water level +# else: +# zk = kw * z # z measured positive upward from sea floor +# return zk +# +# --- Surface elevation --- +# def _n(self, w, theta, kw): +# '''n = Eta = wave profile +# ''' +# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# return np.ones_like(w), ee +# +# ---- Vertical surface velocity and acceleration----- +# def _n_t(self, w, theta, kw): +# ''' n_t = Eta_t ''' +# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# return w, -1j * ee; +# def _n_tt(self, w, theta, kw): +# '''n_tt = Eta_tt''' +# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# return w ** 2, -ee +# +# --- Surface slopes --- +# def _n_x(self, w, theta, kw): +# ''' n_x = Eta_x = x-slope''' +# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# return kw, 1j * cthx * ee +# def _n_y(self, w, theta, kw): +# ''' n_y = Eta_y = y-slope''' +# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) +# return kw, 1j * cthy * ee +# +# --- Surface curvatures --- +# def _n_xx(self, w, theta, kw): +# ''' n_xx = Eta_xx = Surface curvature (x-dir)''' +# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# return kw ** 2, -(cthx ** 2) * ee +# def _n_yy(self, w, theta, kw): +# ''' n_yy = Eta_yy = Surface curvature (y-dir)''' +# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) +# return kw ** 2, -cthy ** 2 * ee +# def _n_xy(self, w, theta, kw): +# ''' n_xy = Eta_xy = Surface curvature (xy-dir)''' +# ee, cthx, cthy = self._get_ee_cthxy(theta, kw) +# return kw ** 2, -cthx * cthy * ee +# +# --- Pressure--- +# def _p(self, w, theta, kw): +# ''' pressure fluctuations''' +# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return self.rho * self.g * hyperbolic_ratio(zk, hk, 1, 1), ee #hyperbolic_ratio = cosh(zk)/cosh(hk) +# +# ---- Water particle velocities --- +# def _u(self, w, theta, kw): +# ''' U = x-velocity''' +# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return w * hyperbolic_ratio(zk, hk, 1, -1), cthx * ee# w*cosh(zk)/sinh(hk), cos(theta)*ee +# def _v(self, w, theta, kw): +# '''V = y-velocity''' +# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return w * hyperbolic_ratio(zk, hk, 1, -1), cthy * ee # w*cosh(zk)/sinh(hk), sin(theta)*ee +# def _w(self, w, theta, kw): +# ''' W = z-velocity''' +# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return w * hyperbolic_ratio(zk, hk, -1, -1), -1j * ee # w*sinh(zk)/sinh(hk), -? 
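+#
+# Illustrative sketch (the helper name and sample values below are
+# hypothetical, not from the original code): the pressure and particle
+# kernels above lean on hyperbolic_ratio because cosh(a)/sinh(b) can be
+# evaluated in the overflow-safe form exp(a-b)*(1+exp(-2a))/(1-exp(-2b)),
+# so large depth*wavenumber arguments never form a huge cosh or sinh.
+#
+# import numpy as np
+# def cosh_over_sinh(a, b):
+#     # cosh(a)/sinh(b) without forming the (possibly huge) cosh/sinh values
+#     a, b = np.asarray(a, float), np.asarray(b, float)
+#     return np.exp(a - b) * (1.0 + np.exp(-2.0 * a)) / (1.0 - np.exp(-2.0 * b))
+#
+# cosh_over_sinh(3.0, 4.0)      # ~0.3689, agrees with np.cosh(3)/np.sinh(4)
+# cosh_over_sinh(800.0, 810.0)  # ~4.5e-5, while np.cosh(800) overflows to inf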
+# +# ---- Water particle acceleration --- +# def _u_t(self, w, theta, kw): +# ''' U_t = x-acceleration''' +# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return (w ** 2) * hyperbolic_ratio(zk, hk, 1, -1), -1j * cthx * ee # w^2*cosh(zk)/sinh(hk), ? +# +# def _v_t(self, w, theta, kw): +# ''' V_t = y-acceleration''' +# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return (w ** 2) * hyperbolic_ratio(zk, hk, 1, -1), -1j * cthy * ee # w^2*cosh(zk)/sinh(hk), ? +# def _w_t(self, w, theta, kw): +# ''' W_t = z-acceleration''' +# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return (w ** 2) * hyperbolic_ratio(zk, hk, -1, -1), -ee # w*sinh(zk)/sinh(hk), ? +# +# ---- Water particle displacement --- +# def _x_p(self, w, theta, kw): +# ''' X_p = x-displacement''' +# ee, cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return hyperbolic_ratio(zk, hk, 1, -1), 1j * cthx * ee # cosh(zk)./sinh(hk), ? +# def _y_p(self, w, theta, kw): +# ''' Y_p = y-displacement''' +# ee, unused_cthx, cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return hyperbolic_ratio(zk, hk, 1, -1), 1j * cthy * ee # cosh(zk)./sinh(hk), ? +# def _z_p(self, w, theta, kw): +# ''' Z_p = z-displacement''' +# ee, unused_cthx, unused_cthy = self._get_ee_cthxy(theta, kw) +# hk = kw * self.h +# zk = self._get_zk(kw) +# return hyperbolic_ratio(zk, hk, -1, -1), ee # sinh(zk)./sinh(hk), ee +# +# def wave_pressure(z, Hm0, h=10000, g=9.81, rho=1028): +# ''' +# Calculate pressure amplitude due to water waves. +# +# Parameters +# ---------- +# z : array-like +# depth where pressure is calculated [m] +# Hm0 : array-like +# significant wave height (same as the average of the 1/3'rd highest +# waves in a seastate. [m] +# h : real scalar +# waterdepth (default 10000 [m]) +# g : real scalar +# acceleration of gravity (default 9.81 m/s**2) +# rho : real scalar +# water density (default 1028 kg/m**3) +# +# +# Returns +# ------- +# p : ndarray +# pressure amplitude due to water waves at water depth z. [Pa] +# +# PRESSURE calculate pressure amplitude due to water waves according to +# linear theory. 
+# +# Example +# ----- +# >>> import pylab as plt +# >>> z = -np.linspace(10,20) +# >>> fh = plt.plot(z, wave_pressure(z, Hm0=1, h=20)) +# >>> plt.show() +# +# See also +# -------- +# w2k +# +# +# u = psweep.Fn*sqrt(mgf.length*9.81) +# z = -10; h = inf; +# Hm0 = 1.5;Tp = 4*sqrt(Hm0); +# S = jonswap([],[Hm0,Tp]); +# Hw = tran(S.w,0,[0 0 -z],'P',h) +# Sm = S; +# Sm.S = Hw.'.*S.S; +# x1 = spec2sdat(Sm,1000); +# pwave = pressure(z,Hm0,h) +# +# plot(psweep.x{1}/u, psweep.f) +# hold on +# plot(x1(1:100,1)-30,x1(1:100,2),'r') +# ''' +# +# +# Assume seastate with jonswap spectrum: +# +# Tp = 4 * np.sqrt(Hm0) +# gam = jonswap_peakfact(Hm0, Tp) +# Tm02 = Tp / (1.30301 - 0.01698 * gam + 0.12102 / gam) +# w = 2 * np.pi / Tm02 +# kw, unused_kw2 = w2k(w, 0, h) +# +# hk = kw * h +# zk1 = kw * z +# zk = hk + zk1 # z measured positive upward from mean water level (default) +# zk = hk-zk1; % z measured positive downward from mean water level +# zk1 = -zk1; +# zk = zk1; % z measured positive upward from sea floor +# +# cosh(zk)/cosh(hk) approx exp(zk) for large h +# hyperbolic_ratio(zk,hk,1,1) = cosh(zk)/cosh(hk) +# pr = np.where(np.pi < hk, np.exp(zk1), hyperbolic_ratio(zk, hk, 1, 1)) +# pr = hyperbolic_ratio(zk, hk, 1, 1) +# pressure = (rho * g * Hm0 / 2) * pr +# +## pos = [np.zeros_like(z),np.zeros_like(z),z] +## tf = TransferFunction(pos=pos, sensortype='p', h=h, rho=rho, g=g) +## Hw, Gwt = tf.tran(w,0) +## pressure2 = np.abs(Hw) * Hm0 / 2 +# +# return pressure + + +def main(): + import wafo + ts = wafo.objects.mat2timeseries(wafo.data.sea()) + tp = ts.turning_points() + mm = tp.cycle_pairs() + lc = mm.level_crossings() + lc.plot() + T = ts.wave_periods(vh=0.0, pdef='c2c') # @UnusedVariable + + # main() + import wafo.spectrum.models as sm + Sj = sm.Jonswap() + S = Sj.tospecdata() + + R = S.tocovdata() + x = R.sim(ns=1000, dt=0.2) # @UnusedVariable + S.characteristic(['hm0', 'tm02']) + ns = 1000 + dt = .2 + x1 = S.sim(ns, dt=dt) + + ts = TimeSeries(x1[:, 1], x1[:, 0]) + tp = ts.turning_points(0.0) + + x = np.arange(-2, 2, 0.2) + + # Plot 2 objects in one call + d2 = PlotData(np.sin(x), x, xlab='x', ylab='sin', title='sinus') + + d0 = d2.copy() + d0.data = d0.data * 0.9 + d1 = d2.copy() + d1.data = d1.data * 1.2 + d1.children = [d0] + d2.children = [d1] + + d2.plot() + print 'Done' + + +def test_docstrings(): + import doctest + doctest.testmod() + +if __name__ == '__main__': + test_docstrings() +# test_levelcrossings_extrapolate() +# if True: #False : # + +# import doctest +# doctest.testmod() +# else: +# main() diff --git a/pywafo/src/wafo/source/rind2007/krobovmod.mod b/pywafo/src/wafo/source/rind2007/krobovmod.mod index 0926ad0..e296e8f 100644 --- a/pywafo/src/wafo/source/rind2007/krobovmod.mod +++ b/pywafo/src/wafo/source/rind2007/krobovmod.mod @@ -1,5 +1,5 @@ -GFORTRAN module version '4' created from intmodule.f on Sat May 05 23:15:41 2012 -MD5:eb0327a40d874f78d04c89aa93e323f2 -- If you edit this, you'll get what you deserve. +GFORTRAN module version '4' created from intmodule.f on Fri Apr 05 14:43:34 2013 +MD5:99db0c86db329df2a1ee0bbf67b9ec99 -- If you edit this, you'll get what you deserve. 
(() () () () () () () () () () () () () () () () () () () () () () () () () () ()) @@ -17,30 +17,30 @@ MD5:eb0327a40d874f78d04c89aa93e323f2 -- If you edit this, you'll get what you de (2 'krobov' 'krobovmod' 'krobov' 1 ((PROCEDURE UNKNOWN-INTENT MODULE-PROC DECL UNKNOWN 0 0 SUBROUTINE GENERIC) (UNKNOWN 0 0 0 UNKNOWN ()) 3 0 (4 5 6 7 8 9 10 11 12) () 0 () () () 0 0) -7 'functn' '' 'functn' 3 ((PROCEDURE UNKNOWN-INTENT UNKNOWN-PROC BODY -UNKNOWN 0 0 DUMMY FUNCTION ALWAYS_EXPLICIT) (REAL 8 0 0 REAL ()) 13 0 ( -14 15) () 7 () () () 0 0) -5 'minvls' '' 'minvls' 3 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 -0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) +4 'ndim' '' 'ndim' 3 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 8 'abseps' '' 'abseps' 3 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +6 'maxvls' '' 'maxvls' 3 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) +9 'releps' '' 'releps' 3 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +11 'finest' '' 'finest' 3 ((VARIABLE OUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 +0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) 12 'inform' '' 'inform' 3 ((VARIABLE OUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) +5 'minvls' '' 'minvls' 3 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 +0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 10 'abserr' '' 'abserr' 3 ((VARIABLE OUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -11 'finest' '' 'finest' 3 ((VARIABLE OUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 -0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -4 'ndim' '' 'ndim' 3 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 -DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) -9 'releps' '' 'releps' 3 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 -DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -6 'maxvls' '' 'maxvls' 3 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 -DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) -14 'n' '' 'n' 13 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) ( -INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) +7 'functn' '' 'functn' 3 ((PROCEDURE UNKNOWN-INTENT UNKNOWN-PROC BODY +UNKNOWN 0 0 DUMMY FUNCTION ALWAYS_EXPLICIT) (REAL 8 0 0 REAL ()) 13 0 ( +14 15) () 7 () () () 0 0) 15 'z' '' 'z' 13 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DIMENSION DUMMY) (REAL 8 0 0 REAL ()) 0 0 () (1 ASSUMED_SHAPE (CONSTANT (INTEGER 4 0 0 INTEGER ()) 0 '1') ()) 0 () () () 0 0) +14 'n' '' 'n' 13 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) ( +INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) ) ('krobov' 0 2) diff --git a/pywafo/src/wafo/source/rind2007/rcrudemod.mod b/pywafo/src/wafo/source/rind2007/rcrudemod.mod index 6dea54c..6b37a6b 100644 --- a/pywafo/src/wafo/source/rind2007/rcrudemod.mod +++ b/pywafo/src/wafo/source/rind2007/rcrudemod.mod @@ -1,5 +1,5 @@ -GFORTRAN module version '4' created from intmodule.f on Sat May 05 23:15:40 2012 -MD5:f628260304c0d5215e1ef95941599430 -- If you edit this, you'll get what you deserve. +GFORTRAN module version '4' created from intmodule.f on Fri Apr 05 14:43:34 2013 +MD5:c88c5a15c480306fb971bd1e5ced587e -- If you edit this, you'll get what you deserve. 
(() () () () () () () () () () () () () () () () () () () () () () () () () () ()) @@ -17,6 +17,14 @@ MD5:f628260304c0d5215e1ef95941599430 -- If you edit this, you'll get what you de (2 'ranmc' 'rcrudemod' 'ranmc' 1 ((PROCEDURE UNKNOWN-INTENT MODULE-PROC DECL UNKNOWN 0 0 SUBROUTINE GENERIC) (UNKNOWN 0 0 0 UNKNOWN ()) 3 0 (4 5 6 7 8 9 10 11) () 0 () () () 0 0) +8 'releps' '' 'releps' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN +UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +9 'error' '' 'error' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN +UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +10 'value' '' 'value' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN +UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +11 'inform' '' 'inform' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN +UNKNOWN 0 0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 4 'n' '' 'n' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 5 'maxpts' '' 'maxpts' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN @@ -26,14 +34,6 @@ UNKNOWN 0 0 DUMMY FUNCTION ALWAYS_EXPLICIT) (REAL 8 0 0 REAL ()) 12 0 ( 13 14) () 6 () () () 0 0) 7 'abseps' '' 'abseps' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -8 'releps' '' 'releps' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN -UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -9 'error' '' 'error' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN -UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -10 'value' '' 'value' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN -UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -11 'inform' '' 'inform' 3 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN -UNKNOWN 0 0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 13 'n' '' 'n' 12 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) ( INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 14 'z' '' 'z' 12 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 diff --git a/pywafo/src/wafo/source/rind2007/rind71mod.mod b/pywafo/src/wafo/source/rind2007/rind71mod.mod index b633931..8149535 100644 --- a/pywafo/src/wafo/source/rind2007/rind71mod.mod +++ b/pywafo/src/wafo/source/rind2007/rind71mod.mod @@ -1,5 +1,5 @@ -GFORTRAN module version '4' created from rind71mod.f on Mon Feb 18 02:58:35 2013 -MD5:520dd65f929350d1434842f22f38b888 -- If you edit this, you'll get what you deserve. +GFORTRAN module version '4' created from rind71mod.f on Fri Apr 05 14:43:37 2013 +MD5:c5460e9301460ce17aef8031cd82ad57 -- If you edit this, you'll get what you deserve. 
(() () () () () () () () () () () () () () () () () () () () () () () () () () ()) @@ -27,16 +27,26 @@ UNKNOWN ()) 10 0 (11 12 13 14 15 16 17 18) () 0 () () () 0 0) 4 'setdata' 'rind71mod' 'setdata' 1 ((PROCEDURE UNKNOWN-INTENT MODULE-PROC DECL UNKNOWN 0 0 SUBROUTINE GENERIC) (UNKNOWN 0 0 0 UNKNOWN ()) 19 0 (20 21 22 23 24 25 26 27 28) () 0 () () () 0 0) -25 'dnit' '' 'dnit' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +27 'dnint' '' 'dnint' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) +28 'dxsplt' '' 'dxsplt' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 +0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +20 'method' '' 'method' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 +0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) +23 'dreps' '' 'dreps' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) 24 'deps2' '' 'deps2' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +25 'dnit' '' 'dnit' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 26 'dxc' '' 'dxc' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -27 'dnint' '' 'dnint' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +21 'scale' '' 'scale' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +22 'depss' '' 'depss' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 +DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +9 'speed' '' 'speed' 8 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) -28 'dxsplt' '' 'dxsplt' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 -0 DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) 7 'array' '' 'array' 6 ((VARIABLE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DIMENSION DUMMY) (REAL 8 0 0 REAL ()) 0 0 () (2 ASSUMED_SHAPE (CONSTANT (INTEGER 4 0 0 INTEGER ()) 0 '1') () (CONSTANT ( @@ -68,16 +78,6 @@ DIMENSION DUMMY) (REAL 8 0 0 REAL ()) 0 0 () (2 ASSUMED_SHAPE (CONSTANT DIMENSION DUMMY) (REAL 8 0 0 REAL ()) 0 0 () (2 ASSUMED_SHAPE (CONSTANT (INTEGER 4 0 0 INTEGER ()) 0 '1') () (CONSTANT (INTEGER 4 0 0 INTEGER ()) 0 '1') ()) 0 () () () 0 0) -9 'speed' '' 'speed' 8 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 -DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) -20 'method' '' 'method' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 -0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) -21 'scale' '' 'scale' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 -DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -22 'depss' '' 'depss' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 -DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -23 'dreps' '' 'dreps' 19 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 -DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) ) ('echo' 0 2 'initdata' 0 3 'rind71' 0 5 'setdata' 0 4) diff --git a/pywafo/src/wafo/source/rind2007/rindmod.mod b/pywafo/src/wafo/source/rind2007/rindmod.mod index 8522320..5d8b22c 100644 --- a/pywafo/src/wafo/source/rind2007/rindmod.mod +++ b/pywafo/src/wafo/source/rind2007/rindmod.mod @@ -1,5 +1,5 @@ -GFORTRAN module version '4' created from rindmod.f on Sat May 05 23:15:44 2012 -MD5:27b48943ab247880a4203cf14574fba3 -- If you edit this, you'll get what you deserve. 
+GFORTRAN module version '4' created from rindmod.f on Fri Apr 05 14:43:35 2013 +MD5:dcdbb9dedca21469ecd6ba2a3e2bf880 -- If you edit this, you'll get what you deserve. (() () () () () () () () () () () () () () () () () () () () () () () () () () ()) @@ -52,12 +52,6 @@ UNKNOWN ()) 15 0 (16 17 18 19 20 21 22 23 24 25 26) () 0 () () () 0 0) MODULE-PROC DECL UNKNOWN 0 0 SUBROUTINE GENERIC ALWAYS_EXPLICIT) ( UNKNOWN 0 0 0 UNKNOWN ()) 27 0 (28 29 30 31 32 33 34 35 36 37) () 0 () () () 0 0) -28 'method' '' 'method' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 -0 OPTIONAL DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) -29 'xcscale' '' 'xcscale' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN -0 0 OPTIONAL DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -30 'abseps' '' 'abseps' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 -0 OPTIONAL DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) 31 'releps' '' 'releps' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 OPTIONAL DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) 32 'coveps' '' 'coveps' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 @@ -72,6 +66,12 @@ OPTIONAL DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 0 0 OPTIONAL DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) 37 'nc1c2' '' 'nc1c2' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 OPTIONAL DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) +28 'method' '' 'method' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 +0 OPTIONAL DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) +29 'xcscale' '' 'xcscale' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN +0 0 OPTIONAL DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +30 'abseps' '' 'abseps' 27 ((VARIABLE IN UNKNOWN-PROC UNKNOWN UNKNOWN 0 +0 OPTIONAL DUMMY) (REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) 16 'vals' '' 'vals' 15 ((VARIABLE OUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DIMENSION DUMMY) (REAL 8 0 0 REAL ()) 0 0 () (1 ASSUMED_SHAPE (CONSTANT (INTEGER 4 0 0 INTEGER ()) 0 '1') ()) 0 () () () 0 0) diff --git a/pywafo/src/wafo/source/rind2007/swapmod.mod b/pywafo/src/wafo/source/rind2007/swapmod.mod index 036cf21..eab5a09 100644 --- a/pywafo/src/wafo/source/rind2007/swapmod.mod +++ b/pywafo/src/wafo/source/rind2007/swapmod.mod @@ -1,5 +1,5 @@ -GFORTRAN module version '4' created from swapmod.f on Sat May 05 23:15:42 2012 -MD5:52275e19413dc7ab9d6082dbb7b7af80 -- If you edit this, you'll get what you deserve. +GFORTRAN module version '4' created from swapmod.f on Fri Apr 05 14:43:34 2013 +MD5:d3f134c81002cd5f6cec09ebff3e336f -- If you edit this, you'll get what you deserve. 
(() () () () () () () () () () () () () () () () () () () () () () () () () () ()) @@ -25,6 +25,12 @@ DECL UNKNOWN 0 0 SUBROUTINE) (UNKNOWN 0 0 0 UNKNOWN ()) 11 0 (12 13) () 0 () () () 0 0) 14 'swapmod' 'swapmod' 'swapmod' 1 ((MODULE UNKNOWN-INTENT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0) (UNKNOWN 0 0 0 UNKNOWN ()) 0 0 () () 0 () () () 0 0) +12 'a' '' 'a' 11 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) +(REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +13 'b' '' 'b' 11 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) +(REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) +9 'a' '' 'a' 8 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) +(INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 10 'b' '' 'b' 8 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) 6 'a' '' 'a' 5 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) @@ -33,12 +39,6 @@ UNKNOWN UNKNOWN 0 0) (UNKNOWN 0 0 0 UNKNOWN ()) 0 0 () () 0 () () () 0 0) 7 'b' '' 'b' 5 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) (CHARACTER 1 0 0 CHARACTER ((CONSTANT (INTEGER 4 0 0 INTEGER ()) 0 '1'))) 0 0 () () 0 () () () 0 0) -12 'a' '' 'a' 11 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) -(REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -13 'b' '' 'b' 11 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) -(REAL 8 0 0 REAL ()) 0 0 () () 0 () () () 0 0) -9 'a' '' 'a' 8 ((VARIABLE INOUT UNKNOWN-PROC UNKNOWN UNKNOWN 0 0 DUMMY) -(INTEGER 4 0 0 INTEGER ()) 0 0 () () 0 () () () 0 0) ) ('swap_c' 0 2 'swap_i' 0 3 'swap_r' 0 4 'swapmod' 0 14) diff --git a/pywafo/src/wafo/stats/_binned_statistic.py b/pywafo/src/wafo/stats/_binned_statistic.py new file mode 100644 index 0000000..cae5103 --- /dev/null +++ b/pywafo/src/wafo/stats/_binned_statistic.py @@ -0,0 +1,402 @@ +from __future__ import division, print_function, absolute_import + +import warnings + +import numpy as np +from scipy.lib.six import callable + + +def binned_statistic(x, values, statistic='mean', + bins=10, range=None): + """ + Compute a binned statistic for a set of data. + + This is a generalization of a histogram function. A histogram divides + the space into bins, and returns the count of the number of points in + each bin. This function allows the computation of the sum, mean, median, + or other statistic of the values within each bin. + + .. versionadded:: 0.11.0 + + Parameters + ---------- + x : array_like + A sequence of values to be binned. + values : array_like + The values on which the statistic will be computed. This must be + the same shape as `x`. + statistic : string or callable, optional + The statistic to compute (default is 'mean'). + The following statistics are available: + + * 'mean' : compute the mean of values for points within each bin. + Empty bins will be represented by NaN. + * 'median' : compute the median of values for points within each + bin. Empty bins will be represented by NaN. + * 'count' : compute the count of points within each bin. This is + identical to an unweighted histogram. `values` array is not + referenced. + * 'sum' : compute the sum of values for points within each bin. + This is identical to a weighted histogram. + * function : a user-defined function which takes a 1D array of + values, and outputs a single numerical statistic. This function + will be called on the values in each bin. Empty bins will be + represented by function([]), or NaN if this returns an error. 
+ + bins : int or sequence of scalars, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a sequence, + it defines the bin edges, including the rightmost edge, allowing + for non-uniform bin widths. + range : (float, float) or [(float, float)], optional + The lower and upper range of the bins. If not provided, range + is simply ``(x.min(), x.max())``. Values outside the range are + ignored. + + Returns + ------- + statistic : array + The values of the selected statistic in each bin. + bin_edges : array of dtype float + Return the bin edges ``(length(statistic)+1)``. + binnumber : 1-D ndarray of ints + This assigns to each observation an integer that represents the bin + in which this observation falls. Array has the same length as values. + + See Also + -------- + numpy.histogram, binned_statistic_2d, binned_statistic_dd + + Notes + ----- + All but the last (righthand-most) bin is half-open. In other words, if + `bins` is:: + + [1, 2, 3, 4] + + then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the + second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes* + 4. + + Examples + -------- + >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', + ... bins=3) + (array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), array([1, 2, 1, 2, 3])) + + >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', bins=3) + (array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), array([1, 2, 1, 2, 3])) + + """ + try: + N = len(bins) + except TypeError: + N = 1 + + if N != 1: + bins = [np.asarray(bins, float)] + + if range is not None: + if len(range) == 2: + range = [range] + + medians, edges, xy = binned_statistic_dd([x], values, statistic, + bins, range) + + return medians, edges[0], xy + + +def binned_statistic_2d(x, y, values, statistic='mean', + bins=10, range=None): + """ + Compute a bidimensional binned statistic for a set of data. + + This is a generalization of a histogram2d function. A histogram divides + the space into bins, and returns the count of the number of points in + each bin. This function allows the computation of the sum, mean, median, + or other statistic of the values within each bin. + + .. versionadded:: 0.11.0 + + Parameters + ---------- + x : (N,) array_like + A sequence of values to be binned along the first dimension. + y : (M,) array_like + A sequence of values to be binned along the second dimension. + values : (N,) array_like + The values on which the statistic will be computed. This must be + the same shape as `x`. + statistic : string or callable, optional + The statistic to compute (default is 'mean'). + The following statistics are available: + + * 'mean' : compute the mean of values for points within each bin. + Empty bins will be represented by NaN. + * 'median' : compute the median of values for points within each + bin. Empty bins will be represented by NaN. + * 'count' : compute the count of points within each bin. This is + identical to an unweighted histogram. `values` array is not + referenced. + * 'sum' : compute the sum of values for points within each bin. + This is identical to a weighted histogram. + * function : a user-defined function which takes a 1D array of + values, and outputs a single numerical statistic. This function + will be called on the values in each bin. Empty bins will be + represented by function([]), or NaN if this returns an error. 
+ + bins : int or [int, int] or array-like or [array, array], optional + The bin specification: + + * the number of bins for the two dimensions (nx=ny=bins), + * the number of bins in each dimension (nx, ny = bins), + * the bin edges for the two dimensions (x_edges = y_edges = bins), + * the bin edges in each dimension (x_edges, y_edges = bins). + + range : (2,2) array_like, optional + The leftmost and rightmost edges of the bins along each dimension + (if not specified explicitly in the `bins` parameters): + [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be + considered outliers and not tallied in the histogram. + + Returns + ------- + statistic : (nx, ny) ndarray + The values of the selected statistic in each two-dimensional bin + xedges : (nx + 1) ndarray + The bin edges along the first dimension. + yedges : (ny + 1) ndarray + The bin edges along the second dimension. + binnumber : 1-D ndarray of ints + This assigns to each observation an integer that represents the bin + in which this observation falls. Array has the same length as `values`. + + See Also + -------- + numpy.histogram2d, binned_statistic, binned_statistic_dd + + """ + + # This code is based on np.histogram2d + try: + N = len(bins) + except TypeError: + N = 1 + + if N != 1 and N != 2: + xedges = yedges = np.asarray(bins, float) + bins = [xedges, yedges] + + medians, edges, xy = binned_statistic_dd([x, y], values, statistic, + bins, range) + + return medians, edges[0], edges[1], xy + + +def binned_statistic_dd(sample, values, statistic='mean', + bins=10, range=None): + """ + Compute a multidimensional binned statistic for a set of data. + + This is a generalization of a histogramdd function. A histogram divides + the space into bins, and returns the count of the number of points in + each bin. This function allows the computation of the sum, mean, median, + or other statistic of the values within each bin. + + .. versionadded:: 0.11.0 + + Parameters + ---------- + sample : array_like + Data to histogram passed as a sequence of D arrays of length N, or + as an (N,D) array. + values : array_like + The values on which the statistic will be computed. This must be + the same shape as x. + statistic : string or callable, optional + The statistic to compute (default is 'mean'). + The following statistics are available: + + * 'mean' : compute the mean of values for points within each bin. + Empty bins will be represented by NaN. + * 'median' : compute the median of values for points within each + bin. Empty bins will be represented by NaN. + * 'count' : compute the count of points within each bin. This is + identical to an unweighted histogram. `values` array is not + referenced. + * 'sum' : compute the sum of values for points within each bin. + This is identical to a weighted histogram. + * function : a user-defined function which takes a 1D array of + values, and outputs a single numerical statistic. This function + will be called on the values in each bin. Empty bins will be + represented by function([]), or NaN if this returns an error. + + bins : sequence or int, optional + The bin specification: + + * A sequence of arrays describing the bin edges along each dimension. + * The number of bins for each dimension (nx, ny, ... =bins) + * The number of bins for all dimensions (nx=ny=...=bins). + + range : sequence, optional + A sequence of lower and upper bin edges to be used if the edges are + not given explicitely in `bins`. Defaults to the minimum and maximum + values along each dimension. 
+ + Returns + ------- + statistic : ndarray, shape(nx1, nx2, nx3,...) + The values of the selected statistic in each two-dimensional bin + edges : list of ndarrays + A list of D arrays describing the (nxi + 1) bin edges for each + dimension + binnumber : 1-D ndarray of ints + This assigns to each observation an integer that represents the bin + in which this observation falls. Array has the same length as values. + + See Also + -------- + np.histogramdd, binned_statistic, binned_statistic_2d + + """ + if type(statistic) == str: + if statistic not in ['mean', 'median', 'count', 'sum', 'std']: + raise ValueError('unrecognized statistic "%s"' % statistic) + elif callable(statistic): + pass + else: + raise ValueError("statistic not understood") + + # This code is based on np.histogramdd + try: + # Sample is an ND-array. + N, D = sample.shape + except (AttributeError, ValueError): + # Sample is a sequence of 1D arrays. + sample = np.atleast_2d(sample).T + N, D = sample.shape + + nbin = np.empty(D, int) + edges = D * [None] + dedges = D * [None] + + try: + M = len(bins) + if M != D: + raise AttributeError('The dimension of bins must be equal ' + 'to the dimension of the sample x.') + except TypeError: + bins = D * [bins] + + # Select range for each dimension + # Used only if number of bins is given. + if range is None: + smin = np.atleast_1d(np.array(sample.min(0), float)) + smax = np.atleast_1d(np.array(sample.max(0), float)) + else: + smin = np.zeros(D) + smax = np.zeros(D) + for i in np.arange(D): + smin[i], smax[i] = range[i] + + # Make sure the bins have a finite width. + for i in np.arange(len(smin)): + if smin[i] == smax[i]: + smin[i] = smin[i] - .5 + smax[i] = smax[i] + .5 + + # Create edge arrays + for i in np.arange(D): + if np.isscalar(bins[i]): + nbin[i] = bins[i] + 2 # +2 for outlier bins + edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1) + else: + edges[i] = np.asarray(bins[i], float) + nbin[i] = len(edges[i]) + 1 # +1 for outlier bins + dedges[i] = np.diff(edges[i]) + + nbin = np.asarray(nbin) + + # Compute the bin number each sample falls into. + Ncount = {} + for i in np.arange(D): + Ncount[i] = np.digitize(sample[:, i], edges[i]) + + # Using digitize, values that fall on an edge are put in the right bin. + # For the rightmost bin, we want values equal to the right + # edge to be counted in the last bin, and not as an outlier. + for i in np.arange(D): + # Rounding precision + decimal = int(-np.log10(dedges[i].min())) + 6 + # Find which points are on the rightmost edge. + on_edge = np.where(np.around(sample[:, i], decimal) + == np.around(edges[i][-1], decimal))[0] + # Shift these points one bin to the left. + Ncount[i][on_edge] -= 1 + + # Compute the sample indices in the flattened statistic matrix. 
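+    # Each sample's per-dimension bin numbers (in Ncount) are folded into a
+    # single linear index `xy`, with the dimensions processed in the order
+    # given by nbin.argsort(); this lets np.bincount accumulate the statistic
+    # per flat bin before the result is reshaped back onto the D-dim grid.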
+ ni = nbin.argsort() + xy = np.zeros(N, int) + for i in np.arange(0, D - 1): + xy += Ncount[ni[i]] * nbin[ni[i + 1:]].prod() + xy += Ncount[ni[-1]] + + result = np.empty(nbin.prod(), float) + + if statistic == 'mean': + result.fill(np.nan) + flatcount = np.bincount(xy, None) + flatsum = np.bincount(xy, values) + a = flatcount.nonzero() + result[a] = flatsum[a] / flatcount[a] + elif statistic == 'std': + result.fill(0) + flatcount = np.bincount(xy, None) + flatsum = np.bincount(xy, values) + flatsum2 = np.bincount(xy, values ** 2) + a = flatcount.nonzero() + result[a] = np.sqrt(flatsum2[a] / flatcount[a] + - (flatsum[a] / flatcount[a]) ** 2) + elif statistic == 'count': + result.fill(0) + flatcount = np.bincount(xy, None) + a = np.arange(len(flatcount)) + result[a] = flatcount + elif statistic == 'sum': + result.fill(0) + flatsum = np.bincount(xy, values) + a = np.arange(len(flatsum)) + result[a] = flatsum + elif statistic == 'median': + result.fill(np.nan) + for i in np.unique(xy): + result[i] = np.median(values[xy == i]) + elif callable(statistic): + with warnings.catch_warnings(): + # Numpy generates a warnings for mean/std/... with empty list + warnings.filterwarnings('ignore', category=RuntimeWarning) + old = np.seterr(invalid='ignore') + try: + null = statistic([]) + except: + null = np.nan + np.seterr(**old) + result.fill(null) + for i in np.unique(xy): + result[i] = statistic(values[xy == i]) + + # Shape into a proper matrix + result = result.reshape(np.sort(nbin)) + for i in np.arange(nbin.size): + j = ni.argsort()[i] + result = result.swapaxes(i, j) + ni[i], ni[j] = ni[j], ni[i] + + # Remove outliers (indices 0 and -1 for each dimension). + core = D * [slice(1, -1)] + result = result[core] + + if (result.shape != nbin - 2).any(): + raise RuntimeError('Internal Shape Error') + + return result, edges, xy diff --git a/pywafo/src/wafo/stats/_constants.py b/pywafo/src/wafo/stats/_constants.py new file mode 100644 index 0000000..4b5048f --- /dev/null +++ b/pywafo/src/wafo/stats/_constants.py @@ -0,0 +1,24 @@ +""" +Statistics-related constants. + +""" +from __future__ import division, print_function, absolute_import + +import numpy as np + + +# The smallest representable positive number such that 1.0 + _EPS != 1.0. +_EPS = np.finfo(float).eps + +# The largest [in magnitude] usable floating value. +_XMAX = np.finfo(float).machar.xmax + +# The smallest [in magnitude] usable floating value. 
+_XMIN = np.finfo(float).machar.xmin + +# -special.psi(1) +_EULER = 0.577215664901532860606512090082402431042 + +# special.zeta(3, 1) Apery's constant +_ZETA3 = 1.202056903159594285399738161511449990765 + diff --git a/pywafo/src/wafo/stats/_continuous_distns.py b/pywafo/src/wafo/stats/_continuous_distns.py new file mode 100644 index 0000000..58dd2c1 --- /dev/null +++ b/pywafo/src/wafo/stats/_continuous_distns.py @@ -0,0 +1,4654 @@ +# +# Author: Travis Oliphant 2002-2011 with contributions from +# SciPy Developers 2004-2011 +# +from __future__ import division, print_function, absolute_import + +import warnings + +from scipy.misc import comb # @UnresolvedImport +from scipy.misc.doccer import inherit_docstring_from +from scipy import special +from scipy import optimize +from scipy import integrate +from scipy.special import (gammaln as gamln, gamma as gam, log1p) + +from numpy import (where, arange, putmask, ravel, sum, shape, + log, sqrt, exp, arctanh, tan, sin, arcsin, arctan, + tanh, cos, cosh, sinh, expm1) + +from numpy import polyval, place, extract, any, asarray, nan, inf, pi + +import numpy as np +import numpy.random as mtrand +try: + from scipy.stats.distributions import vonmises_cython +except: + vonmises_cython = None +# try: +# from scipy.stats._tukeylambda_stats import \ +# tukeylambda_variance as _tlvar, \ +# tukeylambda_kurtosis as _tlkurt +# except: +# _tlvar = _tlkurt = None +#from . import vonmises_cython +from ._tukeylambda_stats import (tukeylambda_variance as _tlvar, + tukeylambda_kurtosis as _tlkurt) + +from ._distn_infrastructure import ( + rv_continuous, valarray, + _skew, _kurtosis, _lazywhere, + _ncx2_log_pdf, _ncx2_pdf, _ncx2_cdf, +) + +from ._constants import _XMIN, _EULER, _ZETA3, _EPS + +#from .estimation import FitDistribution + +__all__ = [ + 'ksone', 'kstwobign', 'norm', 'alpha', 'anglit', 'arcsine', + 'beta', 'betaprime', 'bradford', 'burr', 'fisk', 'cauchy', + 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang', + 'expon', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', + 'f', 'foldnorm', 'frechet_r', 'weibull_min', 'frechet_l', + 'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', + 'gamma', 'gengamma', 'genhalflogistic', 'gompertz', 'gumbel_r', + 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', + 'gausshyper', 'invgamma', 'invgauss', 'invweibull', + 'johnsonsb', 'johnsonsu', 'laplace', 'levy', 'levy_l', + 'levy_stable', 'logistic', 'loggamma', 'loglaplace', 'lognorm', + 'gilbrat', 'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 't', + 'nct', 'pareto', 'lomax', 'pearson3', 'powerlaw', 'powerlognorm', + 'powernorm', 'rdist', 'rayleigh', 'reciprocal', 'rice', + 'truncrayleigh', + 'recipinvgauss', 'semicircular', 'triang', 'truncexpon', + 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'vonmises_line', + 'wald', 'wrapcauchy'] + + +# Kolmogorov-Smirnov one-sided and two-sided test statistics +class ksone_gen(rv_continuous): + + """General Kolmogorov-Smirnov one-sided test. + + %(default)s + + """ + + def _cdf(self, x, n): + return 1.0 - special.smirnov(n, x) + + def _ppf(self, q, n): + return special.smirnovi(n, 1.0 - q) +ksone = ksone_gen(a=0.0, name='ksone') + + +class kstwobign_gen(rv_continuous): + + """Kolmogorov-Smirnov two-sided test for large N. 
+ + %(default)s + + """ + + def _cdf(self, x): + return 1.0 - special.kolmogorov(x) + + def _sf(self, x): + return special.kolmogorov(x) + + def _ppf(self, q): + return special.kolmogi(1.0 - q) +kstwobign = kstwobign_gen(a=0.0, name='kstwobign') + + +# Normal distribution + +# loc = mu, scale = std +# Keep these implementations out of the class definition so they can be reused +# by other distributions. +_norm_pdf_C = np.sqrt(2 * pi) +_norm_pdf_logC = np.log(_norm_pdf_C) + + +def _norm_pdf(x): + return exp(-x ** 2 / 2.0) / _norm_pdf_C + + +def _norm_logpdf(x): + return -x ** 2 / 2.0 - _norm_pdf_logC + + +def _norm_cdf(x): + return special.ndtr(x) + + +def _norm_logcdf(x): + return special.log_ndtr(x) + + +def _norm_ppf(q): + return special.ndtri(q) + + +def _norm_sf(x): + return special.ndtr(-x) + + +def _norm_logsf(x): + return special.log_ndtr(-x) + + +def _norm_isf(q): + return -special.ndtri(q) + + +class norm_gen(rv_continuous): + + """A normal continuous random variable. + + The location (loc) keyword specifies the mean. + The scale (scale) keyword specifies the standard deviation. + + %(before_notes)s + + Notes + ----- + The probability density function for `norm` is:: + + norm.pdf(x) = exp(-x**2/2)/sqrt(2*pi) + + %(example)s + + """ + + def _rvs(self): + return mtrand.standard_normal(self._size) + + def _pdf(self, x): + return _norm_pdf(x) + + def _logpdf(self, x): + return _norm_logpdf(x) + + def _cdf(self, x): + return _norm_cdf(x) + + def _logcdf(self, x): + return _norm_logcdf(x) + + def _sf(self, x): + return _norm_sf(x) + + def _logsf(self, x): + return _norm_logsf(x) + + def _ppf(self, q): + return _norm_ppf(q) + + def _isf(self, q): + return _norm_isf(q) + + def _stats(self): + return 0.0, 1.0, 0.0, 0.0 + + def _entropy(self): + return 0.5 * (log(2 * pi) + 1) + + @inherit_docstring_from(rv_continuous) + def fit(self, data, **kwds): + """%(super)s + This function (norm_gen.fit) uses explicit formulas for the maximum + likelihood estimation of the parameters, so the `optimizer` argument + is ignored. + """ + floc = kwds.get('floc', None) + fscale = kwds.get('fscale', None) + + if floc is not None and fscale is not None: + # This check is for consistency with `rv_continuous.fit`. + # Without this check, this function would just return the + # parameters that were given. + raise ValueError("All parameters fixed. There is nothing to " + "optimize.") + + data = np.asarray(data) + + if floc is None: + loc = data.mean() + else: + loc = floc + + if fscale is None: + scale = np.sqrt(((data - loc) ** 2).mean()) + else: + scale = fscale + + return loc, scale + +norm = norm_gen(name='norm') + + +class alpha_gen(rv_continuous): + + """An alpha continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `alpha` is:: + + alpha.pdf(x, a) = 1/(x**2*Phi(a)*sqrt(2*pi)) * exp(-1/2 * (a-1/x)**2), + + where ``Phi(alpha)`` is the normal CDF, ``x > 0``, and ``a > 0``. + + %(example)s + + """ + + def _pdf(self, x, a): + return 1.0 / (x ** 2) / special.ndtr(a) * _norm_pdf(a - 1.0 / x) + + def _logpdf(self, x, a): + return -2 * log(x) + _norm_logpdf(a - 1.0 / x) - log(special.ndtr(a)) + + def _cdf(self, x, a): + return special.ndtr(a - 1.0 / x) / special.ndtr(a) + + def _ppf(self, q, a): + return 1.0 / asarray(a - special.ndtri(q * special.ndtr(a))) + + def _stats(self, a): + return [inf] * 2 + [nan] * 2 +alpha = alpha_gen(a=0.0, name='alpha') + + +class anglit_gen(rv_continuous): + + """An anglit continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + The probability density function for `anglit` is:: + + anglit.pdf(x) = sin(2*x + pi/2) = cos(2*x), + + for ``-pi/4 <= x <= pi/4``. + + %(example)s + + """ + + def _pdf(self, x): + return cos(2 * x) + + def _cdf(self, x): + return sin(x + pi / 4) ** 2.0 + + def _ppf(self, q): + return (arcsin(sqrt(q)) - pi / 4) + + def _stats(self): + return (0.0, pi * pi / 16 - 0.5, 0.0, + -2 * (pi ** 4 - 96) / (pi * pi - 8) ** 2) + + def _entropy(self): + return 1 - log(2) +anglit = anglit_gen(a=-pi / 4, b=pi / 4, name='anglit') + + +class arcsine_gen(rv_continuous): + + """An arcsine continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `arcsine` is:: + + arcsine.pdf(x) = 1/(pi*sqrt(x*(1-x))) + for 0 < x < 1. + + %(example)s + + """ + + def _pdf(self, x): + return 1.0 / pi / sqrt(x * (1 - x)) + + def _cdf(self, x): + return 2.0 / pi * arcsin(sqrt(x)) + + def _ppf(self, q): + return sin(pi / 2.0 * q) ** 2.0 + + def _stats(self): + mu = 0.5 + mu2 = 1.0 / 8 + g1 = 0 + g2 = -3.0 / 2.0 + return mu, mu2, g1, g2 + + def _entropy(self): + return -0.24156447527049044468 +arcsine = arcsine_gen(a=0.0, b=1.0, name='arcsine') + + +class FitDataError(ValueError): + # This exception is raised by, for example, beta_gen.fit when both floc + # and fscale are fixed and there are values in the data not in the open + # interval (floc, floc+fscale). + + def __init__(self, distr, lower, upper): + self.args = ( + "Invalid values in `data`. Maximum likelihood " + "estimation with {distr!r} requires that {lower!r} < x " + "< {upper!r} for each x in `data`.".format( + distr=distr, lower=lower, upper=upper), + ) + + +class FitSolverError(RuntimeError): + # This exception is raised by, for example, beta_gen.fit when + # optimize.fsolve returns with ier != 1. + + def __init__(self, mesg): + emsg = "Solver for the MLE equations failed to converge: " + emsg += mesg.replace('\n', '') + self.args = (emsg,) + + +def _beta_mle_a(a, b, n, s1): + # The zeros of this function give the MLE for `a`, with + # `b`, `n` and `s1` given. `s1` is the sum of the logs of + # the data. `n` is the number of data points. + psiab = special.psi(a + b) + func = s1 - n * (-psiab + special.psi(a)) + return func + + +def _beta_mle_ab(theta, n, s1, s2): + # Zeros of this function are critical points of + # the maximum likelihood function. Solving this system + # for theta (which contains a and b) gives the MLE for a and b + # given `n`, `s1` and `s2`. `s1` is the sum of the logs of the data, + # and `s2` is the sum of the logs of 1 - data. `n` is the number + # of data points. + a, b = theta + psiab = special.psi(a + b) + func = [s1 - n * (-psiab + special.psi(a)), + s2 - n * (-psiab + special.psi(b))] + return func + + +class beta_gen(rv_continuous): + + """A beta continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `beta` is:: + + beta.pdf(x, a, b) = gamma(a+b)/(gamma(a)*gamma(b)) * x**(a-1) * + (1-x)**(b-1), + + for ``0 < x < 1``, ``a > 0``, ``b > 0``. 
+ + %(example)s + + """ + + def _rvs(self, a, b): + return mtrand.beta(a, b, self._size) + + def _pdf(self, x, a, b): + return np.exp(self._logpdf(x, a, b)) + + def _logpdf(self, x, a, b): + lPx = special.xlog1py(b - 1.0, -x) + special.xlogy(a - 1.0, x) + lPx -= special.betaln(a, b) + return lPx + + def _cdf(self, x, a, b): + return special.btdtr(a, b, x) + + def _ppf(self, q, a, b): + return special.btdtri(a, b, q) + + def _stats(self, a, b): + mn = a * 1.0 / (a + b) + var = (a * b * 1.0) / (a + b + 1.0) / (a + b) ** 2.0 + g1 = 2.0 * (b - a) * sqrt((1.0 + a + b) / (a * b)) / (2 + a + b) + g2 = 6.0 * \ + (a ** 3 + a ** 2 * (1 - 2 * b) + + b ** 2 * (1 + b) - 2 * a * b * (2 + b)) + g2 /= a * b * (a + b + 2) * (a + b + 3) + return mn, var, g1, g2 + + def _fitstart(self, data): + g1 = _skew(data) + g2 = _kurtosis(data) + + def func(x): + a, b = x + sk = 2 * (b - a) * sqrt(a + b + 1) / (a + b + 2) / sqrt(a * b) + ku = a ** 3 - a ** 2 * \ + (2 * b - 1) + b ** 2 * (b + 1) - 2 * a * b * (b + 2) + ku /= a * b * (a + b + 2) * (a + b + 3) + ku *= 6 + return [sk - g1, ku - g2] + a, b = optimize.fsolve(func, (1.0, 1.0)) + return super(beta_gen, self)._fitstart(data, args=(a, b)) + + @inherit_docstring_from(rv_continuous) + def fit(self, data, *args, **kwds): + """%(super)s + In the special case where both `floc` and `fscale` are given, a + `ValueError` is raised if any value `x` in `data` does not satisfy + `floc < x < floc + fscale`. + """ + # Override rv_continuous.fit, so we can more efficiently handle the + # case where floc and fscale are given. + + f0 = kwds.get('f0', None) + f1 = kwds.get('f1', None) + floc = kwds.get('floc', None) + fscale = kwds.get('fscale', None) + + if floc is None or fscale is None: + # do general fit + return super(beta_gen, self).fit(data, *args, **kwds) + + if f0 is not None and f1 is not None: + # This check is for consistency with `rv_continuous.fit`. + raise ValueError("All parameters fixed. There is nothing to " + "optimize.") + + # Special case: loc and scale are constrained, so we are fitting + # just the shape parameters. This can be done much more efficiently + # than the method used in `rv_continuous.fit`. (See the subsection + # "Two unknown parameters" in the section "Maximum likelihood" of + # the Wikipedia article on the Beta distribution for the formulas.) + + # Normalize the data to the interval [0, 1]. + data = (ravel(data) - floc) / fscale + if np.any(data <= 0) or np.any(data >= 1): + raise FitDataError("beta", lower=floc, upper=floc + fscale) + xbar = data.mean() + + if f0 is not None or f1 is not None: + # One of the shape parameters is fixed. + + if f0 is not None: + # The shape parameter a is fixed, so swap the parameters + # and flip the data. We always solve for `a`. The result + # will be swapped back before returning. + b = f0 + data = 1 - data + xbar = 1 - xbar + else: + b = f1 + + # Initial guess for a. Use the formula for the mean of the beta + # distribution, E[x] = a / (a + b), to generate a reasonable + # starting point based on the mean of the data and the given + # value of b. + a = b * xbar / (1 - xbar) + + # Compute the MLE for `a` by solving _beta_mle_a. + theta, _info, ier, mesg = optimize.fsolve( + _beta_mle_a, a, + args=(b, len(data), np.log(data).sum()), + full_output=True + ) + if ier != 1: + raise FitSolverError(mesg=mesg) + a = theta[0] + + if f0 is not None: + # The shape parameter a was fixed, so swap back the + # parameters. + a, b = b, a + + else: + # Neither of the shape parameters is fixed. 
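+            # The system solved by _beta_mle_ab (defined above) is, writing
+            # psi for the digamma function and n for the sample size:
+            #     psi(a) - psi(a + b) = (1/n) * sum(log(x_i))
+            #     psi(b) - psi(a + b) = (1/n) * sum(log(1 - x_i))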
+ + # s1 and s2 are used in the extra arguments passed to _beta_mle_ab + # by optimize.fsolve. + s1 = np.log(data).sum() + s2 = np.log(1 - data).sum() + + # Use the "method of moments" to estimate the initial + # guess for a and b. + fac = xbar * (1 - xbar) / data.var(ddof=0) - 1 + a = xbar * fac + b = (1 - xbar) * fac + + # Compute the MLE for a and b by solving _beta_mle_ab. + theta, _info, ier, mesg = optimize.fsolve( + _beta_mle_ab, [a, b], + args=(len(data), s1, s2), + full_output=True + ) + if ier != 1: + raise FitSolverError(mesg=mesg) + a, b = theta + + return a, b, floc, fscale + +beta = beta_gen(a=0.0, b=1.0, name='beta') + + +class betaprime_gen(rv_continuous): + + """A beta prime continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `betaprime` is:: + + betaprime.pdf(x, a, b) = x**(a-1) * (1+x)**(-a-b) / beta(a, b) + + for ``x > 0``, ``a > 0``, ``b > 0``, where ``beta(a, b)`` is the beta + function (see `scipy.special.beta`). + + %(example)s + + """ + + def _rvs(self, a, b): + u1 = gamma.rvs(a, size=self._size) + u2 = gamma.rvs(b, size=self._size) + return (u1 / u2) + + def _pdf(self, x, a, b): + return np.exp(self._logpdf(x, a, b)) + + def _logpdf(self, x, a, b): + return (special.xlogy(a - 1.0, x) - special.xlog1py(a + b, x) - + special.betaln(a, b)) + + def _cdf_skip(self, x, a, b): + # remove for now: special.hyp2f1 is incorrect for large a + x = where(x == 1.0, 1.0 - 1e-6, x) + return (pow(x, a) * special.hyp2f1(a + b, a, 1 + a, -x) / a / + special.beta(a, b)) + + def _munp(self, n, a, b): + if (n == 1.0): + return where(b > 1, a / (b - 1.0), inf) + elif (n == 2.0): + return where(b > 2, a * (a + 1.0) / ((b - 2.0) * (b - 1.0)), inf) + elif (n == 3.0): + return where( + b > 3, a * + (a + 1.0) * (a + 2.0) / ((b - 3.0) * (b - 2.0) * (b - 1.0)), + inf) + elif (n == 4.0): + return where(b > 4, + a * (a + 1.0) * (a + 2.0) * (a + 3.0) / + ((b - 4.0) * (b - 3.0) * (b - 2.0) * (b - 1.0)), inf) + else: + raise NotImplementedError +betaprime = betaprime_gen(a=0.0, name='betaprime') + + +class bradford_gen(rv_continuous): + + """A Bradford continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `bradford` is:: + + bradford.pdf(x, c) = c / (k * (1+c*x)), + + for ``0 < x < 1``, ``c > 0`` and ``k = log(1+c)``. 
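+
+    The corresponding cumulative distribution function, as implemented in
+    ``_cdf`` below, is::
+
+        bradford.cdf(x, c) = log(1+c*x) / k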
+ + %(example)s + + """ + + def _pdf(self, x, c): + return c / (c * x + 1.0) / log1p(c) + + def _cdf(self, x, c): + return log1p(c * x) / log1p(c) + + def _ppf(self, q, c): + return ((1.0 + c) ** q - 1) / c + + def _stats(self, c, moments='mv'): + k = log1p(c) + mu = (c - k) / (c * k) + mu2 = ((c + 2.0) * k - 2.0 * c) / (2 * c * k * k) + g1 = None + g2 = None + if 's' in moments: + g1 = sqrt(2) * \ + (12 * c * c - 9 * c * k * (c + 2) + + 2 * k * k * (c * (c + 3) + 3)) + g1 /= sqrt(c * (c * (k - 2) + 2 * k)) * (3 * c * (k - 2) + 6 * k) + if 'k' in moments: + g2 = (c ** 3 * (k - 3) * (k * (3 * k - 16) + 24) + + 12 * k * c * c * (k - 4) * (k - 3) + + 6 * c * k * k * (3 * k - 14) + 12 * k ** 3) + g2 /= 3 * c * (c * (k - 2) + 2 * k) ** 2 + return mu, mu2, g1, g2 + + def _entropy(self, c): + k = log1p(c) + return k / 2.0 - log(c / k) + + def _fitstart(self, data): + loc = data.min() - 1e-4 + scale = (data - loc).max() + m = np.mean((data - loc) / scale) + fun = lambda c: (c - log1p(c)) / (c * log1p(c)) - m + res = optimize.root(fun, 0.3) + c = res.x + return c, loc, scale +bradford = bradford_gen(a=0.0, b=1.0, name='bradford') + + +class burr_gen(rv_continuous): + + """A Burr continuous random variable. + + %(before_notes)s + + See Also + -------- + fisk : a special case of `burr` with ``d = 1`` + + Notes + ----- + The probability density function for `burr` is:: + + burr.pdf(x, c, d) = c * d * x**(-c-1) * (1+x**(-c))**(-d-1) + + for ``x > 0``. + + %(example)s + + """ + + def _pdf(self, x, c, d): + return c * d * (x ** (-c - 1.0)) * ((1 + x ** (-c * 1.)) ** (-d - 1.)) + + def _cdf(self, x, c, d): + return (1 + x ** (-c * 1.0)) ** (-d ** 1.0) + + def _ppf(self, q, c, d): + return (q ** (-1.0 / d) - 1) ** (-1.0 / c) + + def _munp(self, n, c, d): + nc = 1. * n / c + return d * special.beta(1.0 - nc, d + nc) +burr = burr_gen(a=0.0, name='burr') + + +class fisk_gen(burr_gen): + + """A Fisk continuous random variable. + + The Fisk distribution is also known as the log-logistic distribution, and + equals the Burr distribution with ``d == 1``. + + %(before_notes)s + + See Also + -------- + burr + + %(example)s + + """ + + def _pdf(self, x, c): + return burr_gen._pdf(self, x, c, 1.0) + + def _cdf(self, x, c): + return burr_gen._cdf(self, x, c, 1.0) + + def _ppf(self, x, c): + return burr_gen._ppf(self, x, c, 1.0) + + def _munp(self, n, c): + return burr_gen._munp(self, n, c, 1.0) + + def _entropy(self, c): + return 2 - log(c) +fisk = fisk_gen(a=0.0, name='fisk') + + +# median = loc +class cauchy_gen(rv_continuous): + + """A Cauchy continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `cauchy` is:: + + cauchy.pdf(x) = 1 / (pi * (1 + x**2)) + + %(example)s + + """ + + def _pdf(self, x): + return 1.0 / pi / (1.0 + x * x) + + def _cdf(self, x): + return 0.5 + 1.0 / pi * arctan(x) + + def _ppf(self, q): + return tan(pi * q - pi / 2.0) + + def _sf(self, x): + return 0.5 - 1.0 / pi * arctan(x) + + def _isf(self, q): + return tan(pi / 2.0 - pi * q) + + def _stats(self): + return inf, inf, nan, nan + + def _entropy(self): + return log(4 * pi) + + def _fitstart(self, data, args=None): + return (0, 1) +cauchy = cauchy_gen(name='cauchy') + + +class chi_gen(rv_continuous): + + """A chi continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `chi` is:: + + chi.pdf(x, df) = x**(df-1) * exp(-x**2/2) / (2**(df/2-1) * gamma(df/2)) + + for ``x > 0``. 
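+
+    If ``Y`` has a chi-squared distribution with ``df`` degrees of freedom,
+    then ``sqrt(Y)`` has a `chi` distribution with the same ``df``; the
+    ``_rvs`` method below draws samples as ``sqrt(chi2.rvs(df))``.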
+ + Special cases of `chi` are: + + - ``chi(1, loc, scale) = `halfnormal` + - ``chi(2, 0, scale) = `rayleigh` + - ``chi(3, 0, scale) : `maxwell` + + %(example)s + + """ + + def _rvs(self, df): + return sqrt(chi2.rvs(df, size=self._size)) + + def _pdf(self, x, df): + return (x ** (df - 1.) * exp(-x * x * 0.5) / (2.0) ** (df * 0.5 - 1) / + gam(df * 0.5)) + + def _cdf(self, x, df): + return special.gammainc(df * 0.5, 0.5 * x * x) + + def _ppf(self, q, df): + return sqrt(2 * special.gammaincinv(df * 0.5, q)) + + def _stats(self, df): + mu = sqrt(2) * special.gamma(df / 2.0 + 0.5) / special.gamma(df / 2.0) + mu2 = df - mu * mu + g1 = (2 * mu ** 3.0 + mu * (1 - 2 * df)) / asarray(np.power(mu2, 1.5)) + g2 = 2 * df * (1.0 - df) - 6 * mu ** 4 + 4 * mu ** 2 * (2 * df - 1) + g2 /= asarray(mu2 ** 2.0) + return mu, mu2, g1, g2 + + def _fitstart(self, data): + m = data.mean() + v = data.var() + # Supply a starting guess with method of moments: + df = max(np.round(v + m ** 2), 1) + return super(chi_gen, self)._fitstart(data, args=(df,)) +chi = chi_gen(a=0.0, name='chi') + + +# Chi-squared (gamma-distributed with loc=0 and scale=2 and shape=df/2) +class chi2_gen(rv_continuous): + + """A chi-squared continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `chi2` is:: + + chi2.pdf(x, df) = 1 / (2*gamma(df/2)) * (x/2)**(df/2-1) * exp(-x/2) + + %(example)s + + """ + + def _rvs(self, df): + return mtrand.chisquare(df, self._size) + + def _pdf(self, x, df): + return exp(self._logpdf(x, df)) + + def _logpdf(self, x, df): + return (special.xlogy(df / 2. - 1, x) - x / 2. - gamln(df / 2.) - + (log(2) * df) / 2.) + + def _cdf(self, x, df): + return special.chdtr(df, x) + + def _sf(self, x, df): + return special.chdtrc(df, x) + + def _isf(self, p, df): + return special.chdtri(df, p) + + def _ppf(self, p, df): + return self._isf(1.0 - p, df) + + def _stats(self, df): + mu = df + mu2 = 2 * df + g1 = 2 * sqrt(2.0 / df) + g2 = 12.0 / df + return mu, mu2, g1, g2 + + def _fitstart(self, data): + m = data.mean() + v = data.var() + # Supply a starting guess with method of moments: + df = max(np.round((m + v / 2) / 2), 1) + return super(chi2_gen, self)._fitstart(data, args=(df,)) +chi2 = chi2_gen(a=0.0, name='chi2') + + +class cosine_gen(rv_continuous): + + """A cosine continuous random variable. + + %(before_notes)s + + Notes + ----- + The cosine distribution is an approximation to the normal distribution. + The probability density function for `cosine` is:: + + cosine.pdf(x) = 1/(2*pi) * (1+cos(x)) + + for ``-pi <= x <= pi``. + + %(example)s + + """ + + def _pdf(self, x): + return 1.0 / 2 / pi * (1 + cos(x)) + + def _cdf(self, x): + return 1.0 / 2 / pi * (pi + x + sin(x)) + + def _stats(self): + return (0.0, pi * pi / 3.0 - 2.0, 0.0, + -6.0 * (pi ** 4 - 90) / (5.0 * (pi * pi - 6) ** 2)) + + def _entropy(self): + return log(4 * pi) - 1.0 +cosine = cosine_gen(a=-pi, b=pi, name='cosine') + + +class dgamma_gen(rv_continuous): + + """A double gamma continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `dgamma` is:: + + dgamma.pdf(x, a) = 1 / (2*gamma(a)) * abs(x)**(a-1) * exp(-abs(x)) + + for ``a > 0``. 
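+
+    `dgamma` is the distribution of a standard gamma variate with shape ``a``
+    that is given a random sign with equal probability; the ``_rvs`` method
+    below is implemented this way.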
+ + %(example)s + + """ + + def _rvs(self, a): + u = mtrand.random_sample(size=self._size) + return (gamma.rvs(a, size=self._size) * where(u >= 0.5, 1, -1)) + + def _pdf(self, x, a): + ax = abs(x) + return 1.0 / (2 * special.gamma(a)) * ax ** (a - 1.0) * exp(-ax) + + def _logpdf(self, x, a): + ax = abs(x) + return special.xlogy(a - 1.0, ax) - ax - log(2) - gamln(a) + + def _cdf(self, x, a): + fac = 0.5 * special.gammainc(a, abs(x)) + return where(x > 0, 0.5 + fac, 0.5 - fac) + + def _sf(self, x, a): + fac = 0.5 * special.gammainc(a, abs(x)) + return where(x > 0, 0.5 - fac, 0.5 + fac) + + def _ppf(self, q, a): + fac = special.gammainccinv(a, 1 - abs(2 * q - 1)) + return where(q > 0.5, fac, -fac) + + def _stats(self, a): + mu2 = a * (a + 1.0) + return 0.0, mu2, 0.0, (a + 2.0) * (a + 3.0) / mu2 - 3.0 +dgamma = dgamma_gen(name='dgamma') + + +class dweibull_gen(rv_continuous): + + """A double Weibull continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `dweibull` is:: + + dweibull.pdf(x, c) = c / 2 * abs(x)**(c-1) * exp(-abs(x)**c) + + %(example)s + + """ + + def _rvs(self, c): + u = mtrand.random_sample(size=self._size) + return weibull_min.rvs(c, size=self._size) * (where(u >= 0.5, 1, -1)) + + def _pdf(self, x, c): + ax = abs(x) + Px = c / 2.0 * ax ** (c - 1.0) * exp(-ax ** c) + return Px + + def _logpdf(self, x, c): + ax = abs(x) + return log(c) - log(2.0) + special.xlogy(c - 1.0, ax) - ax ** c + + def _cdf(self, x, c): + Cx1 = 0.5 * exp(-abs(x) ** c) + return where(x > 0, 1 - Cx1, Cx1) + + def _ppf(self, q, c): + fac = 2. * where(q <= 0.5, q, 1. - q) + fac = np.power(-log(fac), 1.0 / c) + return where(q > 0.5, fac, -fac) + + def _munp(self, n, c): + return (1 - (n % 2)) * special.gamma(1.0 + 1.0 * n / c) + + # since we know that all odd moments are zeros, return them at once. + # returning Nones from _stats makes the public stats call _munp + # so overall we're saving one or two gamma function evaluations here. + def _stats(self, c): + return 0, None, 0, None +dweibull = dweibull_gen(name='dweibull') + + +# Exponential (gamma distributed with a=1.0, loc=loc and scale=scale) +class expon_gen(rv_continuous): + + """An exponential continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `expon` is:: + + expon.pdf(x) = lambda * exp(- lambda*x) + + for ``x >= 0``. + + The scale parameter is equal to ``scale = 1.0 / lambda``. + + `expon` does not have shape parameters. + + %(example)s + + """ + + def link(self, x, logSF, phat, ix): + ''' Link for x,SF and parameters of Exponential distribution + + CALL phati = expon.link(x,logSF,phat,i) + + phati = parameter i as function of x, logSF and phat(j) where j ~= i + x = quantile + logSF = logarithm of the survival probability + + LINK is a function connecting the quantile (x) and the survival + probability (R) with the fixed distribution parameter, i.e.: + phat(i) = link(x,logSF,phat,i), + where logSF = log(Prob(X>x;phat)). 
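+
+        For the exponential distribution with loc = phat[0] and
+        scale = phat[1], the survival function gives
+            logSF = -(x - loc) / scale,
+        so the fixed parameter follows as
+            scale = -(x - loc) / logSF    (ix == 1)
+            loc   = x + scale * logSF     (ix == 0)
+        which is what this method returns.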
+ + Example % See proflog + + See also profile + ''' + if ix == 1: + return - (x - phat[0]) / logSF + elif ix == 0: + return x + phat[1] * logSF + + def _rvs(self): + return mtrand.standard_exponential(self._size) + + def _pdf(self, x): + return exp(-x) + + def _logpdf(self, x): + return -x + + def _cdf(self, x): + return -expm1(-x) + + def _ppf(self, q): + return -log1p(-q) + + def _sf(self, x): + return exp(-x) + + def _logsf(self, x): + return -x + + def _isf(self, q): + return -log(q) + + def _stats(self): + return 1.0, 1.0, 2.0, 6.0 + + def _entropy(self): + return 1.0 +expon = expon_gen(a=0.0, name='expon') + + +class exponweib_gen(rv_continuous): + + """An exponentiated Weibull continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `exponweib` is:: + + exponweib.pdf(x, a, c) = + a * c * (1-exp(-x**c))**(a-1) * exp(-x**c)*x**(c-1) + + for ``x > 0``, ``a > 0``, ``c > 0``. + + %(example)s + + """ + + def _pdf(self, x, a, c): + exc = exp(-x ** c) + return a * c * (1 - exc) ** asarray(a - 1) * exc * x ** (c - 1) + + def _logpdf(self, x, a, c): + exc = exp(-x ** c) + return (log(a) + log(c) + special.xlog1py(a - 1., -exc) - x ** c + + special.xlogy(c - 1.0, x)) + + def _cdf(self, x, a, c): + exm1c = -expm1(-x ** c) + return (exm1c) ** a + + def _ppf(self, q, a, c): + return (-log1p(-q ** (1.0 / a))) ** asarray(1.0 / c) +exponweib = exponweib_gen(a=0.0, name='exponweib') + + +class exponpow_gen(rv_continuous): + + """An exponential power continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `exponpow` is:: + + exponpow.pdf(x, b) = b * x**(b-1) * exp(1 + x**b - exp(x**b)) + + for ``x >= 0``, ``b > 0``. Note that this is a different distribution + from the exponential power distribution that is also known under the names + "generalized normal" or "generalized Gaussian". + + References + ---------- + http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Exponentialpower.pdf + + %(example)s + + """ + + def _pdf(self, x, b): + xbm1 = x ** (b - 1.0) + xb = xbm1 * x + return exp(1) * b * xbm1 * exp(xb - exp(xb)) + + def _logpdf(self, x, b): + xb = x ** (b - 1.0) * x + return 1 + log(b) + (b - 1.0) * log(x) + xb - exp(xb) + + def _cdf(self, x, b): + return -expm1(-expm1(x ** b)) + + def _sf(self, x, b): + return exp(-expm1(x ** b)) + + def _isf(self, x, b): + return (log1p(-log(x))) ** (1. / b) + + def _ppf(self, q, b): + return pow(log1p(-log1p(-q)), 1.0 / b) +exponpow = exponpow_gen(a=0.0, name='exponpow') + + +class fatiguelife_gen(rv_continuous): + + """A fatigue-life (Birnbaum-Sanders) continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `fatiguelife` is:: + + fatiguelife.pdf(x, c) = + (x+1) / (2*c*sqrt(2*pi*x**3)) * exp(-(x-1)**2/(2*x*c**2)) + + for ``x > 0``. + + %(example)s + + """ + + def _rvs(self, c): + z = mtrand.standard_normal(self._size) + x = 0.5 * c * z + x2 = x * x + t = 1.0 + 2 * x2 + 2 * x * sqrt(1 + x2) + return t + + def _pdf(self, x, c): + return np.exp(self._logpdf(x, c)) + + def _logpdf(self, x, c): + return (log(x + 1) - (x - 1) ** 2 / (2.0 * x * c ** 2) - log(2 * c) - + 0.5 * (log(2 * pi) + 3 * log(x))) + + def _cdf(self, x, c): + return special.ndtr(1.0 / c * (sqrt(x) - 1.0 / sqrt(x))) + + def _ppf(self, q, c): + tmp = c * special.ndtri(q) + return 0.25 * (tmp + sqrt(tmp ** 2 + 4)) ** 2 + + def _stats(self, c): + # NB: the formula for kurtosis in wikipedia seems to have an error: + # it's 40, not 41. 
At least it disagrees with the one from Wolfram + # Alpha. And the latter one, below, passes the tests, while the wiki + # one doesn't So far I didn't have the guts to actually check the + # coefficients from the expressions for the raw moments. + c2 = c * c + mu = c2 / 2.0 + 1.0 + den = 5.0 * c2 + 4.0 + mu2 = c2 * den / 4.0 + g1 = 4 * c * (11 * c2 + 6.0) / np.power(den, 1.5) + g2 = 6 * c2 * (93 * c2 + 40.0) / den ** 2.0 + return mu, mu2, g1, g2 +fatiguelife = fatiguelife_gen(a=0.0, name='fatiguelife') + + +class foldcauchy_gen(rv_continuous): + + """A folded Cauchy continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `foldcauchy` is:: + + foldcauchy.pdf(x, c) = 1/(pi*(1+(x-c)**2)) + 1/(pi*(1+(x+c)**2)) + + for ``x >= 0``. + + %(example)s + + """ + + def _rvs(self, c): + return abs(cauchy.rvs(loc=c, size=self._size)) + + def _pdf(self, x, c): + return 1.0 / pi * (1.0 / (1 + (x - c) ** 2) + 1.0 / (1 + (x + c) ** 2)) + + def _cdf(self, x, c): + return 1.0 / pi * (arctan(x - c) + arctan(x + c)) + + def _stats(self, c): + return inf, inf, nan, nan +foldcauchy = foldcauchy_gen(a=0.0, name='foldcauchy') + + +class f_gen(rv_continuous): + + """An F continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `f` is:: + + df2**(df2/2) * df1**(df1/2) * x**(df1/2-1) + F.pdf(x, df1, df2) = -------------------------------------------- + (df2+df1*x)**((df1+df2)/2) * B(df1/2, df2/2) + + for ``x > 0``. + + %(example)s + + """ + + def _rvs(self, dfn, dfd): + return mtrand.f(dfn, dfd, self._size) + + def _pdf(self, x, dfn, dfd): + return exp(self._logpdf(x, dfn, dfd)) + + def _logpdf(self, x, dfn, dfd): + n = 1.0 * dfn + m = 1.0 * dfd + lPx = m / 2 * log(m) + n / 2 * log(n) + (n / 2 - 1) * log(x) + lPx -= ((n + m) / 2) * log(m + n * x) + special.betaln(n / 2, m / 2) + return lPx + + def _cdf(self, x, dfn, dfd): + return special.fdtr(dfn, dfd, x) + + def _sf(self, x, dfn, dfd): + return special.fdtrc(dfn, dfd, x) + + def _ppf(self, q, dfn, dfd): + return special.fdtri(dfn, dfd, q) + + def _stats(self, dfn, dfd): + v1, v2 = 1. * dfn, 1. * dfd + v2_2, v2_4, v2_6, v2_8 = v2 - 2., v2 - 4., v2 - 6., v2 - 8. + + mu = _lazywhere( + v2 > 2, (v2, v2_2), + lambda v2, v2_2: v2 / v2_2, + np.inf) + + mu2 = _lazywhere( + v2 > 4, (v1, v2, v2_2, v2_4), + lambda v1, v2, v2_2, v2_4: + 2 * v2 * v2 * (v1 + v2_2) / (v1 * v2_2 ** 2 * v2_4), + np.inf) + + g1 = _lazywhere( + v2 > 6, (v1, v2_2, v2_4, v2_6), + lambda v1, v2_2, v2_4, v2_6: + (2 * v1 + v2_2) / v2_6 * sqrt(v2_4 / (v1 * (v1 + v2_2))), + np.nan) + g1 *= np.sqrt(8.) + + g2 = _lazywhere( + v2 > 8, (g1, v2_6, v2_8), + lambda g1, v2_6, v2_8: (8 + g1 * g1 * v2_6) / v2_8, + np.nan) + g2 *= 3. / 2. + + return mu, mu2, g1, g2 + + def _fitstart(self, data): + m = data.mean() + v = data.var() + # Supply a starting guess with method of moments: + dfd = max(np.round(2 * m / (m - 1)), 5) + dfn = max( + np.round(2 * dfd * dfd * (dfd - 2) / + (v * (dfd - 4) * (dfd - 2) ** 2 - 2 * dfd * dfd)), 1) + return super(f_gen, self)._fitstart(data, args=(dfn, dfd,)) +f = f_gen(a=0.0, name='f') + + +# Folded Normal +# abs(Z) where (Z is normal with mu=L and std=S so that c=abs(L)/S) +# +# note: regress docs have scale parameter correct, but first parameter +# he gives is a shape parameter A = c * scale + +# Half-normal is folded normal with shape-parameter c=0. + +class foldnorm_gen(rv_continuous): + + """A folded normal continuous random variable. 
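+
+    The half-normal distribution is the special case ``c = 0``.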
+ + %(before_notes)s + + Notes + ----- + The probability density function for `foldnorm` is:: + + foldnormal.pdf(x, c) = sqrt(2/pi) * cosh(c*x) * exp(-(x**2+c**2)/2) + + for ``c >= 0``. + + %(example)s + + """ + + def _argcheck(self, c): + return (c >= 0) + + def _rvs(self, c): + return abs(mtrand.standard_normal(self._size) + c) + + def _pdf(self, x, c): + return _norm_pdf(x + c) + _norm_pdf(x - c) + + def _cdf(self, x, c): + return special.ndtr(x - c) + special.ndtr(x + c) - 1.0 + + def _stats(self, c): + # Regina C. Elandt, Technometrics 3, 551 (1961) + # http://www.jstor.org/stable/1266561 + # + c2 = c * c + expfac = np.exp(-0.5 * c2) / np.sqrt(2. * pi) + + mu = 2. * expfac + c * special.erf(c / sqrt(2)) + mu2 = c2 + 1 - mu * mu + + g1 = 2. * (mu * mu * mu - c2 * mu - expfac) + g1 /= np.power(mu2, 1.5) + + g2 = c2 * (c2 + 6.) + 3 + 8. * expfac * mu + g2 += (2. * (c2 - 3.) - 3. * mu ** 2) * mu ** 2 + g2 = g2 / mu2 ** 2.0 - 3. + + return mu, mu2, g1, g2 +foldnorm = foldnorm_gen(a=0.0, name='foldnorm') + + +# Extreme Value Type II or Frechet +# (defined in Regress+ documentation as Extreme LB) as +# a limiting value distribution. +# +class frechet_r_gen(rv_continuous): + + """A Frechet right (or Weibull minimum) continuous random variable. + + %(before_notes)s + + See Also + -------- + weibull_min : The same distribution as `frechet_r`. + frechet_l, weibull_max + + Notes + ----- + The probability density function for `frechet_r` is:: + + frechet_r.pdf(x, c) = c * x**(c-1) * exp(-x**c) + + for ``x > 0``, ``c > 0``. + + %(example)s + + """ + + def link(self, x, logSF, phat, ix): + #u = phat[1] + if ix == 0: + phati = log(-logSF) / log((x - phat[1]) / phat[2]) + elif ix == 1: + phati = x - phat[2] * (-logSF) ** (1. / phat[0]) + elif ix == 2: + phati = (x - phat[1]) / (-logSF) ** (1. / phat[0]) + else: + raise IndexError('Index to the fixed parameter is out of bounds') + return phati + + def _pdf(self, x, c): + return c * pow(x, c - 1) * exp(-pow(x, c)) + + def _logpdf(self, x, c): + return log(c) + (c - 1) * log(x) - pow(x, c) + + def _cdf(self, x, c): + return -expm1(-pow(x, c)) + + def _ppf(self, q, c): + return pow(-log1p(-q), 1.0 / c) + + def _munp(self, n, c): + return special.gamma(1.0 + n * 1.0 / c) + + def _entropy(self, c): + return -_EULER / c - log(c) + _EULER + 1 + + def _fitstart(self, data): + loc = data.min() - 0.01 # *np.std(data) + chat = 1. / (6 ** (1 / 2) / pi * np.std(log(data - loc))) + scale = np.mean((data - loc) ** chat) ** (1. / chat) + return chat, loc, scale + +frechet_r = frechet_r_gen(a=0.0, name='frechet_r') +weibull_min = frechet_r_gen(a=0.0, name='weibull_min') + + +class frechet_l_gen(rv_continuous): + + """A Frechet left (or Weibull maximum) continuous random variable. + + %(before_notes)s + + See Also + -------- + weibull_max : The same distribution as `frechet_l`. + frechet_r, weibull_min + + Notes + ----- + The probability density function for `frechet_l` is:: + + frechet_l.pdf(x, c) = c * (-x)**(c-1) * exp(-(-x)**c) + + for ``x < 0``, ``c > 0``. + + %(example)s + + """ + + def _pdf(self, x, c): + return c * pow(-x, c - 1) * exp(-pow(-x, c)) + + def _cdf(self, x, c): + return exp(-pow(-x, c)) + + def _ppf(self, q, c): + return -pow(-log(q), 1.0 / c) + + def _munp(self, n, c): + val = special.gamma(1.0 + n * 1.0 / c) + if (int(n) % 2): + sgn = -1 + else: + sgn = 1 + return sgn * val + + def _entropy(self, c): + return -_EULER / c - log(c) + _EULER + 1 + + def _fitstart(self, data): + loc = data.max() + 0.1 * np.std(data) + chat = 1. 
/ (6 ** (1 / 2) / pi * np.std(log(loc - data))) + scale = np.mean((loc - data) ** chat) ** (1. / chat) + return chat, loc, scale +frechet_l = frechet_l_gen(b=0.0, name='frechet_l') +weibull_max = frechet_l_gen(b=0.0, name='weibull_max') + + +class genlogistic_gen(rv_continuous): + + """A generalized logistic continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `genlogistic` is:: + + genlogistic.pdf(x, c) = c * exp(-x) / (1 + exp(-x))**(c+1) + + for ``x > 0``, ``c > 0``. + + %(example)s + + """ + + def _pdf(self, x, c): + return exp(self._logpdf(x, c)) + + def _logpdf(self, x, c): + return log(c) - x - (c + 1.0) * log1p(exp(-x)) + + def _cdf(self, x, c): + Cx = (1 + exp(-x)) ** (-c) + return Cx + + def _ppf(self, q, c): + vals = -log(pow(q, -1.0 / c) - 1) + return vals + + def _stats(self, c): + zeta = special.zeta + mu = _EULER + special.psi(c) + mu2 = pi * pi / 6.0 + zeta(2, c) + g1 = -2 * zeta(3, c) + 2 * _ZETA3 + g1 /= np.power(mu2, 1.5) + g2 = pi ** 4 / 15.0 + 6 * zeta(4, c) + g2 /= mu2 ** 2.0 + return mu, mu2, g1, g2 +genlogistic = genlogistic_gen(name='genlogistic') + + +def log1pxdx(x): + '''Computes Log(1+x)/x + ''' + xd = where((x == 0) | (x == inf), 1.0, x) # avoid 0/0 or inf/inf + y = where(x == 0, 1.0, log1p(x) / xd) + return where(x == inf, 0.0, y) + + +class genpareto_gen(rv_continuous): + + """A generalized Pareto continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `genpareto` is:: + + genpareto.pdf(x, c) = exp(-x) + + for c==0 + + genpareto.pdf(x, c) = (1 + c * x)**(-1 - 1/c) + + for ``c != 0``, and for ``x >= 0`` for all c, + and ``x < 1/abs(c)`` for ``c < 0``. + + %(example)s + + """ + + def link(self, x, logSF, phat, ix): + # Reference + # Stuart Coles (2004) + # "An introduction to statistical modelling of extreme values". + # Springer series in statistics + + u = phat[1] + if ix == 0: + raise ValueError( + 'link(x,logSF,phat,i) where i=0 is not implemented!') + elif ix == 2: + # Reorganizing w.r.t. phat[2] (scale), Eq. 4.13 and 4.14, pp 81 in + # Coles (2004) gives + # link = -(x-phat[1]).*phat[0]/expm1(phat[0]*logSF) + if phat[0] != 0.0: + phati = (x - u) * phat[0] / expm1(-phat[0] * logSF) + else: + phati = -(x - u) / logSF + elif ix == 1: + if phat[0] != 0: + phati = x + phat[2] * expm1(phat[0] * logSF) / phat[0] + else: + phati = x + phat(2) * logSF + else: + raise IndexError('Index to the fixed parameter is out of bounds') + return phati + + def _argcheck(self, c): + c = asarray(c) + self.b = _lazywhere(c < 0, (c,), + lambda c: -1. / c, np.inf) + return where(abs(c) == inf, 0, 1) + + def _pdf(self, x, c): + return exp(self._logpdf(x, c)) + + def _logpdf(self, x, c): + return -(c + 1.) 
* self._log1pcx(x, c) +# x1 = where((c == 0) & (x == inf), 0.0, x) +# cx = where((c == 0) & (x == inf), 0.0, c * x1) +# logpdf = where((cx == inf) | (cx == -1), +# -inf, -(x + cx) * log1pxdx(cx)) +# putmask(logpdf, (c == -1) & (x == 1.0), 0.0) +# return logpdf + # return (-1.0-1.0/c) * np.log1p(c*x) + + def _logsf(self, x, c): + return -self._log1pcx(x, c) + #cx = c * x + # return where((0.0 < x) & (-1.0 <= cx) & (c != 0), -log1p(cx) / c, -x) + + def _cdf(self, x, c): + log_sf = self._logsf(x, c) + return - expm1(log_sf) + # return 1.0 - power(1+c*x,asarray(-1.0/c)) + + def _sf(self, x, c): + log_sf = self._logsf(x, c) + return exp(log_sf) + + def _ppf(self, q, c): + log_sf = log1p(-q) + return where((c != 0) & (-inf < log_sf), expm1(-c * log_sf) / c, + -log_sf) + + def _isf(self, q, c): + log_sf = log(q) + return where((c != 0) & (-inf < log_sf), expm1(-c * log_sf) / c, + -log_sf) + #vals = 1.0/c * (power(1-q, -c)-1) + # return vals + + def _log1pcx(self, x, c): + ''' log(1+c*x)/c incl c\to 0 limit''' + return _lazywhere((x == x) & (c != 0), (x, c), + lambda x, c: np.log1p(c * x) / c, + x) + + def _fitstart(self, data): + d = asarray(data) + loc = d.min() - 0.01 * d.std() + # moments estimator + d1 = d - loc + m = d1.mean() + s = d1.std() + + shape = ((m / s) ** 2 - 1) / 2 + scale = m * ((m / s) ** 2 + 1) / 2 + return shape, loc, scale + + def hessian_nnlf(self, theta, x, eps=None): + try: + loc = theta[-2] + scale = theta[-1] + args = tuple(theta[:-2]) + except IndexError: + raise ValueError("Not enough input arguments.") + if not self._argcheck(*args) or scale <= 0: + return inf + x = asarray((x - loc) / scale) + cond0 = (x <= self.a) | (x >= self.b) + if any(cond0): + np = self.numargs + 2 + return valarray((np, np), value=nan) + eps = _EPS + c = args[0] + n = len(x) + if abs(c) > eps: + cx = c * x + sumlog1pcx = sum(log1p(cx)) + #LL = n*log(scale) + (1-1/k)*sumlog1mkxn + r = x / (1.0 + cx) + sumix = sum(1.0 / (1.0 + cx) ** 2.0) + + sumr = sum(r) + sumr2 = sum(r ** 2.0) + H11 = -2 * sumlog1pcx / c ** 3 + 2 * \ + sumr / c ** 2 + (1.0 + 1.0 / c) * sumr2 + H22 = c * (c + 1) * sumix / scale ** 2.0 + H33 = (n - 2 * (c + 1) * sumr + + c * (c + 1) * sumr2) / scale ** 2.0 + H12 = -sum((1 - x) / ((1 + cx) ** 2.0)) / scale + H23 = -(c + 1) * sumix / scale ** 2.0 + H13 = -(sumr - (c + 1) * sumr2) / scale + + else: # c == 0 + sumx = sum(x) + #LL = n*log(scale) + sumx; + + sumx2 = sum(x ** 2.0) + H11 = -(2 / 3) * sum(x ** 3.0) + sumx2 + H22 = 0.0 + H12 = -(n - sum(x)) / scale + H23 = -n * 1.0 / scale ** 2.0 + H33 = (n - 2 * sumx) / scale ** 2.0 + H13 = -(sumx - sumx2) / scale + + #% Hessian matrix + H = [[H11, H12, H13], [H12, H22, H23], [H13, H23, H33]] + return asarray(H) + + def __stats(self, c): + # return None,None,None,None + k = -c + m = where(k < -1.0, inf, 1.0 / (1 + k)) + v = where(k < -0.5, nan, 1.0 / ((1 + k) ** 2.0 * (1 + 2 * k))) + sk = where(k < -1.0 / 3, nan, 2. * (1 - k) + * sqrt(1 + 2.0 * k) / (1.0 + 3. * k)) + # E(X^r) = s^r*(-k)^-(r+1)*gamma(1+r)*gamma(-1/k-r)/gamma(1-1/k) + # = s^r*gamma(1+r)./( (1+k)*(1+2*k).*....*(1+r*k)) + # E[(1-k(X-m0)/s)^r] = 1/(1+k*r) + + #%Ex3 = (sk.*sqrt(v)+3*m).*v+m^3 + #%Ex3 = 6.*s.^3/((1+k).*(1+2*k).*(1+3*k)) + r = 4.0 + Ex4 = gam(1. + r) / \ + ((1. + k) * (1. + 2. * k) * (1. + 3. * k) * (1 + 4. * k)) + m1 = m + ku = where(k < -1. / 4, nan, (Ex4 - 4. * sk * v ** (3. / 2) + * m1 - 6 * m1 ** 2. * v - m1 ** 4.) / v ** 2. 
- 3.0) + return m, v, sk, ku + + def _munp(self, n, c): + def __munp(n, c): + val = 0.0 + k = arange(0, n + 1) + for ki, cnk in zip(k, comb(n, k)): + val = val + cnk * (-1) ** ki / (1.0 - c * ki) + return where(c * n < 1, val * (-1.0 / c) ** n, inf) + munp = lambda c: __munp(n, c) + return _lazywhere(c != 0, (c,), munp, gam(n + 1)) + + def _munp2(self, n, c): + k = arange(0, n + 1) + val = (-1.0 / c) ** n * \ + sum(comb(n, k) * (-1) ** k / (1.0 - c * k), axis=0) + return where(c * n < 1, val, inf) + + def _entropy(self, c): + return 1 + c +genpareto = genpareto_gen(a=0.0, name='genpareto') + + +class genexpon_gen(rv_continuous): + + """A generalized exponential continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `genexpon` is:: + + genexpon.pdf(x, a, b, c) = (a + b * (1 - exp(-c*x))) * \ + exp(-a*x - b*x + b/c * (1-exp(-c*x))) + + for ``x >= 0``, ``a, b, c > 0``. + + References + ---------- + H.K. Ryu, "An Extension of Marshall and Olkin's Bivariate Exponential + Distribution", Journal of the American Statistical Association, 1993. + + N. Balakrishnan, "The Exponential Distribution: Theory, Methods and + Applications", Asit P. Basu. + + %(example)s + + """ + + def link(self, x, logSF, phat, ix): + xn = (x - phat[3]) / phat[4] + b = phat[1] + c = phat[2] + fact1 = (xn + expm1(-c * xn) / c) + if ix == 0: + phati = b * fact1 + logSF + elif ix == 1: + phati = (phat[0] - logSF) / fact1 + else: + raise IndexError('Only implemented for ix in [1,2]!') + return phati + + def _pdf(self, x, a, b, c): + return (a + b * (-expm1(-c * x))) * exp((-a - b) * x + + b * (-expm1(-c * x)) / c) + + def _cdf(self, x, a, b, c): + return -expm1((-a - b) * x + b * (-expm1(-c * x)) / c) + + def _logpdf(self, x, a, b, c): + return (np.log(a + b * (-expm1(-c * x))) + (-a - b) * x + + b * (-expm1(-c * x)) / c) +genexpon = genexpon_gen(a=0.0, name='genexpon') + + +class genextreme_gen(rv_continuous): + + """A generalized extreme value continuous random variable. + + %(before_notes)s + + See Also + -------- + gumbel_r + + Notes + ----- + For ``c=0``, `genextreme` is equal to `gumbel_r`. 
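+
+    Note that the shape parameter used here has the opposite sign of the
+    shape parameter ``xi`` commonly used for the GEV distribution
+    (``c = -xi``): the support is bounded above by ``1/c`` for ``c > 0``
+    and bounded below by ``1/c`` for ``c < 0`` (see ``_argcheck`` below).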
+ The probability density function for `genextreme` is:: + + genextreme.pdf(x, c) = + exp(-exp(-x))*exp(-x), for c==0 + exp(-(1-c*x)**(1/c))*(1-c*x)**(1/c-1), for x <= 1/c, c > 0 + + %(example)s + + """ + + def _argcheck(self, c): + min = np.minimum # @ReservedAssignment + max = np.maximum # @ReservedAssignment + self.b = where(c > 0, 1.0 / max(c, _XMIN), inf) + self.a = where(c < 0, 1.0 / min(c, -_XMIN), -inf) + return where(abs(c) == inf, 0, 1) + + def _pdf(self, x, c): + return exp(self._logpdf(x, c)) + + def _logpdf(self, x, c): + x1 = where((c == 0) & (x == inf), 0.0, x) + cx = c * x1 + cond1 = (c == 0) * (x == x) + logex2 = where(cond1, 0.0, log1p(-cx)) + logpex2 = -x * log1pxdx(-cx) + #logpex2 = where(cond1,-x,logex2/c) + pex2 = exp(logpex2) + # Handle special cases + logpdf = where( + (cx == 1) | (cx == -inf), -inf, -pex2 + logpex2 - logex2) + putmask(logpdf, (c == 1) & (x == 1), 0.0) + return exp(logpdf) + + def _cdf(self, x, c): + return exp(self._logcdf(x, c)) + + def _logcdf(self, x, c): + x1 = where((c == 0) & (x == inf), 0.0, x) + cx = c * x1 + loglogcdf = -x * log1pxdx(-cx) + #loglogcdf = where((c==0)*(x==x),-x,log1p(-cx)/c) + return -exp(loglogcdf) + + def _sf(self, x, c): + return -expm1(self._logcdf(x, c)) + + def _ppf(self, q, c): + x = -log(-log(q)) + return where((c == 0) * (x == x), x, -expm1(-c * x) / c) + # return _lazywhere((x==x) & (c != 0), (x, c), + # lambda x, c: -expm1(-c*x) / c, x) + + def _stats(self, c): + g = lambda n: gam(n * c + 1) + g1 = g(1) + g2 = g(2) + g3 = g(3) + g4 = g(4) + g2mg12 = where(abs(c) < 1e-7, (c * pi) ** 2.0 / 6.0, g2 - g1 ** 2.0) + gam2k = where(abs(c) < 1e-7, pi ** 2.0 / 6.0, + expm1(gamln(2.0 * c + 1.0) - + 2 * gamln(c + 1.0)) / c ** 2.0) + eps = 1e-14 + gamk = where(abs(c) < eps, -_EULER, expm1(gamln(c + 1)) / c) + + m = where(c < -1.0, nan, -gamk) + v = where(c < -0.5, nan, g1 ** 2.0 * gam2k) + + # skewness + sk1 = where(c < -1. / 3, nan, + np.sign(c) * (-g3 + (g2 + 2 * g2mg12) * g1) / + ((g2mg12) ** (3. / 2.))) + sk = where(abs(c) <= eps ** 0.29, 12 * sqrt(6) * _ZETA3 / pi ** 3, sk1) + + # kurtosis + ku1 = where(c < -1. / 4, nan, + (g4 + (-4 * g3 + 3 * (g2 + g2mg12) * g1) * g1) / + ((g2mg12) ** 2)) + ku = where(abs(c) <= (eps) ** 0.23, 12.0 / 5.0, ku1 - 3.0) + return m, v, sk, ku + + def _munp(self, n, c): + k = arange(0, n + 1) + vals = 1.0 / c ** n * sum( + comb(n, k) * (-1) ** k * special.gamma(c * k + 1), + axis=0) + return where(c * n > -1, vals, inf) + + def _fitstart(self, data): + d = asarray(data) + # Probability weighted moments + log = np.log + n = len(d) + d.sort() + koeff1 = np.r_[0:n] / (n - 1) + koeff2 = koeff1 * (np.r_[0:n] - 1) / (n - 2) + b2 = np.dot(koeff2, d) / n + b1 = np.dot(koeff1, d) / n + b0 = d.mean() + z = (2 * b1 - b0) / (3 * b2 - b0) - log(2) / log(3) + shape = 7.8590 * z + 2.9554 * z ** 2 + scale = (2 * b1 - b0) * shape / \ + (exp(gamln(1 + shape)) * (1 - 2 ** (-shape))) + loc = b0 + scale * (expm1(gamln(1 + shape))) / shape + return shape, loc, scale +genextreme = genextreme_gen(name='genextreme') + + +def _digammainv(y): + # Inverse of the digamma function (real positive arguments only). + # This function is used in the `fit` method of `gamma_gen`. + # The function uses either optimize.fsolve or optimize.newton + # to solve `digamma(x) - y = 0`. 
There is probably room for + # improvement, but currently it works over a wide range of y: + # >>> y = 64*np.random.randn(1000000) + # >>> y.min(), y.max() + # (-311.43592651416662, 351.77388222276869) + # x = [_digammainv(t) for t in y] + # np.abs(digamma(x) - y).max() + # 1.1368683772161603e-13 + # + _em = 0.5772156649015328606065120 + func = lambda x: special.digamma(x) - y + if y > -0.125: + x0 = exp(y) + 0.5 + if y < 10: + # Some experimentation shows that newton reliably converges + # must faster than fsolve in this y range. For larger y, + # newton sometimes fails to converge. + value = optimize.newton(func, x0, tol=1e-10) + return value + elif y > -3: + x0 = exp(y / 2.332) + 0.08661 + else: + x0 = 1.0 / (-y - _em) + + value, _info, ier, _msg = optimize.fsolve(func, x0, xtol=1e-11, + full_output=True) + if ier != 1: + raise RuntimeError("_digammainv: fsolve failed, y = %r" % y) + + return value[0] + + +# Gamma (Use MATLAB and MATHEMATICA (b=theta=scale, a=alpha=shape) definition) + +# gamma(a, loc, scale) with a an integer is the Erlang distribution +# gamma(1, loc, scale) is the Exponential distribution +# gamma(df/2, 0, 2) is the chi2 distribution with df degrees of freedom. + +class gamma_gen(rv_continuous): + + """A gamma continuous random variable. + + %(before_notes)s + + See Also + -------- + erlang, expon + + Notes + ----- + The probability density function for `gamma` is:: + + gamma.pdf(x, a) = lambda**a * x**(a-1) * exp(-lambda*x) / gamma(a) + + for ``x >= 0``, ``a > 0``. Here ``gamma(a)`` refers to the gamma function. + + The scale parameter is equal to ``scale = 1.0 / lambda``. + + `gamma` has a shape parameter `a` which needs to be set explicitly. For + instance: + + >>> from scipy.stats import gamma + >>> rv = gamma(3., loc = 0., scale = 2.) + + produces a frozen form of `gamma` with shape ``a = 3.``, ``loc =0.`` + and ``lambda = 1./scale = 1./2.``. + + When ``a`` is an integer, `gamma` reduces to the Erlang + distribution, and when ``a=1`` to the exponential distribution. + + %(example)s + + """ + + def _rvs(self, a): + return mtrand.standard_gamma(a, self._size) + + def _pdf(self, x, a): + return exp(self._logpdf(x, a)) + + def _logpdf(self, x, a): + return special.xlogy(a - 1.0, x) - x - gamln(a) + + def _cdf(self, x, a): + return special.gammainc(a, x) + + def _sf(self, x, a): + return special.gammaincc(a, x) + + def _ppf(self, q, a): + return special.gammaincinv(a, q) + + def _stats(self, a): + return a, a, 2.0 / sqrt(a), 6.0 / a + + def _entropy(self, a): + return special.psi(a) * (1 - a) + a + gamln(a) + + def _fitstart(self, data): + # The skewness of the gamma distribution is `4 / sqrt(a)`. + # We invert that to estimate the shape `a` using the skewness + # of the data. The formula is regularized with 1e-8 in the + # denominator to allow for degenerate data where the skewness + # is close to 0. + a = 4 / (1e-8 + _skew(data) ** 2) + return super(gamma_gen, self)._fitstart(data, args=(a,)) + + @inherit_docstring_from(rv_continuous) + def fit(self, data, *args, **kwds): + f0 = kwds.get('f0', None) + floc = kwds.get('floc', None) + fscale = kwds.get('fscale', None) + + if floc is None: + # loc is not fixed. Use the default fit method. + return super(gamma_gen, self).fit(data, *args, **kwds) + + # Special case: loc is fixed. + + if f0 is not None and fscale is not None: + # This check is for consistency with `rv_continuous.fit`. + # Without this check, this function would just return the + # parameters that were given. + raise ValueError("All parameters fixed. 
There is nothing to " + "optimize.") + + # Fixed location is handled by shifting the data. + data = np.asarray(data) + if np.any(data <= floc): + raise FitDataError("gamma", lower=floc, upper=np.inf) + if floc != 0: + # Don't do the subtraction in-place, because `data` might be a + # view of the input array. + data = data - floc + xbar = data.mean() + + # Three cases to handle: + # * shape and scale both free + # * shape fixed, scale free + # * shape free, scale fixed + + if fscale is None: + # scale is free + if f0 is not None: + # shape is fixed + a = f0 + else: + # shape and scale are both free. + # The MLE for the shape parameter `a` is the solution to: + # log(a) - special.digamma(a) - log(xbar) + log(data.mean) = 0 + s = log(xbar) - log(data).mean() + func = lambda a: log(a) - special.digamma(a) - s + aest = (3 - s + np.sqrt((s - 3) ** 2 + 24 * s)) / (12 * s) + xa = aest * (1 - 0.4) + xb = aest * (1 + 0.4) + a = optimize.brentq(func, xa, xb, disp=0) + + # The MLE for the scale parameter is just the data mean + # divided by the shape parameter. + scale = xbar / a + else: + # scale is fixed, shape is free + # The MLE for the shape parameter `a` is the solution to: + # special.digamma(a) - log(data).mean() + log(fscale) = 0 + c = log(data).mean() - log(fscale) + a = _digammainv(c) + scale = fscale + + return a, floc, scale + +gamma = gamma_gen(a=0.0, name='gamma') + + +class erlang_gen(gamma_gen): + + """An Erlang continuous random variable. + + %(before_notes)s + + See Also + -------- + gamma + + Notes + ----- + The Erlang distribution is a special case of the Gamma distribution, with + the shape parameter `a` an integer. Note that this restriction is not + enforced by `erlang`. It will, however, generate a warning the first time + a non-integer value is used for the shape parameter. + + Refer to `gamma` for examples. + + """ + + def _argcheck(self, a): + allint = np.all(np.floor(a) == a) + allpos = np.all(a > 0) + if not allint: + # An Erlang distribution shouldn't really have a non-integer + # shape parameter, so warn the user. + warnings.warn( + 'The shape parameter of the erlang distribution ' + 'has been given a non-integer value %r.' % (a,), + RuntimeWarning) + return allpos + + def _fitstart(self, data): + # Override gamma_gen_fitstart so that an integer initial value is + # used. (Also regularize the division, to avoid issues when + # _skew(data) is 0 or close to 0.) + a = int(4.0 / (1e-8 + _skew(data) ** 2)) + return super(gamma_gen, self)._fitstart(data, args=(a,)) + + # Trivial override of the fit method, so we can monkey-patch its + # docstring. + def fit(self, data, *args, **kwds): + return super(erlang_gen, self).fit(data, *args, **kwds) + + if fit.__doc__ is not None: + fit.__doc__ = (rv_continuous.fit.__doc__ + + """ + Notes + ----- + The Erlang distribution is generally defined to have integer values + for the shape parameter. This is not enforced by the `erlang` class. + When fitting the distribution, it will generally return a non-integer + value for the shape parameter. By using the keyword argument + `f0=`, the fit method can be constrained to fit the data to + a specific integer shape parameter. + """) +erlang = erlang_gen(a=0.0, name='erlang') + + +class gengamma_gen(rv_continuous): + + """A generalized gamma continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `gengamma` is:: + + gengamma.pdf(x, a, c) = abs(c) * x**(c*a-1) * exp(-x**c) / gamma(a) + + for ``x > 0``, ``a > 0``, and ``c != 0``. 
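+
+    `gengamma` reduces to `gamma` for ``c = 1``, to `weibull_min` for
+    ``a = 1, c > 0``, and to `invweibull` for ``a = 1, c < 0``.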
+ + %(example)s + + """ + + def _argcheck(self, a, c): + return (a > 0) & (c != 0) + + def _pdf(self, x, a, c): + return abs(c) * exp((c * a - 1) * log(x) - x ** c - gamln(a)) + + def _cdf(self, x, a, c): + val = special.gammainc(a, x ** c) + cond = c + 0 * val + return where(cond > 0, val, 1 - val) + + def _ppf(self, q, a, c): + val1 = special.gammaincinv(a, q) + val2 = special.gammaincinv(a, 1.0 - q) + ic = 1.0 / c + cond = c + 0 * val1 + return where(cond > 0, val1 ** ic, val2 ** ic) + + def _munp(self, n, a, c): + return special.gamma(a + n * 1.0 / c) / special.gamma(a) + + def _entropy(self, a, c): + val = special.psi(a) + return a * (1 - val) + 1.0 / c * val + gamln(a) - log(abs(c)) +gengamma = gengamma_gen(a=0.0, name='gengamma') + + +class genhalflogistic_gen(rv_continuous): + + """A generalized half-logistic continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `genhalflogistic` is:: + + genhalflogistic.pdf(x, c) = 2 * (1-c*x)**(1/c-1) / (1+(1-c*x)**(1/c))**2 + + for ``0 <= x <= 1/c``, and ``c > 0``. + + %(example)s + + """ + + def _argcheck(self, c): + self.b = 1.0 / c + return (c > 0) + + def _pdf(self, x, c): + limit = 1.0 / c + tmp = asarray(1 - c * x) + tmp0 = tmp ** (limit - 1) + tmp2 = tmp0 * tmp + return 2 * tmp0 / (1 + tmp2) ** 2 + + def _cdf(self, x, c): + limit = 1.0 / c + tmp = asarray(1 - c * x) + tmp2 = tmp ** (limit) + return (1.0 - tmp2) / (1 + tmp2) + + def _ppf(self, q, c): + return 1.0 / c * (1 - ((1.0 - q) / (1.0 + q)) ** c) + + def _entropy(self, c): + return 2 - (2 * c + 1) * log(2) +genhalflogistic = genhalflogistic_gen(a=0.0, name='genhalflogistic') + + +class gompertz_gen(rv_continuous): + + """A Gompertz (or truncated Gumbel) continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `gompertz` is:: + + gompertz.pdf(x, c) = c * exp(x) * exp(-c*(exp(x)-1)) + + for ``x >= 0``, ``c > 0``. + + %(example)s + + """ + + def _pdf(self, x, c): + return exp(self._logpdf(x, c)) + + def _logpdf(self, x, c): + return log(c) + x - c * expm1(x) + + def _cdf(self, x, c): + return -expm1(-c * expm1(x)) + + def _ppf(self, q, c): + return log1p(-1.0 / c * log1p(-q)) + + def _entropy(self, c): + return 1.0 - log(c) - exp(c) * special.expn(1, c) +gompertz = gompertz_gen(a=0.0, name='gompertz') + + +class gumbel_r_gen(rv_continuous): + + """A right-skewed Gumbel continuous random variable. + + %(before_notes)s + + See Also + -------- + gumbel_l, gompertz, genextreme + + Notes + ----- + The probability density function for `gumbel_r` is:: + + gumbel_r.pdf(x) = exp(-(x + exp(-x))) + + The Gumbel distribution is sometimes referred to as a type I Fisher-Tippett + distribution. It is also related to the extreme value distribution, + log-Weibull and Gompertz distributions. + + %(example)s + + """ + + def _pdf(self, x): + return exp(self._logpdf(x)) + + def _logpdf(self, x): + return -x - exp(-x) + + def _cdf(self, x): + return exp(-exp(-x)) + + def _logcdf(self, x): + return -exp(-x) + + def _ppf(self, q): + return -log(-log(q)) + + def _stats(self): + return _EULER, pi * pi / 6.0, 12 * sqrt(6) / pi ** 3 * _ZETA3, 12.0 / 5 + + def _entropy(self): + # http://en.wikipedia.org/wiki/Gumbel_distribution + return _EULER + 1. +gumbel_r = gumbel_r_gen(name='gumbel_r') + + +class gumbel_l_gen(rv_continuous): + + """A left-skewed Gumbel continuous random variable. 
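+
+    If ``X`` has a `gumbel_r` distribution, then ``-X`` has a `gumbel_l`
+    distribution.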
+ + %(before_notes)s + + See Also + -------- + gumbel_r, gompertz, genextreme + + Notes + ----- + The probability density function for `gumbel_l` is:: + + gumbel_l.pdf(x) = exp(x - exp(x)) + + The Gumbel distribution is sometimes referred to as a type I Fisher-Tippett + distribution. It is also related to the extreme value distribution, + log-Weibull and Gompertz distributions. + + %(example)s + + """ + + def _pdf(self, x): + return exp(self._logpdf(x)) + + def _logpdf(self, x): + return x - exp(x) + + def _cdf(self, x): + return -expm1(-exp(x)) + + def _ppf(self, q): + return log(-log1p(-q)) + + def _stats(self): + return -_EULER, pi * pi / 6.0, \ + -12 * sqrt(6) / pi ** 3 * _ZETA3, 12.0 / 5 + + def _entropy(self): + return _EULER + 1. +gumbel_l = gumbel_l_gen(name='gumbel_l') + + +class halfcauchy_gen(rv_continuous): + + """A Half-Cauchy continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `halfcauchy` is:: + + halfcauchy.pdf(x) = 2 / (pi * (1 + x**2)) + + for ``x >= 0``. + + %(example)s + + """ + + def _pdf(self, x): + return 2.0 / pi / (1.0 + x * x) + + def _logpdf(self, x): + return np.log(2.0 / pi) - np.log1p(x * x) + + def _cdf(self, x): + return 2.0 / pi * arctan(x) + + def _ppf(self, q): + return tan(pi / 2 * q) + + def _stats(self): + return inf, inf, nan, nan + + def _entropy(self): + return log(2 * pi) +halfcauchy = halfcauchy_gen(a=0.0, name='halfcauchy') + + +class halflogistic_gen(rv_continuous): + + """A half-logistic continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `halflogistic` is:: + + halflogistic.pdf(x) = 2 * exp(-x) / (1+exp(-x))**2 = 1/2 * sech(x/2)**2 + + for ``x >= 0``. + + %(example)s + + """ + + def _pdf(self, x): + return exp(self._logpdf(x)) + + def _logpdf(self, x): + return log(2) - x - 2. * special.log1p(exp(-x)) + + def _cdf(self, x): + return tanh(x / 2.0) + + def _ppf(self, q): + return 2 * arctanh(q) + + def _munp(self, n): + if n == 1: + return 2 * log(2) + if n == 2: + return pi * pi / 3.0 + if n == 3: + return 9 * _ZETA3 + if n == 4: + return 7 * pi ** 4 / 15.0 + return (2 * (1 - pow(2.0, 1 - n)) * special.gamma(n + 1) * + special.zeta(n, 1)) + + def _entropy(self): + return 2 - log(2) +halflogistic = halflogistic_gen(a=0.0, name='halflogistic') + + +class halfnorm_gen(rv_continuous): + + """A half-normal continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `halfnorm` is:: + + halfnorm.pdf(x) = sqrt(2/pi) * exp(-x**2/2) + + for ``x > 0``. + + `halfnorm` is a special case of `chi` with ``df == 1``. + + %(example)s + + """ + + def _rvs(self): + return abs(mtrand.standard_normal(size=self._size)) + + def _pdf(self, x): + return sqrt(2.0 / pi) * exp(-x * x / 2.0) + + def _logpdf(self, x): + return 0.5 * np.log(2.0 / pi) - x * x / 2.0 + + def _cdf(self, x): + return special.ndtr(x) * 2 - 1.0 + + def _ppf(self, q): + return special.ndtri((1 + q) / 2.0) + + def _stats(self): + return ( + sqrt(2.0 / pi), 1 - 2.0 / pi, sqrt(2) * (4 - pi) / (pi - 2) ** 1.5, + 8 * (pi - 3) / (pi - 2) ** 2) + + def _entropy(self): + return 0.5 * log(pi / 2.0) + 0.5 +halfnorm = halfnorm_gen(a=0.0, name='halfnorm') + + +class hypsecant_gen(rv_continuous): + + """A hyperbolic secant continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + The probability density function for `hypsecant` is:: + + hypsecant.pdf(x) = 1/pi * sech(x) + + %(example)s + + """ + + def _pdf(self, x): + return 1.0 / (pi * cosh(x)) + + def _cdf(self, x): + return 2.0 / pi * arctan(exp(x)) + + def _ppf(self, q): + return log(tan(pi * q / 2.0)) + + def _stats(self): + return 0, pi * pi / 4, 0, 2 + + def _entropy(self): + return log(2 * pi) +hypsecant = hypsecant_gen(name='hypsecant') + + +class gausshyper_gen(rv_continuous): + + """A Gauss hypergeometric continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `gausshyper` is:: + + gausshyper.pdf(x, a, b, c, z) = + C * x**(a-1) * (1-x)**(b-1) * (1+z*x)**(-c) + + for ``0 <= x <= 1``, ``a > 0``, ``b > 0``, and + ``C = 1 / (B(a, b) F[2, 1](c, a; a+b; -z))`` + + %(example)s + + """ + + def _argcheck(self, a, b, c, z): + return (a > 0) & (b > 0) & (c == c) & (z == z) + + def _pdf(self, x, a, b, c, z): + Cinv = gam(a) * gam(b) / gam(a + b) * special.hyp2f1(c, a, a + b, -z) + return (1.0 / Cinv * x ** (a - 1.0) * (1.0 - x) ** (b - 1.0) / + (1.0 + z * x) ** c) + + def _munp(self, n, a, b, c, z): + fac = special.beta(n + a, b) / special.beta(a, b) + num = special.hyp2f1(c, a + n, a + b + n, -z) + den = special.hyp2f1(c, a, a + b, -z) + return fac * num / den +gausshyper = gausshyper_gen(a=0.0, b=1.0, name='gausshyper') + + +class invgamma_gen(rv_continuous): + + """An inverted gamma continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `invgamma` is:: + + invgamma.pdf(x, a) = x**(-a-1) / gamma(a) * exp(-1/x) + + for x > 0, a > 0. + + `invgamma` is a special case of `gengamma` with ``c == -1``. + + %(example)s + + """ + + def _pdf(self, x, a): + return exp(self._logpdf(x, a)) + + def _logpdf(self, x, a): + return (-(a + 1) * log(x) - gamln(a) - 1.0 / x) + + def _cdf(self, x, a): + return 1.0 - special.gammainc(a, 1.0 / x) + + def _ppf(self, q, a): + return 1.0 / special.gammaincinv(a, 1. - q) + + def _stats(self, a, moments='mvsk'): + m1 = _lazywhere(a > 1, (a,), lambda x: 1. / (x - 1.), np.inf) + m2 = _lazywhere(a > 2, (a,), lambda x: 1. / (x - 1.) ** 2 / (x - 2.), + np.inf) + + g1, g2 = None, None + if 's' in moments: + g1 = _lazywhere( + a > 3, (a,), + lambda x: 4. * np.sqrt(x - 2.) / (x - 3.), np.nan) + if 'k' in moments: + g2 = _lazywhere( + a > 4, (a,), + lambda x: 6. * (5. * x - 11.) / (x - 3.) / (x - 4.), np.nan) + return m1, m2, g1, g2 + + def _entropy(self, a): + return a - (a + 1.0) * special.psi(a) + gamln(a) +invgamma = invgamma_gen(a=0.0, name='invgamma') + + +# scale is gamma from DATAPLOT and B from Regress +class invgauss_gen(rv_continuous): + + """An inverse Gaussian continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `invgauss` is:: + + invgauss.pdf(x, mu) = 1 / sqrt(2*pi*x**3) * exp(-(x-mu)**2/(2*x*mu**2)) + + for ``x > 0``. + + When `mu` is too small, evaluating the cumulative density function will be + inaccurate due to ``cdf(mu -> 0) = inf * 0``. + NaNs are returned for ``mu <= 0.0028``. + + %(example)s + + """ + + def _rvs(self, mu): + return mtrand.wald(mu, 1.0, size=self._size) + + def _pdf(self, x, mu): + return (1.0 / sqrt(2 * pi * x ** 3.0) * + exp(-1.0 / (2 * x) * ((x - mu) / mu) ** 2)) + + def _logpdf(self, x, mu): + return (-0.5 * log(2 * pi) - 1.5 * log(x) - + ((x - mu) / mu) ** 2 / (2 * x)) + + def _cdf(self, x, mu): + fac = sqrt(1.0 / x) + # Numerical accuracy for small `mu` is bad. 
See #869. + C1 = _norm_cdf(fac * (x - mu) / mu) + C1 += exp(1.0 / mu) * _norm_cdf(-fac * (x + mu) / mu) * exp(1.0 / mu) + return C1 + + def _stats(self, mu): + return mu, mu ** 3.0, 3 * sqrt(mu), 15 * mu +invgauss = invgauss_gen(a=0.0, name='invgauss') + + +class invweibull_gen(rv_continuous): + + """An inverted Weibull continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `invweibull` is:: + + invweibull.pdf(x, c) = c * x**(-c-1) * exp(-x**(-c)) + + for ``x > 0``, ``c > 0``. + + References + ---------- + F.R.S. de Gusmao, E.M.M Ortega and G.M. Cordeiro, "The generalized inverse + Weibull distribution", Stat. Papers, vol. 52, pp. 591-619, 2011. + + %(example)s + + """ + + def _pdf(self, x, c): + xc1 = np.power(x, -c - 1.0) + xc2 = np.power(x, -c) + xc2 = exp(-xc2) + return c * xc1 * xc2 + + def _cdf(self, x, c): + xc1 = np.power(x, -c) + return exp(-xc1) + + def _ppf(self, q, c): + return np.power(-log(q), -1.0 / c) + + def _munp(self, n, c): + return special.gamma(1 - n / c) + + def _entropy(self, c): + return 1 + _EULER + _EULER / c - log(c) +invweibull = invweibull_gen(a=0, name='invweibull') + + +class johnsonsb_gen(rv_continuous): + + """A Johnson SB continuous random variable. + + %(before_notes)s + + See Also + -------- + johnsonsu + + Notes + ----- + The probability density function for `johnsonsb` is:: + + johnsonsb.pdf(x, a, b) = b / (x*(1-x)) * phi(a + b * log(x/(1-x))) + + for ``0 < x < 1`` and ``a, b > 0``, and ``phi`` is the normal pdf. + + %(example)s + + """ + + def _argcheck(self, a, b): + return (b > 0) & (a == a) + + def _pdf(self, x, a, b): + trm = _norm_pdf(a + b * log(x / (1.0 - x))) + return b * 1.0 / (x * (1 - x)) * trm + + def _cdf(self, x, a, b): + return _norm_cdf(a + b * log(x / (1.0 - x))) + + def _ppf(self, q, a, b): + return 1.0 / (1 + exp(-1.0 / b * (_norm_ppf(q) - a))) +johnsonsb = johnsonsb_gen(a=0.0, b=1.0, name='johnsonb') + + +class johnsonsu_gen(rv_continuous): + + """A Johnson SU continuous random variable. + + %(before_notes)s + + See Also + -------- + johnsonsb + + Notes + ----- + The probability density function for `johnsonsu` is:: + + johnsonsu.pdf(x, a, b) = b / sqrt(x**2 + 1) * + phi(a + b * log(x + sqrt(x**2 + 1))) + + for all ``x, a, b > 0``, and `phi` is the normal pdf. + + %(example)s + + """ + + def _argcheck(self, a, b): + return (b > 0) & (a == a) + + def _pdf(self, x, a, b): + x2 = x * x + trm = _norm_pdf(a + b * log(x + sqrt(x2 + 1))) + return b * 1.0 / sqrt(x2 + 1.0) * trm + + def _cdf(self, x, a, b): + return _norm_cdf(a + b * log(x + sqrt(x * x + 1))) + + def _ppf(self, q, a, b): + return sinh((_norm_ppf(q) - a) / b) +johnsonsu = johnsonsu_gen(name='johnsonsu') + + +class laplace_gen(rv_continuous): + + """A Laplace continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `laplace` is:: + + laplace.pdf(x) = 1/2 * exp(-abs(x)) + + %(example)s + + """ + + def _rvs(self): + return mtrand.laplace(0, 1, size=self._size) + + def _pdf(self, x): + return 0.5 * exp(-abs(x)) + + def _cdf(self, x): + return where(x > 0, 1.0 - 0.5 * exp(-x), 0.5 * exp(x)) + + def _ppf(self, q): + return where(q > 0.5, -log(2 * (1 - q)), log(2 * q)) + + def _stats(self): + return 0, 2, 0, 3 + + def _entropy(self): + return log(2) + 1 +laplace = laplace_gen(name='laplace') + + +class levy_gen(rv_continuous): + + """A Levy continuous random variable. 
+ + %(before_notes)s + + See Also + -------- + levy_stable, levy_l + + Notes + ----- + The probability density function for `levy` is:: + + levy.pdf(x) = 1 / (x * sqrt(2*pi*x)) * exp(-1/(2*x)) + + for ``x > 0``. + + This is the same as the Levy-stable distribution with a=1/2 and b=1. + + %(example)s + + """ + + def _pdf(self, x): + return 1 / sqrt(2 * pi * x) / x * exp(-1 / (2 * x)) + + def _cdf(self, x): + return 2 * (1 - _norm_cdf(1 / sqrt(x))) + + def _ppf(self, q): + val = _norm_ppf(1 - q / 2.0) + return 1.0 / (val * val) + + def _stats(self): + return inf, inf, nan, nan +levy = levy_gen(a=0.0, name="levy") + + +class levy_l_gen(rv_continuous): + + """A left-skewed Levy continuous random variable. + + %(before_notes)s + + See Also + -------- + levy, levy_stable + + Notes + ----- + The probability density function for `levy_l` is:: + + levy_l.pdf(x) = 1 / (abs(x) * sqrt(2*pi*abs(x))) * exp(-1/(2*abs(x))) + + for ``x < 0``. + + This is the same as the Levy-stable distribution with a=1/2 and b=-1. + + %(example)s + + """ + + def _pdf(self, x): + ax = abs(x) + return 1 / sqrt(2 * pi * ax) / ax * exp(-1 / (2 * ax)) + + def _cdf(self, x): + ax = abs(x) + return 2 * _norm_cdf(1 / sqrt(ax)) - 1 + + def _ppf(self, q): + val = _norm_ppf((q + 1.0) / 2) + return -1.0 / (val * val) + + def _stats(self): + return inf, inf, nan, nan +levy_l = levy_l_gen(b=0.0, name="levy_l") + + +class levy_stable_gen(rv_continuous): + + """A Levy-stable continuous random variable. + + %(before_notes)s + + See Also + -------- + levy, levy_l + + Notes + ----- + Levy-stable distribution (only random variates available -- ignore other + docs) + + %(example)s + + """ + + def _rvs(self, alpha, beta): + sz = self._size + TH = uniform.rvs(loc=-pi / 2.0, scale=pi, size=sz) + W = expon.rvs(size=sz) + if alpha == 1: + return (2. / pi * (pi / 2 + beta * TH) * tan(TH) - + beta * log((pi / 2 * W * cos(TH)) / (pi / 2 + beta * TH))) + + ialpha = 1.0 / alpha + aTH = alpha * TH + if beta == 0: + return (W / (cos(TH) / tan(aTH) + sin(TH)) * + ((cos(aTH) + sin(aTH) * tan(TH)) / W) ** ialpha) + + val0 = beta * tan(pi * alpha / 2) + th0 = arctan(val0) / alpha + val3 = W / (cos(TH) / tan(alpha * (th0 + TH)) + sin(TH)) + res3 = val3 * \ + ((cos(aTH) + sin(aTH) * tan(TH) - val0 * + (sin(aTH) - cos(aTH) * tan(TH))) / W) ** ialpha + return res3 + + def _argcheck(self, alpha, beta): + if beta == -1: + self.b = 0.0 + elif beta == 1: + self.a = 0.0 + return (alpha > 0) & (alpha <= 2) & (beta <= 1) & (beta >= -1) + + def _pdf(self, x, alpha, beta): + raise NotImplementedError +levy_stable = levy_stable_gen(name='levy_stable') + + +class logistic_gen(rv_continuous): + + """A logistic (or Sech-squared) continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `logistic` is:: + + logistic.pdf(x) = exp(-x) / (1+exp(-x))**2 + + `logistic` is a special case of `genlogistic` with ``c == 1``. + + %(example)s + + """ + + def _rvs(self): + return mtrand.logistic(size=self._size) + + def _pdf(self, x): + return exp(self._logpdf(x)) + + def _logpdf(self, x): + return -x - 2. * special.log1p(exp(-x)) + + def _cdf(self, x): + return special.expit(x) + + def _ppf(self, q): + return -log(1.0 / q - 1) + + def _stats(self): + return 0, pi * pi / 3.0, 0, 6.0 / 5.0 + + def _entropy(self): + # http://en.wikipedia.org/wiki/Logistic_distribution + return 2.0 +logistic = logistic_gen(name='logistic') + + +class loggamma_gen(rv_continuous): + + """A log gamma continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + The probability density function for `loggamma` is:: + + loggamma.pdf(x, c) = exp(c*x-exp(x)) / gamma(c) + + for all ``x, c > 0``. + + %(example)s + + """ + + def _rvs(self, c): + return log(mtrand.gamma(c, size=self._size)) + + def _pdf(self, x, c): + return exp(c * x - exp(x) - gamln(c)) + + def _cdf(self, x, c): + return special.gammainc(c, exp(x)) + + def _ppf(self, q, c): + return log(special.gammaincinv(c, q)) + + def _stats(self, c): + # See, for example, "A Statistical Study of Log-Gamma Distribution", by + # Ping Shing Chan (thesis, McMaster University, 1993). + mean = special.digamma(c) + var = special.polygamma(1, c) + skewness = special.polygamma(2, c) / np.power(var, 1.5) + excess_kurtosis = special.polygamma(3, c) / (var * var) + return mean, var, skewness, excess_kurtosis + +loggamma = loggamma_gen(name='loggamma') + + +class loglaplace_gen(rv_continuous): + + """A log-Laplace continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `loglaplace` is:: + + loglaplace.pdf(x, c) = c / 2 * x**(c-1), for 0 < x < 1 + = c / 2 * x**(-c-1), for x >= 1 + + for ``c > 0``. + + References + ---------- + T.J. Kozubowski and K. Podgorski, "A log-Laplace growth rate model", + The Mathematical Scientist, vol. 28, pp. 49-60, 2003. + + %(example)s + + """ + + def _pdf(self, x, c): + cd2 = c / 2.0 + c = where(x < 1, c, -c) + return cd2 * x ** (c - 1) + + def _cdf(self, x, c): + return where(x < 1, 0.5 * x ** c, 1 - 0.5 * x ** (-c)) + + def _ppf(self, q, c): + return where(q < 0.5, (2.0 * q) ** (1.0 / c), + (2 * (1.0 - q)) ** (-1.0 / c)) + + def _munp(self, n, c): + return c ** 2 / (c ** 2 - n ** 2) + + def _entropy(self, c): + return log(2.0 / c) + 1.0 +loglaplace = loglaplace_gen(a=0.0, name='loglaplace') + + +def _lognorm_logpdf(x, s): + return (-log(x) ** 2 / (2 * s ** 2) + np.where(x == 0, 0, + -log(s * x * sqrt(2 * pi)))) + + +class lognorm_gen(rv_continuous): + + """A lognormal continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `lognorm` is:: + + lognorm.pdf(x, s) = 1 / (s*x*sqrt(2*pi)) * exp(-1/2*(log(x)/s)**2) + + for ``x > 0``, ``s > 0``. + + If ``log(x)`` is normally distributed with mean ``mu`` and variance + ``sigma**2``, then ``x`` is log-normally distributed with shape parameter + sigma and scale parameter ``exp(mu)``. + + %(example)s + + """ + + def _rvs(self, s): + return exp(s * mtrand.standard_normal(self._size)) + + def _pdf(self, x, s): + return exp(self._logpdf(x, s)) + + def _logpdf(self, x, s): + return _lognorm_logpdf(x, s) + + def _cdf(self, x, s): + return _norm_cdf(log(x) / s) + + def _ppf(self, q, s): + return exp(s * _norm_ppf(q)) + + def _stats(self, s): + p = exp(s * s) + mu = sqrt(p) + mu2 = p * (p - 1) + g1 = sqrt((p - 1)) * (2 + p) + g2 = np.polyval([1, 2, 3, 0, -6.0], p) + return mu, mu2, g1, g2 + + def _entropy(self, s): + return 0.5 * (1 + log(2 * pi) + 2 * log(s)) + + def _fitstart(self, data): + scale = data.std() + loc = data.min() - 0.001 + logd = log(data - loc) + m = logd.mean() + s = sqrt((logd ** 2).mean() - m ** 2) + return s, loc, scale +lognorm = lognorm_gen(a=0.0, name='lognorm') + + +class gilbrat_gen(rv_continuous): + + """A Gilbrat continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `gilbrat` is:: + + gilbrat.pdf(x) = 1/(x*sqrt(2*pi)) * exp(-1/2*(log(x))**2) + + `gilbrat` is a special case of `lognorm` with ``s = 1``. 
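# A minimal numerical check of the special case noted above (`gilbrat` is
# `lognorm` with ``s = 1``).  This is only a sketch: it assumes the
# module-level `gilbrat` and `lognorm` instances are re-exported by
# wafo.stats the same way scipy.stats exposes them.
import numpy as np
from wafo.stats import gilbrat, lognorm

x = np.linspace(0.1, 5.0, 50)
print(np.allclose(gilbrat.pdf(x), lognorm.pdf(x, 1.0)))  # expected: True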
+ + %(example)s + + """ + + def _rvs(self): + return exp(mtrand.standard_normal(self._size)) + + def _pdf(self, x): + return exp(self._logpdf(x)) + + def _logpdf(self, x): + return _lognorm_logpdf(x, 1.0) + + def _cdf(self, x): + return _norm_cdf(log(x)) + + def _ppf(self, q): + return exp(_norm_ppf(q)) + + def _stats(self): + p = np.e + mu = sqrt(p) + mu2 = p * (p - 1) + g1 = sqrt((p - 1)) * (2 + p) + g2 = np.polyval([1, 2, 3, 0, -6.0], p) + return mu, mu2, g1, g2 + + def _entropy(self): + return 0.5 * log(2 * pi) + 0.5 + + def _fitstart(self, data): + scale = data.std() + loc = data.min() - 0.001 + return loc, scale +gilbrat = gilbrat_gen(a=0.0, name='gilbrat') + + +class maxwell_gen(rv_continuous): + + """A Maxwell continuous random variable. + + %(before_notes)s + + Notes + ----- + A special case of a `chi` distribution, with ``df = 3``, ``loc = 0.0``, + and given ``scale = a``, where ``a`` is the parameter used in the + Mathworld description [1]_. + + The probability density function for `maxwell` is:: + + maxwell.pdf(x) = sqrt(2/pi)x**2 * exp(-x**2/2) + + for ``x > 0``. + + References + ---------- + .. [1] http://mathworld.wolfram.com/MaxwellDistribution.html + + %(example)s + """ + + def _rvs(self): + return chi.rvs(3.0, size=self._size) + + def _pdf(self, x): + return sqrt(2.0 / pi) * x * x * exp(-x * x / 2.0) + + def _cdf(self, x): + return special.gammainc(1.5, x * x / 2.0) + + def _ppf(self, q): + return sqrt(2 * special.gammaincinv(1.5, q)) + + def _stats(self): + val = 3 * pi - 8 + return ( + 2 * sqrt(2.0 / pi), 3 - 8 / pi, sqrt( + 2) * (32 - 10 * pi) / val ** 1.5, + (-12 * pi * pi + 160 * pi - 384) / val ** 2.0) + + def _entropy(self): + return _EULER + 0.5 * log(2 * pi) - 0.5 +maxwell = maxwell_gen(a=0.0, name='maxwell') + + +class mielke_gen(rv_continuous): + + """A Mielke's Beta-Kappa continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `mielke` is:: + + mielke.pdf(x, k, s) = k * x**(k-1) / (1+x**s)**(1+k/s) + + for ``x > 0``. + + %(example)s + + """ + + def _pdf(self, x, k, s): + return k * x ** (k - 1.0) / (1.0 + x ** s) ** (1.0 + k * 1.0 / s) + + def _cdf(self, x, k, s): + return x ** k / (1.0 + x ** s) ** (k * 1.0 / s) + + def _ppf(self, q, k, s): + qsk = pow(q, s * 1.0 / k) + return pow(qsk / (1.0 - qsk), 1.0 / s) +mielke = mielke_gen(a=0.0, name='mielke') + + +class nakagami_gen(rv_continuous): + + """A Nakagami continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `nakagami` is:: + + nakagami.pdf(x, nu) = 2 * nu**nu / gamma(nu) * + x**(2*nu-1) * exp(-nu*x**2) + + for ``x > 0``, ``nu > 0``. + + %(example)s + + """ + + def _pdf(self, x, nu): + return (2 * nu ** nu / gam(nu) * (x ** (2 * nu - 1.0)) * + exp(-nu * x * x)) + + def _cdf(self, x, nu): + return special.gammainc(nu, nu * x * x) + + def _ppf(self, q, nu): + return sqrt(1.0 / nu * special.gammaincinv(nu, q)) + + def _stats(self, nu): + mu = gam(nu + 0.5) / gam(nu) / sqrt(nu) + mu2 = 1.0 - mu * mu + g1 = mu * (1 - 4 * nu * mu2) / 2.0 / nu / np.power(mu2, 1.5) + g2 = -6 * mu ** 4 * nu + (8 * nu - 2) * mu ** 2 - 2 * nu + 1 + g2 /= nu * mu2 ** 2.0 + return mu, mu2, g1, g2 +nakagami = nakagami_gen(a=0.0, name="nakagami") + + +class ncx2_gen(rv_continuous): + + """A non-central chi-squared continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + The probability density function for `ncx2` is:: + + ncx2.pdf(x, df, nc) = exp(-(nc+df)/2) * 1/2 * (x/nc)**((df-2)/4) + * I[(df-2)/2](sqrt(nc*x)) + + for ``x > 0``. + + %(example)s + + """ + + def _rvs(self, df, nc): + return mtrand.noncentral_chisquare(df, nc, self._size) + + def _logpdf(self, x, df, nc): + return _ncx2_log_pdf(x, df, nc) + + def _pdf(self, x, df, nc): + return _ncx2_pdf(x, df, nc) + + def _cdf(self, x, df, nc): + return _ncx2_cdf(x, df, nc) + + def _ppf(self, q, df, nc): + return special.chndtrix(q, df, nc) + + def _stats(self, df, nc): + val = df + 2.0 * nc + return (df + nc, 2 * val, sqrt(8) * (val + nc) / val ** 1.5, + 12.0 * (val + 2 * nc) / val ** 2.0) + + def _fitstart(self, data): + m = data.mean() + v = data.var() + # Supply a starting guess with method of moments: + nc = (v / 2 - m) / 2 + df = m - nc + return super(ncx2_gen, self)._fitstart(data, args=(df, nc)) +ncx2 = ncx2_gen(a=0.0, name='ncx2') + + +class ncf_gen(rv_continuous): + + """A non-central F distribution continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `ncf` is:: + + ncf.pdf(x, df1, df2, nc) = exp(nc/2 + nc*df1*x/(2*(df1*x+df2))) + * df1**(df1/2) * df2**(df2/2) * x**(df1/2-1) + * (df2+df1*x)**(-(df1+df2)/2) + * gamma(df1/2)*gamma(1+df2/2) + * L^{v1/2-1}^{v2/2}(-nc*v1*x/(2*(v1*x+v2))) + / (B(v1/2, v2/2) * gamma((v1+v2)/2)) + + for ``df1, df2, nc > 0``. + + %(example)s + + """ + + def _rvs(self, dfn, dfd, nc): + return mtrand.noncentral_f(dfn, dfd, nc, self._size) + + def _pdf_skip(self, x, dfn, dfd, nc): + n1, n2 = dfn, dfd + term = -nc / 2 + nc * n1 * x / \ + (2 * (n2 + n1 * x)) + gamln(n1 / 2.) + gamln(1 + n2 / 2.) + term -= gamln((n1 + n2) / 2.0) + Px = exp(term) + Px *= n1 ** (n1 / 2) * n2 ** (n2 / 2) * x ** (n1 / 2 - 1) + Px *= (n2 + n1 * x) ** (-(n1 + n2) / 2) + Px *= special.assoc_laguerre( + -nc * n1 * x / (2.0 * (n2 + n1 * x)), n2 / 2, n1 / 2 - 1) + Px /= special.beta(n1 / 2, n2 / 2) + # this function does not have a return + # drop it for now, the generic function seems to work ok + + def _cdf(self, x, dfn, dfd, nc): + return special.ncfdtr(dfn, dfd, nc, x) + + def _ppf(self, q, dfn, dfd, nc): + return special.ncfdtri(dfn, dfd, nc, q) + + def _munp(self, n, dfn, dfd, nc): + val = (dfn * 1.0 / dfd) ** n + term = gamln(n + 0.5 * dfn) + gamln(0.5 * dfd - n) - gamln(dfd * 0.5) + val *= exp(-nc / 2.0 + term) + val *= special.hyp1f1(n + 0.5 * dfn, 0.5 * dfn, 0.5 * nc) + return val + + def _stats(self, dfn, dfd, nc): + mu = where(dfd <= 2, inf, dfd / (dfd - 2.0) * (1 + nc * 1.0 / dfn)) + mu2 = where(dfd <= 4, inf, 2 * (dfd * 1.0 / dfn) ** 2.0 * + ((dfn + nc / 2.0) ** 2.0 + (dfn + nc) * (dfd - 2.0)) / + ((dfd - 2.0) ** 2.0 * (dfd - 4.0))) + return mu, mu2, None, None +ncf = ncf_gen(a=0.0, name='ncf') + + +class t_gen(rv_continuous): + + """A Student's T continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `t` is:: + + gamma((df+1)/2) + t.pdf(x, df) = --------------------------------------------------- + sqrt(pi*df) * gamma(df/2) * (1+x**2/df)**((df+1)/2) + + for ``df > 0``. 
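# A quick numerical check of the Student's t density formula shown above,
# written as a standalone sketch (assumes numpy/scipy only; `t` is taken
# from wafo.stats, which mirrors the scipy.stats interface).
import numpy as np
from scipy.special import gamma as gam_fn
from wafo.stats import t

x, df = np.linspace(-4.0, 4.0, 9), 5.0
direct = (gam_fn((df + 1) / 2) /
          (np.sqrt(np.pi * df) * gam_fn(df / 2) *
           (1 + x ** 2 / df) ** ((df + 1) / 2)))
print(np.allclose(t.pdf(x, df), direct))  # expected: True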
+ + %(example)s + + """ + + def _rvs(self, df): + return mtrand.standard_t(df, size=self._size) + + def _pdf(self, x, df): + r = asarray(df * 1.0) + Px = exp(gamln((r + 1) / 2) - gamln(r / 2)) + Px /= sqrt(r * pi) * (1 + (x ** 2) / r) ** ((r + 1) / 2) + return Px + + def _logpdf(self, x, df): + r = df * 1.0 + lPx = gamln((r + 1) / 2) - gamln(r / 2) + lPx -= 0.5 * log(r * pi) + (r + 1) / 2 * log1p((x ** 2) / r) + return lPx + + def _cdf(self, x, df): + return special.stdtr(df, x) + + def _sf(self, x, df): + return special.stdtr(df, -x) + + def _ppf(self, q, df): + return special.stdtrit(df, q) + + def _isf(self, q, df): + return -special.stdtrit(df, q) + + def _stats(self, df): + mu2 = where(df > 2, df / (df - 2.0), inf) + g1 = where(df > 3, 0.0, nan) + g2 = where(df > 4, 6.0 / (df - 4.0), nan) + return 0, mu2, g1, g2 +t = t_gen(name='t') + + +class nct_gen(rv_continuous): + + """A non-central Student's T continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `nct` is:: + + df**(df/2) * gamma(df+1) + nct.pdf(x, df, nc) = ---------------------------------------------------- + 2**df*exp(nc**2/2) * (df+x**2)**(df/2) * gamma(df/2) + + for ``df > 0``. + + %(example)s + + """ + + def _argcheck(self, df, nc): + return (df > 0) & (nc == nc) + + def _rvs(self, df, nc): + return (norm.rvs(loc=nc, size=self._size) * sqrt(df) / + sqrt(chi2.rvs(df, size=self._size))) + + def _pdf(self, x, df, nc): + n = df * 1.0 + nc = nc * 1.0 + x2 = x * x + ncx2 = nc * nc * x2 + fac1 = n + x2 + trm1 = n / 2. * log(n) + gamln(n + 1) + trm1 -= n * \ + log(2) + nc * nc / 2. + (n / 2.) * log(fac1) + gamln(n / 2.) + Px = exp(trm1) + valF = ncx2 / (2 * fac1) + trm1 = sqrt(2) * nc * x * special.hyp1f1(n / 2 + 1, 1.5, valF) + trm1 /= asarray(fac1 * special.gamma((n + 1) / 2)) + trm2 = special.hyp1f1((n + 1) / 2, 0.5, valF) + trm2 /= asarray(sqrt(fac1) * special.gamma(n / 2 + 1)) + Px *= trm1 + trm2 + return Px + + def _cdf(self, x, df, nc): + return special.nctdtr(df, nc, x) + + def _ppf(self, q, df, nc): + return special.nctdtrit(df, nc, q) + + def _stats(self, df, nc, moments='mv'): + # + # See D. Hogben, R.S. Pinkham, and M.B. Wilk, + # 'The moments of the non-central t-distribution' + # Biometrika 48, p. 465 (2961). + # e.g. http://www.jstor.org/stable/2332772 (gated) + # + _mu, _mu2, g1, g2 = None, None, None, None + + gfac = gam(df / 2. - 0.5) / gam(df / 2.) + c11 = sqrt(df / 2.) * gfac + c20 = df / (df - 2.) + c22 = c20 - c11 * c11 + mu = np.where(df > 1, nc * c11, np.inf) + mu2 = np.where(df > 2, c22 * nc * nc + c20, np.inf) + if 's' in moments: + c33t = df * (7. - 2. * df) / (df - 2.) / (df - 3.) + 2. * c11 * c11 + c31t = 3. * df / (df - 2.) / (df - 3.) + mu3 = (c33t * nc * nc + c31t) * c11 * nc + g1 = np.where(df > 3, mu3 / np.power(mu2, 1.5), np.nan) + # kurtosis + if 'k' in moments: + c44 = df * df / (df - 2.) / (df - 4.) + c44 -= c11 * c11 * 2. * df * (5. - df) / (df - 2.) / (df - 3.) + c44 -= 3. * c11 ** 4 + c42 = df / (df - 4.) - c11 * c11 * (df - 1.) / (df - 3.) + c42 *= 6. * df / (df - 2.) + c40 = 3. * df * df / (df - 2.) / (df - 4.) + + mu4 = c44 * nc ** 4 + c42 * nc ** 2 + c40 + g2 = np.where(df > 4, mu4 / mu2 ** 2 - 3., np.nan) + return mu, mu2, g1, g2 +nct = nct_gen(name="nct") + + +class pareto_gen(rv_continuous): + + """A Pareto continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `pareto` is:: + + pareto.pdf(x, b) = b / x**(b+1) + + for ``x >= 1``, ``b > 0``. 
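# Sketch relating the Pareto density above to the moment code that follows:
# for b > 1 the mean is b/(b-1) and for b > 2 the variance is
# b/((b-2)*(b-1)**2).  Assumes `pareto` is available from wafo.stats.
from wafo.stats import pareto

b = 3.0
print(pareto.stats(b, moments='mv'))  # expected: (1.5, 0.75)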
+ + %(example)s + + """ + + def _pdf(self, x, b): + return b * x ** (-b - 1) + + def _cdf(self, x, b): + return 1 - x ** (-b) + + def _ppf(self, q, b): + return pow(1 - q, -1.0 / b) + + def _stats(self, b, moments='mv'): + mu, mu2, g1, g2 = None, None, None, None + if 'm' in moments: + mask = b > 1 + bt = extract(mask, b) + mu = valarray(shape(b), value=inf) + place(mu, mask, bt / (bt - 1.0)) + if 'v' in moments: + mask = b > 2 + bt = extract(mask, b) + mu2 = valarray(shape(b), value=inf) + place(mu2, mask, bt / (bt - 2.0) / (bt - 1.0) ** 2) + if 's' in moments: + mask = b > 3 + bt = extract(mask, b) + g1 = valarray(shape(b), value=nan) + vals = 2 * (bt + 1.0) * sqrt(bt - 2.0) / ((bt - 3.0) * sqrt(bt)) + place(g1, mask, vals) + if 'k' in moments: + mask = b > 4 + bt = extract(mask, b) + g2 = valarray(shape(b), value=nan) + vals = (6.0 * polyval([1.0, 1.0, -6, -2], bt) / + polyval([1.0, -7.0, 12.0, 0.0], bt)) + place(g2, mask, vals) + return mu, mu2, g1, g2 + + def _entropy(self, c): + return 1 + 1.0 / c - log(c) +pareto = pareto_gen(a=1.0, name="pareto") + + +class lomax_gen(rv_continuous): + + """A Lomax (Pareto of the second kind) continuous random variable. + + %(before_notes)s + + Notes + ----- + The Lomax distribution is a special case of the Pareto distribution, with + (loc=-1.0). + + The probability density function for `lomax` is:: + + lomax.pdf(x, c) = c / (1+x)**(c+1) + + for ``x >= 0``, ``c > 0``. + + %(example)s + + """ + + def _pdf(self, x, c): + return c * 1.0 / (1.0 + x) ** (c + 1.0) + + def _logpdf(self, x, c): + return log(c) - (c + 1) * log1p(x) + + def _cdf(self, x, c): + return 1.0 - 1.0 / (1.0 + x) ** c + + def _sf(self, x, c): + return 1.0 / (1.0 + x) ** c + + def _logsf(self, x, c): + return -c * log1p(x) + + def _ppf(self, q, c): + return pow(1.0 - q, -1.0 / c) - 1 + + def _stats(self, c): + mu, mu2, g1, g2 = pareto.stats(c, loc=-1.0, moments='mvsk') + return mu, mu2, g1, g2 + + def _entropy(self, c): + return 1 + 1.0 / c - log(c) +lomax = lomax_gen(a=0.0, name="lomax") + + +class pearson3_gen(rv_continuous): + + """A pearson type III continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `pearson3` is:: + + pearson3.pdf(x, skew) = abs(beta) / gamma(alpha) * + (beta * (x - zeta))**(alpha - 1) * exp(-beta*(x - zeta)) + + where:: + + beta = 2 / (skew * stddev) + alpha = (stddev * beta)**2 + zeta = loc - alpha / beta + + %(example)s + + References + ---------- + R.W. Vogel and D.E. McMartin, "Probability Plot Goodness-of-Fit and + Skewness Estimation Procedures for the Pearson Type 3 Distribution", Water + Resources Research, Vol.27, 3149-3158 (1991). + + L.R. Salvosa, "Tables of Pearson's Type III Function", Ann. Math. Statist., + Vol.1, 191-198 (1930). + + "Using Modern Computing Tools to Fit the Pearson Type III Distribution to + Aviation Loads Data", Office of Aviation Research (2003). + + """ + + def _preprocess(self, x, skew): + # The real 'loc' and 'scale' are handled in the calling pdf(...). The + # local variables 'loc' and 'scale' within pearson3._pdf are set to + # the defaults just to keep them as part of the equations for + # documentation. + loc = 0.0 + scale = 1.0 + + # If skew is small, return _norm_pdf. The divide between pearson3 + # and norm was found by brute force and is approximately a skew of + # 0.000016. No one, I hope, would actually use a skew value even + # close to this small. 
+ norm2pearson_transition = 0.000016 + + ans, x, skew = np.broadcast_arrays([1.0], x, skew) + ans = ans.copy() + + mask = np.absolute(skew) < norm2pearson_transition + invmask = ~mask + + beta = 2.0 / (skew[invmask] * scale) + alpha = (scale * beta) ** 2 + zeta = loc - alpha / beta + + transx = beta * (x[invmask] - zeta) + return ans, x, transx, skew, mask, invmask, beta, alpha, zeta + + def _argcheck(self, skew): + # The _argcheck function in rv_continuous only allows positive + # arguments. The skew argument for pearson3 can be zero (which I want + # to handle inside pearson3._pdf) or negative. So just return True + # for all skew args. + return np.ones(np.shape(skew), dtype=bool) + + def _stats(self, skew): + _, _x, _transx, skew, _mask, _invmask, beta, alpha, zeta = ( + self._preprocess([1], skew)) + m = zeta + alpha / beta + v = alpha / (beta ** 2) + s = 2.0 / (alpha ** 0.5) * np.sign(beta) + k = 6.0 / alpha + return m, v, s, k + + def _pdf(self, x, skew): + # Do the calculation in _logpdf since helps to limit + # overflow/underflow problems + ans = exp(self._logpdf(x, skew)) + if ans.ndim == 0: + if np.isnan(ans): + return 0.0 + return ans + ans[np.isnan(ans)] = 0.0 + return ans + + def _logpdf(self, x, skew): + # PEARSON3 logpdf GAMMA logpdf + # np.log(abs(beta)) + # + (alpha - 1)*log(beta*(x - zeta)) + (a - 1)*log(x) + # - beta*(x - zeta) - x + # - gamln(alpha) - gamln(a) + ans, x, transx, skew, mask, invmask, beta, alpha, _zeta = ( + self._preprocess(x, skew)) + + ans[mask] = np.log(_norm_pdf(x[mask])) + ans[invmask] = log(abs(beta)) + gamma._logpdf(transx, alpha) + return ans + + def _cdf(self, x, skew): + ans, x, transx, skew, mask, invmask, _beta, alpha, _zeta = ( + self._preprocess(x, skew)) + + ans[mask] = _norm_cdf(x[mask]) + ans[invmask] = gamma._cdf(transx, alpha) + return ans + + def _rvs(self, skew): + _ans, _x, _transx, skew, mask, _invmask, beta, alpha, zeta = ( + self._preprocess([0], skew)) + if mask[0]: + return mtrand.standard_normal(self._size) + ans = mtrand.standard_gamma(alpha, self._size) / beta + zeta + if ans.size == 1: + return ans[0] + return ans + + def _ppf(self, q, skew): + ans, q, _transq, skew, mask, invmask, beta, alpha, zeta = ( + self._preprocess(q, skew)) + ans[mask] = _norm_ppf(q[mask]) + ans[invmask] = special.gammaincinv(alpha, q[invmask]) / beta + zeta + return ans +pearson3 = pearson3_gen(name="pearson3") + + +class powerlaw_gen(rv_continuous): + + """A power-function continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `powerlaw` is:: + + powerlaw.pdf(x, a) = a * x**(a-1) + + for ``0 <= x <= 1``, ``a > 0``. + + `powerlaw` is a special case of `beta` with ``d == 1``. + + %(example)s + + """ + + def _pdf(self, x, a): + return a * x ** (a - 1.0) + + def _logpdf(self, x, a): + return log(a) + (a - 1) * log(x) + + def _cdf(self, x, a): + return x ** (a * 1.0) + + def _logcdf(self, x, a): + return a * log(x) + + def _ppf(self, q, a): + return pow(q, 1.0 / a) + + def _stats(self, a): + return (a / (a + 1.0), + a / (a + 2.0) / (a + 1.0) ** 2, + -2.0 * ((a - 1.0) / (a + 3.0)) * sqrt((a + 2.0) / a), + 6 * polyval([1, -1, -6, 2], a) / (a * (a + 3.0) * (a + 4))) + + def _entropy(self, a): + return 1 - 1.0 / a - log(a) +powerlaw = powerlaw_gen(a=0.0, b=1.0, name="powerlaw") + + +class powerlognorm_gen(rv_continuous): + + """A power log-normal continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + The probability density function for `powerlognorm` is:: + + powerlognorm.pdf(x, c, s) = c / (x*s) * phi(log(x)/s) * + (Phi(-log(x)/s))**(c-1), + + where ``phi`` is the normal pdf, and ``Phi`` is the normal cdf, + and ``x > 0``, ``s, c > 0``. + + %(example)s + + """ + + def _pdf(self, x, c, s): + return (c / (x * s) * _norm_pdf(log(x) / s) * + pow(_norm_cdf(-log(x) / s), c * 1.0 - 1.0)) + + def _cdf(self, x, c, s): + return 1.0 - pow(_norm_cdf(-log(x) / s), c * 1.0) + + def _ppf(self, q, c, s): + return exp(-s * _norm_ppf(pow(1.0 - q, 1.0 / c))) +powerlognorm = powerlognorm_gen(a=0.0, name="powerlognorm") + + +class powernorm_gen(rv_continuous): + + """A power normal continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `powernorm` is:: + + powernorm.pdf(x, c) = c * phi(x) * (Phi(-x))**(c-1) + + where ``phi`` is the normal pdf, and ``Phi`` is the normal cdf, + and ``x > 0``, ``c > 0``. + + %(example)s + + """ + + def _pdf(self, x, c): + return (c * _norm_pdf(x) * (_norm_cdf(-x) ** (c - 1.0))) + + def _logpdf(self, x, c): + return log(c) + _norm_logpdf(x) + (c - 1) * _norm_logcdf(-x) + + def _cdf(self, x, c): + return 1.0 - _norm_cdf(-x) ** (c * 1.0) + + def _ppf(self, q, c): + return -_norm_ppf(pow(1.0 - q, 1.0 / c)) +powernorm = powernorm_gen(name='powernorm') + + +class rdist_gen(rv_continuous): + + """An R-distributed continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `rdist` is:: + + rdist.pdf(x, c) = (1-x**2)**(c/2-1) / B(1/2, c/2) + + for ``-1 <= x <= 1``, ``c > 0``. + + %(example)s + + """ + + def _pdf(self, x, c): + return (np.power((1.0 - x ** 2), c / 2.0 - 1) / + special.beta(0.5, c / 2.0)) + + def _cdf(self, x, c): + term1 = x / special.beta(0.5, c / 2.0) + res = 0.5 + term1 * special.hyp2f1(0.5, 1 - c / 2.0, 1.5, x ** 2) + # There's an issue with hyp2f1, it returns nans near x = +-1, c > 100. + # Use the generic implementation in that case. See gh-1285 for + # background. + if any(np.isnan(res)): + return rv_continuous._cdf(self, x, c) + return res + + def _munp(self, n, c): + numerator = (1 - (n % 2)) * special.beta((n + 1.0) / 2, c / 2.0) + return numerator / special.beta(1. / 2, c / 2.) +rdist = rdist_gen(a=-1.0, b=1.0, name="rdist") + + +class rayleigh_gen(rv_continuous): + + """A Rayleigh continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `rayleigh` is:: + + rayleigh.pdf(r) = r * exp(-r**2/2) + + for ``x >= 0``. + + `rayleigh` is a special case of `chi` with ``df == 2``. + + %(example)s + + """ + + def link(self, x, logSF, phat, ix): + rv_continuous.link.__doc__ + if ix == 1: + return x - phat[0] / sqrt(-2.0 * logSF) + else: + return x - phat[1] * sqrt(-2.0 * logSF) + + def _rvs(self): + return chi.rvs(2, size=self._size) + + def _pdf(self, r): + return exp(self._logpdf(r)) + + def _logpdf(self, r): + rr2 = r * r / 2.0 + return where(rr2 == inf, - rr2, log(r) - rr2) + + def _cdf(self, r): + return - expm1(-r * r / 2.0) + + def _sf(self, r): + return exp(-r * r / 2.0) + + def _ppf(self, q): + return sqrt(-2 * log1p(-q)) + + def _stats(self): + val = 4 - pi + return (np.sqrt(pi / 2), val / 2, 2 * (pi - 3) * sqrt(pi) / val ** 1.5, + 6 * pi / val - 16 / val ** 2) + + def _entropy(self): + return _EULER / 2.0 + 1 - 0.5 * log(2) +rayleigh = rayleigh_gen(a=0.0, name="rayleigh") + + +class truncrayleigh_gen(rv_continuous): + + """A truncated Rayleigh continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + The probability density function for `truncrayleigh` is:: + + truncrayleigh.cdf(r) = 1 - exp(-((r+c)**2-c**2)/2) + + for ``x >= 0, c>=0``. + + %(example)s + + """ + + def _argcheck(self, c): + return (c >= 0) + + def link(self, x, logSF, phat, ix): + rv_continuous.link.__doc__ + c = phat[0] + if ix == 2: + return x - phat[1] / (sqrt(c * c - 2 * logSF) - c) + elif ix == 1: + return x - phat[2] * (sqrt(c * c - 2 * logSF) - c) + elif ix == 0: + xn = (x - phat[1]) / phat[2] + return - 2 * logSF / xn - xn / 2.0 + + def _fitstart(self, data, args=None): + if args is None: + args = (0.0,) * self.numargs + return args + self.fit_loc_scale(data, *args) + + def _pdf(self, r, c): + rc = r + c + return rc * exp(-(rc * rc - c * c) / 2.0) + + def _logpdf(self, r, c): + rc = r + c + return log(rc) - (rc * rc - c * c) / 2.0 + + def _cdf(self, r, c): + rc = r + c + return - expm1(-(rc * rc - c * c) / 2.0) + + def _logsf(self, r, c): + rc = r + c + return -(rc * rc - c * c) / 2.0 + + def _sf(self, r, c): + return exp(self._logsf(r, c)) + + def _ppf(self, q, c): + return sqrt(c * c - 2 * log1p(-q)) - c + + def _stats(self, c): + # TODO: correct this it is wrong! + val = 4 - pi + return (np.sqrt(pi / 2), + val / 2, + 2 * (pi - 3) * sqrt(pi) / val ** 1.5, + 6 * pi / val - 16 / val ** 2) + + def _entropy(self, c): + # TODO: correct this it is wrong! + return _EULER / 2.0 + 1 - 0.5 * log(2) +truncrayleigh = truncrayleigh_gen(a=0.0, name="truncrayleigh", shapes='c') + +# Reciprocal Distribution + + +class reciprocal_gen(rv_continuous): + + """A reciprocal continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `reciprocal` is:: + + reciprocal.pdf(x, a, b) = 1 / (x*log(b/a)) + + for ``a <= x <= b``, ``a, b > 0``. + + %(example)s + + """ + + def _argcheck(self, a, b): + self.a = a + self.b = b + self.d = log(b * 1.0 / a) + return (a > 0) & (b > 0) & (b > a) + + def _pdf(self, x, a, b): + return 1.0 / (x * self.d) + + def _logpdf(self, x, a, b): + return -log(x) - log(self.d) + + def _cdf(self, x, a, b): + return (log(x) - log(a)) / self.d + + def _ppf(self, q, a, b): + return a * pow(b * 1.0 / a, q) + + def _munp(self, n, a, b): + return 1.0 / self.d / n * (pow(b * 1.0, n) - pow(a * 1.0, n)) + + def _entropy(self, a, b): + return 0.5 * log(a * b) + log(log(b / a)) +reciprocal = reciprocal_gen(name="reciprocal") + + +# FIXME: PPF does not work. +class rice_gen(rv_continuous): + + """A Rice continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `rice` is:: + + rice.pdf(x, b) = x * exp(-(x**2+b**2)/2) * I[0](x*b) + + for ``x > 0``, ``b > 0``. + + %(example)s + + """ + + def _argcheck(self, b): + return b >= 0 + + def _rvs(self, b): + # http://en.wikipedia.org/wiki/Rice_distribution + sz = self._size if self._size else 1 + t = b / np.sqrt(2) + mtrand.standard_normal(size=(2, sz)) + return np.sqrt((t * t).sum(axis=0)) + + def _pdf(self, x, b): + return x * exp(-(x - b) * (x - b) / 2.0) * special.i0e(x * b) + + def _munp(self, n, b): + nd2 = n / 2.0 + n1 = 1 + nd2 + b2 = b * b / 2.0 + return (2.0 ** (nd2) * exp(-b2) * special.gamma(n1) * + special.hyp1f1(n1, 1, b2)) +rice = rice_gen(a=0.0, name="rice") + + +# FIXME: PPF does not work. +class recipinvgauss_gen(rv_continuous): + + """A reciprocal inverse Gaussian continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + The probability density function for `recipinvgauss` is:: + + recipinvgauss.pdf(x, mu) = 1/sqrt(2*pi*x) * exp(-(1-mu*x)**2/(2*x*mu**2)) + + for ``x >= 0``. + + %(example)s + + """ + + def _rvs(self, mu): + return 1.0 / mtrand.wald(mu, 1.0, size=self._size) + + def _pdf(self, x, mu): + return (1.0 / sqrt(2 * pi * x) * exp(-(1 - mu * x) ** 2.0 / + (2 * x * mu ** 2.0))) + + def _logpdf(self, x, mu): + return (-(1 - mu * x) ** 2.0 / (2 * x * mu ** 2.0) - + 0.5 * log(2 * pi * x)) + + def _cdf(self, x, mu): + trm1 = 1.0 / mu - x + trm2 = 1.0 / mu + x + isqx = 1.0 / sqrt(x) + return (1.0 - _norm_cdf(isqx * trm1) - + exp(2.0 / mu) * _norm_cdf(-isqx * trm2)) +recipinvgauss = recipinvgauss_gen(a=0.0, name='recipinvgauss') + + +class semicircular_gen(rv_continuous): + + """A semicircular continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `semicircular` is:: + + semicircular.pdf(x) = 2/pi * sqrt(1-x**2) + + for ``-1 <= x <= 1``. + + %(example)s + + """ + + def _pdf(self, x): + return 2.0 / pi * sqrt(1 - x * x) + + def _cdf(self, x): + return 0.5 + 1.0 / pi * (x * sqrt(1 - x * x) + arcsin(x)) + + def _stats(self): + return 0, 0.25, 0, -1.0 + + def _entropy(self): + return 0.64472988584940017414 +semicircular = semicircular_gen(a=-1.0, b=1.0, name="semicircular") + + +class triang_gen(rv_continuous): + + """A triangular continuous random variable. + + %(before_notes)s + + Notes + ----- + The triangular distribution can be represented with an up-sloping line from + ``loc`` to ``(loc + c*scale)`` and then downsloping for ``(loc + c*scale)`` + to ``(loc+scale)``. + + The standard form is in the range [0, 1] with c the mode. + The location parameter shifts the start to `loc`. + The scale parameter changes the width from 1 to `scale`. + + %(example)s + + """ + + def _rvs(self, c): + return mtrand.triangular(0, c, 1, self._size) + + def _argcheck(self, c): + return (c >= 0) & (c <= 1) + + def _pdf(self, x, c): + return where(x < c, 2 * x / c, 2 * (1 - x) / (1 - c)) + + def _cdf(self, x, c): + return where(x < c, x * x / c, (x * x - 2 * x + c) / (c - 1)) + + def _ppf(self, q, c): + return where(q < c, sqrt(c * q), 1 - sqrt((1 - c) * (1 - q))) + + def _stats(self, c): + return ((c + 1.0) / 3.0, + (1.0 - c + c * c) / 18, + sqrt(2) * (2 * c - 1) * (c + 1) * (c - 2) / + (5 * np.power((1.0 - c + c * c), 1.5)), + -3.0 / 5.0) + + def _entropy(self, c): + return 0.5 - log(2) +triang = triang_gen(a=0.0, b=1.0, name="triang") + + +class truncexpon_gen(rv_continuous): + + """A truncated exponential continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `truncexpon` is:: + + truncexpon.pdf(x, b) = exp(-x) / (1-exp(-b)) + + for ``0 < x < b``. 
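# Sketch: the truncated-exponential density above integrates to one on
# (0, b); a direct trapezoidal check using only numpy.
import numpy as np

b = 2.5
x = np.linspace(0.0, b, 10001)
pdf = np.exp(-x) / (1.0 - np.exp(-b))
print(np.trapz(pdf, x))  # expected: ~1.0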
+ + %(example)s + + """ + + def _argcheck(self, b): + self.b = b + return (b > 0) + + def _pdf(self, x, b): + return exp(-x) / (-expm1(-b)) + + def _logpdf(self, x, b): + return - x - log(-expm1(-b)) + + def _cdf(self, x, b): + return expm1(-x) / expm1(-b) + + def _ppf(self, q, b): + return - log1p(q * expm1(-b)) + + def _munp(self, n, b): + # wrong answer with formula, same as in continuous.pdf + # return gam(n+1)-special.gammainc(1+n, b) + if n == 1: + return (1 - (b + 1) * exp(-b)) / (-expm1(-b)) + elif n == 2: + return 2 * (1 - 0.5 * (b * b + 2 * b + 2) * exp(-b)) / (-expm1(-b)) + else: + # return generic for higher moments + # return rv_continuous._mom1_sc(self, n, b) + return self._mom1_sc(n, b) + + def _entropy(self, b): + eB = exp(b) + return log(eB - 1) + (1 + eB * (b - 1.0)) / (1.0 - eB) +truncexpon = truncexpon_gen(a=0.0, name='truncexpon') + + +class truncnorm_gen(rv_continuous): + + """A truncated normal continuous random variable. + + %(before_notes)s + + Notes + ----- + The standard form of this distribution is a standard normal truncated to + the range [a, b] --- notice that a and b are defined over the domain of the + standard normal. To convert clip values for a specific mean and standard + deviation, use:: + + a, b = (myclip_a - my_mean) / my_std, (myclip_b - my_mean) / my_std + + %(example)s + + """ + + def _argcheck(self, a, b): + self.a = a + self.b = b + self._nb = _norm_cdf(b) + self._na = _norm_cdf(a) + self._sb = _norm_sf(b) + self._sa = _norm_sf(a) + if self.a > 0: + self._delta = -(self._sb - self._sa) + else: + self._delta = self._nb - self._na + self._logdelta = log(self._delta) + return (a != b) + + def _pdf(self, x, a, b): + return _norm_pdf(x) / self._delta + + def _logpdf(self, x, a, b): + return _norm_logpdf(x) - self._logdelta + + def _cdf(self, x, a, b): + return (_norm_cdf(x) - self._na) / self._delta + + def _ppf(self, q, a, b): + if self.a > 0: + return _norm_isf(q * self._sb + self._sa * (1.0 - q)) + else: + return _norm_ppf(q * self._nb + self._na * (1.0 - q)) + + def _stats(self, a, b): + nA, nB = self._na, self._nb + d = nB - nA + pA, pB = _norm_pdf(a), _norm_pdf(b) + mu = (pA - pB) / d # correction sign + mu2 = 1 + (a * pA - b * pB) / d - mu * mu + return mu, mu2, None, None +truncnorm = truncnorm_gen(name='truncnorm') + + +# FIXME: RVS does not work. +class tukeylambda_gen(rv_continuous): + + """A Tukey-Lamdba continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + A flexible distribution, able to represent and interpolate between the + following distributions: + + - Cauchy (lam=-1) + - logistic (lam=0.0) + - approx Normal (lam=0.14) + - u-shape (lam = 0.5) + - uniform from -1 to 1 (lam = 1) + + %(example)s + + """ + + def _argcheck(self, lam): + return np.ones(np.shape(lam), dtype=bool) + + def _pdf(self, x, lam): + Fx = asarray(special.tklmbda(x, lam)) + Px = Fx ** (lam - 1.0) + (asarray(1 - Fx)) ** (lam - 1.0) + Px = 1.0 / asarray(Px) + return where((lam <= 0) | (abs(x) < 1.0 / asarray(lam)), Px, 0.0) + + def _cdf(self, x, lam): + return special.tklmbda(x, lam) + + def _ppf(self, q, lam): + q = q * 1.0 + vals1 = (q ** lam - (1 - q) ** lam) / lam + vals2 = log(q / (1 - q)) + return where((lam == 0) & (q == q), vals2, vals1) + + def _stats(self, lam): + return 0, _tlvar(lam), 0, _tlkurt(lam) + + def _entropy(self, lam): + def integ(p): + return log(pow(p, lam - 1) + pow(1 - p, lam - 1)) + return integrate.quad(integ, 0, 1)[0] +tukeylambda = tukeylambda_gen(name='tukeylambda') + + +class uniform_gen(rv_continuous): + + """A uniform continuous random variable. + + This distribution is constant between `loc` and ``loc + scale``. + + %(before_notes)s + + %(example)s + + """ + + def _rvs(self): + return mtrand.uniform(0.0, 1.0, self._size) + + def _pdf(self, x): + return 1.0 * (x == x) + + def _cdf(self, x): + return x + + def _ppf(self, q): + return q + + def _stats(self): + return 0.5, 1.0 / 12, 0, -1.2 + + def _entropy(self): + return 0.0 +uniform = uniform_gen(a=0.0, b=1.0, name='uniform') + + +class vonmises_gen(rv_continuous): + + """A Von Mises continuous random variable. + + %(before_notes)s + + Notes + ----- + If `x` is not in range or `loc` is not in range it assumes they are angles + and converts them to [-pi, pi] equivalents. + + The probability density function for `vonmises` is:: + + vonmises.pdf(x, kappa) = exp(kappa * cos(x)) / (2*pi*I[0](kappa)) + + for ``-pi <= x <= pi``, ``kappa > 0``. + + See Also + -------- + vonmises_line : The same distribution, defined on a [-pi, pi] segment + of the real line. + + %(example)s + + """ + + def _rvs(self, kappa): + return mtrand.vonmises(0.0, kappa, size=self._size) + + def _pdf(self, x, kappa): + return exp(kappa * cos(x)) / (2 * pi * special.i0(kappa)) + + def _cdf(self, x, kappa): + return vonmises_cython.von_mises_cdf(kappa, x) + + def _stats_skip(self, kappa): + return 0, None, 0, None +vonmises = vonmises_gen(name='vonmises') +vonmises_line = vonmises_gen(a=-np.pi, b=np.pi, name='vonmises_line') + + +class wald_gen(invgauss_gen): + + """A Wald continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `wald` is:: + + wald.pdf(x, a) = 1/sqrt(2*pi*x**3) * exp(-(x-1)**2/(2*x)) + + for ``x > 0``. + + `wald` is a special case of `invgauss` with ``mu == 1``. + + %(example)s + """ + + def _rvs(self): + return mtrand.wald(1.0, 1.0, size=self._size) + + def _pdf(self, x): + return invgauss._pdf(x, 1.0) + + def _logpdf(self, x): + return invgauss._logpdf(x, 1.0) + + def _cdf(self, x): + return invgauss._cdf(x, 1.0) + + def _stats(self): + return 1.0, 1.0, 3.0, 15.0 +wald = wald_gen(a=0.0, name="wald") + + +class wrapcauchy_gen(rv_continuous): + + """A wrapped Cauchy continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `wrapcauchy` is:: + + wrapcauchy.pdf(x, c) = (1-c**2) / (2*pi*(1+c**2-2*c*cos(x))) + + for ``0 <= x <= 2*pi``, ``0 < c < 1``. 
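# Sketch: the wrapped-Cauchy density above is normalised on [0, 2*pi] for
# 0 < c < 1; checked here with a direct numerical integral (numpy only).
import numpy as np

c = 0.4
x = np.linspace(0.0, 2.0 * np.pi, 20001)
pdf = (1.0 - c ** 2) / (2.0 * np.pi * (1.0 + c ** 2 - 2.0 * c * np.cos(x)))
print(np.trapz(pdf, x))  # expected: ~1.0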
+ + %(example)s + + """ + + def _argcheck(self, c): + return (c > 0) & (c < 1) + + def _pdf(self, x, c): + return (1.0 - c * c) / (2 * pi * (1 + c * c - 2 * c * cos(x))) + + def _cdf(self, x, c): + output = 0.0 * x + val = (1.0 + c) / (1.0 - c) + c1 = x < pi + c2 = 1 - c1 + xp = extract(c1, x) + xn = extract(c2, x) + if (any(xn)): + valn = extract(c2, np.ones_like(x) * val) + xn = 2 * pi - xn + yn = tan(xn / 2.0) + on = 1.0 - 1.0 / pi * arctan(valn * yn) + place(output, c2, on) + if (any(xp)): + valp = extract(c1, np.ones_like(x) * val) + yp = tan(xp / 2.0) + op = 1.0 / pi * arctan(valp * yp) + place(output, c1, op) + return output + + def _ppf(self, q, c): + val = (1.0 - c) / (1.0 + c) + rcq = 2 * arctan(val * tan(pi * q)) + rcmq = 2 * pi - 2 * arctan(val * tan(pi * (1 - q))) + return where(q < 1.0 / 2, rcq, rcmq) + + def _entropy(self, c): + return log(2 * pi * (1 - c * c)) +wrapcauchy = wrapcauchy_gen(a=0.0, b=2 * pi, name='wrapcauchy') diff --git a/pywafo/src/wafo/stats/_discrete_distns.py b/pywafo/src/wafo/stats/_discrete_distns.py index 2e922f7..07ed14a 100644 --- a/pywafo/src/wafo/stats/_discrete_distns.py +++ b/pywafo/src/wafo/stats/_discrete_distns.py @@ -1,762 +1,790 @@ -# -# Author: Travis Oliphant 2002-2011 with contributions from -# SciPy Developers 2004-2011 -# -from __future__ import division, print_function, absolute_import - -from scipy import special -from scipy.special import gammaln as gamln - -from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh - -import numpy as np -import numpy.random as mtrand - -from ._distn_infrastructure import ( - rv_discrete, _lazywhere, _ncx2_pdf, _ncx2_cdf) - -__all__ = [ - 'binom', 'bernoulli', 'nbinom', 'geom', 'hypergeom', - 'logser', 'poisson', 'planck', 'boltzmann', 'randint', - 'zipf', 'dlaplace', 'skellam' - ] - - -class binom_gen(rv_discrete): - """A binomial discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `binom` is:: - - binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k) - - for ``k`` in ``{0, 1,..., n}``. - - `binom` takes ``n`` and ``p`` as shape parameters. - - %(example)s - - """ - def _rvs(self, n, p): - return mtrand.binomial(n, p, self._size) - - def _argcheck(self, n, p): - self.b = n - return (n >= 0) & (p >= 0) & (p <= 1) - - def _logpmf(self, x, n, p): - k = floor(x) - combiln = (gamln(n+1) - (gamln(k+1) + gamln(n-k+1))) - return combiln + special.xlogy(k, p) + special.xlog1py(n-k, -p) - - def _pmf(self, x, n, p): - return exp(self._logpmf(x, n, p)) - - def _cdf(self, x, n, p): - k = floor(x) - vals = special.bdtr(k, n, p) - return vals - - def _sf(self, x, n, p): - k = floor(x) - return special.bdtrc(k, n, p) - - def _ppf(self, q, n, p): - vals = ceil(special.bdtrik(q, n, p)) - vals1 = vals-1 - temp = special.bdtr(vals1, n, p) - return np.where(temp >= q, vals1, vals) - - def _stats(self, n, p): - q = 1.0-p - mu = n * p - var = n * p * q - g1 = (q-p) / sqrt(n*p*q) - g2 = (1.0-6*p*q)/(n*p*q) - return mu, var, g1, g2 - - def _entropy(self, n, p): - k = np.r_[0:n + 1] - vals = self._pmf(k, n, p) - h = -np.sum(special.xlogy(vals, vals), axis=0) - return h -binom = binom_gen(name='binom') - - -class bernoulli_gen(binom_gen): - """A Bernoulli discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `bernoulli` is:: - - bernoulli.pmf(k) = 1-p if k = 0 - = p if k = 1 - - for ``k`` in ``{0, 1}``. - - `bernoulli` takes ``p`` as shape parameter. 
- - %(example)s - - """ - def _rvs(self, p): - return binom_gen._rvs(self, 1, p) - - def _argcheck(self, p): - return (p >= 0) & (p <= 1) - - def _logpmf(self, x, p): - return binom._logpmf(x, 1, p) - - def _pmf(self, x, p): - return binom._pmf(x, 1, p) - - def _cdf(self, x, p): - return binom._cdf(x, 1, p) - - def _sf(self, x, p): - return binom._sf(x, 1, p) - - def _ppf(self, q, p): - return binom._ppf(q, 1, p) - - def _stats(self, p): - return binom._stats(1, p) - - def _entropy(self, p): - h = -special.xlogy(p, p) - special.xlogy(1 - p, 1 - p) - return h -bernoulli = bernoulli_gen(b=1, name='bernoulli') - - -class nbinom_gen(rv_discrete): - """A negative binomial discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `nbinom` is:: - - nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k - - for ``k >= 0``. - - `nbinom` takes ``n`` and ``p`` as shape parameters. - - %(example)s - - """ - def _rvs(self, n, p): - return mtrand.negative_binomial(n, p, self._size) - - def _argcheck(self, n, p): - return (n >= 0) & (p >= 0) & (p <= 1) - - def _pmf(self, x, n, p): - return exp(self._logpmf(x, n, p)) - - def _logpmf(self, x, n, p): - coeff = gamln(n+x) - gamln(x+1) - gamln(n) - return coeff + n*log(p) + x*log1p(-p) - - def _cdf(self, x, n, p): - k = floor(x) - return special.betainc(n, k+1, p) - - def _sf_skip(self, x, n, p): - # skip because special.nbdtrc doesn't work for 0= q, vals1, vals) - - def _stats(self, n, p): - Q = 1.0 / p - P = Q - 1.0 - mu = n*P - var = n*P*Q - g1 = (Q+P)/sqrt(n*P*Q) - g2 = (1.0 + 6*P*Q) / (n*P*Q) - return mu, var, g1, g2 -nbinom = nbinom_gen(name='nbinom') - - -class geom_gen(rv_discrete): - """A geometric discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `geom` is:: - - geom.pmf(k) = (1-p)**(k-1)*p - - for ``k >= 1``. - - `geom` takes ``p`` as shape parameter. - - %(example)s - - """ - def _rvs(self, p): - return mtrand.geometric(p, size=self._size) - - def _argcheck(self, p): - return (p <= 1) & (p >= 0) - - def _pmf(self, k, p): - return np.power(1-p, k-1) * p - - def _logpmf(self, k, p): - return (k-1)*log1p(-p) + log(p) - - def _cdf(self, x, p): - k = floor(x) - return -expm1(log1p(-p)*k) - - def _sf(self, x, p): - return np.exp(self._logsf(x, p)) - - def _logsf(self, x, p): - k = floor(x) - return k*log1p(-p) - - def _ppf(self, q, p): - vals = ceil(log1p(-q)/log1p(-p)) - temp = self._cdf(vals-1, p) - return np.where((temp >= q) & (vals > 0), vals-1, vals) - - def _stats(self, p): - mu = 1.0/p - qr = 1.0-p - var = qr / p / p - g1 = (2.0-p) / sqrt(qr) - g2 = np.polyval([1, -6, 6], p)/(1.0-p) - return mu, var, g1, g2 -geom = geom_gen(a=1, name='geom', longname="A geometric") - - -class hypergeom_gen(rv_discrete): - """A hypergeometric discrete random variable. - - The hypergeometric distribution models drawing objects from a bin. - M is the total number of objects, n is total number of Type I objects. - The random variate represents the number of Type I objects in N drawn - without replacement from the total population. - - %(before_notes)s - - Notes - ----- - The probability mass function is defined as:: - - pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N), - for max(0, N - (M-n)) <= k <= min(n, N) - - Examples - -------- - >>> from scipy.stats import hypergeom - - Suppose we have a collection of 20 animals, of which 7 are dogs. 
Then if - we want to know the probability of finding a given number of dogs if we - choose at random 12 of the 20 animals, we can initialize a frozen - distribution and plot the probability mass function: - - >>> [M, n, N] = [20, 7, 12] - >>> rv = hypergeom(M, n, N) - >>> x = np.arange(0, n+1) - >>> pmf_dogs = rv.pmf(x) - - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> ax.plot(x, pmf_dogs, 'bo') - >>> ax.vlines(x, 0, pmf_dogs, lw=2) - >>> ax.set_xlabel('# of dogs in our group of chosen animals') - >>> ax.set_ylabel('hypergeom PMF') - >>> plt.show() - - Instead of using a frozen distribution we can also use `hypergeom` - methods directly. To for example obtain the cumulative distribution - function, use: - - >>> prb = hypergeom.cdf(x, M, n, N) - - And to generate random numbers: - - >>> R = hypergeom.rvs(M, n, N, size=10) - - """ - def _rvs(self, M, n, N): - return mtrand.hypergeometric(n, M-n, N, size=self._size) - - def _argcheck(self, M, n, N): - cond = rv_discrete._argcheck(self, M, n, N) - cond &= (n <= M) & (N <= M) - self.a = max(N-(M-n), 0) - self.b = min(n, N) - return cond - - def _logpmf(self, k, M, n, N): - tot, good = M, n - bad = tot - good - return gamln(good+1) - gamln(good-k+1) - gamln(k+1) + gamln(bad+1) \ - - gamln(bad-N+k+1) - gamln(N-k+1) - gamln(tot+1) + gamln(tot-N+1) \ - + gamln(N+1) - - def _pmf(self, k, M, n, N): - # same as the following but numerically more precise - # return comb(good, k) * comb(bad, N-k) / comb(tot, N) - return exp(self._logpmf(k, M, n, N)) - - def _stats(self, M, n, N): - # tot, good, sample_size = M, n, N - # "wikipedia".replace('N', 'M').replace('n', 'N').replace('K', 'n') - M, n, N = 1.*M, 1.*n, 1.*N - m = M - n - p = n/M - mu = N*p - - var = m*n*N*(M - N)*1.0/(M*M*(M-1)) - g1 = (m - n)*(M-2*N) / (M-2.0) * sqrt((M-1.0) / (m*n*N*(M-N))) - - g2 = M*(M+1) - 6.*N*(M-N) - 6.*n*m - g2 *= (M-1)*M*M - g2 += 6.*n*N*(M-N)*m*(5.*M-6) - g2 /= n * N * (M-N) * m * (M-2.) * (M-3.) - return mu, var, g1, g2 - - def _entropy(self, M, n, N): - k = np.r_[N - (M - n):min(n, N) + 1] - vals = self.pmf(k, M, n, N) - h = -np.sum(special.xlogy(vals, vals), axis=0) - return h - - def _sf(self, k, M, n, N): - """More precise calculation, 1 - cdf doesn't cut it.""" - # This for loop is needed because `k` can be an array. If that's the - # case, the sf() method makes M, n and N arrays of the same shape. We - # therefore unpack all inputs args, so we can do the manual - # integration. - res = [] - for quant, tot, good, draw in zip(k, M, n, N): - # Manual integration over probability mass function. More accurate - # than integrate.quad. - k2 = np.arange(quant + 1, draw + 1) - res.append(np.sum(self._pmf(k2, tot, good, draw))) - return np.asarray(res) -hypergeom = hypergeom_gen(name='hypergeom') - - -# FIXME: Fails _cdfvec -class logser_gen(rv_discrete): - """A Logarithmic (Log-Series, Series) discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `logser` is:: - - logser.pmf(k) = - p**k / (k*log(1-p)) - - for ``k >= 1``. - - `logser` takes ``p`` as shape parameter. 
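# Sketch: the log-series pmf above sums to one because
# sum_{k>=1} p**k / k equals -log(1-p); a quick partial-sum check (numpy only).
import numpy as np

p = 0.6
k = np.arange(1, 200)
pmf = -np.power(p, k) / (k * np.log1p(-p))
print(pmf.sum())  # expected: ~1.0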
- - %(example)s - - """ - def _rvs(self, p): - # looks wrong for p>0.5, too few k=1 - # trying to use generic is worse, no k=1 at all - return mtrand.logseries(p, size=self._size) - - def _argcheck(self, p): - return (p > 0) & (p < 1) - - def _pmf(self, k, p): - return -np.power(p, k) * 1.0 / k / log1p(- p) - - def _stats(self, p): - r = log1p(-p) - mu = p / (p - 1.0) / r - mu2p = -p / r / (p - 1.0)**2 - var = mu2p - mu*mu - mu3p = -p / r * (1.0+p) / (1.0 - p)**3 - mu3 = mu3p - 3*mu*mu2p + 2*mu**3 - g1 = mu3 / np.power(var, 1.5) - - mu4p = -p / r * ( - 1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4) - mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4 - g2 = mu4 / var**2 - 3.0 - return mu, var, g1, g2 -logser = logser_gen(a=1, name='logser', longname='A logarithmic') - - -class poisson_gen(rv_discrete): - """A Poisson discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `poisson` is:: - - poisson.pmf(k) = exp(-mu) * mu**k / k! - - for ``k >= 0``. - - `poisson` takes ``mu`` as shape parameter. - - %(example)s - - """ - def _rvs(self, mu): - return mtrand.poisson(mu, self._size) - - def _logpmf(self, k, mu): - Pk = k*log(mu)-gamln(k+1) - mu - return Pk - - def _pmf(self, k, mu): - return exp(self._logpmf(k, mu)) - - def _cdf(self, x, mu): - k = floor(x) - return special.pdtr(k, mu) - - def _sf(self, x, mu): - k = floor(x) - return special.pdtrc(k, mu) - - def _ppf(self, q, mu): - vals = ceil(special.pdtrik(q, mu)) - vals1 = vals - 1 - temp = special.pdtr(vals1, mu) - return np.where((temp >= q), vals1, vals) - - def _stats(self, mu): - var = mu - tmp = np.asarray(mu) - g1 = sqrt(1.0 / tmp) - g2 = 1.0 / tmp - return mu, var, g1, g2 -poisson = poisson_gen(name="poisson", longname='A Poisson') - - -class planck_gen(rv_discrete): - """A Planck discrete exponential random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `planck` is:: - - planck.pmf(k) = (1-exp(-lambda_))*exp(-lambda_*k) - - for ``k*lambda_ >= 0``. - - `planck` takes ``lambda_`` as shape parameter. - - %(example)s - - """ - def _argcheck(self, lambda_): - if (lambda_ > 0): - self.a = 0 - self.b = np.inf - return 1 - elif (lambda_ < 0): - self.a = -np.inf - self.b = 0 - return 1 - else: - return 0 - - def _pmf(self, k, lambda_): - fact = -expm1(-lambda_) - return fact * exp(-lambda_ * k) - - def _cdf(self, x, lambda_): - k = floor(x) - return - expm1(-lambda_ * (k + 1)) - - def _ppf(self, q, lambda_): - vals = ceil(-1.0/lambda_ * log1p(-q)-1) - vals1 = (vals-1).clip(self.a, np.inf) - temp = self._cdf(vals1, lambda_) - return np.where(temp >= q, vals1, vals) - - def _stats(self, lambda_): - mu = 1/(exp(lambda_)-1) - var = exp(-lambda_)/(expm1(-lambda_))**2 - g1 = 2*cosh(lambda_/2.0) - g2 = 4+2*cosh(lambda_) - return mu, var, g1, g2 - - def _entropy(self, lambda_): - l = lambda_ - C = -expm1(-l) - return l * exp(-l) / C - log(C) -planck = planck_gen(name='planck', longname='A discrete exponential ') - - -class boltzmann_gen(rv_discrete): - """A Boltzmann (Truncated Discrete Exponential) random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `boltzmann` is:: - - boltzmann.pmf(k) = (1-exp(-lambda_)*exp(-lambda_*k)/(1-exp(-lambda_*N)) - - for ``k = 0,..., N-1``. - - `boltzmann` takes ``lambda_`` and ``N`` as shape parameters. 
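# Sketch: the truncated discrete exponential pmf above is a normalised
# geometric series over k = 0, ..., N-1; a direct check with numpy.
import numpy as np

lambda_, N = 0.7, 10
k = np.arange(N)
pmf = (1 - np.exp(-lambda_)) * np.exp(-lambda_ * k) / (1 - np.exp(-lambda_ * N))
print(pmf.sum())  # expected: 1.0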
- - %(example)s - - """ - def _pmf(self, k, lambda_, N): - fact = (expm1(-lambda_))/(expm1(-lambda_*N)) - return fact*exp(-lambda_*k) - - def _cdf(self, x, lambda_, N): - k = floor(x) - return (expm1(-lambda_*(k+1)))/(expm1(-lambda_*N)) - - def _ppf(self, q, lambda_, N): - qnew = -q*(expm1(-lambda_*N)) - vals = ceil(-1.0/lambda_ * log1p(-qnew)-1) - vals1 = (vals-1).clip(0.0, np.inf) - temp = self._cdf(vals1, lambda_, N) - return np.where(temp >= q, vals1, vals) - - def _stats(self, lambda_, N): - z = exp(-lambda_) - zN = exp(-lambda_*N) - mu = z/(1.0-z)-N*zN/(1-zN) - var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2 - trm = (1-zN)/(1-z) - trm2 = (z*trm**2 - N*N*zN) - g1 = z*(1+z)*trm**3 - N**3*zN*(1+zN) - g1 = g1 / trm2**(1.5) - g2 = z*(1+4*z+z*z)*trm**4 - N**4 * zN*(1+4*zN+zN*zN) - g2 = g2 / trm2 / trm2 - return mu, var, g1, g2 -boltzmann = boltzmann_gen(name='boltzmann', - longname='A truncated discrete exponential ') - - -class randint_gen(rv_discrete): - """A uniform discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `randint` is:: - - randint.pmf(k) = 1./(high - low) - - for ``k = low, ..., high - 1``. - - `randint` takes ``low`` and ``high`` as shape parameters. - - Note the difference to the numpy ``random_integers`` which - returns integers on a *closed* interval ``[low, high]``. - - %(example)s - - """ - def _argcheck(self, low, high): - self.a = low - self.b = high - 1 - return (high > low) - - def _pmf(self, k, low, high): - p = np.ones_like(k) / (high - low) - return np.where((k >= low) & (k < high), p, 0.) - - def _cdf(self, x, low, high): - k = floor(x) - return (k - low + 1.) / (high - low) - - def _ppf(self, q, low, high): - vals = ceil(q * (high - low) + low) - 1 - vals1 = (vals - 1).clip(low, high) - temp = self._cdf(vals1, low, high) - return np.where(temp >= q, vals1, vals) - - def _stats(self, low, high): - m2, m1 = np.asarray(high), np.asarray(low) - mu = (m2 + m1 - 1.0) / 2 - d = m2 - m1 - var = (d*d - 1) / 12.0 - g1 = 0.0 - g2 = -6.0/5.0 * (d*d + 1.0) / (d*d - 1.0) - return mu, var, g1, g2 - - def _rvs(self, low, high=None): - """An array of *size* random integers >= ``low`` and < ``high``. - - If ``high`` is ``None``, then range is >=0 and < low - """ - return mtrand.randint(low, high, self._size) - - def _entropy(self, low, high): - return log(high - low) -randint = randint_gen(name='randint', longname='A discrete uniform ' - '(random integer)') - - -# FIXME: problems sampling. -class zipf_gen(rv_discrete): - """A Zipf discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `zipf` is:: - - zipf.pmf(k, a) = 1/(zeta(a) * k**a) - - for ``k >= 1``. - - `zipf` takes ``a`` as shape parameter. - - %(example)s - - """ - def _rvs(self, a): - return mtrand.zipf(a, size=self._size) - - def _argcheck(self, a): - return a > 1 - - def _pmf(self, k, a): - Pk = 1.0 / special.zeta(a, 1) / k**a - return Pk - - def _munp(self, n, a): - return _lazywhere( - a > n + 1, (a, n), - lambda a, n: special.zeta(a - n, 1) / special.zeta(a, 1), - np.inf) -zipf = zipf_gen(a=1, name='zipf', longname='A Zipf') - - -class dlaplace_gen(rv_discrete): - """A Laplacian discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `dlaplace` is:: - - dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k)) - - for ``a > 0``. - - `dlaplace` takes ``a`` as shape parameter. 
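# Sketch: the discrete-Laplacian pmf above sums to one over all integers k,
# since tanh(a/2) * (1 + exp(-a)) / (1 - exp(-a)) == 1; partial-sum check
# using only numpy.
import numpy as np

a = 0.8
k = np.arange(-100, 101)
pmf = np.tanh(a / 2.0) * np.exp(-a * np.abs(k))
print(pmf.sum())  # expected: ~1.0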
- - %(example)s - - """ - def _pmf(self, k, a): - return tanh(a/2.0) * exp(-a * abs(k)) - - def _cdf(self, x, a): - k = floor(x) - f = lambda k, a: 1.0 - exp(-a * k) / (exp(a) + 1) - f2 = lambda k, a: exp(a * (k+1)) / (exp(a) + 1) - return _lazywhere(k >= 0, (k, a), f=f, f2=f2) - - def _ppf(self, q, a): - const = 1 + exp(a) - vals = ceil(np.where(q < 1.0 / (1 + exp(-a)), log(q*const) / a - 1, - -log((1-q) * const) / a)) - vals1 = vals - 1 - return np.where(self._cdf(vals1, a) >= q, vals1, vals) - - def _stats(self, a): - ea = exp(a) - mu2 = 2.*ea/(ea-1.)**2 - mu4 = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4 - return 0., mu2, 0., mu4/mu2**2 - 3. - - def _entropy(self, a): - return a / sinh(a) - log(tanh(a/2.0)) -dlaplace = dlaplace_gen(a=-np.inf, - name='dlaplace', longname='A discrete Laplacian') - - -class skellam_gen(rv_discrete): - """A Skellam discrete random variable. - - %(before_notes)s - - Notes - ----- - Probability distribution of the difference of two correlated or - uncorrelated Poisson random variables. - - Let k1 and k2 be two Poisson-distributed r.v. with expected values - lam1 and lam2. Then, ``k1 - k2`` follows a Skellam distribution with - parameters ``mu1 = lam1 - rho*sqrt(lam1*lam2)`` and - ``mu2 = lam2 - rho*sqrt(lam1*lam2)``, where rho is the correlation - coefficient between k1 and k2. If the two Poisson-distributed r.v. - are independent then ``rho = 0``. - - Parameters mu1 and mu2 must be strictly positive. - - For details see: http://en.wikipedia.org/wiki/Skellam_distribution - - `skellam` takes ``mu1`` and ``mu2`` as shape parameters. - - %(example)s - - """ - def _rvs(self, mu1, mu2): - n = self._size - return mtrand.poisson(mu1, n) - mtrand.poisson(mu2, n) - - def _pmf(self, x, mu1, mu2): - px = np.where(x < 0, - _ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2, - _ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2) - # ncx2.pdf() returns nan's for extremely low probabilities - return px - - def _cdf(self, x, mu1, mu2): - x = floor(x) - px = np.where(x < 0, - _ncx2_cdf(2*mu2, -2*x, 2*mu1), - 1-_ncx2_cdf(2*mu1, 2*(x+1), 2*mu2)) - return px - - def _stats(self, mu1, mu2): - mean = mu1 - mu2 - var = mu1 + mu2 - g1 = mean / sqrt((var)**3) - g2 = 1 / var - return mean, var, g1, g2 -skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam') +# +# Author: Travis Oliphant 2002-2011 with contributions from +# SciPy Developers 2004-2011 +# +from __future__ import division, print_function, absolute_import + +from scipy import special +from scipy.special import gammaln as gamln + +from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh + +import numpy as np +import numpy.random as mtrand + +from ._distn_infrastructure import ( + rv_discrete, _lazywhere, _ncx2_pdf, _ncx2_cdf) + +__all__ = [ + 'binom', 'bernoulli', 'nbinom', 'geom', 'hypergeom', + 'logser', 'poisson', 'planck', 'boltzmann', 'randint', + 'zipf', 'dlaplace', 'skellam' +] + + +class binom_gen(rv_discrete): + + """A binomial discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `binom` is:: + + binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k) + + for ``k`` in ``{0, 1,..., n}``. + + `binom` takes ``n`` and ``p`` as shape parameters. 
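# A minimal check of the binomial pmf formula above, using the same
# gammaln-based binomial coefficient as the _logpmf code below (assumes
# `binom` is importable from wafo.stats with the scipy.stats interface).
import numpy as np
from scipy.special import gammaln
from wafo.stats import binom

n, p = 10, 0.3
k = np.arange(n + 1)
choose_nk = np.exp(gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1))
direct = choose_nk * p ** k * (1 - p) ** (n - k)
print(np.allclose(binom.pmf(k, n, p), direct))  # expected: True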
+ + %(example)s + + """ + + def _rvs(self, n, p): + return mtrand.binomial(n, p, self._size) + + def _argcheck(self, n, p): + self.b = n + return (n >= 0) & (p >= 0) & (p <= 1) + + def _logpmf(self, x, n, p): + k = floor(x) + combiln = (gamln(n + 1) - (gamln(k + 1) + gamln(n - k + 1))) + return combiln + special.xlogy(k, p) + special.xlog1py(n - k, -p) + + def _pmf(self, x, n, p): + return exp(self._logpmf(x, n, p)) + + def _cdf(self, x, n, p): + k = floor(x) + vals = special.bdtr(k, n, p) + return vals + + def _sf(self, x, n, p): + k = floor(x) + return special.bdtrc(k, n, p) + + def _ppf(self, q, n, p): + vals = ceil(special.bdtrik(q, n, p)) + vals1 = vals - 1 + temp = special.bdtr(vals1, n, p) + return np.where(temp >= q, vals1, vals) + + def _stats(self, n, p): + q = 1.0 - p + mu = n * p + var = n * p * q + g1 = (q - p) / sqrt(n * p * q) + g2 = (1.0 - 6 * p * q) / (n * p * q) + return mu, var, g1, g2 + + def _entropy(self, n, p): + k = np.r_[0:n + 1] + vals = self._pmf(k, n, p) + h = -np.sum(special.xlogy(vals, vals), axis=0) + return h +binom = binom_gen(name='binom') + + +class bernoulli_gen(binom_gen): + + """A Bernoulli discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `bernoulli` is:: + + bernoulli.pmf(k) = 1-p if k = 0 + = p if k = 1 + + for ``k`` in ``{0, 1}``. + + `bernoulli` takes ``p`` as shape parameter. + + %(example)s + + """ + + def _rvs(self, p): + return binom_gen._rvs(self, 1, p) + + def _argcheck(self, p): + return (p >= 0) & (p <= 1) + + def _logpmf(self, x, p): + return binom._logpmf(x, 1, p) + + def _pmf(self, x, p): + return binom._pmf(x, 1, p) + + def _cdf(self, x, p): + return binom._cdf(x, 1, p) + + def _sf(self, x, p): + return binom._sf(x, 1, p) + + def _ppf(self, q, p): + return binom._ppf(q, 1, p) + + def _stats(self, p): + return binom._stats(1, p) + + def _entropy(self, p): + h = -special.xlogy(p, p) - special.xlogy(1 - p, 1 - p) + return h +bernoulli = bernoulli_gen(b=1, name='bernoulli') + + +class nbinom_gen(rv_discrete): + + """A negative binomial discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `nbinom` is:: + + nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k + + for ``k >= 0``. + + `nbinom` takes ``n`` and ``p`` as shape parameters. + + %(example)s + + """ + + def _rvs(self, n, p): + return mtrand.negative_binomial(n, p, self._size) + + def _argcheck(self, n, p): + return (n >= 0) & (p >= 0) & (p <= 1) + + def _pmf(self, x, n, p): + return exp(self._logpmf(x, n, p)) + + def _logpmf(self, x, n, p): + coeff = gamln(n + x) - gamln(x + 1) - gamln(n) + return coeff + n * log(p) + x * log1p(-p) + + def _cdf(self, x, n, p): + k = floor(x) + return special.betainc(n, k + 1, p) + + def _sf_skip(self, x, n, p): + # skip because special.nbdtrc doesn't work for 0= q, vals1, vals) + + def _stats(self, n, p): + Q = 1.0 / p + P = Q - 1.0 + mu = n * P + var = n * P * Q + g1 = (Q + P) / sqrt(n * P * Q) + g2 = (1.0 + 6 * P * Q) / (n * P * Q) + return mu, var, g1, g2 +nbinom = nbinom_gen(name='nbinom') + + +class geom_gen(rv_discrete): + + """A geometric discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `geom` is:: + + geom.pmf(k) = (1-p)**(k-1)*p + + for ``k >= 1``. + + `geom` takes ``p`` as shape parameter. 
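A quick numeric check of the pmf and the corresponding cdf, sketched with the identically named ``scipy.stats.geom`` (the values are illustrative):

>>> from scipy.stats import geom
>>> print(round(geom.pmf(3, 0.25), 6))   # (1 - 0.25)**2 * 0.25
0.140625
>>> print(round(geom.cdf(3, 0.25), 6))   # 1 - (1 - 0.25)**3
0.578125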
+ + %(example)s + + """ + + def _rvs(self, p): + return mtrand.geometric(p, size=self._size) + + def _argcheck(self, p): + return (p <= 1) & (p >= 0) + + def _pmf(self, k, p): + return np.power(1 - p, k - 1) * p + + def _logpmf(self, k, p): + return (k - 1) * log1p(-p) + log(p) + + def _cdf(self, x, p): + k = floor(x) + return -expm1(log1p(-p) * k) + + def _sf(self, x, p): + return np.exp(self._logsf(x, p)) + + def _logsf(self, x, p): + k = floor(x) + return k * log1p(-p) + + def _ppf(self, q, p): + vals = ceil(log1p(-q) / log1p(-p)) + temp = self._cdf(vals - 1, p) + return np.where((temp >= q) & (vals > 0), vals - 1, vals) + + def _stats(self, p): + mu = 1.0 / p + qr = 1.0 - p + var = qr / p / p + g1 = (2.0 - p) / sqrt(qr) + g2 = np.polyval([1, -6, 6], p) / (1.0 - p) + return mu, var, g1, g2 +geom = geom_gen(a=1, name='geom', longname="A geometric") + + +class hypergeom_gen(rv_discrete): + + """A hypergeometric discrete random variable. + + The hypergeometric distribution models drawing objects from a bin. + M is the total number of objects, n is total number of Type I objects. + The random variate represents the number of Type I objects in N drawn + without replacement from the total population. + + %(before_notes)s + + Notes + ----- + The probability mass function is defined as:: + + pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N), + for max(0, N - (M-n)) <= k <= min(n, N) + + Examples + -------- + >>> from scipy.stats import hypergeom + + Suppose we have a collection of 20 animals, of which 7 are dogs. Then if + we want to know the probability of finding a given number of dogs if we + choose at random 12 of the 20 animals, we can initialize a frozen + distribution and plot the probability mass function: + + >>> [M, n, N] = [20, 7, 12] + >>> rv = hypergeom(M, n, N) + >>> x = np.arange(0, n+1) + >>> pmf_dogs = rv.pmf(x) + + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> ax.plot(x, pmf_dogs, 'bo') + >>> ax.vlines(x, 0, pmf_dogs, lw=2) + >>> ax.set_xlabel('# of dogs in our group of chosen animals') + >>> ax.set_ylabel('hypergeom PMF') + >>> plt.show() + + Instead of using a frozen distribution we can also use `hypergeom` + methods directly. To for example obtain the cumulative distribution + function, use: + + >>> prb = hypergeom.cdf(x, M, n, N) + + And to generate random numbers: + + >>> R = hypergeom.rvs(M, n, N, size=10) + + """ + + def _rvs(self, M, n, N): + return mtrand.hypergeometric(n, M - n, N, size=self._size) + + def _argcheck(self, M, n, N): + cond = rv_discrete._argcheck(self, M, n, N) + cond &= (n <= M) & (N <= M) + self.a = max(N - (M - n), 0) + self.b = min(n, N) + return cond + + def _logpmf(self, k, M, n, N): + tot, good = M, n + bad = tot - good + return gamln(good + 1) - gamln(good - k + 1) - gamln(k + 1) + \ + gamln(bad + 1) - gamln(bad - N + k + 1) - gamln(N - k + 1) - \ + gamln(tot + 1) + gamln(tot - N + 1) + gamln(N + 1) + + def _pmf(self, k, M, n, N): + # same as the following but numerically more precise + # return comb(good, k) * comb(bad, N-k) / comb(tot, N) + return exp(self._logpmf(k, M, n, N)) + + def _stats(self, M, n, N): + # tot, good, sample_size = M, n, N + # "wikipedia".replace('N', 'M').replace('n', 'N').replace('K', 'n') + M, n, N = 1. * M, 1. * n, 1. * N + m = M - n + p = n / M + mu = N * p + + var = m * n * N * (M - N) * 1.0 / (M * M * (M - 1)) + g1 = (m - n) * (M - 2 * N) / (M - 2.0) * \ + sqrt((M - 1.0) / (m * n * N * (M - N))) + + g2 = M * (M + 1) - 6. * N * (M - N) - 6. 
* n * m + g2 *= (M - 1) * M * M + g2 += 6. * n * N * (M - N) * m * (5. * M - 6) + g2 /= n * N * (M - N) * m * (M - 2.) * (M - 3.) + return mu, var, g1, g2 + + def _entropy(self, M, n, N): + k = np.r_[N - (M - n):min(n, N) + 1] + vals = self.pmf(k, M, n, N) + h = -np.sum(special.xlogy(vals, vals), axis=0) + return h + + def _sf(self, k, M, n, N): + """More precise calculation, 1 - cdf doesn't cut it.""" + # This for loop is needed because `k` can be an array. If that's the + # case, the sf() method makes M, n and N arrays of the same shape. We + # therefore unpack all inputs args, so we can do the manual + # integration. + res = [] + for quant, tot, good, draw in zip(k, M, n, N): + # Manual integration over probability mass function. More accurate + # than integrate.quad. + k2 = np.arange(quant + 1, draw + 1) + res.append(np.sum(self._pmf(k2, tot, good, draw))) + return np.asarray(res) +hypergeom = hypergeom_gen(name='hypergeom') + + +# FIXME: Fails _cdfvec +class logser_gen(rv_discrete): + + """A Logarithmic (Log-Series, Series) discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `logser` is:: + + logser.pmf(k) = - p**k / (k*log(1-p)) + + for ``k >= 1``. + + `logser` takes ``p`` as shape parameter. + + %(example)s + + """ + + def _rvs(self, p): + # looks wrong for p>0.5, too few k=1 + # trying to use generic is worse, no k=1 at all + return mtrand.logseries(p, size=self._size) + + def _argcheck(self, p): + return (p > 0) & (p < 1) + + def _pmf(self, k, p): + return -np.power(p, k) * 1.0 / k / log1p(- p) + + def _stats(self, p): + r = log1p(-p) + mu = p / (p - 1.0) / r + mu2p = -p / r / (p - 1.0) ** 2 + var = mu2p - mu * mu + mu3p = -p / r * (1.0 + p) / (1.0 - p) ** 3 + mu3 = mu3p - 3 * mu * mu2p + 2 * mu ** 3 + g1 = mu3 / np.power(var, 1.5) + + mu4p = -p / r * ( + 1.0 / (p - 1) ** 2 - 6 * p / (p - 1) ** 3 + 6 * p * p / (p - 1) ** 4) + mu4 = mu4p - 4 * mu3p * mu + 6 * mu2p * mu * mu - 3 * mu ** 4 + g2 = mu4 / var ** 2 - 3.0 + return mu, var, g1, g2 +logser = logser_gen(a=1, name='logser', longname='A logarithmic') + + +class poisson_gen(rv_discrete): + + """A Poisson discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `poisson` is:: + + poisson.pmf(k) = exp(-mu) * mu**k / k! + + for ``k >= 0``. + + `poisson` takes ``mu`` as shape parameter. + + %(example)s + + """ + + def _rvs(self, mu): + return mtrand.poisson(mu, self._size) + + def _logpmf(self, k, mu): + Pk = k * log(mu) - gamln(k + 1) - mu + return Pk + + def _pmf(self, k, mu): + return exp(self._logpmf(k, mu)) + + def _cdf(self, x, mu): + k = floor(x) + return special.pdtr(k, mu) + + def _sf(self, x, mu): + k = floor(x) + return special.pdtrc(k, mu) + + def _ppf(self, q, mu): + vals = ceil(special.pdtrik(q, mu)) + vals1 = vals - 1 + temp = special.pdtr(vals1, mu) + return np.where((temp >= q), vals1, vals) + + def _stats(self, mu): + var = mu + tmp = np.asarray(mu) + g1 = sqrt(1.0 / tmp) + g2 = 1.0 / tmp + return mu, var, g1, g2 +poisson = poisson_gen(name="poisson", longname='A Poisson') + + +class planck_gen(rv_discrete): + + """A Planck discrete exponential random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `planck` is:: + + planck.pmf(k) = (1-exp(-lambda_))*exp(-lambda_*k) + + for ``k*lambda_ >= 0``. + + `planck` takes ``lambda_`` as shape parameter. 
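For ``lambda_ > 0`` the support is ``k = 0, 1, 2, ...`` and the mean is ``1/(exp(lambda_) - 1)``; a minimal sketch with the identically named ``scipy.stats.planck`` (the shape value 0.5 is illustrative):

>>> from scipy.stats import planck
>>> print(round(planck.pmf(0, 0.5), 4))   # 1 - exp(-0.5)
0.3935
>>> print(round(planck.mean(0.5), 4))     # 1 / (exp(0.5) - 1)
1.5415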
+ + %(example)s + + """ + + def _argcheck(self, lambda_): + if (lambda_ > 0): + self.a = 0 + self.b = np.inf + return 1 + elif (lambda_ < 0): + self.a = -np.inf + self.b = 0 + return 1 + else: + return 0 + + def _pmf(self, k, lambda_): + fact = -expm1(-lambda_) + return fact * exp(-lambda_ * k) + + def _cdf(self, x, lambda_): + k = floor(x) + return - expm1(-lambda_ * (k + 1)) + + def _ppf(self, q, lambda_): + vals = ceil(-1.0 / lambda_ * log1p(-q) - 1) + vals1 = (vals - 1).clip(self.a, np.inf) + temp = self._cdf(vals1, lambda_) + return np.where(temp >= q, vals1, vals) + + def _stats(self, lambda_): + mu = 1 / (exp(lambda_) - 1) + var = exp(-lambda_) / (expm1(-lambda_)) ** 2 + g1 = 2 * cosh(lambda_ / 2.0) + g2 = 4 + 2 * cosh(lambda_) + return mu, var, g1, g2 + + def _entropy(self, lambda_): + l = lambda_ + C = -expm1(-l) + return l * exp(-l) / C - log(C) +planck = planck_gen(name='planck', longname='A discrete exponential ') + + +class boltzmann_gen(rv_discrete): + + """A Boltzmann (Truncated Discrete Exponential) random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `boltzmann` is:: + + boltzmann.pmf(k) = (1-exp(-lambda_)*exp(-lambda_*k)/(1-exp(-lambda_*N)) + + for ``k = 0,..., N-1``. + + `boltzmann` takes ``lambda_`` and ``N`` as shape parameters. + + %(example)s + + """ + + def _pmf(self, k, lambda_, N): + fact = (expm1(-lambda_)) / (expm1(-lambda_ * N)) + return fact * exp(-lambda_ * k) + + def _cdf(self, x, lambda_, N): + k = floor(x) + return (expm1(-lambda_ * (k + 1))) / (expm1(-lambda_ * N)) + + def _ppf(self, q, lambda_, N): + qnew = -q * (expm1(-lambda_ * N)) + vals = ceil(-1.0 / lambda_ * log1p(-qnew) - 1) + vals1 = (vals - 1).clip(0.0, np.inf) + temp = self._cdf(vals1, lambda_, N) + return np.where(temp >= q, vals1, vals) + + def _stats(self, lambda_, N): + z = exp(-lambda_) + zN = exp(-lambda_ * N) + mu = z / (1.0 - z) - N * zN / (1 - zN) + var = z / (1.0 - z) ** 2 - N * N * zN / (1 - zN) ** 2 + trm = (1 - zN) / (1 - z) + trm2 = (z * trm ** 2 - N * N * zN) + g1 = z * (1 + z) * trm ** 3 - N ** 3 * zN * (1 + zN) + g1 = g1 / trm2 ** (1.5) + g2 = z * (1 + 4 * z + z * z) * \ + trm ** 4 - N ** 4 * zN * (1 + 4 * zN + zN * zN) + g2 = g2 / trm2 / trm2 + return mu, var, g1, g2 +boltzmann = boltzmann_gen(name='boltzmann', + longname='A truncated discrete exponential ') + + +class randint_gen(rv_discrete): + + """A uniform discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `randint` is:: + + randint.pmf(k) = 1./(high - low) + + for ``k = low, ..., high - 1``. + + `randint` takes ``low`` and ``high`` as shape parameters. + + Note the difference to the numpy ``random_integers`` which + returns integers on a *closed* interval ``[low, high]``. + + %(example)s + + """ + + def _argcheck(self, low, high): + self.a = low + self.b = high - 1 + return (high > low) + + def _pmf(self, k, low, high): + p = np.ones_like(k) / (high - low) + return np.where((k >= low) & (k < high), p, 0.) + + def _cdf(self, x, low, high): + k = floor(x) + return (k - low + 1.) 
/ (high - low) + + def _ppf(self, q, low, high): + vals = ceil(q * (high - low) + low) - 1 + vals1 = (vals - 1).clip(low, high) + temp = self._cdf(vals1, low, high) + return np.where(temp >= q, vals1, vals) + + def _stats(self, low, high): + m2, m1 = np.asarray(high), np.asarray(low) + mu = (m2 + m1 - 1.0) / 2 + d = m2 - m1 + var = (d * d - 1) / 12.0 + g1 = 0.0 + g2 = -6.0 / 5.0 * (d * d + 1.0) / (d * d - 1.0) + return mu, var, g1, g2 + + def _rvs(self, low, high=None): + """An array of *size* random integers >= ``low`` and < ``high``. + + If ``high`` is ``None``, then range is >=0 and < low + """ + return mtrand.randint(low, high, self._size) + + def _entropy(self, low, high): + return log(high - low) +randint = randint_gen(name='randint', longname='A discrete uniform ' + '(random integer)') + + +# FIXME: problems sampling. +class zipf_gen(rv_discrete): + + """A Zipf discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `zipf` is:: + + zipf.pmf(k, a) = 1/(zeta(a) * k**a) + + for ``k >= 1``. + + `zipf` takes ``a`` as shape parameter. + + %(example)s + + """ + + def _rvs(self, a): + return mtrand.zipf(a, size=self._size) + + def _argcheck(self, a): + return a > 1 + + def _pmf(self, k, a): + Pk = 1.0 / special.zeta(a, 1) / k ** a + return Pk + + def _munp(self, n, a): + return _lazywhere( + a > n + 1, (a, n), + lambda a, n: special.zeta(a - n, 1) / special.zeta(a, 1), + np.inf) +zipf = zipf_gen(a=1, name='zipf', longname='A Zipf') + + +class dlaplace_gen(rv_discrete): + + """A Laplacian discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `dlaplace` is:: + + dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k)) + + for ``a > 0``. + + `dlaplace` takes ``a`` as shape parameter. + + %(example)s + + """ + + def _pmf(self, k, a): + return tanh(a / 2.0) * exp(-a * abs(k)) + + def _cdf(self, x, a): + k = floor(x) + f = lambda k, a: 1.0 - exp(-a * k) / (exp(a) + 1) + f2 = lambda k, a: exp(a * (k + 1)) / (exp(a) + 1) + return _lazywhere(k >= 0, (k, a), f=f, f2=f2) + + def _ppf(self, q, a): + const = 1 + exp(a) + vals = ceil(np.where(q < 1.0 / (1 + exp(-a)), log(q * const) / a - 1, + -log((1 - q) * const) / a)) + vals1 = vals - 1 + return np.where(self._cdf(vals1, a) >= q, vals1, vals) + + def _stats(self, a): + ea = exp(a) + mu2 = 2. * ea / (ea - 1.) ** 2 + mu4 = 2. * ea * (ea ** 2 + 10. * ea + 1.) / (ea - 1.) ** 4 + return 0., mu2, 0., mu4 / mu2 ** 2 - 3. + + def _entropy(self, a): + return a / sinh(a) - log(tanh(a / 2.0)) +dlaplace = dlaplace_gen(a=-np.inf, + name='dlaplace', longname='A discrete Laplacian') + + +class skellam_gen(rv_discrete): + + """A Skellam discrete random variable. + + %(before_notes)s + + Notes + ----- + Probability distribution of the difference of two correlated or + uncorrelated Poisson random variables. + + Let k1 and k2 be two Poisson-distributed r.v. with expected values + lam1 and lam2. Then, ``k1 - k2`` follows a Skellam distribution with + parameters ``mu1 = lam1 - rho*sqrt(lam1*lam2)`` and + ``mu2 = lam2 - rho*sqrt(lam1*lam2)``, where rho is the correlation + coefficient between k1 and k2. If the two Poisson-distributed r.v. + are independent then ``rho = 0``. + + Parameters mu1 and mu2 must be strictly positive. + + For details see: http://en.wikipedia.org/wiki/Skellam_distribution + + `skellam` takes ``mu1`` and ``mu2`` as shape parameters. 
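Since the mean is ``mu1 - mu2`` and the variance is ``mu1 + mu2``, a one-line sanity check with the identically named ``scipy.stats.skellam`` (the shape values are illustrative):

>>> from scipy.stats import skellam
>>> print(skellam.mean(3, 2), skellam.var(3, 2))   # mu1 - mu2, mu1 + mu2
1.0 5.0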
+ + %(example)s + + """ + + def _rvs(self, mu1, mu2): + n = self._size + return mtrand.poisson(mu1, n) - mtrand.poisson(mu2, n) + + def _pmf(self, x, mu1, mu2): + px = np.where(x < 0, + _ncx2_pdf(2 * mu2, 2 * (1 - x), 2 * mu1) * 2, + _ncx2_pdf(2 * mu1, 2 * (1 + x), 2 * mu2) * 2) + # ncx2.pdf() returns nan's for extremely low probabilities + return px + + def _cdf(self, x, mu1, mu2): + x = floor(x) + px = np.where(x < 0, + _ncx2_cdf(2 * mu2, -2 * x, 2 * mu1), + 1 - _ncx2_cdf(2 * mu1, 2 * (x + 1), 2 * mu2)) + return px + + def _stats(self, mu1, mu2): + mean = mu1 - mu2 + var = mu1 + mu2 + g1 = mean / sqrt((var) ** 3) + g2 = 1 / var + return mean, var, g1, g2 +skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam') diff --git a/pywafo/src/wafo/stats/_distn_infrastructure.py b/pywafo/src/wafo/stats/_distn_infrastructure.py index aad95b4..f03709c 100644 --- a/pywafo/src/wafo/stats/_distn_infrastructure.py +++ b/pywafo/src/wafo/stats/_distn_infrastructure.py @@ -455,7 +455,7 @@ class rv_frozen(object): def __init__(self, dist, *args, **kwds): self.dist = dist args, loc, scale = dist._parse_args(*args, **kwds) - if len(args) == dist.numargs - 2: # isinstance(dist, rv_continuous): + if isinstance(dist, rv_continuous): self.par = args + (loc, scale) else: # rv_discrete self.par = args + (loc,) diff --git a/pywafo/src/wafo/stats/kde_test.py b/pywafo/src/wafo/stats/kde_test.py new file mode 100644 index 0000000..0e9139c --- /dev/null +++ b/pywafo/src/wafo/stats/kde_test.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Dec 06 16:02:47 2011 + +@author: pab +""" +import numpy as np +import wafo.kdetools as wk +n = 100 +x = np.sort(5*np.random.rand(1,n)-2.5, axis=-1).ravel() +y = (np.cos(x)>2*np.random.rand(n, 1)-1).ravel() + +kreg = wk.KRegression(x,y) +f = kreg(output='plotobj', title='Kernel regression', plotflag=1) +f.plot() \ No newline at end of file diff --git a/pywafo/src/wafo/stats/morestats.py b/pywafo/src/wafo/stats/morestats.py index 59c432f..aa63ea6 100644 --- a/pywafo/src/wafo/stats/morestats.py +++ b/pywafo/src/wafo/stats/morestats.py @@ -1,1943 +1,1959 @@ -# Author: Travis Oliphant, 2002 -# -# Further updates and enhancements by many SciPy developers. -# -from __future__ import division, print_function, absolute_import - -import math -import warnings - -import numpy as np -from numpy import (isscalar, r_, log, sum, around, unique, asarray, - zeros, arange, sort, amin, amax, any, atleast_1d, sqrt, ceil, - floor, array, poly1d, compress, not_equal, pi, exp, ravel, angle) -from numpy.testing.decorators import setastest - -from scipy.lib.six import string_types -from scipy import optimize -from scipy import special -from . import statlib -from . import stats -from .stats import find_repeats -from . import distributions -from ._distn_infrastructure import rv_generic - - -__all__ = ['mvsdist', - 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot', - 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot', - 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test', - 'fligner', 'mood', 'wilcoxon', - 'pdf_fromgamma', 'circmean', 'circvar', 'circstd', - ] - - -def bayes_mvs(data, alpha=0.90): - """ - Bayesian confidence intervals for the mean, var, and std. - - Parameters - ---------- - data : array_like - Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`. - Requires 2 or more data points. - alpha : float, optional - Probability that the returned confidence interval contains - the true parameter. 
- - Returns - ------- - mean_cntr, var_cntr, std_cntr : tuple - The three results are for the mean, variance and standard deviation, - respectively. Each result is a tuple of the form:: - - (center, (lower, upper)) - - with `center` the mean of the conditional pdf of the value given the - data, and `(lower, upper)` a confidence interval, centered on the - median, containing the estimate to a probability `alpha`. - - Notes - ----- - Each tuple of mean, variance, and standard deviation estimates represent - the (center, (lower, upper)) with center the mean of the conditional pdf - of the value given the data and (lower, upper) is a confidence interval - centered on the median, containing the estimate to a probability - `alpha`. - - Converts data to 1-D and assumes all data has the same mean and variance. - Uses Jeffrey's prior for variance and std. - - Equivalent to tuple((x.mean(), x.interval(alpha)) for x in mvsdist(dat)) - - References - ---------- - T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and - standard-deviation from data", http://hdl.handle.net/1877/438, 2006. - - """ - res = mvsdist(data) - if alpha >= 1 or alpha <= 0: - raise ValueError("0 < alpha < 1 is required, but alpha=%s was given." % alpha) - return tuple((x.mean(), x.interval(alpha)) for x in res) - - -def mvsdist(data): - """ - 'Frozen' distributions for mean, variance, and standard deviation of data. - - Parameters - ---------- - data : array_like - Input array. Converted to 1-D using ravel. - Requires 2 or more data-points. - - Returns - ------- - mdist : "frozen" distribution object - Distribution object representing the mean of the data - vdist : "frozen" distribution object - Distribution object representing the variance of the data - sdist : "frozen" distribution object - Distribution object representing the standard deviation of the data - - Notes - ----- - The return values from bayes_mvs(data) is equivalent to - ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``. - - In other words, calling ``.mean()`` and ``.interval(0.90)`` - on the three distribution objects returned from this function will give - the same results that are returned from `bayes_mvs`. - - Examples - -------- - >>> from scipy.stats import mvsdist - >>> data = [6, 9, 12, 7, 8, 8, 13] - >>> mean, var, std = mvsdist(data) - - We now have frozen distribution objects "mean", "var" and "std" that we can - examine: - - >>> mean.mean() - 9.0 - >>> mean.interval(0.95) - (6.6120585482655692, 11.387941451734431) - >>> mean.std() - 1.1952286093343936 - - """ - x = ravel(data) - n = len(x) - if (n < 2): - raise ValueError("Need at least 2 data-points.") - xbar = x.mean() - C = x.var() - if (n > 1000): # gaussian approximations for large n - mdist = distributions.norm(loc=xbar, scale=math.sqrt(C/n)) - sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C/(2.*n))) - vdist = distributions.norm(loc=C, scale=math.sqrt(2.0/n)*C) - else: - nm1 = n-1 - fac = n*C/2. - val = nm1/2. - mdist = distributions.t(nm1,loc=xbar,scale=math.sqrt(C/nm1)) - sdist = distributions.gengamma(val,-2,scale=math.sqrt(fac)) - vdist = distributions.invgamma(val,scale=fac) - return mdist, vdist, sdist - - -def kstat(data,n=2): - """ - Return the nth k-statistic (1<=n<=4 so far). - - The nth k-statistic is the unique symmetric unbiased estimator of the nth - cumulant kappa_n. - - Parameters - ---------- - data : array_like - Input array. - n : int, {1, 2, 3, 4}, optional - Default is equal to 2. 
- - Returns - ------- - kstat : float - The nth k-statistic. - - See Also - -------- - kstatvar: Returns an unbiased estimator of the variance of the k-statistic. - - Notes - ----- - The cumulants are related to central moments but are specifically defined - using a power series expansion of the logarithm of the characteristic - function (which is the Fourier transform of the PDF). - In particular let phi(t) be the characteristic function, then:: - - ln phi(t) = > kappa_n (it)^n / n! (sum from n=0 to inf) - - The first few cumulants (kappa_n) in terms of central moments (mu_n) are:: - - kappa_1 = mu_1 - kappa_2 = mu_2 - kappa_3 = mu_3 - kappa_4 = mu_4 - 3*mu_2**2 - kappa_5 = mu_5 - 10*mu_2 * mu_3 - - References - ---------- - http://mathworld.wolfram.com/k-Statistic.html - - http://mathworld.wolfram.com/Cumulant.html - - """ - if n > 4 or n < 1: - raise ValueError("k-statistics only supported for 1<=n<=4") - n = int(n) - S = zeros(n+1,'d') - data = ravel(data) - N = len(data) - for k in range(1,n+1): - S[k] = sum(data**k,axis=0) - if n == 1: - return S[1]*1.0/N - elif n == 2: - return (N*S[2]-S[1]**2.0)/(N*(N-1.0)) - elif n == 3: - return (2*S[1]**3 - 3*N*S[1]*S[2]+N*N*S[3]) / (N*(N-1.0)*(N-2.0)) - elif n == 4: - return (-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 - - 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) / \ - (N*(N-1.0)*(N-2.0)*(N-3.0)) - else: - raise ValueError("Should not be here.") - - -def kstatvar(data,n=2): - """ - Returns an unbiased estimator of the variance of the k-statistic. - - See `kstat` for more details of the k-statistic. - - Parameters - ---------- - data : array_like - Input array. - n : int, {1, 2}, optional - Default is equal to 2. - - Returns - ------- - kstatvar : float - The nth k-statistic variance. - - See Also - -------- - kstat - - """ - data = ravel(data) - N = len(data) - if n == 1: - return kstat(data,n=2)*1.0/N - elif n == 2: - k2 = kstat(data,n=2) - k4 = kstat(data,n=4) - return (2*k2*k2*N + (N-1)*k4)/(N*(N+1)) - else: - raise ValueError("Only n=1 or n=2 supported.") - - -def _calc_uniform_order_statistic_medians(x): - """See Notes section of `probplot` for details.""" - N = len(x) - osm_uniform = np.zeros(N, dtype=np.float64) - osm_uniform[-1] = 0.5**(1.0 / N) - osm_uniform[0] = 1 - osm_uniform[-1] - i = np.arange(2, N) - osm_uniform[1:-1] = (i - 0.3175) / (N + 0.365) - return osm_uniform - - -def _parse_dist_kw(dist, enforce_subclass=True): - """Parse `dist` keyword. - - Parameters - ---------- - dist : str or stats.distributions instance. - Several functions take `dist` as a keyword, hence this utility - function. - enforce_subclass : bool, optional - If True (default), `dist` needs to be a - `_distn_infrastructure.rv_generic` instance. - It can sometimes be useful to set this keyword to False, if a function - wants to accept objects that just look somewhat like such an instance - (for example, they have a ``ppf`` method). - - """ - if isinstance(dist, rv_generic): - pass - elif isinstance(dist, string_types): - try: - dist = getattr(distributions, dist) - except AttributeError: - raise ValueError("%s is not a valid distribution name" % dist) - elif enforce_subclass: - msg = ("`dist` should be a stats.distributions instance or a string " - "with the name of such a distribution.") - raise ValueError(msg) - - return dist - - -def probplot(x, sparams=(), dist='norm', fit=True, plot=None): - """ - Calculate quantiles for a probability plot, and optionally show the plot. 
- - Generates a probability plot of sample data against the quantiles of a - specified theoretical distribution (the normal distribution by default). - `probplot` optionally calculates a best-fit line for the data and plots the - results using Matplotlib or a given plot function. - - Parameters - ---------- - x : array_like - Sample/response data from which `probplot` creates the plot. - sparams : tuple, optional - Distribution-specific shape parameters (shape parameters plus location - and scale). - dist : str or stats.distributions instance, optional - Distribution or distribution function name. The default is 'norm' for a - normal probability plot. Objects that look enough like a - stats.distributions instance (i.e. they have a ``ppf`` method) are also - accepted. - fit : bool, optional - Fit a least-squares regression (best-fit) line to the sample data if - True (default). - plot : object, optional - If given, plots the quantiles and least squares fit. - `plot` is an object that has to have methods "plot" and "text". - The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, - or a custom object with the same methods. - Default is None, which means that no plot is created. - - Returns - ------- - (osm, osr) : tuple of ndarrays - Tuple of theoretical quantiles (osm, or order statistic medians) and - ordered responses (osr). `osr` is simply sorted input `x`. - For details on how `osm` is calculated see the Notes section. - (slope, intercept, r) : tuple of floats, optional - Tuple containing the result of the least-squares fit, if that is - performed by `probplot`. `r` is the square root of the coefficient of - determination. If ``fit=False`` and ``plot=None``, this tuple is not - returned. - - Notes - ----- - Even if `plot` is given, the figure is not shown or saved by `probplot`; - ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after - calling `probplot`. - - `probplot` generates a probability plot, which should not be confused with - a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this - type, see ``statsmodels.api.ProbPlot``. - - The formula used for the theoretical quantiles (horizontal axis of the - probability plot) is Filliben's estimate:: - - quantiles = dist.ppf(val), for - - 0.5**(1/n), for i = n - val = (i - 0.3175) / (n + 0.365), for i = 2, ..., n-1 - 1 - 0.5**(1/n), for i = 1 - - where ``i`` indicates the i-th ordered value and ``n`` is the total number - of values. - - Examples - -------- - >>> from scipy import stats - >>> import matplotlib.pyplot as plt - >>> nsample = 100 - >>> np.random.seed(7654321) - - A t distribution with small degrees of freedom: - - >>> ax1 = plt.subplot(221) - >>> x = stats.t.rvs(3, size=nsample) - >>> res = stats.probplot(x, plot=plt) - - A t distribution with larger degrees of freedom: - - >>> ax2 = plt.subplot(222) - >>> x = stats.t.rvs(25, size=nsample) - >>> res = stats.probplot(x, plot=plt) - - A mixture of two normal distributions with broadcasting: - - >>> ax3 = plt.subplot(223) - >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5], - ... 
size=(nsample/2.,2)).ravel() - >>> res = stats.probplot(x, plot=plt) - - A standard normal distribution: - - >>> ax4 = plt.subplot(224) - >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample) - >>> res = stats.probplot(x, plot=plt) - - Produce a new figure with a loggamma distribution, using the ``dist`` and - ``sparams`` keywords: - - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> x = stats.loggamma.rvs(c=2.5, size=500) - >>> stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax) - >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5") - - Show the results with Matplotlib: - - >>> plt.show() - - """ - x = np.asarray(x) - osm_uniform = _calc_uniform_order_statistic_medians(x) - dist = _parse_dist_kw(dist, enforce_subclass=False) - if sparams is None: - sparams = () - if isscalar(sparams): - sparams = (sparams,) - if not isinstance(sparams, tuple): - sparams = tuple(sparams) - - osm = dist.ppf(osm_uniform, *sparams) - osr = sort(x) - if fit or (plot is not None): - # perform a linear fit. - slope, intercept, r, prob, sterrest = stats.linregress(osm, osr) - - if plot is not None: - plot.plot(osm, osr, 'bo', osm, slope*osm + intercept, 'r-') - try: - if hasattr(plot, 'set_title'): - # Matplotlib Axes instance or something that looks like it - plot.set_title('Probability Plot') - plot.set_xlabel('Quantiles') - plot.set_ylabel('Ordered Values') - else: - # matplotlib.pyplot module - plot.title('Probability Plot') - plot.xlabel('Quantiles') - plot.ylabel('Ordered Values') - except: - # Not an MPL object or something that looks (enough) like it. - # Don't crash on adding labels or title - pass - - # Add R^2 value to the plot as text - xmin = amin(osm) - xmax = amax(osm) - ymin = amin(x) - ymax = amax(x) - posx = xmin + 0.70 * (xmax - xmin) - posy = ymin + 0.01 * (ymax - ymin) - plot.text(posx, posy, "$R^2=%1.4f$" % r) - - if fit: - return (osm, osr), (slope, intercept, r) - else: - return osm, osr - - -def ppcc_max(x, brack=(0.0,1.0), dist='tukeylambda'): - """Returns the shape parameter that maximizes the probability plot - correlation coefficient for the given data to a one-parameter - family of distributions. - - See also ppcc_plot - """ - dist = _parse_dist_kw(dist) - osm_uniform = _calc_uniform_order_statistic_medians(x) - osr = sort(x) - - # this function computes the x-axis values of the probability plot - # and computes a linear regression (including the correlation) - # and returns 1-r so that a minimization function maximizes the - # correlation - def tempfunc(shape, mi, yvals, func): - xvals = func(mi, shape) - r, prob = stats.pearsonr(xvals, yvals) - return 1-r - - return optimize.brent(tempfunc, brack=brack, args=(osm_uniform, osr, dist.ppf)) - - -def ppcc_plot(x,a,b,dist='tukeylambda', plot=None, N=80): - """Returns (shape, ppcc), and optionally plots shape vs. ppcc - (probability plot correlation coefficient) as a function of shape - parameter for a one-parameter family of distributions from shape - value a to b. - - See also ppcc_max - """ - svals = r_[a:b:complex(N)] - ppcc = svals*0.0 - k = 0 - for sval in svals: - r1,r2 = probplot(x,sval,dist=dist,fit=1) - ppcc[k] = r2[-1] - k += 1 - if plot is not None: - plot.plot(svals, ppcc, 'x') - plot.title('(%s) PPCC Plot' % dist) - plot.xlabel('Prob Plot Corr. Coef.') - plot.ylabel('Shape Values') - return svals, ppcc - - -def boxcox_llf(lmb, data): - r"""The boxcox log-likelihood function. - - Parameters - ---------- - lmb : scalar - Parameter for Box-Cox transformation. See `boxcox` for details. 
- data : array_like - Data to calculate Box-Cox log-likelihood for. If `data` is - multi-dimensional, the log-likelihood is calculated along the first - axis. - - Returns - ------- - llf : float or ndarray - Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`, - an array otherwise. - - See Also - -------- - boxcox, probplot, boxcox_normplot, boxcox_normmax - - Notes - ----- - The Box-Cox log-likelihood function is defined here as - - .. math:: - - llf = (\lambda - 1) \sum_i(\log(x_i)) - - N/2 \log(\sum_i (y_i - \bar{y})^2 / N), - - where ``y`` is the Box-Cox transformed input data ``x``. - - Examples - -------- - >>> from scipy import stats - >>> import matplotlib.pyplot as plt - >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes - >>> np.random.seed(1245) - - Generate some random variates and calculate Box-Cox log-likelihood values - for them for a range of ``lmbda`` values: - - >>> x = stats.loggamma.rvs(5, loc=10, size=1000) - >>> lmbdas = np.linspace(-2, 10) - >>> llf = np.zeros(lmbdas.shape, dtype=np.float) - >>> for ii, lmbda in enumerate(lmbdas): - ... llf[ii] = stats.boxcox_llf(lmbda, x) - - Also find the optimal lmbda value with `boxcox`: - - >>> x_most_normal, lmbda_optimal = stats.boxcox(x) - - Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a - horizontal line to check that that's really the optimum: - - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> ax.plot(lmbdas, llf, 'b.-') - >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r') - >>> ax.set_xlabel('lmbda parameter') - >>> ax.set_ylabel('Box-Cox log-likelihood') - - Now add some probability plots to show that where the log-likelihood is - maximized the data transformed with `boxcox` looks closest to normal: - - >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right' - >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs): - ... xt = stats.boxcox(x, lmbda=lmbda) - ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt) - ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc) - ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-') - ... ax_inset.set_xticklabels([]) - ... ax_inset.set_yticklabels([]) - ... ax_inset.set_title('$\lambda=%1.2f$' % lmbda) - - >>> plt.show() - - """ - data = np.asarray(data) - N = data.shape[0] - if N == 0: - return np.nan - - y = boxcox(data, lmb) - y_mean = np.mean(y, axis=0) - llf = (lmb - 1) * np.sum(np.log(data), axis=0) - llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. 
/ N, axis=0)) - return llf - - -def _boxcox_conf_interval(x, lmax, alpha): - # Need to find the lambda for which - # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1 - fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1) - target = boxcox_llf(lmax, x) - fac - - def rootfunc(lmbda, data, target): - return boxcox_llf(lmbda, data) - target - - # Find positive endpoint of interval in which answer is to be found - newlm = lmax + 0.5 - N = 0 - while (rootfunc(newlm, x, target) > 0.0) and (N < 500): - newlm += 0.1 - N += 1 - - if N == 500: - raise RuntimeError("Could not find endpoint.") - - lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target)) - - # Now find negative interval in the same way - newlm = lmax - 0.5 - N = 0 - while (rootfunc(newlm, x, target) > 0.0) and (N < 500): - newlm -= 0.1 - N += 1 - - if N == 500: - raise RuntimeError("Could not find endpoint.") - - lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target)) - return lmminus, lmplus - - -def boxcox(x, lmbda=None, alpha=None): - r""" - Return a positive dataset transformed by a Box-Cox power transformation. - - Parameters - ---------- - x : ndarray - Input array. Should be 1-dimensional. - lmbda : {None, scalar}, optional - If `lmbda` is not None, do the transformation for that value. - - If `lmbda` is None, find the lambda that maximizes the log-likelihood - function and return it as the second output argument. - alpha : {None, float}, optional - If `alpha` is not None, return the ``100 * (1-alpha)%`` confidence - interval for `lmbda` as the third output argument. - Must be between 0.0 and 1.0. - - Returns - ------- - boxcox : ndarray - Box-Cox power transformed array. - maxlog : float, optional - If the `lmbda` parameter is None, the second returned argument is - the lambda that maximizes the log-likelihood function. - (min_ci, max_ci) : tuple of float, optional - If `lmbda` parameter is None and `alpha` is not None, this returned - tuple of floats represents the minimum and maximum confidence limits - given `alpha`. - - See Also - -------- - probplot, boxcox_normplot, boxcox_normmax, boxcox_llf - - Notes - ----- - The Box-Cox transform is given by:: - - y = (x**lmbda - 1) / lmbda, for lmbda > 0 - log(x), for lmbda = 0 - - `boxcox` requires the input data to be positive. Sometimes a Box-Cox - transformation provides a shift parameter to achieve this; `boxcox` does - not. Such a shift parameter is equivalent to adding a positive constant to - `x` before calling `boxcox`. - - The confidence limits returned when `alpha` is provided give the interval - where: - - .. math:: - - llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1), - - with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared - function. - - References - ---------- - G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the - Royal Statistical Society B, 26, 211-252 (1964). 
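A minimal numeric check of the transform itself, sketched with the identically named ``scipy.stats.boxcox`` (the inputs are illustrative): with ``lmbda=2`` each value maps to ``(x**2 - 1)/2``.

>>> from scipy.stats import boxcox
>>> y = boxcox([1.0, 3.0], lmbda=2.0)
>>> print(round(float(y[0]), 6), round(float(y[1]), 6))
0.0 4.0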
- - Examples - -------- - >>> from scipy import stats - >>> import matplotlib.pyplot as plt - - We generate some random variates from a non-normal distribution and make a - probability plot for it, to show it is non-normal in the tails: - - >>> fig = plt.figure() - >>> ax1 = fig.add_subplot(211) - >>> x = stats.loggamma.rvs(5, size=500) + 5 - >>> stats.probplot(x, dist=stats.norm, plot=ax1) - >>> ax1.set_xlabel('') - >>> ax1.set_title('Probplot against normal distribution') - - We now use `boxcox` to transform the data so it's closest to normal: - - >>> ax2 = fig.add_subplot(212) - >>> xt, _ = stats.boxcox(x) - >>> stats.probplot(xt, dist=stats.norm, plot=ax2) - >>> ax2.set_title('Probplot after Box-Cox transformation') - - >>> plt.show() - - """ - x = np.asarray(x) - if x.size == 0: - return x - - if any(x <= 0): - raise ValueError("Data must be positive.") - - if lmbda is not None: # single transformation - return special.boxcox(x, lmbda) - - # If lmbda=None, find the lmbda that maximizes the log-likelihood function. - lmax = boxcox_normmax(x, method='mle') - y = boxcox(x, lmax) - - if alpha is None: - return y, lmax - else: - # Find confidence interval - interval = _boxcox_conf_interval(x, lmax, alpha) - return y, lmax, interval - - -def boxcox_normmax(x, brack=(-2.0, 2.0), method='pearsonr'): - """Compute optimal Box-Cox transform parameter for input data. - - Parameters - ---------- - x : array_like - Input array. - brack : 2-tuple, optional - The starting interval for a downhill bracket search with - `optimize.brent`. Note that this is in most cases not critical; the - final result is allowed to be outside this bracket. - method : str, optional - The method to determine the optimal transform parameter (`boxcox` - ``lmbda`` parameter). Options are: - - 'pearsonr' (default) - Maximizes the Pearson correlation coefficient between - ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be - normally-distributed. - - 'mle' - Minimizes the log-likelihood `boxcox_llf`. This is the method used - in `boxcox`. - - 'all' - Use all optimization methods available, and return all results. - Useful to compare different methods. - - Returns - ------- - maxlog : float or ndarray - The optimal transform parameter found. An array instead of a scalar - for ``method='all'``. - - See Also - -------- - boxcox, boxcox_llf, boxcox_normplot - - Examples - -------- - >>> from scipy import stats - >>> import matplotlib.pyplot as plt - >>> np.random.seed(1234) # make this example reproducible - - Generate some data and determine optimal ``lmbda`` in various ways: - - >>> x = stats.loggamma.rvs(5, size=30) + 5 - >>> y, lmax_mle = stats.boxcox(x) - >>> lmax_pearsonr = stats.boxcox_normmax(x) - - >>> lmax_mle - 7.177... - >>> lmax_pearsonr - 7.916... - >>> stats.boxcox_normmax(x, method='all') - array([ 7.91667384, 7.17718692]) - - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> stats.boxcox_normplot(x, -10, 10, plot=ax) - >>> ax.axvline(lmax_mle, color='r') - >>> ax.axvline(lmax_pearsonr, color='g', ls='--') - - >>> plt.show() - - """ - def _pearsonr(x, brack): - osm_uniform = _calc_uniform_order_statistic_medians(x) - xvals = distributions.norm.ppf(osm_uniform) - - def _eval_pearsonr(lmbda, xvals, samps): - # This function computes the x-axis values of the probability plot - # and computes a linear regression (including the correlation) and - # returns ``1 - r`` so that a minimization function maximizes the - # correlation. 
- y = boxcox(samps, lmbda) - yvals = np.sort(y) - r, prob = stats.pearsonr(xvals, yvals) - return 1 - r - - return optimize.brent(_eval_pearsonr, brack=brack, args=(xvals, x)) - - def _mle(x, brack): - def _eval_mle(lmb, data): - # function to minimize - return -boxcox_llf(lmb, data) - - return optimize.brent(_eval_mle, brack=brack, args=(x,)) - - def _all(x, brack): - maxlog = np.zeros(2, dtype=np.float) - maxlog[0] = _pearsonr(x, brack) - maxlog[1] = _mle(x, brack) - return maxlog - - methods = {'pearsonr': _pearsonr, - 'mle': _mle, - 'all': _all} - if not method in methods.keys(): - raise ValueError("Method %s not recognized." % method) - - optimfunc = methods[method] - return optimfunc(x, brack) - - -def boxcox_normplot(x, la, lb, plot=None, N=80): - """Compute parameters for a Box-Cox normality plot, optionally show it. - - A Box-Cox normality plot shows graphically what the best transformation - parameter is to use in `boxcox` to obtain a distribution that is close - to normal. - - Parameters - ---------- - x : array_like - Input array. - la, lb : scalar - The lower and upper bounds for the ``lmbda`` values to pass to `boxcox` - for Box-Cox transformations. These are also the limits of the - horizontal axis of the plot if that is generated. - plot : object, optional - If given, plots the quantiles and least squares fit. - `plot` is an object that has to have methods "plot" and "text". - The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, - or a custom object with the same methods. - Default is None, which means that no plot is created. - N : int, optional - Number of points on the horizontal axis (equally distributed from - `la` to `lb`). - - Returns - ------- - lmbdas : ndarray - The ``lmbda`` values for which a Box-Cox transform was done. - ppcc : ndarray - Probability Plot Correlelation Coefficient, as obtained from `probplot` - when fitting the Box-Cox transformed input `x` against a normal - distribution. - - See Also - -------- - probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max - - Notes - ----- - Even if `plot` is given, the figure is not shown or saved by - `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')`` - should be used after calling `probplot`. - - Examples - -------- - >>> from scipy import stats - >>> import matplotlib.pyplot as plt - - Generate some non-normally distributed data, and create a Box-Cox plot: - - >>> x = stats.loggamma.rvs(5, size=500) + 5 - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> stats.boxcox_normplot(x, -20, 20, plot=ax) - - Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in - the same plot: - - >>> _, maxlog = stats.boxcox(x) - >>> ax.axvline(maxlog, color='r') - - >>> plt.show() - - """ - x = np.asarray(x) - if x.size == 0: - return x - - if lb <= la: - raise ValueError("`lb` has to be larger than `la`.") - - lmbdas = np.linspace(la, lb, num=N) - ppcc = lmbdas * 0.0 - for i, val in enumerate(lmbdas): - # Determine for each lmbda the correlation coefficient of transformed x - z = boxcox(x, lmbda=val) - _, r2 = probplot(z, dist='norm', fit=True) - ppcc[i] = r2[-1] - - if plot is not None: - plot.plot(lmbdas, ppcc, 'x') - try: - if hasattr(plot, 'set_title'): - # Matplotlib Axes instance or something that looks like it - plot.set_title('Box-Cox Normality Plot') - plot.set_ylabel('Prob Plot Corr. Coef.') - plot.set_xlabel('$\lambda$') - else: - # matplotlib.pyplot module - plot.title('Box-Cox Normality Plot') - plot.ylabel('Prob Plot Corr. 
Coef.') - plot.xlabel('$\lambda$') - except Exception: - # Not an MPL object or something that looks (enough) like it. - # Don't crash on adding labels or title - pass - - return lmbdas, ppcc - - -def shapiro(x, a=None, reta=False): - """ - Perform the Shapiro-Wilk test for normality. - - The Shapiro-Wilk test tests the null hypothesis that the - data was drawn from a normal distribution. - - Parameters - ---------- - x : array_like - Array of sample data. - a : array_like, optional - Array of internal parameters used in the calculation. If these - are not given, they will be computed internally. If x has length - n, then a must have length n/2. - reta : bool, optional - Whether or not to return the internally computed a values. The - default is False. - - Returns - ------- - W : float - The test statistic. - p-value : float - The p-value for the hypothesis test. - a : array_like, optional - If `reta` is True, then these are the internally computed "a" - values that may be passed into this function on future calls. - - See Also - -------- - anderson : The Anderson-Darling test for normality - - References - ---------- - .. [1] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm - - """ - N = len(x) - if N < 3: - raise ValueError("Data must be at least length 3.") - if a is None: - a = zeros(N,'f') - init = 0 - else: - if len(a) != N//2: - raise ValueError("len(a) must equal len(x)/2") - init = 1 - y = sort(x) - a, w, pw, ifault = statlib.swilk(y, a[:N//2], init) - if not ifault in [0,2]: - warnings.warn(str(ifault)) - if N > 5000: - warnings.warn("p-value may not be accurate for N > 5000.") - if reta: - return w, pw, a - else: - return w, pw - -# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and -# Some Comparisons", Journal of he American Statistical -# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737 -_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092]) -_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957]) -# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution", -# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588. -_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038]) -# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based -# on the Empirical Distribution Function.", Biometrika, -# Vol. 66, Issue 3, Dec. 1979, pp 591-595. -_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010]) - - -def anderson(x,dist='norm'): - """ - Anderson-Darling test for data coming from a particular distribution - - The Anderson-Darling test is a modification of the Kolmogorov- - Smirnov test kstest_ for the null hypothesis that a sample is - drawn from a population that follows a particular distribution. - For the Anderson-Darling test, the critical values depend on - which distribution is being tested against. This function works - for normal, exponential, logistic, or Gumbel (Extreme Value - Type I) distributions. - - Parameters - ---------- - x : array_like - array of sample data - dist : {'norm','expon','logistic','gumbel','extreme1'}, optional - the type of distribution to test against. The default is 'norm' - and 'extreme1' is a synonym for 'gumbel' - - Returns - ------- - A2 : float - The Anderson-Darling test statistic - critical : list - The critical values for this distribution - sig : list - The significance levels for the corresponding critical values - in percents. 
The function returns critical values for a - differing set of significance levels depending on the - distribution that is being tested against. - - Notes - ----- - Critical values provided are for the following significance levels: - - normal/exponenential - 15%, 10%, 5%, 2.5%, 1% - logistic - 25%, 10%, 5%, 2.5%, 1%, 0.5% - Gumbel - 25%, 10%, 5%, 2.5%, 1% - - If A2 is larger than these critical values then for the corresponding - significance level, the null hypothesis that the data come from the - chosen distribution can be rejected. - - References - ---------- - .. [1] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm - .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and - Some Comparisons, Journal of the American Statistical Association, - Vol. 69, pp. 730-737. - .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit - Statistics with Unknown Parameters, Annals of Statistics, Vol. 4, - pp. 357-369. - .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value - Distribution, Biometrika, Vol. 64, pp. 583-588. - .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference - to Tests for Exponentiality , Technical Report No. 262, - Department of Statistics, Stanford University, Stanford, CA. - .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution - Based on the Empirical Distribution Function, Biometrika, Vol. 66, - pp. 591-595. - - """ - if not dist in ['norm','expon','gumbel','extreme1','logistic']: - raise ValueError("Invalid distribution; dist must be 'norm', " - "'expon', 'gumbel', 'extreme1' or 'logistic'.") - y = sort(x) - xbar = np.mean(x, axis=0) - N = len(y) - if dist == 'norm': - s = np.std(x, ddof=1, axis=0) - w = (y-xbar)/s - z = distributions.norm.cdf(w) - sig = array([15,10,5,2.5,1]) - critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N),3) - elif dist == 'expon': - w = y / xbar - z = distributions.expon.cdf(w) - sig = array([15,10,5,2.5,1]) - critical = around(_Avals_expon / (1.0 + 0.6/N),3) - elif dist == 'logistic': - def rootfunc(ab,xj,N): - a,b = ab - tmp = (xj-a)/b - tmp2 = exp(tmp) - val = [sum(1.0/(1+tmp2),axis=0)-0.5*N, - sum(tmp*(1.0-tmp2)/(1+tmp2),axis=0)+N] - return array(val) - sol0 = array([xbar,np.std(x, ddof=1, axis=0)]) - sol = optimize.fsolve(rootfunc,sol0,args=(x,N),xtol=1e-5) - w = (y-sol[0])/sol[1] - z = distributions.logistic.cdf(w) - sig = array([25,10,5,2.5,1,0.5]) - critical = around(_Avals_logistic / (1.0+0.25/N),3) - else: # (dist == 'gumbel') or (dist == 'extreme1'): - # the following is incorrect, see ticket:1097 -## def fixedsolve(th,xj,N): -## val = stats.sum(xj)*1.0/N -## tmp = exp(-xj/th) -## term = sum(xj*tmp,axis=0) -## term /= sum(tmp,axis=0) -## return val - term -## s = optimize.fixed_point(fixedsolve, 1.0, args=(x,N),xtol=1e-5) -## xbar = -s*log(sum(exp(-x/s),axis=0)*1.0/N) - xbar, s = distributions.gumbel_l.fit(x) - w = (y-xbar)/s - z = distributions.gumbel_l.cdf(w) - sig = array([25,10,5,2.5,1]) - critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)),3) - - i = arange(1,N+1) - S = sum((2*i-1.0)/N*(log(z)+log(1-z[::-1])),axis=0) - A2 = -N-S - return A2, critical, sig - - -def ansari(x,y): - """ - Perform the Ansari-Bradley test for equal scale parameters - - The Ansari-Bradley test is a non-parametric test for the equality - of the scale parameter of the distributions from which two - samples were drawn. 
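A minimal call sketch with the identically named ``scipy.stats.ansari`` (the two samples are illustrative; no particular output is asserted):

>>> from scipy.stats import ansari
>>> x = [1.1, 2.2, 3.3, 4.4, 5.5, 6.6]
>>> y = [-3.0, 0.5, 4.0, 7.5, 11.0, 14.5]   # wider spread than x
>>> AB, p = ansari(x, y)    # a small p-value would suggest unequal scales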
- - Parameters - ---------- - x, y : array_like - arrays of sample data - - Returns - ------- - AB : float - The Ansari-Bradley test statistic - p-value : float - The p-value of the hypothesis test - - See Also - -------- - fligner : A non-parametric test for the equality of k variances - mood : A non-parametric test for the equality of two scale parameters - - Notes - ----- - The p-value given is exact when the sample sizes are both less than - 55 and there are no ties, otherwise a normal approximation for the - p-value is used. - - References - ---------- - .. [1] Sprent, Peter and N.C. Smeeton. Applied nonparametric statistical - methods. 3rd ed. Chapman and Hall/CRC. 2001. Section 5.8.2. - - """ - x,y = asarray(x),asarray(y) - n = len(x) - m = len(y) - if m < 1: - raise ValueError("Not enough other observations.") - if n < 1: - raise ValueError("Not enough test observations.") - N = m+n - xy = r_[x,y] # combine - rank = stats.rankdata(xy) - symrank = amin(array((rank,N-rank+1)),0) - AB = sum(symrank[:n],axis=0) - uxy = unique(xy) - repeats = (len(uxy) != len(xy)) - exact = ((m < 55) and (n < 55) and not repeats) - if repeats and ((m < 55) or (n < 55)): - warnings.warn("Ties preclude use of exact statistic.") - if exact: - astart, a1, ifault = statlib.gscale(n,m) - ind = AB-astart - total = sum(a1,axis=0) - if ind < len(a1)/2.0: - cind = int(ceil(ind)) - if (ind == cind): - pval = 2.0*sum(a1[:cind+1],axis=0)/total - else: - pval = 2.0*sum(a1[:cind],axis=0)/total - else: - find = int(floor(ind)) - if (ind == floor(ind)): - pval = 2.0*sum(a1[find:],axis=0)/total - else: - pval = 2.0*sum(a1[find+1:],axis=0)/total - return AB, min(1.0,pval) - - # otherwise compute normal approximation - if N % 2: # N odd - mnAB = n*(N+1.0)**2 / 4.0 / N - varAB = n*m*(N+1.0)*(3+N**2)/(48.0*N**2) - else: - mnAB = n*(N+2.0)/4.0 - varAB = m*n*(N+2)*(N-2.0)/48/(N-1.0) - if repeats: # adjust variance estimates - # compute sum(tj * rj**2,axis=0) - fac = sum(symrank**2,axis=0) - if N % 2: # N odd - varAB = m*n*(16*N*fac-(N+1)**4)/(16.0 * N**2 * (N-1)) - else: # N even - varAB = m*n*(16*fac-N*(N+2)**2)/(16.0 * N * (N-1)) - z = (AB - mnAB)/sqrt(varAB) - pval = distributions.norm.sf(abs(z)) * 2.0 - return AB, pval - - -def bartlett(*args): - """ - Perform Bartlett's test for equal variances - - Bartlett's test tests the null hypothesis that all input samples - are from populations with equal variances. For samples - from significantly non-normal populations, Levene's test - `levene`_ is more robust. - - Parameters - ---------- - sample1, sample2,... : array_like - arrays of sample data. May be different lengths. - - Returns - ------- - T : float - The test statistic. - p-value : float - The p-value of the test. - - References - ---------- - .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm - - .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical - Methods, Eighth Edition, Iowa State University Press. 
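A minimal call sketch with the identically named ``scipy.stats.bartlett`` (the samples are illustrative; no particular output is asserted):

>>> from scipy.stats import bartlett
>>> a = [9.88, 9.12, 9.04, 8.98, 9.00, 9.08]
>>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58]
>>> T, p = bartlett(a, b)   # a small p-value would suggest unequal variances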
- - """ - k = len(args) - if k < 2: - raise ValueError("Must enter at least two input sample vectors.") - Ni = zeros(k) - ssq = zeros(k,'d') - for j in range(k): - Ni[j] = len(args[j]) - ssq[j] = np.var(args[j], ddof=1) - Ntot = sum(Ni,axis=0) - spsq = sum((Ni-1)*ssq,axis=0)/(1.0*(Ntot-k)) - numer = (Ntot*1.0-k)*log(spsq) - sum((Ni-1.0)*log(ssq),axis=0) - denom = 1.0 + (1.0/(3*(k-1)))*((sum(1.0/(Ni-1.0),axis=0))-1.0/(Ntot-k)) - T = numer / denom - pval = distributions.chi2.sf(T,k-1) # 1 - cdf - return T, pval - - -def levene(*args,**kwds): - """ - Perform Levene test for equal variances. - - The Levene test tests the null hypothesis that all input samples - are from populations with equal variances. Levene's test is an - alternative to Bartlett's test `bartlett` in the case where - there are significant deviations from normality. - - Parameters - ---------- - sample1, sample2, ... : array_like - The sample data, possibly with different lengths - center : {'mean', 'median', 'trimmed'}, optional - Which function of the data to use in the test. The default - is 'median'. - proportiontocut : float, optional - When `center` is 'trimmed', this gives the proportion of data points - to cut from each end. (See `scipy.stats.trim_mean`.) - Default is 0.05. - - Returns - ------- - W : float - The test statistic. - p-value : float - The p-value for the test. - - Notes - ----- - Three variations of Levene's test are possible. The possibilities - and their recommended usages are: - - * 'median' : Recommended for skewed (non-normal) distributions> - * 'mean' : Recommended for symmetric, moderate-tailed distributions. - * 'trimmed' : Recommended for heavy-tailed distributions. - - References - ---------- - .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm - .. [2] Levene, H. (1960). In Contributions to Probability and Statistics: - Essays in Honor of Harold Hotelling, I. Olkin et al. eds., - Stanford University Press, pp. 278-292. - .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American - Statistical Association, 69, 364-367 - - """ - # Handle keyword arguments. - center = 'median' - proportiontocut = 0.05 - for kw, value in kwds.items(): - if kw not in ['center', 'proportiontocut']: - raise TypeError("levene() got an unexpected keyword argument '%s'" % kw) - if kw == 'center': - center = value - else: - proportiontocut = value - - k = len(args) - if k < 2: - raise ValueError("Must enter at least two input sample vectors.") - Ni = zeros(k) - Yci = zeros(k,'d') - - if not center in ['mean','median','trimmed']: - raise ValueError("Keyword argument
must be 'mean', 'median'" - + "or 'trimmed'.") - - if center == 'median': - func = lambda x: np.median(x, axis=0) - elif center == 'mean': - func = lambda x: np.mean(x, axis=0) - else: # center == 'trimmed' - args = tuple(stats.trimboth(np.sort(arg), proportiontocut) for arg in args) - func = lambda x: np.mean(x, axis=0) - - for j in range(k): - Ni[j] = len(args[j]) - Yci[j] = func(args[j]) - Ntot = sum(Ni,axis=0) - - # compute Zij's - Zij = [None]*k - for i in range(k): - Zij[i] = abs(asarray(args[i])-Yci[i]) - # compute Zbari - Zbari = zeros(k,'d') - Zbar = 0.0 - for i in range(k): - Zbari[i] = np.mean(Zij[i], axis=0) - Zbar += Zbari[i]*Ni[i] - Zbar /= Ntot - - numer = (Ntot-k)*sum(Ni*(Zbari-Zbar)**2,axis=0) - - # compute denom_variance - dvar = 0.0 - for i in range(k): - dvar += sum((Zij[i]-Zbari[i])**2,axis=0) - - denom = (k-1.0)*dvar - - W = numer / denom - pval = distributions.f.sf(W,k-1,Ntot-k) # 1 - cdf - return W, pval - - -@setastest(False) -def binom_test(x,n=None,p=0.5): - """ - Perform a test that the probability of success is p. - - This is an exact, two-sided test of the null hypothesis - that the probability of success in a Bernoulli experiment - is `p`. - - Parameters - ---------- - x : integer or array_like - the number of successes, or if x has length 2, it is the - number of successes and the number of failures. - n : integer - the number of trials. This is ignored if x gives both the - number of successes and failures - p : float, optional - The hypothesized probability of success. 0 <= p <= 1. The - default value is p = 0.5 - - Returns - ------- - p-value : float - The p-value of the hypothesis test - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Binomial_test - - """ - x = atleast_1d(x).astype(np.integer) - if len(x) == 2: - n = x[1]+x[0] - x = x[0] - elif len(x) == 1: - x = x[0] - if n is None or n < x: - raise ValueError("n must be >= x") - n = np.int_(n) - else: - raise ValueError("Incorrect length for x.") - - if (p > 1.0) or (p < 0.0): - raise ValueError("p must be in range [0,1]") - - d = distributions.binom.pmf(x,n,p) - rerr = 1+1e-7 - if (x == p*n): - # special case as shortcut, would also be handled by `else` below - pval = 1. - elif (x < p*n): - i = np.arange(np.ceil(p*n),n+1) - y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0) - pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y,n,p) - else: - i = np.arange(np.floor(p*n) + 1) - y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0) - pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf(x-1,n,p) - - return min(1.0,pval) - - -def _apply_func(x,g,func): - # g is list of indices into x - # separating x into different groups - # func should be applied over the groups - g = unique(r_[0,g,len(x)]) - output = [] - for k in range(len(g)-1): - output.append(func(x[g[k]:g[k+1]])) - return asarray(output) - - -def fligner(*args,**kwds): - """ - Perform Fligner's test for equal variances. - - Fligner's test tests the null hypothesis that all input samples - are from populations with equal variances. Fligner's test is - non-parametric in contrast to Bartlett's test `bartlett` and - Levene's test `levene`. - - Parameters - ---------- - sample1, sample2, ... : array_like - arrays of sample data. Need not be the same length - center : {'mean', 'median', 'trimmed'}, optional - keyword argument controlling which function of the data - is used in computing the test statistic. The default - is 'median'. 
- proportiontocut : float, optional - When `center` is 'trimmed', this gives the proportion of data points - to cut from each end. (See `scipy.stats.trim_mean`.) - Default is 0.05. - - Returns - ------- - Xsq : float - the test statistic - p-value : float - the p-value for the hypothesis test - - Notes - ----- - As with Levene's test there are three variants - of Fligner's test that differ by the measure of central - tendency used in the test. See `levene` for more information. - - References - ---------- - .. [1] http://www.stat.psu.edu/~bgl/center/tr/TR993.ps - - .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample - tests for scale. 'Journal of the American Statistical Association.' - 71(353), 210-213. - - """ - # Handle keyword arguments. - center = 'median' - proportiontocut = 0.05 - for kw, value in kwds.items(): - if kw not in ['center', 'proportiontocut']: - raise TypeError("fligner() got an unexpected keyword argument '%s'" % kw) - if kw == 'center': - center = value - else: - proportiontocut = value - - k = len(args) - if k < 2: - raise ValueError("Must enter at least two input sample vectors.") - - if not center in ['mean','median','trimmed']: - raise ValueError("Keyword argument
must be 'mean', 'median'" - + "or 'trimmed'.") - - if center == 'median': - func = lambda x: np.median(x, axis=0) - elif center == 'mean': - func = lambda x: np.mean(x, axis=0) - else: # center == 'trimmed' - args = tuple(stats.trimboth(arg, proportiontocut) for arg in args) - func = lambda x: np.mean(x, axis=0) - - Ni = asarray([len(args[j]) for j in range(k)]) - Yci = asarray([func(args[j]) for j in range(k)]) - Ntot = sum(Ni,axis=0) - # compute Zij's - Zij = [abs(asarray(args[i])-Yci[i]) for i in range(k)] - allZij = [] - g = [0] - for i in range(k): - allZij.extend(list(Zij[i])) - g.append(len(allZij)) - - ranks = stats.rankdata(allZij) - a = distributions.norm.ppf(ranks/(2*(Ntot+1.0)) + 0.5) - - # compute Aibar - Aibar = _apply_func(a,g,sum) / Ni - anbar = np.mean(a, axis=0) - varsq = np.var(a,axis=0, ddof=1) - Xsq = sum(Ni*(asarray(Aibar)-anbar)**2.0,axis=0)/varsq - pval = distributions.chi2.sf(Xsq,k-1) # 1 - cdf - return Xsq, pval - - -def mood(x, y, axis=0): - """ - Perform Mood's test for equal scale parameters. - - Mood's two-sample test for scale parameters is a non-parametric - test for the null hypothesis that two samples are drawn from the - same distribution with the same scale parameter. - - Parameters - ---------- - x, y : array_like - Arrays of sample data. - axis: int, optional - The axis along which the samples are tested. `x` and `y` can be of - different length along `axis`. - If `axis` is None, `x` and `y` are flattened and the test is done on - all values in the flattened arrays. - - Returns - ------- - z : scalar or ndarray - The z-score for the hypothesis test. For 1-D inputs a scalar is - returned; - p-value : scalar ndarray - The p-value for the hypothesis test. - - See Also - -------- - fligner : A non-parametric test for the equality of k variances - ansari : A non-parametric test for the equality of 2 variances - bartlett : A parametric test for equality of k variances in normal samples - levene : A parametric test for equality of k variances - - Notes - ----- - The data are assumed to be drawn from probability distributions ``f(x)`` - and ``f(x/s) / s`` respectively, for some probability density function f. - The null hypothesis is that ``s == 1``. - - For multi-dimensional arrays, if the inputs are of shapes - ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the - resulting z and p values will have shape ``(n0, n2, n3)``. Note that - ``n1`` and ``m1`` don't have to be equal, but the other dimensions do. 
- - Examples - -------- - >>> from scipy import stats - >>> x2 = np.random.randn(2, 45, 6, 7) - >>> x1 = np.random.randn(2, 30, 6, 7) - >>> z, p = stats.mood(x1, x2, axis=1) - >>> p.shape - (2, 6, 7) - - Find the number of points where the difference in scale is not significant: - - >>> (p > 0.1).sum() - 74 - - Perform the test with different scales: - - >>> x1 = np.random.randn(2, 30) - >>> x2 = np.random.randn(2, 35) * 10.0 - >>> stats.mood(x1, x2, axis=1) - (array([-5.84332354, -5.6840814 ]), array([5.11694980e-09, 1.31517628e-08])) - - """ - x = np.asarray(x, dtype=float) - y = np.asarray(y, dtype=float) - - if axis is None: - x = x.flatten() - y = y.flatten() - axis = 0 - - # Determine shape of the result arrays - res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis]) - if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if - ax != axis])): - raise ValueError("Dimensions of x and y on all axes except `axis` " - "should match") - - n = x.shape[axis] - m = y.shape[axis] - N = m + n - if N < 3: - raise ValueError("Not enough observations.") - - xy = np.concatenate((x, y), axis=axis) - if axis != 0: - xy = np.rollaxis(xy, axis) - - xy = xy.reshape(xy.shape[0], -1) - - # Generalized to the n-dimensional case by adding the axis argument, and - # using for loops, since rankdata is not vectorized. For improving - # performance consider vectorizing rankdata function. - all_ranks = np.zeros_like(xy) - for j in range(xy.shape[1]): - all_ranks[:, j] = stats.rankdata(xy[:, j]) - - Ri = all_ranks[:n] - M = sum((Ri - (N + 1.0) / 2) ** 2, axis=0) - # Approx stat. - mnM = n * (N * N - 1.0) / 12 - varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180 - z = (M - mnM) / sqrt(varM) - - # sf for right tail, cdf for left tail. Factor 2 for two-sidedness - z_pos = z > 0 - pval = np.zeros_like(z) - pval[z_pos] = 2 * distributions.norm.sf(z[z_pos]) - pval[~z_pos] = 2 * distributions.norm.cdf(z[~z_pos]) - - if res_shape == (): - # Return scalars, not 0-D arrays - z = z[0] - pval = pval[0] - else: - z.shape = res_shape - pval.shape = res_shape - - return z, pval - - -def wilcoxon(x, y=None, zero_method="wilcox", correction=False): - """ - Calculate the Wilcoxon signed-rank test. - - The Wilcoxon signed-rank test tests the null hypothesis that two - related paired samples come from the same distribution. In particular, - it tests whether the distribution of the differences x - y is symmetric - about zero. It is a non-parametric version of the paired T-test. - - Parameters - ---------- - x : array_like - The first set of measurements. - y : array_like, optional - The second set of measurements. If `y` is not given, then the `x` - array is considered to be the differences between the two sets of - measurements. - zero_method : string, {"pratt", "wilcox", "zsplit"}, optional - "pratt": - Pratt treatment: includes zero-differences in the ranking process - (more conservative) - "wilcox": - Wilcox treatment: discards all zero-differences - "zsplit": - Zero rank split: just like Pratt, but spliting the zero rank - between positive and negative ones - correction : bool, optional - If True, apply continuity correction by adjusting the Wilcoxon rank - statistic by 0.5 towards the mean value when computing the - z-statistic. Default is False. - - Returns - ------- - T : float - The sum of the ranks of the differences above or below zero, whichever - is smaller. - p-value : float - The two-sided p-value for the test. 
- - Notes - ----- - Because the normal approximation is used for the calculations, the - samples used should be large. A typical rule is to require that - n > 20. - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test - - """ - - if not zero_method in ["wilcox", "pratt", "zsplit"]: - raise ValueError("Zero method should be either 'wilcox' \ - or 'pratt' or 'zsplit'") - - if y is None: - d = x - else: - x, y = map(asarray, (x, y)) - if len(x) != len(y): - raise ValueError('Unequal N in wilcoxon. Aborting.') - d = x-y - - if zero_method == "wilcox": - d = compress(not_equal(d, 0), d, axis=-1) # Keep all non-zero differences - - count = len(d) - if (count < 10): - warnings.warn("Warning: sample size too small for normal approximation.") - r = stats.rankdata(abs(d)) - r_plus = sum((d > 0) * r, axis=0) - r_minus = sum((d < 0) * r, axis=0) - - if zero_method == "zsplit": - r_zero = sum((d == 0) * r, axis=0) - r_plus += r_zero / 2. - r_minus += r_zero / 2. - - T = min(r_plus, r_minus) - mn = count*(count + 1.) * 0.25 - se = count*(count + 1.) * (2. * count + 1.) - - if zero_method == "pratt": - r = r[d != 0] - - replist, repnum = find_repeats(r) - if repnum.size != 0: - # Correction for repeated elements. - se -= 0.5 * (repnum * (repnum * repnum - 1)).sum() - - se = sqrt(se / 24) - correction = 0.5 * int(bool(correction)) * np.sign(T - mn) - z = (T - mn - correction) / se - prob = 2. * distributions.norm.sf(abs(z)) - return T, prob - - -def _hermnorm(N): - # return the negatively normalized hermite polynomials up to order N-1 - # (inclusive) - # using the recursive relationship - # p_n+1 = p_n(x)' - x*p_n(x) - # and p_0(x) = 1 - plist = [None]*N - plist[0] = poly1d(1) - for n in range(1,N): - plist[n] = plist[n-1].deriv() - poly1d([1,0])*plist[n-1] - return plist - - -def pdf_fromgamma(g1,g2,g3=0.0,g4=None): - if g4 is None: - g4 = 3*g2*g2 - sigsq = 1.0/g2 - sig = sqrt(sigsq) - mu = g1*sig**3.0 - p12 = _hermnorm(13) - for k in range(13): - p12[k] = p12[k]/sig**k - - # Add all of the terms to polynomial - totp = p12[0] - (g1/6.0*p12[3]) + \ - (g2/24.0*p12[4] + g1*g1/72.0*p12[6]) - \ - (g3/120.0*p12[5] + g1*g2/144.0*p12[7] + g1**3.0/1296.0*p12[9]) + \ - (g4/720*p12[6] + (g2*g2/1152.0+g1*g3/720)*p12[8] + - g1*g1*g2/1728.0*p12[10] + g1**4.0/31104.0*p12[12]) - # Final normalization - totp = totp / sqrt(2*pi)/sig - - def thefunc(x): - xn = (x-mu)/sig - return totp(xn)*exp(-xn*xn/2.0) - return thefunc - - -def _circfuncs_common(samples, high, low): - samples = np.asarray(samples) - if samples.size == 0: - return np.nan, np.nan - - ang = (samples - low)*2*pi / (high-low) - return samples, ang - - -def circmean(samples, high=2*pi, low=0, axis=None): - """ - Compute the circular mean for samples in a range. - - Parameters - ---------- - samples : array_like - Input array. - high : float or int, optional - High boundary for circular mean range. Default is ``2*pi``. - low : float or int, optional - Low boundary for circular mean range. Default is 0. - axis : int, optional - Axis along which means are computed. The default is to compute - the mean of the flattened array. - - Returns - ------- - circmean : float - Circular mean. 
- - """ - samples, ang = _circfuncs_common(samples, high, low) - res = angle(np.mean(exp(1j*ang), axis=axis)) - mask = res < 0 - if (mask.ndim > 0): - res[mask] += 2*pi - elif mask: - res = res + 2*pi - - return res*(high-low)/2.0/pi + low - - -def circvar(samples, high=2*pi, low=0, axis=None): - """ - Compute the circular variance for samples assumed to be in a range - - Parameters - ---------- - samples : array_like - Input array. - low : float or int, optional - Low boundary for circular variance range. Default is 0. - high : float or int, optional - High boundary for circular variance range. Default is ``2*pi``. - axis : int, optional - Axis along which variances are computed. The default is to compute - the variance of the flattened array. - - Returns - ------- - circvar : float - Circular variance. - - Notes - ----- - This uses a definition of circular variance that in the limit of small - angles returns a number close to the 'linear' variance. - - """ - samples, ang = _circfuncs_common(samples, high, low) - res = np.mean(exp(1j*ang), axis=axis) - R = abs(res) - return ((high-low)/2.0/pi)**2 * 2 * log(1/R) - - -def circstd(samples, high=2*pi, low=0, axis=None): - """ - Compute the circular standard deviation for samples assumed to be in the - range [low to high]. - - Parameters - ---------- - samples : array_like - Input array. - low : float or int, optional - Low boundary for circular standard deviation range. Default is 0. - high : float or int, optional - High boundary for circular standard deviation range. - Default is ``2*pi``. - axis : int, optional - Axis along which standard deviations are computed. The default is - to compute the standard deviation of the flattened array. - - Returns - ------- - circstd : float - Circular standard deviation. - - Notes - ----- - This uses a definition of circular standard deviation that in the limit of - small angles returns a number close to the 'linear' standard deviation. - - """ - samples, ang = _circfuncs_common(samples, high, low) - res = np.mean(exp(1j*ang), axis=axis) - R = abs(res) - return ((high-low)/2.0/pi) * sqrt(-2*log(R)) - - -# Tests to include (from R) -- some of these already in stats. -######## -# X Ansari-Bradley -# X Bartlett (and Levene) -# X Binomial -# Y Pearson's Chi-squared (stats.chisquare) -# Y Association Between Paired samples (stats.pearsonr, stats.spearmanr) -# stats.kendalltau) -- these need work though -# Fisher's exact test -# X Fligner-Killeen Test -# Y Friedman Rank Sum (stats.friedmanchisquare?) -# Y Kruskal-Wallis -# Y Kolmogorov-Smirnov -# Cochran-Mantel-Haenszel Chi-Squared for Count -# McNemar's Chi-squared for Count -# X Mood Two-Sample -# X Test For Equal Means in One-Way Layout (see stats.ttest also) -# Pairwise Comparisons of proportions -# Pairwise t tests -# Tabulate p values for pairwise comparisons -# Pairwise Wilcoxon rank sum tests -# Power calculations two sample test of prop. -# Power calculations for one and two sample t tests -# Equal or Given Proportions -# Trend in Proportions -# Quade Test -# Y Student's T Test -# Y F Test to compare two variances -# XY Wilcoxon Rank Sum and Signed Rank Tests +# Author: Travis Oliphant, 2002 +# +# Further updates and enhancements by many SciPy developers. 
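+#
+# The module provides Bayesian mean/variance/std intervals (bayes_mvs,
+# mvsdist), k-statistics (kstat, kstatvar), probability-plot helpers
+# (probplot, ppcc_max, ppcc_plot), Box-Cox utilities (boxcox, boxcox_llf,
+# boxcox_normmax, boxcox_normplot), tests for normality and equal scale
+# (shapiro, anderson, ansari, bartlett, levene, fligner, mood, wilcoxon,
+# binom_test) and circular statistics (circmean, circvar, circstd).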
+# +from __future__ import division, print_function, absolute_import + +import math +import warnings + +import numpy as np +from numpy import (isscalar, r_, log, sum, around, unique, asarray, zeros, + arange, sort, amin, amax, any, atleast_1d, sqrt, ceil, + floor, array, poly1d, compress, not_equal, pi, exp, ravel, + angle) +from numpy.testing.decorators import setastest + +from scipy.lib.six import string_types +from scipy import optimize +from scipy import special +from wafo.stats import statlib +from wafo.stats import stats +from wafo.stats.stats import find_repeats +from wafo.stats import distributions +from wafo.stats._distn_infrastructure import rv_generic + + +__all__ = ['mvsdist', + 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', + 'ppcc_plot', + 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot', + 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test', + 'fligner', 'mood', 'wilcoxon', + 'pdf_fromgamma', 'circmean', 'circvar', 'circstd', + ] + + +def bayes_mvs(data, alpha=0.90): + """ + Bayesian confidence intervals for the mean, var, and std. + + Parameters + ---------- + data : array_like + Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`. + Requires 2 or more data points. + alpha : float, optional + Probability that the returned confidence interval contains + the true parameter. + + Returns + ------- + mean_cntr, var_cntr, std_cntr : tuple + The three results are for the mean, variance and standard deviation, + respectively. Each result is a tuple of the form:: + + (center, (lower, upper)) + + with `center` the mean of the conditional pdf of the value given the + data, and `(lower, upper)` a confidence interval, centered on the + median, containing the estimate to a probability `alpha`. + + Notes + ----- + Each tuple of mean, variance, and standard deviation estimates represent + the (center, (lower, upper)) with center the mean of the conditional pdf + of the value given the data and (lower, upper) is a confidence interval + centered on the median, containing the estimate to a probability + `alpha`. + + Converts data to 1-D and assumes all data has the same mean and variance. + Uses Jeffrey's prior for variance and std. + + Equivalent to tuple((x.mean(), x.interval(alpha)) for x in mvsdist(dat)) + + References + ---------- + T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and + standard-deviation from data", http://hdl.handle.net/1877/438, 2006. + + """ + res = mvsdist(data) + if alpha >= 1 or alpha <= 0: + raise ValueError( + "0 < alpha < 1 is required, but alpha=%s was given." % alpha) + return tuple((x.mean(), x.interval(alpha)) for x in res) + + +def mvsdist(data): + """ + 'Frozen' distributions for mean, variance, and standard deviation of data. + + Parameters + ---------- + data : array_like + Input array. Converted to 1-D using ravel. + Requires 2 or more data-points. + + Returns + ------- + mdist : "frozen" distribution object + Distribution object representing the mean of the data + vdist : "frozen" distribution object + Distribution object representing the variance of the data + sdist : "frozen" distribution object + Distribution object representing the standard deviation of the data + + Notes + ----- + The return values from bayes_mvs(data) is equivalent to + ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``. 
+ + In other words, calling ``.mean()`` and ``.interval(0.90)`` + on the three distribution objects returned from this function will give + the same results that are returned from `bayes_mvs`. + + Examples + -------- + >>> from scipy.stats import mvsdist + >>> data = [6, 9, 12, 7, 8, 8, 13] + >>> mean, var, std = mvsdist(data) + + We now have frozen distribution objects "mean", "var" and "std" that we can + examine: + + >>> mean.mean() + 9.0 + >>> mean.interval(0.95) + (6.6120585482655692, 11.387941451734431) + >>> mean.std() + 1.1952286093343936 + + """ + x = ravel(data) + n = len(x) + if (n < 2): + raise ValueError("Need at least 2 data-points.") + xbar = x.mean() + C = x.var() + if (n > 1000): # gaussian approximations for large n + mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n)) + sdist = distributions.norm( + loc=math.sqrt(C), scale=math.sqrt(C / (2. * n))) + vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C) + else: + nm1 = n - 1 + fac = n * C / 2. + val = nm1 / 2. + mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1)) + sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac)) + vdist = distributions.invgamma(val, scale=fac) + return mdist, vdist, sdist + + +def kstat(data, n=2): + """ + Return the nth k-statistic (1<=n<=4 so far). + + The nth k-statistic is the unique symmetric unbiased estimator of the nth + cumulant kappa_n. + + Parameters + ---------- + data : array_like + Input array. + n : int, {1, 2, 3, 4}, optional + Default is equal to 2. + + Returns + ------- + kstat : float + The nth k-statistic. + + See Also + -------- + kstatvar: Returns an unbiased estimator of the variance of the k-statistic. + + Notes + ----- + The cumulants are related to central moments but are specifically defined + using a power series expansion of the logarithm of the characteristic + function (which is the Fourier transform of the PDF). + In particular let phi(t) be the characteristic function, then:: + + ln phi(t) = > kappa_n (it)^n / n! (sum from n=0 to inf) + + The first few cumulants (kappa_n) in terms of central moments (mu_n) are:: + + kappa_1 = mu_1 + kappa_2 = mu_2 + kappa_3 = mu_3 + kappa_4 = mu_4 - 3*mu_2**2 + kappa_5 = mu_5 - 10*mu_2 * mu_3 + + References + ---------- + http://mathworld.wolfram.com/k-Statistic.html + + http://mathworld.wolfram.com/Cumulant.html + + """ + if n > 4 or n < 1: + raise ValueError("k-statistics only supported for 1<=n<=4") + n = int(n) + S = zeros(n + 1, 'd') + data = ravel(data) + N = len(data) + for k in range(1, n + 1): + S[k] = sum(data ** k, axis=0) + if n == 1: + return S[1] * 1.0 / N + elif n == 2: + return (N * S[2] - S[1] ** 2.0) / (N * (N - 1.0)) + elif n == 3: + return (2 * S[1] ** 3 - 3 * N * S[1] * S[2] + N * N * S[3]) / (N * (N - 1.0) * (N - 2.0)) + elif n == 4: + return (-6 * S[1] ** 4 + 12 * N * S[1] ** 2 * S[2] - 3 * N * (N - 1.0) * S[2] ** 2 - + 4 * N * (N + 1) * S[1] * S[3] + N * N * (N + 1) * S[4]) / \ + (N * (N - 1.0) * (N - 2.0) * (N - 3.0)) + else: + raise ValueError("Should not be here.") + + +def kstatvar(data, n=2): + """ + Returns an unbiased estimator of the variance of the k-statistic. + + See `kstat` for more details of the k-statistic. + + Parameters + ---------- + data : array_like + Input array. + n : int, {1, 2}, optional + Default is equal to 2. + + Returns + ------- + kstatvar : float + The nth k-statistic variance. 
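+
+    A minimal usage sketch (data values are arbitrary; ``n=2`` requests the
+    variance of the second k-statistic, i.e. of the unbiased variance
+    estimate returned by ``kstat(data, 2)``):
+
+    >>> from wafo.stats.morestats import kstat, kstatvar
+    >>> data = [1.2, 2.3, 0.7, 3.1, 1.8, 2.0, 2.6]
+    >>> k2 = kstat(data, n=2)
+    >>> k2_var = kstatvar(data, n=2)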
+ + See Also + -------- + kstat + + """ + data = ravel(data) + N = len(data) + if n == 1: + return kstat(data, n=2) * 1.0 / N + elif n == 2: + k2 = kstat(data, n=2) + k4 = kstat(data, n=4) + return (2 * k2 * k2 * N + (N - 1) * k4) / (N * (N + 1)) + else: + raise ValueError("Only n=1 or n=2 supported.") + + +def _calc_uniform_order_statistic_medians(x): + """See Notes section of `probplot` for details.""" + N = len(x) + osm_uniform = np.zeros(N, dtype=np.float64) + osm_uniform[-1] = 0.5 ** (1.0 / N) + osm_uniform[0] = 1 - osm_uniform[-1] + i = np.arange(2, N) + osm_uniform[1:-1] = (i - 0.3175) / (N + 0.365) + return osm_uniform + + +def _parse_dist_kw(dist, enforce_subclass=True): + """Parse `dist` keyword. + + Parameters + ---------- + dist : str or stats.distributions instance. + Several functions take `dist` as a keyword, hence this utility + function. + enforce_subclass : bool, optional + If True (default), `dist` needs to be a + `_distn_infrastructure.rv_generic` instance. + It can sometimes be useful to set this keyword to False, if a function + wants to accept objects that just look somewhat like such an instance + (for example, they have a ``ppf`` method). + + """ + if isinstance(dist, rv_generic): + pass + elif isinstance(dist, string_types): + try: + dist = getattr(distributions, dist) + except AttributeError: + raise ValueError("%s is not a valid distribution name" % dist) + elif enforce_subclass: + msg = ("`dist` should be a stats.distributions instance or a string " + "with the name of such a distribution.") + raise ValueError(msg) + + return dist + + +def probplot(x, sparams=(), dist='norm', fit=True, plot=None): + """ + Calculate quantiles for a probability plot, and optionally show the plot. + + Generates a probability plot of sample data against the quantiles of a + specified theoretical distribution (the normal distribution by default). + `probplot` optionally calculates a best-fit line for the data and plots the + results using Matplotlib or a given plot function. + + Parameters + ---------- + x : array_like + Sample/response data from which `probplot` creates the plot. + sparams : tuple, optional + Distribution-specific shape parameters (shape parameters plus location + and scale). + dist : str or stats.distributions instance, optional + Distribution or distribution function name. The default is 'norm' for a + normal probability plot. Objects that look enough like a + stats.distributions instance (i.e. they have a ``ppf`` method) are also + accepted. + fit : bool, optional + Fit a least-squares regression (best-fit) line to the sample data if + True (default). + plot : object, optional + If given, plots the quantiles and least squares fit. + `plot` is an object that has to have methods "plot" and "text". + The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, + or a custom object with the same methods. + Default is None, which means that no plot is created. + + Returns + ------- + (osm, osr) : tuple of ndarrays + Tuple of theoretical quantiles (osm, or order statistic medians) and + ordered responses (osr). `osr` is simply sorted input `x`. + For details on how `osm` is calculated see the Notes section. + (slope, intercept, r) : tuple of floats, optional + Tuple containing the result of the least-squares fit, if that is + performed by `probplot`. `r` is the square root of the coefficient of + determination. If ``fit=False`` and ``plot=None``, this tuple is not + returned. 
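+
+        With the default ``fit=True`` the call therefore unpacks as
+        ``(osm, osr), (slope, intercept, r) = probplot(x)`` for a 1-D sample
+        ``x``, and ``r**2`` is the coefficient of determination of the
+        fitted line.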
+ + Notes + ----- + Even if `plot` is given, the figure is not shown or saved by `probplot`; + ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after + calling `probplot`. + + `probplot` generates a probability plot, which should not be confused with + a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this + type, see ``statsmodels.api.ProbPlot``. + + The formula used for the theoretical quantiles (horizontal axis of the + probability plot) is Filliben's estimate:: + + quantiles = dist.ppf(val), for + + 0.5**(1/n), for i = n + val = (i - 0.3175) / (n + 0.365), for i = 2, ..., n-1 + 1 - 0.5**(1/n), for i = 1 + + where ``i`` indicates the i-th ordered value and ``n`` is the total number + of values. + + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + >>> nsample = 100 + >>> np.random.seed(7654321) + + A t distribution with small degrees of freedom: + + >>> ax1 = plt.subplot(221) + >>> x = stats.t.rvs(3, size=nsample) + >>> res = stats.probplot(x, plot=plt) + + A t distribution with larger degrees of freedom: + + >>> ax2 = plt.subplot(222) + >>> x = stats.t.rvs(25, size=nsample) + >>> res = stats.probplot(x, plot=plt) + + A mixture of two normal distributions with broadcasting: + + >>> ax3 = plt.subplot(223) + >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5], + ... size=(nsample/2.,2)).ravel() + >>> res = stats.probplot(x, plot=plt) + + A standard normal distribution: + + >>> ax4 = plt.subplot(224) + >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample) + >>> res = stats.probplot(x, plot=plt) + + Produce a new figure with a loggamma distribution, using the ``dist`` and + ``sparams`` keywords: + + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> x = stats.loggamma.rvs(c=2.5, size=500) + >>> stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax) + >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5") + + Show the results with Matplotlib: + + >>> plt.show() + + """ + x = np.asarray(x) + osm_uniform = _calc_uniform_order_statistic_medians(x) + dist = _parse_dist_kw(dist, enforce_subclass=False) + if sparams is None: + sparams = () + if isscalar(sparams): + sparams = (sparams,) + if not isinstance(sparams, tuple): + sparams = tuple(sparams) + + osm = dist.ppf(osm_uniform, *sparams) + osr = sort(x) + if fit or (plot is not None): + # perform a linear fit. + slope, intercept, r, _prob, _sterrest = stats.linregress(osm, osr) + + if plot is not None: + plot.plot(osm, osr, 'bo', osm, slope * osm + intercept, 'r-') + try: + if hasattr(plot, 'set_title'): + # Matplotlib Axes instance or something that looks like it + plot.set_title('Probability Plot') + plot.set_xlabel('Quantiles') + plot.set_ylabel('Ordered Values') + else: + # matplotlib.pyplot module + plot.title('Probability Plot') + plot.xlabel('Quantiles') + plot.ylabel('Ordered Values') + except: + # Not an MPL object or something that looks (enough) like it. + # Don't crash on adding labels or title + pass + + # Add R^2 value to the plot as text + xmin = amin(osm) + xmax = amax(osm) + ymin = amin(x) + ymax = amax(x) + posx = xmin + 0.70 * (xmax - xmin) + posy = ymin + 0.01 * (ymax - ymin) + plot.text(posx, posy, "$R^2=%1.4f$" % r) + + if fit: + return (osm, osr), (slope, intercept, r) + else: + return osm, osr + + +def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'): + """Returns the shape parameter that maximizes the probability plot + correlation coefficient for the given data to a one-parameter + family of distributions. 
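+
+    A minimal usage sketch (random data; for roughly normal samples the
+    optimal Tukey-lambda shape typically comes out near 0.14):
+
+    >>> import numpy as np
+    >>> from wafo.stats.morestats import ppcc_max
+    >>> x = np.random.normal(size=200)
+    >>> shape = ppcc_max(x)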
+ + See also ppcc_plot + """ + dist = _parse_dist_kw(dist) + osm_uniform = _calc_uniform_order_statistic_medians(x) + osr = sort(x) + + # this function computes the x-axis values of the probability plot + # and computes a linear regression (including the correlation) + # and returns 1-r so that a minimization function maximizes the + # correlation + def tempfunc(shape, mi, yvals, func): + xvals = func(mi, shape) + r, _prob = stats.pearsonr(xvals, yvals) + return 1 - r + + return optimize.brent(tempfunc, brack=brack, + args=(osm_uniform, osr, dist.ppf)) + + +def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80): + """Returns (shape, ppcc), and optionally plots shape vs. ppcc + (probability plot correlation coefficient) as a function of shape + parameter for a one-parameter family of distributions from shape + value a to b. + + See also ppcc_max + """ + svals = r_[a:b:complex(N)] + ppcc = svals * 0.0 + k = 0 + for sval in svals: + _r1, r2 = probplot(x, sval, dist=dist, fit=1) + ppcc[k] = r2[-1] + k += 1 + if plot is not None: + plot.plot(svals, ppcc, 'x') + plot.title('(%s) PPCC Plot' % dist) + plot.xlabel('Prob Plot Corr. Coef.') + plot.ylabel('Shape Values') + return svals, ppcc + + +def boxcox_llf(lmb, data): + r"""The boxcox log-likelihood function. + + Parameters + ---------- + lmb : scalar + Parameter for Box-Cox transformation. See `boxcox` for details. + data : array_like + Data to calculate Box-Cox log-likelihood for. If `data` is + multi-dimensional, the log-likelihood is calculated along the first + axis. + + Returns + ------- + llf : float or ndarray + Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`, + an array otherwise. + + See Also + -------- + boxcox, probplot, boxcox_normplot, boxcox_normmax + + Notes + ----- + The Box-Cox log-likelihood function is defined here as + + .. math:: + + llf = (\lambda - 1) \sum_i(\log(x_i)) - + N/2 \log(\sum_i (y_i - \bar{y})^2 / N), + + where ``y`` is the Box-Cox transformed input data ``x``. + + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes + >>> np.random.seed(1245) + + Generate some random variates and calculate Box-Cox log-likelihood values + for them for a range of ``lmbda`` values: + + >>> x = stats.loggamma.rvs(5, loc=10, size=1000) + >>> lmbdas = np.linspace(-2, 10) + >>> llf = np.zeros(lmbdas.shape, dtype=np.float) + >>> for ii, lmbda in enumerate(lmbdas): + ... llf[ii] = stats.boxcox_llf(lmbda, x) + + Also find the optimal lmbda value with `boxcox`: + + >>> x_most_normal, lmbda_optimal = stats.boxcox(x) + + Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a + horizontal line to check that that's really the optimum: + + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> ax.plot(lmbdas, llf, 'b.-') + >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r') + >>> ax.set_xlabel('lmbda parameter') + >>> ax.set_ylabel('Box-Cox log-likelihood') + + Now add some probability plots to show that where the log-likelihood is + maximized the data transformed with `boxcox` looks closest to normal: + + >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right' + >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs): + ... xt = stats.boxcox(x, lmbda=lmbda) + ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt) + ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc) + ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-') + ... 
ax_inset.set_xticklabels([]) + ... ax_inset.set_yticklabels([]) + ... ax_inset.set_title('$\lambda=%1.2f$' % lmbda) + + >>> plt.show() + + """ + data = np.asarray(data) + N = data.shape[0] + if N == 0: + return np.nan + + y = boxcox(data, lmb) + y_mean = np.mean(y, axis=0) + llf = (lmb - 1) * np.sum(np.log(data), axis=0) + llf -= N / 2.0 * np.log(np.sum((y - y_mean) ** 2. / N, axis=0)) + return llf + + +def _boxcox_conf_interval(x, lmax, alpha): + # Need to find the lambda for which + # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1 + fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1) + target = boxcox_llf(lmax, x) - fac + + def rootfunc(lmbda, data, target): + return boxcox_llf(lmbda, data) - target + + # Find positive endpoint of interval in which answer is to be found + newlm = lmax + 0.5 + N = 0 + while (rootfunc(newlm, x, target) > 0.0) and (N < 500): + newlm += 0.1 + N += 1 + + if N == 500: + raise RuntimeError("Could not find endpoint.") + + lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target)) + + # Now find negative interval in the same way + newlm = lmax - 0.5 + N = 0 + while (rootfunc(newlm, x, target) > 0.0) and (N < 500): + newlm -= 0.1 + N += 1 + + if N == 500: + raise RuntimeError("Could not find endpoint.") + + lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target)) + return lmminus, lmplus + + +def boxcox(x, lmbda=None, alpha=None): + r""" + Return a positive dataset transformed by a Box-Cox power transformation. + + Parameters + ---------- + x : ndarray + Input array. Should be 1-dimensional. + lmbda : {None, scalar}, optional + If `lmbda` is not None, do the transformation for that value. + + If `lmbda` is None, find the lambda that maximizes the log-likelihood + function and return it as the second output argument. + alpha : {None, float}, optional + If `alpha` is not None, return the ``100 * (1-alpha)%`` confidence + interval for `lmbda` as the third output argument. + Must be between 0.0 and 1.0. + + Returns + ------- + boxcox : ndarray + Box-Cox power transformed array. + maxlog : float, optional + If the `lmbda` parameter is None, the second returned argument is + the lambda that maximizes the log-likelihood function. + (min_ci, max_ci) : tuple of float, optional + If `lmbda` parameter is None and `alpha` is not None, this returned + tuple of floats represents the minimum and maximum confidence limits + given `alpha`. + + See Also + -------- + probplot, boxcox_normplot, boxcox_normmax, boxcox_llf + + Notes + ----- + The Box-Cox transform is given by:: + + y = (x**lmbda - 1) / lmbda, for lmbda > 0 + log(x), for lmbda = 0 + + `boxcox` requires the input data to be positive. Sometimes a Box-Cox + transformation provides a shift parameter to achieve this; `boxcox` does + not. Such a shift parameter is equivalent to adding a positive constant to + `x` before calling `boxcox`. + + The confidence limits returned when `alpha` is provided give the interval + where: + + .. math:: + + llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1), + + with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared + function. + + References + ---------- + G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the + Royal Statistical Society B, 26, 211-252 (1964). 
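+
+    As a quick numeric check of the transform defined above,
+    ``boxcox([1., 2., 3.], lmbda=0.)`` reduces to ``log([1., 2., 3.])``,
+    while ``lmbda=1.`` merely shifts the data to ``[0., 1., 2.]``.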
+ + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + + We generate some random variates from a non-normal distribution and make a + probability plot for it, to show it is non-normal in the tails: + + >>> fig = plt.figure() + >>> ax1 = fig.add_subplot(211) + >>> x = stats.loggamma.rvs(5, size=500) + 5 + >>> stats.probplot(x, dist=stats.norm, plot=ax1) + >>> ax1.set_xlabel('') + >>> ax1.set_title('Probplot against normal distribution') + + We now use `boxcox` to transform the data so it's closest to normal: + + >>> ax2 = fig.add_subplot(212) + >>> xt, _ = stats.boxcox(x) + >>> stats.probplot(xt, dist=stats.norm, plot=ax2) + >>> ax2.set_title('Probplot after Box-Cox transformation') + + >>> plt.show() + + """ + x = np.asarray(x) + if x.size == 0: + return x + + if any(x <= 0): + raise ValueError("Data must be positive.") + + if lmbda is not None: # single transformation + return special.boxcox(x, lmbda) # @UndefinedVariable + + # If lmbda=None, find the lmbda that maximizes the log-likelihood function. + lmax = boxcox_normmax(x, method='mle') + y = boxcox(x, lmax) + + if alpha is None: + return y, lmax + else: + # Find confidence interval + interval = _boxcox_conf_interval(x, lmax, alpha) + return y, lmax, interval + + +def boxcox_normmax(x, brack=(-2.0, 2.0), method='pearsonr'): + """Compute optimal Box-Cox transform parameter for input data. + + Parameters + ---------- + x : array_like + Input array. + brack : 2-tuple, optional + The starting interval for a downhill bracket search with + `optimize.brent`. Note that this is in most cases not critical; the + final result is allowed to be outside this bracket. + method : str, optional + The method to determine the optimal transform parameter (`boxcox` + ``lmbda`` parameter). Options are: + + 'pearsonr' (default) + Maximizes the Pearson correlation coefficient between + ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be + normally-distributed. + + 'mle' + Minimizes the log-likelihood `boxcox_llf`. This is the method used + in `boxcox`. + + 'all' + Use all optimization methods available, and return all results. + Useful to compare different methods. + + Returns + ------- + maxlog : float or ndarray + The optimal transform parameter found. An array instead of a scalar + for ``method='all'``. + + See Also + -------- + boxcox, boxcox_llf, boxcox_normplot + + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + >>> np.random.seed(1234) # make this example reproducible + + Generate some data and determine optimal ``lmbda`` in various ways: + + >>> x = stats.loggamma.rvs(5, size=30) + 5 + >>> y, lmax_mle = stats.boxcox(x) + >>> lmax_pearsonr = stats.boxcox_normmax(x) + + >>> lmax_mle + 7.177... + >>> lmax_pearsonr + 7.916... + >>> stats.boxcox_normmax(x, method='all') + array([ 7.91667384, 7.17718692]) + + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> stats.boxcox_normplot(x, -10, 10, plot=ax) + >>> ax.axvline(lmax_mle, color='r') + >>> ax.axvline(lmax_pearsonr, color='g', ls='--') + + >>> plt.show() + + """ + def _pearsonr(x, brack): + osm_uniform = _calc_uniform_order_statistic_medians(x) + xvals = distributions.norm.ppf(osm_uniform) + + def _eval_pearsonr(lmbda, xvals, samps): + # This function computes the x-axis values of the probability plot + # and computes a linear regression (including the correlation) and + # returns ``1 - r`` so that a minimization function maximizes the + # correlation. 
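+            # ``xvals`` holds the theoretical normal quantiles computed once
+            # in the enclosing scope; ``samps`` is the raw sample, which is
+            # re-transformed for every candidate ``lmbda`` the optimizer tries.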
+ y = boxcox(samps, lmbda) + yvals = np.sort(y) + r, _prob = stats.pearsonr(xvals, yvals) + return 1 - r + + return optimize.brent(_eval_pearsonr, brack=brack, args=(xvals, x)) + + def _mle(x, brack): + def _eval_mle(lmb, data): + # function to minimize + return -boxcox_llf(lmb, data) + + return optimize.brent(_eval_mle, brack=brack, args=(x,)) + + def _all(x, brack): + maxlog = np.zeros(2, dtype=np.float) + maxlog[0] = _pearsonr(x, brack) + maxlog[1] = _mle(x, brack) + return maxlog + + methods = {'pearsonr': _pearsonr, + 'mle': _mle, + 'all': _all} + if not method in methods.keys(): + raise ValueError("Method %s not recognized." % method) + + optimfunc = methods[method] + return optimfunc(x, brack) + + +def boxcox_normplot(x, la, lb, plot=None, N=80): + """Compute parameters for a Box-Cox normality plot, optionally show it. + + A Box-Cox normality plot shows graphically what the best transformation + parameter is to use in `boxcox` to obtain a distribution that is close + to normal. + + Parameters + ---------- + x : array_like + Input array. + la, lb : scalar + The lower and upper bounds for the ``lmbda`` values to pass to `boxcox` + for Box-Cox transformations. These are also the limits of the + horizontal axis of the plot if that is generated. + plot : object, optional + If given, plots the quantiles and least squares fit. + `plot` is an object that has to have methods "plot" and "text". + The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, + or a custom object with the same methods. + Default is None, which means that no plot is created. + N : int, optional + Number of points on the horizontal axis (equally distributed from + `la` to `lb`). + + Returns + ------- + lmbdas : ndarray + The ``lmbda`` values for which a Box-Cox transform was done. + ppcc : ndarray + Probability Plot Correlelation Coefficient, as obtained from `probplot` + when fitting the Box-Cox transformed input `x` against a normal + distribution. + + See Also + -------- + probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max + + Notes + ----- + Even if `plot` is given, the figure is not shown or saved by + `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')`` + should be used after calling `probplot`. + + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + + Generate some non-normally distributed data, and create a Box-Cox plot: + + >>> x = stats.loggamma.rvs(5, size=500) + 5 + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> stats.boxcox_normplot(x, -20, 20, plot=ax) + + Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in + the same plot: + + >>> _, maxlog = stats.boxcox(x) + >>> ax.axvline(maxlog, color='r') + + >>> plt.show() + + """ + x = np.asarray(x) + if x.size == 0: + return x + + if lb <= la: + raise ValueError("`lb` has to be larger than `la`.") + + lmbdas = np.linspace(la, lb, num=N) + ppcc = lmbdas * 0.0 + for i, val in enumerate(lmbdas): + # Determine for each lmbda the correlation coefficient of transformed x + z = boxcox(x, lmbda=val) + _, r2 = probplot(z, dist='norm', fit=True) + ppcc[i] = r2[-1] + + if plot is not None: + plot.plot(lmbdas, ppcc, 'x') + try: + if hasattr(plot, 'set_title'): + # Matplotlib Axes instance or something that looks like it + plot.set_title('Box-Cox Normality Plot') + plot.set_ylabel('Prob Plot Corr. Coef.') + plot.set_xlabel('$\lambda$') + else: + # matplotlib.pyplot module + plot.title('Box-Cox Normality Plot') + plot.ylabel('Prob Plot Corr. 
Coef.') + plot.xlabel('$\lambda$') + except Exception: + # Not an MPL object or something that looks (enough) like it. + # Don't crash on adding labels or title + pass + + return lmbdas, ppcc + + +def shapiro(x, a=None, reta=False): + """ + Perform the Shapiro-Wilk test for normality. + + The Shapiro-Wilk test tests the null hypothesis that the + data was drawn from a normal distribution. + + Parameters + ---------- + x : array_like + Array of sample data. + a : array_like, optional + Array of internal parameters used in the calculation. If these + are not given, they will be computed internally. If x has length + n, then a must have length n/2. + reta : bool, optional + Whether or not to return the internally computed a values. The + default is False. + + Returns + ------- + W : float + The test statistic. + p-value : float + The p-value for the hypothesis test. + a : array_like, optional + If `reta` is True, then these are the internally computed "a" + values that may be passed into this function on future calls. + + See Also + -------- + anderson : The Anderson-Darling test for normality + + References + ---------- + .. [1] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm + + """ + N = len(x) + if N < 3: + raise ValueError("Data must be at least length 3.") + if a is None: + a = zeros(N, 'f') + init = 0 + else: + if len(a) != N // 2: + raise ValueError("len(a) must equal len(x)/2") + init = 1 + y = sort(x) + a, w, pw, ifault = statlib.swilk(y, a[:N // 2], init) + if not ifault in [0, 2]: + warnings.warn(str(ifault)) + if N > 5000: + warnings.warn("p-value may not be accurate for N > 5000.") + if reta: + return w, pw, a + else: + return w, pw + +# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and +# Some Comparisons", Journal of he American Statistical +# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737 +_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092]) +_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957]) +# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution", +# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588. +_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038]) +# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based +# on the Empirical Distribution Function.", Biometrika, +# Vol. 66, Issue 3, Dec. 1979, pp 591-595. +_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010]) + + +def anderson(x, dist='norm'): + """ + Anderson-Darling test for data coming from a particular distribution + + The Anderson-Darling test is a modification of the Kolmogorov- + Smirnov test kstest_ for the null hypothesis that a sample is + drawn from a population that follows a particular distribution. + For the Anderson-Darling test, the critical values depend on + which distribution is being tested against. This function works + for normal, exponential, logistic, or Gumbel (Extreme Value + Type I) distributions. + + Parameters + ---------- + x : array_like + array of sample data + dist : {'norm','expon','logistic','gumbel','extreme1'}, optional + the type of distribution to test against. The default is 'norm' + and 'extreme1' is a synonym for 'gumbel' + + Returns + ------- + A2 : float + The Anderson-Darling test statistic + critical : list + The critical values for this distribution + sig : list + The significance levels for the corresponding critical values + in percents. 
The function returns critical values for a + differing set of significance levels depending on the + distribution that is being tested against. + + Notes + ----- + Critical values provided are for the following significance levels: + + normal/exponenential + 15%, 10%, 5%, 2.5%, 1% + logistic + 25%, 10%, 5%, 2.5%, 1%, 0.5% + Gumbel + 25%, 10%, 5%, 2.5%, 1% + + If A2 is larger than these critical values then for the corresponding + significance level, the null hypothesis that the data come from the + chosen distribution can be rejected. + + References + ---------- + .. [1] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm + .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and + Some Comparisons, Journal of the American Statistical Association, + Vol. 69, pp. 730-737. + .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit + Statistics with Unknown Parameters, Annals of Statistics, Vol. 4, + pp. 357-369. + .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value + Distribution, Biometrika, Vol. 64, pp. 583-588. + .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference + to Tests for Exponentiality , Technical Report No. 262, + Department of Statistics, Stanford University, Stanford, CA. + .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution + Based on the Empirical Distribution Function, Biometrika, Vol. 66, + pp. 591-595. + + """ + if not dist in ['norm', 'expon', 'gumbel', 'extreme1', 'logistic']: + raise ValueError("Invalid distribution; dist must be 'norm', " + "'expon', 'gumbel', 'extreme1' or 'logistic'.") + y = sort(x) + xbar = np.mean(x, axis=0) + N = len(y) + if dist == 'norm': + s = np.std(x, ddof=1, axis=0) + w = (y - xbar) / s + z = distributions.norm.cdf(w) + sig = array([15, 10, 5, 2.5, 1]) + critical = around(_Avals_norm / (1.0 + 4.0 / N - 25.0 / N / N), 3) + elif dist == 'expon': + w = y / xbar + z = distributions.expon.cdf(w) + sig = array([15, 10, 5, 2.5, 1]) + critical = around(_Avals_expon / (1.0 + 0.6 / N), 3) + elif dist == 'logistic': + def rootfunc(ab, xj, N): + a, b = ab + tmp = (xj - a) / b + tmp2 = exp(tmp) + val = [sum(1.0 / (1 + tmp2), axis=0) - 0.5 * N, + sum(tmp * (1.0 - tmp2) / (1 + tmp2), axis=0) + N] + return array(val) + sol0 = array([xbar, np.std(x, ddof=1, axis=0)]) + sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5) + w = (y - sol[0]) / sol[1] + z = distributions.logistic.cdf(w) + sig = array([25, 10, 5, 2.5, 1, 0.5]) + critical = around(_Avals_logistic / (1.0 + 0.25 / N), 3) + else: # (dist == 'gumbel') or (dist == 'extreme1'): + # the following is incorrect, see ticket:1097 +# def fixedsolve(th,xj,N): +## val = stats.sum(xj)*1.0/N +## tmp = exp(-xj/th) +## term = sum(xj*tmp,axis=0) +## term /= sum(tmp,axis=0) +# return val - term +## s = optimize.fixed_point(fixedsolve, 1.0, args=(x,N),xtol=1e-5) +## xbar = -s*log(sum(exp(-x/s),axis=0)*1.0/N) + xbar, s = distributions.gumbel_l.fit(x) + w = (y - xbar) / s + z = distributions.gumbel_l.cdf(w) + sig = array([25, 10, 5, 2.5, 1]) + critical = around(_Avals_gumbel / (1.0 + 0.2 / sqrt(N)), 3) + + i = arange(1, N + 1) + S = sum((2 * i - 1.0) / N * (log(z) + log(1 - z[::-1])), axis=0) + A2 = -N - S + return A2, critical, sig + + +def ansari(x, y): + """ + Perform the Ansari-Bradley test for equal scale parameters + + The Ansari-Bradley test is a non-parametric test for the equality + of the scale parameter of the distributions from which two + samples were drawn. 
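+
+    A minimal usage sketch (data values are arbitrary; a small p-value
+    suggests the two samples have different scales):
+
+    >>> import numpy as np
+    >>> from wafo.stats.morestats import ansari
+    >>> x = np.random.normal(scale=1.0, size=40)
+    >>> y = np.random.normal(scale=3.0, size=35)
+    >>> AB, p = ansari(x, y)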
+ + Parameters + ---------- + x, y : array_like + arrays of sample data + + Returns + ------- + AB : float + The Ansari-Bradley test statistic + p-value : float + The p-value of the hypothesis test + + See Also + -------- + fligner : A non-parametric test for the equality of k variances + mood : A non-parametric test for the equality of two scale parameters + + Notes + ----- + The p-value given is exact when the sample sizes are both less than + 55 and there are no ties, otherwise a normal approximation for the + p-value is used. + + References + ---------- + .. [1] Sprent, Peter and N.C. Smeeton. Applied nonparametric statistical + methods. 3rd ed. Chapman and Hall/CRC. 2001. Section 5.8.2. + + """ + x, y = asarray(x), asarray(y) + n = len(x) + m = len(y) + if m < 1: + raise ValueError("Not enough other observations.") + if n < 1: + raise ValueError("Not enough test observations.") + N = m + n + xy = r_[x, y] # combine + rank = stats.rankdata(xy) + symrank = amin(array((rank, N - rank + 1)), 0) + AB = sum(symrank[:n], axis=0) + uxy = unique(xy) + repeats = (len(uxy) != len(xy)) + exact = ((m < 55) and (n < 55) and not repeats) + if repeats and ((m < 55) or (n < 55)): + warnings.warn("Ties preclude use of exact statistic.") + if exact: + astart, a1, _ifault = statlib.gscale(n, m) + ind = AB - astart + total = sum(a1, axis=0) + if ind < len(a1) / 2.0: + cind = int(ceil(ind)) + if (ind == cind): + pval = 2.0 * sum(a1[:cind + 1], axis=0) / total + else: + pval = 2.0 * sum(a1[:cind], axis=0) / total + else: + find = int(floor(ind)) + if (ind == floor(ind)): + pval = 2.0 * sum(a1[find:], axis=0) / total + else: + pval = 2.0 * sum(a1[find + 1:], axis=0) / total + return AB, min(1.0, pval) + + # otherwise compute normal approximation + if N % 2: # N odd + mnAB = n * (N + 1.0) ** 2 / 4.0 / N + varAB = n * m * (N + 1.0) * (3 + N ** 2) / (48.0 * N ** 2) + else: + mnAB = n * (N + 2.0) / 4.0 + varAB = m * n * (N + 2) * (N - 2.0) / 48 / (N - 1.0) + if repeats: # adjust variance estimates + # compute sum(tj * rj**2,axis=0) + fac = sum(symrank ** 2, axis=0) + if N % 2: # N odd + varAB = m * n * \ + (16 * N * fac - (N + 1) ** 4) / (16.0 * N ** 2 * (N - 1)) + else: # N even + varAB = m * n * \ + (16 * fac - N * (N + 2) ** 2) / (16.0 * N * (N - 1)) + z = (AB - mnAB) / sqrt(varAB) + pval = distributions.norm.sf(abs(z)) * 2.0 + return AB, pval + + +def bartlett(*args): + """ + Perform Bartlett's test for equal variances + + Bartlett's test tests the null hypothesis that all input samples + are from populations with equal variances. For samples + from significantly non-normal populations, Levene's test + `levene`_ is more robust. + + Parameters + ---------- + sample1, sample2,... : array_like + arrays of sample data. May be different lengths. + + Returns + ------- + T : float + The test statistic. + p-value : float + The p-value of the test. + + References + ---------- + .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm + + .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical + Methods, Eighth Edition, Iowa State University Press. 
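+
+    Examples
+    --------
+    A minimal usage sketch (data values are arbitrary; a small p-value
+    argues against equal variances):
+
+    >>> import numpy as np
+    >>> from wafo.stats.morestats import bartlett
+    >>> a = np.random.normal(scale=1.0, size=50)
+    >>> b = np.random.normal(scale=2.0, size=60)
+    >>> T, p = bartlett(a, b)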
+ + """ + k = len(args) + if k < 2: + raise ValueError("Must enter at least two input sample vectors.") + Ni = zeros(k) + ssq = zeros(k, 'd') + for j in range(k): + Ni[j] = len(args[j]) + ssq[j] = np.var(args[j], ddof=1) + Ntot = sum(Ni, axis=0) + spsq = sum((Ni - 1) * ssq, axis=0) / (1.0 * (Ntot - k)) + numer = (Ntot * 1.0 - k) * log(spsq) - sum((Ni - 1.0) * log(ssq), axis=0) + denom = 1.0 + (1.0 / (3 * (k - 1))) * \ + ((sum(1.0 / (Ni - 1.0), axis=0)) - 1.0 / (Ntot - k)) + T = numer / denom + pval = distributions.chi2.sf(T, k - 1) # 1 - cdf + return T, pval + + +def levene(*args, **kwds): + """ + Perform Levene test for equal variances. + + The Levene test tests the null hypothesis that all input samples + are from populations with equal variances. Levene's test is an + alternative to Bartlett's test `bartlett` in the case where + there are significant deviations from normality. + + Parameters + ---------- + sample1, sample2, ... : array_like + The sample data, possibly with different lengths + center : {'mean', 'median', 'trimmed'}, optional + Which function of the data to use in the test. The default + is 'median'. + proportiontocut : float, optional + When `center` is 'trimmed', this gives the proportion of data points + to cut from each end. (See `scipy.stats.trim_mean`.) + Default is 0.05. + + Returns + ------- + W : float + The test statistic. + p-value : float + The p-value for the test. + + Notes + ----- + Three variations of Levene's test are possible. The possibilities + and their recommended usages are: + + * 'median' : Recommended for skewed (non-normal) distributions> + * 'mean' : Recommended for symmetric, moderate-tailed distributions. + * 'trimmed' : Recommended for heavy-tailed distributions. + + References + ---------- + .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm + .. [2] Levene, H. (1960). In Contributions to Probability and Statistics: + Essays in Honor of Harold Hotelling, I. Olkin et al. eds., + Stanford University Press, pp. 278-292. + .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American + Statistical Association, 69, 364-367 + + """ + # Handle keyword arguments. + center = 'median' + proportiontocut = 0.05 + for kw, value in kwds.items(): + if kw not in ['center', 'proportiontocut']: + raise TypeError( + "levene() got an unexpected keyword argument '%s'" % kw) + if kw == 'center': + center = value + else: + proportiontocut = value + + k = len(args) + if k < 2: + raise ValueError("Must enter at least two input sample vectors.") + Ni = zeros(k) + Yci = zeros(k, 'd') + + if not center in ['mean', 'median', 'trimmed']: + raise ValueError("Keyword argument
must be 'mean', 'median'" + + "or 'trimmed'.") + + if center == 'median': + func = lambda x: np.median(x, axis=0) + elif center == 'mean': + func = lambda x: np.mean(x, axis=0) + else: # center == 'trimmed' + args = tuple(stats.trimboth(np.sort(arg), proportiontocut) + for arg in args) + func = lambda x: np.mean(x, axis=0) + + for j in range(k): + Ni[j] = len(args[j]) + Yci[j] = func(args[j]) + Ntot = sum(Ni, axis=0) + + # compute Zij's + Zij = [None] * k + for i in range(k): + Zij[i] = abs(asarray(args[i]) - Yci[i]) + # compute Zbari + Zbari = zeros(k, 'd') + Zbar = 0.0 + for i in range(k): + Zbari[i] = np.mean(Zij[i], axis=0) + Zbar += Zbari[i] * Ni[i] + Zbar /= Ntot + + numer = (Ntot - k) * sum(Ni * (Zbari - Zbar) ** 2, axis=0) + + # compute denom_variance + dvar = 0.0 + for i in range(k): + dvar += sum((Zij[i] - Zbari[i]) ** 2, axis=0) + + denom = (k - 1.0) * dvar + + W = numer / denom + pval = distributions.f.sf(W, k - 1, Ntot - k) # 1 - cdf + return W, pval + + +@setastest(False) +def binom_test(x, n=None, p=0.5): + """ + Perform a test that the probability of success is p. + + This is an exact, two-sided test of the null hypothesis + that the probability of success in a Bernoulli experiment + is `p`. + + Parameters + ---------- + x : integer or array_like + the number of successes, or if x has length 2, it is the + number of successes and the number of failures. + n : integer + the number of trials. This is ignored if x gives both the + number of successes and failures + p : float, optional + The hypothesized probability of success. 0 <= p <= 1. The + default value is p = 0.5 + + Returns + ------- + p-value : float + The p-value of the hypothesis test + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Binomial_test + + """ + x = atleast_1d(x).astype(np.integer) + if len(x) == 2: + n = x[1] + x[0] + x = x[0] + elif len(x) == 1: + x = x[0] + if n is None or n < x: + raise ValueError("n must be >= x") + n = np.int_(n) + else: + raise ValueError("Incorrect length for x.") + + if (p > 1.0) or (p < 0.0): + raise ValueError("p must be in range [0,1]") + + d = distributions.binom.pmf(x, n, p) + rerr = 1 + 1e-7 + if (x == p * n): + # special case as shortcut, would also be handled by `else` below + pval = 1. + elif (x < p * n): + i = np.arange(np.ceil(p * n), n + 1) + y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=0) + pval = distributions.binom.cdf( + x, n, p) + distributions.binom.sf(n - y, n, p) + else: + i = np.arange(np.floor(p * n) + 1) + y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=0) + pval = distributions.binom.cdf( + y - 1, n, p) + distributions.binom.sf(x - 1, n, p) + + return min(1.0, pval) + + +def _apply_func(x, g, func): + # g is list of indices into x + # separating x into different groups + # func should be applied over the groups + g = unique(r_[0, g, len(x)]) + output = [] + for k in range(len(g) - 1): + output.append(func(x[g[k]:g[k + 1]])) + return asarray(output) + + +def fligner(*args, **kwds): + """ + Perform Fligner's test for equal variances. + + Fligner's test tests the null hypothesis that all input samples + are from populations with equal variances. Fligner's test is + non-parametric in contrast to Bartlett's test `bartlett` and + Levene's test `levene`. + + Parameters + ---------- + sample1, sample2, ... : array_like + arrays of sample data. 
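As a concrete illustration of the exact two-sided binomial test implemented above, a minimal sketch (function name and signature as in scipy.stats, which this module follows):

    # Sketch: is a coin showing 9 heads in 20 tosses consistent with p = 0.5?
    from scipy import stats

    pval = stats.binom_test(9, n=20, p=0.5)        # exact two-sided test
    # equivalently, pass successes and failures as a length-2 sequence
    pval_same = stats.binom_test([9, 11], p=0.5)
    print(pval, pval_same)                         # ~0.82, clearly not significant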
Need not be the same length + center : {'mean', 'median', 'trimmed'}, optional + keyword argument controlling which function of the data + is used in computing the test statistic. The default + is 'median'. + proportiontocut : float, optional + When `center` is 'trimmed', this gives the proportion of data points + to cut from each end. (See `scipy.stats.trim_mean`.) + Default is 0.05. + + Returns + ------- + Xsq : float + the test statistic + p-value : float + the p-value for the hypothesis test + + Notes + ----- + As with Levene's test there are three variants + of Fligner's test that differ by the measure of central + tendency used in the test. See `levene` for more information. + + References + ---------- + .. [1] http://www.stat.psu.edu/~bgl/center/tr/TR993.ps + + .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample + tests for scale. 'Journal of the American Statistical Association.' + 71(353), 210-213. + + """ + # Handle keyword arguments. + center = 'median' + proportiontocut = 0.05 + for kw, value in kwds.items(): + if kw not in ['center', 'proportiontocut']: + raise TypeError( + "fligner() got an unexpected keyword argument '%s'" % kw) + if kw == 'center': + center = value + else: + proportiontocut = value + + k = len(args) + if k < 2: + raise ValueError("Must enter at least two input sample vectors.") + + if not center in ['mean', 'median', 'trimmed']: + raise ValueError("Keyword argument
must be 'mean', 'median'" + + "or 'trimmed'.") + + if center == 'median': + func = lambda x: np.median(x, axis=0) + elif center == 'mean': + func = lambda x: np.mean(x, axis=0) + else: # center == 'trimmed' + args = tuple(stats.trimboth(arg, proportiontocut) for arg in args) + func = lambda x: np.mean(x, axis=0) + + Ni = asarray([len(args[j]) for j in range(k)]) + Yci = asarray([func(args[j]) for j in range(k)]) + Ntot = sum(Ni, axis=0) + # compute Zij's + Zij = [abs(asarray(args[i]) - Yci[i]) for i in range(k)] + allZij = [] + g = [0] + for i in range(k): + allZij.extend(list(Zij[i])) + g.append(len(allZij)) + + ranks = stats.rankdata(allZij) + a = distributions.norm.ppf(ranks / (2 * (Ntot + 1.0)) + 0.5) + + # compute Aibar + Aibar = _apply_func(a, g, sum) / Ni + anbar = np.mean(a, axis=0) + varsq = np.var(a, axis=0, ddof=1) + Xsq = sum(Ni * (asarray(Aibar) - anbar) ** 2.0, axis=0) / varsq + pval = distributions.chi2.sf(Xsq, k - 1) # 1 - cdf + return Xsq, pval + + +def mood(x, y, axis=0): + """ + Perform Mood's test for equal scale parameters. + + Mood's two-sample test for scale parameters is a non-parametric + test for the null hypothesis that two samples are drawn from the + same distribution with the same scale parameter. + + Parameters + ---------- + x, y : array_like + Arrays of sample data. + axis: int, optional + The axis along which the samples are tested. `x` and `y` can be of + different length along `axis`. + If `axis` is None, `x` and `y` are flattened and the test is done on + all values in the flattened arrays. + + Returns + ------- + z : scalar or ndarray + The z-score for the hypothesis test. For 1-D inputs a scalar is + returned; + p-value : scalar ndarray + The p-value for the hypothesis test. + + See Also + -------- + fligner : A non-parametric test for the equality of k variances + ansari : A non-parametric test for the equality of 2 variances + bartlett : A parametric test for equality of k variances in normal samples + levene : A parametric test for equality of k variances + + Notes + ----- + The data are assumed to be drawn from probability distributions ``f(x)`` + and ``f(x/s) / s`` respectively, for some probability density function f. + The null hypothesis is that ``s == 1``. + + For multi-dimensional arrays, if the inputs are of shapes + ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the + resulting z and p values will have shape ``(n0, n2, n3)``. Note that + ``n1`` and ``m1`` don't have to be equal, but the other dimensions do. 
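The rank-based construction above can be exercised with a quick sketch, assuming the scipy.stats fligner/levene counterparts of the functions defined here:

    # Sketch: three groups, one with inflated variance.
    import numpy as np
    from scipy import stats

    rng = np.random.RandomState(0)
    g1 = rng.normal(0, 1, 50)
    g2 = rng.normal(0, 1, 60)
    g3 = rng.normal(0, 4, 55)

    Xsq, p_fligner = stats.fligner(g1, g2, g3)     # non-parametric, median-centred by default
    W, p_levene = stats.levene(g1, g2, g3, center='trimmed',
                               proportiontocut=0.05)   # robust to heavy tails
    print(Xsq, p_fligner, W, p_levene)             # both should reject equal variances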
+ + Examples + -------- + >>> from scipy import stats + >>> x2 = np.random.randn(2, 45, 6, 7) + >>> x1 = np.random.randn(2, 30, 6, 7) + >>> z, p = stats.mood(x1, x2, axis=1) + >>> p.shape + (2, 6, 7) + + Find the number of points where the difference in scale is not significant: + + >>> (p > 0.1).sum() + 74 + + Perform the test with different scales: + + >>> x1 = np.random.randn(2, 30) + >>> x2 = np.random.randn(2, 35) * 10.0 + >>> stats.mood(x1, x2, axis=1) + (array([-5.84332354, -5.6840814 ]), array([5.11694980e-09, 1.31517628e-08])) + + """ + x = np.asarray(x, dtype=float) + y = np.asarray(y, dtype=float) + + if axis is None: + x = x.flatten() + y = y.flatten() + axis = 0 + + # Determine shape of the result arrays + res_shape = tuple([x.shape[ax] + for ax in range(len(x.shape)) if ax != axis]) + if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if + ax != axis])): + raise ValueError("Dimensions of x and y on all axes except `axis` " + "should match") + + n = x.shape[axis] + m = y.shape[axis] + N = m + n + if N < 3: + raise ValueError("Not enough observations.") + + xy = np.concatenate((x, y), axis=axis) + if axis != 0: + xy = np.rollaxis(xy, axis) + + xy = xy.reshape(xy.shape[0], -1) + + # Generalized to the n-dimensional case by adding the axis argument, and + # using for loops, since rankdata is not vectorized. For improving + # performance consider vectorizing rankdata function. + all_ranks = np.zeros_like(xy) + for j in range(xy.shape[1]): + all_ranks[:, j] = stats.rankdata(xy[:, j]) + + Ri = all_ranks[:n] + M = sum((Ri - (N + 1.0) / 2) ** 2, axis=0) + # Approx stat. + mnM = n * (N * N - 1.0) / 12 + varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180 + z = (M - mnM) / sqrt(varM) + + # sf for right tail, cdf for left tail. Factor 2 for two-sidedness + z_pos = z > 0 + pval = np.zeros_like(z) + pval[z_pos] = 2 * distributions.norm.sf(z[z_pos]) + pval[~z_pos] = 2 * distributions.norm.cdf(z[~z_pos]) + + if res_shape == (): + # Return scalars, not 0-D arrays + z = z[0] + pval = pval[0] + else: + z.shape = res_shape + pval.shape = res_shape + + return z, pval + + +def wilcoxon(x, y=None, zero_method="wilcox", correction=False): + """ + Calculate the Wilcoxon signed-rank test. + + The Wilcoxon signed-rank test tests the null hypothesis that two + related paired samples come from the same distribution. In particular, + it tests whether the distribution of the differences x - y is symmetric + about zero. It is a non-parametric version of the paired T-test. + + Parameters + ---------- + x : array_like + The first set of measurements. + y : array_like, optional + The second set of measurements. If `y` is not given, then the `x` + array is considered to be the differences between the two sets of + measurements. + zero_method : string, {"pratt", "wilcox", "zsplit"}, optional + "pratt": + Pratt treatment: includes zero-differences in the ranking process + (more conservative) + "wilcox": + Wilcox treatment: discards all zero-differences + "zsplit": + Zero rank split: just like Pratt, but spliting the zero rank + between positive and negative ones + correction : bool, optional + If True, apply continuity correction by adjusting the Wilcoxon rank + statistic by 0.5 towards the mean value when computing the + z-statistic. Default is False. + + Returns + ------- + T : float + The sum of the ranks of the differences above or below zero, whichever + is smaller. + p-value : float + The two-sided p-value for the test. 
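A short sketch of Mood's two-sample scale test along a non-default axis, matching the broadcast behaviour described in the Notes above (an axis-aware mood as defined here is assumed; scipy >= 0.13 behaves the same way):

    # Sketch: Mood's test applied independently along axis 1.
    import numpy as np
    from scipy import stats

    rng = np.random.RandomState(1234)
    x = rng.standard_normal((4, 30))
    y = 5.0 * rng.standard_normal((4, 45))   # same scale ratio in every row

    z, p = stats.mood(x, y, axis=1)
    print(z.shape, p.shape)                  # (4,) and (4,), one test per row
    print(p)                                 # every row should reject equal scale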
+ + Notes + ----- + Because the normal approximation is used for the calculations, the + samples used should be large. A typical rule is to require that + n > 20. + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test + + """ + + if not zero_method in ["wilcox", "pratt", "zsplit"]: + raise ValueError("Zero method should be either 'wilcox' \ + or 'pratt' or 'zsplit'") + + if y is None: + d = x + else: + x, y = map(asarray, (x, y)) + if len(x) != len(y): + raise ValueError('Unequal N in wilcoxon. Aborting.') + d = x - y + + if zero_method == "wilcox": + # Keep all non-zero differences + d = compress(not_equal(d, 0), d, axis=-1) + + count = len(d) + if (count < 10): + warnings.warn( + "Warning: sample size too small for normal approximation.") + r = stats.rankdata(abs(d)) + r_plus = sum((d > 0) * r, axis=0) + r_minus = sum((d < 0) * r, axis=0) + + if zero_method == "zsplit": + r_zero = sum((d == 0) * r, axis=0) + r_plus += r_zero / 2. + r_minus += r_zero / 2. + + T = min(r_plus, r_minus) + mn = count * (count + 1.) * 0.25 + se = count * (count + 1.) * (2. * count + 1.) + + if zero_method == "pratt": + r = r[d != 0] + + _replist, repnum = find_repeats(r) + if repnum.size != 0: + # Correction for repeated elements. + se -= 0.5 * (repnum * (repnum * repnum - 1)).sum() + + se = sqrt(se / 24) + correction = 0.5 * int(bool(correction)) * np.sign(T - mn) + z = (T - mn - correction) / se + prob = 2. * distributions.norm.sf(abs(z)) + return T, prob + + +def _hermnorm(N): + # return the negatively normalized hermite polynomials up to order N-1 + # (inclusive) + # using the recursive relationship + # p_n+1 = p_n(x)' - x*p_n(x) + # and p_0(x) = 1 + plist = [None] * N + plist[0] = poly1d(1) + for n in range(1, N): + plist[n] = plist[n - 1].deriv() - poly1d([1, 0]) * plist[n - 1] + return plist + + +def pdf_fromgamma(g1, g2, g3=0.0, g4=None): + if g4 is None: + g4 = 3 * g2 * g2 + sigsq = 1.0 / g2 + sig = sqrt(sigsq) + mu = g1 * sig ** 3.0 + p12 = _hermnorm(13) + for k in range(13): + p12[k] = p12[k] / sig ** k + + # Add all of the terms to polynomial + totp = p12[0] - (g1 / 6.0 * p12[3]) + \ + (g2 / 24.0 * p12[4] + g1 * g1 / 72.0 * p12[6]) - \ + (g3 / 120.0 * p12[5] + g1 * g2 / 144.0 * p12[7] + g1 ** 3.0 / 1296.0 * p12[9]) + \ + (g4 / 720 * p12[6] + (g2 * g2 / 1152.0 + g1 * g3 / 720) * p12[8] + + g1 * g1 * g2 / 1728.0 * p12[10] + g1 ** 4.0 / 31104.0 * p12[12]) + # Final normalization + totp = totp / sqrt(2 * pi) / sig + + def thefunc(x): + xn = (x - mu) / sig + return totp(xn) * exp(-xn * xn / 2.0) + return thefunc + + +def _circfuncs_common(samples, high, low): + samples = np.asarray(samples) + if samples.size == 0: + return np.nan, np.nan + + ang = (samples - low) * 2 * pi / (high - low) + return samples, ang + + +def circmean(samples, high=2 * pi, low=0, axis=None): + """ + Compute the circular mean for samples in a range. + + Parameters + ---------- + samples : array_like + Input array. + high : float or int, optional + High boundary for circular mean range. Default is ``2*pi``. + low : float or int, optional + Low boundary for circular mean range. Default is 0. + axis : int, optional + Axis along which means are computed. The default is to compute + the mean of the flattened array. + + Returns + ------- + circmean : float + Circular mean. 
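The wrap-around behaviour is the whole point of the circular mean; a minimal sketch (scipy.stats.circmean assumed, with the same high/low convention as above):

    # Sketch: the arithmetic mean of 350 and 10 degrees is 180,
    # the circular mean is 0 (or equivalently 360).
    import numpy as np
    from scipy import stats

    angles_deg = np.array([350.0, 10.0])
    print(np.mean(angles_deg))                          # 180.0, wrong for angles
    print(stats.circmean(angles_deg, high=360, low=0))  # ~0.0 or ~360.0, the sensible answer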
+ + """ + samples, ang = _circfuncs_common(samples, high, low) + res = angle(np.mean(exp(1j * ang), axis=axis)) + mask = res < 0 + if (mask.ndim > 0): + res[mask] += 2 * pi + elif mask: + res = res + 2 * pi + + return res * (high - low) / 2.0 / pi + low + + +def circvar(samples, high=2 * pi, low=0, axis=None): + """ + Compute the circular variance for samples assumed to be in a range + + Parameters + ---------- + samples : array_like + Input array. + low : float or int, optional + Low boundary for circular variance range. Default is 0. + high : float or int, optional + High boundary for circular variance range. Default is ``2*pi``. + axis : int, optional + Axis along which variances are computed. The default is to compute + the variance of the flattened array. + + Returns + ------- + circvar : float + Circular variance. + + Notes + ----- + This uses a definition of circular variance that in the limit of small + angles returns a number close to the 'linear' variance. + + """ + samples, ang = _circfuncs_common(samples, high, low) + res = np.mean(exp(1j * ang), axis=axis) + R = abs(res) + return ((high - low) / 2.0 / pi) ** 2 * 2 * log(1 / R) + + +def circstd(samples, high=2 * pi, low=0, axis=None): + """ + Compute the circular standard deviation for samples assumed to be in the + range [low to high]. + + Parameters + ---------- + samples : array_like + Input array. + low : float or int, optional + Low boundary for circular standard deviation range. Default is 0. + high : float or int, optional + High boundary for circular standard deviation range. + Default is ``2*pi``. + axis : int, optional + Axis along which standard deviations are computed. The default is + to compute the standard deviation of the flattened array. + + Returns + ------- + circstd : float + Circular standard deviation. + + Notes + ----- + This uses a definition of circular standard deviation that in the limit of + small angles returns a number close to the 'linear' standard deviation. + + """ + samples, ang = _circfuncs_common(samples, high, low) + res = np.mean(exp(1j * ang), axis=axis) + R = abs(res) + return ((high - low) / 2.0 / pi) * sqrt(-2 * log(R)) + + +# Tests to include (from R) -- some of these already in stats. +# +# X Ansari-Bradley +# X Bartlett (and Levene) +# X Binomial +# Y Pearson's Chi-squared (stats.chisquare) +# Y Association Between Paired samples (stats.pearsonr, stats.spearmanr) +# stats.kendalltau) -- these need work though +# Fisher's exact test +# X Fligner-Killeen Test +# Y Friedman Rank Sum (stats.friedmanchisquare?) +# Y Kruskal-Wallis +# Y Kolmogorov-Smirnov +# Cochran-Mantel-Haenszel Chi-Squared for Count +# McNemar's Chi-squared for Count +# X Mood Two-Sample +# X Test For Equal Means in One-Way Layout (see stats.ttest also) +# Pairwise Comparisons of proportions +# Pairwise t tests +# Tabulate p values for pairwise comparisons +# Pairwise Wilcoxon rank sum tests +# Power calculations two sample test of prop. +# Power calculations for one and two sample t tests +# Equal or Given Proportions +# Trend in Proportions +# Quade Test +# Y Student's T Test +# Y F Test to compare two variances +# XY Wilcoxon Rank Sum and Signed Rank Tests diff --git a/pywafo/src/wafo/stats/stats.py b/pywafo/src/wafo/stats/stats.py index 8e7f1b6..97291f5 100644 --- a/pywafo/src/wafo/stats/stats.py +++ b/pywafo/src/wafo/stats/stats.py @@ -1,4354 +1,4370 @@ -# Copyright (c) Gary Strangman. All rights reserved -# -# Disclaimer -# -# This software is provided "as-is". 
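The circular variance and circular standard deviation defined above are both simple functions of the mean resultant length R = |mean(exp(1j*theta))|; when the range spans a full circle (high - low = 2*pi) the two are linked by circstd**2 == circvar, as this small sketch checks:

    # Sketch: circvar = 2*ln(1/R) and circstd = sqrt(-2*ln(R)) for the same R.
    import numpy as np

    theta = np.random.RandomState(7).vonmises(mu=0.0, kappa=4.0, size=1000)
    R = np.abs(np.mean(np.exp(1j * theta)))
    circvar = 2.0 * np.log(1.0 / R)
    circstd = np.sqrt(-2.0 * np.log(R))
    print(np.allclose(circstd ** 2, circvar))   # True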
There are no expressed or implied -# warranties of any kind, including, but not limited to, the warranties -# of merchantability and fitness for a given application. In no event -# shall Gary Strangman be liable for any direct, indirect, incidental, -# special, exemplary or consequential damages (including, but not limited -# to, loss of use, data or profits, or business interruption) however -# caused and on any theory of liability, whether in contract, strict -# liability or tort (including negligence or otherwise) arising in any way -# out of the use of this software, even if advised of the possibility of -# such damage. -# - -# -# Heavily adapted for use by SciPy 2002 by Travis Oliphant -""" -A collection of basic statistical functions for python. The function -names appear below. - - Some scalar functions defined here are also available in the scipy.special - package where they work on arbitrary sized arrays. - -Disclaimers: The function list is obviously incomplete and, worse, the -functions are not optimized. All functions have been tested (some more -so than others), but they are far from bulletproof. Thus, as with any -free software, no warranty or guarantee is expressed or implied. :-) A -few extra functions that don't appear in the list below can be found by -interested treasure-hunters. These functions don't necessarily have -both list and array versions but were deemed useful. - -Central Tendency ----------------- -.. autosummary:: - :toctree: generated/ - - gmean - hmean - mode - -Moments -------- -.. autosummary:: - :toctree: generated/ - - moment - variation - skew - kurtosis - normaltest - -Moments Handling NaN: - -.. autosummary:: - :toctree: generated/ - - nanmean - nanmedian - nanstd - -Altered Versions ----------------- -.. autosummary:: - :toctree: generated/ - - tmean - tvar - tstd - tsem - describe - -Frequency Stats ---------------- -.. autosummary:: - :toctree: generated/ - - itemfreq - scoreatpercentile - percentileofscore - histogram - cumfreq - relfreq - -Variability ------------ -.. autosummary:: - :toctree: generated/ - - obrientransform - signaltonoise - sem - -Trimming Functions ------------------- -.. autosummary:: - :toctree: generated/ - - threshold - trimboth - trim1 - -Correlation Functions ---------------------- -.. autosummary:: - :toctree: generated/ - - pearsonr - fisher_exact - spearmanr - pointbiserialr - kendalltau - linregress - -Inferential Stats ------------------ -.. autosummary:: - :toctree: generated/ - - ttest_1samp - ttest_ind - ttest_rel - chisquare - power_divergence - ks_2samp - mannwhitneyu - ranksums - wilcoxon - kruskal - friedmanchisquare - -Probability Calculations ------------------------- -.. autosummary:: - :toctree: generated/ - - chisqprob - zprob - fprob - betai - -ANOVA Functions ---------------- -.. autosummary:: - :toctree: generated/ - - f_oneway - f_value - -Support Functions ------------------ -.. autosummary:: - :toctree: generated/ - - ss - square_of_sums - rankdata - -References ----------- -.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - -""" - -from __future__ import division, print_function, absolute_import - -import warnings -import math - -#from .six import xrange - -# friedmanchisquare patch uses python sum -pysum = sum # save it before it gets overwritten - -# Scipy imports. 
-from scipy.lib.six import callable, string_types -from numpy import array, asarray, ma, zeros, sum -import scipy.special as special -import scipy.linalg as linalg -import numpy as np - -from . import futil -from . import distributions -try: - from ._rank import rankdata, tiecorrect -except: - rankdata=tiecorrect=None -__all__ = ['find_repeats', 'gmean', 'hmean', 'mode', - 'tmean', 'tvar', 'tmin', 'tmax', 'tstd', 'tsem', - 'moment', 'variation', 'skew', 'kurtosis', 'describe', - 'skewtest', 'kurtosistest', 'normaltest', 'jarque_bera', - 'itemfreq', 'scoreatpercentile', 'percentileofscore', - 'histogram', 'histogram2', 'cumfreq', 'relfreq', - 'obrientransform', 'signaltonoise', 'sem', 'zmap', 'zscore', - 'threshold', 'sigmaclip', 'trimboth', 'trim1', 'trim_mean', - 'f_oneway', 'pearsonr', 'fisher_exact', - 'spearmanr', 'pointbiserialr', 'kendalltau', 'linregress', - 'ttest_1samp', 'ttest_ind', 'ttest_rel', 'kstest', - 'chisquare', 'power_divergence', 'ks_2samp', 'mannwhitneyu', - 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare', - 'zprob', 'chisqprob', 'ksprob', 'fprob', 'betai', - 'f_value_wilks_lambda', 'f_value', 'f_value_multivariate', - 'ss', 'square_of_sums', - 'fastsort', 'rankdata', - 'nanmean', 'nanstd', 'nanmedian', - ] - - -def _chk_asarray(a, axis): - if axis is None: - a = np.ravel(a) - outaxis = 0 - else: - a = np.asarray(a) - outaxis = axis - return a, outaxis - - -def _chk2_asarray(a, b, axis): - if axis is None: - a = np.ravel(a) - b = np.ravel(b) - outaxis = 0 - else: - a = np.asarray(a) - b = np.asarray(b) - outaxis = axis - return a, b, outaxis - - -def find_repeats(arr): - """ - Find repeats and repeat counts. - - Parameters - ---------- - arr : array_like - Input array - - Returns - ------- - find_repeats : tuple - Returns a tuple of two 1-D ndarrays. The first ndarray are the repeats - as sorted, unique values that are repeated in `arr`. The second - ndarray are the counts mapped one-to-one of the repeated values - in the first ndarray. - - Examples - -------- - >>> sp.stats.find_repeats([2, 1, 2, 3, 2, 2, 5]) - (array([ 2. ]), array([ 4 ], dtype=int32) - - >>> sp.stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) - (array([ 4., 5.]), array([2, 2], dtype=int32)) - - """ - v1,v2, n = futil.dfreps(arr) - return v1[:n],v2[:n] - -####### -### NAN friendly functions -######## - - -def nanmean(x, axis=0): - """ - Compute the mean over the given axis ignoring nans. - - Parameters - ---------- - x : ndarray - Input array. - axis : int, optional - Axis along which the mean is computed. Default is 0, i.e. the - first axis. - - Returns - ------- - m : float - The mean of `x`, ignoring nans. - - See Also - -------- - nanstd, nanmedian - - Examples - -------- - >>> from scipy import stats - >>> a = np.linspace(0, 4, 3) - >>> a - array([ 0., 2., 4.]) - >>> a[-1] = np.nan - >>> stats.nanmean(a) - 1.0 - - """ - x, axis = _chk_asarray(x, axis) - x = x.copy() - Norig = x.shape[axis] - mask = np.isnan(x) - factor = 1.0 - np.sum(mask, axis) / Norig - - x[mask] = 0.0 - return np.mean(x, axis) / factor - - -def nanstd(x, axis=0, bias=False): - """ - Compute the standard deviation over the given axis, ignoring nans. - - Parameters - ---------- - x : array_like - Input array. - axis : int or None, optional - Axis along which the standard deviation is computed. Default is 0. - If None, compute over the whole array `x`. - bias : bool, optional - If True, the biased (normalized by N) definition is used. If False - (default), the unbiased definition is used. 
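The nanmean above works by zeroing the NaNs and rescaling the ordinary mean; the same idea in a few lines, compared against numpy's nanmean (numpy >= 1.8) as a sanity check:

    # Sketch: mean over axis 0 ignoring NaNs, via the mask-and-rescale trick.
    import numpy as np

    x = np.array([[1.0, 2.0, np.nan],
                  [3.0, np.nan, 5.0]])
    mask = np.isnan(x)
    frac_valid = 1.0 - mask.sum(axis=0) / float(x.shape[0])
    m = np.where(mask, 0.0, x).sum(axis=0) / x.shape[0] / frac_valid
    print(m)                       # [ 2.  2.  5.]
    print(np.nanmean(x, axis=0))   # same result from the numpy built-in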
- - Returns - ------- - s : float - The standard deviation. - - See Also - -------- - nanmean, nanmedian - - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(10, dtype=float) - >>> a[1:3] = np.nan - >>> np.std(a) - nan - >>> stats.nanstd(a) - 2.9154759474226504 - >>> stats.nanstd(a.reshape(2, 5), axis=1) - array([ 2.0817, 1.5811]) - >>> stats.nanstd(a.reshape(2, 5), axis=None) - 2.9154759474226504 - - """ - x, axis = _chk_asarray(x, axis) - x = x.copy() - Norig = x.shape[axis] - - mask = np.isnan(x) - Nnan = np.sum(mask, axis) * 1.0 - n = Norig - Nnan - - x[mask] = 0.0 - m1 = np.sum(x, axis) / n - - if axis: - d = x - np.expand_dims(m1, axis) - else: - d = x - m1 - - d *= d - - m2 = np.sum(d, axis) - m1 * m1 * Nnan - - if bias: - m2c = m2 / n - else: - m2c = m2 / (n - 1.0) - - return np.sqrt(m2c) - - -def _nanmedian(arr1d): # This only works on 1d arrays - """Private function for rank a arrays. Compute the median ignoring Nan. - - Parameters - ---------- - arr1d : ndarray - Input array, of rank 1. - - Results - ------- - m : float - The median. - """ - cond = 1-np.isnan(arr1d) - x = np.sort(np.compress(cond,arr1d,axis=-1)) - if x.size == 0: - return np.nan - return np.median(x) - - -def nanmedian(x, axis=0): - """ - Compute the median along the given axis ignoring nan values. - - Parameters - ---------- - x : array_like - Input array. - axis : int, optional - Axis along which the median is computed. Default is 0, i.e. the - first axis. - - Returns - ------- - m : float - The median of `x` along `axis`. - - See Also - -------- - nanstd, nanmean - - Examples - -------- - >>> from scipy import stats - >>> a = np.array([0, 3, 1, 5, 5, np.nan]) - >>> stats.nanmedian(a) - array(3.0) - - >>> b = np.array([0, 3, 1, 5, 5, np.nan, 5]) - >>> stats.nanmedian(b) - array(4.0) - - Example with axis: - - >>> c = np.arange(30.).reshape(5,6) - >>> idx = np.array([False, False, False, True, False] * 6).reshape(5,6) - >>> c[idx] = np.nan - >>> c - array([[ 0., 1., 2., nan, 4., 5.], - [ 6., 7., nan, 9., 10., 11.], - [ 12., nan, 14., 15., 16., 17.], - [ nan, 19., 20., 21., 22., nan], - [ 24., 25., 26., 27., nan, 29.]]) - >>> stats.nanmedian(c, axis=1) - array([ 2. , 9. , 15. , 20.5, 26. ]) - - """ - x, axis = _chk_asarray(x, axis) - if x.ndim == 0: - return float(x.item()) - x = x.copy() - x = np.apply_along_axis(_nanmedian, axis, x) - if x.ndim == 0: - x = float(x.item()) - return x - - -##################################### -######## CENTRAL TENDENCY ######## -##################################### - - -def gmean(a, axis=0, dtype=None): - """ - Compute the geometric mean along the specified axis. - - Returns the geometric average of the array elements. - That is: n-th root of (x1 * x2 * ... * xn) - - Parameters - ---------- - a : array_like - Input array or object that can be converted to an array. - axis : int, optional, default axis=0 - Axis along which the geometric mean is computed. - dtype : dtype, optional - Type of the returned array and of the accumulator in which the - elements are summed. If dtype is not specified, it defaults to the - dtype of a, unless a has an integer dtype with a precision less than - that of the default platform integer. In that case, the default - platform integer is used. 
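The geometric mean described above is just the exponential of the mean of the logs; a one-line sketch against the closed form:

    # Sketch: geometric mean of [1, 2, 4, 8] is the 4th root of 64 = 2*sqrt(2).
    import numpy as np

    a = np.array([1.0, 2.0, 4.0, 8.0])
    gm = np.exp(np.log(a).mean())
    print(gm, 64.0 ** 0.25)        # both ~2.8284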
- - Returns - ------- - gmean : ndarray - see dtype parameter above - - See Also - -------- - numpy.mean : Arithmetic average - numpy.average : Weighted average - hmean : Harmonic mean - - Notes - ----- - The geometric average is computed over a single dimension of the input - array, axis=0 by default, or all values in the array if axis=None. - float64 intermediate and return values are used for integer inputs. - - Use masked arrays to ignore any non-finite values in the input or that - arise in the calculations such as Not a Number and infinity because masked - arrays automatically mask any non-finite values. - - """ - if not isinstance(a, np.ndarray): # if not an ndarray object attempt to convert it - log_a = np.log(np.array(a, dtype=dtype)) - elif dtype: # Must change the default dtype allowing array type - if isinstance(a,np.ma.MaskedArray): - log_a = np.log(np.ma.asarray(a, dtype=dtype)) - else: - log_a = np.log(np.asarray(a, dtype=dtype)) - else: - log_a = np.log(a) - return np.exp(log_a.mean(axis=axis)) - - -def hmean(a, axis=0, dtype=None): - """ - Calculates the harmonic mean along the specified axis. - - That is: n / (1/x1 + 1/x2 + ... + 1/xn) - - Parameters - ---------- - a : array_like - Input array, masked array or object that can be converted to an array. - axis : int, optional, default axis=0 - Axis along which the harmonic mean is computed. - dtype : dtype, optional - Type of the returned array and of the accumulator in which the - elements are summed. If `dtype` is not specified, it defaults to the - dtype of `a`, unless `a` has an integer `dtype` with a precision less - than that of the default platform integer. In that case, the default - platform integer is used. - - Returns - ------- - hmean : ndarray - see `dtype` parameter above - - See Also - -------- - numpy.mean : Arithmetic average - numpy.average : Weighted average - gmean : Geometric mean - - Notes - ----- - The harmonic mean is computed over a single dimension of the input - array, axis=0 by default, or all values in the array if axis=None. - float64 intermediate and return values are used for integer inputs. - - Use masked arrays to ignore any non-finite values in the input or that - arise in the calculations such as Not a Number and infinity. - - """ - if not isinstance(a, np.ndarray): - a = np.array(a, dtype=dtype) - if np.all(a > 0): # Harmonic mean only defined if greater than zero - if isinstance(a, np.ma.MaskedArray): - size = a.count(axis) - else: - if axis is None: - a = a.ravel() - size = a.shape[0] - else: - size = a.shape[axis] - return size / np.sum(1.0/a, axis=axis, dtype=dtype) - else: - raise ValueError("Harmonic mean only defined if all elements greater than zero") - - -def mode(a, axis=0): - """ - Returns an array of the modal (most common) value in the passed array. - - If there is more than one such value, only the first is returned. - The bin-count for the modal bins is also returned. - - Parameters - ---------- - a : array_like - n-dimensional array of which to find mode(s). - axis : int, optional - Axis along which to operate. Default is 0, i.e. the first axis. - - Returns - ------- - vals : ndarray - Array of modal values. - counts : ndarray - Array of counts for each mode. 
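Similarly, the harmonic mean above is the sample size divided by the sum of reciprocals; a minimal sketch (all elements must be strictly positive, as the code checks):

    # Sketch: harmonic mean of [1, 2, 4] is 3 / (1 + 1/2 + 1/4) = 12/7.
    import numpy as np

    a = np.array([1.0, 2.0, 4.0])
    hm = a.size / np.sum(1.0 / a)
    print(hm, 12.0 / 7.0)          # both ~1.7142857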
- - Examples - -------- - >>> a = np.array([[6, 8, 3, 0], - [3, 2, 1, 7], - [8, 1, 8, 4], - [5, 3, 0, 5], - [4, 7, 5, 9]]) - >>> from scipy import stats - >>> stats.mode(a) - (array([[ 3., 1., 0., 0.]]), array([[ 1., 1., 1., 1.]])) - - To get mode of whole array, specify axis=None: - - >>> stats.mode(a, axis=None) - (array([ 3.]), array([ 3.])) - - """ - a, axis = _chk_asarray(a, axis) - scores = np.unique(np.ravel(a)) # get ALL unique values - testshape = list(a.shape) - testshape[axis] = 1 - oldmostfreq = np.zeros(testshape) - oldcounts = np.zeros(testshape) - for score in scores: - template = (a == score) - counts = np.expand_dims(np.sum(template, axis),axis) - mostfrequent = np.where(counts > oldcounts, score, oldmostfreq) - oldcounts = np.maximum(counts, oldcounts) - oldmostfreq = mostfrequent - return mostfrequent, oldcounts - - -def mask_to_limits(a, limits, inclusive): - """Mask an array for values outside of given limits. - - This is primarily a utility function. - - Parameters - ---------- - a : array - limits : (float or None, float or None) - A tuple consisting of the (lower limit, upper limit). Values in the - input array less than the lower limit or greater than the upper limit - will be masked out. None implies no limit. - inclusive : (bool, bool) - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to lower or upper are allowed. - - Returns - ------- - A MaskedArray. - - Raises - ------ - A ValueError if there are no values within the given limits. - """ - lower_limit, upper_limit = limits - lower_include, upper_include = inclusive - am = ma.MaskedArray(a) - if lower_limit is not None: - if lower_include: - am = ma.masked_less(am, lower_limit) - else: - am = ma.masked_less_equal(am, lower_limit) - - if upper_limit is not None: - if upper_include: - am = ma.masked_greater(am, upper_limit) - else: - am = ma.masked_greater_equal(am, upper_limit) - - if am.count() == 0: - raise ValueError("No array values within given limits") - - return am - - -def tmean(a, limits=None, inclusive=(True, True)): - """ - Compute the trimmed mean. - - This function finds the arithmetic mean of given values, ignoring values - outside the given `limits`. - - Parameters - ---------- - a : array_like - Array of values. - limits : None or (lower limit, upper limit), optional - Values in the input array less than the lower limit or greater than the - upper limit will be ignored. When limits is None (default), then all - values are used. Either of the limit values in the tuple can also be - None representing a half-open interval. - inclusive : (bool, bool), optional - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to the lower or upper limits - are included. The default value is (True, True). - - Returns - ------- - tmean : float - - """ - a = asarray(a) - if limits is None: - return np.mean(a, None) - - am = mask_to_limits(a.ravel(), limits, inclusive) - return am.mean() - - -def masked_var(am): - m = am.mean() - s = ma.add.reduce((am - m)**2) - n = am.count() - 1.0 - return s / n - - -def tvar(a, limits=None, inclusive=(True, True)): - """ - Compute the trimmed variance - - This function computes the sample variance of an array of values, - while ignoring values which are outside of given `limits`. - - Parameters - ---------- - a : array_like - Array of values. 
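The trimmed statistics above simply mask values outside the (lower, upper) limits before averaging; a short sketch of that masking, assuming scipy.stats.tmean as the reference:

    # Sketch: trimmed mean of 0..9 keeping only values in the closed interval [2, 7].
    import numpy as np
    from scipy import stats

    a = np.arange(10.0)
    print(stats.tmean(a, limits=(2, 7)))     # 4.5, the mean of 2..7
    print(a[(a >= 2) & (a <= 7)].mean())     # same thing by hand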
- limits : None or (lower limit, upper limit), optional - Values in the input array less than the lower limit or greater than the - upper limit will be ignored. When limits is None, then all values are - used. Either of the limit values in the tuple can also be None - representing a half-open interval. The default value is None. - inclusive : (bool, bool), optional - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to the lower or upper limits - are included. The default value is (True, True). - - Returns - ------- - tvar : float - Trimmed variance. - - Notes - ----- - `tvar` computes the unbiased sample variance, i.e. it uses a correction - factor ``n / (n - 1)``. - - """ - a = asarray(a) - a = a.astype(float).ravel() - if limits is None: - n = len(a) - return a.var()*(n/(n-1.)) - am = mask_to_limits(a, limits, inclusive) - return masked_var(am) - - -def tmin(a, lowerlimit=None, axis=0, inclusive=True): - """ - Compute the trimmed minimum - - This function finds the miminum value of an array `a` along the - specified axis, but only considering values greater than a specified - lower limit. - - Parameters - ---------- - a : array_like - array of values - lowerlimit : None or float, optional - Values in the input array less than the given limit will be ignored. - When lowerlimit is None, then all values are used. The default value - is None. - axis : None or int, optional - Operate along this axis. None means to use the flattened array and - the default is zero - inclusive : {True, False}, optional - This flag determines whether values exactly equal to the lower limit - are included. The default value is True. - - Returns - ------- - tmin : float - - """ - a, axis = _chk_asarray(a, axis) - am = mask_to_limits(a, (lowerlimit, None), (inclusive, False)) - return ma.minimum.reduce(am, axis) - - -def tmax(a, upperlimit=None, axis=0, inclusive=True): - """ - Compute the trimmed maximum - - This function computes the maximum value of an array along a given axis, - while ignoring values larger than a specified upper limit. - - Parameters - ---------- - a : array_like - array of values - upperlimit : None or float, optional - Values in the input array greater than the given limit will be ignored. - When upperlimit is None, then all values are used. The default value - is None. - axis : None or int, optional - Operate along this axis. None means to use the flattened array and - the default is zero. - inclusive : {True, False}, optional - This flag determines whether values exactly equal to the upper limit - are included. The default value is True. - - Returns - ------- - tmax : float - - """ - a, axis = _chk_asarray(a, axis) - am = mask_to_limits(a, (None, upperlimit), (False, inclusive)) - return ma.maximum.reduce(am, axis) - - -def tstd(a, limits=None, inclusive=(True, True)): - """ - Compute the trimmed sample standard deviation - - This function finds the sample standard deviation of given values, - ignoring values outside the given `limits`. - - Parameters - ---------- - a : array_like - array of values - limits : None or (lower limit, upper limit), optional - Values in the input array less than the lower limit or greater than the - upper limit will be ignored. When limits is None, then all values are - used. Either of the limit values in the tuple can also be None - representing a half-open interval. The default value is None. - inclusive : (bool, bool), optional - A tuple consisting of the (lower flag, upper flag). 
These flags - determine whether values exactly equal to the lower or upper limits - are included. The default value is (True, True). - - Returns - ------- - tstd : float - - Notes - ----- - `tstd` computes the unbiased sample standard deviation, i.e. it uses a - correction factor ``n / (n - 1)``. - - """ - return np.sqrt(tvar(a, limits, inclusive)) - - -def tsem(a, limits=None, inclusive=(True, True)): - """ - Compute the trimmed standard error of the mean. - - This function finds the standard error of the mean for given - values, ignoring values outside the given `limits`. - - Parameters - ---------- - a : array_like - array of values - limits : None or (lower limit, upper limit), optional - Values in the input array less than the lower limit or greater than the - upper limit will be ignored. When limits is None, then all values are - used. Either of the limit values in the tuple can also be None - representing a half-open interval. The default value is None. - inclusive : (bool, bool), optional - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to the lower or upper limits - are included. The default value is (True, True). - - Returns - ------- - tsem : float - - Notes - ----- - `tsem` uses unbiased sample standard deviation, i.e. it uses a - correction factor ``n / (n - 1)``. - - """ - a = np.asarray(a).ravel() - if limits is None: - return a.std(ddof=1) / np.sqrt(a.size) - - am = mask_to_limits(a, limits, inclusive) - sd = np.sqrt(masked_var(am)) - return sd / np.sqrt(am.count()) - - -##################################### -############ MOMENTS ############# -##################################### - -def moment(a, moment=1, axis=0): - """ - Calculates the nth moment about the mean for a sample. - - Generally used to calculate coefficients of skewness and - kurtosis. - - Parameters - ---------- - a : array_like - data - moment : int - order of central moment that is returned - axis : int or None - Axis along which the central moment is computed. If None, then the data - array is raveled. The default axis is zero. - - Returns - ------- - n-th central moment : ndarray or float - The appropriate moment along the given axis or over all values if axis - is None. The denominator for the moment calculation is the number of - observations, no degrees of freedom correction is done. - - """ - a, axis = _chk_asarray(a, axis) - if moment == 1: - # By definition the first moment about the mean is 0. - shape = list(a.shape) - del shape[axis] - if shape: - # return an actual array of the appropriate shape - return np.zeros(shape, dtype=float) - else: - # the input was 1D, so return a scalar instead of a rank-0 array - return np.float64(0.0) - else: - mn = np.expand_dims(np.mean(a,axis), axis) - s = np.power((a-mn), moment) - return np.mean(s, axis) - - -def variation(a, axis=0): - """ - Computes the coefficient of variation, the ratio of the biased standard - deviation to the mean. - - Parameters - ---------- - a : array_like - Input array. - axis : int or None - Axis along which to calculate the coefficient of variation. - - References - ---------- - .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - - """ - a, axis = _chk_asarray(a, axis) - return a.std(axis)/a.mean(axis) - - -def skew(a, axis=0, bias=True): - """ - Computes the skewness of a data set. - - For normally distributed data, the skewness should be about 0. 
A skewness - value > 0 means that there is more weight in the left tail of the - distribution. The function `skewtest` can be used to determine if the - skewness value is close enough to 0, statistically speaking. - - Parameters - ---------- - a : ndarray - data - axis : int or None - axis along which skewness is calculated - bias : bool - If False, then the calculations are corrected for statistical bias. - - Returns - ------- - skewness : ndarray - The skewness of values along an axis, returning 0 where all values are - equal. - - References - ---------- - [CRCProbStat2000]_ Section 2.2.24.1 - - .. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - - """ - a, axis = _chk_asarray(a,axis) - n = a.shape[axis] - m2 = moment(a, 2, axis) - m3 = moment(a, 3, axis) - zero = (m2 == 0) - vals = np.where(zero, 0, m3 / m2**1.5) - if not bias: - can_correct = (n > 2) & (m2 > 0) - if can_correct.any(): - m2 = np.extract(can_correct, m2) - m3 = np.extract(can_correct, m3) - nval = np.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5 - np.place(vals, can_correct, nval) - if vals.ndim == 0: - return vals.item() - return vals - - -def kurtosis(a, axis=0, fisher=True, bias=True): - """ - Computes the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher's definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators - - Use `kurtosistest` to see if result is close enough to normal. - - Parameters - ---------- - a : array - data for which the kurtosis is calculated - axis : int or None - Axis along which the kurtosis is calculated - fisher : bool - If True, Fisher's definition is used (normal ==> 0.0). If False, - Pearson's definition is used (normal ==> 3.0). - bias : bool - If False, then the calculations are corrected for statistical bias. - - Returns - ------- - kurtosis : array - The kurtosis of values along an axis. If all values are equal, - return -3 for Fisher's definition and 0 for Pearson's definition. - - References - ---------- - .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - - """ - a, axis = _chk_asarray(a, axis) - n = a.shape[axis] - m2 = moment(a,2,axis) - m4 = moment(a,4,axis) - zero = (m2 == 0) - olderr = np.seterr(all='ignore') - try: - vals = np.where(zero, 0, m4 / m2**2.0) - finally: - np.seterr(**olderr) - - if not bias: - can_correct = (n > 3) & (m2 > 0) - if can_correct.any(): - m2 = np.extract(can_correct, m2) - m4 = np.extract(can_correct, m4) - nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0) - np.place(vals, can_correct, nval+3.0) - - if vals.ndim == 0: - vals = vals.item() # array scalar - - if fisher: - return vals - 3 - else: - return vals - - -def describe(a, axis=0): - """ - Computes several descriptive statistics of the passed array. - - Parameters - ---------- - a : array_like - data - axis : int or None - axis along which statistics are calculated. If axis is None, then data - array is raveled. The default axis is zero. 
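The biased skewness and (Fisher) kurtosis reported here come straight from the central moments, as in the code above; a compact sketch of those formulas on a deliberately skewed sample:

    # Sketch: biased skewness m3/m2**1.5 and Fisher kurtosis m4/m2**2 - 3.
    import numpy as np

    x = np.random.RandomState(42).exponential(size=2000)   # right-skewed data
    d = x - x.mean()
    m2, m3, m4 = (d ** 2).mean(), (d ** 3).mean(), (d ** 4).mean()
    skewness = m3 / m2 ** 1.5      # roughly 2 for an exponential sample
    kurtosis = m4 / m2 ** 2 - 3.0  # roughly 6 for an exponential sample
    print(skewness, kurtosis)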
- - Returns - ------- - size of the data : int - length of data along axis - (min, max): tuple of ndarrays or floats - minimum and maximum value of data array - arithmetic mean : ndarray or float - mean of data along axis - unbiased variance : ndarray or float - variance of the data along axis, denominator is number of observations - minus one. - biased skewness : ndarray or float - skewness, based on moment calculations with denominator equal to the - number of observations, i.e. no degrees of freedom correction - biased kurtosis : ndarray or float - kurtosis (Fisher), the kurtosis is normalized so that it is zero for the - normal distribution. No degrees of freedom or bias correction is used. - - See Also - -------- - skew - kurtosis - - """ - a, axis = _chk_asarray(a, axis) - n = a.shape[axis] - mm = (np.min(a, axis=axis), np.max(a, axis=axis)) - m = np.mean(a, axis=axis) - v = np.var(a, axis=axis, ddof=1) - sk = skew(a, axis) - kurt = kurtosis(a, axis) - return n, mm, m, v, sk, kurt - -##################################### -######## NORMALITY TESTS ########## -##################################### - - -def skewtest(a, axis=0): - """ - Tests whether the skew is different from the normal distribution. - - This function tests the null hypothesis that the skewness of - the population that the sample was drawn from is the same - as that of a corresponding normal distribution. - - Parameters - ---------- - a : array - axis : int or None - - Returns - ------- - z-score : float - The computed z-score for this test. - p-value : float - a 2-sided p-value for the hypothesis test - - Notes - ----- - The sample size must be at least 8. - - """ - a, axis = _chk_asarray(a, axis) - if axis is None: - a = np.ravel(a) - axis = 0 - b2 = skew(a, axis) - n = float(a.shape[axis]) - if n < 8: - raise ValueError( - "skewtest is not valid with less than 8 samples; %i samples" - " were given." % int(n)) - y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2))) - beta2 = (3.0 * (n * n + 27 * n - 70) * (n + 1) * (n + 3) / - ((n - 2.0) * (n + 5) * (n + 7) * (n + 9))) - W2 = -1 + math.sqrt(2 * (beta2 - 1)) - delta = 1 / math.sqrt(0.5 * math.log(W2)) - alpha = math.sqrt(2.0 / (W2 - 1)) - y = np.where(y == 0, 1, y) - Z = delta * np.log(y / alpha + np.sqrt((y / alpha) ** 2 + 1)) - return Z, 2 * distributions.norm.sf(np.abs(Z)) - - -def kurtosistest(a, axis=0): - """ - Tests whether a dataset has normal kurtosis - - This function tests the null hypothesis that the kurtosis - of the population from which the sample was drawn is that - of the normal distribution: ``kurtosis = 3(n-1)/(n+1)``. - - Parameters - ---------- - a : array - array of the sample data - axis : int or None - the axis to operate along, or None to work on the whole array. - The default is the first axis. - - Returns - ------- - z-score : float - The computed z-score for this test. - p-value : float - The 2-sided p-value for the hypothesis test - - Notes - ----- - Valid only for n>20. The Z-score is set to 0 for bad entries. - - """ - a, axis = _chk_asarray(a, axis) - n = float(a.shape[axis]) - if n < 5: - raise ValueError( - "kurtosistest requires at least 5 observations; %i observations" - " were given." % int(n)) - if n < 20: - warnings.warn( - "kurtosistest only valid for n>=20 ... 
continuing anyway, n=%i" % - int(n)) - b2 = kurtosis(a, axis, fisher=False) - E = 3.0*(n-1) / (n+1) - varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5)) - x = (b2-E)/np.sqrt(varb2) - sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / - (n*(n-2)*(n-3))) - A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) - term1 = 1 - 2/(9.0*A) - denom = 1 + x*np.sqrt(2/(A-4.0)) - denom = np.where(denom < 0, 99, denom) - term2 = np.where(denom < 0, term1, np.power((1-2.0/A)/denom,1/3.0)) - Z = (term1 - term2) / np.sqrt(2/(9.0*A)) - Z = np.where(denom == 99, 0, Z) - if Z.ndim == 0: - Z = Z[()] - # JPNote: p-value sometimes larger than 1 - # zprob uses upper tail, so Z needs to be positive - return Z, 2 * distributions.norm.sf(np.abs(Z)) - - -def normaltest(a, axis=0): - """ - Tests whether a sample differs from a normal distribution. - - This function tests the null hypothesis that a sample comes - from a normal distribution. It is based on D'Agostino and - Pearson's [1]_, [2]_ test that combines skew and kurtosis to - produce an omnibus test of normality. - - - Parameters - ---------- - a : array_like - The array containing the data to be tested. - axis : int or None - If None, the array is treated as a single data set, regardless of - its shape. Otherwise, each 1-d array along axis `axis` is tested. - - Returns - ------- - k2 : float or array - `s^2 + k^2`, where `s` is the z-score returned by `skewtest` and - `k` is the z-score returned by `kurtosistest`. - p-value : float or array - A 2-sided chi squared probability for the hypothesis test. - - References - ---------- - .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for - moderate and large sample size," Biometrika, 58, 341-348 - - .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Testing for - departures from normality," Biometrika, 60, 613-622 - - """ - a, axis = _chk_asarray(a, axis) - s,p = skewtest(a,axis) - k,p = kurtosistest(a,axis) - k2 = s*s + k*k - return k2, chisqprob(k2,2) - - -def jarque_bera(x): - """ - Perform the Jarque-Bera goodness of fit test on sample data. - - The Jarque-Bera test tests whether the sample data has the skewness and - kurtosis matching a normal distribution. - - Note that this test only works for a large enough number of data samples - (>2000) as the test statistic asymptotically has a Chi-squared distribution - with 2 degrees of freedom. - - Parameters - ---------- - x : array_like - Observations of a random variable. - - Returns - ------- - jb_value : float - The test statistic. - p : float - The p-value for the hypothesis test. - - References - ---------- - .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality, - homoscedasticity and serial independence of regression residuals", - 6 Econometric Letters 255-259. - - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(987654321) - >>> x = np.random.normal(0, 1, 100000) - >>> y = np.random.rayleigh(1, 100000) - >>> stats.jarque_bera(x) - (4.7165707989581342, 0.09458225503041906) - >>> stats.jarque_bera(y) - (6713.7098548143422, 0.0) - - """ - x = np.asarray(x) - n = float(x.size) - if n == 0: - raise ValueError('At least one observation is required.') - - mu = x.mean() - diffx = x - mu - skewness = (1 / n * np.sum(diffx**3)) / (1 / n * np.sum(diffx**2))**(3 / 2.) 
- kurtosis = (1 / n * np.sum(diffx**4)) / (1 / n * np.sum(diffx**2))**2 - jb_value = n / 6 * (skewness**2 + (kurtosis - 3)**2 / 4) - p = 1 - distributions.chi2.cdf(jb_value, 2) - - return jb_value, p - - -##################################### -###### FREQUENCY FUNCTIONS ####### -##################################### - -def itemfreq(a): - """ - Returns a 2-D array of item frequencies. - - Parameters - ---------- - a : (N,) array_like - Input array. - - Returns - ------- - itemfreq : (K, 2) ndarray - A 2-D frequency table. Column 1 contains sorted, unique values from - `a`, column 2 contains their respective counts. - - Examples - -------- - >>> a = np.array([1, 1, 5, 0, 1, 2, 2, 0, 1, 4]) - >>> stats.itemfreq(a) - array([[ 0., 2.], - [ 1., 4.], - [ 2., 2.], - [ 4., 1.], - [ 5., 1.]]) - >>> np.bincount(a) - array([2, 4, 2, 0, 1, 1]) - - >>> stats.itemfreq(a/10.) - array([[ 0. , 2. ], - [ 0.1, 4. ], - [ 0.2, 2. ], - [ 0.4, 1. ], - [ 0.5, 1. ]]) - - """ - items, inv = np.unique(a, return_inverse=True) - freq = np.bincount(inv) - return np.array([items, freq]).T - - -def scoreatpercentile(a, per, limit=(), interpolation_method='fraction', - axis=None): - """ - Calculate the score at a given percentile of the input sequence. - - For example, the score at `per=50` is the median. If the desired quantile - lies between two data points, we interpolate between them, according to - the value of `interpolation`. If the parameter `limit` is provided, it - should be a tuple (lower, upper) of two values. - - Parameters - ---------- - a : array_like - A 1-D array of values from which to extract score. - per : array_like - Percentile(s) at which to extract score. Values should be in range - [0,100]. - limit : tuple, optional - Tuple of two scalars, the lower and upper limits within which to - compute the percentile. Values of `a` outside - this (closed) interval will be ignored. - interpolation : {'fraction', 'lower', 'higher'}, optional - This optional parameter specifies the interpolation method to use, - when the desired quantile lies between two data points `i` and `j` - - - fraction: ``i + (j - i) * fraction`` where ``fraction`` is the - fractional part of the index surrounded by ``i`` and ``j``. - - lower: ``i``. - - higher: ``j``. - - axis : int, optional - Axis along which the percentiles are computed. The default (None) - is to compute the median along a flattened version of the array. - - Returns - ------- - score : float (or sequence of floats) - Score at percentile. - - See Also - -------- - percentileofscore - - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(100) - >>> stats.scoreatpercentile(a, 50) - 49.5 - - """ - # adapted from NumPy's percentile function - a = np.asarray(a) - - if limit: - a = a[(limit[0] <= a) & (a <= limit[1])] - - if per == 0: - return a.min(axis=axis) - elif per == 100: - return a.max(axis=axis) - - sorted = np.sort(a, axis=axis) - if axis is None: - axis = 0 - - return _compute_qth_percentile(sorted, per, interpolation_method, axis) - - -# handle sequence of per's without calling sort multiple times -def _compute_qth_percentile(sorted, per, interpolation_method, axis): - if not np.isscalar(per): - return [_compute_qth_percentile(sorted, i, interpolation_method, axis) - for i in per] - - if (per < 0) or (per > 100): - raise ValueError("percentile must be in the range [0, 100]") - - indexer = [slice(None)] * sorted.ndim - idx = per / 100. 
* (sorted.shape[axis] - 1) - - if int(idx) != idx: - # round fractional indices according to interpolation method - if interpolation_method == 'lower': - idx = int(np.floor(idx)) - elif interpolation_method == 'higher': - idx = int(np.ceil(idx)) - elif interpolation_method == 'fraction': - pass # keep idx as fraction and interpolate - else: - raise ValueError("interpolation_method can only be 'fraction', " - "'lower' or 'higher'") - - i = int(idx) - if i == idx: - indexer[axis] = slice(i, i + 1) - weights = array(1) - sumval = 1.0 - else: - indexer[axis] = slice(i, i + 2) - j = i + 1 - weights = array([(j - idx), (idx - i)], float) - wshape = [1] * sorted.ndim - wshape[axis] = 2 - weights.shape = wshape - sumval = weights.sum() - - # Use np.add.reduce to coerce data type - return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval - - -def percentileofscore(a, score, kind='rank'): - """ - The percentile rank of a score relative to a list of scores. - - A `percentileofscore` of, for example, 80% means that 80% of the - scores in `a` are below the given score. In the case of gaps or - ties, the exact definition depends on the optional keyword, `kind`. - - Parameters - ---------- - a : array_like - Array of scores to which `score` is compared. - score : int or float - Score that is compared to the elements in `a`. - kind : {'rank', 'weak', 'strict', 'mean'}, optional - This optional parameter specifies the interpretation of the - resulting score: - - - "rank": Average percentage ranking of score. In case of - multiple matches, average the percentage rankings of - all matching scores. - - "weak": This kind corresponds to the definition of a cumulative - distribution function. A percentileofscore of 80% - means that 80% of values are less than or equal - to the provided score. - - "strict": Similar to "weak", except that only values that are - strictly less than the given score are counted. - - "mean": The average of the "weak" and "strict" scores, often used in - testing. See - - http://en.wikipedia.org/wiki/Percentile_rank - - Returns - ------- - pcos : float - Percentile-position of score (0-100) relative to `a`. - - Examples - -------- - Three-quarters of the given values lie below a given score: - - >>> percentileofscore([1, 2, 3, 4], 3) - 75.0 - - With multiple matches, note how the scores of the two matches, 0.6 - and 0.8 respectively, are averaged: - - >>> percentileofscore([1, 2, 3, 3, 4], 3) - 70.0 - - Only 2/5 values are strictly less than 3: - - >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict') - 40.0 - - But 4/5 values are less than or equal to 3: - - >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak') - 80.0 - - The average between the weak and the strict scores is - - >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean') - 60.0 - - """ - a = np.array(a) - n = len(a) - - if kind == 'rank': - if not(np.any(a == score)): - a = np.append(a, score) - a_len = np.array(list(range(len(a)))) - else: - a_len = np.array(list(range(len(a)))) + 1.0 - - a = np.sort(a) - idx = [a == score] - pct = (np.mean(a_len[idx]) / n) * 100.0 - return pct - - elif kind == 'strict': - return sum(a < score) / float(n) * 100 - elif kind == 'weak': - return sum(a <= score) / float(n) * 100 - elif kind == 'mean': - return (sum(a < score) + sum(a <= score)) * 50 / float(n) - else: - raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'") - - -def histogram2(a, bins): - """ - Compute histogram using divisions in bins. 
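The 'strict', 'weak' and 'mean' definitions of percentileofscore described above differ only in how ties are counted; a quick sketch reproducing the documented numbers by hand:

    # Sketch: counting definitions of the percentile rank for a tied score.
    import numpy as np

    a = np.array([1, 2, 3, 3, 4])
    score = 3
    strict = np.sum(a < score) / float(a.size) * 100    # 40.0
    weak = np.sum(a <= score) / float(a.size) * 100     # 80.0
    mean = 0.5 * (strict + weak)                        # 60.0
    print(strict, weak, mean)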
- - Count the number of times values from array `a` fall into - numerical ranges defined by `bins`. Range x is given by - bins[x] <= range_x < bins[x+1] where x = 0,N and N is the - length of the `bins` array. The last range is given by - bins[N] <= range_N < infinity. Values less than bins[0] are - not included in the histogram. - - Parameters - ---------- - a : array_like of rank 1 - The array of values to be assigned into bins - bins : array_like of rank 1 - Defines the ranges of values to use during histogramming. - - Returns - ------- - histogram2 : ndarray of rank 1 - Each value represents the occurrences for a given bin (range) of - values. - - """ - # comment: probably obsoleted by numpy.histogram() - n = np.searchsorted(np.sort(a), bins) - n = np.concatenate([n, [len(a)]]) - return n[1:]-n[:-1] - - - def histogram(a, numbins=10, defaultlimits=None, weights=None, printextras=False): - """ - Separates the range into several bins and returns the number of instances - in each bin. - - Parameters - ---------- - a : array_like - Array of scores which will be put into bins. - numbins : int, optional - The number of bins to use for the histogram. Default is 10. - defaultlimits : tuple (lower, upper), optional - The lower and upper values for the range of the histogram. - If no value is given, a range slightly larger than the range of the - values in a is used. Specifically ``(a.min() - s, a.max() + s)``, - where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. - weights : array_like, optional - The weights for each value in `a`. Default is None, which gives each - value a weight of 1.0 - printextras : bool, optional - If True, if there are extra points (i.e. the points that fall outside - the bin limits) a warning is raised saying how many of those points - there are. Default is False. - - Returns - ------- - histogram : ndarray - Number of points (or sum of weights) in each bin. - low_range : float - Lowest value of histogram, the lower limit of the first bin. - binsize : float - The size of the bins (all bins have the same size). - extrapoints : int - The number of points outside the range of the histogram. - - See Also - -------- - numpy.histogram - - Notes - ----- - This histogram is based on numpy's histogram but has a larger range by - default if `defaultlimits` is not set. - - """ - a = np.ravel(a) - if defaultlimits is None: - # no range given, so use values in `a` - data_min = a.min() - data_max = a.max() - # Have bins extend past min and max values slightly - s = (data_max - data_min) / (2. * (numbins - 1.)) - defaultlimits = (data_min - s, data_max + s) - # use numpy's histogram method to compute bins - hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits, - weights=weights) - # hist are not always floats, convert to keep with old output - hist = np.array(hist, dtype=float) - # fixed width for bins is assumed, as numpy's histogram gives - # fixed width bins for int values for 'bins' - binsize = bin_edges[1] - bin_edges[0] - # calculate number of extra points - extrapoints = len([v for v in a - if defaultlimits[0] > v or v > defaultlimits[1]]) - if extrapoints > 0 and printextras: - warnings.warn("Points outside given histogram range = %s" - % extrapoints) - return (hist, defaultlimits[0], binsize, extrapoints) - - - def cumfreq(a, numbins=10, defaultreallimits=None, weights=None): - """ - Returns a cumulative frequency histogram, using the histogram function. - - Parameters - ---------- - a : array_like - Input array.
- numbins : int, optional - The number of bins to use for the histogram. Default is 10. - defaultreallimits : tuple (lower, upper), optional - The lower and upper values for the range of the histogram. - If no value is given, a range slightly larger than the range of the - values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``, - where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. - weights : array_like, optional - The weights for each value in `a`. Default is None, which gives each - value a weight of 1.0 - - Returns - ------- - cumfreq : ndarray - Binned values of cumulative frequency. - lowerreallimit : float - Lower real limit - binsize : float - Width of each bin. - extrapoints : int - Extra points. - - Examples - -------- - >>> x = [1, 4, 2, 1, 3, 1] - >>> cumfreqs, lowlim, binsize, extrapoints = sp.stats.cumfreq(x, numbins=4) - >>> cumfreqs - array([ 3., 4., 5., 6.]) - >>> cumfreqs, lowlim, binsize, extrapoints = \ - ... sp.stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5)) - >>> cumfreqs - array([ 1., 2., 3., 3.]) - >>> extrapoints - 3 - - """ - h,l,b,e = histogram(a, numbins, defaultreallimits, weights=weights) - cumhist = np.cumsum(h*1, axis=0) - return cumhist,l,b,e - - - def relfreq(a, numbins=10, defaultreallimits=None, weights=None): - """ - Returns a relative frequency histogram, using the histogram function. - - Parameters - ---------- - a : array_like - Input array. - numbins : int, optional - The number of bins to use for the histogram. Default is 10. - defaultreallimits : tuple (lower, upper), optional - The lower and upper values for the range of the histogram. - If no value is given, a range slightly larger than the range of the - values in a is used. Specifically ``(a.min() - s, a.max() + s)``, - where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. - weights : array_like, optional - The weights for each value in `a`. Default is None, which gives each - value a weight of 1.0 - - Returns - ------- - relfreq : ndarray - Binned values of relative frequency. - lowerreallimit : float - Lower real limit - binsize : float - Width of each bin. - extrapoints : int - Extra points. - - Examples - -------- - >>> a = np.array([1, 4, 2, 1, 3, 1]) - >>> relfreqs, lowlim, binsize, extrapoints = sp.stats.relfreq(a, numbins=4) - >>> relfreqs - array([ 0.5 , 0.16666667, 0.16666667, 0.16666667]) - >>> np.sum(relfreqs) # relative frequencies should add up to 1 - 0.99999999999999989 - - """ - h, l, b, e = histogram(a, numbins, defaultreallimits, weights=weights) - h = np.array(h / float(np.array(a).shape[0])) - return h, l, b, e - - - ##################################### - ###### VARIABILITY FUNCTIONS ##### - ##################################### - - def obrientransform(*args): - """ - Computes the O'Brien transform on input data (any number of arrays). - - Used to test for homogeneity of variance prior to running one-way stats. - Each array in ``*args`` is one level of a factor. - If `f_oneway` is run on the transformed data and found significant, - the variances are unequal. From Maxwell and Delaney [1]_, p.112. - - Parameters - ---------- - args : tuple of array_like - Any number of arrays. - - Returns - ------- - obrientransform : ndarray - Transformed data for use in an ANOVA. The first dimension - of the result corresponds to the sequence of transformed - arrays. If the arrays given are all 1-D of the same length, - the return value is a 2-D array; otherwise it is a 1-D array - of type object, with each element being an ndarray. - - References - ---------- - .. [1] S.
E. Maxwell and H. D. Delaney, "Designing Experiments and - Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990. - - Examples - -------- - We'll test the following data sets for differences in their variance. - - >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10] - >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15] - - Apply the O'Brien transform to the data. - - >>> tx, ty = obrientransform(x, y) - - Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the - transformed data. - - >>> from scipy.stats import f_oneway - >>> F, p = f_oneway(tx, ty) - >>> p - 0.1314139477040335 - - If we require that ``p < 0.05`` for significance, we cannot conclude - that the variances are different. - """ - TINY = np.sqrt(np.finfo(float).eps) - - # `arrays` will hold the transformed arguments. - arrays = [] - - for arg in args: - a = np.asarray(arg) - n = len(a) - mu = np.mean(a) - sq = (a - mu)**2 - sumsq = sq.sum() - - # The O'Brien transform. - t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2)) - - # Check that the mean of the transformed data is equal to the - # original variance. - var = sumsq / (n - 1) - if abs(var - np.mean(t)) > TINY: - raise ValueError('Lack of convergence in obrientransform.') - - arrays.append(t) - - # If the arrays are not all the same shape, calling np.array(arrays) - # creates a 1-D array with dtype `object` in numpy 1.6+. In numpy - # 1.5.x, it raises an exception. To work around this, we explicitly - # set the dtype to `object` when the arrays are not all the same shape. - if len(arrays) < 2 or all(x.shape == arrays[0].shape for x in arrays[1:]): - dt = None - else: - dt = object - return np.array(arrays, dtype=dt) - - - def signaltonoise(a, axis=0, ddof=0): - """ - The signal-to-noise ratio of the input data. - - Returns the signal-to-noise ratio of `a`, here defined as the mean - divided by the standard deviation. - - Parameters - ---------- - a : array_like - An array_like object containing the sample data. - axis : int or None, optional - If axis is equal to None, the array is first ravel'd. If axis is an - integer, this is the axis over which to operate. Default is 0. - ddof : int, optional - Degrees of freedom correction for standard deviation. Default is 0. - - Returns - ------- - s2n : ndarray - The mean to standard deviation ratio(s) along `axis`, or 0 where the - standard deviation is 0. - - """ - a = np.asanyarray(a) - m = a.mean(axis) - sd = a.std(axis=axis, ddof=ddof) - return np.where(sd == 0, 0, m/sd) - - - def sem(a, axis=0, ddof=1): - """ - Calculates the standard error of the mean (or standard error of - measurement) of the values in the input array. - - Parameters - ---------- - a : array_like - An array containing the values for which the standard error is - returned. - axis : int or None, optional. - If axis is None, ravel `a` first. If axis is an integer, this will be - the axis over which to operate. Defaults to 0. - ddof : int, optional - Delta degrees-of-freedom. How many degrees of freedom to adjust - for bias in limited samples relative to the population estimate - of variance. Defaults to 1. - - Returns - ------- - s : ndarray or float - The standard error of the mean in the sample(s), along the input axis. - - Notes - ----- - The default value for `ddof` is different from the default (0) used by other - ddof-containing routines, such as np.std and stats.nanstd.
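A quick sketch of the relationship implied above (the array below is purely illustrative and not part of the original examples): for a 1-D sample the result is simply the ``ddof=1`` standard deviation divided by the square root of the sample size.

>>> x = np.arange(20)   # illustrative data, assumed only for this sketch
>>> np.allclose(sem(x), np.std(x, ddof=1) / np.sqrt(len(x)))
True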
- - Examples - -------- - Find standard error along the first axis: - - >>> from scipy import stats - >>> a = np.arange(20).reshape(5,4) - >>> stats.sem(a) - array([ 2.8284, 2.8284, 2.8284, 2.8284]) - - Find standard error across the whole array, using n degrees of freedom: - - >>> stats.sem(a, axis=None, ddof=0) - 1.2893796958227628 - - """ - a, axis = _chk_asarray(a, axis) - n = a.shape[axis] - s = np.std(a,axis=axis, ddof=ddof) / np.sqrt(n) # JP check normalization - return s - - -def zscore(a, axis=0, ddof=0): - """ - Calculates the z score of each value in the sample, relative to the sample - mean and standard deviation. - - Parameters - ---------- - a : array_like - An array like object containing the sample data. - axis : int or None, optional - If `axis` is equal to None, the array is first raveled. If `axis` is - an integer, this is the axis over which to operate. Default is 0. - ddof : int, optional - Degrees of freedom correction in the calculation of the - standard deviation. Default is 0. - - Returns - ------- - zscore : array_like - The z-scores, standardized by mean and standard deviation of input - array `a`. - - Notes - ----- - This function preserves ndarray subclasses, and works also with - matrices and masked arrays (it uses `asanyarray` instead of `asarray` - for parameters). - - Examples - -------- - >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091, 0.1954, - 0.6307, 0.6599, 0.1065, 0.0508]) - >>> from scipy import stats - >>> stats.zscore(a) - array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786, - 0.6748, -1.1488, -1.3324]) - - Computing along a specified axis, using n-1 degrees of freedom (``ddof=1``) - to calculate the standard deviation: - - >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608], - [ 0.7149, 0.0775, 0.6072, 0.9656], - [ 0.6341, 0.1403, 0.9759, 0.4064], - [ 0.5918, 0.6948, 0.904 , 0.3721], - [ 0.0921, 0.2481, 0.1188, 0.1366]]) - >>> stats.zscore(b, axis=1, ddof=1) - array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358], - [ 0.33048416, -1.37380874, 0.04251374, 1.00081084], - [ 0.26796377, -1.12598418, 1.23283094, -0.37481053], - [-0.22095197, 0.24468594, 1.19042819, -1.21416216], - [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]]) - """ - a = np.asanyarray(a) - mns = a.mean(axis=axis) - sstd = a.std(axis=axis, ddof=ddof) - if axis and mns.ndim < a.ndim: - return ((a - np.expand_dims(mns, axis=axis)) / - np.expand_dims(sstd,axis=axis)) - else: - return (a - mns) / sstd - - -def zmap(scores, compare, axis=0, ddof=0): - """ - Calculates the relative z-scores. - - Returns an array of z-scores, i.e., scores that are standardized to zero - mean and unit variance, where mean and variance are calculated from the - comparison array. - - Parameters - ---------- - scores : array_like - The input for which z-scores are calculated. - compare : array_like - The input from which the mean and standard deviation of the - normalization are taken; assumed to have the same dimension as - `scores`. - axis : int or None, optional - Axis over which mean and variance of `compare` are calculated. - Default is 0. - ddof : int, optional - Degrees of freedom correction in the calculation of the - standard deviation. Default is 0. - - Returns - ------- - zscore : array_like - Z-scores, in the same shape as `scores`. - - Notes - ----- - This function preserves ndarray subclasses, and works also with - matrices and masked arrays (it uses `asanyarray` instead of `asarray` - for parameters). 
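A minimal sketch of the definition (the arrays here are illustrative assumptions, not part of the original examples): with the default ``ddof=0`` the result is just ``(scores - compare.mean()) / compare.std()``.

>>> scores, compare = np.array([0.5, 2.0, 2.5, 3]), np.arange(5)   # illustrative data
>>> np.allclose(zmap(scores, compare), (scores - compare.mean()) / compare.std())
True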
- - Examples - -------- - >>> a = [0.5, 2.0, 2.5, 3] - >>> b = [0, 1, 2, 3, 4] - >>> zmap(a, b) - array([-1.06066017, 0. , 0.35355339, 0.70710678]) - """ - scores, compare = map(np.asanyarray, [scores, compare]) - mns = compare.mean(axis=axis) - sstd = compare.std(axis=axis, ddof=ddof) - if axis and mns.ndim < compare.ndim: - return ((scores - np.expand_dims(mns, axis=axis)) / - np.expand_dims(sstd,axis=axis)) - else: - return (scores - mns) / sstd - - -##################################### -####### TRIMMING FUNCTIONS ####### -##################################### - -def threshold(a, threshmin=None, threshmax=None, newval=0): - """ - Clip array to a given value. - - Similar to numpy.clip(), except that values less than `threshmin` or - greater than `threshmax` are replaced by `newval`, instead of by - `threshmin` and `threshmax` respectively. - - Parameters - ---------- - a : array_like - Data to threshold. - threshmin : float, int or None, optional - Minimum threshold, defaults to None. - threshmax : float, int or None, optional - Maximum threshold, defaults to None. - newval : float or int, optional - Value to put in place of values in `a` outside of bounds. - Defaults to 0. - - Returns - ------- - out : ndarray - The clipped input array, with values less than `threshmin` or - greater than `threshmax` replaced with `newval`. - - Examples - -------- - >>> a = np.array([9, 9, 6, 3, 1, 6, 1, 0, 0, 8]) - >>> from scipy import stats - >>> stats.threshold(a, threshmin=2, threshmax=8, newval=-1) - array([-1, -1, 6, 3, -1, 6, -1, -1, -1, 8]) - - """ - a = asarray(a).copy() - mask = zeros(a.shape, dtype=bool) - if threshmin is not None: - mask |= (a < threshmin) - if threshmax is not None: - mask |= (a > threshmax) - a[mask] = newval - return a - - -def sigmaclip(a, low=4., high=4.): - """ - Iterative sigma-clipping of array elements. - - The output array contains only those elements of the input array `c` - that satisfy the conditions :: - - mean(c) - std(c)*low < c < mean(c) + std(c)*high - - Starting from the full sample, all elements outside the critical range are - removed. The iteration continues with a new critical range until no - elements are outside the range. - - Parameters - ---------- - a : array_like - Data array, will be raveled if not 1-D. - low : float, optional - Lower bound factor of sigma clipping. Default is 4. - high : float, optional - Upper bound factor of sigma clipping. Default is 4. - - Returns - ------- - c : ndarray - Input array with clipped elements removed. - critlower : float - Lower threshold value use for clipping. - critlupper : float - Upper threshold value use for clipping. - - Examples - -------- - >>> a = np.concatenate((np.linspace(9.5,10.5,31), np.linspace(0,20,5))) - >>> fact = 1.5 - >>> c, low, upp = sigmaclip(a, fact, fact) - >>> c - array([ 9.96666667, 10. , 10.03333333, 10. 
]) - >>> c.var(), c.std() - (0.00055555555555555165, 0.023570226039551501) - >>> low, c.mean() - fact*c.std(), c.min() - (9.9646446609406727, 9.9646446609406727, 9.9666666666666668) - >>> upp, c.mean() + fact*c.std(), c.max() - (10.035355339059327, 10.035355339059327, 10.033333333333333) - - >>> a = np.concatenate((np.linspace(9.5,10.5,11), - np.linspace(-100,-50,3))) - >>> c, low, upp = sigmaclip(a, 1.8, 1.8) - >>> (c == np.linspace(9.5,10.5,11)).all() - True - - """ - c = np.asarray(a).ravel() - delta = 1 - while delta: - c_std = c.std() - c_mean = c.mean() - size = c.size - critlower = c_mean - c_std*low - critupper = c_mean + c_std*high - c = c[(c > critlower) & (c < critupper)] - delta = size-c.size - return c, critlower, critupper - - -def trimboth(a, proportiontocut, axis=0): - """ - Slices off a proportion of items from both ends of an array. - - Slices off the passed proportion of items from both ends of the passed - array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and** - rightmost 10% of scores). You must pre-sort the array if you want - 'proper' trimming. Slices off less if proportion results in a - non-integer slice index (i.e., conservatively slices off - `proportiontocut`). - - Parameters - ---------- - a : array_like - Data to trim. - proportiontocut : float - Proportion (in range 0-1) of total data set to trim of each end. - axis : int or None, optional - Axis along which the observations are trimmed. The default is to trim - along axis=0. If axis is None then the array will be flattened before - trimming. - - Returns - ------- - out : ndarray - Trimmed version of array `a`. - - See Also - -------- - trim_mean - - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(20) - >>> b = stats.trimboth(a, 0.1) - >>> b.shape - (16,) - - """ - a = np.asarray(a) - if axis is None: - a = a.ravel() - axis = 0 - - nobs = a.shape[axis] - lowercut = int(proportiontocut * nobs) - uppercut = nobs - lowercut - if (lowercut >= uppercut): - raise ValueError("Proportion too big.") - - sl = [slice(None)] * a.ndim - sl[axis] = slice(lowercut, uppercut) - return a[sl] - - -def trim1(a, proportiontocut, tail='right'): - """ - Slices off a proportion of items from ONE end of the passed array - distribution. - - If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost' - 10% of scores. Slices off LESS if proportion results in a non-integer - slice index (i.e., conservatively slices off `proportiontocut` ). - - Parameters - ---------- - a : array_like - Input array - proportiontocut : float - Fraction to cut off of 'left' or 'right' of distribution - tail : {'left', 'right'}, optional - Defaults to 'right'. - - Returns - ------- - trim1 : ndarray - Trimmed version of array `a` - - """ - a = asarray(a) - if tail.lower() == 'right': - lowercut = 0 - uppercut = len(a) - int(proportiontocut*len(a)) - elif tail.lower() == 'left': - lowercut = int(proportiontocut*len(a)) - uppercut = len(a) - - return a[lowercut:uppercut] - - -def trim_mean(a, proportiontocut, axis=0): - """ - Return mean of array after trimming distribution from both lower and upper - tails. - - If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of - scores. Slices off LESS if proportion results in a non-integer slice - index (i.e., conservatively slices off `proportiontocut` ). 
- - Parameters - ---------- - a : array_like - Input array - proportiontocut : float - Fraction to cut off of both tails of the distribution - axis : int or None, optional - Axis along which the trimmed means are computed. The default is axis=0. - If axis is None then the trimmed mean will be computed for the - flattened array. - - Returns - ------- - trim_mean : ndarray - Mean of trimmed array. - - See Also - -------- - trimboth - - Examples - -------- - >>> from scipy import stats - >>> x = np.arange(20) - >>> stats.trim_mean(x, 0.1) - 9.5 - >>> x2 = x.reshape(5, 4) - >>> x2 - array([[ 0, 1, 2, 3], - [ 4, 5, 6, 7], - [ 8, 9, 10, 11], - [12, 13, 14, 15], - [16, 17, 18, 19]]) - >>> stats.trim_mean(x2, 0.25) - array([ 8., 9., 10., 11.]) - >>> stats.trim_mean(x2, 0.25, axis=1) - array([ 1.5, 5.5, 9.5, 13.5, 17.5]) - - """ - a = np.asarray(a) - if axis is None: - nobs = a.size - else: - nobs = a.shape[axis] - lowercut = int(proportiontocut * nobs) - uppercut = nobs - lowercut - 1 - if (lowercut > uppercut): - raise ValueError("Proportion too big.") - - try: - atmp = np.partition(a, (lowercut, uppercut), axis) - except AttributeError: - atmp = np.sort(a, axis) - - newa = trimboth(atmp, proportiontocut, axis=axis) - return np.mean(newa, axis=axis) - - -def f_oneway(*args): - """ - Performs a 1-way ANOVA. - - The one-way ANOVA tests the null hypothesis that two or more groups have - the same population mean. The test is applied to samples from two or - more groups, possibly with differing sizes. - - Parameters - ---------- - sample1, sample2, ... : array_like - The sample measurements for each group. - - Returns - ------- - F-value : float - The computed F-value of the test. - p-value : float - The associated p-value from the F-distribution. - - Notes - ----- - The ANOVA test has important assumptions that must be satisfied in order - for the associated p-value to be valid. - - 1. The samples are independent. - 2. Each sample is from a normally distributed population. - 3. The population standard deviations of the groups are all equal. This - property is known as homoscedasticity. - - If these assumptions are not true for a given set of data, it may still be - possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) although - with some loss of power. - - The algorithm is from Heiman[2], pp.394-7. - - - References - ---------- - .. [1] Lowry, Richard. "Concepts and Applications of Inferential - Statistics". Chapter 14. - http://faculty.vassar.edu/lowry/ch14pt1.html - - .. [2] Heiman, G.W. Research Methods in Statistics. 2002. - - """ - args = list(map(np.asarray, args)) # convert to an numpy array - na = len(args) # ANOVA on 'na' groups, each in it's own array - alldata = np.concatenate(args) - bign = len(alldata) - sstot = ss(alldata) - (square_of_sums(alldata) / float(bign)) - ssbn = 0 - for a in args: - ssbn += square_of_sums(a) / float(len(a)) - ssbn -= (square_of_sums(alldata) / float(bign)) - sswn = sstot - ssbn - dfbn = na - 1 - dfwn = bign - na - msb = ssbn / float(dfbn) - msw = sswn / float(dfwn) - f = msb / msw - prob = fprob(dfbn, dfwn, f) - return f, prob - - -def pearsonr(x, y): - """ - Calculates a Pearson correlation coefficient and the p-value for testing - non-correlation. - - The Pearson correlation coefficient measures the linear relationship - between two datasets. Strictly speaking, Pearson's correlation requires - that each dataset be normally distributed. Like other correlation - coefficients, this one varies between -1 and +1 with 0 implying no - correlation. 
Correlations of -1 or +1 imply an exact linear - relationship. Positive correlations imply that as x increases, so does - y. Negative correlations imply that as x increases, y decreases. - - The p-value roughly indicates the probability of an uncorrelated system - producing datasets that have a Pearson correlation at least as extreme - as the one computed from these datasets. The p-values are not entirely - reliable but are probably reasonable for datasets larger than 500 or so. - - Parameters - ---------- - x : (N,) array_like - Input - y : (N,) array_like - Input - - Returns - ------- - (Pearson's correlation coefficient, - 2-tailed p-value) - - References - ---------- - http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation - - """ - # x and y should have same length. - x = np.asarray(x) - y = np.asarray(y) - n = len(x) - mx = x.mean() - my = y.mean() - xm, ym = x-mx, y-my - r_num = np.add.reduce(xm * ym) - r_den = np.sqrt(ss(xm) * ss(ym)) - r = r_num / r_den - - # Presumably, if abs(r) > 1, then it is only some small artifact of floating - # point arithmetic. - r = max(min(r, 1.0), -1.0) - df = n-2 - if abs(r) == 1.0: - prob = 0.0 - else: - t_squared = r*r * (df / ((1.0 - r) * (1.0 + r))) - prob = betai(0.5*df, 0.5, df / (df + t_squared)) - return r, prob - - -def fisher_exact(table, alternative='two-sided'): - """Performs a Fisher exact test on a 2x2 contingency table. - - Parameters - ---------- - table : array_like of ints - A 2x2 contingency table. Elements should be non-negative integers. - alternative : {'two-sided', 'less', 'greater'}, optional - Which alternative hypothesis to the null hypothesis the test uses. - Default is 'two-sided'. - - Returns - ------- - oddsratio : float - This is prior odds ratio and not a posterior estimate. - p_value : float - P-value, the probability of obtaining a distribution at least as - extreme as the one that was actually observed, assuming that the - null hypothesis is true. - - See Also - -------- - chi2_contingency : Chi-square test of independence of variables in a - contingency table. - - Notes - ----- - The calculated odds ratio is different from the one R uses. In R language, - this implementation returns the (more common) "unconditional Maximum - Likelihood Estimate", while R uses the "conditional Maximum Likelihood - Estimate". - - For tables with large numbers the (inexact) chi-square test implemented - in the function `chi2_contingency` can also be used. - - Examples - -------- - Say we spend a few days counting whales and sharks in the Atlantic and - Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the - Indian ocean 2 whales and 5 sharks. Then our contingency table is:: - - Atlantic Indian - whales 8 2 - sharks 1 5 - - We use this table to find the p-value: - - >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]]) - >>> pvalue - 0.0349... - - The probability that we would observe this or an even more imbalanced ratio - by chance is about 3.5%. A commonly used significance level is 5%, if we - adopt that we can therefore conclude that our observed imbalance is - statistically significant; whales prefer the Atlantic while sharks prefer - the Indian ocean. 
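The one-sided alternatives follow the same call pattern; a minimal sketch using the table above (outputs skipped here; each call returns the same unconditional odds ratio together with the corresponding one-sided p-value):

>>> stats.fisher_exact([[8, 2], [1, 5]], alternative='less')     # doctest: +SKIP
>>> stats.fisher_exact([[8, 2], [1, 5]], alternative='greater')  # doctest: +SKIP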
- - """ - hypergeom = distributions.hypergeom - c = np.asarray(table, dtype=np.int64) # int32 is not enough for the algorithm - if not c.shape == (2, 2): - raise ValueError("The input `table` must be of shape (2, 2).") - - if np.any(c < 0): - raise ValueError("All values in `table` must be nonnegative.") - - if 0 in c.sum(axis=0) or 0 in c.sum(axis=1): - # If both values in a row or column are zero, the p-value is 1 and - # the odds ratio is NaN. - return np.nan, 1.0 - - if c[1,0] > 0 and c[0,1] > 0: - oddsratio = c[0,0] * c[1,1] / float(c[1,0] * c[0,1]) - else: - oddsratio = np.inf - - n1 = c[0,0] + c[0,1] - n2 = c[1,0] + c[1,1] - n = c[0,0] + c[1,0] - - def binary_search(n, n1, n2, side): - """Binary search for where to begin lower/upper halves in two-sided - test. - """ - if side == "upper": - minval = mode - maxval = n - else: - minval = 0 - maxval = mode - guess = -1 - while maxval - minval > 1: - if maxval == minval + 1 and guess == minval: - guess = maxval - else: - guess = (maxval + minval) // 2 - pguess = hypergeom.pmf(guess, n1 + n2, n1, n) - if side == "upper": - ng = guess - 1 - else: - ng = guess + 1 - if pguess <= pexact and hypergeom.pmf(ng, n1 + n2, n1, n) > pexact: - break - elif pguess < pexact: - maxval = guess - else: - minval = guess - if guess == -1: - guess = minval - if side == "upper": - while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: - guess -= 1 - while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: - guess += 1 - else: - while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: - guess += 1 - while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: - guess -= 1 - return guess - - if alternative == 'less': - pvalue = hypergeom.cdf(c[0,0], n1 + n2, n1, n) - elif alternative == 'greater': - # Same formula as the 'less' case, but with the second column. - pvalue = hypergeom.cdf(c[0,1], n1 + n2, n1, c[0,1] + c[1,1]) - elif alternative == 'two-sided': - mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2)) - pexact = hypergeom.pmf(c[0,0], n1 + n2, n1, n) - pmode = hypergeom.pmf(mode, n1 + n2, n1, n) - - epsilon = 1 - 1e-4 - if float(np.abs(pexact - pmode)) / np.abs(np.max(pexact, pmode)) <= 1 - epsilon: - return oddsratio, 1. - - elif c[0,0] < mode: - plower = hypergeom.cdf(c[0,0], n1 + n2, n1, n) - if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon: - return oddsratio, plower - - guess = binary_search(n, n1, n2, "upper") - pvalue = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n) - else: - pupper = hypergeom.sf(c[0,0] - 1, n1 + n2, n1, n) - if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon: - return oddsratio, pupper - - guess = binary_search(n, n1, n2, "lower") - pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n) - else: - msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}" - raise ValueError(msg) - - if pvalue > 1.0: - pvalue = 1.0 - return oddsratio, pvalue - - -def spearmanr(a, b=None, axis=0): - """ - Calculates a Spearman rank-order correlation coefficient and the p-value - to test for non-correlation. - - The Spearman correlation is a nonparametric measure of the monotonicity - of the relationship between two datasets. Unlike the Pearson correlation, - the Spearman correlation does not assume that both datasets are normally - distributed. Like other correlation coefficients, this one varies - between -1 and +1 with 0 implying no correlation. Correlations of -1 or - +1 imply an exact monotonic relationship. 
Positive correlations imply that - as x increases, so does y. Negative correlations imply that as x - increases, y decreases. - - The p-value roughly indicates the probability of an uncorrelated system - producing datasets that have a Spearman correlation at least as extreme - as the one computed from these datasets. The p-values are not entirely - reliable but are probably reasonable for datasets larger than 500 or so. - - Parameters - ---------- - a, b : 1D or 2D array_like, b is optional - One or two 1-D or 2-D arrays containing multiple variables and - observations. Each column of `a` and `b` represents a variable, and - each row entry a single observation of those variables. See also - `axis`. Both arrays need to have the same length in the `axis` - dimension. - axis : int or None, optional - If axis=0 (default), then each column represents a variable, with - observations in the rows. If axis=1, the relationship is transposed: - each row represents a variable, while the columns contain observations. - If axis=None, then both arrays will be raveled. - - Returns - ------- - rho : float or ndarray (2-D square) - Spearman correlation matrix or correlation coefficient (if only 2 - variables are given as parameters). Correlation matrix is square with - length equal to total number of variables (columns or rows) in a and b - combined. - p-value : float - The two-sided p-value for a hypothesis test whose null hypothesis is - that two sets of data are uncorrelated, has same dimension as rho. - - Notes - ----- - Changes in scipy 0.8.0: rewrite to add tie-handling, and axis. - - References - ---------- - [CRCProbStat2000]_ Section 14.7 - - .. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - - Examples - -------- - >>> spearmanr([1,2,3,4,5],[5,6,7,8,7]) - (0.82078268166812329, 0.088587005313543798) - >>> np.random.seed(1234321) - >>> x2n=np.random.randn(100,2) - >>> y2n=np.random.randn(100,2) - >>> spearmanr(x2n) - (0.059969996999699973, 0.55338590803773591) - >>> spearmanr(x2n[:,0], x2n[:,1]) - (0.059969996999699973, 0.55338590803773591) - >>> rho, pval = spearmanr(x2n,y2n) - >>> rho - array([[ 1. , 0.05997 , 0.18569457, 0.06258626], - [ 0.05997 , 1. , 0.110003 , 0.02534653], - [ 0.18569457, 0.110003 , 1. , 0.03488749], - [ 0.06258626, 0.02534653, 0.03488749, 1. ]]) - >>> pval - array([[ 0. , 0.55338591, 0.06435364, 0.53617935], - [ 0.55338591, 0. , 0.27592895, 0.80234077], - [ 0.06435364, 0.27592895, 0. , 0.73039992], - [ 0.53617935, 0.80234077, 0.73039992, 0. ]]) - >>> rho, pval = spearmanr(x2n.T, y2n.T, axis=1) - >>> rho - array([[ 1. , 0.05997 , 0.18569457, 0.06258626], - [ 0.05997 , 1. , 0.110003 , 0.02534653], - [ 0.18569457, 0.110003 , 1. , 0.03488749], - [ 0.06258626, 0.02534653, 0.03488749, 1.
]]) - >>> spearmanr(x2n, y2n, axis=None) - (0.10816770419260482, 0.1273562188027364) - >>> spearmanr(x2n.ravel(), y2n.ravel()) - (0.10816770419260482, 0.1273562188027364) - - >>> xint = np.random.randint(10,size=(100,2)) - >>> spearmanr(xint) - (0.052760927029710199, 0.60213045837062351) - - """ - a, axisout = _chk_asarray(a, axis) - ar = np.apply_along_axis(rankdata,axisout,a) - - br = None - if not b is None: - b, axisout = _chk_asarray(b, axis) - br = np.apply_along_axis(rankdata,axisout,b) - n = a.shape[axisout] - rs = np.corrcoef(ar,br,rowvar=axisout) - - olderr = np.seterr(divide='ignore') # rs can have elements equal to 1 - try: - t = rs * np.sqrt((n-2) / ((rs+1.0)*(1.0-rs))) - finally: - np.seterr(**olderr) - prob = distributions.t.sf(np.abs(t),n-2)*2 - - if rs.shape == (2,2): - return rs[1,0], prob[1,0] - else: - return rs, prob - - -def pointbiserialr(x, y): - """Calculates a point biserial correlation coefficient and the associated - p-value. - - The point biserial correlation is used to measure the relationship - between a binary variable, x, and a continuous variable, y. Like other - correlation coefficients, this one varies between -1 and +1 with 0 - implying no correlation. Correlations of -1 or +1 imply a determinative - relationship. - - This function uses a shortcut formula but produces the same result as - `pearsonr`. - - Parameters - ---------- - x : array_like of bools - Input array. - y : array_like - Input array. - - Returns - ------- - r : float - R value - p-value : float - 2-tailed p-value - - References - ---------- - http://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient - - Examples - -------- - >>> from scipy import stats - >>> a = np.array([0, 0, 0, 1, 1, 1, 1]) - >>> b = np.arange(7) - >>> stats.pointbiserialr(a, b) - (0.8660254037844386, 0.011724811003954652) - >>> stats.pearsonr(a, b) - (0.86602540378443871, 0.011724811003954626) - >>> np.corrcoef(a, b) - array([[ 1. , 0.8660254], - [ 0.8660254, 1. ]]) - - """ - x = np.asarray(x, dtype=bool) - y = np.asarray(y, dtype=float) - n = len(x) - - # phat is the fraction of x values that are True - phat = x.sum() / float(len(x)) - y0 = y[~x] # y-values where x is False - y1 = y[x] # y-values where x is True - y0m = y0.mean() - y1m = y1.mean() - - # phat - phat**2 is more stable than phat*(1-phat) - rpb = (y1m - y0m) * np.sqrt(phat - phat**2) / y.std() - - df = n-2 - # fixme: see comment about TINY in pearsonr() - TINY = 1e-20 - t = rpb*np.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY))) - prob = betai(0.5*df, 0.5, df/(df+t*t)) - return rpb, prob - - -def kendalltau(x, y, initial_lexsort=True): - """ - Calculates Kendall's tau, a correlation measure for ordinal data. - - Kendall's tau is a measure of the correspondence between two rankings. - Values close to 1 indicate strong agreement, values close to -1 indicate - strong disagreement. This is the tau-b version of Kendall's tau which - accounts for ties. - - Parameters - ---------- - x, y : array_like - Arrays of rankings, of the same shape. If arrays are not 1-D, they will - be flattened to 1-D. - initial_lexsort : bool, optional - Whether to use lexsort or quicksort as the sorting method for the - initial sort of the inputs. Default is lexsort (True), for which - `kendalltau` is of complexity O(n log(n)). If False, the complexity is - O(n^2), but with a smaller pre-factor (so quicksort may be faster for - small arrays). - - Returns - ------- - Kendall's tau : float - The tau statistic. 
- p-value : float - The two-sided p-value for a hypothesis test whose null hypothesis is - an absence of association, tau = 0. - - Notes - ----- - The definition of Kendall's tau that is used is:: - - tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U)) - - where P is the number of concordant pairs, Q the number of discordant - pairs, T the number of ties only in `x`, and U the number of ties only in - `y`. If a tie occurs for the same pair in both `x` and `y`, it is not - added to either T or U. - - References - ---------- - W.R. Knight, "A Computer Method for Calculating Kendall's Tau with - Ungrouped Data", Journal of the American Statistical Association, Vol. 61, - No. 314, Part 1, pp. 436-439, 1966. - - Examples - -------- - >>> x1 = [12, 2, 1, 12, 2] - >>> x2 = [1, 4, 7, 1, 0] - >>> tau, p_value = sp.stats.kendalltau(x1, x2) - >>> tau - -0.47140452079103173 - >>> p_value - 0.24821309157521476 - - """ - - x = np.asarray(x).ravel() - y = np.asarray(y).ravel() - n = np.int64(len(x)) - temp = list(range(n)) # support structure used by mergesort - # this closure recursively sorts sections of perm[] by comparing - # elements of y[perm[]] using temp[] as support - # returns the number of swaps required by an equivalent bubble sort - - def mergesort(offs, length): - exchcnt = 0 - if length == 1: - return 0 - if length == 2: - if y[perm[offs]] <= y[perm[offs+1]]: - return 0 - t = perm[offs] - perm[offs] = perm[offs+1] - perm[offs+1] = t - return 1 - length0 = length // 2 - length1 = length - length0 - middle = offs + length0 - exchcnt += mergesort(offs, length0) - exchcnt += mergesort(middle, length1) - if y[perm[middle - 1]] < y[perm[middle]]: - return exchcnt - # merging - i = j = k = 0 - while j < length0 or k < length1: - if k >= length1 or (j < length0 and y[perm[offs + j]] <= - y[perm[middle + k]]): - temp[i] = perm[offs + j] - d = i - j - j += 1 - else: - temp[i] = perm[middle + k] - d = (offs + i) - (middle + k) - k += 1 - if d > 0: - exchcnt += d - i += 1 - perm[offs:offs+length] = temp[0:length] - return exchcnt - - # initial sort on values of x and, if tied, on values of y - if initial_lexsort: - # sort implemented as mergesort, worst case: O(n log(n)) - perm = np.lexsort((y, x)) - else: - # sort implemented as quicksort, 30% faster but with worst case: O(n^2) - perm = list(range(n)) - perm.sort(key=lambda a: (x[a], y[a])) - - # compute joint ties - first = 0 - t = 0 - for i in xrange(1, n): - if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]: - t += ((i - first) * (i - first - 1)) // 2 - first = i - t += ((n - first) * (n - first - 1)) // 2 - - # compute ties in x - first = 0 - u = 0 - for i in xrange(1,n): - if x[perm[first]] != x[perm[i]]: - u += ((i - first) * (i - first - 1)) // 2 - first = i - u += ((n - first) * (n - first - 1)) // 2 - - # count exchanges - exchanges = mergesort(0, n) - # compute ties in y after mergesort with counting - first = 0 - v = 0 - for i in xrange(1,n): - if y[perm[first]] != y[perm[i]]: - v += ((i - first) * (i - first - 1)) // 2 - first = i - v += ((n - first) * (n - first - 1)) // 2 - - tot = (n * (n - 1)) // 2 - if tot == u or tot == v: - return (np.nan, np.nan) # Special case for all ties in both ranks - - # Prevent overflow; equal to np.sqrt((tot - u) * (tot - v)) - denom = np.exp(0.5 * (np.log(tot - u) + np.log(tot - v))) - tau = ((tot - (v + u - t)) - 2.0 * exchanges) / denom - - # what follows reproduces the ending of Gary Strangman's original - # stats.kendalltau() in SciPy - svar = (4.0 * n + 10.0) / (9.0 * n * (n - 1)) - z = tau 
/ np.sqrt(svar) - prob = special.erfc(np.abs(z) / 1.4142136) - - return tau, prob - - - def linregress(x, y=None): - """ - Calculate a regression line - - This computes a least-squares regression for two sets of measurements. - - Parameters - ---------- - x, y : array_like - two sets of measurements. Both arrays should have the same length. - If only x is given (and y=None), then it must be a two-dimensional - array where one dimension has length 2. The two sets of measurements - are then found by splitting the array along the length-2 dimension. - - Returns - ------- - slope : float - slope of the regression line - intercept : float - intercept of the regression line - r-value : float - correlation coefficient - p-value : float - two-sided p-value for a hypothesis test whose null hypothesis is - that the slope is zero. - stderr : float - Standard error of the estimate - - - Examples - -------- - >>> from scipy import stats - >>> import numpy as np - >>> x = np.random.random(10) - >>> y = np.random.random(10) - >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x,y) - - # To get coefficient of determination (r_squared) - - >>> print "r-squared:", r_value**2 - r-squared: 0.15286643777 - - """ - TINY = 1.0e-20 - if y is None: # x is a (2, N) or (N, 2) shaped array_like - x = asarray(x) - if x.shape[0] == 2: - x, y = x - elif x.shape[1] == 2: - x, y = x.T - else: - msg = "If only `x` is given as input, it has to be of shape (2, N) \ - or (N, 2), provided shape was %s" % str(x.shape) - raise ValueError(msg) - else: - x = asarray(x) - y = asarray(y) - n = len(x) - xmean = np.mean(x,None) - ymean = np.mean(y,None) - - # average sum of squares: - ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat - r_num = ssxym - r_den = np.sqrt(ssxm*ssym) - if r_den == 0.0: - r = 0.0 - else: - r = r_num / r_den - # test for numerical error propagation - if (r > 1.0): - r = 1.0 - elif (r < -1.0): - r = -1.0 - - df = n-2 - t = r*np.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) - prob = distributions.t.sf(np.abs(t),df)*2 - slope = r_num / ssxm - intercept = ymean - slope*xmean - sterrest = np.sqrt((1-r*r)*ssym / ssxm / df) - return slope, intercept, r, prob, sterrest - - - ##################################### - ##### INFERENTIAL STATISTICS ##### - ##################################### - - def ttest_1samp(a, popmean, axis=0): - """ - Calculates the T-test for the mean of ONE group of scores. - - This is a two-sided test for the null hypothesis that the expected value - (mean) of a sample of independent observations `a` is equal to the given - population mean, `popmean`. - - Parameters - ---------- - a : array_like - sample observation - popmean : float or array_like - expected value in null hypothesis, if array_like then it must have the - same shape as `a` excluding the axis dimension - axis : int, optional, (default axis=0) - Axis can equal None (ravel array first), or an integer (the axis - over which to operate on a). - - Returns - ------- - t : float or array - t-statistic - prob : float or array - two-tailed p-value - - Examples - -------- - >>> from scipy import stats - - >>> np.random.seed(7654567) # fix seed to get the same result - >>> rvs = stats.norm.rvs(loc=5, scale=10, size=(50,2)) - - Test if mean of random sample is equal to true mean, and different mean. - We reject the null hypothesis in the second case and don't reject it in - the first case.
- - >>> stats.ttest_1samp(rvs,5.0) - (array([-0.68014479, -0.04323899]), array([ 0.49961383, 0.96568674])) - >>> stats.ttest_1samp(rvs,0.0) - (array([ 2.77025808, 4.11038784]), array([ 0.00789095, 0.00014999])) - - Examples using axis and non-scalar dimension for population mean. - - >>> stats.ttest_1samp(rvs,[5.0,0.0]) - (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) - >>> stats.ttest_1samp(rvs.T,[5.0,0.0],axis=1) - (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) - >>> stats.ttest_1samp(rvs,[[5.0],[0.0]]) - (array([[-0.68014479, -0.04323899], - [ 2.77025808, 4.11038784]]), array([[ 4.99613833e-01, 9.65686743e-01], - [ 7.89094663e-03, 1.49986458e-04]])) - - """ - a, axis = _chk_asarray(a, axis) - n = a.shape[axis] - df = n - 1 - - d = np.mean(a, axis) - popmean - v = np.var(a, axis, ddof=1) - denom = np.sqrt(v / float(n)) - - t = np.divide(d, denom) - t, prob = _ttest_finish(df, t) - - return t, prob - - -def _ttest_finish(df,t): - """Common code between all 3 t-test functions.""" - prob = distributions.t.sf(np.abs(t), df) * 2 # use np.abs to get upper tail - if t.ndim == 0: - t = t[()] - - return t, prob - - -def ttest_ind(a, b, axis=0, equal_var=True): - """ - Calculates the T-test for the means of TWO INDEPENDENT samples of scores. - - This is a two-sided test for the null hypothesis that 2 independent samples - have identical average (expected) values. This test assumes that the - populations have identical variances. - - Parameters - ---------- - a, b : array_like - The arrays must have the same shape, except in the dimension - corresponding to `axis` (the first, by default). - axis : int, optional - Axis can equal None (ravel array first), or an integer (the axis - over which to operate on a and b). - equal_var : bool, optional - If True (default), perform a standard independent 2 sample test - that assumes equal population variances [1]_. - If False, perform Welch's t-test, which does not assume equal - population variance [2]_. - - .. versionadded:: 0.11.0 - - Returns - ------- - t : float or array - The calculated t-statistic. - prob : float or array - The two-tailed p-value. - - Notes - ----- - We can use this test, if we observe two independent samples from - the same or different population, e.g. exam scores of boys and - girls or of two ethnic groups. The test measures whether the - average (expected) value differs significantly across samples. If - we observe a large p-value, for example larger than 0.05 or 0.1, - then we cannot reject the null hypothesis of identical average scores. - If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%, - then we reject the null hypothesis of equal averages. - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test - - .. 
[2] http://en.wikipedia.org/wiki/Welch%27s_t_test - - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(12345678) - - Test with sample with identical means: - - >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) - >>> rvs2 = stats.norm.rvs(loc=5,scale=10,size=500) - >>> stats.ttest_ind(rvs1,rvs2) - (0.26833823296239279, 0.78849443369564776) - >>> stats.ttest_ind(rvs1,rvs2, equal_var = False) - (0.26833823296239279, 0.78849452749500748) - - `ttest_ind` underestimates p for unequal variances: - - >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500) - >>> stats.ttest_ind(rvs1, rvs3) - (-0.46580283298287162, 0.64145827413436174) - >>> stats.ttest_ind(rvs1, rvs3, equal_var = False) - (-0.46580283298287162, 0.64149646246569292) - - When n1 != n2, the equal variance t-statistic is no longer equal to the - unequal variance t-statistic: - - >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100) - >>> stats.ttest_ind(rvs1, rvs4) - (-0.99882539442782481, 0.3182832709103896) - >>> stats.ttest_ind(rvs1, rvs4, equal_var = False) - (-0.69712570584654099, 0.48716927725402048) - - T-test with different means, variance, and n: - - >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100) - >>> stats.ttest_ind(rvs1, rvs5) - (-1.4679669854490653, 0.14263895620529152) - >>> stats.ttest_ind(rvs1, rvs5, equal_var = False) - (-0.94365973617132992, 0.34744170334794122) - - """ - a, b, axis = _chk2_asarray(a, b, axis) - if a.size == 0 or b.size == 0: - return (np.nan, np.nan) - - v1 = np.var(a, axis, ddof=1) - v2 = np.var(b, axis, ddof=1) - n1 = a.shape[axis] - n2 = b.shape[axis] - - if (equal_var): - df = n1 + n2 - 2 - svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df) - denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2)) - else: - vn1 = v1 / n1 - vn2 = v2 / n2 - df = ((vn1 + vn2)**2) / ((vn1**2) / (n1 - 1) + (vn2**2) / (n2 - 1)) - - # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0). - # Hence it doesn't matter what df is as long as it's not NaN. - df = np.where(np.isnan(df), 1, df) - denom = np.sqrt(vn1 + vn2) - - d = np.mean(a, axis) - np.mean(b, axis) - t = np.divide(d, denom) - t, prob = _ttest_finish(df, t) - - return t, prob - - -def ttest_rel(a, b, axis=0): - """ - Calculates the T-test on TWO RELATED samples of scores, a and b. - - This is a two-sided test for the null hypothesis that 2 related or - repeated samples have identical average (expected) values. - - Parameters - ---------- - a, b : array_like - The arrays must have the same shape. - axis : int, optional, (default axis=0) - Axis can equal None (ravel array first), or an integer (the axis - over which to operate on a and b). - - Returns - ------- - t : float or array - t-statistic - prob : float or array - two-tailed p-value - - Notes - ----- - Examples for the use are scores of the same set of student in - different exams, or repeated sampling from the same units. The - test measures whether the average score differs significantly - across samples (e.g. exams). If we observe a large p-value, for - example greater than 0.05 or 0.1 then we cannot reject the null - hypothesis of identical average scores. If the p-value is smaller - than the threshold, e.g. 1%, 5% or 10%, then we reject the null - hypothesis of equal averages. Small p-values are associated with - large t-statistics. 
- - References - ---------- - http://en.wikipedia.org/wiki/T-test#Dependent_t-test - - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(12345678) # fix random seed to get same numbers - - >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) - >>> rvs2 = (stats.norm.rvs(loc=5,scale=10,size=500) + - ... stats.norm.rvs(scale=0.2,size=500)) - >>> stats.ttest_rel(rvs1,rvs2) - (0.24101764965300962, 0.80964043445811562) - >>> rvs3 = (stats.norm.rvs(loc=8,scale=10,size=500) + - ... stats.norm.rvs(scale=0.2,size=500)) - >>> stats.ttest_rel(rvs1,rvs3) - (-3.9995108708727933, 7.3082402191726459e-005) - - """ - a, b, axis = _chk2_asarray(a, b, axis) - if a.shape[axis] != b.shape[axis]: - raise ValueError('unequal length arrays') - - if a.size == 0 or b.size == 0: - return (np.nan, np.nan) - - n = a.shape[axis] - df = float(n - 1) - - d = (a - b).astype(np.float64) - v = np.var(d, axis, ddof=1) - dm = np.mean(d, axis) - denom = np.sqrt(v / float(n)) - - t = np.divide(dm, denom) - t, prob = _ttest_finish(df, t) - - return t, prob - - -def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='approx'): - """ - Perform the Kolmogorov-Smirnov test for goodness of fit. - - This performs a test of the distribution G(x) of an observed - random variable against a given distribution F(x). Under the null - hypothesis the two distributions are identical, G(x)=F(x). The - alternative hypothesis can be either 'two-sided' (default), 'less' - or 'greater'. The KS test is only valid for continuous distributions. - - Parameters - ---------- - rvs : str, array or callable - If a string, it should be the name of a distribution in `scipy.stats`. - If an array, it should be a 1-D array of observations of random - variables. - If a callable, it should be a function to generate random variables; - it is required to have a keyword argument `size`. - cdf : str or callable - If a string, it should be the name of a distribution in `scipy.stats`. - If `rvs` is a string then `cdf` can be False or the same as `rvs`. - If a callable, that callable is used to calculate the cdf. - args : tuple, sequence, optional - Distribution parameters, used if `rvs` or `cdf` are strings. - N : int, optional - Sample size if `rvs` is string or callable. Default is 20. - alternative : {'two-sided', 'less','greater'}, optional - Defines the alternative hypothesis (see explanation above). - Default is 'two-sided'. - mode : 'approx' (default) or 'asymp', optional - Defines the distribution used for calculating the p-value. - - - 'approx' : use approximation to exact distribution of test statistic - - 'asymp' : use asymptotic distribution of test statistic - - Returns - ------- - D : float - KS test statistic, either D, D+ or D-. - p-value : float - One-tailed or two-tailed p-value. - - Notes - ----- - In the one-sided test, the alternative is that the empirical - cumulative distribution function of the random variable is "less" - or "greater" than the cumulative distribution function F(x) of the - hypothesis, ``G(x)<=F(x)``, resp. ``G(x)>=F(x)``. 
- - Examples - -------- - >>> from scipy import stats - - >>> x = np.linspace(-15, 15, 9) - >>> stats.kstest(x, 'norm') - (0.44435602715924361, 0.038850142705171065) - - >>> np.random.seed(987654321) # set random seed to get the same result - >>> stats.kstest('norm', False, N=100) - (0.058352892479417884, 0.88531190944151261) - - The above lines are equivalent to: - - >>> np.random.seed(987654321) - >>> stats.kstest(stats.norm.rvs(size=100), 'norm') - (0.058352892479417884, 0.88531190944151261) - - *Test against one-sided alternative hypothesis* - - Shift distribution to larger values, so that ``cdf_dgp(x) < norm.cdf(x)``: - - >>> np.random.seed(987654321) - >>> x = stats.norm.rvs(loc=0.2, size=100) - >>> stats.kstest(x,'norm', alternative = 'less') - (0.12464329735846891, 0.040989164077641749) - - Reject equal distribution against alternative hypothesis: less - - >>> stats.kstest(x,'norm', alternative = 'greater') - (0.0072115233216311081, 0.98531158590396395) - - Don't reject equal distribution against alternative hypothesis: greater - - >>> stats.kstest(x,'norm', mode='asymp') - (0.12464329735846891, 0.08944488871182088) - - *Testing t distributed random variables against normal distribution* - - With 100 degrees of freedom the t distribution looks close to the normal - distribution, and the K-S test does not reject the hypothesis that the - sample came from the normal distribution: - - >>> np.random.seed(987654321) - >>> stats.kstest(stats.t.rvs(100,size=100),'norm') - (0.072018929165471257, 0.67630062862479168) - - With 3 degrees of freedom the t distribution looks sufficiently different - from the normal distribution, that we can reject the hypothesis that the - sample came from the normal distribution at the 10% level: - - >>> np.random.seed(987654321) - >>> stats.kstest(stats.t.rvs(3,size=100),'norm') - (0.131016895759829, 0.058826222555312224) - - """ - if isinstance(rvs, string_types): - if (not cdf) or (cdf == rvs): - cdf = getattr(distributions, rvs).cdf - rvs = getattr(distributions, rvs).rvs - else: - raise AttributeError("if rvs is string, cdf has to be the " - "same distribution") - - if isinstance(cdf, string_types): - cdf = getattr(distributions, cdf).cdf - if callable(rvs): - kwds = {'size':N} - vals = np.sort(rvs(*args,**kwds)) - else: - vals = np.sort(rvs) - N = len(vals) - cdfvals = cdf(vals, *args) - - # to not break compatibility with existing code - if alternative == 'two_sided': - alternative = 'two-sided' - - if alternative in ['two-sided', 'greater']: - Dplus = (np.arange(1.0, N+1)/N - cdfvals).max() - if alternative == 'greater': - return Dplus, distributions.ksone.sf(Dplus,N) - - if alternative in ['two-sided', 'less']: - Dmin = (cdfvals - np.arange(0.0, N)/N).max() - if alternative == 'less': - return Dmin, distributions.ksone.sf(Dmin,N) - - if alternative == 'two-sided': - D = np.max([Dplus,Dmin]) - if mode == 'asymp': - return D, distributions.kstwobign.sf(D*np.sqrt(N)) - if mode == 'approx': - pval_two = distributions.kstwobign.sf(D*np.sqrt(N)) - if N > 2666 or pval_two > 0.80 - N*0.3/1000.0: - return D, distributions.kstwobign.sf(D*np.sqrt(N)) - else: - return D, distributions.ksone.sf(D,N)*2 - - -# Map from names to lambda_ values used in power_divergence(). -_power_div_lambda_names = { - "pearson": 1, - "log-likelihood": 0, - "freeman-tukey": -0.5, - "mod-log-likelihood": -1, - "neyman": -2, - "cressie-read": 2/3, -} - - -def _count(a, axis=None): - """ - Count the number of non-masked elements of an array. 
- - This function behaves like np.ma.count(), but is much faster - for ndarrays. - """ - if hasattr(a, 'count'): - num = a.count(axis=axis) - if isinstance(num, np.ndarray) and num.ndim == 0: - # In some cases, the `count` method returns a scalar array (e.g. - # np.array(3)), but we want a plain integer. - num = int(num) - else: - if axis is None: - num = a.size - else: - num = a.shape[axis] - return num - - -def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): - """ - Cressie-Read power divergence statistic and goodness of fit test. - - This function tests the null hypothesis that the categorical data - has the given frequencies, using the Cressie-Read power divergence - statistic. - - Parameters - ---------- - f_obs : array_like - Observed frequencies in each category. - f_exp : array_like, optional - Expected frequencies in each category. By default the categories are - assumed to be equally likely. - ddof : int, optional - "Delta degrees of freedom": adjustment to the degrees of freedom - for the p-value. The p-value is computed using a chi-squared - distribution with ``k - 1 - ddof`` degrees of freedom, where `k` - is the number of observed frequencies. The default value of `ddof` - is 0. - axis : int or None, optional - The axis of the broadcast result of `f_obs` and `f_exp` along which to - apply the test. If axis is None, all values in `f_obs` are treated - as a single data set. Default is 0. - lambda_ : float or str, optional - `lambda_` gives the power in the Cressie-Read power divergence - statistic. The default is 1. For convenience, `lambda_` may be - assigned one of the following strings, in which case the - corresponding numerical value is used:: - - String Value Description - "pearson" 1 Pearson's chi-squared statistic. - In this case, the function is - equivalent to `stats.chisquare`. - "log-likelihood" 0 Log-likelihood ratio. Also known as - the G-test [3]_. - "freeman-tukey" -1/2 Freeman-Tukey statistic. - "mod-log-likelihood" -1 Modified log-likelihood ratio. - "neyman" -2 Neyman's statistic. - "cressie-read" 2/3 The power recommended in [5]_. - - Returns - ------- - stat : float or ndarray - The Cressie-Read power divergence test statistic. The value is - a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D. - p : float or ndarray - The p-value of the test. The value is a float if `ddof` and the - return value `stat` are scalars. - - See Also - -------- - chisquare - - Notes - ----- - This test is invalid when the observed or expected frequencies in each - category are too small. A typical rule is that all of the observed - and expected frequencies should be at least 5. - - When `lambda_` is less than zero, the formula for the statistic involves - dividing by `f_obs`, so a warning or error may be generated if any value - in `f_obs` is 0. - - Similarly, a warning or error may be generated if any value in `f_exp` is - zero when `lambda_` >= 0. - - The default degrees of freedom, k-1, are for the case when no parameters - of the distribution are estimated. If p parameters are estimated by - efficient maximum likelihood then the correct degrees of freedom are - k-1-p. If the parameters are estimated in a different way, then the - dof can be between k-1-p and k-1. However, it is also possible that - the asymptotic distribution is not a chisquare, in which case this - test is not appropriate. - - This function handles masked arrays. 
If an element of `f_obs` or `f_exp` - is masked, then data at that position is ignored, and does not count - towards the size of the data set. - - .. versionadded:: 0.13.0 - - References - ---------- - .. [1] Lowry, Richard. "Concepts and Applications of Inferential - Statistics". Chapter 8. http://faculty.vassar.edu/lowry/ch8pt1.html - .. [2] "Chi-squared test", http://en.wikipedia.org/wiki/Chi-squared_test - .. [3] "G-test", http://en.wikipedia.org/wiki/G-test - .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and - practice of statistics in biological research", New York: Freeman - (1981) - .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit - Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), - pp. 440-464. - - Examples - -------- - - (See `chisquare` for more examples.) - - When just `f_obs` is given, it is assumed that the expected frequencies - are uniform and given by the mean of the observed frequencies. Here we - perform a G-test (i.e. use the log-likelihood ratio statistic): - - >>> power_divergence([16, 18, 16, 14, 12, 12], method='log-likelihood') - (2.006573162632538, 0.84823476779463769) - - The expected frequencies can be given with the `f_exp` argument: - - >>> power_divergence([16, 18, 16, 14, 12, 12], - ... f_exp=[16, 16, 16, 16, 16, 8], - ... lambda_='log-likelihood') - (3.5, 0.62338762774958223) - - When `f_obs` is 2-D, by default the test is applied to each column. - - >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T - >>> obs.shape - (6, 2) - >>> power_divergence(obs, lambda_="log-likelihood") - (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) - - By setting ``axis=None``, the test is applied to all data in the array, - which is equivalent to applying the test to the flattened array. - - >>> power_divergence(obs, axis=None) - (23.31034482758621, 0.015975692534127565) - >>> power_divergence(obs.ravel()) - (23.31034482758621, 0.015975692534127565) - - `ddof` is the change to make to the default degrees of freedom. - - >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1) - (2.0, 0.73575888234288467) - - The calculation of the p-values is done by broadcasting the - test statistic with `ddof`. - - >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) - (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) - - `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has - shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting - `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared - statistics, we must use ``axis=1``: - - >>> power_divergence([16, 18, 16, 14, 12, 12], - ... f_exp=[[16, 16, 16, 16, 16, 8], - ... [8, 20, 20, 16, 12, 12]], - ... axis=1) - (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) - - """ - # Convert the input argument `lambda_` to a numerical value. - if isinstance(lambda_, string_types): - if lambda_ not in _power_div_lambda_names: - names = repr(list(_power_div_lambda_names.keys()))[1:-1] - raise ValueError("invalid string for lambda_: {0!r}. Valid strings " - "are {1}".format(lambda_, names)) - lambda_ = _power_div_lambda_names[lambda_] - elif lambda_ is None: - lambda_ = 1 - - f_obs = np.asanyarray(f_obs) - - if f_exp is not None: - f_exp = np.atleast_1d(np.asanyarray(f_exp)) - else: - # Compute the equivalent of - # f_exp = f_obs.mean(axis=axis, keepdims=True) - # Older versions of numpy do not have the 'keepdims' argument, so - # we have to do a little work to achieve the same result. 
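The comment above refers to numpy's keepdims keyword; on a numpy version that has it, the manual reshape is equivalent to the one-line call, as this small editorial check (not part of the function) shows.

    import numpy as np

    f_obs = np.array([[16, 18, 16, 14, 12, 12],
                      [32, 24, 16, 28, 20, 24]], dtype=float).T
    axis = 0

    f_exp_keepdims = f_obs.mean(axis=axis, keepdims=True)   # modern numpy

    f_exp_manual = np.atleast_1d(f_obs.mean(axis=axis))     # workaround used here
    shape = list(f_obs.shape)
    shape[axis] = 1
    f_exp_manual = f_exp_manual.reshape(shape)

    assert np.allclose(f_exp_keepdims, f_exp_manual)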
- # Ignore 'invalid' errors so the edge case of a data set with length 0 - # is handled without spurious warnings. - with np.errstate(invalid='ignore'): - f_exp = np.atleast_1d(f_obs.mean(axis=axis)) - if axis is not None: - reduced_shape = list(f_obs.shape) - reduced_shape[axis] = 1 - f_exp.shape = reduced_shape - - # `terms` is the array of terms that are summed along `axis` to create - # the test statistic. We use some specialized code for a few special - # cases of lambda_. - if lambda_ == 1: - # Pearson's chi-squared statistic - terms = (f_obs - f_exp)**2 / f_exp - elif lambda_ == 0: - # Log-likelihood ratio (i.e. G-test) - terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) - elif lambda_ == -1: - # Modified log-likelihood ratio - terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) - else: - # General Cressie-Read power divergence. - terms = f_obs * ((f_obs / f_exp)**lambda_ - 1) - terms /= 0.5 * lambda_ * (lambda_ + 1) - - stat = terms.sum(axis=axis) - - num_obs = _count(terms, axis=axis) - ddof = asarray(ddof) - p = chisqprob(stat, num_obs - 1 - ddof) - - return stat, p - - -def chisquare(f_obs, f_exp=None, ddof=0, axis=0): - """ - Calculates a one-way chi square test. - - The chi square test tests the null hypothesis that the categorical data - has the given frequencies. - - Parameters - ---------- - f_obs : array_like - Observed frequencies in each category. - f_exp : array_like, optional - Expected frequencies in each category. By default the categories are - assumed to be equally likely. - ddof : int, optional - "Delta degrees of freedom": adjustment to the degrees of freedom - for the p-value. The p-value is computed using a chi-squared - distribution with ``k - 1 - ddof`` degrees of freedom, where `k` - is the number of observed frequencies. The default value of `ddof` - is 0. - axis : int or None, optional - The axis of the broadcast result of `f_obs` and `f_exp` along which to - apply the test. If axis is None, all values in `f_obs` are treated - as a single data set. Default is 0. - - Returns - ------- - chisq : float or ndarray - The chi-squared test statistic. The value is a float if `axis` is - None or `f_obs` and `f_exp` are 1-D. - p : float or ndarray - The p-value of the test. The value is a float if `ddof` and the - return value `chisq` are scalars. - - See Also - -------- - power_divergence - mstats.chisquare - - Notes - ----- - This test is invalid when the observed or expected frequencies in each - category are too small. A typical rule is that all of the observed - and expected frequencies should be at least 5. - - The default degrees of freedom, k-1, are for the case when no parameters - of the distribution are estimated. If p parameters are estimated by - efficient maximum likelihood then the correct degrees of freedom are - k-1-p. If the parameters are estimated in a different way, then the - dof can be between k-1-p and k-1. However, it is also possible that - the asymptotic distribution is not a chisquare, in which case this - test is not appropriate. - - References - ---------- - .. [1] Lowry, Richard. "Concepts and Applications of Inferential - Statistics". Chapter 8. http://faculty.vassar.edu/lowry/ch8pt1.html - .. [2] "Chi-squared test", http://en.wikipedia.org/wiki/Chi-squared_test - - Examples - -------- - When just `f_obs` is given, it is assumed that the expected frequencies - are uniform and given by the mean of the observed frequencies. 
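For the uniform-expectation case just described, the Pearson statistic (lambda_ = 1) is easy to verify by hand; with the six observed counts used in the examples it works out to 2.0, matching the chisquare example below.

    import numpy as np
    from scipy.stats import chi2

    f_obs = np.array([16., 18., 16., 14., 12., 12.])
    f_exp = np.full_like(f_obs, f_obs.mean())        # 14.666... in each category

    stat = ((f_obs - f_exp) ** 2 / f_exp).sum()      # lambda_ == 1 (Pearson)
    p = chi2.sf(stat, df=len(f_obs) - 1)             # ddof == 0
    # stat -> 2.0, p -> 0.849..., matching the example below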
- - >>> chisquare([16, 18, 16, 14, 12, 12]) - (2.0, 0.84914503608460956) - - With `f_exp` the expected frequencies can be given. - - >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8]) - (3.5, 0.62338762774958223) - - When `f_obs` is 2-D, by default the test is applied to each column. - - >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T - >>> obs.shape - (6, 2) - >>> chisquare(obs) - (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415])) - - By setting ``axis=None``, the test is applied to all data in the array, - which is equivalent to applying the test to the flattened array. - - >>> chisquare(obs, axis=None) - (23.31034482758621, 0.015975692534127565) - >>> chisquare(obs.ravel()) - (23.31034482758621, 0.015975692534127565) - - `ddof` is the change to make to the default degrees of freedom. - - >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1) - (2.0, 0.73575888234288467) - - The calculation of the p-values is done by broadcasting the - chi-squared statistic with `ddof`. - - >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) - (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) - - `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has - shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting - `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared - statistics, we use ``axis=1``: - - >>> chisquare([16, 18, 16, 14, 12, 12], - ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]], - ... axis=1) - (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) - - """ - return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, - lambda_="pearson") - - -def ks_2samp(data1, data2): - """ - Computes the Kolmogorov-Smirnov statistic on 2 samples. - - This is a two-sided test for the null hypothesis that 2 independent samples - are drawn from the same continuous distribution. - - Parameters - ---------- - a, b : sequence of 1-D ndarrays - two arrays of sample observations assumed to be drawn from a continuous - distribution, sample sizes can be different - - Returns - ------- - D : float - KS statistic - p-value : float - two-tailed p-value - - Notes - ----- - This tests whether 2 samples are drawn from the same distribution. Note - that, like in the case of the one-sample K-S test, the distribution is - assumed to be continuous. - - This is the two-sided test, one-sided tests are not implemented. - The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution. - - If the K-S statistic is small or the p-value is high, then we cannot - reject the hypothesis that the distributions of the two samples - are the same. 
- - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(12345678) #fix random seed to get the same result - >>> n1 = 200 # size of first sample - >>> n2 = 300 # size of second sample - - For a different distribution, we can reject the null hypothesis since the - pvalue is below 1%: - - >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1) - >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5) - >>> stats.ks_2samp(rvs1, rvs2) - (0.20833333333333337, 4.6674975515806989e-005) - - For a slightly different distribution, we cannot reject the null hypothesis - at a 10% or lower alpha since the p-value at 0.144 is higher than 10% - - >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0) - >>> stats.ks_2samp(rvs1, rvs3) - (0.10333333333333333, 0.14498781825751686) - - For an identical distribution, we cannot reject the null hypothesis since - the p-value is high, 41%: - - >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0) - >>> stats.ks_2samp(rvs1, rvs4) - (0.07999999999999996, 0.41126949729859719) - - """ - data1, data2 = map(asarray, (data1, data2)) - n1 = data1.shape[0] - n2 = data2.shape[0] - n1 = len(data1) - n2 = len(data2) - data1 = np.sort(data1) - data2 = np.sort(data2) - data_all = np.concatenate([data1,data2]) - cdf1 = np.searchsorted(data1,data_all,side='right')/(1.0*n1) - cdf2 = (np.searchsorted(data2,data_all,side='right'))/(1.0*n2) - d = np.max(np.absolute(cdf1-cdf2)) - # Note: d absolute not signed distance - en = np.sqrt(n1*n2/float(n1+n2)) - try: - prob = ksprob((en+0.12+0.11/en)*d) - except: - prob = 1.0 - return d, prob - - -def mannwhitneyu(x, y, use_continuity=True): - """ - Computes the Mann-Whitney rank test on samples x and y. - - Parameters - ---------- - x, y : array_like - Array of samples, should be one-dimensional. - use_continuity : bool, optional - Whether a continuity correction (1/2.) should be taken into - account. Default is True. - - Returns - ------- - u : float - The Mann-Whitney statistics. - prob : float - One-sided p-value assuming a asymptotic normal distribution. - - Notes - ----- - Use only when the number of observation in each sample is > 20 and - you have 2 independent samples of ranks. Mann-Whitney U is - significant if the u-obtained is LESS THAN or equal to the critical - value of U. - - This test corrects for ties and by default uses a continuity correction. - The reported p-value is for a one-sided hypothesis, to get the two-sided - p-value multiply the returned p-value by 2. - - """ - x = asarray(x) - y = asarray(y) - n1 = len(x) - n2 = len(y) - ranked = rankdata(np.concatenate((x,y))) - rankx = ranked[0:n1] # get the x-ranks - u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx,axis=0) # calc U for x - u2 = n1*n2 - u1 # remainder is U for y - bigu = max(u1,u2) - smallu = min(u1,u2) - T = tiecorrect(ranked) - if T == 0: - raise ValueError('All numbers are identical in amannwhitneyu') - sd = np.sqrt(T*n1*n2*(n1+n2+1)/12.0) - - if use_continuity: - # normal approximation for prob calc with continuity correction - z = abs((bigu-0.5-n1*n2/2.0) / sd) - else: - z = abs((bigu-n1*n2/2.0) / sd) # normal approximation for prob calc - return smallu, distributions.norm.sf(z) # (1.0 - zprob(z)) - - -def ranksums(x, y): - """ - Compute the Wilcoxon rank-sum statistic for two samples. - - The Wilcoxon rank-sum test tests the null hypothesis that two sets - of measurements are drawn from the same distribution. The alternative - hypothesis is that values in one sample are more likely to be - larger than the values in the other sample. 
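The two-sample statistic computed by ks_2samp above is simply the largest gap between the two empirical CDFs. A minimal sketch, loosely mirroring the docstring example (exact numbers will differ because the random draws are set up differently here):

    import numpy as np
    from scipy.stats import norm
    from scipy.special import kolmogorov

    rvs1 = np.sort(norm.rvs(size=200, loc=0.0, scale=1.0, random_state=12345678))
    rvs2 = np.sort(norm.rvs(size=300, loc=0.5, scale=1.5, random_state=12345678))
    n1, n2 = len(rvs1), len(rvs2)

    data_all = np.concatenate([rvs1, rvs2])
    cdf1 = np.searchsorted(rvs1, data_all, side='right') / float(n1)
    cdf2 = np.searchsorted(rvs2, data_all, side='right') / float(n2)
    d = np.max(np.abs(cdf1 - cdf2))                  # KS statistic

    en = np.sqrt(n1 * n2 / float(n1 + n2))
    prob = kolmogorov((en + 0.12 + 0.11 / en) * d)   # asymptotic two-sided p-value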
- - This test should be used to compare two samples from continuous - distributions. It does not handle ties between measurements - in x and y. For tie-handling and an optional continuity correction - see `scipy.stats.mannwhitneyu`. - - Parameters - ---------- - x,y : array_like - The data from the two samples - - Returns - ------- - z-statistic : float - The test statistic under the large-sample approximation that the - rank sum statistic is normally distributed - p-value : float - The two-sided p-value of the test - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test - - """ - x,y = map(np.asarray, (x, y)) - n1 = len(x) - n2 = len(y) - alldata = np.concatenate((x,y)) - ranked = rankdata(alldata) - x = ranked[:n1] - y = ranked[n1:] - s = np.sum(x,axis=0) - expected = n1*(n1+n2+1) / 2.0 - z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0) - prob = 2 * distributions.norm.sf(abs(z)) - return z, prob - - -def kruskal(*args): - """ - Compute the Kruskal-Wallis H-test for independent samples - - The Kruskal-Wallis H-test tests the null hypothesis that the population - median of all of the groups are equal. It is a non-parametric version of - ANOVA. The test works on 2 or more independent samples, which may have - different sizes. Note that rejecting the null hypothesis does not - indicate which of the groups differs. Post-hoc comparisons between - groups are required to determine which groups are different. - - Parameters - ---------- - sample1, sample2, ... : array_like - Two or more arrays with the sample measurements can be given as - arguments. - - Returns - ------- - H-statistic : float - The Kruskal-Wallis H statistic, corrected for ties - p-value : float - The p-value for the test using the assumption that H has a chi - square distribution - - Notes - ----- - Due to the assumption that H has a chi square distribution, the number - of samples in each group must not be too small. A typical rule is - that each sample must have at least 5 measurements. - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance - - """ - args = list(map(np.asarray, args)) # convert to a numpy array - na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array - if na < 2: - raise ValueError("Need at least two groups in stats.kruskal()") - n = np.asarray(list(map(len, args))) - - alldata = np.concatenate(args) - - ranked = rankdata(alldata) # Rank the data - T = tiecorrect(ranked) # Correct for ties - if T == 0: - raise ValueError('All numbers are identical in kruskal') - - # Compute sum^2/n for each group and sum - j = np.insert(np.cumsum(n), 0, 0) - ssbn = 0 - for i in range(na): - ssbn += square_of_sums(ranked[j[i]:j[i+1]]) / float(n[i]) - - totaln = np.sum(n) - h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) - df = na - 1 - h = h / float(T) - return h, chisqprob(h, df) - - -def friedmanchisquare(*args): - """ - Computes the Friedman test for repeated measurements - - The Friedman test tests the null hypothesis that repeated measurements of - the same individuals have the same distribution. It is often used - to test for consistency among measurements obtained in different ways. - For example, if two measurement techniques are used on the same set of - individuals, the Friedman test can be used to determine if the two - measurement techniques are consistent. - - Parameters - ---------- - measurements1, measurements2, measurements3... : array_like - Arrays of measurements. 
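For data without ties, the H statistic assembled by kruskal above reduces to the textbook formula; a small sketch with made-up measurements, assuming scipy.stats.rankdata is available:

    import numpy as np
    from scipy.stats import rankdata, chi2

    groups = [np.array([2.9, 3.0, 2.5, 2.6, 3.2]),
              np.array([3.8, 2.7, 4.0, 2.4]),
              np.array([2.8, 3.4, 3.7, 2.2, 2.0])]
    n = np.array([len(g) for g in groups])

    ranked = rankdata(np.concatenate(groups))
    edges = np.insert(np.cumsum(n), 0, 0)
    ssbn = sum(ranked[edges[i]:edges[i + 1]].sum() ** 2 / float(n[i])
               for i in range(len(groups)))          # sum of (rank sums)^2 / n_i

    totaln = n.sum()
    h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
    p = chi2.sf(h, df=len(groups) - 1)               # no tie correction needed here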
All of the arrays must have the same number - of elements. At least 3 sets of measurements must be given. - - Returns - ------- - friedman chi-square statistic : float - the test statistic, correcting for ties - p-value : float - the associated p-value assuming that the test statistic has a chi - squared distribution - - Notes - ----- - Due to the assumption that the test statistic has a chi squared - distribution, the p-value is only reliable for n > 10 and more than - 6 repeated measurements. - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Friedman_test - - """ - k = len(args) - if k < 3: - raise ValueError('\nLess than 3 levels. Friedman test not appropriate.\n') - - n = len(args[0]) - for i in range(1, k): - if len(args[i]) != n: - raise ValueError('Unequal N in friedmanchisquare. Aborting.') - - # Rank data - data = np.vstack(args).T - data = data.astype(float) - for i in range(len(data)): - data[i] = rankdata(data[i]) - - # Handle ties - ties = 0 - for i in range(len(data)): - replist, repnum = find_repeats(array(data[i])) - for t in repnum: - ties += t*(t*t-1) - c = 1 - ties / float(k*(k*k-1)*n) - - ssbn = pysum(pysum(data)**2) - chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c - return chisq, chisqprob(chisq,k-1) - - -##################################### -#### PROBABILITY CALCULATIONS #### -##################################### - -zprob = special.ndtr - - -def chisqprob(chisq, df): - """ - Probability value (1-tail) for the Chi^2 probability distribution. - - Broadcasting rules apply. - - Parameters - ---------- - chisq : array_like or float > 0 - - df : array_like or float, probably int >= 1 - - Returns - ------- - chisqprob : ndarray - The area from `chisq` to infinity under the Chi^2 probability - distribution with degrees of freedom `df`. - - """ - return special.chdtrc(df,chisq) - -ksprob = special.kolmogorov -fprob = special.fdtrc - - -def betai(a, b, x): - """ - Returns the incomplete beta function. - - I_x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) - - where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma - function of a. - - The standard broadcasting rules apply to a, b, and x. - - Parameters - ---------- - a : array_like or float > 0 - - b : array_like or float > 0 - - x : array_like or float - x will be clipped to be no greater than 1.0 . - - Returns - ------- - betai : ndarray - Incomplete beta function. - - """ - x = np.asarray(x) - x = np.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0 - return special.betainc(a, b, x) - - -##################################### -####### ANOVA CALCULATIONS ####### -##################################### - -def f_value_wilks_lambda(ER, EF, dfnum, dfden, a, b): - """Calculation of Wilks lambda F-statistic for multivarite data, per - Maxwell & Delaney p.657. - """ - if isinstance(ER, (int, float)): - ER = array([[ER]]) - if isinstance(EF, (int, float)): - EF = array([[EF]]) - lmbda = linalg.det(EF) / linalg.det(ER) - if (a-1)**2 + (b-1)**2 == 5: - q = 1 - else: - q = np.sqrt(((a-1)**2*(b-1)**2 - 2) / ((a-1)**2 + (b-1)**2 - 5)) - n_um = (1 - lmbda**(1.0/q))*(a-1)*(b-1) - d_en = lmbda**(1.0/q) / (n_um*q - 0.5*(a-1)*(b-1) + 1) - return n_um / d_en - - -def f_value(ER, EF, dfR, dfF): - """ - Returns an F-statistic for a restricted vs. unrestricted model. 
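The statistic returned by f_value is the usual ratio of the per-degree-of-freedom drop in residual sum of squares to the full-model mean squared error. A hedged sketch with made-up sums of squares; the p-value via scipy.stats.f is not part of the function itself:

    from scipy.stats import f as f_dist

    ER, dfR = 150.0, 28   # restricted (null) model: residual SS and df
    EF, dfF = 120.0, 26   # full model: residual SS and df

    F = ((ER - EF) / float(dfR - dfF)) / (EF / float(dfF))
    p = f_dist.sf(F, dfR - dfF, dfF)   # right-tail p-value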
- - Parameters - ---------- - ER : float - `ER` is the sum of squared residuals for the restricted model - or null hypothesis - - EF : float - `EF` is the sum of squared residuals for the unrestricted model - or alternate hypothesis - - dfR : int - `dfR` is the degrees of freedom in the restricted model - - dfF : int - `dfF` is the degrees of freedom in the unrestricted model - - Returns - ------- - F-statistic : float - - """ - return ((ER-EF)/float(dfR-dfF) / (EF/float(dfF))) - - -def f_value_multivariate(ER, EF, dfnum, dfden): - """ - Returns a multivariate F-statistic. - - Parameters - ---------- - ER : ndarray - Error associated with the null hypothesis (the Restricted model). - From a multivariate F calculation. - EF : ndarray - Error associated with the alternate hypothesis (the Full model) - From a multivariate F calculation. - dfnum : int - Degrees of freedom the Restricted model. - dfden : int - Degrees of freedom associated with the Restricted model. - - Returns - ------- - fstat : float - The computed F-statistic. - - """ - if isinstance(ER, (int, float)): - ER = array([[ER]]) - if isinstance(EF, (int, float)): - EF = array([[EF]]) - n_um = (linalg.det(ER) - linalg.det(EF)) / float(dfnum) - d_en = linalg.det(EF) / float(dfden) - return n_um / d_en - - -##################################### -####### SUPPORT FUNCTIONS ######## -##################################### - -def ss(a, axis=0): - """ - Squares each element of the input array, and returns the sum(s) of that. - - Parameters - ---------- - a : array_like - Input array. - axis : int or None, optional - The axis along which to calculate. If None, use whole array. - Default is 0, i.e. along the first axis. - - Returns - ------- - ss : ndarray - The sum along the given axis for (a**2). - - See also - -------- - square_of_sums : The square(s) of the sum(s) (the opposite of `ss`). - - Examples - -------- - >>> from scipy import stats - >>> a = np.array([1., 2., 5.]) - >>> stats.ss(a) - 30.0 - - And calculating along an axis: - - >>> b = np.array([[1., 2., 5.], [2., 5., 6.]]) - >>> stats.ss(b, axis=1) - array([ 30., 65.]) - - """ - a, axis = _chk_asarray(a, axis) - return np.sum(a*a, axis) - - -def square_of_sums(a, axis=0): - """ - Sums elements of the input array, and returns the square(s) of that sum. - - Parameters - ---------- - a : array_like - Input array. - axis : int or None, optional - If axis is None, ravel `a` first. If `axis` is an integer, this will - be the axis over which to operate. Defaults to 0. - - Returns - ------- - square_of_sums : float or ndarray - The square of the sum over `axis`. - - See also - -------- - ss : The sum of squares (the opposite of `square_of_sums`). - - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(20).reshape(5,4) - >>> stats.square_of_sums(a) - array([ 1600., 2025., 2500., 3025.]) - >>> stats.square_of_sums(a, axis=None) - 36100.0 - - """ - a, axis = _chk_asarray(a, axis) - s = np.sum(a,axis) - if not np.isscalar(s): - return s.astype(float)*s - else: - return float(s)*s - - -def fastsort(a): - """ - Sort an array and provide the argsort. - - Parameters - ---------- - a : array_like - Input array. - - Returns - ------- - fastsort : ndarray of type int - sorted indices into the original array - - """ - # TODO: the wording in the docstring is nonsense. - it = np.argsort(a) - as_ = a[it] - return as_, it +# Copyright (c) Gary Strangman. All rights reserved +# +# Disclaimer +# +# This software is provided "as-is". 
There are no expressed or implied +# warranties of any kind, including, but not limited to, the warranties +# of merchantability and fitness for a given application. In no event +# shall Gary Strangman be liable for any direct, indirect, incidental, +# special, exemplary or consequential damages (including, but not limited +# to, loss of use, data or profits, or business interruption) however +# caused and on any theory of liability, whether in contract, strict +# liability or tort (including negligence or otherwise) arising in any way +# out of the use of this software, even if advised of the possibility of +# such damage. +# + +# +# Heavily adapted for use by SciPy 2002 by Travis Oliphant +""" +A collection of basic statistical functions for python. The function +names appear below. + + Some scalar functions defined here are also available in the scipy.special + package where they work on arbitrary sized arrays. + +Disclaimers: The function list is obviously incomplete and, worse, the +functions are not optimized. All functions have been tested (some more +so than others), but they are far from bulletproof. Thus, as with any +free software, no warranty or guarantee is expressed or implied. :-) A +few extra functions that don't appear in the list below can be found by +interested treasure-hunters. These functions don't necessarily have +both list and array versions but were deemed useful. + +Central Tendency +---------------- +.. autosummary:: + :toctree: generated/ + + gmean + hmean + mode + +Moments +------- +.. autosummary:: + :toctree: generated/ + + moment + variation + skew + kurtosis + normaltest + +Moments Handling NaN: + +.. autosummary:: + :toctree: generated/ + + nanmean + nanmedian + nanstd + +Altered Versions +---------------- +.. autosummary:: + :toctree: generated/ + + tmean + tvar + tstd + tsem + describe + +Frequency Stats +--------------- +.. autosummary:: + :toctree: generated/ + + itemfreq + scoreatpercentile + percentileofscore + histogram + cumfreq + relfreq + +Variability +----------- +.. autosummary:: + :toctree: generated/ + + obrientransform + signaltonoise + sem + +Trimming Functions +------------------ +.. autosummary:: + :toctree: generated/ + + threshold + trimboth + trim1 + +Correlation Functions +--------------------- +.. autosummary:: + :toctree: generated/ + + pearsonr + fisher_exact + spearmanr + pointbiserialr + kendalltau + linregress + +Inferential Stats +----------------- +.. autosummary:: + :toctree: generated/ + + ttest_1samp + ttest_ind + ttest_rel + chisquare + power_divergence + ks_2samp + mannwhitneyu + ranksums + wilcoxon + kruskal + friedmanchisquare + +Probability Calculations +------------------------ +.. autosummary:: + :toctree: generated/ + + chisqprob + zprob + fprob + betai + +ANOVA Functions +--------------- +.. autosummary:: + :toctree: generated/ + + f_oneway + f_value + +Support Functions +----------------- +.. autosummary:: + :toctree: generated/ + + ss + square_of_sums + rankdata + +References +---------- +.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + +""" + +from __future__ import division, print_function, absolute_import + +import warnings +import math + +#from .six import xrange + +# friedmanchisquare patch uses python sum +pysum = sum # save it before it gets overwritten + +# Scipy imports. 
+from scipy.lib.six import callable, string_types +from numpy import array, asarray, ma, zeros, sum +import scipy.special as special +import scipy.linalg as linalg +import numpy as np + +from . import futil +from . import distributions +try: + from scipy.stats._rank import rankdata, tiecorrect +except: + rankdata = tiecorrect = None +__all__ = ['find_repeats', 'gmean', 'hmean', 'mode', + 'tmean', 'tvar', 'tmin', 'tmax', 'tstd', 'tsem', + 'moment', 'variation', 'skew', 'kurtosis', 'describe', + 'skewtest', 'kurtosistest', 'normaltest', 'jarque_bera', + 'itemfreq', 'scoreatpercentile', 'percentileofscore', + 'histogram', 'histogram2', 'cumfreq', 'relfreq', + 'obrientransform', 'signaltonoise', 'sem', 'zmap', 'zscore', + 'threshold', 'sigmaclip', 'trimboth', 'trim1', 'trim_mean', + 'f_oneway', 'pearsonr', 'fisher_exact', + 'spearmanr', 'pointbiserialr', 'kendalltau', 'linregress', + 'ttest_1samp', 'ttest_ind', 'ttest_rel', 'kstest', + 'chisquare', 'power_divergence', 'ks_2samp', 'mannwhitneyu', + 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare', + 'zprob', 'chisqprob', 'ksprob', 'fprob', 'betai', + 'f_value_wilks_lambda', 'f_value', 'f_value_multivariate', + 'ss', 'square_of_sums', + 'fastsort', 'rankdata', + 'nanmean', 'nanstd', 'nanmedian', + ] + + +def _chk_asarray(a, axis): + if axis is None: + a = np.ravel(a) + outaxis = 0 + else: + a = np.asarray(a) + outaxis = axis + return a, outaxis + + +def _chk2_asarray(a, b, axis): + if axis is None: + a = np.ravel(a) + b = np.ravel(b) + outaxis = 0 + else: + a = np.asarray(a) + b = np.asarray(b) + outaxis = axis + return a, b, outaxis + + +def find_repeats(arr): + """ + Find repeats and repeat counts. + + Parameters + ---------- + arr : array_like + Input array + + Returns + ------- + find_repeats : tuple + Returns a tuple of two 1-D ndarrays. The first ndarray are the repeats + as sorted, unique values that are repeated in `arr`. The second + ndarray are the counts mapped one-to-one of the repeated values + in the first ndarray. + + Examples + -------- + >>> sp.stats.find_repeats([2, 1, 2, 3, 2, 2, 5]) + (array([ 2. ]), array([ 4 ], dtype=int32) + + >>> sp.stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) + (array([ 4., 5.]), array([2, 2], dtype=int32)) + + """ + v1, v2, n = futil.dfreps(arr) + return v1[:n], v2[:n] + +# +# NAN friendly functions +# + + +def nanmean(x, axis=0): + """ + Compute the mean over the given axis ignoring nans. + + Parameters + ---------- + x : ndarray + Input array. + axis : int, optional + Axis along which the mean is computed. Default is 0, i.e. the + first axis. + + Returns + ------- + m : float + The mean of `x`, ignoring nans. + + See Also + -------- + nanstd, nanmedian + + Examples + -------- + >>> from scipy import stats + >>> a = np.linspace(0, 4, 3) + >>> a + array([ 0., 2., 4.]) + >>> a[-1] = np.nan + >>> stats.nanmean(a) + 1.0 + + """ + x, axis = _chk_asarray(x, axis) + x = x.copy() + Norig = x.shape[axis] + mask = np.isnan(x) + factor = 1.0 - np.sum(mask, axis) / Norig + + x[mask] = 0.0 + return np.mean(x, axis) / factor + + +def nanstd(x, axis=0, bias=False): + """ + Compute the standard deviation over the given axis, ignoring nans. + + Parameters + ---------- + x : array_like + Input array. + axis : int or None, optional + Axis along which the standard deviation is computed. Default is 0. + If None, compute over the whole array `x`. + bias : bool, optional + If True, the biased (normalized by N) definition is used. If False + (default), the unbiased definition is used. 
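The scaling trick used by nanmean above (zero out the NaNs, then rescale by the fraction of valid entries) can be checked against a direct computation on the valid entries only; a tiny example:

    import numpy as np

    x = np.array([0., 2., np.nan, 4.])
    mask = np.isnan(x)

    # approach used above: replace NaNs by 0 and divide by the valid fraction
    factor = 1.0 - mask.sum() / float(x.size)
    xm = np.where(mask, 0.0, x)
    m1 = xm.mean() / factor

    # direct check on the valid entries only
    m2 = x[~mask].mean()
    assert np.isclose(m1, m2)   # both give 2.0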
+ + Returns + ------- + s : float + The standard deviation. + + See Also + -------- + nanmean, nanmedian + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(10, dtype=float) + >>> a[1:3] = np.nan + >>> np.std(a) + nan + >>> stats.nanstd(a) + 2.9154759474226504 + >>> stats.nanstd(a.reshape(2, 5), axis=1) + array([ 2.0817, 1.5811]) + >>> stats.nanstd(a.reshape(2, 5), axis=None) + 2.9154759474226504 + + """ + x, axis = _chk_asarray(x, axis) + x = x.copy() + Norig = x.shape[axis] + + mask = np.isnan(x) + Nnan = np.sum(mask, axis) * 1.0 + n = Norig - Nnan + + x[mask] = 0.0 + m1 = np.sum(x, axis) / n + + if axis: + d = x - np.expand_dims(m1, axis) + else: + d = x - m1 + + d *= d + + m2 = np.sum(d, axis) - m1 * m1 * Nnan + + if bias: + m2c = m2 / n + else: + m2c = m2 / (n - 1.0) + + return np.sqrt(m2c) + + +def _nanmedian(arr1d): # This only works on 1d arrays + """Private function for rank a arrays. Compute the median ignoring Nan. + + Parameters + ---------- + arr1d : ndarray + Input array, of rank 1. + + Results + ------- + m : float + The median. + """ + cond = 1 - np.isnan(arr1d) + x = np.sort(np.compress(cond, arr1d, axis=-1)) + if x.size == 0: + return np.nan + return np.median(x) + + +def nanmedian(x, axis=0): + """ + Compute the median along the given axis ignoring nan values. + + Parameters + ---------- + x : array_like + Input array. + axis : int, optional + Axis along which the median is computed. Default is 0, i.e. the + first axis. + + Returns + ------- + m : float + The median of `x` along `axis`. + + See Also + -------- + nanstd, nanmean + + Examples + -------- + >>> from scipy import stats + >>> a = np.array([0, 3, 1, 5, 5, np.nan]) + >>> stats.nanmedian(a) + array(3.0) + + >>> b = np.array([0, 3, 1, 5, 5, np.nan, 5]) + >>> stats.nanmedian(b) + array(4.0) + + Example with axis: + + >>> c = np.arange(30.).reshape(5,6) + >>> idx = np.array([False, False, False, True, False] * 6).reshape(5,6) + >>> c[idx] = np.nan + >>> c + array([[ 0., 1., 2., nan, 4., 5.], + [ 6., 7., nan, 9., 10., 11.], + [ 12., nan, 14., 15., 16., 17.], + [ nan, 19., 20., 21., 22., nan], + [ 24., 25., 26., 27., nan, 29.]]) + >>> stats.nanmedian(c, axis=1) + array([ 2. , 9. , 15. , 20.5, 26. ]) + + """ + x, axis = _chk_asarray(x, axis) + if x.ndim == 0: + return float(x.item()) + x = x.copy() + x = np.apply_along_axis(_nanmedian, axis, x) + if x.ndim == 0: + x = float(x.item()) + return x + + +# +# CENTRAL TENDENCY ######## +# + + +def gmean(a, axis=0, dtype=None): + """ + Compute the geometric mean along the specified axis. + + Returns the geometric average of the array elements. + That is: n-th root of (x1 * x2 * ... * xn) + + Parameters + ---------- + a : array_like + Input array or object that can be converted to an array. + axis : int, optional, default axis=0 + Axis along which the geometric mean is computed. + dtype : dtype, optional + Type of the returned array and of the accumulator in which the + elements are summed. If dtype is not specified, it defaults to the + dtype of a, unless a has an integer dtype with a precision less than + that of the default platform integer. In that case, the default + platform integer is used. + + Returns + ------- + gmean : ndarray + see dtype parameter above + + See Also + -------- + numpy.mean : Arithmetic average + numpy.average : Weighted average + hmean : Harmonic mean + + Notes + ----- + The geometric average is computed over a single dimension of the input + array, axis=0 by default, or all values in the array if axis=None. 
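The geometric mean described here is computed on the log scale, which avoids overflow from multiplying many values directly; a quick numeric check (any strictly positive data works):

    import numpy as np

    a = np.array([1.0, 2.0, 4.0, 8.0])

    g_log = np.exp(np.log(a).mean())      # exp(mean(log x)), as in the log-based approach
    g_prod = a.prod() ** (1.0 / len(a))   # n-th root of the product
    assert np.isclose(g_log, g_prod)      # both give 2 * sqrt(2) ~ 2.828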
+ float64 intermediate and return values are used for integer inputs. + + Use masked arrays to ignore any non-finite values in the input or that + arise in the calculations such as Not a Number and infinity because masked + arrays automatically mask any non-finite values. + + """ + # if not an ndarray object attempt to convert it + if not isinstance(a, np.ndarray): + log_a = np.log(np.array(a, dtype=dtype)) + elif dtype: # Must change the default dtype allowing array type + if isinstance(a, np.ma.MaskedArray): + log_a = np.log(np.ma.asarray(a, dtype=dtype)) + else: + log_a = np.log(np.asarray(a, dtype=dtype)) + else: + log_a = np.log(a) + return np.exp(log_a.mean(axis=axis)) + + +def hmean(a, axis=0, dtype=None): + """ + Calculates the harmonic mean along the specified axis. + + That is: n / (1/x1 + 1/x2 + ... + 1/xn) + + Parameters + ---------- + a : array_like + Input array, masked array or object that can be converted to an array. + axis : int, optional, default axis=0 + Axis along which the harmonic mean is computed. + dtype : dtype, optional + Type of the returned array and of the accumulator in which the + elements are summed. If `dtype` is not specified, it defaults to the + dtype of `a`, unless `a` has an integer `dtype` with a precision less + than that of the default platform integer. In that case, the default + platform integer is used. + + Returns + ------- + hmean : ndarray + see `dtype` parameter above + + See Also + -------- + numpy.mean : Arithmetic average + numpy.average : Weighted average + gmean : Geometric mean + + Notes + ----- + The harmonic mean is computed over a single dimension of the input + array, axis=0 by default, or all values in the array if axis=None. + float64 intermediate and return values are used for integer inputs. + + Use masked arrays to ignore any non-finite values in the input or that + arise in the calculations such as Not a Number and infinity. + + """ + if not isinstance(a, np.ndarray): + a = np.array(a, dtype=dtype) + if np.all(a > 0): # Harmonic mean only defined if greater than zero + if isinstance(a, np.ma.MaskedArray): + size = a.count(axis) + else: + if axis is None: + a = a.ravel() + size = a.shape[0] + else: + size = a.shape[axis] + return size / np.sum(1.0 / a, axis=axis, dtype=dtype) + else: + raise ValueError( + "Harmonic mean only defined if all elements greater than zero") + + +def mode(a, axis=0): + """ + Returns an array of the modal (most common) value in the passed array. + + If there is more than one such value, only the first is returned. + The bin-count for the modal bins is also returned. + + Parameters + ---------- + a : array_like + n-dimensional array of which to find mode(s). + axis : int, optional + Axis along which to operate. Default is 0, i.e. the first axis. + + Returns + ------- + vals : ndarray + Array of modal values. + counts : ndarray + Array of counts for each mode. 
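hmean above is just the count divided by the sum of reciprocals; a one-line check on strictly positive data:

    import numpy as np

    a = np.array([1.0, 2.0, 4.0])
    h = len(a) / np.sum(1.0 / a)   # n / (1/x1 + ... + 1/xn) -> 12/7 ~ 1.714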
+ + Examples + -------- + >>> a = np.array([[6, 8, 3, 0], + [3, 2, 1, 7], + [8, 1, 8, 4], + [5, 3, 0, 5], + [4, 7, 5, 9]]) + >>> from scipy import stats + >>> stats.mode(a) + (array([[ 3., 1., 0., 0.]]), array([[ 1., 1., 1., 1.]])) + + To get mode of whole array, specify axis=None: + + >>> stats.mode(a, axis=None) + (array([ 3.]), array([ 3.])) + + """ + a, axis = _chk_asarray(a, axis) + scores = np.unique(np.ravel(a)) # get ALL unique values + testshape = list(a.shape) + testshape[axis] = 1 + oldmostfreq = np.zeros(testshape) + oldcounts = np.zeros(testshape) + for score in scores: + template = (a == score) + counts = np.expand_dims(np.sum(template, axis), axis) + mostfrequent = np.where(counts > oldcounts, score, oldmostfreq) + oldcounts = np.maximum(counts, oldcounts) + oldmostfreq = mostfrequent + return mostfrequent, oldcounts + + +def mask_to_limits(a, limits, inclusive): + """Mask an array for values outside of given limits. + + This is primarily a utility function. + + Parameters + ---------- + a : array + limits : (float or None, float or None) + A tuple consisting of the (lower limit, upper limit). Values in the + input array less than the lower limit or greater than the upper limit + will be masked out. None implies no limit. + inclusive : (bool, bool) + A tuple consisting of the (lower flag, upper flag). These flags + determine whether values exactly equal to lower or upper are allowed. + + Returns + ------- + A MaskedArray. + + Raises + ------ + A ValueError if there are no values within the given limits. + """ + lower_limit, upper_limit = limits + lower_include, upper_include = inclusive + am = ma.MaskedArray(a) + if lower_limit is not None: + if lower_include: + am = ma.masked_less(am, lower_limit) + else: + am = ma.masked_less_equal(am, lower_limit) + + if upper_limit is not None: + if upper_include: + am = ma.masked_greater(am, upper_limit) + else: + am = ma.masked_greater_equal(am, upper_limit) + + if am.count() == 0: + raise ValueError("No array values within given limits") + + return am + + +def tmean(a, limits=None, inclusive=(True, True)): + """ + Compute the trimmed mean. + + This function finds the arithmetic mean of given values, ignoring values + outside the given `limits`. + + Parameters + ---------- + a : array_like + Array of values. + limits : None or (lower limit, upper limit), optional + Values in the input array less than the lower limit or greater than the + upper limit will be ignored. When limits is None (default), then all + values are used. Either of the limit values in the tuple can also be + None representing a half-open interval. + inclusive : (bool, bool), optional + A tuple consisting of the (lower flag, upper flag). These flags + determine whether values exactly equal to the lower or upper limits + are included. The default value is (True, True). + + Returns + ------- + tmean : float + + """ + a = asarray(a) + if limits is None: + return np.mean(a, None) + + am = mask_to_limits(a.ravel(), limits, inclusive) + return am.mean() + + +def masked_var(am): + m = am.mean() + s = ma.add.reduce((am - m) ** 2) + n = am.count() - 1.0 + return s / n + + +def tvar(a, limits=None, inclusive=(True, True)): + """ + Compute the trimmed variance + + This function computes the sample variance of an array of values, + while ignoring values which are outside of given `limits`. + + Parameters + ---------- + a : array_like + Array of values. 
+ limits : None or (lower limit, upper limit), optional + Values in the input array less than the lower limit or greater than the + upper limit will be ignored. When limits is None, then all values are + used. Either of the limit values in the tuple can also be None + representing a half-open interval. The default value is None. + inclusive : (bool, bool), optional + A tuple consisting of the (lower flag, upper flag). These flags + determine whether values exactly equal to the lower or upper limits + are included. The default value is (True, True). + + Returns + ------- + tvar : float + Trimmed variance. + + Notes + ----- + `tvar` computes the unbiased sample variance, i.e. it uses a correction + factor ``n / (n - 1)``. + + """ + a = asarray(a) + a = a.astype(float).ravel() + if limits is None: + n = len(a) + return a.var() * (n / (n - 1.)) + am = mask_to_limits(a, limits, inclusive) + return masked_var(am) + + +def tmin(a, lowerlimit=None, axis=0, inclusive=True): + """ + Compute the trimmed minimum + + This function finds the miminum value of an array `a` along the + specified axis, but only considering values greater than a specified + lower limit. + + Parameters + ---------- + a : array_like + array of values + lowerlimit : None or float, optional + Values in the input array less than the given limit will be ignored. + When lowerlimit is None, then all values are used. The default value + is None. + axis : None or int, optional + Operate along this axis. None means to use the flattened array and + the default is zero + inclusive : {True, False}, optional + This flag determines whether values exactly equal to the lower limit + are included. The default value is True. + + Returns + ------- + tmin : float + + """ + a, axis = _chk_asarray(a, axis) + am = mask_to_limits(a, (lowerlimit, None), (inclusive, False)) + return ma.minimum.reduce(am, axis) + + +def tmax(a, upperlimit=None, axis=0, inclusive=True): + """ + Compute the trimmed maximum + + This function computes the maximum value of an array along a given axis, + while ignoring values larger than a specified upper limit. + + Parameters + ---------- + a : array_like + array of values + upperlimit : None or float, optional + Values in the input array greater than the given limit will be ignored. + When upperlimit is None, then all values are used. The default value + is None. + axis : None or int, optional + Operate along this axis. None means to use the flattened array and + the default is zero. + inclusive : {True, False}, optional + This flag determines whether values exactly equal to the upper limit + are included. The default value is True. + + Returns + ------- + tmax : float + + """ + a, axis = _chk_asarray(a, axis) + am = mask_to_limits(a, (None, upperlimit), (False, inclusive)) + return ma.maximum.reduce(am, axis) + + +def tstd(a, limits=None, inclusive=(True, True)): + """ + Compute the trimmed sample standard deviation + + This function finds the sample standard deviation of given values, + ignoring values outside the given `limits`. + + Parameters + ---------- + a : array_like + array of values + limits : None or (lower limit, upper limit), optional + Values in the input array less than the lower limit or greater than the + upper limit will be ignored. When limits is None, then all values are + used. Either of the limit values in the tuple can also be None + representing a half-open interval. The default value is None. + inclusive : (bool, bool), optional + A tuple consisting of the (lower flag, upper flag). 
These flags + determine whether values exactly equal to the lower or upper limits + are included. The default value is (True, True). + + Returns + ------- + tstd : float + + Notes + ----- + `tstd` computes the unbiased sample standard deviation, i.e. it uses a + correction factor ``n / (n - 1)``. + + """ + return np.sqrt(tvar(a, limits, inclusive)) + + +def tsem(a, limits=None, inclusive=(True, True)): + """ + Compute the trimmed standard error of the mean. + + This function finds the standard error of the mean for given + values, ignoring values outside the given `limits`. + + Parameters + ---------- + a : array_like + array of values + limits : None or (lower limit, upper limit), optional + Values in the input array less than the lower limit or greater than the + upper limit will be ignored. When limits is None, then all values are + used. Either of the limit values in the tuple can also be None + representing a half-open interval. The default value is None. + inclusive : (bool, bool), optional + A tuple consisting of the (lower flag, upper flag). These flags + determine whether values exactly equal to the lower or upper limits + are included. The default value is (True, True). + + Returns + ------- + tsem : float + + Notes + ----- + `tsem` uses unbiased sample standard deviation, i.e. it uses a + correction factor ``n / (n - 1)``. + + """ + a = np.asarray(a).ravel() + if limits is None: + return a.std(ddof=1) / np.sqrt(a.size) + + am = mask_to_limits(a, limits, inclusive) + sd = np.sqrt(masked_var(am)) + return sd / np.sqrt(am.count()) + + +# +# MOMENTS ############# +# + +def moment(a, moment=1, axis=0): + """ + Calculates the nth moment about the mean for a sample. + + Generally used to calculate coefficients of skewness and + kurtosis. + + Parameters + ---------- + a : array_like + data + moment : int + order of central moment that is returned + axis : int or None + Axis along which the central moment is computed. If None, then the data + array is raveled. The default axis is zero. + + Returns + ------- + n-th central moment : ndarray or float + The appropriate moment along the given axis or over all values if axis + is None. The denominator for the moment calculation is the number of + observations, no degrees of freedom correction is done. + + """ + a, axis = _chk_asarray(a, axis) + if moment == 1: + # By definition the first moment about the mean is 0. + shape = list(a.shape) + del shape[axis] + if shape: + # return an actual array of the appropriate shape + return np.zeros(shape, dtype=float) + else: + # the input was 1D, so return a scalar instead of a rank-0 array + return np.float64(0.0) + else: + mn = np.expand_dims(np.mean(a, axis), axis) + s = np.power((a - mn), moment) + return np.mean(s, axis) + + +def variation(a, axis=0): + """ + Computes the coefficient of variation, the ratio of the biased standard + deviation to the mean. + + Parameters + ---------- + a : array_like + Input array. + axis : int or None + Axis along which to calculate the coefficient of variation. + + References + ---------- + .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + + """ + a, axis = _chk_asarray(a, axis) + return a.std(axis) / a.mean(axis) + + +def skew(a, axis=0, bias=True): + """ + Computes the skewness of a data set. + + For normally distributed data, the skewness should be about 0. A skewness + value > 0 means that there is more weight in the left tail of the + distribution. 
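Using the same biased central moments as the moment helper above, the (biased) skewness is m3 / m2**1.5; a small numeric check with made-up data:

    import numpy as np

    a = np.array([1.0, 2.0, 2.0, 3.0, 7.0])
    mn = a.mean()
    m2 = ((a - mn) ** 2).mean()   # biased central moments, as in moment()
    m3 = ((a - mn) ** 3).mean()
    g1 = m3 / m2 ** 1.5           # biased sample skewness (~1.17 here)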
The function `skewtest` can be used to determine if the + skewness value is close enough to 0, statistically speaking. + + Parameters + ---------- + a : ndarray + data + axis : int or None + axis along which skewness is calculated + bias : bool + If False, then the calculations are corrected for statistical bias. + + Returns + ------- + skewness : ndarray + The skewness of values along an axis, returning 0 where all values are + equal. + + References + ---------- + [CRCProbStat2000]_ Section 2.2.24.1 + + .. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + m2 = moment(a, 2, axis) + m3 = moment(a, 3, axis) + zero = (m2 == 0) + vals = np.where(zero, 0, m3 / m2 ** 1.5) + if not bias: + can_correct = (n > 2) & (m2 > 0) + if can_correct.any(): + m2 = np.extract(can_correct, m2) + m3 = np.extract(can_correct, m3) + nval = np.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2 ** 1.5 + np.place(vals, can_correct, nval) + if vals.ndim == 0: + return vals.item() + return vals + + +def kurtosis(a, axis=0, fisher=True, bias=True): + """ + Computes the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + Use `kurtosistest` to see if result is close enough to normal. + + Parameters + ---------- + a : array + data for which the kurtosis is calculated + axis : int or None + Axis along which the kurtosis is calculated + fisher : bool + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool + If False, then the calculations are corrected for statistical bias. + + Returns + ------- + kurtosis : array + The kurtosis of values along an axis. If all values are equal, + return -3 for Fisher's definition and 0 for Pearson's definition. + + References + ---------- + .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + m2 = moment(a, 2, axis) + m4 = moment(a, 4, axis) + zero = (m2 == 0) + olderr = np.seterr(all='ignore') + try: + vals = np.where(zero, 0, m4 / m2 ** 2.0) + finally: + np.seterr(**olderr) + + if not bias: + can_correct = (n > 3) & (m2 > 0) + if can_correct.any(): + m2 = np.extract(can_correct, m2) + m4 = np.extract(can_correct, m4) + nval = 1.0 / \ + (n - 2) / (n - 3) * \ + ((n * n - 1.0) * m4 / m2 ** 2.0 - 3 * (n - 1) ** 2.0) + np.place(vals, can_correct, nval + 3.0) + + if vals.ndim == 0: + vals = vals.item() # array scalar + + if fisher: + return vals - 3 + else: + return vals + + +def describe(a, axis=0): + """ + Computes several descriptive statistics of the passed array. + + Parameters + ---------- + a : array_like + data + axis : int or None + axis along which statistics are calculated. If axis is None, then data + array is raveled. The default axis is zero. 
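Similarly, the Fisher kurtosis computed by kurtosis above is m4 / m2**2 minus 3, which should be close to zero for a large normal sample; an illustrative check:

    import numpy as np

    x = np.random.RandomState(0).normal(size=100000)
    mn = x.mean()
    m2 = ((x - mn) ** 2).mean()
    m4 = ((x - mn) ** 4).mean()
    fisher_kurt = m4 / m2 ** 2 - 3.0   # ~0 for normal data (Pearson value ~3)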
+ + Returns + ------- + size of the data : int + length of data along axis + (min, max): tuple of ndarrays or floats + minimum and maximum value of data array + arithmetic mean : ndarray or float + mean of data along axis + unbiased variance : ndarray or float + variance of the data along axis, denominator is number of observations + minus one. + biased skewness : ndarray or float + skewness, based on moment calculations with denominator equal to the + number of observations, i.e. no degrees of freedom correction + biased kurtosis : ndarray or float + kurtosis (Fisher), the kurtosis is normalized so that it is zero for the + normal distribution. No degrees of freedom or bias correction is used. + + See Also + -------- + skew + kurtosis + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + mm = (np.min(a, axis=axis), np.max(a, axis=axis)) + m = np.mean(a, axis=axis) + v = np.var(a, axis=axis, ddof=1) + sk = skew(a, axis) + kurt = kurtosis(a, axis) + return n, mm, m, v, sk, kurt + +# +# NORMALITY TESTS ########## +# + + +def skewtest(a, axis=0): + """ + Tests whether the skew is different from the normal distribution. + + This function tests the null hypothesis that the skewness of + the population that the sample was drawn from is the same + as that of a corresponding normal distribution. + + Parameters + ---------- + a : array + axis : int or None + + Returns + ------- + z-score : float + The computed z-score for this test. + p-value : float + a 2-sided p-value for the hypothesis test + + Notes + ----- + The sample size must be at least 8. + + """ + a, axis = _chk_asarray(a, axis) + if axis is None: + a = np.ravel(a) + axis = 0 + b2 = skew(a, axis) + n = float(a.shape[axis]) + if n < 8: + raise ValueError( + "skewtest is not valid with less than 8 samples; %i samples" + " were given." % int(n)) + y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2))) + beta2 = (3.0 * (n * n + 27 * n - 70) * (n + 1) * (n + 3) / + ((n - 2.0) * (n + 5) * (n + 7) * (n + 9))) + W2 = -1 + math.sqrt(2 * (beta2 - 1)) + delta = 1 / math.sqrt(0.5 * math.log(W2)) + alpha = math.sqrt(2.0 / (W2 - 1)) + y = np.where(y == 0, 1, y) + Z = delta * np.log(y / alpha + np.sqrt((y / alpha) ** 2 + 1)) + return Z, 2 * distributions.norm.sf(np.abs(Z)) + + +def kurtosistest(a, axis=0): + """ + Tests whether a dataset has normal kurtosis + + This function tests the null hypothesis that the kurtosis + of the population from which the sample was drawn is that + of the normal distribution: ``kurtosis = 3(n-1)/(n+1)``. + + Parameters + ---------- + a : array + array of the sample data + axis : int or None + the axis to operate along, or None to work on the whole array. + The default is the first axis. + + Returns + ------- + z-score : float + The computed z-score for this test. + p-value : float + The 2-sided p-value for the hypothesis test + + Notes + ----- + Valid only for n>20. The Z-score is set to 0 for bad entries. + + """ + a, axis = _chk_asarray(a, axis) + n = float(a.shape[axis]) + if n < 5: + raise ValueError( + "kurtosistest requires at least 5 observations; %i observations" + " were given." % int(n)) + if n < 20: + warnings.warn( + "kurtosistest only valid for n>=20 ... 
continuing anyway, n=%i" % + int(n)) + b2 = kurtosis(a, axis, fisher=False) + E = 3.0 * (n - 1) / (n + 1) + varb2 = 24.0 * n * \ + (n - 2) * (n - 3) / ((n + 1) * (n + 1) * (n + 3) * (n + 5)) + x = (b2 - E) / np.sqrt(varb2) + sqrtbeta1 = 6.0 * (n * n - 5 * n + 2) / ((n + 7) * (n + 9)) * np.sqrt((6.0 * (n + 3) * (n + 5)) / + (n * (n - 2) * (n - 3))) + A = 6.0 + 8.0 / sqrtbeta1 * \ + (2.0 / sqrtbeta1 + np.sqrt(1 + 4.0 / (sqrtbeta1 ** 2))) + term1 = 1 - 2 / (9.0 * A) + denom = 1 + x * np.sqrt(2 / (A - 4.0)) + denom = np.where(denom < 0, 99, denom) + term2 = np.where( + denom < 0, term1, np.power((1 - 2.0 / A) / denom, 1 / 3.0)) + Z = (term1 - term2) / np.sqrt(2 / (9.0 * A)) + Z = np.where(denom == 99, 0, Z) + if Z.ndim == 0: + Z = Z[()] + # JPNote: p-value sometimes larger than 1 + # zprob uses upper tail, so Z needs to be positive + return Z, 2 * distributions.norm.sf(np.abs(Z)) + + +def normaltest(a, axis=0): + """ + Tests whether a sample differs from a normal distribution. + + This function tests the null hypothesis that a sample comes + from a normal distribution. It is based on D'Agostino and + Pearson's [1]_, [2]_ test that combines skew and kurtosis to + produce an omnibus test of normality. + + + Parameters + ---------- + a : array_like + The array containing the data to be tested. + axis : int or None + If None, the array is treated as a single data set, regardless of + its shape. Otherwise, each 1-d array along axis `axis` is tested. + + Returns + ------- + k2 : float or array + `s^2 + k^2`, where `s` is the z-score returned by `skewtest` and + `k` is the z-score returned by `kurtosistest`. + p-value : float or array + A 2-sided chi squared probability for the hypothesis test. + + References + ---------- + .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for + moderate and large sample size," Biometrika, 58, 341-348 + + .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Testing for + departures from normality," Biometrika, 60, 613-622 + + """ + a, axis = _chk_asarray(a, axis) + s, _p = skewtest(a, axis) + k, _p = kurtosistest(a, axis) + k2 = s * s + k * k + return k2, chisqprob(k2, 2) + + +def jarque_bera(x): + """ + Perform the Jarque-Bera goodness of fit test on sample data. + + The Jarque-Bera test tests whether the sample data has the skewness and + kurtosis matching a normal distribution. + + Note that this test only works for a large enough number of data samples + (>2000) as the test statistic asymptotically has a Chi-squared distribution + with 2 degrees of freedom. + + Parameters + ---------- + x : array_like + Observations of a random variable. + + Returns + ------- + jb_value : float + The test statistic. + p : float + The p-value for the hypothesis test. + + References + ---------- + .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality, + homoscedasticity and serial independence of regression residuals", + 6 Econometric Letters 255-259. + + Examples + -------- + >>> from scipy import stats + >>> np.random.seed(987654321) + >>> x = np.random.normal(0, 1, 100000) + >>> y = np.random.rayleigh(1, 100000) + >>> stats.jarque_bera(x) + (4.7165707989581342, 0.09458225503041906) + >>> stats.jarque_bera(y) + (6713.7098548143422, 0.0) + + """ + x = np.asarray(x) + n = float(x.size) + if n == 0: + raise ValueError('At least one observation is required.') + + mu = x.mean() + diffx = x - mu + skewness = (1 / n * np.sum(diffx ** 3)) / \ + (1 / n * np.sum(diffx ** 2)) ** (3 / 2.) 
+ kurtosis = (1 / n * np.sum(diffx ** 4)) / (1 / n * np.sum(diffx ** 2)) ** 2 + jb_value = n / 6 * (skewness ** 2 + (kurtosis - 3) ** 2 / 4) + p = 1 - distributions.chi2.cdf(jb_value, 2) + + return jb_value, p + + +# +# FREQUENCY FUNCTIONS ####### +# + +def itemfreq(a): + """ + Returns a 2-D array of item frequencies. + + Parameters + ---------- + a : (N,) array_like + Input array. + + Returns + ------- + itemfreq : (K, 2) ndarray + A 2-D frequency table. Column 1 contains sorted, unique values from + `a`, column 2 contains their respective counts. + + Examples + -------- + >>> a = np.array([1, 1, 5, 0, 1, 2, 2, 0, 1, 4]) + >>> stats.itemfreq(a) + array([[ 0., 2.], + [ 1., 4.], + [ 2., 2.], + [ 4., 1.], + [ 5., 1.]]) + >>> np.bincount(a) + array([2, 4, 2, 0, 1, 1]) + + >>> stats.itemfreq(a/10.) + array([[ 0. , 2. ], + [ 0.1, 4. ], + [ 0.2, 2. ], + [ 0.4, 1. ], + [ 0.5, 1. ]]) + + """ + items, inv = np.unique(a, return_inverse=True) + freq = np.bincount(inv) + return np.array([items, freq]).T + + +def scoreatpercentile(a, per, limit=(), interpolation_method='fraction', + axis=None): + """ + Calculate the score at a given percentile of the input sequence. + + For example, the score at `per=50` is the median. If the desired quantile + lies between two data points, we interpolate between them, according to + the value of `interpolation`. If the parameter `limit` is provided, it + should be a tuple (lower, upper) of two values. + + Parameters + ---------- + a : array_like + A 1-D array of values from which to extract score. + per : array_like + Percentile(s) at which to extract score. Values should be in range + [0,100]. + limit : tuple, optional + Tuple of two scalars, the lower and upper limits within which to + compute the percentile. Values of `a` outside + this (closed) interval will be ignored. + interpolation : {'fraction', 'lower', 'higher'}, optional + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j` + + - fraction: ``i + (j - i) * fraction`` where ``fraction`` is the + fractional part of the index surrounded by ``i`` and ``j``. + - lower: ``i``. + - higher: ``j``. + + axis : int, optional + Axis along which the percentiles are computed. The default (None) + is to compute the median along a flattened version of the array. + + Returns + ------- + score : float (or sequence of floats) + Score at percentile. + + See Also + -------- + percentileofscore + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(100) + >>> stats.scoreatpercentile(a, 50) + 49.5 + + """ + # adapted from NumPy's percentile function + a = np.asarray(a) + + if limit: + a = a[(limit[0] <= a) & (a <= limit[1])] + + if per == 0: + return a.min(axis=axis) + elif per == 100: + return a.max(axis=axis) + + sorted = np.sort(a, axis=axis) + if axis is None: + axis = 0 + + return _compute_qth_percentile(sorted, per, interpolation_method, axis) + + +# handle sequence of per's without calling sort multiple times +def _compute_qth_percentile(sorted, per, interpolation_method, axis): + if not np.isscalar(per): + return [_compute_qth_percentile(sorted, i, interpolation_method, axis) + for i in per] + + if (per < 0) or (per > 100): + raise ValueError("percentile must be in the range [0, 100]") + + indexer = [slice(None)] * sorted.ndim + idx = per / 100. 
* (sorted.shape[axis] - 1) + + if int(idx) != idx: + # round fractional indices according to interpolation method + if interpolation_method == 'lower': + idx = int(np.floor(idx)) + elif interpolation_method == 'higher': + idx = int(np.ceil(idx)) + elif interpolation_method == 'fraction': + pass # keep idx as fraction and interpolate + else: + raise ValueError("interpolation_method can only be 'fraction', " + "'lower' or 'higher'") + + i = int(idx) + if i == idx: + indexer[axis] = slice(i, i + 1) + weights = array(1) + sumval = 1.0 + else: + indexer[axis] = slice(i, i + 2) + j = i + 1 + weights = array([(j - idx), (idx - i)], float) + wshape = [1] * sorted.ndim + wshape[axis] = 2 + weights.shape = wshape + sumval = weights.sum() + + # Use np.add.reduce to coerce data type + return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval + + +def percentileofscore(a, score, kind='rank'): + """ + The percentile rank of a score relative to a list of scores. + + A `percentileofscore` of, for example, 80% means that 80% of the + scores in `a` are below the given score. In the case of gaps or + ties, the exact definition depends on the optional keyword, `kind`. + + Parameters + ---------- + a : array_like + Array of scores to which `score` is compared. + score : int or float + Score that is compared to the elements in `a`. + kind : {'rank', 'weak', 'strict', 'mean'}, optional + This optional parameter specifies the interpretation of the + resulting score: + + - "rank": Average percentage ranking of score. In case of + multiple matches, average the percentage rankings of + all matching scores. + - "weak": This kind corresponds to the definition of a cumulative + distribution function. A percentileofscore of 80% + means that 80% of values are less than or equal + to the provided score. + - "strict": Similar to "weak", except that only values that are + strictly less than the given score are counted. + - "mean": The average of the "weak" and "strict" scores, often used in + testing. See + + http://en.wikipedia.org/wiki/Percentile_rank + + Returns + ------- + pcos : float + Percentile-position of score (0-100) relative to `a`. + + Examples + -------- + Three-quarters of the given values lie below a given score: + + >>> percentileofscore([1, 2, 3, 4], 3) + 75.0 + + With multiple matches, note how the scores of the two matches, 0.6 + and 0.8 respectively, are averaged: + + >>> percentileofscore([1, 2, 3, 3, 4], 3) + 70.0 + + Only 2/5 values are strictly less than 3: + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict') + 40.0 + + But 4/5 values are less than or equal to 3: + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak') + 80.0 + + The average between the weak and the strict scores is + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean') + 60.0 + + """ + a = np.array(a) + n = len(a) + + if kind == 'rank': + if not(np.any(a == score)): + a = np.append(a, score) + a_len = np.array(list(range(len(a)))) + else: + a_len = np.array(list(range(len(a)))) + 1.0 + + a = np.sort(a) + idx = [a == score] + pct = (np.mean(a_len[idx]) / n) * 100.0 + return pct + + elif kind == 'strict': + return sum(a < score) / float(n) * 100 + elif kind == 'weak': + return sum(a <= score) / float(n) * 100 + elif kind == 'mean': + return (sum(a < score) + sum(a <= score)) * 50 / float(n) + else: + raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'") + + +def histogram2(a, bins): + """ + Compute histogram using divisions in bins. 
+ + Count the number of times values from array `a` fall into + numerical ranges defined by `bins`. Range x is given by + bins[x] <= range_x < bins[x+1] where x =0,N and N is the + length of the `bins` array. The last range is given by + bins[N] <= range_N < infinity. Values less than bins[0] are + not included in the histogram. + + Parameters + ---------- + a : array_like of rank 1 + The array of values to be assigned into bins + bins : array_like of rank 1 + Defines the ranges of values to use during histogramming. + + Returns + ------- + histogram2 : ndarray of rank 1 + Each value represents the occurrences for a given bin (range) of + values. + + """ + # comment: probably obsoleted by numpy.histogram() + n = np.searchsorted(np.sort(a), bins) + n = np.concatenate([n, [len(a)]]) + return n[1:] - n[:-1] + + +def histogram(a, numbins=10, defaultlimits=None, weights=None, + printextras=False): + """ + Separates the range into several bins and returns the number of instances + in each bin. + + Parameters + ---------- + a : array_like + Array of scores which will be put into bins. + numbins : int, optional + The number of bins to use for the histogram. Default is 10. + defaultlimits : tuple (lower, upper), optional + The lower and upper values for the range of the histogram. + If no value is given, a range slightly larger then the range of the + values in a is used. Specifically ``(a.min() - s, a.max() + s)``, + where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. + weights : array_like, optional + The weights for each value in `a`. Default is None, which gives each + value a weight of 1.0 + printextras : bool, optional + If True, if there are extra points (i.e. the points that fall outside + the bin limits) a warning is raised saying how many of those points + there are. Default is False. + + Returns + ------- + histogram : ndarray + Number of points (or sum of weights) in each bin. + low_range : float + Lowest value of histogram, the lower limit of the first bin. + binsize : float + The size of the bins (all bins have the same size). + extrapoints : int + The number of points outside the range of the histogram. + + See Also + -------- + numpy.histogram + + Notes + ----- + This histogram is based on numpy's histogram but has a larger range by + default if default limits is not set. + + """ + a = np.ravel(a) + if defaultlimits is None: + # no range given, so use values in `a` + data_min = a.min() + data_max = a.max() + # Have bins extend past min and max values slightly + s = (data_max - data_min) / (2. * (numbins - 1.)) + defaultlimits = (data_min - s, data_max + s) + # use numpy's histogram method to compute bins + hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits, + weights=weights) + # hist are not always floats, convert to keep with old output + hist = np.array(hist, dtype=float) + # fixed width for bins is assumed, as numpy's histogram gives + # fixed width bins for int values for 'bins' + binsize = bin_edges[1] - bin_edges[0] + # calculate number of extra points + extrapoints = len([v for v in a + if defaultlimits[0] > v or v > defaultlimits[1]]) + if extrapoints > 0 and printextras: + warnings.warn("Points outside given histogram range = %s" + % extrapoints) + return (hist, defaultlimits[0], binsize, extrapoints) + + +def cumfreq(a, numbins=10, defaultreallimits=None, weights=None): + """ + Returns a cumulative frequency histogram, using the histogram function. + + Parameters + ---------- + a : array_like + Input array. 
+ numbins : int, optional + The number of bins to use for the histogram. Default is 10. + defaultlimits : tuple (lower, upper), optional + The lower and upper values for the range of the histogram. + If no value is given, a range slightly larger than the range of the + values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``, + where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. + weights : array_like, optional + The weights for each value in `a`. Default is None, which gives each + value a weight of 1.0 + + Returns + ------- + cumfreq : ndarray + Binned values of cumulative frequency. + lowerreallimit : float + Lower real limit + binsize : float + Width of each bin. + extrapoints : int + Extra points. + + Examples + -------- + >>> x = [1, 4, 2, 1, 3, 1] + >>> cumfreqs, lowlim, binsize, extrapoints = sp.stats.cumfreq(x, numbins=4) + >>> cumfreqs + array([ 3., 4., 5., 6.]) + >>> cumfreqs, lowlim, binsize, extrapoints = \ + ... sp.stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5)) + >>> cumfreqs + array([ 1., 2., 3., 3.]) + >>> extrapoints + 3 + + """ + h, l, b, e = histogram(a, numbins, defaultreallimits, weights=weights) + cumhist = np.cumsum(h * 1, axis=0) + return cumhist, l, b, e + + +def relfreq(a, numbins=10, defaultreallimits=None, weights=None): + """ + Returns a relative frequency histogram, using the histogram function. + + Parameters + ---------- + a : array_like + Input array. + numbins : int, optional + The number of bins to use for the histogram. Default is 10. + defaultreallimits : tuple (lower, upper), optional + The lower and upper values for the range of the histogram. + If no value is given, a range slightly larger then the range of the + values in a is used. Specifically ``(a.min() - s, a.max() + s)``, + where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. + weights : array_like, optional + The weights for each value in `a`. Default is None, which gives each + value a weight of 1.0 + + Returns + ------- + relfreq : ndarray + Binned values of relative frequency. + lowerreallimit : float + Lower real limit + binsize : float + Width of each bin. + extrapoints : int + Extra points. + + Examples + -------- + >>> a = np.array([1, 4, 2, 1, 3, 1]) + >>> relfreqs, lowlim, binsize, extrapoints = sp.stats.relfreq(a, numbins=4) + >>> relfreqs + array([ 0.5 , 0.16666667, 0.16666667, 0.16666667]) + >>> np.sum(relfreqs) # relative frequencies should add up to 1 + 0.99999999999999989 + + """ + h, l, b, e = histogram(a, numbins, defaultreallimits, weights=weights) + h = np.array(h / float(np.array(a).shape[0])) + return h, l, b, e + + +# +# VARIABILITY FUNCTIONS ##### +# + +def obrientransform(*args): + """ + Computes the O'Brien transform on input data (any number of arrays). + + Used to test for homogeneity of variance prior to running one-way stats. + Each array in ``*args`` is one level of a factor. + If `f_oneway` is run on the transformed data and found significant, + the variances are unequal. From Maxwell and Delaney [1]_, p.112. + + Parameters + ---------- + args : tuple of array_like + Any number of arrays. + + Returns + ------- + obrientransform : ndarray + Transformed data for use in an ANOVA. The first dimension + of the result corresponds to the sequence of transformed + arrays. If the arrays given are all 1-D of the same length, + the return value is a 2-D array; otherwise it is a 1-D array + of type object, with each element being an ndarray. + + References + ---------- + .. [1] S. E. Maxwell and H. D. 
Delaney, "Designing Experiments and + Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990. + + Examples + -------- + We'll test the following data sets for differences in their variance. + + >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10] + >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15] + + Apply the O'Brien transform to the data. + + >>> tx, ty = obrientransform(x, y) + + Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the + transformed data. + + >>> from scipy.stats import f_oneway + >>> F, p = f_oneway(tx, ty) + >>> p + 0.1314139477040335 + + If we require that ``p < 0.05`` for significance, we cannot conclude + that the variances are different. + """ + TINY = np.sqrt(np.finfo(float).eps) + + # `arrays` will hold the transformed arguments. + arrays = [] + + for arg in args: + a = np.asarray(arg) + n = len(a) + mu = np.mean(a) + sq = (a - mu) ** 2 + sumsq = sq.sum() + + # The O'Brien transform. + t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2)) + + # Check that the mean of the transformed data is equal to the + # original variance. + var = sumsq / (n - 1) + if abs(var - np.mean(t)) > TINY: + raise ValueError('Lack of convergence in obrientransform.') + + arrays.append(t) + + # If the arrays are not all the same shape, calling np.array(arrays) + # creates a 1-D array with dtype `object` in numpy 1.6+. In numpy + # 1.5.x, it raises an exception. To work around this, we explicitly + # set the dtype to `object` when the arrays are not all the same shape. + if len(arrays) < 2 or all(x.shape == arrays[0].shape for x in arrays[1:]): + dt = None + else: + dt = object + return np.array(arrays, dtype=dt) + + +def signaltonoise(a, axis=0, ddof=0): + """ + The signal-to-noise ratio of the input data. + + Returns the signal-to-noise ratio of `a`, here defined as the mean + divided by the standard deviation. + + Parameters + ---------- + a : array_like + An array_like object containing the sample data. + axis : int or None, optional + If axis is equal to None, the array is first ravel'd. If axis is an + integer, this is the axis over which to operate. Default is 0. + ddof : int, optional + Degrees of freedom correction for standard deviation. Default is 0. + + Returns + ------- + s2n : ndarray + The mean to standard deviation ratio(s) along `axis`, or 0 where the + standard deviation is 0. + + """ + a = np.asanyarray(a) + m = a.mean(axis) + sd = a.std(axis=axis, ddof=ddof) + return np.where(sd == 0, 0, m / sd) + + +def sem(a, axis=0, ddof=1): + """ + Calculates the standard error of the mean (or standard error of + measurement) of the values in the input array. + + Parameters + ---------- + a : array_like + An array containing the values for which the standard error is + returned. + axis : int or None, optional. + If axis is None, ravel `a` first. If axis is an integer, this will be + the axis over which to operate. Defaults to 0. + ddof : int, optional + Delta degrees-of-freedom. How many degrees of freedom to adjust + for bias in limited samples relative to the population estimate + of variance. Defaults to 1. + + Returns + ------- + s : ndarray or float + The standard error of the mean in the sample(s), along the input axis. + + Notes + ----- + The default value for `ddof` is different to the default (0) used by other + ddof containing routines, such as np.std nd stats.nanstd. 
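+ The standard error is computed as ``np.std(a, ddof=ddof) / np.sqrt(n)``
+ along the chosen axis.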
+ + Examples + -------- + Find standard error along the first axis: + + >>> from scipy import stats + >>> a = np.arange(20).reshape(5,4) + >>> stats.sem(a) + array([ 2.8284, 2.8284, 2.8284, 2.8284]) + + Find standard error across the whole array, using n degrees of freedom: + + >>> stats.sem(a, axis=None, ddof=0) + 1.2893796958227628 + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + s = np.std(a, axis=axis, ddof=ddof) / np.sqrt(n) # JP check normalization + return s + + +def zscore(a, axis=0, ddof=0): + """ + Calculates the z score of each value in the sample, relative to the sample + mean and standard deviation. + + Parameters + ---------- + a : array_like + An array like object containing the sample data. + axis : int or None, optional + If `axis` is equal to None, the array is first raveled. If `axis` is + an integer, this is the axis over which to operate. Default is 0. + ddof : int, optional + Degrees of freedom correction in the calculation of the + standard deviation. Default is 0. + + Returns + ------- + zscore : array_like + The z-scores, standardized by mean and standard deviation of input + array `a`. + + Notes + ----- + This function preserves ndarray subclasses, and works also with + matrices and masked arrays (it uses `asanyarray` instead of `asarray` + for parameters). + + Examples + -------- + >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091, 0.1954, + 0.6307, 0.6599, 0.1065, 0.0508]) + >>> from scipy import stats + >>> stats.zscore(a) + array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786, + 0.6748, -1.1488, -1.3324]) + + Computing along a specified axis, using n-1 degrees of freedom (``ddof=1``) + to calculate the standard deviation: + + >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608], + [ 0.7149, 0.0775, 0.6072, 0.9656], + [ 0.6341, 0.1403, 0.9759, 0.4064], + [ 0.5918, 0.6948, 0.904 , 0.3721], + [ 0.0921, 0.2481, 0.1188, 0.1366]]) + >>> stats.zscore(b, axis=1, ddof=1) + array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358], + [ 0.33048416, -1.37380874, 0.04251374, 1.00081084], + [ 0.26796377, -1.12598418, 1.23283094, -0.37481053], + [-0.22095197, 0.24468594, 1.19042819, -1.21416216], + [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]]) + """ + a = np.asanyarray(a) + mns = a.mean(axis=axis) + sstd = a.std(axis=axis, ddof=ddof) + if axis and mns.ndim < a.ndim: + return ((a - np.expand_dims(mns, axis=axis)) / + np.expand_dims(sstd, axis=axis)) + else: + return (a - mns) / sstd + + +def zmap(scores, compare, axis=0, ddof=0): + """ + Calculates the relative z-scores. + + Returns an array of z-scores, i.e., scores that are standardized to zero + mean and unit variance, where mean and variance are calculated from the + comparison array. + + Parameters + ---------- + scores : array_like + The input for which z-scores are calculated. + compare : array_like + The input from which the mean and standard deviation of the + normalization are taken; assumed to have the same dimension as + `scores`. + axis : int or None, optional + Axis over which mean and variance of `compare` are calculated. + Default is 0. + ddof : int, optional + Degrees of freedom correction in the calculation of the + standard deviation. Default is 0. + + Returns + ------- + zscore : array_like + Z-scores, in the same shape as `scores`. + + Notes + ----- + This function preserves ndarray subclasses, and works also with + matrices and masked arrays (it uses `asanyarray` instead of `asarray` + for parameters). 
+ + Examples + -------- + >>> a = [0.5, 2.0, 2.5, 3] + >>> b = [0, 1, 2, 3, 4] + >>> zmap(a, b) + array([-1.06066017, 0. , 0.35355339, 0.70710678]) + """ + scores, compare = map(np.asanyarray, [scores, compare]) + mns = compare.mean(axis=axis) + sstd = compare.std(axis=axis, ddof=ddof) + if axis and mns.ndim < compare.ndim: + return ((scores - np.expand_dims(mns, axis=axis)) / + np.expand_dims(sstd, axis=axis)) + else: + return (scores - mns) / sstd + + +# +# TRIMMING FUNCTIONS ####### +# + +def threshold(a, threshmin=None, threshmax=None, newval=0): + """ + Clip array to a given value. + + Similar to numpy.clip(), except that values less than `threshmin` or + greater than `threshmax` are replaced by `newval`, instead of by + `threshmin` and `threshmax` respectively. + + Parameters + ---------- + a : array_like + Data to threshold. + threshmin : float, int or None, optional + Minimum threshold, defaults to None. + threshmax : float, int or None, optional + Maximum threshold, defaults to None. + newval : float or int, optional + Value to put in place of values in `a` outside of bounds. + Defaults to 0. + + Returns + ------- + out : ndarray + The clipped input array, with values less than `threshmin` or + greater than `threshmax` replaced with `newval`. + + Examples + -------- + >>> a = np.array([9, 9, 6, 3, 1, 6, 1, 0, 0, 8]) + >>> from scipy import stats + >>> stats.threshold(a, threshmin=2, threshmax=8, newval=-1) + array([-1, -1, 6, 3, -1, 6, -1, -1, -1, 8]) + + """ + a = asarray(a).copy() + mask = zeros(a.shape, dtype=bool) + if threshmin is not None: + mask |= (a < threshmin) + if threshmax is not None: + mask |= (a > threshmax) + a[mask] = newval + return a + + +def sigmaclip(a, low=4., high=4.): + """ + Iterative sigma-clipping of array elements. + + The output array contains only those elements of the input array `c` + that satisfy the conditions :: + + mean(c) - std(c)*low < c < mean(c) + std(c)*high + + Starting from the full sample, all elements outside the critical range are + removed. The iteration continues with a new critical range until no + elements are outside the range. + + Parameters + ---------- + a : array_like + Data array, will be raveled if not 1-D. + low : float, optional + Lower bound factor of sigma clipping. Default is 4. + high : float, optional + Upper bound factor of sigma clipping. Default is 4. + + Returns + ------- + c : ndarray + Input array with clipped elements removed. + critlower : float + Lower threshold value use for clipping. + critlupper : float + Upper threshold value use for clipping. + + Examples + -------- + >>> a = np.concatenate((np.linspace(9.5,10.5,31), np.linspace(0,20,5))) + >>> fact = 1.5 + >>> c, low, upp = sigmaclip(a, fact, fact) + >>> c + array([ 9.96666667, 10. , 10.03333333, 10. 
]) + >>> c.var(), c.std() + (0.00055555555555555165, 0.023570226039551501) + >>> low, c.mean() - fact*c.std(), c.min() + (9.9646446609406727, 9.9646446609406727, 9.9666666666666668) + >>> upp, c.mean() + fact*c.std(), c.max() + (10.035355339059327, 10.035355339059327, 10.033333333333333) + + >>> a = np.concatenate((np.linspace(9.5,10.5,11), + np.linspace(-100,-50,3))) + >>> c, low, upp = sigmaclip(a, 1.8, 1.8) + >>> (c == np.linspace(9.5,10.5,11)).all() + True + + """ + c = np.asarray(a).ravel() + delta = 1 + while delta: + c_std = c.std() + c_mean = c.mean() + size = c.size + critlower = c_mean - c_std * low + critupper = c_mean + c_std * high + c = c[(c > critlower) & (c < critupper)] + delta = size - c.size + return c, critlower, critupper + + +def trimboth(a, proportiontocut, axis=0): + """ + Slices off a proportion of items from both ends of an array. + + Slices off the passed proportion of items from both ends of the passed + array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and** + rightmost 10% of scores). You must pre-sort the array if you want + 'proper' trimming. Slices off less if proportion results in a + non-integer slice index (i.e., conservatively slices off + `proportiontocut`). + + Parameters + ---------- + a : array_like + Data to trim. + proportiontocut : float + Proportion (in range 0-1) of total data set to trim of each end. + axis : int or None, optional + Axis along which the observations are trimmed. The default is to trim + along axis=0. If axis is None then the array will be flattened before + trimming. + + Returns + ------- + out : ndarray + Trimmed version of array `a`. + + See Also + -------- + trim_mean + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(20) + >>> b = stats.trimboth(a, 0.1) + >>> b.shape + (16,) + + """ + a = np.asarray(a) + if axis is None: + a = a.ravel() + axis = 0 + + nobs = a.shape[axis] + lowercut = int(proportiontocut * nobs) + uppercut = nobs - lowercut + if (lowercut >= uppercut): + raise ValueError("Proportion too big.") + + sl = [slice(None)] * a.ndim + sl[axis] = slice(lowercut, uppercut) + return a[sl] + + +def trim1(a, proportiontocut, tail='right'): + """ + Slices off a proportion of items from ONE end of the passed array + distribution. + + If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost' + 10% of scores. Slices off LESS if proportion results in a non-integer + slice index (i.e., conservatively slices off `proportiontocut` ). + + Parameters + ---------- + a : array_like + Input array + proportiontocut : float + Fraction to cut off of 'left' or 'right' of distribution + tail : {'left', 'right'}, optional + Defaults to 'right'. + + Returns + ------- + trim1 : ndarray + Trimmed version of array `a` + + """ + a = asarray(a) + if tail.lower() == 'right': + lowercut = 0 + uppercut = len(a) - int(proportiontocut * len(a)) + elif tail.lower() == 'left': + lowercut = int(proportiontocut * len(a)) + uppercut = len(a) + + return a[lowercut:uppercut] + + +def trim_mean(a, proportiontocut, axis=0): + """ + Return mean of array after trimming distribution from both lower and upper + tails. + + If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of + scores. Slices off LESS if proportion results in a non-integer slice + index (i.e., conservatively slices off `proportiontocut` ). 
+ + Parameters + ---------- + a : array_like + Input array + proportiontocut : float + Fraction to cut off of both tails of the distribution + axis : int or None, optional + Axis along which the trimmed means are computed. The default is axis=0. + If axis is None then the trimmed mean will be computed for the + flattened array. + + Returns + ------- + trim_mean : ndarray + Mean of trimmed array. + + See Also + -------- + trimboth + + Examples + -------- + >>> from scipy import stats + >>> x = np.arange(20) + >>> stats.trim_mean(x, 0.1) + 9.5 + >>> x2 = x.reshape(5, 4) + >>> x2 + array([[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11], + [12, 13, 14, 15], + [16, 17, 18, 19]]) + >>> stats.trim_mean(x2, 0.25) + array([ 8., 9., 10., 11.]) + >>> stats.trim_mean(x2, 0.25, axis=1) + array([ 1.5, 5.5, 9.5, 13.5, 17.5]) + + """ + a = np.asarray(a) + if axis is None: + nobs = a.size + else: + nobs = a.shape[axis] + lowercut = int(proportiontocut * nobs) + uppercut = nobs - lowercut - 1 + if (lowercut > uppercut): + raise ValueError("Proportion too big.") + + try: + atmp = np.partition(a, (lowercut, uppercut), axis) + except AttributeError: + atmp = np.sort(a, axis) + + newa = trimboth(atmp, proportiontocut, axis=axis) + return np.mean(newa, axis=axis) + + +def f_oneway(*args): + """ + Performs a 1-way ANOVA. + + The one-way ANOVA tests the null hypothesis that two or more groups have + the same population mean. The test is applied to samples from two or + more groups, possibly with differing sizes. + + Parameters + ---------- + sample1, sample2, ... : array_like + The sample measurements for each group. + + Returns + ------- + F-value : float + The computed F-value of the test. + p-value : float + The associated p-value from the F-distribution. + + Notes + ----- + The ANOVA test has important assumptions that must be satisfied in order + for the associated p-value to be valid. + + 1. The samples are independent. + 2. Each sample is from a normally distributed population. + 3. The population standard deviations of the groups are all equal. This + property is known as homoscedasticity. + + If these assumptions are not true for a given set of data, it may still be + possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) although + with some loss of power. + + The algorithm is from Heiman[2], pp.394-7. + + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 14. + http://faculty.vassar.edu/lowry/ch14pt1.html + + .. [2] Heiman, G.W. Research Methods in Statistics. 2002. + + """ + args = list(map(np.asarray, args)) # convert to an numpy array + na = len(args) # ANOVA on 'na' groups, each in it's own array + alldata = np.concatenate(args) + bign = len(alldata) + sstot = ss(alldata) - (square_of_sums(alldata) / float(bign)) + ssbn = 0 + for a in args: + ssbn += square_of_sums(a) / float(len(a)) + ssbn -= (square_of_sums(alldata) / float(bign)) + sswn = sstot - ssbn + dfbn = na - 1 + dfwn = bign - na + msb = ssbn / float(dfbn) + msw = sswn / float(dfwn) + f = msb / msw + prob = fprob(dfbn, dfwn, f) + return f, prob + + +def pearsonr(x, y): + """ + Calculates a Pearson correlation coefficient and the p-value for testing + non-correlation. + + The Pearson correlation coefficient measures the linear relationship + between two datasets. Strictly speaking, Pearson's correlation requires + that each dataset be normally distributed. Like other correlation + coefficients, this one varies between -1 and +1 with 0 implying no + correlation. 
Correlations of -1 or +1 imply an exact linear + relationship. Positive correlations imply that as x increases, so does + y. Negative correlations imply that as x increases, y decreases. + + The p-value roughly indicates the probability of an uncorrelated system + producing datasets that have a Pearson correlation at least as extreme + as the one computed from these datasets. The p-values are not entirely + reliable but are probably reasonable for datasets larger than 500 or so. + + Parameters + ---------- + x : (N,) array_like + Input + y : (N,) array_like + Input + + Returns + ------- + (Pearson's correlation coefficient, + 2-tailed p-value) + + References + ---------- + http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation + + """ + # x and y should have same length. + x = np.asarray(x) + y = np.asarray(y) + n = len(x) + mx = x.mean() + my = y.mean() + xm, ym = x - mx, y - my + r_num = np.add.reduce(xm * ym) + r_den = np.sqrt(ss(xm) * ss(ym)) + r = r_num / r_den + + # Presumably, if abs(r) > 1, then it is only some small artifact of floating + # point arithmetic. + r = max(min(r, 1.0), -1.0) + df = n - 2 + if abs(r) == 1.0: + prob = 0.0 + else: + t_squared = r * r * (df / ((1.0 - r) * (1.0 + r))) + prob = betai(0.5 * df, 0.5, df / (df + t_squared)) + return r, prob + + +def fisher_exact(table, alternative='two-sided'): + """Performs a Fisher exact test on a 2x2 contingency table. + + Parameters + ---------- + table : array_like of ints + A 2x2 contingency table. Elements should be non-negative integers. + alternative : {'two-sided', 'less', 'greater'}, optional + Which alternative hypothesis to the null hypothesis the test uses. + Default is 'two-sided'. + + Returns + ------- + oddsratio : float + This is prior odds ratio and not a posterior estimate. + p_value : float + P-value, the probability of obtaining a distribution at least as + extreme as the one that was actually observed, assuming that the + null hypothesis is true. + + See Also + -------- + chi2_contingency : Chi-square test of independence of variables in a + contingency table. + + Notes + ----- + The calculated odds ratio is different from the one R uses. In R language, + this implementation returns the (more common) "unconditional Maximum + Likelihood Estimate", while R uses the "conditional Maximum Likelihood + Estimate". + + For tables with large numbers the (inexact) chi-square test implemented + in the function `chi2_contingency` can also be used. + + Examples + -------- + Say we spend a few days counting whales and sharks in the Atlantic and + Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the + Indian ocean 2 whales and 5 sharks. Then our contingency table is:: + + Atlantic Indian + whales 8 2 + sharks 1 5 + + We use this table to find the p-value: + + >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]]) + >>> pvalue + 0.0349... + + The probability that we would observe this or an even more imbalanced ratio + by chance is about 3.5%. A commonly used significance level is 5%, if we + adopt that we can therefore conclude that our observed imbalance is + statistically significant; whales prefer the Atlantic while sharks prefer + the Indian ocean. 
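+
+ One-sided alternatives can be requested with the ``alternative`` keyword
+ documented above, for example:
+
+ >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]],
+ ... alternative='greater')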
+ + """ + hypergeom = distributions.hypergeom + # int32 is not enough for the algorithm + c = np.asarray(table, dtype=np.int64) + if not c.shape == (2, 2): + raise ValueError("The input `table` must be of shape (2, 2).") + + if np.any(c < 0): + raise ValueError("All values in `table` must be nonnegative.") + + if 0 in c.sum(axis=0) or 0 in c.sum(axis=1): + # If both values in a row or column are zero, the p-value is 1 and + # the odds ratio is NaN. + return np.nan, 1.0 + + if c[1, 0] > 0 and c[0, 1] > 0: + oddsratio = c[0, 0] * c[1, 1] / float(c[1, 0] * c[0, 1]) + else: + oddsratio = np.inf + + n1 = c[0, 0] + c[0, 1] + n2 = c[1, 0] + c[1, 1] + n = c[0, 0] + c[1, 0] + + def binary_search(n, n1, n2, side): + """Binary search for where to begin lower/upper halves in two-sided + test. + """ + if side == "upper": + minval = mode + maxval = n + else: + minval = 0 + maxval = mode + guess = -1 + while maxval - minval > 1: + if maxval == minval + 1 and guess == minval: + guess = maxval + else: + guess = (maxval + minval) // 2 + pguess = hypergeom.pmf(guess, n1 + n2, n1, n) + if side == "upper": + ng = guess - 1 + else: + ng = guess + 1 + if pguess <= pexact and hypergeom.pmf(ng, n1 + n2, n1, n) > pexact: + break + elif pguess < pexact: + maxval = guess + else: + minval = guess + if guess == -1: + guess = minval + if side == "upper": + while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: + guess -= 1 + while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: + guess += 1 + else: + while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: + guess += 1 + while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: + guess -= 1 + return guess + + if alternative == 'less': + pvalue = hypergeom.cdf(c[0, 0], n1 + n2, n1, n) + elif alternative == 'greater': + # Same formula as the 'less' case, but with the second column. + pvalue = hypergeom.cdf(c[0, 1], n1 + n2, n1, c[0, 1] + c[1, 1]) + elif alternative == 'two-sided': + mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2)) + pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n) + pmode = hypergeom.pmf(mode, n1 + n2, n1, n) + + epsilon = 1 - 1e-4 + if float(np.abs(pexact - pmode)) / np.abs(np.max(pexact, pmode)) <= 1 - epsilon: + return oddsratio, 1. + + elif c[0, 0] < mode: + plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n) + if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon: + return oddsratio, plower + + guess = binary_search(n, n1, n2, "upper") + pvalue = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n) + else: + pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n) + if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon: + return oddsratio, pupper + + guess = binary_search(n, n1, n2, "lower") + pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n) + else: + msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}" + raise ValueError(msg) + + if pvalue > 1.0: + pvalue = 1.0 + return oddsratio, pvalue + + +def spearmanr(a, b=None, axis=0): + """ + Calculates a Spearman rank-order correlation coefficient and the p-value + to test for non-correlation. + + The Spearman correlation is a nonparametric measure of the monotonicity + of the relationship between two datasets. Unlike the Pearson correlation, + the Spearman correlation does not assume that both datasets are normally + distributed. Like other correlation coefficients, this one varies + between -1 and +1 with 0 implying no correlation. Correlations of -1 or + +1 imply an exact monotonic relationship. 
Positive correlations imply that + as x increases, so does y. Negative correlations imply that as x + increases, y decreases. + + The p-value roughly indicates the probability of an uncorrelated system + producing datasets that have a Spearman correlation at least as extreme + as the one computed from these datasets. The p-values are not entirely + reliable but are probably reasonable for datasets larger than 500 or so. + + Parameters + ---------- + a, b : 1D or 2D array_like, b is optional + One or two 1-D or 2-D arrays containing multiple variables and + observations. Each column of `a` and `b` represents a variable, and + each row entry a single observation of those variables. See also + `axis`. Both arrays need to have the same length in the `axis` + dimension. + axis : int or None, optional + If axis=0 (default), then each column represents a variable, with + observations in the rows. If axis=0, the relationship is transposed: + each row represents a variable, while the columns contain observations. + If axis=None, then both arrays will be raveled. + + Returns + ------- + rho : float or ndarray (2-D square) + Spearman correlation matrix or correlation coefficient (if only 2 + variables are given as parameters. Correlation matrix is square with + length equal to total number of variables (columns or rows) in a and b + combined. + p-value : float + The two-sided p-value for a hypothesis test whose null hypothesis is + that two sets of data are uncorrelated, has same dimension as rho. + + Notes + ----- + Changes in scipy 0.8.0: rewrite to add tie-handling, and axis. + + References + ---------- + [CRCProbStat2000]_ Section 14.7 + + .. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + + Examples + -------- + >>> spearmanr([1,2,3,4,5],[5,6,7,8,7]) + (0.82078268166812329, 0.088587005313543798) + >>> np.random.seed(1234321) + >>> x2n=np.random.randn(100,2) + >>> y2n=np.random.randn(100,2) + >>> spearmanr(x2n) + (0.059969996999699973, 0.55338590803773591) + >>> spearmanr(x2n[:,0], x2n[:,1]) + (0.059969996999699973, 0.55338590803773591) + >>> rho, pval = spearmanr(x2n,y2n) + >>> rho + array([[ 1. , 0.05997 , 0.18569457, 0.06258626], + [ 0.05997 , 1. , 0.110003 , 0.02534653], + [ 0.18569457, 0.110003 , 1. , 0.03488749], + [ 0.06258626, 0.02534653, 0.03488749, 1. ]]) + >>> pval + array([[ 0. , 0.55338591, 0.06435364, 0.53617935], + [ 0.55338591, 0. , 0.27592895, 0.80234077], + [ 0.06435364, 0.27592895, 0. , 0.73039992], + [ 0.53617935, 0.80234077, 0.73039992, 0. ]]) + >>> rho, pval = spearmanr(x2n.T, y2n.T, axis=1) + >>> rho + array([[ 1. , 0.05997 , 0.18569457, 0.06258626], + [ 0.05997 , 1. , 0.110003 , 0.02534653], + [ 0.18569457, 0.110003 , 1. , 0.03488749], + [ 0.06258626, 0.02534653, 0.03488749, 1. 
]]) + >>> spearmanr(x2n, y2n, axis=None) + (0.10816770419260482, 0.1273562188027364) + >>> spearmanr(x2n.ravel(), y2n.ravel()) + (0.10816770419260482, 0.1273562188027364) + + >>> xint = np.random.randint(10,size=(100,2)) + >>> spearmanr(xint) + (0.052760927029710199, 0.60213045837062351) + + """ + a, axisout = _chk_asarray(a, axis) + ar = np.apply_along_axis(rankdata, axisout, a) + + br = None + if not b is None: + b, axisout = _chk_asarray(b, axis) + br = np.apply_along_axis(rankdata, axisout, b) + n = a.shape[axisout] + rs = np.corrcoef(ar, br, rowvar=axisout) + + olderr = np.seterr(divide='ignore') # rs can have elements equal to 1 + try: + t = rs * np.sqrt((n - 2) / ((rs + 1.0) * (1.0 - rs))) + finally: + np.seterr(**olderr) + prob = distributions.t.sf(np.abs(t), n - 2) * 2 + + if rs.shape == (2, 2): + return rs[1, 0], prob[1, 0] + else: + return rs, prob + + +def pointbiserialr(x, y): + """Calculates a point biserial correlation coefficient and the associated + p-value. + + The point biserial correlation is used to measure the relationship + between a binary variable, x, and a continuous variable, y. Like other + correlation coefficients, this one varies between -1 and +1 with 0 + implying no correlation. Correlations of -1 or +1 imply a determinative + relationship. + + This function uses a shortcut formula but produces the same result as + `pearsonr`. + + Parameters + ---------- + x : array_like of bools + Input array. + y : array_like + Input array. + + Returns + ------- + r : float + R value + p-value : float + 2-tailed p-value + + References + ---------- + http://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient + + Examples + -------- + >>> from scipy import stats + >>> a = np.array([0, 0, 0, 1, 1, 1, 1]) + >>> b = np.arange(7) + >>> stats.pointbiserialr(a, b) + (0.8660254037844386, 0.011724811003954652) + >>> stats.pearsonr(a, b) + (0.86602540378443871, 0.011724811003954626) + >>> np.corrcoef(a, b) + array([[ 1. , 0.8660254], + [ 0.8660254, 1. ]]) + + """ + x = np.asarray(x, dtype=bool) + y = np.asarray(y, dtype=float) + n = len(x) + + # phat is the fraction of x values that are True + phat = x.sum() / float(len(x)) + y0 = y[~x] # y-values where x is False + y1 = y[x] # y-values where x is True + y0m = y0.mean() + y1m = y1.mean() + + # phat - phat**2 is more stable than phat*(1-phat) + rpb = (y1m - y0m) * np.sqrt(phat - phat ** 2) / y.std() + + df = n - 2 + # fixme: see comment about TINY in pearsonr() + TINY = 1e-20 + t = rpb * np.sqrt(df / ((1.0 - rpb + TINY) * (1.0 + rpb + TINY))) + prob = betai(0.5 * df, 0.5, df / (df + t * t)) + return rpb, prob + + +def kendalltau(x, y, initial_lexsort=True): + """ + Calculates Kendall's tau, a correlation measure for ordinal data. + + Kendall's tau is a measure of the correspondence between two rankings. + Values close to 1 indicate strong agreement, values close to -1 indicate + strong disagreement. This is the tau-b version of Kendall's tau which + accounts for ties. + + Parameters + ---------- + x, y : array_like + Arrays of rankings, of the same shape. If arrays are not 1-D, they will + be flattened to 1-D. + initial_lexsort : bool, optional + Whether to use lexsort or quicksort as the sorting method for the + initial sort of the inputs. Default is lexsort (True), for which + `kendalltau` is of complexity O(n log(n)). If False, the complexity is + O(n^2), but with a smaller pre-factor (so quicksort may be faster for + small arrays). + + Returns + ------- + Kendall's tau : float + The tau statistic. 
+ p-value : float + The two-sided p-value for a hypothesis test whose null hypothesis is + an absence of association, tau = 0. + + Notes + ----- + The definition of Kendall's tau that is used is:: + + tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U)) + + where P is the number of concordant pairs, Q the number of discordant + pairs, T the number of ties only in `x`, and U the number of ties only in + `y`. If a tie occurs for the same pair in both `x` and `y`, it is not + added to either T or U. + + References + ---------- + W.R. Knight, "A Computer Method for Calculating Kendall's Tau with + Ungrouped Data", Journal of the American Statistical Association, Vol. 61, + No. 314, Part 1, pp. 436-439, 1966. + + Examples + -------- + >>> x1 = [12, 2, 1, 12, 2] + >>> x2 = [1, 4, 7, 1, 0] + >>> tau, p_value = sp.stats.kendalltau(x1, x2) + >>> tau + -0.47140452079103173 + >>> p_value + 0.24821309157521476 + + """ + + x = np.asarray(x).ravel() + y = np.asarray(y).ravel() + n = np.int64(len(x)) + temp = list(range(n)) # support structure used by mergesort + # this closure recursively sorts sections of perm[] by comparing + # elements of y[perm[]] using temp[] as support + # returns the number of swaps required by an equivalent bubble sort + + def mergesort(offs, length): + exchcnt = 0 + if length == 1: + return 0 + if length == 2: + if y[perm[offs]] <= y[perm[offs + 1]]: + return 0 + t = perm[offs] + perm[offs] = perm[offs + 1] + perm[offs + 1] = t + return 1 + length0 = length // 2 + length1 = length - length0 + middle = offs + length0 + exchcnt += mergesort(offs, length0) + exchcnt += mergesort(middle, length1) + if y[perm[middle - 1]] < y[perm[middle]]: + return exchcnt + # merging + i = j = k = 0 + while j < length0 or k < length1: + if k >= length1 or (j < length0 and y[perm[offs + j]] <= + y[perm[middle + k]]): + temp[i] = perm[offs + j] + d = i - j + j += 1 + else: + temp[i] = perm[middle + k] + d = (offs + i) - (middle + k) + k += 1 + if d > 0: + exchcnt += d + i += 1 + perm[offs:offs + length] = temp[0:length] + return exchcnt + + # initial sort on values of x and, if tied, on values of y + if initial_lexsort: + # sort implemented as mergesort, worst case: O(n log(n)) + perm = np.lexsort((y, x)) + else: + # sort implemented as quicksort, 30% faster but with worst case: O(n^2) + perm = list(range(n)) + perm.sort(key=lambda a: (x[a], y[a])) + + # compute joint ties + first = 0 + t = 0 + for i in xrange(1, n): + if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]: + t += ((i - first) * (i - first - 1)) // 2 + first = i + t += ((n - first) * (n - first - 1)) // 2 + + # compute ties in x + first = 0 + u = 0 + for i in xrange(1, n): + if x[perm[first]] != x[perm[i]]: + u += ((i - first) * (i - first - 1)) // 2 + first = i + u += ((n - first) * (n - first - 1)) // 2 + + # count exchanges + exchanges = mergesort(0, n) + # compute ties in y after mergesort with counting + first = 0 + v = 0 + for i in xrange(1, n): + if y[perm[first]] != y[perm[i]]: + v += ((i - first) * (i - first - 1)) // 2 + first = i + v += ((n - first) * (n - first - 1)) // 2 + + tot = (n * (n - 1)) // 2 + if tot == u or tot == v: + return (np.nan, np.nan) # Special case for all ties in both ranks + + # Prevent overflow; equal to np.sqrt((tot - u) * (tot - v)) + denom = np.exp(0.5 * (np.log(tot - u) + np.log(tot - v))) + tau = ((tot - (v + u - t)) - 2.0 * exchanges) / denom + + # what follows reproduces the ending of Gary Strangman's original + # stats.kendalltau() in SciPy + svar = (4.0 * n + 10.0) / (9.0 * n * (n - 1)) 
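+ # svar is the asymptotic variance of tau under the null hypothesis of no
+ # association (without a tie correction); the two-sided p-value below is
+ # taken from the standard normal tail via erfc.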
+ z = tau / np.sqrt(svar) + prob = special.erfc(np.abs(z) / 1.4142136) + + return tau, prob + + +def linregress(x, y=None): + """ + Calculate a regression line + + This computes a least-squares regression for two sets of measurements. + + Parameters + ---------- + x, y : array_like + two sets of measurements. Both arrays should have the same length. + If only x is given (and y=None), then it must be a two-dimensional + array where one dimension has length 2. The two sets of measurements + are then found by splitting the array along the length-2 dimension. + + Returns + ------- + slope : float + slope of the regression line + intercept : float + intercept of the regression line + r-value : float + correlation coefficient + p-value : float + two-sided p-value for a hypothesis test whose null hypothesis is + that the slope is zero. + stderr : float + Standard error of the estimate + + + Examples + -------- + >>> from scipy import stats + >>> import numpy as np + >>> x = np.random.random(10) + >>> y = np.random.random(10) + >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x,y) + + # To get coefficient of determination (r_squared) + + >>> print "r-squared:", r_value**2 + r-squared: 0.15286643777 + + """ + TINY = 1.0e-20 + if y is None: # x is a (2, N) or (N, 2) shaped array_like + x = asarray(x) + if x.shape[0] == 2: + x, y = x + elif x.shape[1] == 2: + x, y = x.T + else: + msg = "If only `x` is given as input, it has to be of shape (2, N) \ + or (N, 2), provided shape was %s" % str(x.shape) + raise ValueError(msg) + else: + x = asarray(x) + y = asarray(y) + n = len(x) + xmean = np.mean(x, None) + ymean = np.mean(y, None) + + # average sum of squares: + ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat + r_num = ssxym + r_den = np.sqrt(ssxm * ssym) + if r_den == 0.0: + r = 0.0 + else: + r = r_num / r_den + # test for numerical error propagation + if (r > 1.0): + r = 1.0 + elif (r < -1.0): + r = -1.0 + + df = n - 2 + t = r * np.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))) + prob = distributions.t.sf(np.abs(t), df) * 2 + slope = r_num / ssxm + intercept = ymean - slope * xmean + sterrest = np.sqrt((1 - r * r) * ssym / ssxm / df) + return slope, intercept, r, prob, sterrest + + +# +# INFERENTIAL STATISTICS ##### +# + +def ttest_1samp(a, popmean, axis=0): + """ + Calculates the T-test for the mean of ONE group of scores. + + This is a two-sided test for the null hypothesis that the expected value + (mean) of a sample of independent observations `a` is equal to the given + population mean, `popmean`. + + Parameters + ---------- + a : array_like + sample observation + popmean : float or array_like + expected value in null hypothesis, if array_like than it must have the + same shape as `a` excluding the axis dimension + axis : int, optional, (default axis=0) + Axis can equal None (ravel array first), or an integer (the axis + over which to operate on a). + + Returns + ------- + t : float or array + t-statistic + prob : float or array + two-tailed p-value + + Examples + -------- + >>> from scipy import stats + + >>> np.random.seed(7654567) # fix seed to get the same result + >>> rvs = stats.norm.rvs(loc=5, scale=10, size=(50,2)) + + Test if mean of random sample is equal to true mean, and different mean. + We reject the null hypothesis in the second case and don't reject it in + the first case. 
+ + >>> stats.ttest_1samp(rvs,5.0) + (array([-0.68014479, -0.04323899]), array([ 0.49961383, 0.96568674])) + >>> stats.ttest_1samp(rvs,0.0) + (array([ 2.77025808, 4.11038784]), array([ 0.00789095, 0.00014999])) + + Examples using axis and non-scalar dimension for population mean. + + >>> stats.ttest_1samp(rvs,[5.0,0.0]) + (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) + >>> stats.ttest_1samp(rvs.T,[5.0,0.0],axis=1) + (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) + >>> stats.ttest_1samp(rvs,[[5.0],[0.0]]) + (array([[-0.68014479, -0.04323899], + [ 2.77025808, 4.11038784]]), array([[ 4.99613833e-01, 9.65686743e-01], + [ 7.89094663e-03, 1.49986458e-04]])) + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + df = n - 1 + + d = np.mean(a, axis) - popmean + v = np.var(a, axis, ddof=1) + denom = np.sqrt(v / float(n)) + + t = np.divide(d, denom) + t, prob = _ttest_finish(df, t) + + return t, prob + + +def _ttest_finish(df, t): + """Common code between all 3 t-test functions.""" + # use np.abs to get upper tail + prob = distributions.t.sf(np.abs(t), df) * 2 + if t.ndim == 0: + t = t[()] + + return t, prob + + +def ttest_ind(a, b, axis=0, equal_var=True): + """ + Calculates the T-test for the means of TWO INDEPENDENT samples of scores. + + This is a two-sided test for the null hypothesis that 2 independent samples + have identical average (expected) values. This test assumes that the + populations have identical variances. + + Parameters + ---------- + a, b : array_like + The arrays must have the same shape, except in the dimension + corresponding to `axis` (the first, by default). + axis : int, optional + Axis can equal None (ravel array first), or an integer (the axis + over which to operate on a and b). + equal_var : bool, optional + If True (default), perform a standard independent 2 sample test + that assumes equal population variances [1]_. + If False, perform Welch's t-test, which does not assume equal + population variance [2]_. + + .. versionadded:: 0.11.0 + + Returns + ------- + t : float or array + The calculated t-statistic. + prob : float or array + The two-tailed p-value. + + Notes + ----- + We can use this test, if we observe two independent samples from + the same or different population, e.g. exam scores of boys and + girls or of two ethnic groups. The test measures whether the + average (expected) value differs significantly across samples. If + we observe a large p-value, for example larger than 0.05 or 0.1, + then we cannot reject the null hypothesis of identical average scores. + If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%, + then we reject the null hypothesis of equal averages. + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test + + .. 
[2] http://en.wikipedia.org/wiki/Welch%27s_t_test + + Examples + -------- + >>> from scipy import stats + >>> np.random.seed(12345678) + + Test with sample with identical means: + + >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) + >>> rvs2 = stats.norm.rvs(loc=5,scale=10,size=500) + >>> stats.ttest_ind(rvs1,rvs2) + (0.26833823296239279, 0.78849443369564776) + >>> stats.ttest_ind(rvs1,rvs2, equal_var = False) + (0.26833823296239279, 0.78849452749500748) + + `ttest_ind` underestimates p for unequal variances: + + >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500) + >>> stats.ttest_ind(rvs1, rvs3) + (-0.46580283298287162, 0.64145827413436174) + >>> stats.ttest_ind(rvs1, rvs3, equal_var = False) + (-0.46580283298287162, 0.64149646246569292) + + When n1 != n2, the equal variance t-statistic is no longer equal to the + unequal variance t-statistic: + + >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100) + >>> stats.ttest_ind(rvs1, rvs4) + (-0.99882539442782481, 0.3182832709103896) + >>> stats.ttest_ind(rvs1, rvs4, equal_var = False) + (-0.69712570584654099, 0.48716927725402048) + + T-test with different means, variance, and n: + + >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100) + >>> stats.ttest_ind(rvs1, rvs5) + (-1.4679669854490653, 0.14263895620529152) + >>> stats.ttest_ind(rvs1, rvs5, equal_var = False) + (-0.94365973617132992, 0.34744170334794122) + + """ + a, b, axis = _chk2_asarray(a, b, axis) + if a.size == 0 or b.size == 0: + return (np.nan, np.nan) + + v1 = np.var(a, axis, ddof=1) + v2 = np.var(b, axis, ddof=1) + n1 = a.shape[axis] + n2 = b.shape[axis] + + if (equal_var): + df = n1 + n2 - 2 + svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df) + denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2)) + else: + vn1 = v1 / n1 + vn2 = v2 / n2 + df = ((vn1 + vn2) ** 2) / \ + ((vn1 ** 2) / (n1 - 1) + (vn2 ** 2) / (n2 - 1)) + + # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0). + # Hence it doesn't matter what df is as long as it's not NaN. + df = np.where(np.isnan(df), 1, df) + denom = np.sqrt(vn1 + vn2) + + d = np.mean(a, axis) - np.mean(b, axis) + t = np.divide(d, denom) + t, prob = _ttest_finish(df, t) + + return t, prob + + +def ttest_rel(a, b, axis=0): + """ + Calculates the T-test on TWO RELATED samples of scores, a and b. + + This is a two-sided test for the null hypothesis that 2 related or + repeated samples have identical average (expected) values. + + Parameters + ---------- + a, b : array_like + The arrays must have the same shape. + axis : int, optional, (default axis=0) + Axis can equal None (ravel array first), or an integer (the axis + over which to operate on a and b). + + Returns + ------- + t : float or array + t-statistic + prob : float or array + two-tailed p-value + + Notes + ----- + Examples for the use are scores of the same set of student in + different exams, or repeated sampling from the same units. The + test measures whether the average score differs significantly + across samples (e.g. exams). If we observe a large p-value, for + example greater than 0.05 or 0.1 then we cannot reject the null + hypothesis of identical average scores. If the p-value is smaller + than the threshold, e.g. 1%, 5% or 10%, then we reject the null + hypothesis of equal averages. Small p-values are associated with + large t-statistics. 
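+
+ The statistic is computed from the paired differences ``d = a - b`` as
+ ``mean(d) / (std(d, ddof=1) / sqrt(n))`` and compared against a t
+ distribution with ``n - 1`` degrees of freedom.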
+ + References + ---------- + http://en.wikipedia.org/wiki/T-test#Dependent_t-test + + Examples + -------- + >>> from scipy import stats + >>> np.random.seed(12345678) # fix random seed to get same numbers + + >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) + >>> rvs2 = (stats.norm.rvs(loc=5,scale=10,size=500) + + ... stats.norm.rvs(scale=0.2,size=500)) + >>> stats.ttest_rel(rvs1,rvs2) + (0.24101764965300962, 0.80964043445811562) + >>> rvs3 = (stats.norm.rvs(loc=8,scale=10,size=500) + + ... stats.norm.rvs(scale=0.2,size=500)) + >>> stats.ttest_rel(rvs1,rvs3) + (-3.9995108708727933, 7.3082402191726459e-005) + + """ + a, b, axis = _chk2_asarray(a, b, axis) + if a.shape[axis] != b.shape[axis]: + raise ValueError('unequal length arrays') + + if a.size == 0 or b.size == 0: + return (np.nan, np.nan) + + n = a.shape[axis] + df = float(n - 1) + + d = (a - b).astype(np.float64) + v = np.var(d, axis, ddof=1) + dm = np.mean(d, axis) + denom = np.sqrt(v / float(n)) + + t = np.divide(dm, denom) + t, prob = _ttest_finish(df, t) + + return t, prob + + +def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='approx'): + """ + Perform the Kolmogorov-Smirnov test for goodness of fit. + + This performs a test of the distribution G(x) of an observed + random variable against a given distribution F(x). Under the null + hypothesis the two distributions are identical, G(x)=F(x). The + alternative hypothesis can be either 'two-sided' (default), 'less' + or 'greater'. The KS test is only valid for continuous distributions. + + Parameters + ---------- + rvs : str, array or callable + If a string, it should be the name of a distribution in `scipy.stats`. + If an array, it should be a 1-D array of observations of random + variables. + If a callable, it should be a function to generate random variables; + it is required to have a keyword argument `size`. + cdf : str or callable + If a string, it should be the name of a distribution in `scipy.stats`. + If `rvs` is a string then `cdf` can be False or the same as `rvs`. + If a callable, that callable is used to calculate the cdf. + args : tuple, sequence, optional + Distribution parameters, used if `rvs` or `cdf` are strings. + N : int, optional + Sample size if `rvs` is string or callable. Default is 20. + alternative : {'two-sided', 'less','greater'}, optional + Defines the alternative hypothesis (see explanation above). + Default is 'two-sided'. + mode : 'approx' (default) or 'asymp', optional + Defines the distribution used for calculating the p-value. + + - 'approx' : use approximation to exact distribution of test statistic + - 'asymp' : use asymptotic distribution of test statistic + + Returns + ------- + D : float + KS test statistic, either D, D+ or D-. + p-value : float + One-tailed or two-tailed p-value. + + Notes + ----- + In the one-sided test, the alternative is that the empirical + cumulative distribution function of the random variable is "less" + or "greater" than the cumulative distribution function F(x) of the + hypothesis, ``G(x)<=F(x)``, resp. ``G(x)>=F(x)``. 
+ + Examples + -------- + >>> from scipy import stats + + >>> x = np.linspace(-15, 15, 9) + >>> stats.kstest(x, 'norm') + (0.44435602715924361, 0.038850142705171065) + + >>> np.random.seed(987654321) # set random seed to get the same result + >>> stats.kstest('norm', False, N=100) + (0.058352892479417884, 0.88531190944151261) + + The above lines are equivalent to: + + >>> np.random.seed(987654321) + >>> stats.kstest(stats.norm.rvs(size=100), 'norm') + (0.058352892479417884, 0.88531190944151261) + + *Test against one-sided alternative hypothesis* + + Shift distribution to larger values, so that ``cdf_dgp(x) < norm.cdf(x)``: + + >>> np.random.seed(987654321) + >>> x = stats.norm.rvs(loc=0.2, size=100) + >>> stats.kstest(x,'norm', alternative = 'less') + (0.12464329735846891, 0.040989164077641749) + + Reject equal distribution against alternative hypothesis: less + + >>> stats.kstest(x,'norm', alternative = 'greater') + (0.0072115233216311081, 0.98531158590396395) + + Don't reject equal distribution against alternative hypothesis: greater + + >>> stats.kstest(x,'norm', mode='asymp') + (0.12464329735846891, 0.08944488871182088) + + *Testing t distributed random variables against normal distribution* + + With 100 degrees of freedom the t distribution looks close to the normal + distribution, and the K-S test does not reject the hypothesis that the + sample came from the normal distribution: + + >>> np.random.seed(987654321) + >>> stats.kstest(stats.t.rvs(100,size=100),'norm') + (0.072018929165471257, 0.67630062862479168) + + With 3 degrees of freedom the t distribution looks sufficiently different + from the normal distribution, that we can reject the hypothesis that the + sample came from the normal distribution at the 10% level: + + >>> np.random.seed(987654321) + >>> stats.kstest(stats.t.rvs(3,size=100),'norm') + (0.131016895759829, 0.058826222555312224) + + """ + if isinstance(rvs, string_types): + if (not cdf) or (cdf == rvs): + cdf = getattr(distributions, rvs).cdf + rvs = getattr(distributions, rvs).rvs + else: + raise AttributeError("if rvs is string, cdf has to be the " + "same distribution") + + if isinstance(cdf, string_types): + cdf = getattr(distributions, cdf).cdf + if callable(rvs): + kwds = {'size': N} + vals = np.sort(rvs(*args, **kwds)) + else: + vals = np.sort(rvs) + N = len(vals) + cdfvals = cdf(vals, *args) + + # to not break compatibility with existing code + if alternative == 'two_sided': + alternative = 'two-sided' + + if alternative in ['two-sided', 'greater']: + Dplus = (np.arange(1.0, N + 1) / N - cdfvals).max() + if alternative == 'greater': + return Dplus, distributions.ksone.sf(Dplus, N) + + if alternative in ['two-sided', 'less']: + Dmin = (cdfvals - np.arange(0.0, N) / N).max() + if alternative == 'less': + return Dmin, distributions.ksone.sf(Dmin, N) + + if alternative == 'two-sided': + D = np.max([Dplus, Dmin]) + if mode == 'asymp': + return D, distributions.kstwobign.sf(D * np.sqrt(N)) + if mode == 'approx': + pval_two = distributions.kstwobign.sf(D * np.sqrt(N)) + if N > 2666 or pval_two > 0.80 - N * 0.3 / 1000.0: + return D, distributions.kstwobign.sf(D * np.sqrt(N)) + else: + return D, distributions.ksone.sf(D, N) * 2 + + +# Map from names to lambda_ values used in power_divergence(). +_power_div_lambda_names = { + "pearson": 1, + "log-likelihood": 0, + "freeman-tukey": -0.5, + "mod-log-likelihood": -1, + "neyman": -2, + "cressie-read": 2 / 3, +} + + +def _count(a, axis=None): + """ + Count the number of non-masked elements of an array. 
+ + This function behaves like np.ma.count(), but is much faster + for ndarrays. + """ + if hasattr(a, 'count'): + num = a.count(axis=axis) + if isinstance(num, np.ndarray) and num.ndim == 0: + # In some cases, the `count` method returns a scalar array (e.g. + # np.array(3)), but we want a plain integer. + num = int(num) + else: + if axis is None: + num = a.size + else: + num = a.shape[axis] + return num + + +def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): + """ + Cressie-Read power divergence statistic and goodness of fit test. + + This function tests the null hypothesis that the categorical data + has the given frequencies, using the Cressie-Read power divergence + statistic. + + Parameters + ---------- + f_obs : array_like + Observed frequencies in each category. + f_exp : array_like, optional + Expected frequencies in each category. By default the categories are + assumed to be equally likely. + ddof : int, optional + "Delta degrees of freedom": adjustment to the degrees of freedom + for the p-value. The p-value is computed using a chi-squared + distribution with ``k - 1 - ddof`` degrees of freedom, where `k` + is the number of observed frequencies. The default value of `ddof` + is 0. + axis : int or None, optional + The axis of the broadcast result of `f_obs` and `f_exp` along which to + apply the test. If axis is None, all values in `f_obs` are treated + as a single data set. Default is 0. + lambda_ : float or str, optional + `lambda_` gives the power in the Cressie-Read power divergence + statistic. The default is 1. For convenience, `lambda_` may be + assigned one of the following strings, in which case the + corresponding numerical value is used:: + + String Value Description + "pearson" 1 Pearson's chi-squared statistic. + In this case, the function is + equivalent to `stats.chisquare`. + "log-likelihood" 0 Log-likelihood ratio. Also known as + the G-test [3]_. + "freeman-tukey" -1/2 Freeman-Tukey statistic. + "mod-log-likelihood" -1 Modified log-likelihood ratio. + "neyman" -2 Neyman's statistic. + "cressie-read" 2/3 The power recommended in [5]_. + + Returns + ------- + stat : float or ndarray + The Cressie-Read power divergence test statistic. The value is + a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D. + p : float or ndarray + The p-value of the test. The value is a float if `ddof` and the + return value `stat` are scalars. + + See Also + -------- + chisquare + + Notes + ----- + This test is invalid when the observed or expected frequencies in each + category are too small. A typical rule is that all of the observed + and expected frequencies should be at least 5. + + When `lambda_` is less than zero, the formula for the statistic involves + dividing by `f_obs`, so a warning or error may be generated if any value + in `f_obs` is 0. + + Similarly, a warning or error may be generated if any value in `f_exp` is + zero when `lambda_` >= 0. + + The default degrees of freedom, k-1, are for the case when no parameters + of the distribution are estimated. If p parameters are estimated by + efficient maximum likelihood then the correct degrees of freedom are + k-1-p. If the parameters are estimated in a different way, then the + dof can be between k-1-p and k-1. However, it is also possible that + the asymptotic distribution is not a chisquare, in which case this + test is not appropriate. + + This function handles masked arrays. 
If an element of `f_obs` or `f_exp` + is masked, then data at that position is ignored, and does not count + towards the size of the data set. + + .. versionadded:: 0.13.0 + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 8. http://faculty.vassar.edu/lowry/ch8pt1.html + .. [2] "Chi-squared test", http://en.wikipedia.org/wiki/Chi-squared_test + .. [3] "G-test", http://en.wikipedia.org/wiki/G-test + .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and + practice of statistics in biological research", New York: Freeman + (1981) + .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit + Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), + pp. 440-464. + + Examples + -------- + + (See `chisquare` for more examples.) + + When just `f_obs` is given, it is assumed that the expected frequencies + are uniform and given by the mean of the observed frequencies. Here we + perform a G-test (i.e. use the log-likelihood ratio statistic): + + >>> power_divergence([16, 18, 16, 14, 12, 12], method='log-likelihood') + (2.006573162632538, 0.84823476779463769) + + The expected frequencies can be given with the `f_exp` argument: + + >>> power_divergence([16, 18, 16, 14, 12, 12], + ... f_exp=[16, 16, 16, 16, 16, 8], + ... lambda_='log-likelihood') + (3.5, 0.62338762774958223) + + When `f_obs` is 2-D, by default the test is applied to each column. + + >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T + >>> obs.shape + (6, 2) + >>> power_divergence(obs, lambda_="log-likelihood") + (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) + + By setting ``axis=None``, the test is applied to all data in the array, + which is equivalent to applying the test to the flattened array. + + >>> power_divergence(obs, axis=None) + (23.31034482758621, 0.015975692534127565) + >>> power_divergence(obs.ravel()) + (23.31034482758621, 0.015975692534127565) + + `ddof` is the change to make to the default degrees of freedom. + + >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1) + (2.0, 0.73575888234288467) + + The calculation of the p-values is done by broadcasting the + test statistic with `ddof`. + + >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) + (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) + + `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has + shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting + `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared + statistics, we must use ``axis=1``: + + >>> power_divergence([16, 18, 16, 14, 12, 12], + ... f_exp=[[16, 16, 16, 16, 16, 8], + ... [8, 20, 20, 16, 12, 12]], + ... axis=1) + (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) + + """ + # Convert the input argument `lambda_` to a numerical value. + if isinstance(lambda_, string_types): + if lambda_ not in _power_div_lambda_names: + names = repr(list(_power_div_lambda_names.keys()))[1:-1] + raise ValueError("invalid string for lambda_: {0!r}. Valid strings " + "are {1}".format(lambda_, names)) + lambda_ = _power_div_lambda_names[lambda_] + elif lambda_ is None: + lambda_ = 1 + + f_obs = np.asanyarray(f_obs) + + if f_exp is not None: + f_exp = np.atleast_1d(np.asanyarray(f_exp)) + else: + # Compute the equivalent of + # f_exp = f_obs.mean(axis=axis, keepdims=True) + # Older versions of numpy do not have the 'keepdims' argument, so + # we have to do a little work to achieve the same result. 
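        # (np.mean collapses `axis`, so the mean is reshaped back to a
        # length-1 axis below, keeping f_exp broadcastable against f_obs.)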
+ # Ignore 'invalid' errors so the edge case of a data set with length 0 + # is handled without spurious warnings. + with np.errstate(invalid='ignore'): + f_exp = np.atleast_1d(f_obs.mean(axis=axis)) + if axis is not None: + reduced_shape = list(f_obs.shape) + reduced_shape[axis] = 1 + f_exp.shape = reduced_shape + + # `terms` is the array of terms that are summed along `axis` to create + # the test statistic. We use some specialized code for a few special + # cases of lambda_. + if lambda_ == 1: + # Pearson's chi-squared statistic + terms = (f_obs - f_exp) ** 2 / f_exp + elif lambda_ == 0: + # Log-likelihood ratio (i.e. G-test) + terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) + elif lambda_ == -1: + # Modified log-likelihood ratio + terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) + else: + # General Cressie-Read power divergence. + terms = f_obs * ((f_obs / f_exp) ** lambda_ - 1) + terms /= 0.5 * lambda_ * (lambda_ + 1) + + stat = terms.sum(axis=axis) + + num_obs = _count(terms, axis=axis) + ddof = asarray(ddof) + p = chisqprob(stat, num_obs - 1 - ddof) + + return stat, p + + +def chisquare(f_obs, f_exp=None, ddof=0, axis=0): + """ + Calculates a one-way chi square test. + + The chi square test tests the null hypothesis that the categorical data + has the given frequencies. + + Parameters + ---------- + f_obs : array_like + Observed frequencies in each category. + f_exp : array_like, optional + Expected frequencies in each category. By default the categories are + assumed to be equally likely. + ddof : int, optional + "Delta degrees of freedom": adjustment to the degrees of freedom + for the p-value. The p-value is computed using a chi-squared + distribution with ``k - 1 - ddof`` degrees of freedom, where `k` + is the number of observed frequencies. The default value of `ddof` + is 0. + axis : int or None, optional + The axis of the broadcast result of `f_obs` and `f_exp` along which to + apply the test. If axis is None, all values in `f_obs` are treated + as a single data set. Default is 0. + + Returns + ------- + chisq : float or ndarray + The chi-squared test statistic. The value is a float if `axis` is + None or `f_obs` and `f_exp` are 1-D. + p : float or ndarray + The p-value of the test. The value is a float if `ddof` and the + return value `chisq` are scalars. + + See Also + -------- + power_divergence + mstats.chisquare + + Notes + ----- + This test is invalid when the observed or expected frequencies in each + category are too small. A typical rule is that all of the observed + and expected frequencies should be at least 5. + + The default degrees of freedom, k-1, are for the case when no parameters + of the distribution are estimated. If p parameters are estimated by + efficient maximum likelihood then the correct degrees of freedom are + k-1-p. If the parameters are estimated in a different way, then the + dof can be between k-1-p and k-1. However, it is also possible that + the asymptotic distribution is not a chisquare, in which case this + test is not appropriate. + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 8. http://faculty.vassar.edu/lowry/ch8pt1.html + .. [2] "Chi-squared test", http://en.wikipedia.org/wiki/Chi-squared_test + + Examples + -------- + When just `f_obs` is given, it is assumed that the expected frequencies + are uniform and given by the mean of the observed frequencies. 
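    A minimal by-hand sketch of that default case (Pearson's statistic with a
    uniform expectation); it reproduces the first example below:

        import numpy as np
        from scipy import stats

        f_obs = np.array([16., 18., 16., 14., 12., 12.])
        f_exp = np.ones_like(f_obs) * f_obs.mean()       # uniform expectation
        chisq = ((f_obs - f_exp) ** 2 / f_exp).sum()     # Pearson statistic, here 2.0
        p = stats.chisqprob(chisq, len(f_obs) - 1)       # ~0.849 with 5 degrees of freedom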
+ + >>> chisquare([16, 18, 16, 14, 12, 12]) + (2.0, 0.84914503608460956) + + With `f_exp` the expected frequencies can be given. + + >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8]) + (3.5, 0.62338762774958223) + + When `f_obs` is 2-D, by default the test is applied to each column. + + >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T + >>> obs.shape + (6, 2) + >>> chisquare(obs) + (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415])) + + By setting ``axis=None``, the test is applied to all data in the array, + which is equivalent to applying the test to the flattened array. + + >>> chisquare(obs, axis=None) + (23.31034482758621, 0.015975692534127565) + >>> chisquare(obs.ravel()) + (23.31034482758621, 0.015975692534127565) + + `ddof` is the change to make to the default degrees of freedom. + + >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1) + (2.0, 0.73575888234288467) + + The calculation of the p-values is done by broadcasting the + chi-squared statistic with `ddof`. + + >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) + (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) + + `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has + shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting + `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared + statistics, we use ``axis=1``: + + >>> chisquare([16, 18, 16, 14, 12, 12], + ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]], + ... axis=1) + (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) + + """ + return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, + lambda_="pearson") + + +def ks_2samp(data1, data2): + """ + Computes the Kolmogorov-Smirnov statistic on 2 samples. + + This is a two-sided test for the null hypothesis that 2 independent samples + are drawn from the same continuous distribution. + + Parameters + ---------- + a, b : sequence of 1-D ndarrays + two arrays of sample observations assumed to be drawn from a continuous + distribution, sample sizes can be different + + Returns + ------- + D : float + KS statistic + p-value : float + two-tailed p-value + + Notes + ----- + This tests whether 2 samples are drawn from the same distribution. Note + that, like in the case of the one-sample K-S test, the distribution is + assumed to be continuous. + + This is the two-sided test, one-sided tests are not implemented. + The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution. + + If the K-S statistic is small or the p-value is high, then we cannot + reject the hypothesis that the distributions of the two samples + are the same. 
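    A rough sketch of the statistic itself (the samples below are
    illustrative): D is the largest distance between the two empirical CDFs
    evaluated on the pooled sample, and the asymptotic two-sided p-value
    follows from the Kolmogorov distribution:

        import numpy as np
        from scipy import stats, special

        np.random.seed(12345678)
        x = np.sort(stats.norm.rvs(size=200))
        y = np.sort(stats.norm.rvs(size=300, loc=0.5, scale=1.5))
        grid = np.concatenate([x, y])
        cdf1 = np.searchsorted(x, grid, side='right') / float(len(x))
        cdf2 = np.searchsorted(y, grid, side='right') / float(len(y))
        D = np.abs(cdf1 - cdf2).max()                         # two-sample KS statistic
        en = np.sqrt(len(x) * len(y) / float(len(x) + len(y)))
        p = special.kolmogorov((en + 0.12 + 0.11 / en) * D)   # asymptotic p-value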
+ + Examples + -------- + >>> from scipy import stats + >>> np.random.seed(12345678) #fix random seed to get the same result + >>> n1 = 200 # size of first sample + >>> n2 = 300 # size of second sample + + For a different distribution, we can reject the null hypothesis since the + pvalue is below 1%: + + >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1) + >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5) + >>> stats.ks_2samp(rvs1, rvs2) + (0.20833333333333337, 4.6674975515806989e-005) + + For a slightly different distribution, we cannot reject the null hypothesis + at a 10% or lower alpha since the p-value at 0.144 is higher than 10% + + >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0) + >>> stats.ks_2samp(rvs1, rvs3) + (0.10333333333333333, 0.14498781825751686) + + For an identical distribution, we cannot reject the null hypothesis since + the p-value is high, 41%: + + >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0) + >>> stats.ks_2samp(rvs1, rvs4) + (0.07999999999999996, 0.41126949729859719) + + """ + data1, data2 = map(asarray, (data1, data2)) + #n1 = data1.shape[0] + #n2 = data2.shape[0] + n1 = len(data1) + n2 = len(data2) + data1 = np.sort(data1) + data2 = np.sort(data2) + data_all = np.concatenate([data1, data2]) + cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0 * n1) + cdf2 = (np.searchsorted(data2, data_all, side='right')) / (1.0 * n2) + d = np.max(np.absolute(cdf1 - cdf2)) + # Note: d absolute not signed distance + en = np.sqrt(n1 * n2 / float(n1 + n2)) + try: + prob = ksprob((en + 0.12 + 0.11 / en) * d) + except: + prob = 1.0 + return d, prob + + +def mannwhitneyu(x, y, use_continuity=True): + """ + Computes the Mann-Whitney rank test on samples x and y. + + Parameters + ---------- + x, y : array_like + Array of samples, should be one-dimensional. + use_continuity : bool, optional + Whether a continuity correction (1/2.) should be taken into + account. Default is True. + + Returns + ------- + u : float + The Mann-Whitney statistics. + prob : float + One-sided p-value assuming a asymptotic normal distribution. + + Notes + ----- + Use only when the number of observation in each sample is > 20 and + you have 2 independent samples of ranks. Mann-Whitney U is + significant if the u-obtained is LESS THAN or equal to the critical + value of U. + + This test corrects for ties and by default uses a continuity correction. + The reported p-value is for a one-sided hypothesis, to get the two-sided + p-value multiply the returned p-value by 2. + + """ + x = asarray(x) + y = asarray(y) + n1 = len(x) + n2 = len(y) + ranked = rankdata(np.concatenate((x, y))) + rankx = ranked[0:n1] # get the x-ranks + # calc U for x + u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - np.sum(rankx, axis=0) + u2 = n1 * n2 - u1 # remainder is U for y + bigu = max(u1, u2) + smallu = min(u1, u2) + T = tiecorrect(ranked) + if T == 0: + raise ValueError('All numbers are identical in amannwhitneyu') + sd = np.sqrt(T * n1 * n2 * (n1 + n2 + 1) / 12.0) + + if use_continuity: + # normal approximation for prob calc with continuity correction + z = abs((bigu - 0.5 - n1 * n2 / 2.0) / sd) + else: + # normal approximation for prob calc + z = abs((bigu - n1 * n2 / 2.0) / sd) + return smallu, distributions.norm.sf(z) # (1.0 - zprob(z)) + + +def ranksums(x, y): + """ + Compute the Wilcoxon rank-sum statistic for two samples. + + The Wilcoxon rank-sum test tests the null hypothesis that two sets + of measurements are drawn from the same distribution. 
The alternative + hypothesis is that values in one sample are more likely to be + larger than the values in the other sample. + + This test should be used to compare two samples from continuous + distributions. It does not handle ties between measurements + in x and y. For tie-handling and an optional continuity correction + see `scipy.stats.mannwhitneyu`. + + Parameters + ---------- + x,y : array_like + The data from the two samples + + Returns + ------- + z-statistic : float + The test statistic under the large-sample approximation that the + rank sum statistic is normally distributed + p-value : float + The two-sided p-value of the test + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test + + """ + x, y = map(np.asarray, (x, y)) + n1 = len(x) + n2 = len(y) + alldata = np.concatenate((x, y)) + ranked = rankdata(alldata) + x = ranked[:n1] + y = ranked[n1:] + s = np.sum(x, axis=0) + expected = n1 * (n1 + n2 + 1) / 2.0 + z = (s - expected) / np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0) + prob = 2 * distributions.norm.sf(abs(z)) + return z, prob + + +def kruskal(*args): + """ + Compute the Kruskal-Wallis H-test for independent samples + + The Kruskal-Wallis H-test tests the null hypothesis that the population + median of all of the groups are equal. It is a non-parametric version of + ANOVA. The test works on 2 or more independent samples, which may have + different sizes. Note that rejecting the null hypothesis does not + indicate which of the groups differs. Post-hoc comparisons between + groups are required to determine which groups are different. + + Parameters + ---------- + sample1, sample2, ... : array_like + Two or more arrays with the sample measurements can be given as + arguments. + + Returns + ------- + H-statistic : float + The Kruskal-Wallis H statistic, corrected for ties + p-value : float + The p-value for the test using the assumption that H has a chi + square distribution + + Notes + ----- + Due to the assumption that H has a chi square distribution, the number + of samples in each group must not be too small. A typical rule is + that each sample must have at least 5 measurements. + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance + + """ + args = list(map(np.asarray, args)) # convert to a numpy array + na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array + if na < 2: + raise ValueError("Need at least two groups in stats.kruskal()") + n = np.asarray(list(map(len, args))) + + alldata = np.concatenate(args) + + ranked = rankdata(alldata) # Rank the data + T = tiecorrect(ranked) # Correct for ties + if T == 0: + raise ValueError('All numbers are identical in kruskal') + + # Compute sum^2/n for each group and sum + j = np.insert(np.cumsum(n), 0, 0) + ssbn = 0 + for i in range(na): + ssbn += square_of_sums(ranked[j[i]:j[i + 1]]) / float(n[i]) + + totaln = np.sum(n) + h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) + df = na - 1 + h = h / float(T) + return h, chisqprob(h, df) + + +def friedmanchisquare(*args): + """ + Computes the Friedman test for repeated measurements + + The Friedman test tests the null hypothesis that repeated measurements of + the same individuals have the same distribution. It is often used + to test for consistency among measurements obtained in different ways. 
+ For example, if two measurement techniques are used on the same set of + individuals, the Friedman test can be used to determine if the two + measurement techniques are consistent. + + Parameters + ---------- + measurements1, measurements2, measurements3... : array_like + Arrays of measurements. All of the arrays must have the same number + of elements. At least 3 sets of measurements must be given. + + Returns + ------- + friedman chi-square statistic : float + the test statistic, correcting for ties + p-value : float + the associated p-value assuming that the test statistic has a chi + squared distribution + + Notes + ----- + Due to the assumption that the test statistic has a chi squared + distribution, the p-value is only reliable for n > 10 and more than + 6 repeated measurements. + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Friedman_test + + """ + k = len(args) + if k < 3: + raise ValueError( + '\nLess than 3 levels. Friedman test not appropriate.\n') + + n = len(args[0]) + for i in range(1, k): + if len(args[i]) != n: + raise ValueError('Unequal N in friedmanchisquare. Aborting.') + + # Rank data + data = np.vstack(args).T + data = data.astype(float) + for i in range(len(data)): + data[i] = rankdata(data[i]) + + # Handle ties + ties = 0 + for i in range(len(data)): + _replist, repnum = find_repeats(array(data[i])) + for t in repnum: + ties += t * (t * t - 1) + c = 1 - ties / float(k * (k * k - 1) * n) + + ssbn = pysum(pysum(data) ** 2) + chisq = (12.0 / (k * n * (k + 1)) * ssbn - 3 * n * (k + 1)) / c + return chisq, chisqprob(chisq, k - 1) + + +# +# PROBABILITY CALCULATIONS #### +# + +zprob = special.ndtr + + +def chisqprob(chisq, df): + """ + Probability value (1-tail) for the Chi^2 probability distribution. + + Broadcasting rules apply. + + Parameters + ---------- + chisq : array_like or float > 0 + + df : array_like or float, probably int >= 1 + + Returns + ------- + chisqprob : ndarray + The area from `chisq` to infinity under the Chi^2 probability + distribution with degrees of freedom `df`. + + """ + return special.chdtrc(df, chisq) + +ksprob = special.kolmogorov +fprob = special.fdtrc + + +def betai(a, b, x): + """ + Returns the incomplete beta function. + + I_x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) + + where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma + function of a. + + The standard broadcasting rules apply to a, b, and x. + + Parameters + ---------- + a : array_like or float > 0 + + b : array_like or float > 0 + + x : array_like or float + x will be clipped to be no greater than 1.0 . + + Returns + ------- + betai : ndarray + Incomplete beta function. + + """ + x = np.asarray(x) + x = np.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0 + return special.betainc(a, b, x) + + +# +# ANOVA CALCULATIONS ####### +# + +def f_value_wilks_lambda(ER, EF, dfnum, dfden, a, b): + """Calculation of Wilks lambda F-statistic for multivarite data, per + Maxwell & Delaney p.657. + """ + if isinstance(ER, (int, float)): + ER = array([[ER]]) + if isinstance(EF, (int, float)): + EF = array([[EF]]) + lmbda = linalg.det(EF) / linalg.det(ER) + if (a - 1) ** 2 + (b - 1) ** 2 == 5: + q = 1 + else: + q = np.sqrt( + ((a - 1) ** 2 * (b - 1) ** 2 - 2) / ((a - 1) ** 2 + (b - 1) ** 2 - 5)) + n_um = (1 - lmbda ** (1.0 / q)) * (a - 1) * (b - 1) + d_en = lmbda ** (1.0 / q) / (n_um * q - 0.5 * (a - 1) * (b - 1) + 1) + return n_um / d_en + + +def f_value(ER, EF, dfR, dfF): + """ + Returns an F-statistic for a restricted vs. unrestricted model. 
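    In outline (the residual sums of squares and degrees of freedom below are
    illustrative), the statistic compares the increase in residual error of
    the restricted model to the error of the full model, and the tail
    probability comes from the F distribution:

        from scipy import special

        ER, dfR = 120.0, 18      # restricted model: residual SS, residual dof
        EF, dfF = 90.0, 15       # full (unrestricted) model
        F = ((ER - EF) / float(dfR - dfF)) / (EF / float(dfF))
        p = special.fdtrc(dfR - dfF, dfF, F)   # same tail as `fprob` defined above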
+ + Parameters + ---------- + ER : float + `ER` is the sum of squared residuals for the restricted model + or null hypothesis + + EF : float + `EF` is the sum of squared residuals for the unrestricted model + or alternate hypothesis + + dfR : int + `dfR` is the degrees of freedom in the restricted model + + dfF : int + `dfF` is the degrees of freedom in the unrestricted model + + Returns + ------- + F-statistic : float + + """ + return ((ER - EF) / float(dfR - dfF) / (EF / float(dfF))) + + +def f_value_multivariate(ER, EF, dfnum, dfden): + """ + Returns a multivariate F-statistic. + + Parameters + ---------- + ER : ndarray + Error associated with the null hypothesis (the Restricted model). + From a multivariate F calculation. + EF : ndarray + Error associated with the alternate hypothesis (the Full model) + From a multivariate F calculation. + dfnum : int + Degrees of freedom the Restricted model. + dfden : int + Degrees of freedom associated with the Restricted model. + + Returns + ------- + fstat : float + The computed F-statistic. + + """ + if isinstance(ER, (int, float)): + ER = array([[ER]]) + if isinstance(EF, (int, float)): + EF = array([[EF]]) + n_um = (linalg.det(ER) - linalg.det(EF)) / float(dfnum) + d_en = linalg.det(EF) / float(dfden) + return n_um / d_en + + +# +# SUPPORT FUNCTIONS ######## +# + +def ss(a, axis=0): + """ + Squares each element of the input array, and returns the sum(s) of that. + + Parameters + ---------- + a : array_like + Input array. + axis : int or None, optional + The axis along which to calculate. If None, use whole array. + Default is 0, i.e. along the first axis. + + Returns + ------- + ss : ndarray + The sum along the given axis for (a**2). + + See also + -------- + square_of_sums : The square(s) of the sum(s) (the opposite of `ss`). + + Examples + -------- + >>> from scipy import stats + >>> a = np.array([1., 2., 5.]) + >>> stats.ss(a) + 30.0 + + And calculating along an axis: + + >>> b = np.array([[1., 2., 5.], [2., 5., 6.]]) + >>> stats.ss(b, axis=1) + array([ 30., 65.]) + + """ + a, axis = _chk_asarray(a, axis) + return np.sum(a * a, axis) + + +def square_of_sums(a, axis=0): + """ + Sums elements of the input array, and returns the square(s) of that sum. + + Parameters + ---------- + a : array_like + Input array. + axis : int or None, optional + If axis is None, ravel `a` first. If `axis` is an integer, this will + be the axis over which to operate. Defaults to 0. + + Returns + ------- + square_of_sums : float or ndarray + The square of the sum over `axis`. + + See also + -------- + ss : The sum of squares (the opposite of `square_of_sums`). + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(20).reshape(5,4) + >>> stats.square_of_sums(a) + array([ 1600., 2025., 2500., 3025.]) + >>> stats.square_of_sums(a, axis=None) + 36100.0 + + """ + a, axis = _chk_asarray(a, axis) + s = np.sum(a, axis) + if not np.isscalar(s): + return s.astype(float) * s + else: + return float(s) * s + + +def fastsort(a): + """ + Sort an array and provide the argsort. + + Parameters + ---------- + a : array_like + Input array. + + Returns + ------- + fastsort : ndarray of type int + sorted indices into the original array + + """ + # TODO: the wording in the docstring is nonsense. 
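    # Note: the function actually returns both the sorted array and the
    # argsort indices, i.e. (a[it], it), not just the indices as the
    # summary above suggests.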
+ it = np.argsort(a) + as_ = a[it] + return as_, it diff --git a/pywafo/src/wafo/stats/tests/__init__.py b/pywafo/src/wafo/stats/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pywafo/src/wafo/stats/tests/common_tests.py b/pywafo/src/wafo/stats/tests/common_tests.py index 3b9691c..726d485 100644 --- a/pywafo/src/wafo/stats/tests/common_tests.py +++ b/pywafo/src/wafo/stats/tests/common_tests.py @@ -1,154 +1,156 @@ -from __future__ import division, print_function, absolute_import - -import inspect -import warnings - -import numpy as np -import numpy.testing as npt - -#from scipy.lib._version import NumpyVersion -from scipy import stats - - -#NUMPY_BELOW_1_7 = NumpyVersion(np.__version__) < '1.7.0' -NUMPY_BELOW_1_7 =np.__version__ < '1.7.0' - - -def check_normalization(distfn, args, distname): - norm_moment = distfn.moment(0, *args) - npt.assert_allclose(norm_moment, 1.0) - - # this is a temporary plug: either ncf or expect is problematic; - # best be marked as a knownfail, but I've no clue how to do it. - if distname == "ncf": - atol, rtol = 1e-5, 0 - else: - atol, rtol = 1e-7, 1e-7 - - normalization_expect = distfn.expect(lambda x: 1, args=args) - npt.assert_allclose(normalization_expect, 1.0, atol=atol, rtol=rtol, - err_msg=distname, verbose=True) - - normalization_cdf = distfn.cdf(distfn.b, *args) - npt.assert_allclose(normalization_cdf, 1.0) - - -def check_moment(distfn, arg, m, v, msg): - m1 = distfn.moment(1, *arg) - m2 = distfn.moment(2, *arg) - if not np.isinf(m): - npt.assert_almost_equal(m1, m, decimal=10, err_msg=msg + - ' - 1st moment') - else: # or np.isnan(m1), - npt.assert_(np.isinf(m1), - msg + ' - 1st moment -infinite, m1=%s' % str(m1)) - - if not np.isinf(v): - npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10, err_msg=msg + - ' - 2ndt moment') - else: # or np.isnan(m2), - npt.assert_(np.isinf(m2), - msg + ' - 2nd moment -infinite, m2=%s' % str(m2)) - - -def check_mean_expect(distfn, arg, m, msg): - if np.isfinite(m): - m1 = distfn.expect(lambda x: x, arg) - npt.assert_almost_equal(m1, m, decimal=5, err_msg=msg + - ' - 1st moment (expect)') - - -def check_var_expect(distfn, arg, m, v, msg): - if np.isfinite(v): - m2 = distfn.expect(lambda x: x*x, arg) - npt.assert_almost_equal(m2, v + m*m, decimal=5, err_msg=msg + - ' - 2st moment (expect)') - - -def check_skew_expect(distfn, arg, m, v, s, msg): - if np.isfinite(s): - m3e = distfn.expect(lambda x: np.power(x-m, 3), arg) - npt.assert_almost_equal(m3e, s * np.power(v, 1.5), - decimal=5, err_msg=msg + ' - skew') - else: - npt.assert_(np.isnan(s)) - - -def check_kurt_expect(distfn, arg, m, v, k, msg): - if np.isfinite(k): - m4e = distfn.expect(lambda x: np.power(x-m, 4), arg) - npt.assert_allclose(m4e, (k + 3.) * np.power(v, 2), atol=1e-5, rtol=1e-5, - err_msg=msg + ' - kurtosis') - else: - npt.assert_(np.isnan(k)) - - -def check_entropy(distfn, arg, msg): - ent = distfn.entropy(*arg) - npt.assert_(not np.isnan(ent), msg + 'test Entropy is nan') - - -def check_private_entropy(distfn, args, superclass): - # compare a generic _entropy with the distribution-specific implementation - npt.assert_allclose(distfn._entropy(*args), - superclass._entropy(distfn, *args)) - - -def check_edge_support(distfn, args): - # Make sure the x=self.a and self.b are handled correctly. 
- x = [distfn.a, distfn.b] - if isinstance(distfn, stats.rv_continuous): - npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0]) - npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0]) - - npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0]) - npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf]) - - if isinstance(distfn, stats.rv_discrete): - x = [distfn.a - 1, distfn.b] - npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x) - npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1]) - - # out-of-bounds for isf & ppf - npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all()) - npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all()) - - -def check_named_args(distfn, x, shape_args, defaults, meths): - ## Check calling w/ named arguments. - - # check consistency of shapes, numargs and _parse signature - signature = inspect.getargspec(distfn._parse_args) - npt.assert_(signature.varargs is None) - npt.assert_(signature.keywords is None) - npt.assert_(signature.defaults == defaults) - - shape_argnames = signature.args[1:-len(defaults)] # self, a, b, loc=0, scale=1 - if distfn.shapes: - shapes_ = distfn.shapes.replace(',', ' ').split() - else: - shapes_ = '' - npt.assert_(len(shapes_) == distfn.numargs) - npt.assert_(len(shapes_) == len(shape_argnames)) - - # check calling w/ named arguments - shape_args = list(shape_args) - - vals = [meth(x, *shape_args) for meth in meths] - npt.assert_(np.all(np.isfinite(vals))) - - names, a, k = shape_argnames[:], shape_args[:], {} - while names: - k.update({names.pop(): a.pop()}) - v = [meth(x, *a, **k) for meth in meths] - npt.assert_array_equal(vals, v) - if not 'n' in k.keys(): - # `n` is first parameter of moment(), so can't be used as named arg - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - npt.assert_equal(distfn.moment(1, *a, **k), - distfn.moment(1, *shape_args)) - - # unknown arguments should not go through: - k.update({'kaboom': 42}) - npt.assert_raises(TypeError, distfn.cdf, x, **k) +from __future__ import division, print_function, absolute_import + +import inspect +import warnings + +import numpy as np +import numpy.testing as npt + +#from scipy.lib._version import NumpyVersion +from scipy import stats + + +#NUMPY_BELOW_1_7 = NumpyVersion(np.__version__) < '1.7.0' +NUMPY_BELOW_1_7 = np.__version__ < '1.7.0' + + +def check_normalization(distfn, args, distname): + norm_moment = distfn.moment(0, *args) + npt.assert_allclose(norm_moment, 1.0) + + # this is a temporary plug: either ncf or expect is problematic; + # best be marked as a knownfail, but I've no clue how to do it. 
+ if distname == "ncf": + atol, rtol = 1e-5, 0 + else: + atol, rtol = 1e-7, 1e-7 + + normalization_expect = distfn.expect(lambda x: 1, args=args) + npt.assert_allclose(normalization_expect, 1.0, atol=atol, rtol=rtol, + err_msg=distname, verbose=True) + + normalization_cdf = distfn.cdf(distfn.b, *args) + npt.assert_allclose(normalization_cdf, 1.0) + + +def check_moment(distfn, arg, m, v, msg): + m1 = distfn.moment(1, *arg) + m2 = distfn.moment(2, *arg) + if not np.isinf(m): + npt.assert_almost_equal(m1, m, decimal=10, err_msg=msg + + ' - 1st moment') + else: # or np.isnan(m1), + npt.assert_(np.isinf(m1), + msg + ' - 1st moment -infinite, m1=%s' % str(m1)) + + if not np.isinf(v): + npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10, err_msg=msg + + ' - 2ndt moment') + else: # or np.isnan(m2), + npt.assert_(np.isinf(m2), + msg + ' - 2nd moment -infinite, m2=%s' % str(m2)) + + +def check_mean_expect(distfn, arg, m, msg): + if np.isfinite(m): + m1 = distfn.expect(lambda x: x, arg) + npt.assert_almost_equal(m1, m, decimal=5, err_msg=msg + + ' - 1st moment (expect)') + + +def check_var_expect(distfn, arg, m, v, msg): + if np.isfinite(v): + m2 = distfn.expect(lambda x: x * x, arg) + npt.assert_almost_equal(m2, v + m * m, decimal=5, err_msg=msg + + ' - 2st moment (expect)') + + +def check_skew_expect(distfn, arg, m, v, s, msg): + if np.isfinite(s): + m3e = distfn.expect(lambda x: np.power(x - m, 3), arg) + npt.assert_almost_equal(m3e, s * np.power(v, 1.5), + decimal=5, err_msg=msg + ' - skew') + else: + npt.assert_(np.isnan(s)) + + +def check_kurt_expect(distfn, arg, m, v, k, msg): + if np.isfinite(k): + m4e = distfn.expect(lambda x: np.power(x - m, 4), arg) + npt.assert_allclose( + m4e, (k + 3.) * np.power(v, 2), atol=1e-5, rtol=1e-5, + err_msg=msg + ' - kurtosis') + else: + npt.assert_(np.isnan(k)) + + +def check_entropy(distfn, arg, msg): + ent = distfn.entropy(*arg) + npt.assert_(not np.isnan(ent), msg + 'test Entropy is nan') + + +def check_private_entropy(distfn, args, superclass): + # compare a generic _entropy with the distribution-specific implementation + npt.assert_allclose(distfn._entropy(*args), + superclass._entropy(distfn, *args)) + + +def check_edge_support(distfn, args): + # Make sure the x=self.a and self.b are handled correctly. + x = [distfn.a, distfn.b] + if isinstance(distfn, stats.rv_continuous): + npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0]) + npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0]) + + npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0]) + npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf]) + + if isinstance(distfn, stats.rv_discrete): + x = [distfn.a - 1, distfn.b] + npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x) + npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1]) + + # out-of-bounds for isf & ppf + npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all()) + npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all()) + + +def check_named_args(distfn, x, shape_args, defaults, meths): + # Check calling w/ named arguments. 
+ + # check consistency of shapes, numargs and _parse signature + signature = inspect.getargspec(distfn._parse_args) + npt.assert_(signature.varargs is None) + npt.assert_(signature.keywords is None) + npt.assert_(signature.defaults == defaults) + + # self, a, b, loc=0, scale=1 + shape_argnames = signature.args[1:-len(defaults)] + if distfn.shapes: + shapes_ = distfn.shapes.replace(',', ' ').split() + else: + shapes_ = '' + npt.assert_(len(shapes_) == distfn.numargs) + npt.assert_(len(shapes_) == len(shape_argnames)) + + # check calling w/ named arguments + shape_args = list(shape_args) + + vals = [meth(x, *shape_args) for meth in meths] + npt.assert_(np.all(np.isfinite(vals))) + + names, a, k = shape_argnames[:], shape_args[:], {} + while names: + k.update({names.pop(): a.pop()}) + v = [meth(x, *a, **k) for meth in meths] + npt.assert_array_equal(vals, v) + if not 'n' in k.keys(): + # `n` is first parameter of moment(), so can't be used as named arg + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + npt.assert_equal(distfn.moment(1, *a, **k), + distfn.moment(1, *shape_args)) + + # unknown arguments should not go through: + k.update({'kaboom': 42}) + npt.assert_raises(TypeError, distfn.cdf, x, **k) diff --git a/pywafo/src/wafo/stats/tests/test_binned_statistic.py b/pywafo/src/wafo/stats/tests/test_binned_statistic.py index 26cc4be..51d7e34 100644 --- a/pywafo/src/wafo/stats/tests/test_binned_statistic.py +++ b/pywafo/src/wafo/stats/tests/test_binned_statistic.py @@ -1,5 +1,4 @@ from __future__ import division, print_function, absolute_import - import numpy as np from numpy.testing import assert_array_almost_equal, run_module_suite from scipy.stats import \ @@ -235,4 +234,5 @@ class TestBinnedStatistic(object): if __name__ == "__main__": + #unittest.main() run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_continuous_basic.py b/pywafo/src/wafo/stats/tests/test_continuous_basic.py index d4fe56c..be2868c 100644 --- a/pywafo/src/wafo/stats/tests/test_continuous_basic.py +++ b/pywafo/src/wafo/stats/tests/test_continuous_basic.py @@ -6,8 +6,9 @@ import numpy as np import numpy.testing as npt from scipy import integrate -from scipy import stats -from common_tests import (check_normalization, check_moment, check_mean_expect, +from wafo import stats +from wafo.stats.tests.common_tests import (check_normalization, check_moment, + check_mean_expect, check_var_expect, check_skew_expect, check_kurt_expect, check_entropy, check_private_entropy, NUMPY_BELOW_1_7, check_edge_support, check_named_args) @@ -158,7 +159,7 @@ distmissing = ['wald', 'gausshyper', 'genexpon', 'rv_continuous', 'johnsonsb', 'truncexpon', 'rice', 'invgauss', 'invgamma', 'powerlognorm'] -distmiss = [[dist,args] for dist,args in distcont if dist in distmissing] +distmiss = [[dist, args] for dist, args in distcont if dist in distmissing] distslow = ['rdist', 'gausshyper', 'recipinvgauss', 'ksone', 'genexpon', 'vonmises', 'vonmises_line', 'mielke', 'semicircular', 'cosine', 'invweibull', 'powerlognorm', 'johnsonsu', 'kstwobign'] @@ -181,7 +182,8 @@ def _silence_fp_errors(func): def test_cont_basic(): # this test skips slow distributions with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) +# warnings.filterwarnings('ignore', +# category=integrate.IntegrationWarning) for distname, arg in distcont[:]: if distname in distslow: continue @@ -233,14 +235,15 @@ def test_cont_basic(): def test_cont_basic_slow(): # same as above for slow 
distributions with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) +# warnings.filterwarnings('ignore', +# category=integrate.IntegrationWarning) for distname, arg in distcont[:]: if distname not in distslow: continue distfn = getattr(stats, distname) np.random.seed(765456) sn = 500 - rvs = distfn.rvs(size=sn,*arg) + rvs = distfn.rvs(size=sn, *arg) sm = rvs.mean() sv = rvs.var() m, v = distfn.stats(*arg) @@ -284,7 +287,8 @@ def test_cont_basic_slow(): @npt.dec.slow def test_moments(): with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) +# warnings.filterwarnings('ignore', +# category=integrate.IntegrationWarning) knf = npt.dec.knownfailureif fail_normalization = set(['vonmises', 'ksone']) fail_higher = set(['vonmises', 'ksone', 'ncf']) @@ -312,124 +316,130 @@ def check_sample_meanvar_(distfn, arg, m, v, sm, sv, sn, msg): check_sample_var(sv, sn, v) -def check_sample_mean(sm,v,n, popmean): +def check_sample_mean(sm, v, n, popmean): # from stats.stats.ttest_1samp(a, popmean): # Calculates the t-obtained for the independent samples T-test on ONE group # of scores a, given a population mean. # # Returns: t-value, two-tailed prob - df = n-1 - svar = ((n-1)*v) / float(df) # looks redundant - t = (sm-popmean) / np.sqrt(svar*(1.0/n)) - prob = stats.betai(0.5*df, 0.5, df/(df+t*t)) - - # return t,prob - npt.assert_(prob > 0.01, 'mean fail, t,prob = %f, %f, m, sm=%f,%f' % - (t, prob, popmean, sm)) - - -def check_sample_var(sv,n, popvar): - # two-sided chisquare test for sample variance equal to hypothesized variance - df = n-1 - chi2 = (n-1)*popvar/float(popvar) - pval = stats.chisqprob(chi2,df)*2 - npt.assert_(pval > 0.01, 'var fail, t, pval = %f, %f, v, sv=%f, %f' % - (chi2,pval,popvar,sv)) - - -def check_cdf_ppf(distfn,arg,msg): + df = n - 1 + svar = ((n - 1) * v) / float(df) # looks redundant + t = (sm - popmean) / np.sqrt(svar * (1.0 / n)) + prob = stats.betai(0.5 * df, 0.5, df / (df + t * t)) + + # return t,prob + npt.assert_(prob > 0.01, 'mean fail, t,prob = %f, %f, m, sm=%f,%f' % + (t, prob, popmean, sm)) + + +def check_sample_var(sv, n, popvar): + # two-sided chisquare test for sample variance equal to hypothesized + # variance + df = n - 1 + chi2 = (n - 1) * popvar / float(popvar) + pval = stats.chisqprob(chi2, df) * 2 + npt.assert_(pval > 0.01, 'var fail, t, pval = %f, %f, v, sv=%f, %f' % + (chi2, pval, popvar, sv)) + + +def check_cdf_ppf(distfn, arg, msg): values = [0.001, 0.5, 0.999] npt.assert_almost_equal(distfn.cdf(distfn.ppf(values, *arg), *arg), values, decimal=DECIMAL, err_msg=msg + ' - cdf-ppf roundtrip') -def check_sf_isf(distfn,arg,msg): - npt.assert_almost_equal(distfn.sf(distfn.isf([0.1,0.5,0.9], *arg), *arg), - [0.1,0.5,0.9], decimal=DECIMAL, err_msg=msg + - ' - sf-isf roundtrip') - npt.assert_almost_equal(distfn.cdf([0.1,0.9], *arg), - 1.0-distfn.sf([0.1,0.9], *arg), - decimal=DECIMAL, err_msg=msg + - ' - cdf-sf relationship') - - -def check_pdf(distfn, arg, msg): - # compares pdf at median with numerical derivative of cdf - median = distfn.ppf(0.5, *arg) - eps = 1e-6 - pdfv = distfn.pdf(median, *arg) - if (pdfv < 1e-4) or (pdfv > 1e4): - # avoid checking a case where pdf is close to zero or huge (singularity) - median = median + 0.1 - pdfv = distfn.pdf(median, *arg) - cdfdiff = (distfn.cdf(median + eps, *arg) - - distfn.cdf(median - eps, *arg))/eps/2.0 - # replace with better diff and better test (more points), - # actually, this works pretty well - 
npt.assert_almost_equal(pdfv, cdfdiff, - decimal=DECIMAL, err_msg=msg + ' - cdf-pdf relationship') - - -def check_pdf_logpdf(distfn, args, msg): - # compares pdf at several points with the log of the pdf - points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) - vals = distfn.ppf(points, *args) - pdf = distfn.pdf(vals, *args) - logpdf = distfn.logpdf(vals, *args) - pdf = pdf[pdf != 0] - logpdf = logpdf[np.isfinite(logpdf)] - npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, err_msg=msg + " - logpdf-log(pdf) relationship") - - -def check_sf_logsf(distfn, args, msg): - # compares sf at several points with the log of the sf - points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) - vals = distfn.ppf(points, *args) - sf = distfn.sf(vals, *args) - logsf = distfn.logsf(vals, *args) - sf = sf[sf != 0] - logsf = logsf[np.isfinite(logsf)] - npt.assert_almost_equal(np.log(sf), logsf, decimal=7, err_msg=msg + " - logsf-log(sf) relationship") - - -def check_cdf_logcdf(distfn, args, msg): - # compares cdf at several points with the log of the cdf - points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) - vals = distfn.ppf(points, *args) - cdf = distfn.cdf(vals, *args) - logcdf = distfn.logcdf(vals, *args) - cdf = cdf[cdf != 0] - logcdf = logcdf[np.isfinite(logcdf)] - npt.assert_almost_equal(np.log(cdf), logcdf, decimal=7, err_msg=msg + " - logcdf-log(cdf) relationship") - - -def check_distribution_rvs(dist, args, alpha, rvs): - # test from scipy.stats.tests - # this version reuses existing random variables - D,pval = stats.kstest(rvs, dist, args=args, N=1000) - if (pval < alpha): - D,pval = stats.kstest(dist,'',args=args, N=1000) - npt.assert_(pval > alpha, "D = " + str(D) + "; pval = " + str(pval) + - "; alpha = " + str(alpha) + "\nargs = " + str(args)) - - -def check_vecentropy(distfn, args): - npt.assert_equal(distfn.vecentropy(*args), distfn._entropy(*args)) - -@npt.dec.skipif(NUMPY_BELOW_1_7) -def check_loc_scale(distfn, arg, m, v, msg): - loc, scale = 10.0, 10.0 - mt, vt = distfn.stats(loc=loc, scale=scale, *arg) - npt.assert_allclose(m*scale + loc, mt) - npt.assert_allclose(v*scale*scale, vt) - - -def check_ppf_private(distfn, arg, msg): - #fails by design for truncnorm self.nb not defined - ppfs = distfn._ppf(np.array([0.1, 0.5, 0.9]), *arg) - npt.assert_(not np.any(np.isnan(ppfs)), msg + 'ppf private is nan') - - -if __name__ == "__main__": - npt.run_module_suite() +def check_sf_isf(distfn, arg, msg): + npt.assert_almost_equal(distfn.sf(distfn.isf([0.1, 0.5, 0.9], *arg), *arg), + [0.1, 0.5, 0.9], decimal=DECIMAL, err_msg=msg + + ' - sf-isf roundtrip') + npt.assert_almost_equal(distfn.cdf([0.1, 0.9], *arg), + 1.0 - distfn.sf([0.1, 0.9], *arg), + decimal=DECIMAL, err_msg=msg + + ' - cdf-sf relationship') + + +def check_pdf(distfn, arg, msg): + # compares pdf at median with numerical derivative of cdf + median = distfn.ppf(0.5, *arg) + eps = 1e-6 + pdfv = distfn.pdf(median, *arg) + if (pdfv < 1e-4) or (pdfv > 1e4): + # avoid checking a case where pdf is close to zero or huge + # (singularity) + median = median + 0.1 + pdfv = distfn.pdf(median, *arg) + cdfdiff = (distfn.cdf(median + eps, *arg) - + distfn.cdf(median - eps, *arg)) / eps / 2.0 + # replace with better diff and better test (more points), + # actually, this works pretty well + npt.assert_almost_equal(pdfv, cdfdiff, decimal=DECIMAL, + err_msg=msg + ' - cdf-pdf relationship') + + +def check_pdf_logpdf(distfn, args, msg): + # compares pdf at several points with the log of the pdf + points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 
0.8]) + vals = distfn.ppf(points, *args) + pdf = distfn.pdf(vals, *args) + logpdf = distfn.logpdf(vals, *args) + pdf = pdf[pdf != 0] + logpdf = logpdf[np.isfinite(logpdf)] + npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, + err_msg=msg + " - logpdf-log(pdf) relationship") + + +def check_sf_logsf(distfn, args, msg): + # compares sf at several points with the log of the sf + points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) + vals = distfn.ppf(points, *args) + sf = distfn.sf(vals, *args) + logsf = distfn.logsf(vals, *args) + sf = sf[sf != 0] + logsf = logsf[np.isfinite(logsf)] + npt.assert_almost_equal(np.log(sf), logsf, decimal=7, + err_msg=msg + " - logsf-log(sf) relationship") + + +def check_cdf_logcdf(distfn, args, msg): + # compares cdf at several points with the log of the cdf + points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) + vals = distfn.ppf(points, *args) + cdf = distfn.cdf(vals, *args) + logcdf = distfn.logcdf(vals, *args) + cdf = cdf[cdf != 0] + logcdf = logcdf[np.isfinite(logcdf)] + npt.assert_almost_equal(np.log(cdf), logcdf, decimal=7, + err_msg=msg + " - logcdf-log(cdf) relationship") + + +def check_distribution_rvs(dist, args, alpha, rvs): + # test from scipy.stats.tests + # this version reuses existing random variables + D, pval = stats.kstest(rvs, dist, args=args, N=1000) + if (pval < alpha): + D, pval = stats.kstest(dist, '', args=args, N=1000) + npt.assert_(pval > alpha, "D = " + str(D) + "; pval = " + str(pval) + + "; alpha = " + str(alpha) + "\nargs = " + str(args)) + + +def check_vecentropy(distfn, args): + npt.assert_equal(distfn.vecentropy(*args), distfn._entropy(*args)) + + +@npt.dec.skipif(NUMPY_BELOW_1_7) +def check_loc_scale(distfn, arg, m, v, msg): + loc, scale = 10.0, 10.0 + mt, vt = distfn.stats(loc=loc, scale=scale, *arg) + npt.assert_allclose(m * scale + loc, mt) + npt.assert_allclose(v * scale * scale, vt) + + +def check_ppf_private(distfn, arg, msg): + # fails by design for truncnorm self.nb not defined + ppfs = distfn._ppf(np.array([0.1, 0.5, 0.9]), *arg) + npt.assert_(not np.any(np.isnan(ppfs)), msg + 'ppf private is nan') + + +if __name__ == "__main__": + npt.run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_discrete_basic.py b/pywafo/src/wafo/stats/tests/test_discrete_basic.py index d01d9c1..90008e5 100644 --- a/pywafo/src/wafo/stats/tests/test_discrete_basic.py +++ b/pywafo/src/wafo/stats/tests/test_discrete_basic.py @@ -3,25 +3,26 @@ from __future__ import division, print_function, absolute_import import numpy.testing as npt import numpy as np try: - from scipy.lib.six import xrange + from wafo.stats.six import xrange except: pass -from scipy import stats -from .common_tests import (check_normalization, check_moment, check_mean_expect, +from wafo import stats +from wafo.stats.tests.common_tests import (check_normalization, check_moment, + check_mean_expect, check_var_expect, check_skew_expect, check_kurt_expect, check_entropy, check_private_entropy, check_edge_support, check_named_args) knf = npt.dec.knownfailureif distdiscrete = [ - ['bernoulli',(0.3,)], + ['bernoulli', (0.3, )], ['binom', (5, 0.4)], - ['boltzmann',(1.4, 19)], + ['boltzmann', (1.4, 19)], ['dlaplace', (0.8,)], # 0.5 ['geom', (0.5,)], - ['hypergeom',(30, 12, 6)], - ['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921 - ['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921 + ['hypergeom', (30, 12, 6)], + ['hypergeom', (21, 3, 12)], # numpy.random (3,18,12) numpy ticket:921 + ['hypergeom', (21, 18, 11)], # numpy.random 
(18,3,11) numpy ticket:921 ['logser', (0.6,)], # reenabled, numpy ticket:921 ['nbinom', (5, 0.5)], ['nbinom', (0.4, 0.4)], # from tickets: 583 @@ -39,7 +40,7 @@ def test_discrete_basic(): np.random.seed(9765456) rvs = distfn.rvs(size=2000, *arg) supp = np.unique(rvs) - m, v = distfn.stats(*arg) + #_m, v = distfn.stats(*arg) yield check_cdf_ppf, distfn, arg, supp, distname + ' cdf_ppf' yield check_pmf_cdf, distfn, arg, distname @@ -55,7 +56,7 @@ def test_discrete_basic(): if distname in seen: continue seen.add(distname) - distfn = getattr(stats,distname) + distfn = getattr(stats, distname) locscale_defaults = (0,) meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf, distfn.logsf] @@ -73,7 +74,7 @@ def test_discrete_basic(): def test_moments(): for distname, arg in distdiscrete: - distfn = getattr(stats,distname) + distfn = getattr(stats, distname) m, v, s, k = distfn.stats(*arg, moments='mvsk') yield check_normalization, distfn, arg, distname @@ -89,7 +90,7 @@ def test_moments(): # frozen distr moments yield check_moment_frozen, distfn, arg, m, 1 - yield check_moment_frozen, distfn, arg, v+m*m, 2 + yield check_moment_frozen, distfn, arg, v + m * m, 2 def check_cdf_ppf(distfn, arg, supp, msg): @@ -107,7 +108,7 @@ def check_cdf_ppf(distfn, arg, supp, msg): def check_pmf_cdf(distfn, arg, distname): startind = np.int(distfn.ppf(0.01, *arg) - 1) index = list(range(startind, startind + 10)) - cdfs, pmfs_cum = distfn.cdf(index,*arg), distfn.pmf(index, *arg).cumsum() + cdfs, pmfs_cum = distfn.cdf(index, *arg), distfn.pmf(index, *arg).cumsum() atol, rtol = 1e-10, 1e-10 if distname == 'skellam': # ncx2 accuracy @@ -157,7 +158,7 @@ def check_discrete_chisquare(distfn, arg, rvs, alpha, msg): """ n = len(rvs) nsupp = 20 - wsupp = 1.0/nsupp + wsupp = 1.0 / nsupp # construct intervals with minimum mass 1/nsupp # intervals are left-half-open as in a cdf difference @@ -166,30 +167,30 @@ def check_discrete_chisquare(distfn, arg, rvs, alpha, msg): distsupp = [max(distfn.a, -1000)] distmass = [] for ii in distsupport: - current = distfn.cdf(ii,*arg) - if current - last >= wsupp-1e-14: + current = distfn.cdf(ii, *arg) + if current - last >= wsupp - 1e-14: distsupp.append(ii) distmass.append(current - last) last = current - if current > (1-wsupp): + if current > (1 - wsupp): break if distsupp[-1] < distfn.b: distsupp.append(distfn.b) - distmass.append(1-last) + distmass.append(1 - last) distsupp = np.array(distsupp) distmass = np.array(distmass) # convert intervals to right-half-open as required by histogram - histsupp = distsupp+1e-8 + histsupp = distsupp + 1e-8 histsupp[0] = distfn.a # find sample frequencies and perform chisquare test - freq,hsupp = np.histogram(rvs,histsupp) - cdfs = distfn.cdf(distsupp,*arg) - (chis,pval) = stats.chisquare(np.array(freq),n*distmass) + freq, _hsupp = np.histogram(rvs, histsupp) + #cdfs = distfn.cdf(distsupp, *arg) + (_chis, pval) = stats.chisquare(np.array(freq), n * distmass) npt.assert_(pval > alpha, 'chisquare - test for %s' - ' at arg = %s with pval = %s' % (msg,str(arg),str(pval))) + ' at arg = %s with pval = %s' % (msg, str(arg), str(pval))) def check_scale_docstring(distfn): diff --git a/pywafo/src/wafo/stats/tests/test_distributions.py b/pywafo/src/wafo/stats/tests/test_distributions.py index 8016e4f..a7c6c7b 100644 --- a/pywafo/src/wafo/stats/tests/test_distributions.py +++ b/pywafo/src/wafo/stats/tests/test_distributions.py @@ -1,1865 +1,1931 @@ -""" Test functions for stats module - -""" -from __future__ import division, print_function, absolute_import - 
-import warnings -import re -import sys - -from numpy.testing import (TestCase, run_module_suite, assert_equal, - assert_array_equal, assert_almost_equal, assert_array_almost_equal, - assert_allclose, assert_, assert_raises, rand, dec) -from nose import SkipTest - -import numpy -import numpy as np -from numpy import typecodes, array -from scipy.lib._version import NumpyVersion -from scipy import special -import scipy.stats as stats -from scipy.stats._distn_infrastructure import argsreduce -from scipy.special import xlogy - - -# python -OO strips docstrings -DOCSTRINGS_STRIPPED = sys.flags.optimize > 1 - - -# generate test cases to test cdf and distribution consistency -dists = ['uniform','norm','lognorm','expon','beta', - 'powerlaw','bradford','burr','fisk','cauchy','halfcauchy', - 'foldcauchy','gamma','gengamma','loggamma', - 'alpha','anglit','arcsine','betaprime', - 'dgamma','exponweib','exponpow','frechet_l','frechet_r', - 'gilbrat','f','ncf','chi2','chi','nakagami','genpareto', - 'genextreme','genhalflogistic','pareto','lomax','halfnorm', - 'halflogistic','fatiguelife','foldnorm','ncx2','t','nct', - 'weibull_min','weibull_max','dweibull','maxwell','rayleigh', - 'genlogistic', 'logistic','gumbel_l','gumbel_r','gompertz', - 'hypsecant', 'laplace', 'reciprocal','triang','tukeylambda', - 'vonmises', 'vonmises_line', 'pearson3'] - -# check function for test generator - - -def check_distribution(dist, args, alpha): - D,pval = stats.kstest(dist,'', args=args, N=1000) - if (pval < alpha): - D,pval = stats.kstest(dist,'',args=args, N=1000) - # if (pval < alpha): - # D,pval = stats.kstest(dist,'',args=args, N=1000) - assert_(pval > alpha, msg="D = " + str(D) + "; pval = " + str(pval) + - "; alpha = " + str(alpha) + "\nargs = " + str(args)) - -# nose test generator - - -def test_all_distributions(): - for dist in dists: - distfunc = getattr(stats, dist) - nargs = distfunc.numargs - alpha = 0.01 - if dist == 'fatiguelife': - alpha = 0.001 - - if dist == 'frechet': - args = tuple(2*rand(1))+(0,)+tuple(2*rand(2)) - elif dist == 'triang': - args = tuple(rand(nargs)) - elif dist == 'reciprocal': - vals = rand(nargs) - vals[1] = vals[0] + 1.0 - args = tuple(vals) - elif dist == 'vonmises': - yield check_distribution, dist, (10,), alpha - yield check_distribution, dist, (101,), alpha - args = tuple(1.0+rand(nargs)) - else: - args = tuple(1.0+rand(nargs)) - - yield check_distribution, dist, args, alpha - - -def check_vonmises_pdf_periodic(k,l,s,x): - vm = stats.vonmises(k,loc=l,scale=s) - assert_almost_equal(vm.pdf(x),vm.pdf(x % (2*numpy.pi*s))) - - -def check_vonmises_cdf_periodic(k,l,s,x): - vm = stats.vonmises(k,loc=l,scale=s) - assert_almost_equal(vm.cdf(x) % 1,vm.cdf(x % (2*numpy.pi*s)) % 1) - - -def test_vonmises_pdf_periodic(): - for k in [0.1, 1, 101]: - for x in [0,1,numpy.pi,10,100]: - yield check_vonmises_pdf_periodic, k, 0, 1, x - yield check_vonmises_pdf_periodic, k, 1, 1, x - yield check_vonmises_pdf_periodic, k, 0, 10, x - - yield check_vonmises_cdf_periodic, k, 0, 1, x - yield check_vonmises_cdf_periodic, k, 1, 1, x - yield check_vonmises_cdf_periodic, k, 0, 10, x - - -def test_vonmises_line_support(): - assert_equal(stats.vonmises_line.a, -np.pi) - assert_equal(stats.vonmises_line.b, np.pi) - - -class TestRandInt(TestCase): - def test_rvs(self): - vals = stats.randint.rvs(5,30,size=100) - assert_(numpy.all(vals < 30) & numpy.all(vals >= 5)) - assert_(len(vals) == 100) - vals = stats.randint.rvs(5,30,size=(2,50)) - assert_(numpy.shape(vals) == (2,50)) - assert_(vals.dtype.char in 
typecodes['AllInteger']) - val = stats.randint.rvs(15,46) - assert_((val >= 15) & (val < 46)) - assert_(isinstance(val, numpy.ScalarType), msg=repr(type(val))) - val = stats.randint(15,46).rvs(3) - assert_(val.dtype.char in typecodes['AllInteger']) - - def test_pdf(self): - k = numpy.r_[0:36] - out = numpy.where((k >= 5) & (k < 30), 1.0/(30-5), 0) - vals = stats.randint.pmf(k,5,30) - assert_array_almost_equal(vals,out) - - def test_cdf(self): - x = numpy.r_[0:36:100j] - k = numpy.floor(x) - out = numpy.select([k >= 30,k >= 5],[1.0,(k-5.0+1)/(30-5.0)],0) - vals = stats.randint.cdf(x,5,30) - assert_array_almost_equal(vals, out, decimal=12) - - -class TestBinom(TestCase): - def test_rvs(self): - vals = stats.binom.rvs(10, 0.75, size=(2, 50)) - assert_(numpy.all(vals >= 0) & numpy.all(vals <= 10)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.binom.rvs(10, 0.75) - assert_(isinstance(val, int)) - val = stats.binom(10, 0.75).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - - def test_pmf(self): - # regression test for Ticket #1842 - vals1 = stats.binom.pmf(100, 100,1) - vals2 = stats.binom.pmf(0, 100,0) - assert_allclose(vals1, 1.0, rtol=1e-15, atol=0) - assert_allclose(vals2, 1.0, rtol=1e-15, atol=0) - - def test_entropy(self): - # Basic entropy tests. - b = stats.binom(2, 0.5) - expected_p = np.array([0.25, 0.5, 0.25]) - expected_h = -sum(xlogy(expected_p, expected_p)) - h = b.entropy() - assert_allclose(h, expected_h) - - b = stats.binom(2, 0.0) - h = b.entropy() - assert_equal(h, 0.0) - - b = stats.binom(2, 1.0) - h = b.entropy() - assert_equal(h, 0.0) - - -class TestBernoulli(TestCase): - def test_rvs(self): - vals = stats.bernoulli.rvs(0.75, size=(2, 50)) - assert_(numpy.all(vals >= 0) & numpy.all(vals <= 1)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.bernoulli.rvs(0.75) - assert_(isinstance(val, int)) - val = stats.bernoulli(0.75).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - - def test_entropy(self): - # Simple tests of entropy. 
- b = stats.bernoulli(0.25) - expected_h = -0.25*np.log(0.25) - 0.75*np.log(0.75) - h = b.entropy() - assert_allclose(h, expected_h) - - b = stats.bernoulli(0.0) - h = b.entropy() - assert_equal(h, 0.0) - - b = stats.bernoulli(1.0) - h = b.entropy() - assert_equal(h, 0.0) - - -class TestNBinom(TestCase): - def test_rvs(self): - vals = stats.nbinom.rvs(10, 0.75, size=(2, 50)) - assert_(numpy.all(vals >= 0)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.nbinom.rvs(10, 0.75) - assert_(isinstance(val, int)) - val = stats.nbinom(10, 0.75).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - - def test_pmf(self): - # regression test for ticket 1779 - assert_allclose(np.exp(stats.nbinom.logpmf(700, 721, 0.52)), - stats.nbinom.pmf(700, 721, 0.52)) - - -class TestGeom(TestCase): - def test_rvs(self): - vals = stats.geom.rvs(0.75, size=(2, 50)) - assert_(numpy.all(vals >= 0)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.geom.rvs(0.75) - assert_(isinstance(val, int)) - val = stats.geom(0.75).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - - def test_pmf(self): - vals = stats.geom.pmf([1,2,3],0.5) - assert_array_almost_equal(vals,[0.5,0.25,0.125]) - - def test_logpmf(self): - # regression test for ticket 1793 - vals1 = np.log(stats.geom.pmf([1,2,3], 0.5)) - vals2 = stats.geom.logpmf([1,2,3], 0.5) - assert_allclose(vals1, vals2, rtol=1e-15, atol=0) - - def test_cdf_sf(self): - vals = stats.geom.cdf([1, 2, 3], 0.5) - vals_sf = stats.geom.sf([1, 2, 3], 0.5) - expected = array([0.5, 0.75, 0.875]) - assert_array_almost_equal(vals, expected) - assert_array_almost_equal(vals_sf, 1-expected) - - def test_logcdf_logsf(self): - vals = stats.geom.logcdf([1, 2, 3], 0.5) - vals_sf = stats.geom.logsf([1, 2, 3], 0.5) - expected = array([0.5, 0.75, 0.875]) - assert_array_almost_equal(vals, np.log(expected)) - assert_array_almost_equal(vals_sf, np.log1p(-expected)) - - def test_ppf(self): - vals = stats.geom.ppf([0.5, 0.75, 0.875], 0.5) - expected = array([1.0, 2.0, 3.0]) - assert_array_almost_equal(vals, expected) - - -class TestTruncnorm(TestCase): - def test_ppf_ticket1131(self): - vals = stats.truncnorm.ppf([-0.5,0,1e-4,0.5, 1-1e-4,1,2], -1., 1., - loc=[3]*7, scale=2) - expected = np.array([np.nan, 1, 1.00056419, 3, 4.99943581, 5, np.nan]) - assert_array_almost_equal(vals, expected) - - def test_isf_ticket1131(self): - vals = stats.truncnorm.isf([-0.5,0,1e-4,0.5, 1-1e-4,1,2], -1., 1., - loc=[3]*7, scale=2) - expected = np.array([np.nan, 5, 4.99943581, 3, 1.00056419, 1, np.nan]) - assert_array_almost_equal(vals, expected) - - def test_gh_2477_small_values(self): - # Check a case that worked in the original issue. - low, high = -11, -10 - x = stats.truncnorm.rvs(low, high, 0, 1, size=10) - assert_(low < x.min() < x.max() < high) - # Check a case that failed in the original issue. - low, high = 10, 11 - x = stats.truncnorm.rvs(low, high, 0, 1, size=10) - assert_(low < x.min() < x.max() < high) - - def test_gh_2477_large_values(self): - # Check a case that fails because of extreme tailness. - raise SkipTest('truncnorm rvs is know to fail at extreme tails') - low, high = 100, 101 - x = stats.truncnorm.rvs(low, high, 0, 1, size=10) - assert_(low < x.min() < x.max() < high) - - def test_gh_1489_trac_962_rvs(self): - # Check the original example. 
- low, high = 10, 15 - x = stats.truncnorm.rvs(low, high, 0, 1, size=10) - assert_(low < x.min() < x.max() < high) - - -class TestHypergeom(TestCase): - def test_rvs(self): - vals = stats.hypergeom.rvs(20, 10, 3, size=(2, 50)) - assert_(numpy.all(vals >= 0) & - numpy.all(vals <= 3)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.hypergeom.rvs(20, 3, 10) - assert_(isinstance(val, int)) - val = stats.hypergeom(20, 3, 10).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - - def test_precision(self): - # comparison number from mpmath - M = 2500 - n = 50 - N = 500 - tot = M - good = n - hgpmf = stats.hypergeom.pmf(2, tot, good, N) - assert_almost_equal(hgpmf, 0.0010114963068932233, 11) - - def test_precision2(self): - # Test hypergeom precision for large numbers. See #1218. - # Results compared with those from R. - oranges = 9.9e4 - pears = 1.1e5 - fruits_eaten = np.array([3, 3.8, 3.9, 4, 4.1, 4.2, 5]) * 1e4 - quantile = 2e4 - res = [] - for eaten in fruits_eaten: - res.append(stats.hypergeom.sf(quantile, oranges + pears, oranges, eaten)) - expected = np.array([0, 1.904153e-114, 2.752693e-66, 4.931217e-32, - 8.265601e-11, 0.1237904, 1]) - assert_allclose(res, expected, atol=0, rtol=5e-7) - - # Test with array_like first argument - quantiles = [1.9e4, 2e4, 2.1e4, 2.15e4] - res2 = stats.hypergeom.sf(quantiles, oranges + pears, oranges, 4.2e4) - expected2 = [1, 0.1237904, 6.511452e-34, 3.277667e-69] - assert_allclose(res2, expected2, atol=0, rtol=5e-7) - - def test_entropy(self): - # Simple tests of entropy. - hg = stats.hypergeom(4, 1, 1) - h = hg.entropy() - expected_p = np.array([0.75, 0.25]) - expected_h = -np.sum(xlogy(expected_p, expected_p)) - assert_allclose(h, expected_h) - - hg = stats.hypergeom(1, 1, 1) - h = hg.entropy() - assert_equal(h, 0.0) - - -class TestLoggamma(TestCase): - - def test_stats(self): - # The following precomputed values are from the table in section 2.2 - # of "A Statistical Study of Log-Gamma Distribution", by Ping Shing - # Chan (thesis, McMaster University, 1993). - table = np.array([ - # c, mean, var, skew, exc. kurt. - 0.5, -1.9635, 4.9348, -1.5351, 4.0000, - 1.0, -0.5772, 1.6449, -1.1395, 2.4000, - 12.0, 2.4427, 0.0869, -0.2946, 0.1735, - ]).reshape(-1, 5) - for c, mean, var, skew, kurt in table: - computed = stats.loggamma.stats(c, moments='msvk') - assert_array_almost_equal(computed, [mean, var, skew, kurt], - decimal=4) - - -class TestLogser(TestCase): - def test_rvs(self): - vals = stats.logser.rvs(0.75, size=(2, 50)) - assert_(numpy.all(vals >= 1)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.logser.rvs(0.75) - assert_(isinstance(val, int)) - val = stats.logser(0.75).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - - -class TestPareto(TestCase): - def test_stats(self): - # Check the stats() method with some simple values. Also check - # that the calculations do not trigger RuntimeWarnings. 
- with warnings.catch_warnings(): - warnings.simplefilter("error", RuntimeWarning) - - m, v, s, k = stats.pareto.stats(0.5, moments='mvsk') - assert_equal(m, np.inf) - assert_equal(v, np.inf) - assert_equal(s, np.nan) - assert_equal(k, np.nan) - - m, v, s, k = stats.pareto.stats(1.0, moments='mvsk') - assert_equal(m, np.inf) - assert_equal(v, np.inf) - assert_equal(s, np.nan) - assert_equal(k, np.nan) - - m, v, s, k = stats.pareto.stats(1.5, moments='mvsk') - assert_equal(m, 3.0) - assert_equal(v, np.inf) - assert_equal(s, np.nan) - assert_equal(k, np.nan) - - m, v, s, k = stats.pareto.stats(2.0, moments='mvsk') - assert_equal(m, 2.0) - assert_equal(v, np.inf) - assert_equal(s, np.nan) - assert_equal(k, np.nan) - - m, v, s, k = stats.pareto.stats(2.5, moments='mvsk') - assert_allclose(m, 2.5 / 1.5) - assert_allclose(v, 2.5 / (1.5*1.5*0.5)) - assert_equal(s, np.nan) - assert_equal(k, np.nan) - - m, v, s, k = stats.pareto.stats(3.0, moments='mvsk') - assert_allclose(m, 1.5) - assert_allclose(v, 0.75) - assert_equal(s, np.nan) - assert_equal(k, np.nan) - - m, v, s, k = stats.pareto.stats(3.5, moments='mvsk') - assert_allclose(m, 3.5 / 2.5) - assert_allclose(v, 3.5 / (2.5*2.5*1.5)) - assert_allclose(s, (2*4.5/0.5)*np.sqrt(1.5/3.5)) - assert_equal(k, np.nan) - - m, v, s, k = stats.pareto.stats(4.0, moments='mvsk') - assert_allclose(m, 4.0 / 3.0) - assert_allclose(v, 4.0 / 18.0) - assert_allclose(s, 2*(1+4.0)/(4.0-3) * np.sqrt((4.0-2)/4.0)) - assert_equal(k, np.nan) - - m, v, s, k = stats.pareto.stats(4.5, moments='mvsk') - assert_allclose(m, 4.5 / 3.5) - assert_allclose(v, 4.5 / (3.5*3.5*2.5)) - assert_allclose(s, (2*5.5/1.5) * np.sqrt(2.5/4.5)) - assert_allclose(k, 6*(4.5**3 + 4.5**2 - 6*4.5 - 2)/(4.5*1.5*0.5)) - - -class TestPearson3(TestCase): - def test_rvs(self): - vals = stats.pearson3.rvs(0.1, size=(2, 50)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllFloat']) - val = stats.pearson3.rvs(0.5) - assert_(isinstance(val, float)) - val = stats.pearson3(0.5).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllFloat']) - assert_(len(val) == 3) - - def test_pdf(self): - vals = stats.pearson3.pdf(2, [0.0, 0.1, 0.2]) - assert_allclose(vals, np.array([0.05399097, 0.05555481, 0.05670246]), - atol=1e-6) - vals = stats.pearson3.pdf(-3, 0.1) - assert_allclose(vals, np.array([0.00313791]), atol=1e-6) - vals = stats.pearson3.pdf([-3,-2,-1,0,1], 0.1) - assert_allclose(vals, np.array([0.00313791, 0.05192304, 0.25028092, - 0.39885918, 0.23413173]), atol=1e-6) - - def test_cdf(self): - vals = stats.pearson3.cdf(2, [0.0, 0.1, 0.2]) - assert_allclose(vals, np.array([0.97724987, 0.97462004, 0.97213626]), - atol=1e-6) - vals = stats.pearson3.cdf(-3, 0.1) - assert_allclose(vals, [0.00082256], atol=1e-6) - vals = stats.pearson3.cdf([-3,-2,-1,0,1], 0.1) - assert_allclose(vals, [8.22563821e-04, 1.99860448e-02, 1.58550710e-01, - 5.06649130e-01, 8.41442111e-01], atol=1e-6) - - -class TestPoisson(TestCase): - def test_rvs(self): - vals = stats.poisson.rvs(0.5, size=(2, 50)) - assert_(numpy.all(vals >= 0)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.poisson.rvs(0.5) - assert_(isinstance(val, int)) - val = stats.poisson(0.5).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - - def test_stats(self): - mu = 16.0 - result = stats.poisson.stats(mu, moments='mvsk') - assert_allclose(result, [mu, mu, np.sqrt(1.0/mu), 1.0/mu]) - - -class 
TestZipf(TestCase): - def test_rvs(self): - vals = stats.zipf.rvs(1.5, size=(2, 50)) - assert_(numpy.all(vals >= 1)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.zipf.rvs(1.5) - assert_(isinstance(val, int)) - val = stats.zipf(1.5).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - - def test_moments(self): - # n-th moment is finite iff a > n + 1 - m, v = stats.zipf.stats(a=2.8) - assert_(np.isfinite(m)) - assert_equal(v, np.inf) - - s, k = stats.zipf.stats(a=4.8, moments='sk') - assert_(not np.isfinite([s, k]).all()) - - -class TestDLaplace(TestCase): - def test_rvs(self): - vals = stats.dlaplace.rvs(1.5, size=(2, 50)) - assert_(numpy.shape(vals) == (2, 50)) - assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.dlaplace.rvs(1.5) - assert_(isinstance(val, int)) - val = stats.dlaplace(1.5).rvs(3) - assert_(isinstance(val, numpy.ndarray)) - assert_(val.dtype.char in typecodes['AllInteger']) - assert_(stats.dlaplace.rvs(0.8) is not None) - - - def test_stats(self): - # compare the explicit formulas w/ direct summation using pmf - a = 1. - dl = stats.dlaplace(a) - m, v, s, k = dl.stats('mvsk') - - N = 37 - xx = np.arange(-N, N+1) - pp = dl.pmf(xx) - m2, m4 = np.sum(pp*xx**2), np.sum(pp*xx**4) - assert_equal((m, s), (0,0)) - assert_allclose((v, k), (m2, m4/m2**2 - 3.), atol=1e-14, rtol=1e-8) - - def test_stats2(self): - a = np.log(2.) - dl = stats.dlaplace(a) - m, v, s, k = dl.stats('mvsk') - assert_equal((m, s), (0.,0.)) - assert_allclose((v, k), (4., 3.25)) - - -class TestInvGamma(TestCase): - @dec.skipif(NumpyVersion(np.__version__) < '1.7.0', - "assert_* funcs broken with inf/nan") - def test_invgamma_inf_gh_1866(self): - # invgamma's moments are only finite for a>n - # specific numbers checked w/ boost 1.54 - with warnings.catch_warnings(): - warnings.simplefilter('error', RuntimeWarning) - mvsk = stats.invgamma.stats(a=19.31, moments='mvsk') - assert_allclose(mvsk, - [0.05461496450, 0.0001723162534, 1.020362676, 2.055616582]) - - a = [1.1, 3.1, 5.6] - mvsk = stats.invgamma.stats(a=a, moments='mvsk') - expected = ([10., 0.476190476, 0.2173913043], # mmm - [np.inf, 0.2061430632, 0.01312749422], # vvv - [np.nan, 41.95235392, 2.919025532], # sss - [np.nan, np.nan, 24.51923076]) # kkk - for x, y in zip(mvsk, expected): - assert_almost_equal(x, y) - - -class TestF(TestCase): - def test_f_moments(self): - # n-th moment of F distributions is only finite for n < dfd / 2 - m, v, s, k = stats.f.stats(11, 6.5, moments='mvsk') - assert_(np.isfinite(m)) - assert_(np.isfinite(v)) - assert_(np.isfinite(s)) - assert_(not np.isfinite(k)) - - def test_moments_warnings(self): - # no warnings should be generated for dfd = 2, 4, 6, 8 (div by zero) - with warnings.catch_warnings(): - warnings.simplefilter('error', RuntimeWarning) - stats.f.stats(dfn=[11]*4, dfd=[2, 4, 6, 8], moments='mvsk') - - @dec.knownfailureif(True, 'f stats does not properly broadcast') - def test_stats_broadcast(self): - # stats do not fully broadcast just yet - mv = stats.f.stats(dfn=11, dfd=[11, 12]) - - -def test_rvgeneric_std(): - # Regression test for #1191 - assert_array_almost_equal(stats.t.std([5, 6]), [1.29099445, 1.22474487]) - - -class TestRvDiscrete(TestCase): - def test_rvs(self): - states = [-1,0,1,2,3,4] - probability = [0.0,0.3,0.4,0.0,0.3,0.0] - samples = 1000 - r = stats.rv_discrete(name='sample',values=(states,probability)) - x = r.rvs(size=samples) - assert_(isinstance(x, numpy.ndarray)) - - 
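        # (Editorial note, not part of the patch: the loop below is a crude
        # law-of-large-numbers check -- with 1000 draws the standard error of
        # an empirical frequency is at most sqrt(0.25/1000) ~= 0.016, so
        # requiring each frequency to sit within 0.05 of its nominal
        # probability leaves roughly a three-sigma margin.)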
for s,p in zip(states,probability): - assert_(abs(sum(x == s)/float(samples) - p) < 0.05) - - x = r.rvs() - assert_(isinstance(x, int)) - - def test_entropy(self): - # Basic tests of entropy. - pvals = np.array([0.25, 0.45, 0.3]) - p = stats.rv_discrete(values=([0, 1, 2], pvals)) - expected_h = -sum(xlogy(pvals, pvals)) - h = p.entropy() - assert_allclose(h, expected_h) - - p = stats.rv_discrete(values=([0, 1, 2], [1.0, 0, 0])) - h = p.entropy() - assert_equal(h, 0.0) - - -class TestExpon(TestCase): - def test_zero(self): - assert_equal(stats.expon.pdf(0),1) - - def test_tail(self): # Regression test for ticket 807 - assert_equal(stats.expon.cdf(1e-18), 1e-18) - assert_equal(stats.expon.isf(stats.expon.sf(40)), 40) - - -class TestGenExpon(TestCase): - def test_pdf_unity_area(self): - from scipy.integrate import simps - # PDF should integrate to one - assert_almost_equal(simps(stats.genexpon.pdf(numpy.arange(0,10,0.01), - 0.5, 0.5, 2.0), - dx=0.01), 1, 1) - - def test_cdf_bounds(self): - # CDF should always be positive - cdf = stats.genexpon.cdf(numpy.arange(0, 10, 0.01), 0.5, 0.5, 2.0) - assert_(numpy.all((0 <= cdf) & (cdf <= 1))) - - -class TestExponpow(TestCase): - def test_tail(self): - assert_almost_equal(stats.exponpow.cdf(1e-10, 2.), 1e-20) - assert_almost_equal(stats.exponpow.isf(stats.exponpow.sf(5, .8), .8), 5) - - -class TestSkellam(TestCase): - def test_pmf(self): - # comparison to R - k = numpy.arange(-10, 15) - mu1, mu2 = 10, 5 - skpmfR = numpy.array( - [4.2254582961926893e-005, 1.1404838449648488e-004, - 2.8979625801752660e-004, 6.9177078182101231e-004, - 1.5480716105844708e-003, 3.2412274963433889e-003, - 6.3373707175123292e-003, 1.1552351566696643e-002, - 1.9606152375042644e-002, 3.0947164083410337e-002, - 4.5401737566767360e-002, 6.1894328166820688e-002, - 7.8424609500170578e-002, 9.2418812533573133e-002, - 1.0139793148019728e-001, 1.0371927988298846e-001, - 9.9076583077406091e-002, 8.8546660073089561e-002, - 7.4187842052486810e-002, 5.8392772862200251e-002, - 4.3268692953013159e-002, 3.0248159818374226e-002, - 1.9991434305603021e-002, 1.2516877303301180e-002, - 7.4389876226229707e-003]) - - assert_almost_equal(stats.skellam.pmf(k, mu1, mu2), skpmfR, decimal=15) - - def test_cdf(self): - # comparison to R, only 5 decimals - k = numpy.arange(-10, 15) - mu1, mu2 = 10, 5 - skcdfR = numpy.array( - [6.4061475386192104e-005, 1.7810985988267694e-004, - 4.6790611790020336e-004, 1.1596768997212152e-003, - 2.7077485103056847e-003, 5.9489760066490718e-003, - 1.2286346724161398e-002, 2.3838698290858034e-002, - 4.3444850665900668e-002, 7.4392014749310995e-002, - 1.1979375231607835e-001, 1.8168808048289900e-001, - 2.6011268998306952e-001, 3.5253150251664261e-001, - 4.5392943399683988e-001, 5.5764871387982828e-001, - 6.5672529695723436e-001, 7.4527195703032389e-001, - 8.1945979908281064e-001, 8.7785257194501087e-001, - 9.2112126489802404e-001, 9.5136942471639818e-001, - 9.7136085902200120e-001, 9.8387773632530240e-001, - 9.9131672394792536e-001]) - - assert_almost_equal(stats.skellam.cdf(k, mu1, mu2), skcdfR, decimal=5) - - -class TestLognorm(TestCase): - def test_pdf(self): - # Regression test for Ticket #1471: avoid nan with 0/0 situation - with np.errstate(divide='ignore'): - pdf = stats.lognorm.pdf([0, 0.5, 1], 1) - assert_array_almost_equal(pdf, [0.0, 0.62749608, 0.39894228]) - - -class TestBeta(TestCase): - def test_logpdf(self): - # Regression test for Ticket #1326: avoid nan with 0*log(0) situation - logpdf = stats.beta.logpdf(0,1,0.5) - assert_almost_equal(logpdf, 
-0.69314718056) - logpdf = stats.beta.logpdf(0,0.5,1) - assert_almost_equal(logpdf, np.inf) - - def test_logpdf_ticket_1866(self): - alpha, beta = 267, 1472 - x = np.array([0.2, 0.5, 0.6]) - b = stats.beta(alpha, beta) - assert_allclose(b.logpdf(x).sum(), -1201.699061824062) - assert_allclose(b.pdf(x), np.exp(b.logpdf(x))) - - -class TestBetaPrime(TestCase): - def test_logpdf(self): - alpha, beta = 267, 1472 - x = np.array([0.2, 0.5, 0.6]) - b = stats.betaprime(alpha, beta) - assert_(np.isfinite(b.logpdf(x)).all()) - assert_allclose(b.pdf(x), np.exp(b.logpdf(x))) - - -class TestGamma(TestCase): - def test_pdf(self): - # a few test cases to compare with R - pdf = stats.gamma.pdf(90, 394, scale=1./5) - assert_almost_equal(pdf, 0.002312341) - - pdf = stats.gamma.pdf(3, 10, scale=1./5) - assert_almost_equal(pdf, 0.1620358) - - def test_logpdf(self): - # Regression test for Ticket #1326: cornercase avoid nan with 0*log(0) - # situation - logpdf = stats.gamma.logpdf(0,1) - assert_almost_equal(logpdf, 0) - - -class TestChi2(TestCase): - # regression tests after precision improvements, ticket:1041, not verified - def test_precision(self): - assert_almost_equal(stats.chi2.pdf(1000, 1000), 8.919133934753128e-003, 14) - assert_almost_equal(stats.chi2.pdf(100, 100), 0.028162503162596778, 14) - - -class TestArrayArgument(TestCase): # test for ticket:992 - def test_noexception(self): - rvs = stats.norm.rvs(loc=(np.arange(5)), scale=np.ones(5), size=(10,5)) - assert_equal(rvs.shape, (10,5)) - - -class TestDocstring(TestCase): - def test_docstrings(self): - # See ticket #761 - if stats.rayleigh.__doc__ is not None: - self.assertTrue("rayleigh" in stats.rayleigh.__doc__.lower()) - if stats.bernoulli.__doc__ is not None: - self.assertTrue("bernoulli" in stats.bernoulli.__doc__.lower()) - - def test_no_name_arg(self): - # If name is not given, construction shouldn't fail. See #1508. - stats.rv_continuous() - stats.rv_discrete() - - -class TestEntropy(TestCase): - def test_entropy_positive(self): - # See ticket #497 - pk = [0.5,0.2,0.3] - qk = [0.1,0.25,0.65] - eself = stats.entropy(pk,pk) - edouble = stats.entropy(pk,qk) - assert_(0.0 == eself) - assert_(edouble >= 0.0) - - def test_entropy_base(self): - pk = np.ones(16, float) - S = stats.entropy(pk, base=2.) - assert_(abs(S - 4.) < 1.e-5) - - qk = np.ones(16, float) - qk[:8] = 2. - S = stats.entropy(pk, qk) - S2 = stats.entropy(pk, qk, base=2.) 
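        # (Editorial note, not part of the patch: with qk given, stats.entropy
        # returns the Kullback-Leibler divergence sum(pk * log(pk / qk)); the
        # `base` argument only rescales the logarithm, so the natural-log value
        # S and the base-2 value S2 satisfy S == S2 * log(2), which is what the
        # assertion below verifies.)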
- assert_(abs(S/S2 - np.log(2.)) < 1.e-5) - - def test_entropy_zero(self): - # Test for PR-479 - assert_almost_equal(stats.entropy([0, 1, 2]), 0.63651416829481278, - decimal=12) - - def test_entropy_2d(self): - pk = [[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]] - qk = [[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]] - assert_array_almost_equal(stats.entropy(pk, qk), - [0.1933259, 0.18609809]) - - @dec.skipif(NumpyVersion(np.__version__) < '1.7.0', - "assert_* funcs broken with inf/nan") - def test_entropy_2d_zero(self): - pk = [[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]] - qk = [[0.0, 0.1], [0.3, 0.6], [0.5, 0.3]] - assert_array_almost_equal(stats.entropy(pk, qk), - [np.inf, 0.18609809]) - - pk[0][0] = 0.0 - assert_array_almost_equal(stats.entropy(pk, qk), - [0.17403988, 0.18609809]) - - -def TestArgsreduce(): - a = array([1,3,2,1,2,3,3]) - b,c = argsreduce(a > 1, a, 2) - - assert_array_equal(b, [3,2,2,3,3]) - assert_array_equal(c, [2,2,2,2,2]) - - b,c = argsreduce(2 > 1, a, 2) - assert_array_equal(b, a[0]) - assert_array_equal(c, [2]) - - b,c = argsreduce(a > 0, a, 2) - assert_array_equal(b, a) - assert_array_equal(c, [2] * numpy.size(a)) - - -class TestFitMethod(object): - skip = ['ncf'] - - @dec.slow - def test_fit(self): - def check(func, dist, args, alpha): - if dist in self.skip: - raise SkipTest("%s fit known to fail" % dist) - distfunc = getattr(stats, dist) - with np.errstate(all='ignore'): - res = distfunc.rvs(*args, **{'size':200}) - vals = distfunc.fit(res) - vals2 = distfunc.fit(res, optimizer='powell') - # Only check the length of the return - # FIXME: should check the actual results to see if we are 'close' - # to what was created --- but what is 'close' enough - if dist == 'frechet': - assert_(len(vals) == len(args)) - assert_(len(vals2) == len(args)) - else: - assert_(len(vals) == 2+len(args)) - assert_(len(vals2) == 2+len(args)) - - for func, dist, args, alpha in test_all_distributions(): - yield check, func, dist, args, alpha - - @dec.slow - def test_fix_fit(self): - def check(func, dist, args, alpha): - # Not sure why 'ncf', and 'beta' are failing - # frechet has different len(args) than distfunc.numargs - if dist in self.skip + ['frechet']: - raise SkipTest("%s fit known to fail" % dist) - distfunc = getattr(stats, dist) - with np.errstate(all='ignore'): - res = distfunc.rvs(*args, **{'size':200}) - vals = distfunc.fit(res,floc=0) - vals2 = distfunc.fit(res,fscale=1) - assert_(len(vals) == 2+len(args)) - assert_(vals[-2] == 0) - assert_(vals2[-1] == 1) - assert_(len(vals2) == 2+len(args)) - if len(args) > 0: - vals3 = distfunc.fit(res, f0=args[0]) - assert_(len(vals3) == 2+len(args)) - assert_(vals3[0] == args[0]) - if len(args) > 1: - vals4 = distfunc.fit(res, f1=args[1]) - assert_(len(vals4) == 2+len(args)) - assert_(vals4[1] == args[1]) - if len(args) > 2: - vals5 = distfunc.fit(res, f2=args[2]) - assert_(len(vals5) == 2+len(args)) - assert_(vals5[2] == args[2]) - - for func, dist, args, alpha in test_all_distributions(): - yield check, func, dist, args, alpha - - def test_fix_fit_2args_lognorm(self): - # Regression test for #1551. 
- np.random.seed(12345) - with np.errstate(all='ignore'): - x = stats.lognorm.rvs(0.25, 0., 20.0, size=20) - assert_allclose(np.array(stats.lognorm.fit(x, floc=0, fscale=20)), - [0.25888672, 0, 20], atol=1e-5) - - def test_fix_fit_norm(self): - x = np.arange(1, 6) - - loc, scale = stats.norm.fit(x) - assert_almost_equal(loc, 3) - assert_almost_equal(scale, np.sqrt(2)) - - loc, scale = stats.norm.fit(x, floc=2) - assert_equal(loc, 2) - assert_equal(scale, np.sqrt(3)) - - loc, scale = stats.norm.fit(x, fscale=2) - assert_almost_equal(loc, 3) - assert_equal(scale, 2) - - def test_fix_fit_gamma(self): - x = np.arange(1, 6) - meanlog = np.log(x).mean() - - # A basic test of gamma.fit with floc=0. - floc = 0 - a, loc, scale = stats.gamma.fit(x, floc=floc) - s = np.log(x.mean()) - meanlog - assert_almost_equal(np.log(a) - special.digamma(a), s, decimal=5) - assert_equal(loc, floc) - assert_almost_equal(scale, x.mean()/a, decimal=8) - - # Regression tests for gh-2514. - # The problem was that if `floc=0` was given, any other fixed - # parameters were ignored. - f0 = 1 - floc = 0 - a, loc, scale = stats.gamma.fit(x, f0=f0, floc=floc) - assert_equal(a, f0) - assert_equal(loc, floc) - assert_almost_equal(scale, x.mean()/a, decimal=8) - - f0 = 2 - floc = 0 - a, loc, scale = stats.gamma.fit(x, f0=f0, floc=floc) - assert_equal(a, f0) - assert_equal(loc, floc) - assert_almost_equal(scale, x.mean()/a, decimal=8) - - # loc and scale fixed. - floc = 0 - fscale = 2 - a, loc, scale = stats.gamma.fit(x, floc=floc, fscale=fscale) - assert_equal(loc, floc) - assert_equal(scale, fscale) - c = meanlog - np.log(fscale) - assert_almost_equal(special.digamma(a), c) - - def test_fix_fit_beta(self): - # Test beta.fit when both floc and fscale are given. - - def mlefunc(a, b, x): - # Zeros of this function are critical points of - # the maximum likelihood function. - n = len(x) - s1 = np.log(x).sum() - s2 = np.log(1-x).sum() - psiab = special.psi(a + b) - func = [s1 - n * (-psiab + special.psi(a)), - s2 - n * (-psiab + special.psi(b))] - return func - - # Basic test with floc and fscale given. - x = np.array([0.125, 0.25, 0.5]) - a, b, loc, scale = stats.beta.fit(x, floc=0, fscale=1) - assert_equal(loc, 0) - assert_equal(scale, 1) - assert_allclose(mlefunc(a, b, x), [0,0], atol=1e-6) - - # Basic test with f0, floc and fscale given. - # This is also a regression test for gh-2514. - x = np.array([0.125, 0.25, 0.5]) - a, b, loc, scale = stats.beta.fit(x, f0=2, floc=0, fscale=1) - assert_equal(a, 2) - assert_equal(loc, 0) - assert_equal(scale, 1) - da, db = mlefunc(a, b, x) - assert_allclose(db, 0, atol=1e-5) - - # Same floc and fscale values as above, but reverse the data - # and fix b (f1). - x2 = 1 - x - a2, b2, loc2, scale2 = stats.beta.fit(x2, f1=2, floc=0, fscale=1) - assert_equal(b2, 2) - assert_equal(loc2, 0) - assert_equal(scale2, 1) - da, db = mlefunc(a2, b2, x2) - assert_allclose(da, 0, atol=1e-5) - # a2 of this test should equal b from above. - assert_almost_equal(a2, b) - - # Check for detection of data out of bounds when floc and fscale - # are given. - assert_raises(ValueError, stats.beta.fit, x, floc=0.5, fscale=1) - y = np.array([0, .5, 1]) - assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1) - assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1, f0=2) - assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1, f1=2) - - # Check that attempting to fix all the parameters raises a ValueError. 
- assert_raises(ValueError, stats.beta.fit, y, f0=0, f1=1, - floc=2, fscale=3) - - -class TestFrozen(TestCase): - # Test that a frozen distribution gives the same results as the original object. - # - # Only tested for the normal distribution (with loc and scale specified) - # and for the gamma distribution (with a shape parameter specified). - def test_norm(self): - dist = stats.norm - frozen = stats.norm(loc=10.0, scale=3.0) - - result_f = frozen.pdf(20.0) - result = dist.pdf(20.0, loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.cdf(20.0) - result = dist.cdf(20.0, loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.ppf(0.25) - result = dist.ppf(0.25, loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.isf(0.25) - result = dist.isf(0.25, loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.sf(10.0) - result = dist.sf(10.0, loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.median() - result = dist.median(loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.mean() - result = dist.mean(loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.var() - result = dist.var(loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.std() - result = dist.std(loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.entropy() - result = dist.entropy(loc=10.0, scale=3.0) - assert_equal(result_f, result) - - result_f = frozen.moment(2) - result = dist.moment(2,loc=10.0, scale=3.0) - assert_equal(result_f, result) - - def test_gamma(self): - a = 2.0 - dist = stats.gamma - frozen = stats.gamma(a) - - result_f = frozen.pdf(20.0) - result = dist.pdf(20.0, a) - assert_equal(result_f, result) - - result_f = frozen.cdf(20.0) - result = dist.cdf(20.0, a) - assert_equal(result_f, result) - - result_f = frozen.ppf(0.25) - result = dist.ppf(0.25, a) - assert_equal(result_f, result) - - result_f = frozen.isf(0.25) - result = dist.isf(0.25, a) - assert_equal(result_f, result) - - result_f = frozen.sf(10.0) - result = dist.sf(10.0, a) - assert_equal(result_f, result) - - result_f = frozen.median() - result = dist.median(a) - assert_equal(result_f, result) - - result_f = frozen.mean() - result = dist.mean(a) - assert_equal(result_f, result) - - result_f = frozen.var() - result = dist.var(a) - assert_equal(result_f, result) - - result_f = frozen.std() - result = dist.std(a) - assert_equal(result_f, result) - - result_f = frozen.entropy() - result = dist.entropy(a) - assert_equal(result_f, result) - - result_f = frozen.moment(2) - result = dist.moment(2, a) - assert_equal(result_f, result) - - def test_regression_ticket_1293(self): - # Create a frozen distribution. - frozen = stats.lognorm(1) - # Call one of its methods that does not take any keyword arguments. - m1 = frozen.moment(2) - # Now call a method that takes a keyword argument. - frozen.stats(moments='mvsk') - # Call moment(2) again. - # After calling stats(), the following was raising an exception. - # So this test passes if the following does not raise an exception. - m2 = frozen.moment(2) - # The following should also be true, of course. But it is not - # the focus of this test. - assert_equal(m1, m2) - - -class TestExpect(TestCase): - # Test for expect method. 
- # - # Uses normal distribution and beta distribution for finite bounds, and - # hypergeom for discrete distribution with finite support - def test_norm(self): - v = stats.norm.expect(lambda x: (x-5)*(x-5), loc=5, scale=2) - assert_almost_equal(v, 4, decimal=14) - - m = stats.norm.expect(lambda x: (x), loc=5, scale=2) - assert_almost_equal(m, 5, decimal=14) - - lb = stats.norm.ppf(0.05, loc=5, scale=2) - ub = stats.norm.ppf(0.95, loc=5, scale=2) - prob90 = stats.norm.expect(lambda x: 1, loc=5, scale=2, lb=lb, ub=ub) - assert_almost_equal(prob90, 0.9, decimal=14) - - prob90c = stats.norm.expect(lambda x: 1, loc=5, scale=2, lb=lb, ub=ub, - conditional=True) - assert_almost_equal(prob90c, 1., decimal=14) - - def test_beta(self): - # case with finite support interval - v = stats.beta.expect(lambda x: (x-19/3.)*(x-19/3.), args=(10,5), - loc=5, scale=2) - assert_almost_equal(v, 1./18., decimal=13) - - m = stats.beta.expect(lambda x: x, args=(10,5), loc=5., scale=2.) - assert_almost_equal(m, 19/3., decimal=13) - - ub = stats.beta.ppf(0.95, 10, 10, loc=5, scale=2) - lb = stats.beta.ppf(0.05, 10, 10, loc=5, scale=2) - prob90 = stats.beta.expect(lambda x: 1., args=(10,10), loc=5., - scale=2.,lb=lb, ub=ub, conditional=False) - assert_almost_equal(prob90, 0.9, decimal=13) - - prob90c = stats.beta.expect(lambda x: 1, args=(10,10), loc=5, - scale=2, lb=lb, ub=ub, conditional=True) - assert_almost_equal(prob90c, 1., decimal=13) - - def test_hypergeom(self): - # test case with finite bounds - - # without specifying bounds - m_true, v_true = stats.hypergeom.stats(20, 10, 8, loc=5.) - m = stats.hypergeom.expect(lambda x: x, args=(20, 10, 8), loc=5.) - assert_almost_equal(m, m_true, decimal=13) - - v = stats.hypergeom.expect(lambda x: (x-9.)**2, args=(20, 10, 8), - loc=5.) 
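# ---------------------------------------------------------------------------
# Editorial aside (not part of the patch): for a discrete distribution,
# expect(func, args, lb=.., ub=.., conditional=..) is in essence a weighted
# sum of func over the integer support, sum(func(k) * pmf(k)) for
# lb <= k <= ub, divided by the probability of that range when
# conditional=True.  The helper below is an illustrative hand-rolled
# equivalent; its name and the hypergeometric usage are assumptions made for
# this sketch, not part of the wafo/scipy API.
import numpy as np
from scipy import stats as _stats


def expect_by_summation(distfn, args, func, lb, ub, conditional=False):
    k = np.arange(lb, ub + 1)
    w = distfn.pmf(k, *args)      # probability weights on the integer support
    total = np.sum(func(k) * w)
    return total / np.sum(w) if conditional else total


# Hypothetical usage: should agree with
# _stats.hypergeom.expect(lambda x: x, args=(20, 10, 8)), i.e. the mean 4.0.
#   m = expect_by_summation(_stats.hypergeom, (20, 10, 8), lambda x: x, 0, 8)
# ---------------------------------------------------------------------------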
- assert_almost_equal(v, v_true, decimal=14) - - # with bounds, bounds equal to shifted support - v_bounds = stats.hypergeom.expect(lambda x: (x-9.)**2, args=(20, 10, 8), - loc=5., lb=5, ub=13) - assert_almost_equal(v_bounds, v_true, decimal=14) - - # drop boundary points - prob_true = 1-stats.hypergeom.pmf([5, 13], 20, 10, 8, loc=5).sum() - prob_bounds = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), - loc=5., lb=6, ub=12) - assert_almost_equal(prob_bounds, prob_true, decimal=13) - - # conditional - prob_bc = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), loc=5., - lb=6, ub=12, conditional=True) - assert_almost_equal(prob_bc, 1, decimal=14) - - # check simple integral - prob_b = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), - lb=0, ub=8) - assert_almost_equal(prob_b, 1, decimal=13) - - def test_poisson(self): - # poisson, use lower bound only - prob_bounds = stats.poisson.expect(lambda x: 1, args=(2,), lb=3, - conditional=False) - prob_b_true = 1-stats.poisson.cdf(2,2) - assert_almost_equal(prob_bounds, prob_b_true, decimal=14) - - prob_lb = stats.poisson.expect(lambda x: 1, args=(2,), lb=2, - conditional=True) - assert_almost_equal(prob_lb, 1, decimal=14) - - def test_genhalflogistic(self): - # genhalflogistic, changes upper bound of support in _argcheck - # regression test for gh-2622 - halflog = stats.genhalflogistic - # check consistency when calling expect twice with the same input - res1 = halflog.expect(args=(1.5,)) - halflog.expect(args=(0.5,)) - res2 = halflog.expect(args=(1.5,)) - assert_almost_equal(res1, res2, decimal=14) - - def test_rice_overflow(self): - # rice.pdf(999, 0.74) was inf since special.i0 silentyly overflows - # check that using i0e fixes it - assert_(np.isfinite(stats.rice.pdf(999, 0.74))) - - assert_(np.isfinite(stats.rice.expect(lambda x: 1, args=(0.74,)))) - assert_(np.isfinite(stats.rice.expect(lambda x: 2, args=(0.74,)))) - assert_(np.isfinite(stats.rice.expect(lambda x: 3, args=(0.74,)))) - - -class TestNct(TestCase): - def test_nc_parameter(self): - # Parameter values c<=0 were not enabled (gh-2402). - # For negative values c and for c=0 results of rv.cdf(0) below were nan - rv = stats.nct(5, 0) - assert_equal(rv.cdf(0), 0.5) - rv = stats.nct(5, -1) - assert_almost_equal(rv.cdf(0), 0.841344746069, decimal=10) - - def test_broadcasting(self): - res = stats.nct.pdf(5, np.arange(4,7)[:,None], np.linspace(0.1, 1, 4)) - expected = array([[0.00321886, 0.00557466, 0.00918418, 0.01442997], - [0.00217142, 0.00395366, 0.00683888, 0.01126276], - [0.00153078, 0.00291093, 0.00525206, 0.00900815]]) - assert_allclose(res, expected, rtol=1e-5) - - def text_variance_gh_issue_2401(self): - # Computation of the variance of a non-central t-distribution resulted - # in a TypeError: ufunc 'isinf' not supported for the input types, - # and the inputs could not be safely coerced to any supported types - # according to the casting rule 'safe' - rv = stats.nct(4, 0) - assert_equal(rv.var(), 2.0) - - def test_nct_inf_moments(self): - # n-th moment of nct only exists for df > n - m, v, s, k = stats.nct.stats(df=1.9, nc=0.3, moments='mvsk') - assert_(np.isfinite(m)) - assert_equal([v, s, k], [np.inf, np.nan, np.nan]) - - m, v, s, k = stats.nct.stats(df=3.1, nc=0.3, moments='mvsk') - assert_(np.isfinite([m, v, s]).all()) - assert_equal(k, np.nan) - - -class TestRice(TestCase): - def test_rice_zero_b(self): - # rice distribution should work with b=0, cf gh-2164 - x = [0.2, 1., 5.] 
- assert_(np.isfinite(stats.rice.pdf(x, b=0.)).all()) - assert_(np.isfinite(stats.rice.logpdf(x, b=0.)).all()) - assert_(np.isfinite(stats.rice.cdf(x, b=0.)).all()) - assert_(np.isfinite(stats.rice.logcdf(x, b=0.)).all()) - - q = [0.1, 0.1, 0.5, 0.9] - assert_(np.isfinite(stats.rice.ppf(q, b=0.)).all()) - - mvsk = stats.rice.stats(0, moments='mvsk') - assert_(np.isfinite(mvsk).all()) - - # furthermore, pdf is continuous as b\to 0 - # rice.pdf(x, b\to 0) = x exp(-x^2/2) + O(b^2) - # see e.g. Abramovich & Stegun 9.6.7 & 9.6.10 - b = 1e-8 - assert_allclose(stats.rice.pdf(x, 0), stats.rice.pdf(x, b), - atol=b, rtol=0) - - def test_rice_rvs(self): - rvs = stats.rice.rvs - assert_equal(rvs(b=3.).size, 1) - assert_equal(rvs(b=3., size=(3, 5)).shape, (3, 5)) - - -class TestErlang(TestCase): - def test_erlang_runtimewarning(self): - # erlang should generate a RuntimeWarning if a non-integer - # shape parameter is used. - with warnings.catch_warnings(): - warnings.simplefilter("error", RuntimeWarning) - - # The non-integer shape parameter 1.3 should trigger a RuntimeWarning - assert_raises(RuntimeWarning, - stats.erlang.rvs, 1.3, loc=0, scale=1, size=4) - - # Calling the fit method with `f0` set to an integer should - # *not* trigger a RuntimeWarning. It should return the same - # values as gamma.fit(...). - data = [0.5, 1.0, 2.0, 4.0] - result_erlang = stats.erlang.fit(data, f0=1) - result_gamma = stats.gamma.fit(data, f0=1) - assert_allclose(result_erlang, result_gamma, rtol=1e-3) - - -class TestRdist(TestCase): - @dec.slow - def test_rdist_cdf_gh1285(self): - # check workaround in rdist._cdf for issue gh-1285. - distfn = stats.rdist - values = [0.001, 0.5, 0.999] - assert_almost_equal(distfn.cdf(distfn.ppf(values, 541.0), 541.0), - values, decimal=5) - - -def test_540_567(): - # test for nan returned in tickets 540, 567 - assert_almost_equal(stats.norm.cdf(-1.7624320982),0.03899815971089126, - decimal=10, err_msg='test_540_567') - assert_almost_equal(stats.norm.cdf(-1.7624320983),0.038998159702449846, - decimal=10, err_msg='test_540_567') - assert_almost_equal(stats.norm.cdf(1.38629436112, loc=0.950273420309, - scale=0.204423758009),0.98353464004309321, - decimal=10, err_msg='test_540_567') - - -def test_regression_ticket_1316(): - # The following was raising an exception, because _construct_default_doc() - # did not handle the default keyword extradoc=None. See ticket #1316. - g = stats._continuous_distns.gamma_gen(name='gamma') - - -def test_regression_ticket_1326(): - # adjust to avoid nan with 0*log(0) - assert_almost_equal(stats.chi2.pdf(0.0, 2), 0.5, 14) - - -def test_regression_tukey_lambda(): - # Make sure that Tukey-Lambda distribution correctly handles non-positive lambdas. 
- x = np.linspace(-5.0, 5.0, 101) - - olderr = np.seterr(divide='ignore') - try: - for lam in [0.0, -1.0, -2.0, np.array([[-1.0], [0.0], [-2.0]])]: - p = stats.tukeylambda.pdf(x, lam) - assert_((p != 0.0).all()) - assert_(~np.isnan(p).all()) - - lam = np.array([[-1.0], [0.0], [2.0]]) - p = stats.tukeylambda.pdf(x, lam) - finally: - np.seterr(**olderr) - - assert_(~np.isnan(p).all()) - assert_((p[0] != 0.0).all()) - assert_((p[1] != 0.0).all()) - assert_((p[2] != 0.0).any()) - assert_((p[2] == 0.0).any()) - - -@dec.skipif(DOCSTRINGS_STRIPPED) -def test_regression_ticket_1421(): - assert_('pdf(x, mu, loc=0, scale=1)' not in stats.poisson.__doc__) - assert_('pmf(x,' in stats.poisson.__doc__) - - -def test_nan_arguments_gh_issue_1362(): - assert_(np.isnan(stats.t.logcdf(1, np.nan))) - assert_(np.isnan(stats.t.cdf(1, np.nan))) - assert_(np.isnan(stats.t.logsf(1, np.nan))) - assert_(np.isnan(stats.t.sf(1, np.nan))) - assert_(np.isnan(stats.t.pdf(1, np.nan))) - assert_(np.isnan(stats.t.logpdf(1, np.nan))) - assert_(np.isnan(stats.t.ppf(1, np.nan))) - assert_(np.isnan(stats.t.isf(1, np.nan))) - - assert_(np.isnan(stats.bernoulli.logcdf(np.nan, 0.5))) - assert_(np.isnan(stats.bernoulli.cdf(np.nan, 0.5))) - assert_(np.isnan(stats.bernoulli.logsf(np.nan, 0.5))) - assert_(np.isnan(stats.bernoulli.sf(np.nan, 0.5))) - assert_(np.isnan(stats.bernoulli.pmf(np.nan, 0.5))) - assert_(np.isnan(stats.bernoulli.logpmf(np.nan, 0.5))) - assert_(np.isnan(stats.bernoulli.ppf(np.nan, 0.5))) - assert_(np.isnan(stats.bernoulli.isf(np.nan, 0.5))) - - -def test_frozen_fit_ticket_1536(): - np.random.seed(5678) - true = np.array([0.25, 0., 0.5]) - x = stats.lognorm.rvs(true[0], true[1], true[2], size=100) - - olderr = np.seterr(divide='ignore') - try: - params = np.array(stats.lognorm.fit(x, floc=0.)) - finally: - np.seterr(**olderr) - - assert_almost_equal(params, true, decimal=2) - - params = np.array(stats.lognorm.fit(x, fscale=0.5, loc=0)) - assert_almost_equal(params, true, decimal=2) - - params = np.array(stats.lognorm.fit(x, f0=0.25, loc=0)) - assert_almost_equal(params, true, decimal=2) - - params = np.array(stats.lognorm.fit(x, f0=0.25, floc=0)) - assert_almost_equal(params, true, decimal=2) - - np.random.seed(5678) - loc = 1 - floc = 0.9 - x = stats.norm.rvs(loc, 2., size=100) - params = np.array(stats.norm.fit(x, floc=floc)) - expected = np.array([floc, np.sqrt(((x-floc)**2).mean())]) - assert_almost_equal(params, expected, decimal=4) - - -def test_regression_ticket_1530(): - # Check the starting value works for Cauchy distribution fit. - np.random.seed(654321) - rvs = stats.cauchy.rvs(size=100) - params = stats.cauchy.fit(rvs) - expected = (0.045, 1.142) - assert_almost_equal(params, expected, decimal=1) - - -def test_tukeylambda_stats_ticket_1545(): - # Some test for the variance and kurtosis of the Tukey Lambda distr. - # See test_tukeylamdba_stats.py for more tests. - - mv = stats.tukeylambda.stats(0, moments='mvsk') - # Known exact values: - expected = [0, np.pi**2/3, 0, 1.2] - assert_almost_equal(mv, expected, decimal=10) - - mv = stats.tukeylambda.stats(3.13, moments='mvsk') - # 'expected' computed with mpmath. - expected = [0, 0.0269220858861465102, 0, -0.898062386219224104] - assert_almost_equal(mv, expected, decimal=10) - - mv = stats.tukeylambda.stats(0.14, moments='mvsk') - # 'expected' computed with mpmath. 
- expected = [0, 2.11029702221450250, 0, -0.02708377353223019456] - assert_almost_equal(mv, expected, decimal=10) - - -def test_poisson_logpmf_ticket_1436(): - assert_(np.isfinite(stats.poisson.logpmf(1500, 200))) - - -def test_powerlaw_stats(): - """Test the powerlaw stats function. - - This unit test is also a regression test for ticket 1548. - - The exact values are: - mean: - mu = a / (a + 1) - variance: - sigma**2 = a / ((a + 2) * (a + 1) ** 2) - skewness: - One formula (see http://en.wikipedia.org/wiki/Skewness) is - gamma_1 = (E[X**3] - 3*mu*E[X**2] + 2*mu**3) / sigma**3 - A short calculation shows that E[X**k] is a / (a + k), so gamma_1 - can be implemented as - n = a/(a+3) - 3*(a/(a+1))*a/(a+2) + 2*(a/(a+1))**3 - d = sqrt(a/((a+2)*(a+1)**2)) ** 3 - gamma_1 = n/d - Either by simplifying, or by a direct calculation of mu_3 / sigma**3, - one gets the more concise formula: - gamma_1 = -2.0 * ((a - 1) / (a + 3)) * sqrt((a + 2) / a) - kurtosis: (See http://en.wikipedia.org/wiki/Kurtosis) - The excess kurtosis is - gamma_2 = mu_4 / sigma**4 - 3 - A bit of calculus and algebra (sympy helps) shows that - mu_4 = 3*a*(3*a**2 - a + 2) / ((a+1)**4 * (a+2) * (a+3) * (a+4)) - so - gamma_2 = 3*(3*a**2 - a + 2) * (a+2) / (a*(a+3)*(a+4)) - 3 - which can be rearranged to - gamma_2 = 6 * (a**3 - a**2 - 6*a + 2) / (a*(a+3)*(a+4)) - """ - cases = [(1.0, (0.5, 1./12, 0.0, -1.2)), - (2.0, (2./3, 2./36, -0.56568542494924734, -0.6))] - for a, exact_mvsk in cases: - mvsk = stats.powerlaw.stats(a, moments="mvsk") - assert_array_almost_equal(mvsk, exact_mvsk) - - -def test_ksone_fit_freeze(): - # Regression test for ticket #1638. - d = np.array( - [-0.18879233, 0.15734249, 0.18695107, 0.27908787, -0.248649, - -0.2171497, 0.12233512, 0.15126419, 0.03119282, 0.4365294, - 0.08930393, -0.23509903, 0.28231224, -0.09974875, -0.25196048, - 0.11102028, 0.1427649, 0.10176452, 0.18754054, 0.25826724, - 0.05988819, 0.0531668, 0.21906056, 0.32106729, 0.2117662, - 0.10886442, 0.09375789, 0.24583286, -0.22968366, -0.07842391, - -0.31195432, -0.21271196, 0.1114243, -0.13293002, 0.01331725, - -0.04330977, -0.09485776, -0.28434547, 0.22245721, -0.18518199, - -0.10943985, -0.35243174, 0.06897665, -0.03553363, -0.0701746, - -0.06037974, 0.37670779, -0.21684405]) - - try: - olderr = np.seterr(invalid='ignore') - with warnings.catch_warnings(): - warnings.simplefilter('ignore', UserWarning) - warnings.simplefilter('ignore', RuntimeWarning) - stats.ksone.fit(d) - finally: - np.seterr(**olderr) - - -def test_norm_logcdf(): - # Test precision of the logcdf of the normal distribution. - # This precision was enhanced in ticket 1614. 
- x = -np.asarray(list(range(0, 120, 4))) - # Values from R - expected = [-0.69314718, -10.36010149, -35.01343716, -75.41067300, - -131.69539607, -203.91715537, -292.09872100, -396.25241451, - -516.38564863, -652.50322759, -804.60844201, -972.70364403, - -1156.79057310, -1356.87055173, -1572.94460885, -1805.01356068, - -2053.07806561, -2317.13866238, -2597.19579746, -2893.24984493, - -3205.30112136, -3533.34989701, -3877.39640444, -4237.44084522, - -4613.48339520, -5005.52420869, -5413.56342187, -5837.60115548, - -6277.63751711, -6733.67260303] - - olderr = np.seterr(divide='ignore') - try: - assert_allclose(stats.norm().logcdf(x), expected, atol=1e-8) - finally: - np.seterr(**olderr) - - -def test_hypergeom_interval_1802(): - # these two had endless loops - assert_equal(stats.hypergeom.interval(.95, 187601, 43192, 757), - (152.0, 197.0)) - assert_equal(stats.hypergeom.interval(.945, 187601, 43192, 757), - (152.0, 197.0)) - # this was working also before - assert_equal(stats.hypergeom.interval(.94, 187601, 43192, 757), - (153.0, 196.0)) - - # degenerate case .a == .b - assert_equal(stats.hypergeom.ppf(0.02, 100, 100, 8), 8) - assert_equal(stats.hypergeom.ppf(1, 100, 100, 8), 8) - - -def test_distribution_too_many_args(): - # Check that a TypeError is raised when too many args are given to a method - # Regression test for ticket 1815. - x = np.linspace(0.1, 0.7, num=5) - assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, loc=1.0) - assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, 4, loc=1.0) - assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, 4, 5) - assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, loc=1.0, scale=0.5) - assert_raises(TypeError, stats.gamma.rvs, 2., 3, loc=1.0, scale=0.5) - assert_raises(TypeError, stats.gamma.cdf, x, 2., 3, loc=1.0, scale=0.5) - assert_raises(TypeError, stats.gamma.ppf, x, 2., 3, loc=1.0, scale=0.5) - assert_raises(TypeError, stats.gamma.stats, 2., 3, loc=1.0, scale=0.5) - assert_raises(TypeError, stats.gamma.entropy, 2., 3, loc=1.0, scale=0.5) - assert_raises(TypeError, stats.gamma.fit, x, 2., 3, loc=1.0, scale=0.5) - - # These should not give errors - stats.gamma.pdf(x, 2, 3) # loc=3 - stats.gamma.pdf(x, 2, 3, 4) # loc=3, scale=4 - stats.gamma.stats(2., 3) - stats.gamma.stats(2., 3, 4) - stats.gamma.stats(2., 3, 4, 'mv') - stats.gamma.rvs(2., 3, 4, 5) - stats.gamma.fit(stats.gamma.rvs(2., size=7), 2.) - - # Also for a discrete distribution - stats.geom.pmf(x, 2, loc=3) # no error, loc=3 - assert_raises(TypeError, stats.geom.pmf, x, 2, 3, 4) - assert_raises(TypeError, stats.geom.pmf, x, 2, 3, loc=4) - - # And for distributions with 0, 2 and 3 args respectively - assert_raises(TypeError, stats.expon.pdf, x, 3, loc=1.0) - assert_raises(TypeError, stats.exponweib.pdf, x, 3, 4, 5, loc=1.0) - assert_raises(TypeError, stats.exponweib.pdf, x, 3, 4, 5, 0.1, 0.1) - assert_raises(TypeError, stats.ncf.pdf, x, 3, 4, 5, 6, loc=1.0) - assert_raises(TypeError, stats.ncf.pdf, x, 3, 4, 5, 6, 1.0, scale=0.5) - stats.ncf.pdf(x, 3, 4, 5, 6, 1.0) # 3 args, plus loc/scale - - -def test_ncx2_tails_ticket_955(): - # Trac #955 -- check that the cdf computed by special functions - # matches the integrated pdf - a = stats.ncx2.cdf(np.arange(20, 25, 0.2), 2, 1.07458615e+02) - b = stats.ncx2.veccdf(np.arange(20, 25, 0.2), 2, 1.07458615e+02) - assert_allclose(a, b, rtol=1e-3, atol=0) - - -def test_foldnorm_zero(): - # Parameter value c=0 was not enabled, see gh-2399. 
- rv = stats.foldnorm(0, scale=1) - assert_equal(rv.cdf(0), 0) # rv.cdf(0) previously resulted in: nan - - -def test_stats_shapes_argcheck(): - # stats method was failing for vector shapes if some of the values - # were outside of the allowed range, see gh-2678 - mv3 = stats.invgamma.stats([0.0, 0.5, 1.0], 1, 0.5) # 0 is not a legal `a` - mv2 = stats.invgamma.stats([0.5, 1.0], 1, 0.5) - mv2_augmented = tuple(np.r_[np.nan, _] for _ in mv2) - assert_equal(mv2_augmented, mv3) - - mv3 = stats.lognorm.stats([2, 2.4, -1]) # -1 is not a legal shape parameter - mv2 = stats.lognorm.stats([2, 2.4]) - mv2_augmented = tuple(np.r_[_, np.nan] for _ in mv2) - assert_equal(mv2_augmented, mv3) - - # FIXME: this is only a quick-and-dirty test of a quick-and-dirty bugfix. - # stats method with multiple shape parameters is not properly vectorized - # anyway, so some distributions may or may not fail. - - -## Test subclassing distributions w/ explicit shapes - -class _distr_gen(stats.rv_continuous): - def _pdf(self, x, a): - return 42 - - -class _distr2_gen(stats.rv_continuous): - def _cdf(self, x, a): - return 42 * a + x - - -class _distr3_gen(stats.rv_continuous): - def _pdf(self, x, a, b): - return a + b - - def _cdf(self, x, a): - # Different # of shape params from _pdf, to be able to check that - # inspection catches the inconsistency.""" - return 42 * a + x - - -class _distr6_gen(stats.rv_continuous): - # Two shape parameters (both _pdf and _cdf defined, consistent shapes.) - def _pdf(self, x, a, b): - return a*x + b - - def _cdf(self, x, a, b): - return 42 * a + x - - -class TestSubclassingExplicitShapes(TestCase): - # Construct a distribution w/ explicit shapes parameter and test it. - - def test_correct_shapes(self): - dummy_distr = _distr_gen(name='dummy', shapes='a') - assert_equal(dummy_distr.pdf(1, a=1), 42) - - def test_wrong_shapes_1(self): - dummy_distr = _distr_gen(name='dummy', shapes='A') - assert_raises(TypeError, dummy_distr.pdf, 1, **dict(a=1)) - - def test_wrong_shapes_2(self): - dummy_distr = _distr_gen(name='dummy', shapes='a, b, c') - dct = dict(a=1, b=2, c=3) - assert_raises(TypeError, dummy_distr.pdf, 1, **dct) - - def test_shapes_string(self): - # shapes must be a string - dct = dict(name='dummy', shapes=42) - assert_raises(TypeError, _distr_gen, **dct) - - def test_shapes_identifiers_1(self): - # shapes must be a comma-separated list of valid python identifiers - dct = dict(name='dummy', shapes='(!)') - assert_raises(SyntaxError, _distr_gen, **dct) - - def test_shapes_identifiers_2(self): - dct = dict(name='dummy', shapes='4chan') - assert_raises(SyntaxError, _distr_gen, **dct) - - def test_shapes_identifiers_3(self): - dct = dict(name='dummy', shapes='m(fti)') - assert_raises(SyntaxError, _distr_gen, **dct) - - def test_shapes_identifiers_nodefaults(self): - dct = dict(name='dummy', shapes='a=2') - assert_raises(SyntaxError, _distr_gen, **dct) - - def test_shapes_args(self): - dct = dict(name='dummy', shapes='*args') - assert_raises(SyntaxError, _distr_gen, **dct) - - def test_shapes_kwargs(self): - dct = dict(name='dummy', shapes='**kwargs') - assert_raises(SyntaxError, _distr_gen, **dct) - - def test_shapes_keywords(self): - # python keywords cannot be used for shape parameters - dct = dict(name='dummy', shapes='a, b, c, lambda') - assert_raises(SyntaxError, _distr_gen, **dct) - - def test_shapes_signature(self): - # test explicit shapes which agree w/ the signature of _pdf - class _dist_gen(stats.rv_continuous): - def _pdf(self, x, a): - return stats.norm._pdf(x) * a - - dist 
= _dist_gen(shapes='a') - assert_equal(dist.pdf(0.5, a=2), stats.norm.pdf(0.5)*2) - - def test_shapes_signature_inconsistent(self): - # test explicit shapes which do not agree w/ the signature of _pdf - class _dist_gen(stats.rv_continuous): - def _pdf(self, x, a): - return stats.norm._pdf(x) * a - - dist = _dist_gen(shapes='a, b') - assert_raises(TypeError, dist.pdf, 0.5, **dict(a=1, b=2)) - - def test_star_args(self): - # test _pdf with only starargs - # NB: **kwargs of pdf will never reach _pdf - class _dist_gen(stats.rv_continuous): - def _pdf(self, x, *args): - extra_kwarg = args[0] - return stats.norm._pdf(x) * extra_kwarg - - dist = _dist_gen(shapes='extra_kwarg') - assert_equal(dist.pdf(0.5, extra_kwarg=33), stats.norm.pdf(0.5)*33) - assert_equal(dist.pdf(0.5, 33), stats.norm.pdf(0.5)*33) - assert_raises(TypeError, dist.pdf, 0.5, **dict(xxx=33)) - - def test_star_args_2(self): - # test _pdf with named & starargs - # NB: **kwargs of pdf will never reach _pdf - class _dist_gen(stats.rv_continuous): - def _pdf(self, x, offset, *args): - extra_kwarg = args[0] - return stats.norm._pdf(x) * extra_kwarg + offset - - dist = _dist_gen(shapes='offset, extra_kwarg') - assert_equal(dist.pdf(0.5, offset=111, extra_kwarg=33), - stats.norm.pdf(0.5)*33 + 111) - assert_equal(dist.pdf(0.5, 111, 33), - stats.norm.pdf(0.5)*33 + 111) - - def test_extra_kwarg(self): - # **kwargs to _pdf are ignored. - # this is a limitation of the framework (_pdf(x, *goodargs)) - class _distr_gen(stats.rv_continuous): - def _pdf(self, x, *args, **kwargs): - # _pdf should handle *args, **kwargs itself. Here "handling" is - # ignoring *args and looking for ``extra_kwarg`` and using that. - extra_kwarg = kwargs.pop('extra_kwarg', 1) - return stats.norm._pdf(x) * extra_kwarg - - dist = _distr_gen(shapes='extra_kwarg') - assert_equal(dist.pdf(1, extra_kwarg=3), stats.norm.pdf(1)) - - def shapes_empty_string(self): - # shapes='' is equivalent to shapes=None - class _dist_gen(stats.rv_continuous): - def _pdf(self, x): - return stats.norm.pdf(x) - - dist = _dist_gen(shapes='') - assert_equal(dist.pdf(0.5), stats.norm.pdf(0.5)) - - -class TestSubclassingNoShapes(TestCase): - # Construct a distribution w/o explicit shapes parameter and test it. 
- - def test_only__pdf(self): - dummy_distr = _distr_gen(name='dummy') - assert_equal(dummy_distr.pdf(1, a=1), 42) - - def test_only__cdf(self): - # _pdf is determined from _cdf by taking numerical derivative - dummy_distr = _distr2_gen(name='dummy') - assert_almost_equal(dummy_distr.pdf(1, a=1), 1) - - @dec.skipif(DOCSTRINGS_STRIPPED) - def test_signature_inspection(self): - # check that _pdf signature inspection works correctly, and is used in - # the class docstring - dummy_distr = _distr_gen(name='dummy') - assert_equal(dummy_distr.numargs, 1) - assert_equal(dummy_distr.shapes, 'a') - res = re.findall('logpdf\(x, a, loc=0, scale=1\)', - dummy_distr.__doc__) - assert_(len(res) == 1) - - @dec.skipif(DOCSTRINGS_STRIPPED) - def test_signature_inspection_2args(self): - # same for 2 shape params and both _pdf and _cdf defined - dummy_distr = _distr6_gen(name='dummy') - assert_equal(dummy_distr.numargs, 2) - assert_equal(dummy_distr.shapes, 'a, b') - res = re.findall('logpdf\(x, a, b, loc=0, scale=1\)', - dummy_distr.__doc__) - assert_(len(res) == 1) - - def test_signature_inspection_2args_incorrect_shapes(self): - # both _pdf and _cdf defined, but shapes are inconsistent: raises - try: - _distr3_gen(name='dummy') - except TypeError: - pass - else: - raise AssertionError('TypeError not raised.') - - def test_defaults_raise(self): - # default arguments should raise - class _dist_gen(stats.rv_continuous): - def _pdf(self, x, a=42): - return 42 - assert_raises(TypeError, _dist_gen, **dict(name='dummy')) - - def test_starargs_raise(self): - # without explicit shapes, *args are not allowed - class _dist_gen(stats.rv_continuous): - def _pdf(self, x, a, *args): - return 42 - assert_raises(TypeError, _dist_gen, **dict(name='dummy')) - - def test_kwargs_raise(self): - # without explicit shapes, **kwargs are not allowed - class _dist_gen(stats.rv_continuous): - def _pdf(self, x, a, **kwargs): - return 42 - assert_raises(TypeError, _dist_gen, **dict(name='dummy')) - - -@dec.skipif(DOCSTRINGS_STRIPPED) -def test_docstrings(): - badones = [',\s*,', '\(\s*,', '^\s*:'] - for distname in stats.__all__: - dist = getattr(stats, distname) - if isinstance(dist, (stats.rv_discrete, stats.rv_continuous)): - for regex in badones: - assert_(re.search(regex, dist.__doc__) is None) - - -def test_infinite_input(): - assert_almost_equal(stats.skellam.sf(np.inf, 10, 11), 0) - assert_almost_equal(stats.ncx2._cdf(np.inf, 8, 0.1), 1) - - -if __name__ == "__main__": - run_module_suite() +""" Test functions for stats module + +""" +from __future__ import division, print_function, absolute_import +#import unittest +import warnings +import re +import sys + +from numpy.testing import (TestCase, run_module_suite, assert_equal, + assert_array_equal, assert_almost_equal, + assert_array_almost_equal, + assert_allclose, assert_, assert_raises, rand, dec) +from nose import SkipTest + +import numpy +import numpy as np +from numpy import typecodes, array +#from scipy.lib._version import NumpyVersion +from scipy import special +import wafo.stats as stats +from wafo.stats._distn_infrastructure import argsreduce +from scipy.special import xlogy + + +# python -OO strips docstrings +DOCSTRINGS_STRIPPED = sys.flags.optimize > 1 + + +# generate test cases to test cdf and distribution consistency +dists = ['uniform', 'norm', 'lognorm', 'expon', 'beta', + 'powerlaw', 'bradford', 'burr', 'fisk', 'cauchy', 'halfcauchy', + 'foldcauchy', 'gamma', 'gengamma', 'loggamma', + 'alpha', 'anglit', 'arcsine', 'betaprime', + 'dgamma', 'exponweib', 
'exponpow', 'frechet_l', 'frechet_r', + 'gilbrat', 'f', 'ncf', 'chi2', 'chi', 'nakagami', 'genpareto', + 'genextreme', 'genhalflogistic', 'pareto', 'lomax', 'halfnorm', + 'halflogistic', 'fatiguelife', 'foldnorm', 'ncx2', 't', 'nct', + 'weibull_min', 'weibull_max', 'dweibull', 'maxwell', 'rayleigh', + 'genlogistic', 'logistic', 'gumbel_l', 'gumbel_r', 'gompertz', + 'hypsecant', 'laplace', 'reciprocal', 'triang', 'tukeylambda', + 'vonmises', 'vonmises_line', 'pearson3'] + +# check function for test generator + + +def check_distribution(dist, args, alpha): + D, pval = stats.kstest(dist, '', args=args, N=1000) + if (pval < alpha): + D, pval = stats.kstest(dist, '', args=args, N=1000) + # if (pval < alpha): + # D,pval = stats.kstest(dist,'',args=args, N=1000) + assert_(pval > alpha, msg="D = " + str(D) + "; pval = " + str(pval) + + "; alpha = " + str(alpha) + "\nargs = " + str(args)) + +# nose test generator + + +def test_all_distributions(): + for dist in dists: + distfunc = getattr(stats, dist) + nargs = distfunc.numargs + alpha = 0.01 + if dist == 'fatiguelife': + alpha = 0.001 + + if dist == 'frechet': + args = tuple(2 * rand(1)) + (0,) + tuple(2 * rand(2)) + elif dist == 'triang': + args = tuple(rand(nargs)) + elif dist == 'reciprocal': + vals = rand(nargs) + vals[1] = vals[0] + 1.0 + args = tuple(vals) + elif dist == 'vonmises': + yield check_distribution, dist, (10,), alpha + yield check_distribution, dist, (101,), alpha + args = tuple(1.0 + rand(nargs)) + else: + args = tuple(1.0 + rand(nargs)) + + yield check_distribution, dist, args, alpha + + +def check_vonmises_pdf_periodic(k, l, s, x): + vm = stats.vonmises(k, loc=l, scale=s) + assert_almost_equal(vm.pdf(x), vm.pdf(x % (2 * numpy.pi * s))) + + +def check_vonmises_cdf_periodic(k, l, s, x): + vm = stats.vonmises(k, loc=l, scale=s) + assert_almost_equal(vm.cdf(x) % 1, vm.cdf(x % (2 * numpy.pi * s)) % 1) + + +def test_vonmises_pdf_periodic(): + for k in [0.1, 1, 101]: + for x in [0, 1, numpy.pi, 10, 100]: + yield check_vonmises_pdf_periodic, k, 0, 1, x + yield check_vonmises_pdf_periodic, k, 1, 1, x + yield check_vonmises_pdf_periodic, k, 0, 10, x + + yield check_vonmises_cdf_periodic, k, 0, 1, x + yield check_vonmises_cdf_periodic, k, 1, 1, x + yield check_vonmises_cdf_periodic, k, 0, 10, x + + +def test_vonmises_line_support(): + assert_equal(stats.vonmises_line.a, -np.pi) + assert_equal(stats.vonmises_line.b, np.pi) + + +class TestRandInt(TestCase): + + def test_rvs(self): + vals = stats.randint.rvs(5, 30, size=100) + assert_(numpy.all(vals < 30) & numpy.all(vals >= 5)) + assert_(len(vals) == 100) + vals = stats.randint.rvs(5, 30, size=(2, 50)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.randint.rvs(15, 46) + assert_((val >= 15) & (val < 46)) + assert_(isinstance(val, numpy.ScalarType), msg=repr(type(val))) + val = stats.randint(15, 46).rvs(3) + assert_(val.dtype.char in typecodes['AllInteger']) + + def test_pdf(self): + k = numpy.r_[0:36] + out = numpy.where((k >= 5) & (k < 30), 1.0 / (30 - 5), 0) + vals = stats.randint.pmf(k, 5, 30) + assert_array_almost_equal(vals, out) + + def test_cdf(self): + x = numpy.r_[0:36:100j] + k = numpy.floor(x) + out = numpy.select( + [k >= 30, k >= 5], [1.0, (k - 5.0 + 1) / (30 - 5.0)], 0) + vals = stats.randint.cdf(x, 5, 30) + assert_array_almost_equal(vals, out, decimal=12) + + +class TestBinom(TestCase): + + def test_rvs(self): + vals = stats.binom.rvs(10, 0.75, size=(2, 50)) + assert_(numpy.all(vals >= 0) & numpy.all(vals <= 10)) + 
assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.binom.rvs(10, 0.75) + assert_(isinstance(val, int)) + val = stats.binom(10, 0.75).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllInteger']) + + def test_pmf(self): + # regression test for Ticket #1842 + vals1 = stats.binom.pmf(100, 100, 1) + vals2 = stats.binom.pmf(0, 100, 0) + assert_allclose(vals1, 1.0, rtol=1e-15, atol=0) + assert_allclose(vals2, 1.0, rtol=1e-15, atol=0) + + def test_entropy(self): + # Basic entropy tests. + b = stats.binom(2, 0.5) + expected_p = np.array([0.25, 0.5, 0.25]) + expected_h = -sum(xlogy(expected_p, expected_p)) + h = b.entropy() + assert_allclose(h, expected_h) + + b = stats.binom(2, 0.0) + h = b.entropy() + assert_equal(h, 0.0) + + b = stats.binom(2, 1.0) + h = b.entropy() + assert_equal(h, 0.0) + + +class TestBernoulli(TestCase): + + def test_rvs(self): + vals = stats.bernoulli.rvs(0.75, size=(2, 50)) + assert_(numpy.all(vals >= 0) & numpy.all(vals <= 1)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.bernoulli.rvs(0.75) + assert_(isinstance(val, int)) + val = stats.bernoulli(0.75).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllInteger']) + + def test_entropy(self): + # Simple tests of entropy. + b = stats.bernoulli(0.25) + expected_h = -0.25 * np.log(0.25) - 0.75 * np.log(0.75) + h = b.entropy() + assert_allclose(h, expected_h) + + b = stats.bernoulli(0.0) + h = b.entropy() + assert_equal(h, 0.0) + + b = stats.bernoulli(1.0) + h = b.entropy() + assert_equal(h, 0.0) + + +class TestNBinom(TestCase): + + def test_rvs(self): + vals = stats.nbinom.rvs(10, 0.75, size=(2, 50)) + assert_(numpy.all(vals >= 0)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.nbinom.rvs(10, 0.75) + assert_(isinstance(val, int)) + val = stats.nbinom(10, 0.75).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllInteger']) + + def test_pmf(self): + # regression test for ticket 1779 + assert_allclose(np.exp(stats.nbinom.logpmf(700, 721, 0.52)), + stats.nbinom.pmf(700, 721, 0.52)) + + +class TestGeom(TestCase): + + def test_rvs(self): + vals = stats.geom.rvs(0.75, size=(2, 50)) + assert_(numpy.all(vals >= 0)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.geom.rvs(0.75) + assert_(isinstance(val, int)) + val = stats.geom(0.75).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllInteger']) + + def test_pmf(self): + vals = stats.geom.pmf([1, 2, 3], 0.5) + assert_array_almost_equal(vals, [0.5, 0.25, 0.125]) + + def test_logpmf(self): + # regression test for ticket 1793 + vals1 = np.log(stats.geom.pmf([1, 2, 3], 0.5)) + vals2 = stats.geom.logpmf([1, 2, 3], 0.5) + assert_allclose(vals1, vals2, rtol=1e-15, atol=0) + + def test_cdf_sf(self): + vals = stats.geom.cdf([1, 2, 3], 0.5) + vals_sf = stats.geom.sf([1, 2, 3], 0.5) + expected = array([0.5, 0.75, 0.875]) + assert_array_almost_equal(vals, expected) + assert_array_almost_equal(vals_sf, 1 - expected) + + def test_logcdf_logsf(self): + vals = stats.geom.logcdf([1, 2, 3], 0.5) + vals_sf = stats.geom.logsf([1, 2, 3], 0.5) + expected = array([0.5, 0.75, 0.875]) + assert_array_almost_equal(vals, np.log(expected)) + assert_array_almost_equal(vals_sf, np.log1p(-expected)) + + def test_ppf(self): + vals 
= stats.geom.ppf([0.5, 0.75, 0.875], 0.5)
+        expected = array([1.0, 2.0, 3.0])
+        assert_array_almost_equal(vals, expected)
+
+
+class TestTruncnorm(TestCase):
+
+    def test_ppf_ticket1131(self):
+        vals = stats.truncnorm.ppf(
+            [-0.5, 0, 1e-4, 0.5, 1 - 1e-4, 1, 2], -1., 1.,
+            loc=[3] * 7, scale=2)
+        expected = np.array([np.nan, 1, 1.00056419, 3, 4.99943581, 5, np.nan])
+        assert_array_almost_equal(vals, expected)
+
+    def test_isf_ticket1131(self):
+        vals = stats.truncnorm.isf(
+            [-0.5, 0, 1e-4, 0.5, 1 - 1e-4, 1, 2], -1., 1.,
+            loc=[3] * 7, scale=2)
+        expected = np.array([np.nan, 5, 4.99943581, 3, 1.00056419, 1, np.nan])
+        assert_array_almost_equal(vals, expected)
+
+    def test_gh_2477_small_values(self):
+        # Check a case that worked in the original issue.
+        low, high = -11, -10
+        x = stats.truncnorm.rvs(low, high, 0, 1, size=10)
+        assert_(low < x.min() < x.max() < high)
+        # Check a case that failed in the original issue.
+        low, high = 10, 11
+        x = stats.truncnorm.rvs(low, high, 0, 1, size=10)
+        assert_(low < x.min() < x.max() < high)
+
+    def test_gh_2477_large_values(self):
+        # Check a case that fails because of the extremely small tail
+        # probabilities involved.
+        raise SkipTest('truncnorm rvs is known to fail at extreme tails')
+        low, high = 100, 101
+        x = stats.truncnorm.rvs(low, high, 0, 1, size=10)
+        assert_(low < x.min() < x.max() < high)
+
+    def test_gh_1489_trac_962_rvs(self):
+        # Check the original example.
+        low, high = 10, 15
+        x = stats.truncnorm.rvs(low, high, 0, 1, size=10)
+        assert_(low < x.min() < x.max() < high)
+
+
+class TestHypergeom(TestCase):
+
+    def test_rvs(self):
+        vals = stats.hypergeom.rvs(20, 10, 3, size=(2, 50))
+        assert_(numpy.all(vals >= 0) &
+                numpy.all(vals <= 3))
+        assert_(numpy.shape(vals) == (2, 50))
+        assert_(vals.dtype.char in typecodes['AllInteger'])
+        val = stats.hypergeom.rvs(20, 3, 10)
+        assert_(isinstance(val, int))
+        val = stats.hypergeom(20, 3, 10).rvs(3)
+        assert_(isinstance(val, numpy.ndarray))
+        assert_(val.dtype.char in typecodes['AllInteger'])
+
+    def test_precision(self):
+        # comparison number from mpmath
+        M = 2500
+        n = 50
+        N = 500
+        tot = M
+        good = n
+        hgpmf = stats.hypergeom.pmf(2, tot, good, N)
+        assert_almost_equal(hgpmf, 0.0010114963068932233, 11)
+
+    def test_precision2(self):
+        # Test hypergeom precision for large numbers. See #1218.
+        # Results compared with those from R.
+        oranges = 9.9e4
+        pears = 1.1e5
+        fruits_eaten = np.array([3, 3.8, 3.9, 4, 4.1, 4.2, 5]) * 1e4
+        quantile = 2e4
+        res = []
+        for eaten in fruits_eaten:
+            res.append(
+                stats.hypergeom.sf(quantile, oranges + pears, oranges, eaten))
+        expected = np.array([0, 1.904153e-114, 2.752693e-66, 4.931217e-32,
+                             8.265601e-11, 0.1237904, 1])
+        assert_allclose(res, expected, atol=0, rtol=5e-7)
+
+        # Test with array_like first argument
+        quantiles = [1.9e4, 2e4, 2.1e4, 2.15e4]
+        res2 = stats.hypergeom.sf(quantiles, oranges + pears, oranges, 4.2e4)
+        expected2 = [1, 0.1237904, 6.511452e-34, 3.277667e-69]
+        assert_allclose(res2, expected2, atol=0, rtol=5e-7)
+
+    def test_entropy(self):
+        # Simple tests of entropy.
+        hg = stats.hypergeom(4, 1, 1)
+        h = hg.entropy()
+        expected_p = np.array([0.75, 0.25])
+        expected_h = -np.sum(xlogy(expected_p, expected_p))
+        assert_allclose(h, expected_h)
+
+        hg = stats.hypergeom(1, 1, 1)
+        h = hg.entropy()
+        assert_equal(h, 0.0)
+
+
+class TestLoggamma(TestCase):
+
+    def test_stats(self):
+        # The following precomputed values are from the table in section 2.2
+        # of "A Statistical Study of Log-Gamma Distribution", by Ping Shing
+        # Chan (thesis, McMaster University, 1993).
+ table = np.array([ + # c, mean, var, skew, exc. kurt. + 0.5, -1.9635, 4.9348, -1.5351, 4.0000, + 1.0, -0.5772, 1.6449, -1.1395, 2.4000, + 12.0, 2.4427, 0.0869, -0.2946, 0.1735, + ]).reshape(-1, 5) + for c, mean, var, skew, kurt in table: + computed = stats.loggamma.stats(c, moments='msvk') + assert_array_almost_equal(computed, [mean, var, skew, kurt], + decimal=4) + + +class TestLogser(TestCase): + + def test_rvs(self): + vals = stats.logser.rvs(0.75, size=(2, 50)) + assert_(numpy.all(vals >= 1)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.logser.rvs(0.75) + assert_(isinstance(val, int)) + val = stats.logser(0.75).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllInteger']) + + +class TestPareto(TestCase): + + def test_stats(self): + # Check the stats() method with some simple values. Also check + # that the calculations do not trigger RuntimeWarnings. + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + + m, v, s, k = stats.pareto.stats(0.5, moments='mvsk') + assert_equal(m, np.inf) + assert_equal(v, np.inf) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(1.0, moments='mvsk') + assert_equal(m, np.inf) + assert_equal(v, np.inf) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(1.5, moments='mvsk') + assert_equal(m, 3.0) + assert_equal(v, np.inf) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(2.0, moments='mvsk') + assert_equal(m, 2.0) + assert_equal(v, np.inf) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(2.5, moments='mvsk') + assert_allclose(m, 2.5 / 1.5) + assert_allclose(v, 2.5 / (1.5 * 1.5 * 0.5)) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(3.0, moments='mvsk') + assert_allclose(m, 1.5) + assert_allclose(v, 0.75) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(3.5, moments='mvsk') + assert_allclose(m, 3.5 / 2.5) + assert_allclose(v, 3.5 / (2.5 * 2.5 * 1.5)) + assert_allclose(s, (2 * 4.5 / 0.5) * np.sqrt(1.5 / 3.5)) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(4.0, moments='mvsk') + assert_allclose(m, 4.0 / 3.0) + assert_allclose(v, 4.0 / 18.0) + assert_allclose( + s, 2 * (1 + 4.0) / (4.0 - 3) * np.sqrt((4.0 - 2) / 4.0)) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(4.5, moments='mvsk') + assert_allclose(m, 4.5 / 3.5) + assert_allclose(v, 4.5 / (3.5 * 3.5 * 2.5)) + assert_allclose(s, (2 * 5.5 / 1.5) * np.sqrt(2.5 / 4.5)) + assert_allclose( + k, 6 * (4.5 ** 3 + 4.5 ** 2 - 6 * 4.5 - 2) / (4.5 * 1.5 * 0.5)) + + +class TestPearson3(TestCase): + + def test_rvs(self): + vals = stats.pearson3.rvs(0.1, size=(2, 50)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllFloat']) + val = stats.pearson3.rvs(0.5) + assert_(isinstance(val, float)) + val = stats.pearson3(0.5).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllFloat']) + assert_(len(val) == 3) + + def test_pdf(self): + vals = stats.pearson3.pdf(2, [0.0, 0.1, 0.2]) + assert_allclose(vals, np.array([0.05399097, 0.05555481, 0.05670246]), + atol=1e-6) + vals = stats.pearson3.pdf(-3, 0.1) + assert_allclose(vals, np.array([0.00313791]), atol=1e-6) + vals = stats.pearson3.pdf([-3, -2, -1, 0, 1], 0.1) + assert_allclose(vals, np.array([0.00313791, 0.05192304, 
0.25028092, + 0.39885918, 0.23413173]), atol=1e-6) + + def test_cdf(self): + vals = stats.pearson3.cdf(2, [0.0, 0.1, 0.2]) + assert_allclose(vals, np.array([0.97724987, 0.97462004, 0.97213626]), + atol=1e-6) + vals = stats.pearson3.cdf(-3, 0.1) + assert_allclose(vals, [0.00082256], atol=1e-6) + vals = stats.pearson3.cdf([-3, -2, -1, 0, 1], 0.1) + assert_allclose(vals, [8.22563821e-04, 1.99860448e-02, 1.58550710e-01, + 5.06649130e-01, 8.41442111e-01], atol=1e-6) + + +class TestPoisson(TestCase): + + def test_rvs(self): + vals = stats.poisson.rvs(0.5, size=(2, 50)) + assert_(numpy.all(vals >= 0)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.poisson.rvs(0.5) + assert_(isinstance(val, int)) + val = stats.poisson(0.5).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllInteger']) + + def test_stats(self): + mu = 16.0 + result = stats.poisson.stats(mu, moments='mvsk') + assert_allclose(result, [mu, mu, np.sqrt(1.0 / mu), 1.0 / mu]) + + +class TestZipf(TestCase): + + def test_rvs(self): + vals = stats.zipf.rvs(1.5, size=(2, 50)) + assert_(numpy.all(vals >= 1)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.zipf.rvs(1.5) + assert_(isinstance(val, int)) + val = stats.zipf(1.5).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllInteger']) + + def test_moments(self): + # n-th moment is finite iff a > n + 1 + m, v = stats.zipf.stats(a=2.8) + assert_(np.isfinite(m)) + assert_equal(v, np.inf) + + s, k = stats.zipf.stats(a=4.8, moments='sk') + assert_(not np.isfinite([s, k]).all()) + + +class TestDLaplace(TestCase): + + def test_rvs(self): + vals = stats.dlaplace.rvs(1.5, size=(2, 50)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllInteger']) + val = stats.dlaplace.rvs(1.5) + assert_(isinstance(val, int)) + val = stats.dlaplace(1.5).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllInteger']) + assert_(stats.dlaplace.rvs(0.8) is not None) + + def test_stats(self): + # compare the explicit formulas w/ direct summation using pmf + a = 1. + dl = stats.dlaplace(a) + m, v, s, k = dl.stats('mvsk') + + N = 37 + xx = np.arange(-N, N + 1) + pp = dl.pmf(xx) + m2, m4 = np.sum(pp * xx ** 2), np.sum(pp * xx ** 4) + assert_equal((m, s), (0, 0)) + assert_allclose((v, k), (m2, m4 / m2 ** 2 - 3.), atol=1e-14, rtol=1e-8) + + def test_stats2(self): + a = np.log(2.) 
+ dl = stats.dlaplace(a) + m, v, s, k = dl.stats('mvsk') + assert_equal((m, s), (0., 0.)) + assert_allclose((v, k), (4., 3.25)) + + +class TestInvGamma(TestCase): + +# @dec.skipif(NumpyVersion(np.__version__) < '1.7.0', +# "assert_* funcs broken with inf/nan") + def test_invgamma_inf_gh_1866(self): + # invgamma's moments are only finite for a>n + # specific numbers checked w/ boost 1.54 + with warnings.catch_warnings(): + warnings.simplefilter('error', RuntimeWarning) + mvsk = stats.invgamma.stats(a=19.31, moments='mvsk') + assert_allclose(mvsk, + [0.05461496450, 0.0001723162534, + 1.020362676, 2.055616582]) + + a = [1.1, 3.1, 5.6] + mvsk = stats.invgamma.stats(a=a, moments='mvsk') + expected = ([10., 0.476190476, 0.2173913043], # mmm + [np.inf, 0.2061430632, 0.01312749422], # vvv + [np.nan, 41.95235392, 2.919025532], # sss + [np.nan, np.nan, 24.51923076]) # kkk + for x, y in zip(mvsk, expected): + assert_almost_equal(x, y) + + +class TestF(TestCase): + + def test_f_moments(self): + # n-th moment of F distributions is only finite for n < dfd / 2 + m, v, s, k = stats.f.stats(11, 6.5, moments='mvsk') + assert_(np.isfinite(m)) + assert_(np.isfinite(v)) + assert_(np.isfinite(s)) + assert_(not np.isfinite(k)) + + def test_moments_warnings(self): + # no warnings should be generated for dfd = 2, 4, 6, 8 (div by zero) + with warnings.catch_warnings(): + warnings.simplefilter('error', RuntimeWarning) + stats.f.stats(dfn=[11] * 4, dfd=[2, 4, 6, 8], moments='mvsk') + + #@dec.knownfailureif(True, 'f stats does not properly broadcast') + def test_stats_broadcast(self): + # stats do not fully broadcast just yet + _mv = stats.f.stats(dfn=11, dfd=[11, 12]) + + +def test_rvgeneric_std(): + # Regression test for #1191 + assert_array_almost_equal(stats.t.std([5, 6]), [1.29099445, 1.22474487]) + + +class TestRvDiscrete(TestCase): + + def test_rvs(self): + states = [-1, 0, 1, 2, 3, 4] + probability = [0.0, 0.3, 0.4, 0.0, 0.3, 0.0] + samples = 1000 + r = stats.rv_discrete(name='sample', values=(states, probability)) + x = r.rvs(size=samples) + assert_(isinstance(x, numpy.ndarray)) + + for s, p in zip(states, probability): + assert_(abs(sum(x == s) / float(samples) - p) < 0.05) + + x = r.rvs() + assert_(isinstance(x, int)) + + def test_entropy(self): + # Basic tests of entropy. 
+ pvals = np.array([0.25, 0.45, 0.3]) + p = stats.rv_discrete(values=([0, 1, 2], pvals)) + expected_h = -sum(xlogy(pvals, pvals)) + h = p.entropy() + assert_allclose(h, expected_h) + + p = stats.rv_discrete(values=([0, 1, 2], [1.0, 0, 0])) + h = p.entropy() + assert_equal(h, 0.0) + + +class TestExpon(TestCase): + + def test_zero(self): + assert_equal(stats.expon.pdf(0), 1) + + def test_tail(self): # Regression test for ticket 807 + assert_equal(stats.expon.cdf(1e-18), 1e-18) + assert_equal(stats.expon.isf(stats.expon.sf(40)), 40) + + +class TestGenExpon(TestCase): + + def test_pdf_unity_area(self): + from scipy.integrate import simps + # PDF should integrate to one + assert_almost_equal(simps(stats.genexpon.pdf(numpy.arange(0, 10, 0.01), + 0.5, 0.5, 2.0), + dx=0.01), 1, 1) + + def test_cdf_bounds(self): + # CDF should always be positive + cdf = stats.genexpon.cdf(numpy.arange(0, 10, 0.01), 0.5, 0.5, 2.0) + assert_(numpy.all((0 <= cdf) & (cdf <= 1))) + + +class TestExponpow(TestCase): + + def test_tail(self): + assert_almost_equal(stats.exponpow.cdf(1e-10, 2.), 1e-20) + assert_almost_equal( + stats.exponpow.isf(stats.exponpow.sf(5, .8), .8), 5) + + +class TestSkellam(TestCase): + + def test_pmf(self): + # comparison to R + k = numpy.arange(-10, 15) + mu1, mu2 = 10, 5 + skpmfR = numpy.array( + [4.2254582961926893e-005, 1.1404838449648488e-004, + 2.8979625801752660e-004, 6.9177078182101231e-004, + 1.5480716105844708e-003, 3.2412274963433889e-003, + 6.3373707175123292e-003, 1.1552351566696643e-002, + 1.9606152375042644e-002, 3.0947164083410337e-002, + 4.5401737566767360e-002, 6.1894328166820688e-002, + 7.8424609500170578e-002, 9.2418812533573133e-002, + 1.0139793148019728e-001, 1.0371927988298846e-001, + 9.9076583077406091e-002, 8.8546660073089561e-002, + 7.4187842052486810e-002, 5.8392772862200251e-002, + 4.3268692953013159e-002, 3.0248159818374226e-002, + 1.9991434305603021e-002, 1.2516877303301180e-002, + 7.4389876226229707e-003]) + + assert_almost_equal(stats.skellam.pmf(k, mu1, mu2), skpmfR, decimal=15) + + def test_cdf(self): + # comparison to R, only 5 decimals + k = numpy.arange(-10, 15) + mu1, mu2 = 10, 5 + skcdfR = numpy.array( + [6.4061475386192104e-005, 1.7810985988267694e-004, + 4.6790611790020336e-004, 1.1596768997212152e-003, + 2.7077485103056847e-003, 5.9489760066490718e-003, + 1.2286346724161398e-002, 2.3838698290858034e-002, + 4.3444850665900668e-002, 7.4392014749310995e-002, + 1.1979375231607835e-001, 1.8168808048289900e-001, + 2.6011268998306952e-001, 3.5253150251664261e-001, + 4.5392943399683988e-001, 5.5764871387982828e-001, + 6.5672529695723436e-001, 7.4527195703032389e-001, + 8.1945979908281064e-001, 8.7785257194501087e-001, + 9.2112126489802404e-001, 9.5136942471639818e-001, + 9.7136085902200120e-001, 9.8387773632530240e-001, + 9.9131672394792536e-001]) + + assert_almost_equal(stats.skellam.cdf(k, mu1, mu2), skcdfR, decimal=5) + + +class TestLognorm(TestCase): + + def test_pdf(self): + # Regression test for Ticket #1471: avoid nan with 0/0 situation + with np.errstate(divide='ignore'): + pdf = stats.lognorm.pdf([0, 0.5, 1], 1) + assert_array_almost_equal(pdf, [0.0, 0.62749608, 0.39894228]) + + +class TestBeta(TestCase): + + def test_logpdf(self): + # Regression test for Ticket #1326: avoid nan with 0*log(0) situation + logpdf = stats.beta.logpdf(0, 1, 0.5) + assert_almost_equal(logpdf, -0.69314718056) + logpdf = stats.beta.logpdf(0, 0.5, 1) + assert_almost_equal(logpdf, np.inf) + + def test_logpdf_ticket_1866(self): + alpha, beta = 267, 1472 + x = np.array([0.2, 0.5, 
0.6]) + b = stats.beta(alpha, beta) + assert_allclose(b.logpdf(x).sum(), -1201.699061824062) + assert_allclose(b.pdf(x), np.exp(b.logpdf(x))) + + +class TestBetaPrime(TestCase): + + def test_logpdf(self): + alpha, beta = 267, 1472 + x = np.array([0.2, 0.5, 0.6]) + b = stats.betaprime(alpha, beta) + assert_(np.isfinite(b.logpdf(x)).all()) + assert_allclose(b.pdf(x), np.exp(b.logpdf(x))) + + +class TestGamma(TestCase): + + def test_pdf(self): + # a few test cases to compare with R + pdf = stats.gamma.pdf(90, 394, scale=1. / 5) + assert_almost_equal(pdf, 0.002312341) + + pdf = stats.gamma.pdf(3, 10, scale=1. / 5) + assert_almost_equal(pdf, 0.1620358) + + def test_logpdf(self): + # Regression test for Ticket #1326: cornercase avoid nan with 0*log(0) + # situation + logpdf = stats.gamma.logpdf(0, 1) + assert_almost_equal(logpdf, 0) + + +class TestChi2(TestCase): + # regression tests after precision improvements, ticket:1041, not verified + + def test_precision(self): + assert_almost_equal( + stats.chi2.pdf(1000, 1000), 8.919133934753128e-003, 14) + assert_almost_equal(stats.chi2.pdf(100, 100), 0.028162503162596778, 14) + + +class TestArrayArgument(TestCase): # test for ticket:992 + + def test_noexception(self): + rvs = stats.norm.rvs( + loc=(np.arange(5)), scale=np.ones(5), size=(10, 5)) + assert_equal(rvs.shape, (10, 5)) + + +class TestDocstring(TestCase): + + def test_docstrings(self): + # See ticket #761 + if stats.rayleigh.__doc__ is not None: + self.assertTrue("rayleigh" in stats.rayleigh.__doc__.lower()) + if stats.bernoulli.__doc__ is not None: + self.assertTrue("bernoulli" in stats.bernoulli.__doc__.lower()) + + def test_no_name_arg(self): + # If name is not given, construction shouldn't fail. See #1508. + stats.rv_continuous() + stats.rv_discrete() + + +class TestEntropy(TestCase): + + def test_entropy_positive(self): + # See ticket #497 + pk = [0.5, 0.2, 0.3] + qk = [0.1, 0.25, 0.65] + eself = stats.entropy(pk, pk) + edouble = stats.entropy(pk, qk) + assert_(0.0 == eself) + assert_(edouble >= 0.0) + + def test_entropy_base(self): + pk = np.ones(16, float) + S = stats.entropy(pk, base=2.) + assert_(abs(S - 4.) < 1.e-5) + + qk = np.ones(16, float) + qk[:8] = 2. + S = stats.entropy(pk, qk) + S2 = stats.entropy(pk, qk, base=2.) 
+ assert_(abs(S / S2 - np.log(2.)) < 1.e-5) + + def test_entropy_zero(self): + # Test for PR-479 + assert_almost_equal(stats.entropy([0, 1, 2]), 0.63651416829481278, + decimal=12) + + def test_entropy_2d(self): + pk = [[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]] + qk = [[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]] + assert_array_almost_equal(stats.entropy(pk, qk), + [0.1933259, 0.18609809]) + +# @dec.skipif(NumpyVersion(np.__version__) < '1.7.0', +# "assert_* funcs broken with inf/nan") + def test_entropy_2d_zero(self): + pk = [[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]] + qk = [[0.0, 0.1], [0.3, 0.6], [0.5, 0.3]] + assert_array_almost_equal(stats.entropy(pk, qk), + [np.inf, 0.18609809]) + + pk[0][0] = 0.0 + assert_array_almost_equal(stats.entropy(pk, qk), + [0.17403988, 0.18609809]) + + +def TestArgsreduce(): + a = array([1, 3, 2, 1, 2, 3, 3]) + b, c = argsreduce(a > 1, a, 2) + + assert_array_equal(b, [3, 2, 2, 3, 3]) + assert_array_equal(c, [2, 2, 2, 2, 2]) + + b, c = argsreduce(2 > 1, a, 2) + assert_array_equal(b, a[0]) + assert_array_equal(c, [2]) + + b, c = argsreduce(a > 0, a, 2) + assert_array_equal(b, a) + assert_array_equal(c, [2] * numpy.size(a)) + + +class TestFitMethod(object): + skip = ['ncf'] + + @dec.slow + def test_fit(self): + def check(func, dist, args, alpha): + if dist in self.skip: + raise SkipTest("%s fit known to fail" % dist) + distfunc = getattr(stats, dist) + with np.errstate(all='ignore'): + res = distfunc.rvs(*args, **{'size': 200}) + vals = distfunc.fit(res) + vals2 = distfunc.fit(res, optimizer='powell') + # Only check the length of the return + # FIXME: should check the actual results to see if we are 'close' + # to what was created --- but what is 'close' enough + if dist == 'frechet': + assert_(len(vals) == len(args)) + assert_(len(vals2) == len(args)) + else: + assert_(len(vals) == 2 + len(args)) + assert_(len(vals2) == 2 + len(args)) + + for func, dist, args, alpha in test_all_distributions(): + yield check, func, dist, args, alpha + + @dec.slow + def test_fix_fit(self): + def check(func, dist, args, alpha): + # Not sure why 'ncf', and 'beta' are failing + # frechet has different len(args) than distfunc.numargs + if dist in self.skip + ['frechet']: + raise SkipTest("%s fit known to fail" % dist) + distfunc = getattr(stats, dist) + with np.errstate(all='ignore'): + res = distfunc.rvs(*args, **{'size': 200}) + vals = distfunc.fit(res, floc=0) + vals2 = distfunc.fit(res, fscale=1) + assert_(len(vals) == 2 + len(args)) + assert_(vals[-2] == 0) + assert_(vals2[-1] == 1) + assert_(len(vals2) == 2 + len(args)) + if len(args) > 0: + vals3 = distfunc.fit(res, f0=args[0]) + assert_(len(vals3) == 2 + len(args)) + assert_(vals3[0] == args[0]) + if len(args) > 1: + vals4 = distfunc.fit(res, f1=args[1]) + assert_(len(vals4) == 2 + len(args)) + assert_(vals4[1] == args[1]) + if len(args) > 2: + vals5 = distfunc.fit(res, f2=args[2]) + assert_(len(vals5) == 2 + len(args)) + assert_(vals5[2] == args[2]) + + for func, dist, args, alpha in test_all_distributions(): + yield check, func, dist, args, alpha + + def test_fix_fit_2args_lognorm(self): + # Regression test for #1551. 
+ np.random.seed(12345) + with np.errstate(all='ignore'): + x = stats.lognorm.rvs(0.25, 0., 20.0, size=20) + assert_allclose(np.array(stats.lognorm.fit(x, floc=0, fscale=20)), + [0.25888672, 0, 20], atol=1e-5) + + def test_fix_fit_norm(self): + x = np.arange(1, 6) + + loc, scale = stats.norm.fit(x) + assert_almost_equal(loc, 3) + assert_almost_equal(scale, np.sqrt(2)) + + loc, scale = stats.norm.fit(x, floc=2) + assert_equal(loc, 2) + assert_equal(scale, np.sqrt(3)) + + loc, scale = stats.norm.fit(x, fscale=2) + assert_almost_equal(loc, 3) + assert_equal(scale, 2) + + def test_fix_fit_gamma(self): + x = np.arange(1, 6) + meanlog = np.log(x).mean() + + # A basic test of gamma.fit with floc=0. + floc = 0 + a, loc, scale = stats.gamma.fit(x, floc=floc) + s = np.log(x.mean()) - meanlog + assert_almost_equal(np.log(a) - special.digamma(a), s, decimal=5) + assert_equal(loc, floc) + assert_almost_equal(scale, x.mean() / a, decimal=8) + + # Regression tests for gh-2514. + # The problem was that if `floc=0` was given, any other fixed + # parameters were ignored. + f0 = 1 + floc = 0 + a, loc, scale = stats.gamma.fit(x, f0=f0, floc=floc) + assert_equal(a, f0) + assert_equal(loc, floc) + assert_almost_equal(scale, x.mean() / a, decimal=8) + + f0 = 2 + floc = 0 + a, loc, scale = stats.gamma.fit(x, f0=f0, floc=floc) + assert_equal(a, f0) + assert_equal(loc, floc) + assert_almost_equal(scale, x.mean() / a, decimal=8) + + # loc and scale fixed. + floc = 0 + fscale = 2 + a, loc, scale = stats.gamma.fit(x, floc=floc, fscale=fscale) + assert_equal(loc, floc) + assert_equal(scale, fscale) + c = meanlog - np.log(fscale) + assert_almost_equal(special.digamma(a), c) + + def test_fix_fit_beta(self): + # Test beta.fit when both floc and fscale are given. + + def mlefunc(a, b, x): + # Zeros of this function are critical points of + # the maximum likelihood function. + n = len(x) + s1 = np.log(x).sum() + s2 = np.log(1 - x).sum() + psiab = special.psi(a + b) + func = [s1 - n * (-psiab + special.psi(a)), + s2 - n * (-psiab + special.psi(b))] + return func + + # Basic test with floc and fscale given. + x = np.array([0.125, 0.25, 0.5]) + a, b, loc, scale = stats.beta.fit(x, floc=0, fscale=1) + assert_equal(loc, 0) + assert_equal(scale, 1) + assert_allclose(mlefunc(a, b, x), [0, 0], atol=1e-6) + + # Basic test with f0, floc and fscale given. + # This is also a regression test for gh-2514. + x = np.array([0.125, 0.25, 0.5]) + a, b, loc, scale = stats.beta.fit(x, f0=2, floc=0, fscale=1) + assert_equal(a, 2) + assert_equal(loc, 0) + assert_equal(scale, 1) + _da, db = mlefunc(a, b, x) + assert_allclose(db, 0, atol=1e-5) + + # Same floc and fscale values as above, but reverse the data + # and fix b (f1). + x2 = 1 - x + a2, b2, loc2, scale2 = stats.beta.fit(x2, f1=2, floc=0, fscale=1) + assert_equal(b2, 2) + assert_equal(loc2, 0) + assert_equal(scale2, 1) + da, db = mlefunc(a2, b2, x2) + assert_allclose(da, 0, atol=1e-5) + # a2 of this test should equal b from above. + assert_almost_equal(a2, b) + + # Check for detection of data out of bounds when floc and fscale + # are given. + assert_raises(ValueError, stats.beta.fit, x, floc=0.5, fscale=1) + y = np.array([0, .5, 1]) + assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1) + assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1, f0=2) + assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1, f1=2) + + # Check that attempting to fix all the parameters raises a ValueError. 
+ assert_raises(ValueError, stats.beta.fit, y, f0=0, f1=1, + floc=2, fscale=3) + + +class TestFrozen(TestCase): + # Test that a frozen distribution gives the same results as the original + # object. + # Only tested for the normal distribution (with loc and scale specified) + # and for the gamma distribution (with a shape parameter specified). + + def test_norm(self): + dist = stats.norm + frozen = stats.norm(loc=10.0, scale=3.0) + + result_f = frozen.pdf(20.0) + result = dist.pdf(20.0, loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.cdf(20.0) + result = dist.cdf(20.0, loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.ppf(0.25) + result = dist.ppf(0.25, loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.isf(0.25) + result = dist.isf(0.25, loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.sf(10.0) + result = dist.sf(10.0, loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.median() + result = dist.median(loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.mean() + result = dist.mean(loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.var() + result = dist.var(loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.std() + result = dist.std(loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.entropy() + result = dist.entropy(loc=10.0, scale=3.0) + assert_equal(result_f, result) + + result_f = frozen.moment(2) + result = dist.moment(2, loc=10.0, scale=3.0) + assert_equal(result_f, result) + + def test_gamma(self): + a = 2.0 + dist = stats.gamma + frozen = stats.gamma(a) + + result_f = frozen.pdf(20.0) + result = dist.pdf(20.0, a) + assert_equal(result_f, result) + + result_f = frozen.cdf(20.0) + result = dist.cdf(20.0, a) + assert_equal(result_f, result) + + result_f = frozen.ppf(0.25) + result = dist.ppf(0.25, a) + assert_equal(result_f, result) + + result_f = frozen.isf(0.25) + result = dist.isf(0.25, a) + assert_equal(result_f, result) + + result_f = frozen.sf(10.0) + result = dist.sf(10.0, a) + assert_equal(result_f, result) + + result_f = frozen.median() + result = dist.median(a) + assert_equal(result_f, result) + + result_f = frozen.mean() + result = dist.mean(a) + assert_equal(result_f, result) + + result_f = frozen.var() + result = dist.var(a) + assert_equal(result_f, result) + + result_f = frozen.std() + result = dist.std(a) + assert_equal(result_f, result) + + result_f = frozen.entropy() + result = dist.entropy(a) + assert_equal(result_f, result) + + result_f = frozen.moment(2) + result = dist.moment(2, a) + assert_equal(result_f, result) + + def test_regression_ticket_1293(self): + # Create a frozen distribution. + frozen = stats.lognorm(1) + # Call one of its methods that does not take any keyword arguments. + m1 = frozen.moment(2) + # Now call a method that takes a keyword argument. + frozen.stats(moments='mvsk') + # Call moment(2) again. + # After calling stats(), the following was raising an exception. + # So this test passes if the following does not raise an exception. + m2 = frozen.moment(2) + # The following should also be true, of course. But it is not + # the focus of this test. + assert_equal(m1, m2) + + +class TestExpect(TestCase): + # Test for expect method. 
+ # + # Uses normal distribution and beta distribution for finite bounds, and + # hypergeom for discrete distribution with finite support + + def test_norm(self): + v = stats.norm.expect(lambda x: (x - 5) * (x - 5), loc=5, scale=2) + assert_almost_equal(v, 4, decimal=14) + + m = stats.norm.expect(lambda x: (x), loc=5, scale=2) + assert_almost_equal(m, 5, decimal=14) + + lb = stats.norm.ppf(0.05, loc=5, scale=2) + ub = stats.norm.ppf(0.95, loc=5, scale=2) + prob90 = stats.norm.expect(lambda x: 1, loc=5, scale=2, lb=lb, ub=ub) + assert_almost_equal(prob90, 0.9, decimal=14) + + prob90c = stats.norm.expect(lambda x: 1, loc=5, scale=2, lb=lb, ub=ub, + conditional=True) + assert_almost_equal(prob90c, 1., decimal=14) + + def test_beta(self): + # case with finite support interval + v = stats.beta.expect( + lambda x: (x - 19 / 3.) * (x - 19 / 3.), args=(10, 5), + loc=5, scale=2) + assert_almost_equal(v, 1. / 18., decimal=13) + + m = stats.beta.expect(lambda x: x, args=(10, 5), loc=5., scale=2.) + assert_almost_equal(m, 19 / 3., decimal=13) + + ub = stats.beta.ppf(0.95, 10, 10, loc=5, scale=2) + lb = stats.beta.ppf(0.05, 10, 10, loc=5, scale=2) + prob90 = stats.beta.expect(lambda x: 1., args=(10, 10), loc=5., + scale=2., lb=lb, ub=ub, conditional=False) + assert_almost_equal(prob90, 0.9, decimal=13) + + prob90c = stats.beta.expect(lambda x: 1, args=(10, 10), loc=5, + scale=2, lb=lb, ub=ub, conditional=True) + assert_almost_equal(prob90c, 1., decimal=13) + + def test_hypergeom(self): + # test case with finite bounds + + # without specifying bounds + m_true, v_true = stats.hypergeom.stats(20, 10, 8, loc=5.) + m = stats.hypergeom.expect(lambda x: x, args=(20, 10, 8), loc=5.) + assert_almost_equal(m, m_true, decimal=13) + + v = stats.hypergeom.expect(lambda x: (x - 9.) ** 2, args=(20, 10, 8), + loc=5.) + assert_almost_equal(v, v_true, decimal=14) + + # with bounds, bounds equal to shifted support + v_bounds = stats.hypergeom.expect( + lambda x: (x - 9.) 
** 2, args=(20, 10, 8),
+            loc=5., lb=5, ub=13)
+        assert_almost_equal(v_bounds, v_true, decimal=14)
+
+        # drop boundary points
+        prob_true = 1 - stats.hypergeom.pmf([5, 13], 20, 10, 8, loc=5).sum()
+        prob_bounds = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8),
+                                             loc=5., lb=6, ub=12)
+        assert_almost_equal(prob_bounds, prob_true, decimal=13)
+
+        # conditional
+        prob_bc = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), loc=5.,
+                                         lb=6, ub=12, conditional=True)
+        assert_almost_equal(prob_bc, 1, decimal=14)
+
+        # check simple integral
+        prob_b = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8),
+                                        lb=0, ub=8)
+        assert_almost_equal(prob_b, 1, decimal=13)
+
+    def test_poisson(self):
+        # poisson, use lower bound only
+        prob_bounds = stats.poisson.expect(lambda x: 1, args=(2,), lb=3,
+                                           conditional=False)
+        prob_b_true = 1 - stats.poisson.cdf(2, 2)
+        assert_almost_equal(prob_bounds, prob_b_true, decimal=14)
+
+        prob_lb = stats.poisson.expect(lambda x: 1, args=(2,), lb=2,
+                                       conditional=True)
+        assert_almost_equal(prob_lb, 1, decimal=14)
+
+    def test_genhalflogistic(self):
+        # genhalflogistic, changes upper bound of support in _argcheck
+        # regression test for gh-2622
+        halflog = stats.genhalflogistic
+        # check consistency when calling expect twice with the same input
+        res1 = halflog.expect(args=(1.5,))
+        halflog.expect(args=(0.5,))
+        res2 = halflog.expect(args=(1.5,))
+        assert_almost_equal(res1, res2, decimal=14)
+
+    def test_rice_overflow(self):
+        # rice.pdf(999, 0.74) was inf since special.i0 silently overflows
+        # check that using i0e fixes it
+        assert_(np.isfinite(stats.rice.pdf(999, 0.74)))
+
+        assert_(np.isfinite(stats.rice.expect(lambda x: 1, args=(0.74,))))
+        assert_(np.isfinite(stats.rice.expect(lambda x: 2, args=(0.74,))))
+        assert_(np.isfinite(stats.rice.expect(lambda x: 3, args=(0.74,))))
+
+
+class TestNct(TestCase):
+
+    def test_nc_parameter(self):
+        # Parameter values c <= 0 were not enabled (gh-2402).
+        # For negative c and for c = 0, rv.cdf(0) below returned nan.
+        rv = stats.nct(5, 0)
+        assert_equal(rv.cdf(0), 0.5)
+        rv = stats.nct(5, -1)
+        assert_almost_equal(rv.cdf(0), 0.841344746069, decimal=10)
+
+    def test_broadcasting(self):
+        res = stats.nct.pdf(
+            5, np.arange(4, 7)[:, None], np.linspace(0.1, 1, 4))
+        expected = array([[0.00321886, 0.00557466, 0.00918418, 0.01442997],
+                          [0.00217142, 0.00395366, 0.00683888, 0.01126276],
+                          [0.00153078, 0.00291093, 0.00525206, 0.00900815]])
+        assert_allclose(res, expected, rtol=1e-5)
+
+    def test_variance_gh_issue_2401(self):
+        # Computation of the variance of a non-central t-distribution resulted
+        # in a TypeError: ufunc 'isinf' not supported for the input types,
+        # and the inputs could not be safely coerced to any supported types
+        # according to the casting rule 'safe'
+        rv = stats.nct(4, 0)
+        assert_equal(rv.var(), 2.0)
+
+    def test_nct_inf_moments(self):
+        # n-th moment of nct only exists for df > n
+        m, v, s, k = stats.nct.stats(df=1.9, nc=0.3, moments='mvsk')
+        assert_(np.isfinite(m))
+        assert_equal([v, s, k], [np.inf, np.nan, np.nan])
+
+        m, v, s, k = stats.nct.stats(df=3.1, nc=0.3, moments='mvsk')
+        assert_(np.isfinite([m, v, s]).all())
+        assert_equal(k, np.nan)
+
+
+class TestRice(TestCase):
+
+    def test_rice_zero_b(self):
+        # rice distribution should work with b=0, cf gh-2164
+        x = [0.2, 1., 5.]
+        assert_(np.isfinite(stats.rice.pdf(x, b=0.)).all())
+        assert_(np.isfinite(stats.rice.logpdf(x, b=0.)).all())
+        assert_(np.isfinite(stats.rice.cdf(x, b=0.)).all())
+        assert_(np.isfinite(stats.rice.logcdf(x, b=0.)).all())
+
+        q = [0.1, 0.1, 0.5, 0.9]
+        assert_(np.isfinite(stats.rice.ppf(q, b=0.)).all())
+
+        mvsk = stats.rice.stats(0, moments='mvsk')
+        assert_(np.isfinite(mvsk).all())
+
+        # furthermore, pdf is continuous as b -> 0
+        # rice.pdf(x, b -> 0) = x exp(-x^2/2) + O(b^2)
+        # see e.g. Abramowitz & Stegun 9.6.7 & 9.6.10
+        b = 1e-8
+        assert_allclose(stats.rice.pdf(x, 0), stats.rice.pdf(x, b),
+                        atol=b, rtol=0)
+
+    def test_rice_rvs(self):
+        rvs = stats.rice.rvs
+        assert_equal(rvs(b=3.).size, 1)
+        assert_equal(rvs(b=3., size=(3, 5)).shape, (3, 5))
+
+
+class TestErlang(TestCase):
+
+    def test_erlang_runtimewarning(self):
+        # erlang should generate a RuntimeWarning if a non-integer
+        # shape parameter is used.
+        with warnings.catch_warnings():
+            warnings.simplefilter("error", RuntimeWarning)
+
+            # The non-integer shape parameter 1.3 should trigger a
+            # RuntimeWarning
+            assert_raises(RuntimeWarning,
+                          stats.erlang.rvs, 1.3, loc=0, scale=1, size=4)
+
+            # Calling the fit method with `f0` set to an integer should
+            # *not* trigger a RuntimeWarning. It should return the same
+            # values as gamma.fit(...).
+            data = [0.5, 1.0, 2.0, 4.0]
+            result_erlang = stats.erlang.fit(data, f0=1)
+            result_gamma = stats.gamma.fit(data, f0=1)
+            assert_allclose(result_erlang, result_gamma, rtol=1e-3)
+
+
+class TestRdist(TestCase):
+
+    @dec.slow
+    def test_rdist_cdf_gh1285(self):
+        # check workaround in rdist._cdf for issue gh-1285.
+        distfn = stats.rdist
+        values = [0.001, 0.5, 0.999]
+        assert_almost_equal(distfn.cdf(distfn.ppf(values, 541.0), 541.0),
+                            values, decimal=5)
+
+
+def test_540_567():
+    # test for nan returned in tickets 540, 567
+    assert_almost_equal(stats.norm.cdf(-1.7624320982), 0.03899815971089126,
+                        decimal=10, err_msg='test_540_567')
+    assert_almost_equal(stats.norm.cdf(-1.7624320983), 0.038998159702449846,
+                        decimal=10, err_msg='test_540_567')
+    assert_almost_equal(stats.norm.cdf(1.38629436112, loc=0.950273420309,
+                                       scale=0.204423758009),
+                        0.98353464004309321, decimal=10,
+                        err_msg='test_540_567')
+
+
+def test_regression_ticket_1316():
+    # The following was raising an exception because _construct_default_doc()
+    # did not handle the default keyword extradoc=None. See ticket #1316.
+    _g = stats._continuous_distns.gamma_gen(name='gamma')
+
+
+def test_regression_ticket_1326():
+    # adjust to avoid nan with 0*log(0)
+    assert_almost_equal(stats.chi2.pdf(0.0, 2), 0.5, 14)
+
+
+def test_regression_tukey_lambda():
+    # Make sure that the Tukey-Lambda distribution correctly handles
+    # non-positive lambdas.
+ x = np.linspace(-5.0, 5.0, 101) + + olderr = np.seterr(divide='ignore') + try: + for lam in [0.0, -1.0, -2.0, np.array([[-1.0], [0.0], [-2.0]])]: + p = stats.tukeylambda.pdf(x, lam) + assert_((p != 0.0).all()) + assert_(~np.isnan(p).all()) + + lam = np.array([[-1.0], [0.0], [2.0]]) + p = stats.tukeylambda.pdf(x, lam) + finally: + np.seterr(**olderr) + + assert_(~np.isnan(p).all()) + assert_((p[0] != 0.0).all()) + assert_((p[1] != 0.0).all()) + assert_((p[2] != 0.0).any()) + assert_((p[2] == 0.0).any()) + + +@dec.skipif(DOCSTRINGS_STRIPPED) +def test_regression_ticket_1421(): + assert_('pdf(x, mu, loc=0, scale=1)' not in stats.poisson.__doc__) + assert_('pmf(x,' in stats.poisson.__doc__) + + +def test_nan_arguments_gh_issue_1362(): + assert_(np.isnan(stats.t.logcdf(1, np.nan))) + assert_(np.isnan(stats.t.cdf(1, np.nan))) + assert_(np.isnan(stats.t.logsf(1, np.nan))) + assert_(np.isnan(stats.t.sf(1, np.nan))) + assert_(np.isnan(stats.t.pdf(1, np.nan))) + assert_(np.isnan(stats.t.logpdf(1, np.nan))) + assert_(np.isnan(stats.t.ppf(1, np.nan))) + assert_(np.isnan(stats.t.isf(1, np.nan))) + + assert_(np.isnan(stats.bernoulli.logcdf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.cdf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.logsf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.sf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.pmf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.logpmf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.ppf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.isf(np.nan, 0.5))) + + +def test_frozen_fit_ticket_1536(): + np.random.seed(5678) + true = np.array([0.25, 0., 0.5]) + x = stats.lognorm.rvs(true[0], true[1], true[2], size=100) + + olderr = np.seterr(divide='ignore') + try: + params = np.array(stats.lognorm.fit(x, floc=0.)) + finally: + np.seterr(**olderr) + + assert_almost_equal(params, true, decimal=2) + + params = np.array(stats.lognorm.fit(x, fscale=0.5, loc=0)) + assert_almost_equal(params, true, decimal=2) + + params = np.array(stats.lognorm.fit(x, f0=0.25, loc=0)) + assert_almost_equal(params, true, decimal=2) + + params = np.array(stats.lognorm.fit(x, f0=0.25, floc=0)) + assert_almost_equal(params, true, decimal=2) + + np.random.seed(5678) + loc = 1 + floc = 0.9 + x = stats.norm.rvs(loc, 2., size=100) + params = np.array(stats.norm.fit(x, floc=floc)) + expected = np.array([floc, np.sqrt(((x - floc) ** 2).mean())]) + assert_almost_equal(params, expected, decimal=4) + + +def test_regression_ticket_1530(): + # Check the starting value works for Cauchy distribution fit. + np.random.seed(654321) + rvs = stats.cauchy.rvs(size=100) + params = stats.cauchy.fit(rvs) + expected = (0.045, 1.142) + assert_almost_equal(params, expected, decimal=1) + + +def test_tukeylambda_stats_ticket_1545(): + # Some test for the variance and kurtosis of the Tukey Lambda distr. + # See test_tukeylamdba_stats.py for more tests. + + mv = stats.tukeylambda.stats(0, moments='mvsk') + # Known exact values: + expected = [0, np.pi ** 2 / 3, 0, 1.2] + assert_almost_equal(mv, expected, decimal=10) + + mv = stats.tukeylambda.stats(3.13, moments='mvsk') + # 'expected' computed with mpmath. + expected = [0, 0.0269220858861465102, 0, -0.898062386219224104] + assert_almost_equal(mv, expected, decimal=10) + + mv = stats.tukeylambda.stats(0.14, moments='mvsk') + # 'expected' computed with mpmath. 
+ expected = [0, 2.11029702221450250, 0, -0.02708377353223019456] + assert_almost_equal(mv, expected, decimal=10) + + +def test_poisson_logpmf_ticket_1436(): + assert_(np.isfinite(stats.poisson.logpmf(1500, 200))) + + +def test_powerlaw_stats(): + """Test the powerlaw stats function. + + This unit test is also a regression test for ticket 1548. + + The exact values are: + mean: + mu = a / (a + 1) + variance: + sigma**2 = a / ((a + 2) * (a + 1) ** 2) + skewness: + One formula (see http://en.wikipedia.org/wiki/Skewness) is + gamma_1 = (E[X**3] - 3*mu*E[X**2] + 2*mu**3) / sigma**3 + A short calculation shows that E[X**k] is a / (a + k), so gamma_1 + can be implemented as + n = a/(a+3) - 3*(a/(a+1))*a/(a+2) + 2*(a/(a+1))**3 + d = sqrt(a/((a+2)*(a+1)**2)) ** 3 + gamma_1 = n/d + Either by simplifying, or by a direct calculation of mu_3 / sigma**3, + one gets the more concise formula: + gamma_1 = -2.0 * ((a - 1) / (a + 3)) * sqrt((a + 2) / a) + kurtosis: (See http://en.wikipedia.org/wiki/Kurtosis) + The excess kurtosis is + gamma_2 = mu_4 / sigma**4 - 3 + A bit of calculus and algebra (sympy helps) shows that + mu_4 = 3*a*(3*a**2 - a + 2) / ((a+1)**4 * (a+2) * (a+3) * (a+4)) + so + gamma_2 = 3*(3*a**2 - a + 2) * (a+2) / (a*(a+3)*(a+4)) - 3 + which can be rearranged to + gamma_2 = 6 * (a**3 - a**2 - 6*a + 2) / (a*(a+3)*(a+4)) + """ + cases = [(1.0, (0.5, 1. / 12, 0.0, -1.2)), + (2.0, (2. / 3, 2. / 36, -0.56568542494924734, -0.6))] + for a, exact_mvsk in cases: + mvsk = stats.powerlaw.stats(a, moments="mvsk") + assert_array_almost_equal(mvsk, exact_mvsk) + + +def test_ksone_fit_freeze(): + # Regression test for ticket #1638. + d = np.array( + [-0.18879233, 0.15734249, 0.18695107, 0.27908787, -0.248649, + -0.2171497, 0.12233512, 0.15126419, 0.03119282, 0.4365294, + 0.08930393, -0.23509903, 0.28231224, -0.09974875, -0.25196048, + 0.11102028, 0.1427649, 0.10176452, 0.18754054, 0.25826724, + 0.05988819, 0.0531668, 0.21906056, 0.32106729, 0.2117662, + 0.10886442, 0.09375789, 0.24583286, -0.22968366, -0.07842391, + -0.31195432, -0.21271196, 0.1114243, -0.13293002, 0.01331725, + -0.04330977, -0.09485776, -0.28434547, 0.22245721, -0.18518199, + -0.10943985, -0.35243174, 0.06897665, -0.03553363, -0.0701746, + -0.06037974, 0.37670779, -0.21684405]) + + try: + olderr = np.seterr(invalid='ignore') + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UserWarning) + warnings.simplefilter('ignore', RuntimeWarning) + stats.ksone.fit(d) + finally: + np.seterr(**olderr) + + +def test_norm_logcdf(): + # Test precision of the logcdf of the normal distribution. + # This precision was enhanced in ticket 1614. 
+ x = -np.asarray(list(range(0, 120, 4))) + # Values from R + expected = [-0.69314718, -10.36010149, -35.01343716, -75.41067300, + -131.69539607, -203.91715537, -292.09872100, -396.25241451, + -516.38564863, -652.50322759, -804.60844201, -972.70364403, + -1156.79057310, -1356.87055173, -1572.94460885, -1805.01356068, + -2053.07806561, -2317.13866238, -2597.19579746, -2893.24984493, + -3205.30112136, -3533.34989701, -3877.39640444, -4237.44084522, + -4613.48339520, -5005.52420869, -5413.56342187, -5837.60115548, + -6277.63751711, -6733.67260303] + + olderr = np.seterr(divide='ignore') + try: + assert_allclose(stats.norm().logcdf(x), expected, atol=1e-8) + finally: + np.seterr(**olderr) + + +def test_hypergeom_interval_1802(): + # these two had endless loops + assert_equal(stats.hypergeom.interval(.95, 187601, 43192, 757), + (152.0, 197.0)) + assert_equal(stats.hypergeom.interval(.945, 187601, 43192, 757), + (152.0, 197.0)) + # this was working also before + assert_equal(stats.hypergeom.interval(.94, 187601, 43192, 757), + (153.0, 196.0)) + + # degenerate case .a == .b + assert_equal(stats.hypergeom.ppf(0.02, 100, 100, 8), 8) + assert_equal(stats.hypergeom.ppf(1, 100, 100, 8), 8) + + +def test_distribution_too_many_args(): + # Check that a TypeError is raised when too many args are given to a method + # Regression test for ticket 1815. + x = np.linspace(0.1, 0.7, num=5) + assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, loc=1.0) + assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, 4, loc=1.0) + assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, 4, 5) + assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.rvs, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.cdf, x, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.ppf, x, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.stats, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.entropy, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.fit, x, 2., 3, loc=1.0, scale=0.5) + + # These should not give errors + stats.gamma.pdf(x, 2, 3) # loc=3 + stats.gamma.pdf(x, 2, 3, 4) # loc=3, scale=4 + stats.gamma.stats(2., 3) + stats.gamma.stats(2., 3, 4) + stats.gamma.stats(2., 3, 4, 'mv') + stats.gamma.rvs(2., 3, 4, 5) + stats.gamma.fit(stats.gamma.rvs(2., size=7), 2.) + + # Also for a discrete distribution + stats.geom.pmf(x, 2, loc=3) # no error, loc=3 + assert_raises(TypeError, stats.geom.pmf, x, 2, 3, 4) + assert_raises(TypeError, stats.geom.pmf, x, 2, 3, loc=4) + + # And for distributions with 0, 2 and 3 args respectively + assert_raises(TypeError, stats.expon.pdf, x, 3, loc=1.0) + assert_raises(TypeError, stats.exponweib.pdf, x, 3, 4, 5, loc=1.0) + assert_raises(TypeError, stats.exponweib.pdf, x, 3, 4, 5, 0.1, 0.1) + assert_raises(TypeError, stats.ncf.pdf, x, 3, 4, 5, 6, loc=1.0) + assert_raises(TypeError, stats.ncf.pdf, x, 3, 4, 5, 6, 1.0, scale=0.5) + stats.ncf.pdf(x, 3, 4, 5, 6, 1.0) # 3 args, plus loc/scale + + +def test_ncx2_tails_ticket_955(): + # Trac #955 -- check that the cdf computed by special functions + # matches the integrated pdf + a = stats.ncx2.cdf(np.arange(20, 25, 0.2), 2, 1.07458615e+02) + b = stats.ncx2.veccdf(np.arange(20, 25, 0.2), 2, 1.07458615e+02) + assert_allclose(a, b, rtol=1e-3, atol=0) + + +def test_foldnorm_zero(): + # Parameter value c=0 was not enabled, see gh-2399. 
+ rv = stats.foldnorm(0, scale=1) + assert_equal(rv.cdf(0), 0) # rv.cdf(0) previously resulted in: nan + + +def test_stats_shapes_argcheck(): + # stats method was failing for vector shapes if some of the values + # were outside of the allowed range, see gh-2678 + mv3 = stats.invgamma.stats([0.0, 0.5, 1.0], 1, 0.5) # 0 is not a legal `a` + mv2 = stats.invgamma.stats([0.5, 1.0], 1, 0.5) + mv2_augmented = tuple(np.r_[np.nan, _] for _ in mv2) + assert_equal(mv2_augmented, mv3) + + # -1 is not a legal shape parameter + mv3 = stats.lognorm.stats([2, 2.4, -1]) + mv2 = stats.lognorm.stats([2, 2.4]) + mv2_augmented = tuple(np.r_[_, np.nan] for _ in mv2) + assert_equal(mv2_augmented, mv3) + + # FIXME: this is only a quick-and-dirty test of a quick-and-dirty bugfix. + # stats method with multiple shape parameters is not properly vectorized + # anyway, so some distributions may or may not fail. + + +# Test subclassing distributions w/ explicit shapes + +class _distr_gen(stats.rv_continuous): + + def _pdf(self, x, a): + return 42 + + +class _distr2_gen(stats.rv_continuous): + + def _cdf(self, x, a): + return 42 * a + x + + +class _distr3_gen(stats.rv_continuous): + + def _pdf(self, x, a, b): + return a + b + + def _cdf(self, x, a): + # Different # of shape params from _pdf, to be able to check that + # inspection catches the inconsistency.""" + return 42 * a + x + + +class _distr6_gen(stats.rv_continuous): + # Two shape parameters (both _pdf and _cdf defined, consistent shapes.) + + def _pdf(self, x, a, b): + return a * x + b + + def _cdf(self, x, a, b): + return 42 * a + x + + +class TestSubclassingExplicitShapes(TestCase): + # Construct a distribution w/ explicit shapes parameter and test it. + + def test_correct_shapes(self): + dummy_distr = _distr_gen(name='dummy', shapes='a') + assert_equal(dummy_distr.pdf(1, a=1), 42) + + def test_wrong_shapes_1(self): + dummy_distr = _distr_gen(name='dummy', shapes='A') + assert_raises(TypeError, dummy_distr.pdf, 1, **dict(a=1)) + + def test_wrong_shapes_2(self): + dummy_distr = _distr_gen(name='dummy', shapes='a, b, c') + dct = dict(a=1, b=2, c=3) + assert_raises(TypeError, dummy_distr.pdf, 1, **dct) + + def test_shapes_string(self): + # shapes must be a string + dct = dict(name='dummy', shapes=42) + assert_raises(TypeError, _distr_gen, **dct) + + def test_shapes_identifiers_1(self): + # shapes must be a comma-separated list of valid python identifiers + dct = dict(name='dummy', shapes='(!)') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_identifiers_2(self): + dct = dict(name='dummy', shapes='4chan') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_identifiers_3(self): + dct = dict(name='dummy', shapes='m(fti)') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_identifiers_nodefaults(self): + dct = dict(name='dummy', shapes='a=2') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_args(self): + dct = dict(name='dummy', shapes='*args') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_kwargs(self): + dct = dict(name='dummy', shapes='**kwargs') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_keywords(self): + # python keywords cannot be used for shape parameters + dct = dict(name='dummy', shapes='a, b, c, lambda') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_signature(self): + # test explicit shapes which agree w/ the signature of _pdf + class _dist_gen(stats.rv_continuous): + + def _pdf(self, x, a): + return stats.norm._pdf(x) 
* a + + dist = _dist_gen(shapes='a') + assert_equal(dist.pdf(0.5, a=2), stats.norm.pdf(0.5) * 2) + + def test_shapes_signature_inconsistent(self): + # test explicit shapes which do not agree w/ the signature of _pdf + class _dist_gen(stats.rv_continuous): + + def _pdf(self, x, a): + return stats.norm._pdf(x) * a + + dist = _dist_gen(shapes='a, b') + assert_raises(TypeError, dist.pdf, 0.5, **dict(a=1, b=2)) + + def test_star_args(self): + # test _pdf with only starargs + # NB: **kwargs of pdf will never reach _pdf + class _dist_gen(stats.rv_continuous): + + def _pdf(self, x, *args): + extra_kwarg = args[0] + return stats.norm._pdf(x) * extra_kwarg + + dist = _dist_gen(shapes='extra_kwarg') + assert_equal(dist.pdf(0.5, extra_kwarg=33), stats.norm.pdf(0.5) * 33) + assert_equal(dist.pdf(0.5, 33), stats.norm.pdf(0.5) * 33) + assert_raises(TypeError, dist.pdf, 0.5, **dict(xxx=33)) + + def test_star_args_2(self): + # test _pdf with named & starargs + # NB: **kwargs of pdf will never reach _pdf + class _dist_gen(stats.rv_continuous): + + def _pdf(self, x, offset, *args): + extra_kwarg = args[0] + return stats.norm._pdf(x) * extra_kwarg + offset + + dist = _dist_gen(shapes='offset, extra_kwarg') + assert_equal(dist.pdf(0.5, offset=111, extra_kwarg=33), + stats.norm.pdf(0.5) * 33 + 111) + assert_equal(dist.pdf(0.5, 111, 33), + stats.norm.pdf(0.5) * 33 + 111) + + def test_extra_kwarg(self): + # **kwargs to _pdf are ignored. + # this is a limitation of the framework (_pdf(x, *goodargs)) + class _distr_gen(stats.rv_continuous): + + def _pdf(self, x, *args, **kwargs): + # _pdf should handle *args, **kwargs itself. Here "handling" + # is ignoring *args and looking for ``extra_kwarg`` and using + # that. + extra_kwarg = kwargs.pop('extra_kwarg', 1) + return stats.norm._pdf(x) * extra_kwarg + + dist = _distr_gen(shapes='extra_kwarg') + assert_equal(dist.pdf(1, extra_kwarg=3), stats.norm.pdf(1)) + + def shapes_empty_string(self): + # shapes='' is equivalent to shapes=None + class _dist_gen(stats.rv_continuous): + + def _pdf(self, x): + return stats.norm.pdf(x) + + dist = _dist_gen(shapes='') + assert_equal(dist.pdf(0.5), stats.norm.pdf(0.5)) + + +class TestSubclassingNoShapes(TestCase): + # Construct a distribution w/o explicit shapes parameter and test it. 
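+    # When ``shapes`` is not given, the shape names and ``numargs`` are
+    # expected to be inferred from the signature of ``_pdf``/``_cdf``
+    # (this is what the signature-inspection tests below assert).  A rough
+    # sketch of the behaviour being exercised, reusing ``_distr_gen`` from
+    # above:
+    #
+    #     dist = _distr_gen(name='dummy')   # _pdf(self, x, a) -> one shape
+    #     dist.numargs                      # should be 1
+    #     dist.shapes                       # should be 'a'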
+ + def test_only__pdf(self): + dummy_distr = _distr_gen(name='dummy') + assert_equal(dummy_distr.pdf(1, a=1), 42) + + def test_only__cdf(self): + # _pdf is determined from _cdf by taking numerical derivative + dummy_distr = _distr2_gen(name='dummy') + assert_almost_equal(dummy_distr.pdf(1, a=1), 1) + + @dec.skipif(DOCSTRINGS_STRIPPED) + def test_signature_inspection(self): + # check that _pdf signature inspection works correctly, and is used in + # the class docstring + dummy_distr = _distr_gen(name='dummy') + assert_equal(dummy_distr.numargs, 1) + assert_equal(dummy_distr.shapes, 'a') + res = re.findall('logpdf\(x, a, loc=0, scale=1\)', + dummy_distr.__doc__) + assert_(len(res) == 1) + + @dec.skipif(DOCSTRINGS_STRIPPED) + def test_signature_inspection_2args(self): + # same for 2 shape params and both _pdf and _cdf defined + dummy_distr = _distr6_gen(name='dummy') + assert_equal(dummy_distr.numargs, 2) + assert_equal(dummy_distr.shapes, 'a, b') + res = re.findall('logpdf\(x, a, b, loc=0, scale=1\)', + dummy_distr.__doc__) + assert_(len(res) == 1) + + def test_signature_inspection_2args_incorrect_shapes(self): + # both _pdf and _cdf defined, but shapes are inconsistent: raises + try: + _distr3_gen(name='dummy') + except TypeError: + pass + else: + raise AssertionError('TypeError not raised.') + + def test_defaults_raise(self): + # default arguments should raise + class _dist_gen(stats.rv_continuous): + + def _pdf(self, x, a=42): + return 42 + assert_raises(TypeError, _dist_gen, **dict(name='dummy')) + + def test_starargs_raise(self): + # without explicit shapes, *args are not allowed + class _dist_gen(stats.rv_continuous): + + def _pdf(self, x, a, *args): + return 42 + assert_raises(TypeError, _dist_gen, **dict(name='dummy')) + + def test_kwargs_raise(self): + # without explicit shapes, **kwargs are not allowed + class _dist_gen(stats.rv_continuous): + + def _pdf(self, x, a, **kwargs): + return 42 + assert_raises(TypeError, _dist_gen, **dict(name='dummy')) + + +@dec.skipif(DOCSTRINGS_STRIPPED) +def test_docstrings(): + badones = [',\s*,', '\(\s*,', '^\s*:'] + for distname in stats.__all__: + dist = getattr(stats, distname) + if isinstance(dist, (stats.rv_discrete, stats.rv_continuous)): + for regex in badones: + assert_(re.search(regex, dist.__doc__) is None) + + +def test_infinite_input(): + assert_almost_equal(stats.skellam.sf(np.inf, 10, 11), 0) + assert_almost_equal(stats.ncx2._cdf(np.inf, 8, 0.1), 1) + + +if __name__ == "__main__": + #unittest.main() + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_fit.py b/pywafo/src/wafo/stats/tests/test_fit.py index be81d2c..8b7776b 100644 --- a/pywafo/src/wafo/stats/tests/test_fit.py +++ b/pywafo/src/wafo/stats/tests/test_fit.py @@ -7,7 +7,7 @@ from numpy.testing import dec from wafo import stats -from .test_continuous_basic import distcont +from wafo.stats.tests.test_continuous_basic import distcont # this is not a proper statistical test for convergence, but only # verifies that the estimate and true values don't differ by too much @@ -45,14 +45,14 @@ skip_fit = [ def test_cont_fit(): # this tests the closeness of the estimated parameters to the true # parameters with fit method of continuous distributions - # Note: is slow, some distributions don't converge with sample size <= 10000 + # Note: slow, some distributions don't converge with sample size <= 10000 for distname, arg in distcont: if distname not in skip_fit: - yield check_cont_fit, distname,arg + yield check_cont_fit, distname, arg -def check_cont_fit(distname,arg): 
+def check_cont_fit(distname, arg): if distname in failing_fits: # Skip failing fits unless overridden xfail = True @@ -62,14 +62,16 @@ def check_cont_fit(distname,arg): pass if xfail: msg = "Fitting %s doesn't work reliably yet" % distname - msg += " [Set environment variable SCIPY_XFAIL=1 to run this test nevertheless.]" + msg += " [Set environment variable SCIPY_XFAIL=1 to run this " + \ + "test nevertheless.]" dec.knownfailureif(True, msg)(lambda: None)() distfn = getattr(stats, distname) - - truearg = np.hstack([arg,[0.0,1.0]]) - diffthreshold = np.max(np.vstack([truearg*thresh_percent, - np.ones(distfn.numargs+2)*thresh_min]),0) + + truearg = np.hstack([arg, [0.0, 1.0]]) + diffthreshold = np.max(np.vstack([ + truearg * thresh_percent, + np.ones(distfn.numargs + 2) * thresh_min]), 0) for fit_size in fit_sizes: # Note that if a fit succeeds, the other fit_sizes are skipped @@ -77,12 +79,16 @@ def check_cont_fit(distname,arg): with np.errstate(all='ignore'): rvs = distfn.rvs(size=fit_size, *arg) - est = distfn.fit(rvs) # start with default values + #phat = distfn.fit2(rvs) + phat = distfn.fit2(rvs, method='mps') + est = phat.par + #est = distfn.fit(rvs) # start with default values diff = est - truearg # threshold for location - diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,thresh_min]) + diffthreshold[-2] = np.max([np.abs(rvs.mean()) * thresh_percent, + thresh_min]) if np.any(np.isnan(est)): raise AssertionError('nan returned in fit') diff --git a/pywafo/src/wafo/stats/tests/test_morestats.py b/pywafo/src/wafo/stats/tests/test_morestats.py index 88f00b9..1e09f36 100644 --- a/pywafo/src/wafo/stats/tests/test_morestats.py +++ b/pywafo/src/wafo/stats/tests/test_morestats.py @@ -1,789 +1,807 @@ -# Author: Travis Oliphant, 2002 -# -# Further enhancements and tests added by numerous SciPy developers. 
-# -from __future__ import division, print_function, absolute_import - -import warnings - -import numpy as np -from numpy.random import RandomState -from numpy.testing import (TestCase, run_module_suite, assert_array_equal, - assert_almost_equal, assert_array_less, assert_array_almost_equal, - assert_raises, assert_, assert_allclose, assert_equal, dec) - -from scipy import stats - -# Matplotlib is not a scipy dependency but is optionally used in probplot, so -# check if it's available -try: - import matplotlib.pyplot as plt - have_matplotlib = True -except: - have_matplotlib = False - - -g1 = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000] -g2 = [0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988] -g3 = [0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996] -g4 = [1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996] -g5 = [0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996] -g6 = [1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996] -g7 = [0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002] -g8 = [0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006] -g9 = [1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991] -g10 = [0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997] - - -class TestShapiro(TestCase): - def test_basic(self): - x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46, - 4.43,0.21,4.75,0.71,1.52,3.24, - 0.93,0.42,4.97,9.53,4.55,0.47,6.66] - w,pw = stats.shapiro(x1) - assert_almost_equal(w,0.90047299861907959,6) - assert_almost_equal(pw,0.042089745402336121,6) - x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11, - 3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69, - 0.08,3.67,2.81,3.49] - w,pw = stats.shapiro(x2) - assert_almost_equal(w,0.9590270,6) - assert_almost_equal(pw,0.52460,3) - - def test_bad_arg(self): - # Length of x is less than 3. 
- x = [1] - assert_raises(ValueError, stats.shapiro, x) - - -class TestAnderson(TestCase): - def test_normal(self): - rs = RandomState(1234567890) - x1 = rs.standard_exponential(size=50) - x2 = rs.standard_normal(size=50) - A,crit,sig = stats.anderson(x1) - assert_array_less(crit[:-1], A) - A,crit,sig = stats.anderson(x2) - assert_array_less(A, crit[-2:]) - - def test_expon(self): - rs = RandomState(1234567890) - x1 = rs.standard_exponential(size=50) - x2 = rs.standard_normal(size=50) - A,crit,sig = stats.anderson(x1,'expon') - assert_array_less(A, crit[-2:]) - olderr = np.seterr(all='ignore') - try: - A,crit,sig = stats.anderson(x2,'expon') - finally: - np.seterr(**olderr) - assert_(A > crit[-1]) - - def test_bad_arg(self): - assert_raises(ValueError, stats.anderson, [1], dist='plate_of_shrimp') - - -class TestAnsari(TestCase): - - def test_small(self): - x = [1,2,3,3,4] - y = [3,2,6,1,6,1,4,1] - W, pval = stats.ansari(x,y) - assert_almost_equal(W,23.5,11) - assert_almost_equal(pval,0.13499256881897437,11) - - def test_approx(self): - ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99, - 101, 96, 97, 102, 107, 113, 116, 113, 110, 98)) - parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104, - 100, 96, 108, 103, 104, 114, 114, 113, 108, 106, 99)) - - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', - message="Ties preclude use of exact statistic.") - W, pval = stats.ansari(ramsay, parekh) - - assert_almost_equal(W,185.5,11) - assert_almost_equal(pval,0.18145819972867083,11) - - def test_exact(self): - W,pval = stats.ansari([1,2,3,4],[15,5,20,8,10,12]) - assert_almost_equal(W,10.0,11) - assert_almost_equal(pval,0.533333333333333333,7) - - def test_bad_arg(self): - assert_raises(ValueError, stats.ansari, [], [1]) - assert_raises(ValueError, stats.ansari, [1], []) - - -class TestBartlett(TestCase): - - def test_data(self): - args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10] - T, pval = stats.bartlett(*args) - assert_almost_equal(T,20.78587342806484,7) - assert_almost_equal(pval,0.0136358632781,7) - - def test_bad_arg(self): - # Too few args raises ValueError. - assert_raises(ValueError, stats.bartlett, [1]) - - -class TestLevene(TestCase): - - def test_data(self): - args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10] - W, pval = stats.levene(*args) - assert_almost_equal(W,1.7059176930008939,7) - assert_almost_equal(pval,0.0990829755522,7) - - def test_trimmed1(self): - # Test that center='trimmed' gives the same result as center='mean' - # when proportiontocut=0. - W1, pval1 = stats.levene(g1, g2, g3, center='mean') - W2, pval2 = stats.levene(g1, g2, g3, center='trimmed', proportiontocut=0.0) - assert_almost_equal(W1, W2) - assert_almost_equal(pval1, pval2) - - def test_trimmed2(self): - x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0] - y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0] - np.random.seed(1234) - x2 = np.random.permutation(x) - - # Use center='trimmed' - W0, pval0 = stats.levene(x, y, center='trimmed', proportiontocut=0.125) - W1, pval1 = stats.levene(x2, y, center='trimmed', proportiontocut=0.125) - # Trim the data here, and use center='mean' - W2, pval2 = stats.levene(x[1:-1], y[1:-1], center='mean') - # Result should be the same. 
- assert_almost_equal(W0, W2) - assert_almost_equal(W1, W2) - assert_almost_equal(pval1, pval2) - - def test_equal_mean_median(self): - x = np.linspace(-1,1,21) - np.random.seed(1234) - x2 = np.random.permutation(x) - y = x**3 - W1, pval1 = stats.levene(x, y, center='mean') - W2, pval2 = stats.levene(x2, y, center='median') - assert_almost_equal(W1, W2) - assert_almost_equal(pval1, pval2) - - def test_bad_keyword(self): - x = np.linspace(-1,1,21) - assert_raises(TypeError, stats.levene, x, x, portiontocut=0.1) - - def test_bad_center_value(self): - x = np.linspace(-1,1,21) - assert_raises(ValueError, stats.levene, x, x, center='trim') - - def test_too_few_args(self): - assert_raises(ValueError, stats.levene, [1]) - - -class TestBinomP(TestCase): - - def test_data(self): - pval = stats.binom_test(100,250) - assert_almost_equal(pval,0.0018833009350757682,11) - pval = stats.binom_test(201,405) - assert_almost_equal(pval,0.92085205962670713,11) - pval = stats.binom_test([682,243],p=3.0/4) - assert_almost_equal(pval,0.38249155957481695,11) - - def test_bad_len_x(self): - # Length of x must be 1 or 2. - assert_raises(ValueError, stats.binom_test, [1,2,3]) - - def test_bad_n(self): - # len(x) is 1, but n is invalid. - # Missing n - assert_raises(ValueError, stats.binom_test, [100]) - # n less than x[0] - assert_raises(ValueError, stats.binom_test, [100], n=50) - - def test_bad_p(self): - assert_raises(ValueError, stats.binom_test, [50, 50], p=2.0) - - -class TestFindRepeats(TestCase): - - def test_basic(self): - a = [1,2,3,4,1,2,3,4,1,2,5] - res,nums = stats.find_repeats(a) - assert_array_equal(res,[1,2,3,4]) - assert_array_equal(nums,[3,3,2,2]) - - def test_empty_result(self): - # Check that empty arrays are returned when there are no repeats. - a = [10, 20, 50, 30, 40] - repeated, counts = stats.find_repeats(a) - assert_array_equal(repeated, []) - assert_array_equal(counts, []) - - -class TestFligner(TestCase): - - def test_data(self): - # numbers from R: fligner.test in package stats - x1 = np.arange(5) - assert_array_almost_equal(stats.fligner(x1,x1**2), - (3.2282229927203536, 0.072379187848207877), 11) - - def test_trimmed1(self): - # Test that center='trimmed' gives the same result as center='mean' - # when proportiontocut=0. - Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean') - Xsq2, pval2 = stats.fligner(g1, g2, g3, center='trimmed', proportiontocut=0.0) - assert_almost_equal(Xsq1, Xsq2) - assert_almost_equal(pval1, pval2) - - def test_trimmed2(self): - x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0] - y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0] - # Use center='trimmed' - Xsq1, pval1 = stats.fligner(x, y, center='trimmed', proportiontocut=0.125) - # Trim the data here, and use center='mean' - Xsq2, pval2 = stats.fligner(x[1:-1], y[1:-1], center='mean') - # Result should be the same. - assert_almost_equal(Xsq1, Xsq2) - assert_almost_equal(pval1, pval2) - - # The following test looks reasonable at first, but fligner() uses the - # function stats.rankdata(), and in one of the cases in this test, - # there are ties, while in the other (because of normal rounding - # errors) there are not. This difference leads to differences in the - # third significant digit of W. 
- # - #def test_equal_mean_median(self): - # x = np.linspace(-1,1,21) - # y = x**3 - # W1, pval1 = stats.fligner(x, y, center='mean') - # W2, pval2 = stats.fligner(x, y, center='median') - # assert_almost_equal(W1, W2) - # assert_almost_equal(pval1, pval2) - - def test_bad_keyword(self): - x = np.linspace(-1,1,21) - assert_raises(TypeError, stats.fligner, x, x, portiontocut=0.1) - - def test_bad_center_value(self): - x = np.linspace(-1,1,21) - assert_raises(ValueError, stats.fligner, x, x, center='trim') - - def test_bad_num_args(self): - # Too few args raises ValueError. - assert_raises(ValueError, stats.fligner, [1]) - - -class TestMood(TestCase): - def test_mood(self): - # numbers from R: mood.test in package stats - x1 = np.arange(5) - assert_array_almost_equal(stats.mood(x1, x1**2), - (-1.3830857299399906, 0.16663858066771478), 11) - - def test_mood_order_of_args(self): - # z should change sign when the order of arguments changes, pvalue - # should not change - np.random.seed(1234) - x1 = np.random.randn(10, 1) - x2 = np.random.randn(15, 1) - z1, p1 = stats.mood(x1, x2) - z2, p2 = stats.mood(x2, x1) - assert_array_almost_equal([z1, p1], [-z2, p2]) - - def test_mood_with_axis_none(self): - #Test with axis = None, compare with results from R - x1 = [-0.626453810742332, 0.183643324222082, -0.835628612410047, - 1.59528080213779, 0.329507771815361, -0.820468384118015, - 0.487429052428485, 0.738324705129217, 0.575781351653492, - -0.305388387156356, 1.51178116845085, 0.389843236411431, - -0.621240580541804, -2.2146998871775, 1.12493091814311, - -0.0449336090152309, -0.0161902630989461, 0.943836210685299, - 0.821221195098089, 0.593901321217509] - - x2 = [-0.896914546624981, 0.184849184646742, 1.58784533120882, - -1.13037567424629, -0.0802517565509893, 0.132420284381094, - 0.707954729271733, -0.23969802417184, 1.98447393665293, - -0.138787012119665, 0.417650750792556, 0.981752777463662, - -0.392695355503813, -1.03966897694891, 1.78222896030858, - -2.31106908460517, 0.878604580921265, 0.035806718015226, - 1.01282869212708, 0.432265154539617, 2.09081920524915, - -1.19992581964387, 1.58963820029007, 1.95465164222325, - 0.00493777682814261, -2.45170638784613, 0.477237302613617, - -0.596558168631403, 0.792203270299649, 0.289636710177348] - - x1 = np.array(x1) - x2 = np.array(x2) - x1.shape = (10, 2) - x2.shape = (15, 2) - assert_array_almost_equal(stats.mood(x1, x2, axis=None), - [-1.31716607555, 0.18778296257]) - - def test_mood_2d(self): - # Test if the results of mood test in 2-D case are consistent with the - # R result for the same inputs. Numbers from R mood.test(). 
- ny = 5 - np.random.seed(1234) - x1 = np.random.randn(10, ny) - x2 = np.random.randn(15, ny) - z_vectest, pval_vectest = stats.mood(x1, x2) - - for j in range(ny): - assert_array_almost_equal([z_vectest[j], pval_vectest[j]], - stats.mood(x1[:, j], x2[:, j])) - - # inverse order of dimensions - x1 = x1.transpose() - x2 = x2.transpose() - z_vectest, pval_vectest = stats.mood(x1, x2, axis=1) - - for i in range(ny): - # check axis handling is self consistent - assert_array_almost_equal([z_vectest[i], pval_vectest[i]], - stats.mood(x1[i, :], x2[i, :])) - - def test_mood_3d(self): - shape = (10, 5, 6) - np.random.seed(1234) - x1 = np.random.randn(*shape) - x2 = np.random.randn(*shape) - - for axis in range(3): - z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis) - # Tests that result for 3-D arrays is equal to that for the - # same calculation on a set of 1-D arrays taken from the - # 3-D array - axes_idx = ([1, 2], [0, 2], [0, 1]) # the two axes != axis - for i in range(shape[axes_idx[axis][0]]): - for j in range(shape[axes_idx[axis][1]]): - if axis == 0: - slice1 = x1[:, i, j] - slice2 = x2[:, i, j] - elif axis == 1: - slice1 = x1[i, :, j] - slice2 = x2[i, :, j] - else: - slice1 = x1[i, j, :] - slice2 = x2[i, j, :] - - assert_array_almost_equal([z_vectest[i, j], - pval_vectest[i, j]], - stats.mood(slice1, slice2)) - - def test_mood_bad_arg(self): - # Raise ValueError when the sum of the lengths of the args is less than 3 - assert_raises(ValueError, stats.mood, [1], []) - - -class TestProbplot(TestCase): - - def test_basic(self): - np.random.seed(12345) - x = stats.norm.rvs(size=20) - osm, osr = stats.probplot(x, fit=False) - osm_expected = [-1.8241636, -1.38768012, -1.11829229, -0.91222575, - -0.73908135, -0.5857176, -0.44506467, -0.31273668, - -0.18568928, -0.06158146, 0.06158146, 0.18568928, - 0.31273668, 0.44506467, 0.5857176, 0.73908135, - 0.91222575, 1.11829229, 1.38768012, 1.8241636] - assert_allclose(osr, np.sort(x)) - assert_allclose(osm, osm_expected) - - res, res_fit = stats.probplot(x, fit=True) - res_fit_expected = [1.05361841, 0.31297795, 0.98741609] - assert_allclose(res_fit, res_fit_expected) - - def test_sparams_keyword(self): - np.random.seed(123456) - x = stats.norm.rvs(size=100) - # Check that None, () and 0 (loc=0, for normal distribution) all work - # and give the same results - osm1, osr1 = stats.probplot(x, sparams=None, fit=False) - osm2, osr2 = stats.probplot(x, sparams=0, fit=False) - osm3, osr3 = stats.probplot(x, sparams=(), fit=False) - assert_allclose(osm1, osm2) - assert_allclose(osm1, osm3) - assert_allclose(osr1, osr2) - assert_allclose(osr1, osr3) - # Check giving (loc, scale) params for normal distribution - osm, osr = stats.probplot(x, sparams=(), fit=False) - - def test_dist_keyword(self): - np.random.seed(12345) - x = stats.norm.rvs(size=20) - osm1, osr1 = stats.probplot(x, fit=False, dist='t', sparams=(3,)) - osm2, osr2 = stats.probplot(x, fit=False, dist=stats.t, sparams=(3,)) - assert_allclose(osm1, osm2) - assert_allclose(osr1, osr2) - - assert_raises(ValueError, stats.probplot, x, dist='wrong-dist-name') - assert_raises(AttributeError, stats.probplot, x, dist=[]) - - class custom_dist(object): - """Some class that looks just enough like a distribution.""" - def ppf(self, q): - return stats.norm.ppf(q, loc=2) - - osm1, osr1 = stats.probplot(x, sparams=(2,), fit=False) - osm2, osr2 = stats.probplot(x, dist=custom_dist(), fit=False) - assert_allclose(osm1, osm2) - assert_allclose(osr1, osr2) - - @dec.skipif(not have_matplotlib) - def 
test_plot_kwarg(self): - np.random.seed(7654321) - fig = plt.figure() - fig.add_subplot(111) - x = stats.t.rvs(3, size=100) - res1, fitres1 = stats.probplot(x, plot=plt) - plt.close() - res2, fitres2 = stats.probplot(x, plot=None) - res3 = stats.probplot(x, fit=False, plot=plt) - plt.close() - res4 = stats.probplot(x, fit=False, plot=None) - # Check that results are consistent between combinations of `fit` and - # `plot` keywords. - assert_(len(res1) == len(res2) == len(res3) == len(res4) == 2) - assert_allclose(res1, res2) - assert_allclose(res1, res3) - assert_allclose(res1, res4) - assert_allclose(fitres1, fitres2) - - # Check that a Matplotlib Axes object is accepted - fig = plt.figure() - ax = fig.add_subplot(111) - stats.probplot(x, fit=False, plot=ax) - plt.close() - - def test_probplot_bad_args(self): - # Raise ValueError when given an invalid distribution. - assert_raises(ValueError, stats.probplot, [1], dist="plate_of_shrimp") - - -def test_wilcoxon_bad_arg(): - # Raise ValueError when two args of different lengths are given or - # zero_method is unknown. - assert_raises(ValueError, stats.wilcoxon, [1], [1,2]) - assert_raises(ValueError, stats.wilcoxon, [1,2], [1,2], "dummy") - - -def test_mvsdist_bad_arg(): - # Raise ValueError if fewer than two data points are given. - data = [1] - assert_raises(ValueError, stats.mvsdist, data) - - -def test_kstat_bad_arg(): - # Raise ValueError if n > 4 or n > 1. - data = [1] - n = 10 - assert_raises(ValueError, stats.kstat, data, n=n) - - -def test_kstatvar_bad_arg(): - # Raise ValueError is n is not 1 or 2. - data = [1] - n = 10 - assert_raises(ValueError, stats.kstatvar, data, n=n) - - -def test_ppcc_max_bad_arg(): - # Raise ValueError when given an invalid distribution. - data = [1] - assert_raises(ValueError, stats.ppcc_max, data, dist="plate_of_shrimp") - - -class TestBoxcox_llf(TestCase): - - def test_basic(self): - np.random.seed(54321) - x = stats.norm.rvs(size=10000, loc=10) - lmbda = 1 - llf = stats.boxcox_llf(lmbda, x) - llf_expected = -x.size / 2. * np.log(np.sum(x.std()**2)) - assert_allclose(llf, llf_expected) - - def test_array_like(self): - np.random.seed(54321) - x = stats.norm.rvs(size=100, loc=10) - lmbda = 1 - llf = stats.boxcox_llf(lmbda, x) - llf2 = stats.boxcox_llf(lmbda, list(x)) - assert_allclose(llf, llf2, rtol=1e-12) - - def test_2d_input(self): - # Note: boxcox_llf() was already working with 2-D input (sort of), so - # keep it like that. boxcox() doesn't work with 2-D input though, due - # to brent() returning a scalar. - np.random.seed(54321) - x = stats.norm.rvs(size=100, loc=10) - lmbda = 1 - llf = stats.boxcox_llf(lmbda, x) - llf2 = stats.boxcox_llf(lmbda, np.vstack([x, x]).T) - assert_allclose([llf, llf], llf2, rtol=1e-12) - - def test_empty(self): - assert_(np.isnan(stats.boxcox_llf(1, []))) - - -class TestBoxcox(TestCase): - - def test_fixed_lmbda(self): - np.random.seed(12345) - x = stats.loggamma.rvs(5, size=50) + 5 - xt = stats.boxcox(x, lmbda=1) - assert_allclose(xt, x - 1) - xt = stats.boxcox(x, lmbda=-1) - assert_allclose(xt, 1 - 1/x) - - xt = stats.boxcox(x, lmbda=0) - assert_allclose(xt, np.log(x)) - - # Also test that array_like input works - xt = stats.boxcox(list(x), lmbda=0) - assert_allclose(xt, np.log(x)) - - def test_lmbda_None(self): - np.random.seed(1234567) - # Start from normal rv's, do inverse transform to check that - # optimization function gets close to the right answer. 
- np.random.seed(1245) - lmbda = 2.5 - x = stats.norm.rvs(loc=10, size=50000) - x_inv = (x * lmbda + 1)**(-lmbda) - xt, maxlog = stats.boxcox(x_inv) - - assert_almost_equal(maxlog, -1 / lmbda, decimal=2) - - def test_alpha(self): - np.random.seed(1234) - x = stats.loggamma.rvs(5, size=50) + 5 - - # Some regular values for alpha, on a small sample size - _, _, interval = stats.boxcox(x, alpha=0.75) - assert_allclose(interval, [4.004485780226041, 5.138756355035744]) - _, _, interval = stats.boxcox(x, alpha=0.05) - assert_allclose(interval, [1.2138178554857557, 8.209033272375663]) - - # Try some extreme values, see we don't hit the N=500 limit - x = stats.loggamma.rvs(7, size=500) + 15 - _, _, interval = stats.boxcox(x, alpha=0.001) - assert_allclose(interval, [0.3988867, 11.40553131]) - _, _, interval = stats.boxcox(x, alpha=0.999) - assert_allclose(interval, [5.83316246, 5.83735292]) - - def test_boxcox_bad_arg(self): - # Raise ValueError if any data value is negative. - x = np.array([-1]) - assert_raises(ValueError, stats.boxcox, x) - - def test_empty(self): - assert_(stats.boxcox([]).shape == (0,)) - - -class TestBoxcoxNormmax(TestCase): - def setUp(self): - np.random.seed(12345) - self.x = stats.loggamma.rvs(5, size=50) + 5 - - def test_pearsonr(self): - maxlog = stats.boxcox_normmax(self.x) - assert_allclose(maxlog, 1.804465325046) - - def test_mle(self): - maxlog = stats.boxcox_normmax(self.x, method='mle') - assert_allclose(maxlog, 1.758101454114) - - # Check that boxcox() uses 'mle' - _, maxlog_boxcox = stats.boxcox(self.x) - assert_allclose(maxlog_boxcox, maxlog) - - def test_all(self): - maxlog_all = stats.boxcox_normmax(self.x, method='all') - assert_allclose(maxlog_all, [1.804465325046, 1.758101454114]) - - -class TestBoxcoxNormplot(TestCase): - def setUp(self): - np.random.seed(7654321) - self.x = stats.loggamma.rvs(5, size=500) + 5 - - def test_basic(self): - N = 5 - lmbdas, ppcc = stats.boxcox_normplot(self.x, -10, 10, N=N) - ppcc_expected = [0.57783375, 0.83610988, 0.97524311, 0.99756057, - 0.95843297] - assert_allclose(lmbdas, np.linspace(-10, 10, num=N)) - assert_allclose(ppcc, ppcc_expected) - - @dec.skipif(not have_matplotlib) - def test_plot_kwarg(self): - # Check with the matplotlib.pyplot module - fig = plt.figure() - fig.add_subplot(111) - stats.boxcox_normplot(self.x, -20, 20, plot=plt) - plt.close() - - # Check that a Matplotlib Axes object is accepted - fig.add_subplot(111) - ax = fig.add_subplot(111) - stats.boxcox_normplot(self.x, -20, 20, plot=ax) - plt.close() - - def test_invalid_inputs(self): - # `lb` has to be larger than `la` - assert_raises(ValueError, stats.boxcox_normplot, self.x, 1, 0) - # `x` can not contain negative values - assert_raises(ValueError, stats.boxcox_normplot, [-1, 1] , 0, 1) - - def test_empty(self): - assert_(stats.boxcox_normplot([], 0, 1).size == 0) - - -class TestCircFuncs(TestCase): - def test_circfuncs(self): - x = np.array([355,5,2,359,10,350]) - M = stats.circmean(x, high=360) - Mval = 0.167690146 - assert_allclose(M, Mval, rtol=1e-7) - - V = stats.circvar(x, high=360) - Vval = 42.51955609 - assert_allclose(V, Vval, rtol=1e-7) - - S = stats.circstd(x, high=360) - Sval = 6.520702116 - assert_allclose(S, Sval, rtol=1e-7) - - def test_circfuncs_small(self): - x = np.array([20,21,22,18,19,20.5,19.2]) - M1 = x.mean() - M2 = stats.circmean(x, high=360) - assert_allclose(M2, M1, rtol=1e-5) - - V1 = x.var() - V2 = stats.circvar(x, high=360) - assert_allclose(V2, V1, rtol=1e-4) - - S1 = x.std() - S2 = stats.circstd(x, high=360) - 
assert_allclose(S2, S1, rtol=1e-4) - - def test_circmean_axis(self): - x = np.array([[355,5,2,359,10,350], - [351,7,4,352,9,349], - [357,9,8,358,4,356]]) - M1 = stats.circmean(x, high=360) - M2 = stats.circmean(x.ravel(), high=360) - assert_allclose(M1, M2, rtol=1e-14) - - M1 = stats.circmean(x, high=360, axis=1) - M2 = [stats.circmean(x[i], high=360) for i in range(x.shape[0])] - assert_allclose(M1, M2, rtol=1e-14) - - M1 = stats.circmean(x, high=360, axis=0) - M2 = [stats.circmean(x[:,i], high=360) for i in range(x.shape[1])] - assert_allclose(M1, M2, rtol=1e-14) - - def test_circvar_axis(self): - x = np.array([[355,5,2,359,10,350], - [351,7,4,352,9,349], - [357,9,8,358,4,356]]) - - V1 = stats.circvar(x, high=360) - V2 = stats.circvar(x.ravel(), high=360) - assert_allclose(V1, V2, rtol=1e-11) - - V1 = stats.circvar(x, high=360, axis=1) - V2 = [stats.circvar(x[i], high=360) for i in range(x.shape[0])] - assert_allclose(V1, V2, rtol=1e-11) - - V1 = stats.circvar(x, high=360, axis=0) - V2 = [stats.circvar(x[:,i], high=360) for i in range(x.shape[1])] - assert_allclose(V1, V2, rtol=1e-11) - - def test_circstd_axis(self): - x = np.array([[355,5,2,359,10,350], - [351,7,4,352,9,349], - [357,9,8,358,4,356]]) - - S1 = stats.circstd(x, high=360) - S2 = stats.circstd(x.ravel(), high=360) - assert_allclose(S1, S2, rtol=1e-11) - - S1 = stats.circstd(x, high=360, axis=1) - S2 = [stats.circstd(x[i], high=360) for i in range(x.shape[0])] - assert_allclose(S1, S2, rtol=1e-11) - - S1 = stats.circstd(x, high=360, axis=0) - S2 = [stats.circstd(x[:,i], high=360) for i in range(x.shape[1])] - assert_allclose(S1, S2, rtol=1e-11) - - def test_circfuncs_array_like(self): - x = [355,5,2,359,10,350] - assert_allclose(stats.circmean(x, high=360), 0.167690146, rtol=1e-7) - assert_allclose(stats.circvar(x, high=360), 42.51955609, rtol=1e-7) - assert_allclose(stats.circstd(x, high=360), 6.520702116, rtol=1e-7) - - def test_empty(self): - assert_(np.isnan(stats.circmean([]))) - assert_(np.isnan(stats.circstd([]))) - assert_(np.isnan(stats.circvar([]))) - - -def test_accuracy_wilcoxon(): - freq = [1, 4, 16, 15, 8, 4, 5, 1, 2] - nums = range(-4, 5) - x = np.concatenate([[u] * v for u, v in zip(nums, freq)]) - y = np.zeros(x.size) - - T, p = stats.wilcoxon(x, y, "pratt") - assert_allclose(T, 423) - assert_allclose(p, 0.00197547303533107) - - T, p = stats.wilcoxon(x, y, "zsplit") - assert_allclose(T, 441) - assert_allclose(p, 0.0032145343172473055) - - T, p = stats.wilcoxon(x, y, "wilcox") - assert_allclose(T, 327) - assert_allclose(p, 0.00641346115861) - - # Test the 'correction' option, using values computed in R with: - # > wilcox.test(x, y, paired=TRUE, exact=FALSE, correct={FALSE,TRUE}) - x = np.array([120, 114, 181, 188, 180, 146, 121, 191, 132, 113, 127, 112]) - y = np.array([133, 143, 119, 189, 112, 199, 198, 113, 115, 121, 142, 187]) - T, p = stats.wilcoxon(x, y, correction=False) - assert_equal(T, 34) - assert_allclose(p, 0.6948866, rtol=1e-6) - T, p = stats.wilcoxon(x, y, correction=True) - assert_equal(T, 34) - assert_allclose(p, 0.7240817, rtol=1e-6) - - -def test_wilcoxon_tie(): - # Regression test for gh-2391. 
- # Corresponding R code is: - # > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=FALSE) - # > result$p.value - # [1] 0.001565402 - # > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=TRUE) - # > result$p.value - # [1] 0.001904195 - stat, p = stats.wilcoxon([0.1] * 10) - expected_p = 0.001565402 - assert_equal(stat, 0) - assert_allclose(p, expected_p, rtol=1e-6) - - stat, p = stats.wilcoxon([0.1] * 10, correction=True) - expected_p = 0.001904195 - assert_equal(stat, 0) - assert_allclose(p, expected_p, rtol=1e-6) - - -if __name__ == "__main__": - run_module_suite() +# Author: Travis Oliphant, 2002 +# +# Further enhancements and tests added by numerous SciPy developers. +# +from __future__ import division, print_function, absolute_import + +import warnings + +import numpy as np +from numpy.random import RandomState +from numpy.testing import (TestCase, run_module_suite, assert_array_equal, + assert_almost_equal, assert_array_less, + assert_array_almost_equal, assert_raises, assert_, + assert_allclose, assert_equal, dec) + +from wafo import stats + +# Matplotlib is not a scipy dependency but is optionally used in probplot, so +# check if it's available +try: + import matplotlib.pyplot as plt + have_matplotlib = True +except: + have_matplotlib = False + + +g1 = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000] +g2 = [0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988] +g3 = [0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996] +g4 = [1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996] +g5 = [0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996] +g6 = [1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996] +g7 = [0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002] +g8 = [0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006] +g9 = [1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991] +g10 = [0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997] + + +class TestShapiro(TestCase): + + def test_basic(self): + x1 = [0.11, 7.87, 4.61, 10.14, 7.95, 3.14, 0.46, + 4.43, 0.21, 4.75, 0.71, 1.52, 3.24, + 0.93, 0.42, 4.97, 9.53, 4.55, 0.47, 6.66] + w, pw = stats.shapiro(x1) + assert_almost_equal(w, 0.90047299861907959, 6) + assert_almost_equal(pw, 0.042089745402336121, 6) + x2 = [1.36, 1.14, 2.92, 2.55, 1.46, 1.06, 5.27, -1.11, + 3.48, 1.10, 0.88, -0.51, 1.46, 0.52, 6.20, 1.69, + 0.08, 3.67, 2.81, 3.49] + w, pw = stats.shapiro(x2) + assert_almost_equal(w, 0.9590270, 6) + assert_almost_equal(pw, 0.52460, 3) + + def test_bad_arg(self): + # Length of x is less than 3. 
+ x = [1] + assert_raises(ValueError, stats.shapiro, x) + + +class TestAnderson(TestCase): + + def test_normal(self): + rs = RandomState(1234567890) + x1 = rs.standard_exponential(size=50) + x2 = rs.standard_normal(size=50) + A, crit, _sig = stats.anderson(x1) + assert_array_less(crit[:-1], A) + A, crit, _sig = stats.anderson(x2) + assert_array_less(A, crit[-2:]) + + def test_expon(self): + rs = RandomState(1234567890) + x1 = rs.standard_exponential(size=50) + x2 = rs.standard_normal(size=50) + A, crit, _sig = stats.anderson(x1, 'expon') + assert_array_less(A, crit[-2:]) + olderr = np.seterr(all='ignore') + try: + A, crit, _sig = stats.anderson(x2, 'expon') + finally: + np.seterr(**olderr) + assert_(A > crit[-1]) + + def test_bad_arg(self): + assert_raises(ValueError, stats.anderson, [1], dist='plate_of_shrimp') + + +class TestAnsari(TestCase): + + def test_small(self): + x = [1, 2, 3, 3, 4] + y = [3, 2, 6, 1, 6, 1, 4, 1] + W, pval = stats.ansari(x, y) + assert_almost_equal(W, 23.5, 11) + assert_almost_equal(pval, 0.13499256881897437, 11) + + def test_approx(self): + ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99, + 101, 96, 97, 102, 107, 113, 116, 113, 110, 98)) + parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104, 100, + 96, 108, 103, 104, 114, 114, 113, 108, 106, 99)) + + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', + message="Ties preclude use of exact " + + "statistic.") + W, pval = stats.ansari(ramsay, parekh) + + assert_almost_equal(W, 185.5, 11) + assert_almost_equal(pval, 0.18145819972867083, 11) + + def test_exact(self): + W, pval = stats.ansari([1, 2, 3, 4], [15, 5, 20, 8, 10, 12]) + assert_almost_equal(W, 10.0, 11) + assert_almost_equal(pval, 0.533333333333333333, 7) + + def test_bad_arg(self): + assert_raises(ValueError, stats.ansari, [], [1]) + assert_raises(ValueError, stats.ansari, [1], []) + + +class TestBartlett(TestCase): + + def test_data(self): + args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10] + T, pval = stats.bartlett(*args) + assert_almost_equal(T, 20.78587342806484, 7) + assert_almost_equal(pval, 0.0136358632781, 7) + + def test_bad_arg(self): + # Too few args raises ValueError. + assert_raises(ValueError, stats.bartlett, [1]) + + +class TestLevene(TestCase): + + def test_data(self): + args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10] + W, pval = stats.levene(*args) + assert_almost_equal(W, 1.7059176930008939, 7) + assert_almost_equal(pval, 0.0990829755522, 7) + + def test_trimmed1(self): + # Test that center='trimmed' gives the same result as center='mean' + # when proportiontocut=0. + W1, pval1 = stats.levene(g1, g2, g3, center='mean') + W2, pval2 = stats.levene( + g1, g2, g3, center='trimmed', proportiontocut=0.0) + assert_almost_equal(W1, W2) + assert_almost_equal(pval1, pval2) + + def test_trimmed2(self): + x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0] + y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0] + np.random.seed(1234) + x2 = np.random.permutation(x) + + # Use center='trimmed' + W0, _pval0 = stats.levene(x, y, center='trimmed', + proportiontocut=0.125) + W1, pval1 = stats.levene( + x2, y, center='trimmed', proportiontocut=0.125) + # Trim the data here, and use center='mean' + W2, pval2 = stats.levene(x[1:-1], y[1:-1], center='mean') + # Result should be the same. 
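+        # (With these 8-element samples, proportiontocut=0.125 trims exactly
+        #  one observation from each tail, which is what the x[1:-1] and
+        #  y[1:-1] slices do by hand, since x and y are already sorted.)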
+ assert_almost_equal(W0, W2) + assert_almost_equal(W1, W2) + assert_almost_equal(pval1, pval2) + + def test_equal_mean_median(self): + x = np.linspace(-1, 1, 21) + np.random.seed(1234) + x2 = np.random.permutation(x) + y = x ** 3 + W1, pval1 = stats.levene(x, y, center='mean') + W2, pval2 = stats.levene(x2, y, center='median') + assert_almost_equal(W1, W2) + assert_almost_equal(pval1, pval2) + + def test_bad_keyword(self): + x = np.linspace(-1, 1, 21) + assert_raises(TypeError, stats.levene, x, x, portiontocut=0.1) + + def test_bad_center_value(self): + x = np.linspace(-1, 1, 21) + assert_raises(ValueError, stats.levene, x, x, center='trim') + + def test_too_few_args(self): + assert_raises(ValueError, stats.levene, [1]) + + +class TestBinomP(TestCase): + + def test_data(self): + pval = stats.binom_test(100, 250) + assert_almost_equal(pval, 0.0018833009350757682, 11) + pval = stats.binom_test(201, 405) + assert_almost_equal(pval, 0.92085205962670713, 11) + pval = stats.binom_test([682, 243], p=3.0 / 4) + assert_almost_equal(pval, 0.38249155957481695, 11) + + def test_bad_len_x(self): + # Length of x must be 1 or 2. + assert_raises(ValueError, stats.binom_test, [1, 2, 3]) + + def test_bad_n(self): + # len(x) is 1, but n is invalid. + # Missing n + assert_raises(ValueError, stats.binom_test, [100]) + # n less than x[0] + assert_raises(ValueError, stats.binom_test, [100], n=50) + + def test_bad_p(self): + assert_raises(ValueError, stats.binom_test, [50, 50], p=2.0) + + +class TestFindRepeats(TestCase): + + def test_basic(self): + a = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 5] + res, nums = stats.find_repeats(a) + assert_array_equal(res, [1, 2, 3, 4]) + assert_array_equal(nums, [3, 3, 2, 2]) + + def test_empty_result(self): + # Check that empty arrays are returned when there are no repeats. + a = [10, 20, 50, 30, 40] + repeated, counts = stats.find_repeats(a) + assert_array_equal(repeated, []) + assert_array_equal(counts, []) + + +class TestFligner(TestCase): + + def test_data(self): + # numbers from R: fligner.test in package stats + x1 = np.arange(5) + assert_array_almost_equal(stats.fligner(x1, x1 ** 2), + (3.2282229927203536, 0.072379187848207877), + 11) + + def test_trimmed1(self): + # Test that center='trimmed' gives the same result as center='mean' + # when proportiontocut=0. + Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean') + Xsq2, pval2 = stats.fligner( + g1, g2, g3, center='trimmed', proportiontocut=0.0) + assert_almost_equal(Xsq1, Xsq2) + assert_almost_equal(pval1, pval2) + + def test_trimmed2(self): + x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0] + y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0] + # Use center='trimmed' + Xsq1, pval1 = stats.fligner( + x, y, center='trimmed', proportiontocut=0.125) + # Trim the data here, and use center='mean' + Xsq2, pval2 = stats.fligner(x[1:-1], y[1:-1], center='mean') + # Result should be the same. + assert_almost_equal(Xsq1, Xsq2) + assert_almost_equal(pval1, pval2) + + # The following test looks reasonable at first, but fligner() uses the + # function stats.rankdata(), and in one of the cases in this test, + # there are ties, while in the other (because of normal rounding + # errors) there are not. This difference leads to differences in the + # third significant digit of W. 
+ # + # def test_equal_mean_median(self): + # x = np.linspace(-1,1,21) + # y = x**3 + # W1, pval1 = stats.fligner(x, y, center='mean') + # W2, pval2 = stats.fligner(x, y, center='median') + # assert_almost_equal(W1, W2) + # assert_almost_equal(pval1, pval2) + + def test_bad_keyword(self): + x = np.linspace(-1, 1, 21) + assert_raises(TypeError, stats.fligner, x, x, portiontocut=0.1) + + def test_bad_center_value(self): + x = np.linspace(-1, 1, 21) + assert_raises(ValueError, stats.fligner, x, x, center='trim') + + def test_bad_num_args(self): + # Too few args raises ValueError. + assert_raises(ValueError, stats.fligner, [1]) + + +class TestMood(TestCase): + + def test_mood(self): + # numbers from R: mood.test in package stats + x1 = np.arange(5) + assert_array_almost_equal(stats.mood(x1, x1 ** 2), + (-1.3830857299399906, 0.16663858066771478), + 11) + + def test_mood_order_of_args(self): + # z should change sign when the order of arguments changes, pvalue + # should not change + np.random.seed(1234) + x1 = np.random.randn(10, 1) + x2 = np.random.randn(15, 1) + z1, p1 = stats.mood(x1, x2) + z2, p2 = stats.mood(x2, x1) + assert_array_almost_equal([z1, p1], [-z2, p2]) + + def test_mood_with_axis_none(self): + # Test with axis = None, compare with results from R + x1 = [-0.626453810742332, 0.183643324222082, -0.835628612410047, + 1.59528080213779, 0.329507771815361, -0.820468384118015, + 0.487429052428485, 0.738324705129217, 0.575781351653492, + -0.305388387156356, 1.51178116845085, 0.389843236411431, + -0.621240580541804, -2.2146998871775, 1.12493091814311, + -0.0449336090152309, -0.0161902630989461, 0.943836210685299, + 0.821221195098089, 0.593901321217509] + + x2 = [-0.896914546624981, 0.184849184646742, 1.58784533120882, + -1.13037567424629, -0.0802517565509893, 0.132420284381094, + 0.707954729271733, -0.23969802417184, 1.98447393665293, + -0.138787012119665, 0.417650750792556, 0.981752777463662, + -0.392695355503813, -1.03966897694891, 1.78222896030858, + -2.31106908460517, 0.878604580921265, 0.035806718015226, + 1.01282869212708, 0.432265154539617, 2.09081920524915, + -1.19992581964387, 1.58963820029007, 1.95465164222325, + 0.00493777682814261, -2.45170638784613, 0.477237302613617, + -0.596558168631403, 0.792203270299649, 0.289636710177348] + + x1 = np.array(x1) + x2 = np.array(x2) + x1.shape = (10, 2) + x2.shape = (15, 2) + assert_array_almost_equal(stats.mood(x1, x2, axis=None), + [-1.31716607555, 0.18778296257]) + + def test_mood_2d(self): + # Test if the results of mood test in 2-D case are consistent with the + # R result for the same inputs. Numbers from R mood.test(). 
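+        # mood() works column-wise here (axis=0 is the default), so the
+        # vectorized call returns one (z, p) pair per column; each column is
+        # then checked against the corresponding 1-D computation below.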
+ ny = 5 + np.random.seed(1234) + x1 = np.random.randn(10, ny) + x2 = np.random.randn(15, ny) + z_vectest, pval_vectest = stats.mood(x1, x2) + + for j in range(ny): + assert_array_almost_equal([z_vectest[j], pval_vectest[j]], + stats.mood(x1[:, j], x2[:, j])) + + # inverse order of dimensions + x1 = x1.transpose() + x2 = x2.transpose() + z_vectest, pval_vectest = stats.mood(x1, x2, axis=1) + + for i in range(ny): + # check axis handling is self consistent + assert_array_almost_equal([z_vectest[i], pval_vectest[i]], + stats.mood(x1[i, :], x2[i, :])) + + def test_mood_3d(self): + shape = (10, 5, 6) + np.random.seed(1234) + x1 = np.random.randn(*shape) + x2 = np.random.randn(*shape) + + for axis in range(3): + z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis) + # Tests that result for 3-D arrays is equal to that for the + # same calculation on a set of 1-D arrays taken from the + # 3-D array + axes_idx = ([1, 2], [0, 2], [0, 1]) # the two axes != axis + for i in range(shape[axes_idx[axis][0]]): + for j in range(shape[axes_idx[axis][1]]): + if axis == 0: + slice1 = x1[:, i, j] + slice2 = x2[:, i, j] + elif axis == 1: + slice1 = x1[i, :, j] + slice2 = x2[i, :, j] + else: + slice1 = x1[i, j, :] + slice2 = x2[i, j, :] + + assert_array_almost_equal([z_vectest[i, j], + pval_vectest[i, j]], + stats.mood(slice1, slice2)) + + def test_mood_bad_arg(self): + # Raise ValueError when the sum of the lengths of the args is less than + # 3 + assert_raises(ValueError, stats.mood, [1], []) + + +class TestProbplot(TestCase): + + def test_basic(self): + np.random.seed(12345) + x = stats.norm.rvs(size=20) + osm, osr = stats.probplot(x, fit=False) + osm_expected = [-1.8241636, -1.38768012, -1.11829229, -0.91222575, + -0.73908135, -0.5857176, -0.44506467, -0.31273668, + -0.18568928, -0.06158146, 0.06158146, 0.18568928, + 0.31273668, 0.44506467, 0.5857176, 0.73908135, + 0.91222575, 1.11829229, 1.38768012, 1.8241636] + assert_allclose(osr, np.sort(x)) + assert_allclose(osm, osm_expected) + + _res, res_fit = stats.probplot(x, fit=True) + res_fit_expected = [1.05361841, 0.31297795, 0.98741609] + assert_allclose(res_fit, res_fit_expected) + + def test_sparams_keyword(self): + np.random.seed(123456) + x = stats.norm.rvs(size=100) + # Check that None, () and 0 (loc=0, for normal distribution) all work + # and give the same results + osm1, osr1 = stats.probplot(x, sparams=None, fit=False) + osm2, osr2 = stats.probplot(x, sparams=0, fit=False) + osm3, osr3 = stats.probplot(x, sparams=(), fit=False) + assert_allclose(osm1, osm2) + assert_allclose(osm1, osm3) + assert_allclose(osr1, osr2) + assert_allclose(osr1, osr3) + # Check giving (loc, scale) params for normal distribution + _osm, _osr = stats.probplot(x, sparams=(), fit=False) + + def test_dist_keyword(self): + np.random.seed(12345) + x = stats.norm.rvs(size=20) + osm1, osr1 = stats.probplot(x, fit=False, dist='t', sparams=(3,)) + osm2, osr2 = stats.probplot(x, fit=False, dist=stats.t, sparams=(3,)) + assert_allclose(osm1, osm2) + assert_allclose(osr1, osr2) + + assert_raises(ValueError, stats.probplot, x, dist='wrong-dist-name') + assert_raises(AttributeError, stats.probplot, x, dist=[]) + + class custom_dist(object): + + """Some class that looks just enough like a distribution.""" + + def ppf(self, q): + return stats.norm.ppf(q, loc=2) + + osm1, osr1 = stats.probplot(x, sparams=(2,), fit=False) + osm2, osr2 = stats.probplot(x, dist=custom_dist(), fit=False) + assert_allclose(osm1, osm2) + assert_allclose(osr1, osr2) + + @dec.skipif(not have_matplotlib) + def 
test_plot_kwarg(self): + np.random.seed(7654321) + fig = plt.figure() + fig.add_subplot(111) + x = stats.t.rvs(3, size=100) + res1, fitres1 = stats.probplot(x, plot=plt) + plt.close() + res2, fitres2 = stats.probplot(x, plot=None) + res3 = stats.probplot(x, fit=False, plot=plt) + plt.close() + res4 = stats.probplot(x, fit=False, plot=None) + # Check that results are consistent between combinations of `fit` and + # `plot` keywords. + assert_(len(res1) == len(res2) == len(res3) == len(res4) == 2) + assert_allclose(res1, res2) + assert_allclose(res1, res3) + assert_allclose(res1, res4) + assert_allclose(fitres1, fitres2) + + # Check that a Matplotlib Axes object is accepted + fig = plt.figure() + ax = fig.add_subplot(111) + stats.probplot(x, fit=False, plot=ax) + plt.close() + + def test_probplot_bad_args(self): + # Raise ValueError when given an invalid distribution. + assert_raises(ValueError, stats.probplot, [1], dist="plate_of_shrimp") + + +def test_wilcoxon_bad_arg(): + # Raise ValueError when two args of different lengths are given or + # zero_method is unknown. + assert_raises(ValueError, stats.wilcoxon, [1], [1, 2]) + assert_raises(ValueError, stats.wilcoxon, [1, 2], [1, 2], "dummy") + + +def test_mvsdist_bad_arg(): + # Raise ValueError if fewer than two data points are given. + data = [1] + assert_raises(ValueError, stats.mvsdist, data) + + +def test_kstat_bad_arg(): + # Raise ValueError if n > 4 or n > 1. + data = [1] + n = 10 + assert_raises(ValueError, stats.kstat, data, n=n) + + +def test_kstatvar_bad_arg(): + # Raise ValueError is n is not 1 or 2. + data = [1] + n = 10 + assert_raises(ValueError, stats.kstatvar, data, n=n) + + +def test_ppcc_max_bad_arg(): + # Raise ValueError when given an invalid distribution. + data = [1] + assert_raises(ValueError, stats.ppcc_max, data, dist="plate_of_shrimp") + + +class TestBoxcox_llf(TestCase): + + def test_basic(self): + np.random.seed(54321) + x = stats.norm.rvs(size=10000, loc=10) + lmbda = 1 + llf = stats.boxcox_llf(lmbda, x) + llf_expected = -x.size / 2. * np.log(np.sum(x.std() ** 2)) + assert_allclose(llf, llf_expected) + + def test_array_like(self): + np.random.seed(54321) + x = stats.norm.rvs(size=100, loc=10) + lmbda = 1 + llf = stats.boxcox_llf(lmbda, x) + llf2 = stats.boxcox_llf(lmbda, list(x)) + assert_allclose(llf, llf2, rtol=1e-12) + + def test_2d_input(self): + # Note: boxcox_llf() was already working with 2-D input (sort of), so + # keep it like that. boxcox() doesn't work with 2-D input though, due + # to brent() returning a scalar. + np.random.seed(54321) + x = stats.norm.rvs(size=100, loc=10) + lmbda = 1 + llf = stats.boxcox_llf(lmbda, x) + llf2 = stats.boxcox_llf(lmbda, np.vstack([x, x]).T) + assert_allclose([llf, llf], llf2, rtol=1e-12) + + def test_empty(self): + assert_(np.isnan(stats.boxcox_llf(1, []))) + + +class TestBoxcox(TestCase): + + def test_fixed_lmbda(self): + np.random.seed(12345) + x = stats.loggamma.rvs(5, size=50) + 5 + xt = stats.boxcox(x, lmbda=1) + assert_allclose(xt, x - 1) + xt = stats.boxcox(x, lmbda=-1) + assert_allclose(xt, 1 - 1 / x) + + xt = stats.boxcox(x, lmbda=0) + assert_allclose(xt, np.log(x)) + + # Also test that array_like input works + xt = stats.boxcox(list(x), lmbda=0) + assert_allclose(xt, np.log(x)) + + def test_lmbda_None(self): + np.random.seed(1234567) + # Start from normal rv's, do inverse transform to check that + # optimization function gets close to the right answer. 
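+        # If x is normal, x_inv = (lmbda*x + 1)**(-lmbda) is normalised
+        # exactly by the Box-Cox transform with parameter -1/lmbda:
+        # x_inv**(-1/lmbda) = lmbda*x + 1, so
+        # (x_inv**(-1/lmbda) - 1)/(-1/lmbda) = -lmbda**2 * x, which is again
+        # normal.  Hence maxlog is expected to come out close to -1/lmbda.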
+ np.random.seed(1245) + lmbda = 2.5 + x = stats.norm.rvs(loc=10, size=50000) + x_inv = (x * lmbda + 1) ** (-lmbda) + _xt, maxlog = stats.boxcox(x_inv) + + assert_almost_equal(maxlog, -1 / lmbda, decimal=2) + + def test_alpha(self): + np.random.seed(1234) + x = stats.loggamma.rvs(5, size=50) + 5 + + # Some regular values for alpha, on a small sample size + _, _, interval = stats.boxcox(x, alpha=0.75) + assert_allclose(interval, [4.004485780226041, 5.138756355035744]) + _, _, interval = stats.boxcox(x, alpha=0.05) + assert_allclose(interval, [1.2138178554857557, 8.209033272375663]) + + # Try some extreme values, see we don't hit the N=500 limit + x = stats.loggamma.rvs(7, size=500) + 15 + _, _, interval = stats.boxcox(x, alpha=0.001) + assert_allclose(interval, [0.3988867, 11.40553131]) + _, _, interval = stats.boxcox(x, alpha=0.999) + assert_allclose(interval, [5.83316246, 5.83735292]) + + def test_boxcox_bad_arg(self): + # Raise ValueError if any data value is negative. + x = np.array([-1]) + assert_raises(ValueError, stats.boxcox, x) + + def test_empty(self): + assert_(stats.boxcox([]).shape == (0,)) + + +class TestBoxcoxNormmax(TestCase): + + def setUp(self): + np.random.seed(12345) + self.x = stats.loggamma.rvs(5, size=50) + 5 + + def test_pearsonr(self): + maxlog = stats.boxcox_normmax(self.x) + assert_allclose(maxlog, 1.804465325046) + + def test_mle(self): + maxlog = stats.boxcox_normmax(self.x, method='mle') + assert_allclose(maxlog, 1.758101454114) + + # Check that boxcox() uses 'mle' + _, maxlog_boxcox = stats.boxcox(self.x) + assert_allclose(maxlog_boxcox, maxlog) + + def test_all(self): + maxlog_all = stats.boxcox_normmax(self.x, method='all') + assert_allclose(maxlog_all, [1.804465325046, 1.758101454114]) + + +class TestBoxcoxNormplot(TestCase): + + def setUp(self): + np.random.seed(7654321) + self.x = stats.loggamma.rvs(5, size=500) + 5 + + def test_basic(self): + N = 5 + lmbdas, ppcc = stats.boxcox_normplot(self.x, -10, 10, N=N) + ppcc_expected = [0.57783375, 0.83610988, 0.97524311, 0.99756057, + 0.95843297] + assert_allclose(lmbdas, np.linspace(-10, 10, num=N)) + assert_allclose(ppcc, ppcc_expected) + + @dec.skipif(not have_matplotlib) + def test_plot_kwarg(self): + # Check with the matplotlib.pyplot module + fig = plt.figure() + fig.add_subplot(111) + stats.boxcox_normplot(self.x, -20, 20, plot=plt) + plt.close() + + # Check that a Matplotlib Axes object is accepted + fig.add_subplot(111) + ax = fig.add_subplot(111) + stats.boxcox_normplot(self.x, -20, 20, plot=ax) + plt.close() + + def test_invalid_inputs(self): + # `lb` has to be larger than `la` + assert_raises(ValueError, stats.boxcox_normplot, self.x, 1, 0) + # `x` can not contain negative values + assert_raises(ValueError, stats.boxcox_normplot, [-1, 1], 0, 1) + + def test_empty(self): + assert_(stats.boxcox_normplot([], 0, 1).size == 0) + + +class TestCircFuncs(TestCase): + + def test_circfuncs(self): + x = np.array([355, 5, 2, 359, 10, 350]) + M = stats.circmean(x, high=360) + Mval = 0.167690146 + assert_allclose(M, Mval, rtol=1e-7) + + V = stats.circvar(x, high=360) + Vval = 42.51955609 + assert_allclose(V, Vval, rtol=1e-7) + + S = stats.circstd(x, high=360) + Sval = 6.520702116 + assert_allclose(S, Sval, rtol=1e-7) + + def test_circfuncs_small(self): + x = np.array([20, 21, 22, 18, 19, 20.5, 19.2]) + M1 = x.mean() + M2 = stats.circmean(x, high=360) + assert_allclose(M2, M1, rtol=1e-5) + + V1 = x.var() + V2 = stats.circvar(x, high=360) + assert_allclose(V2, V1, rtol=1e-4) + + S1 = x.std() + S2 = stats.circstd(x, 
high=360) + assert_allclose(S2, S1, rtol=1e-4) + + def test_circmean_axis(self): + x = np.array([[355, 5, 2, 359, 10, 350], + [351, 7, 4, 352, 9, 349], + [357, 9, 8, 358, 4, 356]]) + M1 = stats.circmean(x, high=360) + M2 = stats.circmean(x.ravel(), high=360) + assert_allclose(M1, M2, rtol=1e-14) + + M1 = stats.circmean(x, high=360, axis=1) + M2 = [stats.circmean(x[i], high=360) for i in range(x.shape[0])] + assert_allclose(M1, M2, rtol=1e-14) + + M1 = stats.circmean(x, high=360, axis=0) + M2 = [stats.circmean(x[:, i], high=360) for i in range(x.shape[1])] + assert_allclose(M1, M2, rtol=1e-14) + + def test_circvar_axis(self): + x = np.array([[355, 5, 2, 359, 10, 350], + [351, 7, 4, 352, 9, 349], + [357, 9, 8, 358, 4, 356]]) + + V1 = stats.circvar(x, high=360) + V2 = stats.circvar(x.ravel(), high=360) + assert_allclose(V1, V2, rtol=1e-11) + + V1 = stats.circvar(x, high=360, axis=1) + V2 = [stats.circvar(x[i], high=360) for i in range(x.shape[0])] + assert_allclose(V1, V2, rtol=1e-11) + + V1 = stats.circvar(x, high=360, axis=0) + V2 = [stats.circvar(x[:, i], high=360) for i in range(x.shape[1])] + assert_allclose(V1, V2, rtol=1e-11) + + def test_circstd_axis(self): + x = np.array([[355, 5, 2, 359, 10, 350], + [351, 7, 4, 352, 9, 349], + [357, 9, 8, 358, 4, 356]]) + + S1 = stats.circstd(x, high=360) + S2 = stats.circstd(x.ravel(), high=360) + assert_allclose(S1, S2, rtol=1e-11) + + S1 = stats.circstd(x, high=360, axis=1) + S2 = [stats.circstd(x[i], high=360) for i in range(x.shape[0])] + assert_allclose(S1, S2, rtol=1e-11) + + S1 = stats.circstd(x, high=360, axis=0) + S2 = [stats.circstd(x[:, i], high=360) for i in range(x.shape[1])] + assert_allclose(S1, S2, rtol=1e-11) + + def test_circfuncs_array_like(self): + x = [355, 5, 2, 359, 10, 350] + assert_allclose(stats.circmean(x, high=360), 0.167690146, rtol=1e-7) + assert_allclose(stats.circvar(x, high=360), 42.51955609, rtol=1e-7) + assert_allclose(stats.circstd(x, high=360), 6.520702116, rtol=1e-7) + + def test_empty(self): + assert_(np.isnan(stats.circmean([]))) + assert_(np.isnan(stats.circstd([]))) + assert_(np.isnan(stats.circvar([]))) + + +def test_accuracy_wilcoxon(): + freq = [1, 4, 16, 15, 8, 4, 5, 1, 2] + nums = range(-4, 5) + x = np.concatenate([[u] * v for u, v in zip(nums, freq)]) + y = np.zeros(x.size) + + T, p = stats.wilcoxon(x, y, "pratt") + assert_allclose(T, 423) + assert_allclose(p, 0.00197547303533107) + + T, p = stats.wilcoxon(x, y, "zsplit") + assert_allclose(T, 441) + assert_allclose(p, 0.0032145343172473055) + + T, p = stats.wilcoxon(x, y, "wilcox") + assert_allclose(T, 327) + assert_allclose(p, 0.00641346115861) + + # Test the 'correction' option, using values computed in R with: + # > wilcox.test(x, y, paired=TRUE, exact=FALSE, correct={FALSE,TRUE}) + x = np.array([120, 114, 181, 188, 180, 146, 121, 191, 132, 113, 127, 112]) + y = np.array([133, 143, 119, 189, 112, 199, 198, 113, 115, 121, 142, 187]) + T, p = stats.wilcoxon(x, y, correction=False) + assert_equal(T, 34) + assert_allclose(p, 0.6948866, rtol=1e-6) + T, p = stats.wilcoxon(x, y, correction=True) + assert_equal(T, 34) + assert_allclose(p, 0.7240817, rtol=1e-6) + + +def test_wilcoxon_tie(): + # Regression test for gh-2391. 
+ # Corresponding R code is: + # > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=FALSE) + # > result$p.value + # [1] 0.001565402 + # > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=TRUE) + # > result$p.value + # [1] 0.001904195 + stat, p = stats.wilcoxon([0.1] * 10) + expected_p = 0.001565402 + assert_equal(stat, 0) + assert_allclose(p, expected_p, rtol=1e-6) + + stat, p = stats.wilcoxon([0.1] * 10, correction=True) + expected_p = 0.001904195 + assert_equal(stat, 0) + assert_allclose(p, expected_p, rtol=1e-6) + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_multivariate.py b/pywafo/src/wafo/stats/tests/test_multivariate.py index b6a9a9b..5d39d1c 100644 --- a/pywafo/src/wafo/stats/tests/test_multivariate.py +++ b/pywafo/src/wafo/stats/tests/test_multivariate.py @@ -11,11 +11,11 @@ import numpy import numpy as np import scipy.linalg -import scipy.stats._multivariate -from scipy.stats import multivariate_normal -from scipy.stats import norm +#import wafo.stats._multivariate +from wafo.stats import multivariate_normal +from wafo.stats import norm -from scipy.stats._multivariate import _psd_pinv_decomposed_log_pdet +from wafo.stats._multivariate import _psd_pinv_decomposed_log_pdet from scipy.integrate import romb @@ -70,7 +70,7 @@ def test_large_pseudo_determinant(): #assert_allclose(np.linalg.slogdet(cov[:npos, :npos]), (1, large_total_log)) # Check the pseudo-determinant. - U, log_pdet = scipy.stats._multivariate._psd_pinv_decomposed_log_pdet(cov) + U, log_pdet = _psd_pinv_decomposed_log_pdet(cov) assert_allclose(log_pdet, large_total_log) diff --git a/pywafo/src/wafo/stats/tests/test_rank.py b/pywafo/src/wafo/stats/tests/test_rank.py index 0561bd7..10e0dc1 100644 --- a/pywafo/src/wafo/stats/tests/test_rank.py +++ b/pywafo/src/wafo/stats/tests/test_rank.py @@ -4,7 +4,7 @@ import numpy as np from numpy.testing import TestCase, run_module_suite, assert_equal, \ assert_array_equal -from scipy.stats import rankdata, tiecorrect +from wafo.stats import rankdata, tiecorrect class TestTieCorrect(TestCase): diff --git a/pywafo/src/wafo/stats/tests/test_tukeylambda_stats.py b/pywafo/src/wafo/stats/tests/test_tukeylambda_stats.py index 9d3d654..b1e74a1 100644 --- a/pywafo/src/wafo/stats/tests/test_tukeylambda_stats.py +++ b/pywafo/src/wafo/stats/tests/test_tukeylambda_stats.py @@ -18,7 +18,7 @@ def test_tukeylambda_stats_known_exact(): # lambda = 0 var = tukeylambda_variance(0) - assert_allclose(var, np.pi**2 / 3, atol=1e-12) + assert_allclose(var, np.pi ** 2 / 3, atol=1e-12) kurt = tukeylambda_kurtosis(0) assert_allclose(kurt, 1.2, atol=1e-10) @@ -26,7 +26,7 @@ def test_tukeylambda_stats_known_exact(): var = tukeylambda_variance(0.5) assert_allclose(var, 4 - np.pi, atol=1e-12) kurt = tukeylambda_kurtosis(0.5) - desired = (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3 + desired = (5. 
/ 3 - np.pi / 2) / (np.pi / 4 - 1) ** 2 - 3 assert_allclose(kurt, desired, atol=1e-10) # lambda = 1 diff --git a/pywafo/src/wafo/test/test_misc.py b/pywafo/src/wafo/test/test_misc.py index 81bf112..9052dc6 100644 --- a/pywafo/src/wafo/test/test_misc.py +++ b/pywafo/src/wafo/test/test_misc.py @@ -1,523 +1,470 @@ -import numpy as np # @UnusedImport -#@UnusedImport -from numpy import cos, exp, linspace, pi, sin, diff, arange, ones -from numpy.random import randn # @UnusedImport -from wafo.data import sea # @UnusedImport -from wafo.misc import (JITImport, Bunch, detrendma, DotDict, findcross, ecross, findextrema, # @UnusedImport - #@UnusedImport - findrfc, rfcfilter, findtp, findtc, findoutliers, - common_shape, argsreduce, stirlerr, getshipchar, betaloge, - #@UnusedImport - #@UnusedImport - gravity, nextpow2, discretize, polar2cart, - cart2polar, meshgrid, tranproc) # @UnusedImport + +from numpy.testing import (run_module_suite, assert_equal, assert_almost_equal, + assert_array_equal, assert_array_almost_equal) + +import numpy as np +from numpy import array, cos, exp, linspace, pi, sin, diff, arange, ones +from wafo.data import sea +from wafo.misc import (JITImport, Bunch, detrendma, DotDict, findcross, ecross, + findextrema, findrfc, rfcfilter, findtp, findtc, + findoutliers, common_shape, argsreduce, stirlerr, + getshipchar, betaloge, hygfz, + gravity, nextpow2, discretize, polar2cart, + cart2polar, tranproc) def test_JITImport(): - ''' - >>> np = JITImport('numpy') - >>> np.exp(0)==1.0 - True - ''' + np = JITImport('numpy') + assert_equal(1.0, np.exp(0)) def test_bunch(): - ''' - >>> d = Bunch(test1=1,test2=3) - >>> d.test1; d.test2 - 1 - 3 - ''' + d = Bunch(test1=1, test2=3) + assert_equal(1, d.test1) + assert_equal(3, d.test2) def test_dotdict(): - ''' - >>> d = DotDict(test1=1,test2=3) - >>> d.test1; d.test2 - 1 - 3 - ''' + d = DotDict(test1=1, test2=3) + assert_equal(1, d.test1) + assert_equal(3, d.test2) def test_detrendma(): - ''' - >>> x = linspace(0,1,200) - >>> y = exp(x)+0.1*cos(20*2*pi*x) - >>> y0 = detrendma(y,20); tr = y-y0 - >>> y0,tr - (array([ -1.05815186e-02, -2.48280355e-02, -7.01800760e-02, - -1.27193089e-01, -1.71915213e-01, -1.85125121e-01, - -1.59745361e-01, -1.03571981e-01, -3.62676515e-02, - 1.82219951e-02, 4.09039083e-02, 2.50630186e-02, - -2.11478040e-02, -7.78521440e-02, -1.21116040e-01, - -1.32178923e-01, -1.04689244e-01, -4.71541301e-02, - 2.03417510e-02, 7.38826137e-02, 8.95349902e-02, - 6.68738432e-02, 1.46828486e-02, -4.68648556e-02, - -9.39871606e-02, -1.08465407e-01, -8.46710629e-02, - -3.17365657e-02, 2.99669288e-02, 7.66864134e-02, - 9.04482283e-02, 6.59902473e-02, 1.27914062e-02, - -4.85841870e-02, -9.44185349e-02, -1.06987444e-01, - -8.13964951e-02, -2.74687460e-02, 3.40438793e-02, - 7.94643163e-02, 9.13222681e-02, 6.50922520e-02, - 1.09390148e-02, -5.02028639e-02, -9.47031411e-02, - -1.05349757e-01, -7.79872833e-02, -2.31196073e-02, - 3.81412653e-02, 8.22178144e-02, 9.21605209e-02, - 6.41850565e-02, 9.13184690e-03, -5.17149253e-02, - -9.48363260e-02, -1.03549587e-01, -7.44424124e-02, - -1.86890490e-02, 4.22594607e-02, 8.49486437e-02, - 9.29666543e-02, 6.32740911e-02, 7.37625254e-03, - -5.31142920e-02, -9.48133620e-02, -1.01584110e-01, - -7.07607748e-02, -1.41768231e-02, 4.63990484e-02, - 8.76587937e-02, 9.37446001e-02, 6.23650231e-02, - 5.67876495e-03, -5.43947621e-02, -9.46294406e-02, - -9.94504301e-02, -6.69411601e-02, -9.58252265e-03, - 5.05608316e-02, 9.03505172e-02, 9.44985623e-02, - 6.14637631e-02, 4.04610591e-03, -5.55500040e-02, - 
-9.42796647e-02, -9.71455674e-02, -6.29822440e-02, - -4.90556961e-03, 5.47458452e-02, 9.30263409e-02, - 9.52330253e-02, 6.05764719e-02, 2.48519180e-03, - -5.65735506e-02, -9.37590405e-02, -9.46664506e-02, - -5.88825766e-02, -1.45202622e-04, 5.89553685e-02, - 9.56890756e-02, 9.59527629e-02, 5.97095676e-02, - 1.00314001e-03, -5.74587921e-02, -9.30624694e-02, - -9.20099048e-02, -5.46405701e-02, 4.69953603e-03, - 6.31909369e-02, 9.83418277e-02, 9.66628470e-02, - 5.88697331e-02, -3.92724035e-04, -5.81989687e-02, - -9.21847386e-02, -8.91726414e-02, -5.02544862e-02, - 9.62981387e-03, 6.74543554e-02, 1.00988010e-01, - 9.73686580e-02, 5.80639242e-02, -1.69485946e-03, - -5.87871620e-02, -9.11205115e-02, -8.61512458e-02, - -4.57224228e-02, 1.46470222e-02, 7.17477118e-02, - 1.03631355e-01, 9.80758938e-02, 5.72993780e-02, - -2.89550192e-03, -5.92162868e-02, -8.98643173e-02, - -8.29421650e-02, -4.10422999e-02, 1.97527907e-02, - 7.60733908e-02, 1.06275925e-01, 9.87905812e-02, - 5.65836223e-02, -3.98665495e-03, -5.94790815e-02, - -8.84105398e-02, -7.95416952e-02, -3.62118451e-02, - 2.49490024e-02, 8.04340881e-02, 1.08926128e-01, - 9.95190863e-02, 5.59244846e-02, -4.96008086e-03, - -5.95680980e-02, -8.67534061e-02, -7.59459673e-02, - -3.12285782e-02, 3.02378095e-02, 8.48328258e-02, - 1.11586726e-01, 1.00268126e-01, 5.53301029e-02, - -5.80729079e-03, -5.94756912e-02, -8.48869734e-02, - -7.21509330e-02, -2.60897955e-02, 3.56216499e-02, - 8.92729678e-02, 1.14262857e-01, 1.01044781e-01, - 5.48089359e-02, -6.51953427e-03, -5.91940075e-02, - -8.28051165e-02, -6.81523491e-02, -2.07925530e-02, - 4.11032641e-02, 9.37582360e-02, 1.16960041e-01, - 1.13824241e-01, 7.82451609e-02, 2.87461256e-02, - -1.07566250e-02, -2.01779675e-02, 8.98967999e-03, - 7.03952281e-02, 1.45278564e-01, 2.09706186e-01, - 2.43802139e-01, 2.39414013e-01, 2.03257341e-01, - 1.54325635e-01, 1.16564992e-01, 1.09638547e-01, - 1.41342814e-01, 2.04600808e-01, 2.80191671e-01, - 3.44164010e-01, 3.77073744e-01]), array([ - 1.11058152, 1.11058152, 1.11058152, 1.11058152, 1.11058152, - 1.11058152, 1.11058152, 1.11058152, 1.11058152, 1.11058152, - 1.11058152, 1.11058152, 1.11058152, 1.11058152, 1.11058152, - 1.11058152, 1.11058152, 1.11058152, 1.11058152, 1.11058152, - 1.11599212, 1.12125245, 1.12643866, 1.13166607, 1.13704477, - 1.14263723, 1.14843422, 1.15435845, 1.16029443, 1.16613308, - 1.17181383, 1.17734804, 1.18281471, 1.18833001, 1.19400259, - 1.19989168, 1.20598434, 1.21220048, 1.21842384, 1.22454684, - 1.23051218, 1.23633498, 1.24209697, 1.24791509, 1.25389641, - 1.26009689, 1.26649987, 1.27302256, 1.27954802, 1.28597031, - 1.29223546, 1.29836228, 1.30443522, 1.31057183, 1.31687751, - 1.32340488, 1.3301336 , 1.33697825, 1.34382132, 1.35055864, - 1.35713958, 1.36358668, 1.36998697, 1.37645853, 1.38310497, - 1.38997553, 1.39704621, 1.40422902, 1.41140604, 1.41847493, - 1.4253885 , 1.43217295, 1.43891784, 1.44574164, 1.45274607, - 1.45997696, 1.46740665, 1.47494469, 1.48247285, 1.48989073, - 1.49715462, 1.50429437, 1.51140198, 1.51859618, 1.52597672, - 1.53358594, 1.54139257, 1.5493038 , 1.55720119, 1.56498641, - 1.57261924, 1.58013316, 1.58762252, 1.5952062 , 1.60298187, - 1.61098836, 1.6191908 , 1.62749412, 1.63577979, 1.64395163, - 1.65197298, 1.65988092, 1.66777202, 1.67576523, 1.68395602, - 1.69237968, 1.70099778, 1.70971307, 1.71840707, 1.72698583, - 1.73541631, 1.74373911, 1.75205298, 1.76047677, 1.76910369, - 1.77796544, 1.78702008, 1.79616827, 1.80529169, 1.81429875, - 1.82316 , 1.83191959, 1.84067831, 1.84955481, 1.85863994, - 1.86796178, 
1.87747491, 1.88707803, 1.89665308, 1.9061109 , - 1.91542572, 1.92464514, 1.9338719 , 1.94322436, 1.9527909 , - 1.96259596, 1.97259069, 1.9826719 , 1.99272195, 2.00265419, - 2.01244653, 2.02215 , 2.0318692 , 2.04172204, 2.05179437, - 2.06210696, 2.07260759, 2.08319129, 2.09374092, 2.10417247, - 2.11446752, 2.12468051, 2.13491776, 2.14529665, 2.1559004 , - 2.16674609, 2.17777817, 2.18889002, 2.19996511, 2.21092214, - 2.22174641, 2.23249567, 2.24327791, 2.25420982, 2.26537192, - 2.2767776 , 2.28836802, 2.30003501, 2.3116628 , 2.32317284, - 2.33455419, 2.34586786, 2.35722337, 2.36873665, 2.38048542, - 2.39247934, 2.4046564 , 2.41690694, 2.42911606, 2.44120808, - 2.44120808, 2.44120808, 2.44120808, 2.44120808, 2.44120808, - 2.44120808, 2.44120808, 2.44120808, 2.44120808, 2.44120808, - 2.44120808, 2.44120808, 2.44120808, 2.44120808, 2.44120808, - 2.44120808, 2.44120808, 2.44120808, 2.44120808, 2.44120808])) - ''' + x = linspace(0, 1, 200) + y = exp(x) + 0.1 * cos(20 * 2 * pi * x) + y0 = detrendma(y, 20) + tr = y - y0 + assert_array_almost_equal( + y0, + array( + [-1.05815186e-02, -2.48280355e-02, -7.01800760e-02, + -1.27193089e-01, -1.71915213e-01, -1.85125121e-01, + -1.59745361e-01, -1.03571981e-01, -3.62676515e-02, + 1.82219951e-02, 4.09039083e-02, 2.50630186e-02, + -2.11478040e-02, -7.78521440e-02, -1.21116040e-01, + -1.32178923e-01, -1.04689244e-01, -4.71541301e-02, + 2.03417510e-02, 7.38826137e-02, 8.95349902e-02, + 6.68738432e-02, 1.46828486e-02, -4.68648556e-02, + -9.39871606e-02, -1.08465407e-01, -8.46710629e-02, + -3.17365657e-02, 2.99669288e-02, 7.66864134e-02, + 9.04482283e-02, 6.59902473e-02, 1.27914062e-02, + -4.85841870e-02, -9.44185349e-02, -1.06987444e-01, + -8.13964951e-02, -2.74687460e-02, 3.40438793e-02, + 7.94643163e-02, 9.13222681e-02, 6.50922520e-02, + 1.09390148e-02, -5.02028639e-02, -9.47031411e-02, + -1.05349757e-01, -7.79872833e-02, -2.31196073e-02, + 3.81412653e-02, 8.22178144e-02, 9.21605209e-02, + 6.41850565e-02, 9.13184690e-03, -5.17149253e-02, + -9.48363260e-02, -1.03549587e-01, -7.44424124e-02, + -1.86890490e-02, 4.22594607e-02, 8.49486437e-02, + 9.29666543e-02, 6.32740911e-02, 7.37625254e-03, + -5.31142920e-02, -9.48133620e-02, -1.01584110e-01, + -7.07607748e-02, -1.41768231e-02, 4.63990484e-02, + 8.76587937e-02, 9.37446001e-02, 6.23650231e-02, + 5.67876495e-03, -5.43947621e-02, -9.46294406e-02, + -9.94504301e-02, -6.69411601e-02, -9.58252265e-03, + 5.05608316e-02, 9.03505172e-02, 9.44985623e-02, + 6.14637631e-02, 4.04610591e-03, -5.55500040e-02, + -9.42796647e-02, -9.71455674e-02, -6.29822440e-02, + -4.90556961e-03, 5.47458452e-02, 9.30263409e-02, + 9.52330253e-02, 6.05764719e-02, 2.48519180e-03, + -5.65735506e-02, -9.37590405e-02, -9.46664506e-02, + -5.88825766e-02, -1.45202622e-04, 5.89553685e-02, + 9.56890756e-02, 9.59527629e-02, 5.97095676e-02, + 1.00314001e-03, -5.74587921e-02, -9.30624694e-02, + -9.20099048e-02, -5.46405701e-02, 4.69953603e-03, + 6.31909369e-02, 9.83418277e-02, 9.66628470e-02, + 5.88697331e-02, -3.92724035e-04, -5.81989687e-02, + -9.21847386e-02, -8.91726414e-02, -5.02544862e-02, + 9.62981387e-03, 6.74543554e-02, 1.00988010e-01, + 9.73686580e-02, 5.80639242e-02, -1.69485946e-03, + -5.87871620e-02, -9.11205115e-02, -8.61512458e-02, + -4.57224228e-02, 1.46470222e-02, 7.17477118e-02, + 1.03631355e-01, 9.80758938e-02, 5.72993780e-02, + -2.89550192e-03, -5.92162868e-02, -8.98643173e-02, + -8.29421650e-02, -4.10422999e-02, 1.97527907e-02, + 7.60733908e-02, 1.06275925e-01, 9.87905812e-02, + 5.65836223e-02, -3.98665495e-03, -5.94790815e-02, + 
-8.84105398e-02, -7.95416952e-02, -3.62118451e-02, + 2.49490024e-02, 8.04340881e-02, 1.08926128e-01, + 9.95190863e-02, 5.59244846e-02, -4.96008086e-03, + -5.95680980e-02, -8.67534061e-02, -7.59459673e-02, + -3.12285782e-02, 3.02378095e-02, 8.48328258e-02, + 1.11586726e-01, 1.00268126e-01, 5.53301029e-02, + -5.80729079e-03, -5.94756912e-02, -8.48869734e-02, + -7.21509330e-02, -2.60897955e-02, 3.56216499e-02, + 8.92729678e-02, 1.14262857e-01, 1.01044781e-01, + 5.48089359e-02, -6.51953427e-03, -5.91940075e-02, + -8.28051165e-02, -6.81523491e-02, -2.07925530e-02, + 4.11032641e-02, 9.37582360e-02, 1.16960041e-01, + 1.13824241e-01, 7.82451609e-02, 2.87461256e-02, + -1.07566250e-02, -2.01779675e-02, 8.98967999e-03, + 7.03952281e-02, 1.45278564e-01, 2.09706186e-01, + 2.43802139e-01, 2.39414013e-01, 2.03257341e-01, + 1.54325635e-01, 1.16564992e-01, 1.09638547e-01, + 1.41342814e-01, 2.04600808e-01, 2.80191671e-01, + 3.44164010e-01, 3.77073744e-01 + ])) + assert_array_almost_equal(tr, array([ + 1.11058152, 1.11058152, 1.11058152, 1.11058152, 1.11058152, + 1.11058152, 1.11058152, 1.11058152, 1.11058152, 1.11058152, + 1.11058152, 1.11058152, 1.11058152, 1.11058152, 1.11058152, + 1.11058152, 1.11058152, 1.11058152, 1.11058152, 1.11058152, + 1.11599212, 1.12125245, 1.12643866, 1.13166607, 1.13704477, + 1.14263723, 1.14843422, 1.15435845, 1.16029443, 1.16613308, + 1.17181383, 1.17734804, 1.18281471, 1.18833001, 1.19400259, + 1.19989168, 1.20598434, 1.21220048, 1.21842384, 1.22454684, + 1.23051218, 1.23633498, 1.24209697, 1.24791509, 1.25389641, + 1.26009689, 1.26649987, 1.27302256, 1.27954802, 1.28597031, + 1.29223546, 1.29836228, 1.30443522, 1.31057183, 1.31687751, + 1.32340488, 1.3301336, 1.33697825, 1.34382132, 1.35055864, + 1.35713958, 1.36358668, 1.36998697, 1.37645853, 1.38310497, + 1.38997553, 1.39704621, 1.40422902, 1.41140604, 1.41847493, + 1.4253885, 1.43217295, 1.43891784, 1.44574164, 1.45274607, + 1.45997696, 1.46740665, 1.47494469, 1.48247285, 1.48989073, + 1.49715462, 1.50429437, 1.51140198, 1.51859618, 1.52597672, + 1.53358594, 1.54139257, 1.5493038, 1.55720119, 1.56498641, + 1.57261924, 1.58013316, 1.58762252, 1.5952062, 1.60298187, + 1.61098836, 1.6191908, 1.62749412, 1.63577979, 1.64395163, + 1.65197298, 1.65988092, 1.66777202, 1.67576523, 1.68395602, + 1.69237968, 1.70099778, 1.70971307, 1.71840707, 1.72698583, + 1.73541631, 1.74373911, 1.75205298, 1.76047677, 1.76910369, + 1.77796544, 1.78702008, 1.79616827, 1.80529169, 1.81429875, + 1.82316, 1.83191959, 1.84067831, 1.84955481, 1.85863994, + 1.86796178, 1.87747491, 1.88707803, 1.89665308, 1.9061109, + 1.91542572, 1.92464514, 1.9338719, 1.94322436, 1.9527909, + 1.96259596, 1.97259069, 1.9826719, 1.99272195, 2.00265419, + 2.01244653, 2.02215, 2.0318692, 2.04172204, 2.05179437, + 2.06210696, 2.07260759, 2.08319129, 2.09374092, 2.10417247, + 2.11446752, 2.12468051, 2.13491776, 2.14529665, 2.1559004, + 2.16674609, 2.17777817, 2.18889002, 2.19996511, 2.21092214, + 2.22174641, 2.23249567, 2.24327791, 2.25420982, 2.26537192, + 2.2767776, 2.28836802, 2.30003501, 2.3116628, 2.32317284, + 2.33455419, 2.34586786, 2.35722337, 2.36873665, 2.38048542, + 2.39247934, 2.4046564, 2.41690694, 2.42911606, 2.44120808, + 2.44120808, 2.44120808, 2.44120808, 2.44120808, 2.44120808, + 2.44120808, 2.44120808, 2.44120808, 2.44120808, 2.44120808, + 2.44120808, 2.44120808, 2.44120808, 2.44120808, 2.44120808, + 2.44120808, 2.44120808, 2.44120808, 2.44120808, 2.44120808])) def test_findcross_and_ecross(): - ''' - >>> findcross([0, 0, 1, -1, 1],0) - array([1, 2, 3]) - 
>>> findcross([0, 1, -1, 1],0) - array([0, 1, 2]) - - >>> t = linspace(0,7*pi,250) - >>> x = sin(t) - >>> ind = findcross(x,0.75) - >>> ind - array([ 9, 25, 80, 97, 151, 168, 223, 239]) - >>> t0 = ecross(t,x,ind,0.75) - >>> t0 - array([ 0.84910514, 2.2933879 , 7.13205663, 8.57630119, - 13.41484739, 14.85909194, 19.69776067, 21.14204343]) - ''' + assert_array_equal(findcross([0, 0, 1, -1, 1], 0), np.array([1, 2, 3])) + assert_array_equal(findcross([0, 1, -1, 1], 0), np.array([0, 1, 2])) + + t = linspace(0, 7 * pi, 250) + x = sin(t) + ind = findcross(x, 0.75) + assert_array_equal(ind, np.array([9, 25, 80, 97, 151, 168, 223, 239])) + t0 = ecross(t, x, ind, 0.75) + assert_array_almost_equal(t0, np.array([0.84910514, 2.2933879, 7.13205663, + 8.57630119, 13.41484739, 14.85909194, + 19.69776067, 21.14204343])) def test_findextrema(): - ''' - >>> t = linspace(0,7*pi,250) - >>> x = sin(t) - >>> ind = findextrema(x) - >>> ind - array([ 18, 53, 89, 125, 160, 196, 231]) - ''' + t = linspace(0, 7 * pi, 250) + x = sin(t) + ind = findextrema(x) + assert_array_almost_equal(ind, np.array([18, 53, 89, 125, 160, 196, 231])) def test_findrfc(): - ''' - >>> t = linspace(0,7*pi,250) - >>> x = sin(t)+0.1*sin(50*t) - >>> ind = findextrema(x) - >>> ind - array([ 1, 3, 4, 6, 7, 9, 11, 13, 14, 16, 18, 19, 21, - 23, 25, 26, 28, 29, 31, 33, 35, 36, 38, 39, 41, 43, - 45, 46, 48, 50, 51, 53, 55, 56, 58, 60, 61, 63, 65, - 67, 68, 70, 71, 73, 75, 77, 78, 80, 81, 83, 85, 87, - 88, 90, 92, 93, 95, 97, 99, 100, 102, 103, 105, 107, 109, - 110, 112, 113, 115, 117, 119, 120, 122, 124, 125, 127, 129, 131, - 132, 134, 135, 137, 139, 141, 142, 144, 145, 147, 149, 151, 152, - 154, 156, 157, 159, 161, 162, 164, 166, 167, 169, 171, 173, 174, - 176, 177, 179, 181, 183, 184, 186, 187, 189, 191, 193, 194, 196, - 198, 199, 201, 203, 205, 206, 208, 209, 211, 213, 215, 216, 218, - 219, 221, 223, 225, 226, 228, 230, 231, 233, 235, 237, 238, 240, - 241, 243, 245, 247, 248]) - >>> ti, tp = t[ind], x[ind] - >>> ind1 = findrfc(tp,0.3) - >>> ind1 - array([ 0, 9, 32, 53, 74, 95, 116, 137]) - >>> tp[ind1] - array([-0.00743352, 1.08753972, -1.07206545, 1.09550837, -1.07940458, - 1.07849396, -1.0995006 , 1.08094452]) - ''' + t = linspace(0, 7 * pi, 250) + x = sin(t) + 0.1 * sin(50 * t) + ind = findextrema(x) + assert_array_almost_equal( + ind, + np.array( + [1, 3, 4, 6, 7, 9, 11, 13, 14, 16, 18, 19, 21, + 23, 25, 26, 28, 29, 31, 33, 35, 36, 38, 39, 41, 43, + 45, 46, 48, 50, 51, 53, 55, 56, 58, 60, 61, 63, 65, + 67, 68, 70, 71, 73, 75, 77, 78, 80, 81, 83, 85, 87, + 88, 90, 92, 93, 95, 97, 99, 100, 102, 103, 105, 107, 109, + 110, 112, 113, 115, 117, 119, 120, 122, 124, 125, 127, 129, 131, + 132, 134, 135, 137, 139, 141, 142, 144, 145, 147, 149, 151, 152, + 154, 156, 157, 159, 161, 162, 164, 166, 167, 169, 171, 173, 174, + 176, 177, 179, 181, 183, 184, 186, 187, 189, 191, 193, 194, 196, + 198, 199, 201, 203, 205, 206, 208, 209, 211, 213, 215, 216, 218, + 219, 221, 223, 225, 226, 228, 230, 231, 233, 235, 237, 238, 240, + 241, 243, 245, 247, 248])) + _ti, tp = t[ind], x[ind] + ind1 = findrfc(tp, 0.3) + assert_array_almost_equal( + ind1, + np.array([0, 9, 32, 53, 74, 95, 116, 137])) + assert_array_almost_equal( + tp[ind1], + np.array( + [-0.00743352, 1.08753972, -1.07206545, 1.09550837, -1.07940458, + 1.07849396, -1.0995006, 1.08094452])) def test_rfcfilter(): - ''' - # 1. Filtered signal y is the turning points of x. 
- >>> x = sea() - >>> y = rfcfilter(x[:,1], h=0, method=1) - >>> y[0:5] - array([-1.2004945 , 0.83950546, -0.09049454, -0.02049454, -0.09049454]) + # 1. Filtered signal y is the turning points of x. + x = sea() + y = rfcfilter(x[:, 1], h=0, method=1) + assert_array_almost_equal( + y[0:5], + np.array([-1.2004945, 0.83950546, -0.09049454, + -0.02049454, -0.09049454])) # 2. This removes all rainflow cycles with range less than 0.5. - >>> y1 = rfcfilter(x[:,1], h=0.5) - >>> y1[0:5] - array([-1.2004945 , 0.83950546, -0.43049454, 0.34950546, -0.51049454]) - - >>> t = linspace(0,7*pi,250) - >>> x = sin(t)+0.1*sin(50*t) - >>> ind = findextrema(x) - >>> ind - array([ 1, 3, 4, 6, 7, 9, 11, 13, 14, 16, 18, 19, 21, - 23, 25, 26, 28, 29, 31, 33, 35, 36, 38, 39, 41, 43, - 45, 46, 48, 50, 51, 53, 55, 56, 58, 60, 61, 63, 65, - 67, 68, 70, 71, 73, 75, 77, 78, 80, 81, 83, 85, 87, - 88, 90, 92, 93, 95, 97, 99, 100, 102, 103, 105, 107, 109, - 110, 112, 113, 115, 117, 119, 120, 122, 124, 125, 127, 129, 131, - 132, 134, 135, 137, 139, 141, 142, 144, 145, 147, 149, 151, 152, - 154, 156, 157, 159, 161, 162, 164, 166, 167, 169, 171, 173, 174, - 176, 177, 179, 181, 183, 184, 186, 187, 189, 191, 193, 194, 196, - 198, 199, 201, 203, 205, 206, 208, 209, 211, 213, 215, 216, 218, - 219, 221, 223, 225, 226, 228, 230, 231, 233, 235, 237, 238, 240, - 241, 243, 245, 247, 248]) - >>> ti, tp = t[ind], x[ind] - >>> tp03 = rfcfilter(tp,0.3) - >>> tp03 - array([-0.00743352, 1.08753972, -1.07206545, 1.09550837, -1.07940458, - 1.07849396, -1.0995006 , 1.08094452, 0.11983423]) - ''' + y1 = rfcfilter(x[:, 1], h=0.5) + assert_array_almost_equal( + y1[0:5], + np.array([-1.2004945, 0.83950546, -0.43049454, + 0.34950546, -0.51049454])) + + t = linspace(0, 7 * pi, 250) + x = sin(t) + 0.1 * sin(50 * t) + ind = findextrema(x) + assert_array_almost_equal( + ind, + np.array( + [1, 3, 4, 6, 7, 9, 11, 13, 14, 16, 18, 19, 21, + 23, 25, 26, 28, 29, 31, 33, 35, 36, 38, 39, 41, 43, + 45, 46, 48, 50, 51, 53, 55, 56, 58, 60, 61, 63, 65, + 67, 68, 70, 71, 73, 75, 77, 78, 80, 81, 83, 85, 87, + 88, 90, 92, 93, 95, 97, 99, 100, 102, 103, 105, 107, 109, + 110, 112, 113, 115, 117, 119, 120, 122, 124, 125, 127, 129, 131, + 132, 134, 135, 137, 139, 141, 142, 144, 145, 147, 149, 151, 152, + 154, 156, 157, 159, 161, 162, 164, 166, 167, 169, 171, 173, 174, + 176, 177, 179, 181, 183, 184, 186, 187, 189, 191, 193, 194, 196, + 198, 199, 201, 203, 205, 206, 208, 209, 211, 213, 215, 216, 218, + 219, 221, 223, 225, 226, 228, 230, 231, 233, 235, 237, 238, 240, + 241, 243, 245, 247, 248])) + _ti, tp = t[ind], x[ind] + tp03 = rfcfilter(tp, 0.3) + assert_array_almost_equal( + tp03, + np.array( + [-0.00743352, 1.08753972, -1.07206545, 1.09550837, -1.07940458, + 1.07849396, -1.0995006, 1.08094452, 0.11983423])) def test_findtp(): - ''' - >>> import numpy as np - >>> x = sea() - >>> x1 = x[0:200,:] - >>> itp = findtp(x1[:,1],0,'Mw') - >>> itph = findtp(x1[:,1],0.3,'Mw') - >>> itp - array([ 11, 21, 22, 24, 26, 28, 31, 39, 43, 45, 47, 51, 56, - 64, 70, 78, 82, 84, 89, 94, 101, 108, 119, 131, 141, 148, - 149, 150, 159, 173, 184, 190, 199]) - >>> itph - array([ 11, 28, 31, 39, 47, 51, 56, 64, 70, 78, 89, 94, 101, - 108, 119, 131, 141, 148, 159, 173, 184, 190, 199]) - ''' + x = sea() + x1 = x[0:200, :] + itp = findtp(x1[:, 1], 0, 'Mw') + itph = findtp(x1[:, 1], 0.3, 'Mw') + assert_array_almost_equal( + itp, + np.array( + [11, 21, 22, 24, 26, 28, 31, 39, 43, 45, 47, 51, 56, + 64, 70, 78, 82, 84, 89, 94, 101, 108, 119, 131, 141, 148, + 149, 150, 159, 173, 184, 190, 199])) + 
assert_array_almost_equal( + itph, + np.array( + [11, 28, 31, 39, 47, 51, 56, 64, 70, 78, 89, 94, 101, + 108, 119, 131, 141, 148, 159, 173, 184, 190, 199])) def test_findtc(): - ''' - >>> x = sea() - >>> x1 = x[0:200,:] - >>> itc, iv = findtc(x1[:,1],0,'dw') - >>> itc - array([ 28, 31, 39, 56, 64, 69, 78, 82, 83, 89, 94, 101, 108, - 119, 131, 140, 148, 159, 173, 184]) - >>> iv - array([ 19, 29, 34, 53, 60, 67, 76, 81, 82, 84, 90, 99, 103, - 112, 127, 137, 143, 154, 166, 180, 185]) - ''' + x = sea() + x1 = x[0:200, :] + itc, iv = findtc(x1[:, 1], 0, 'dw') + assert_array_almost_equal( + itc, + np.array( + [28, 31, 39, 56, 64, 69, 78, 82, 83, 89, 94, 101, 108, + 119, 131, 140, 148, 159, 173, 184])) + assert_array_almost_equal( + iv, + np.array( + [19, 29, 34, 53, 60, 67, 76, 81, 82, 84, 90, 99, 103, + 112, 127, 137, 143, 154, 166, 180, 185])) def test_findoutliers(): - ''' - >>> xx = sea() - >>> dt = diff(xx[:2,0]) - >>> dcrit = 5*dt - >>> ddcrit = 9.81/2*dt*dt - >>> zcrit = 0 - >>> [inds, indg] = findoutliers(xx[:,1],zcrit,dcrit,ddcrit,verbose=True) - Found 0 spurious positive jumps of Dx - Found 0 spurious negative jumps of Dx - Found 37 spurious positive jumps of D^2x - Found 200 spurious negative jumps of D^2x - Found 244 consecutive equal values - Found the total of 1152 spurious points - >>> inds - array([ 6, 7, 8, ..., 9509, 9510, 9511]) - >>> indg - array([ 0, 1, 2, ..., 9521, 9522, 9523]) - ''' - + xx = sea() + dt = diff(xx[:2, 0]) + dcrit = 5 * dt + ddcrit = 9.81 / 2 * dt * dt + zcrit = 0 + [inds, indg] = findoutliers(xx[:, 1], zcrit, dcrit, ddcrit, verbose=False) + assert_array_almost_equal(inds[np.r_[0, 1, 2, -3, -2, -1]], + np.array([6, 7, 8, 9509, 9510, 9511])) + assert_array_almost_equal(indg[np.r_[0, 1, 2, -3, -2, -1]], + np.array([0, 1, 2, 9521, 9522, 9523])) + + +def test_hygfz(): + #y = hyp2f1_taylor(-1, -4, 1, .9) + assert_equal(4.6, hygfz(-1, -4, 1, .9)) + assert_almost_equal(1.0464328112173522, hygfz(0.1, 0.2, 0.3, 0.5)) + assert_almost_equal(1.2027034401166194, hygfz(0.1, 0.2, 0.3, 0.95)) + #assert_equal(1.661006238211309e-07, hygfz(5, -300, 10, 0.5)) + assert_equal(0.118311386286, hygfz(0.5, -99.0, 1.5, 0.5625)) + assert_equal(0.0965606007742, hygfz(0.5, -149.0, 1.5, 0.5625)) + assert_equal(0.49234384000963544+0.60513406166123973j, hygfz(1, 1, 4, 3+4j)) def test_common_shape(): - ''' - >>> import numpy as np - >>> A = np.ones((4,1)) - >>> B = 2 - >>> C = np.ones((1,5))*5 - >>> common_shape(A,B,C) - (4, 5) - >>> common_shape(A,B,C,shape=(3,4,1)) - (3, 4, 5) - >>> A = np.ones((4,1)) - >>> B = 2 - >>> C = np.ones((1,5))*5 - >>> common_shape(A,B,C) - (4, 5) - >>> common_shape(A,B,C,shape=(3,4,1)) - (3, 4, 5) - ''' + A = np.ones((4, 1)) + B = 2 + C = np.ones((1, 5)) * 5 + assert_array_equal(common_shape(A, B, C), (4, 5)) + assert_array_equal(common_shape(A, B, C, shape=(3, 4, 1)), (3, 4, 5)) + A = np.ones((4, 1)) + B = 2 + C = np.ones((1, 5)) * 5 + assert_array_equal(common_shape(A, B, C), (4, 5)) + assert_array_equal(common_shape(A, B, C, shape=(3, 4, 1)), (3, 4, 5)) def test_argsreduce(): - ''' - >>> import numpy as np - >>> rand = np.random.random_sample - >>> A = linspace(0,19,20).reshape((4,5)) - >>> B = 2 - >>> C = range(5) - >>> cond = np.ones(A.shape) - >>> [A1,B1,C1] = argsreduce(cond,A,B,C) - >>> B1.shape - (20,) - >>> cond[2,:] = 0 - >>> [A2,B2,C2] = argsreduce(cond,A,B,C) - >>> B2.shape - (15,) - >>> A2;B2;C2 - array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 15., - 16., 17., 18., 19.]) - array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) - array([0, 1, 2, 3, 
4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4]) - ''' + A = linspace(0, 19, 20).reshape((4, 5)) + B = 2 + C = range(5) + cond = np.ones(A.shape) + [_A1, B1, _C1] = argsreduce(cond, A, B, C) + assert_equal(B1.shape, (20,)) + cond[2, :] = 0 + [A2, B2, C2] = argsreduce(cond, A, B, C) + assert_equal(B2.shape, (15,)) + assert_array_equal(A2, + np.array([0., 1., 2., 3., 4., 5., 6., 7., + 8., 9., 15., 16., 17., 18., 19.])) + assert_array_equal( + B2, np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])) + assert_array_equal( + C2, np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4])) def test_stirlerr(): - ''' - >>> stirlerr(range(5)) - array([ inf, 0.08106147, 0.0413407 , 0.02767793, 0.02079067]) - ''' + assert_array_almost_equal(stirlerr(range(5)), + np.array([np.inf, 0.08106147, 0.0413407, 0.02767793, + 0.02079067])) def test_getshipchar(): - ''' - >>> sc = getshipchar(10,'service_speed') - >>> names = ['beam', 'beamSTD', 'draught', - ... 'draughtSTD', 'length', 'lengthSTD', - ... 'max_deadweight', 'max_deadweightSTD', 'propeller_diameter', - ... 'propeller_diameterSTD', 'service_speed', 'service_speedSTD'] - >>> for name in names: print( '%s : %g' % (name, sc[name])) - beam : 29 - beamSTD : 2.9 - draught : 9.6 - draughtSTD : 2.112 - length : 216 - lengthSTD : 2.01131 - max_deadweight : 30969 - max_deadweightSTD : 3096.9 - propeller_diameter : 6.76117 - propeller_diameterSTD : 0.20267 - service_speed : 10 - service_speedSTD : 0 - ''' + sc = getshipchar(10, 'service_speed') + true_sc = dict(beam=29, + beamSTD=2.9, + draught=9.6, + draughtSTD=2.112, + length=216, + lengthSTD=2.011309883194276, + max_deadweight=30969, + max_deadweightSTD=3096.9, + propeller_diameter=6.761165385916601, + propeller_diameterSTD=0.20267047566705432, + service_speed=10, + service_speedSTD=0) + + for name, val in true_sc.iteritems(): + assert_almost_equal(val, sc[name]) def test_betaloge(): - ''' - >>> betaloge(3, arange(4)) - array([ inf, -1.09861229, -2.48490665, -3.40119738]) - ''' + assert_array_almost_equal(betaloge(3, arange(4)), + np.array([np.inf, -1.09861229, -2.48490665, -3.40119738])) def test_gravity(): - ''' - >>> phi = linspace(0,45,5) - >>> gravity(phi) - array([ 9.78049 , 9.78245014, 9.78803583, 9.79640552, 9.80629387]) - ''' + phi = linspace(0, 45, 5) + assert_array_almost_equal(gravity(phi), + np.array([9.78049, 9.78245014, 9.78803583, + 9.79640552, 9.80629387])) def test_nextpow2(): - ''' - >>> nextpow2(10) - 4 - >>> nextpow2(np.arange(5)) - 3 - ''' + assert_equal(nextpow2(10), 4) + assert_equal(nextpow2(np.arange(5)), 3) def test_discretize(): - ''' - >>> x, y = discretize(np.cos,0,np.pi) - >>> x; y - array([ 0. 
, 0.19634954, 0.39269908, 0.58904862, 0.78539816, - 0.9817477 , 1.17809725, 1.37444679, 1.57079633, 1.76714587, - 1.96349541, 2.15984495, 2.35619449, 2.55254403, 2.74889357, - 2.94524311, 3.14159265]) - array([ 1.00000000e+00, 9.80785280e-01, 9.23879533e-01, - 8.31469612e-01, 7.07106781e-01, 5.55570233e-01, - 3.82683432e-01, 1.95090322e-01, 6.12323400e-17, - -1.95090322e-01, -3.82683432e-01, -5.55570233e-01, - -7.07106781e-01, -8.31469612e-01, -9.23879533e-01, - -9.80785280e-01, -1.00000000e+00]) - ''' + x, y = discretize(np.cos, 0, np.pi) + assert_array_almost_equal( + x, + np.array( + [0., 0.19634954, 0.39269908, 0.58904862, 0.78539816, + 0.9817477, 1.17809725, 1.37444679, 1.57079633, 1.76714587, + 1.96349541, 2.15984495, 2.35619449, 2.55254403, 2.74889357, + 2.94524311, 3.14159265])) + assert_array_almost_equal( + y, np.array([1.00000000e+00, 9.80785280e-01, + 9.23879533e-01, + 8.31469612e-01, 7.07106781e-01, 5.55570233e-01, + 3.82683432e-01, 1.95090322e-01, 6.12323400e-17, + -1.95090322e-01, -3.82683432e-01, -5.55570233e-01, + -7.07106781e-01, -8.31469612e-01, -9.23879533e-01, + -9.80785280e-01, -1.00000000e+00])) def test_discretize_adaptive(): - ''' - >>> x, y = discretize(np.cos,0,np.pi, method='adaptive') - >>> x; y - array([ 0. , 0.19634954, 0.39269908, 0.58904862, 0.78539816, - 0.9817477 , 1.17809725, 1.37444679, 1.57079633, 1.76714587, - 1.96349541, 2.15984495, 2.35619449, 2.55254403, 2.74889357, - 2.94524311, 3.14159265]) - array([ 1.00000000e+00, 9.80785280e-01, 9.23879533e-01, - 8.31469612e-01, 7.07106781e-01, 5.55570233e-01, - 3.82683432e-01, 1.95090322e-01, 6.12323400e-17, - -1.95090322e-01, -3.82683432e-01, -5.55570233e-01, - -7.07106781e-01, -8.31469612e-01, -9.23879533e-01, - -9.80785280e-01, -1.00000000e+00]) - ''' - - -def test_pol2cart_n_cart2pol(): - ''' - >>> r = 5 - >>> t = linspace(0,pi,20) - >>> x, y = polar2cart(t,r) - >>> x; y - array([ 5. , 4.93180652, 4.72908621, 4.39736876, 3.94570255, - 3.38640786, 2.73474079, 2.00847712, 1.22742744, 0.41289673, - -0.41289673, -1.22742744, -2.00847712, -2.73474079, -3.38640786, - -3.94570255, -4.39736876, -4.72908621, -4.93180652, -5. ]) - array([ 0.00000000e+00, 8.22972951e-01, 1.62349735e+00, - 2.37973697e+00, 3.07106356e+00, 3.67861955e+00, - 4.18583239e+00, 4.57886663e+00, 4.84700133e+00, - 4.98292247e+00, 4.98292247e+00, 4.84700133e+00, - 4.57886663e+00, 4.18583239e+00, 3.67861955e+00, - 3.07106356e+00, 2.37973697e+00, 1.62349735e+00, - 8.22972951e-01, 6.12323400e-16]) - >>> ti, ri = cart2polar(x,y) - >>> ti;ri - array([ 0. , 0.16534698, 0.33069396, 0.49604095, 0.66138793, - 0.82673491, 0.99208189, 1.15742887, 1.32277585, 1.48812284, - 1.65346982, 1.8188168 , 1.98416378, 2.14951076, 2.31485774, - 2.48020473, 2.64555171, 2.81089869, 2.97624567, 3.14159265]) - array([ 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., - 5., 5., 5., 5., 5., 5., 5.]) - ''' - - -def test_meshgrid(): - ''' - >>> x = np.linspace(0,1,3) # coordinates along x axis - >>> y = np.linspace(0,1,2) # coordinates along y axis - >>> xv, yv = meshgrid(x,y) # extend x and y for a 2D xy grid - >>> xv - array([[ 0. , 0.5, 1. ], - [ 0. , 0.5, 1. ]]) - >>> yv - array([[ 0., 0., 0.], - [ 1., 1., 1.]]) - >>> xv, yv = meshgrid(x,y, sparse=True) # make sparse output arrays - >>> xv - array([[ 0. , 0.5, 1. ]]) - >>> yv - array([[ 0.], - [ 1.]]) - - >>> meshgrid(x,y,sparse=True,indexing='ij') # change to matrix indexing - [array([[ 0. ], - [ 0.5], - [ 1. ]]), array([[ 0., 1.]])] - >>> meshgrid(x,y,indexing='ij') - [array([[ 0. , 0. ], - [ 0.5, 0.5], - [ 1. , 1. 
]]), array([[ 0., 1.], - [ 0., 1.], - [ 0., 1.]])] - - >>> meshgrid(0,1,5) # just a 3D point - [array([[[0]]]), array([[[1]]]), array([[[5]]])] - >>> map(np.squeeze,meshgrid(0,1,5)) # just a 3D point - [array(0), array(1), array(5)] - >>> meshgrid(3) - array([3]) - >>> meshgrid(y) # 1D grid y is just returned - array([ 0., 1.]) - - `meshgrid` is very useful to evaluate functions on a grid. - - >>> x = np.arange(-5, 5, 0.1) - >>> y = np.arange(-5, 5, 0.1) - >>> xx, yy = meshgrid(x, y, sparse=True) - >>> z = np.sin(xx**2+yy**2)/(xx**2+yy**2) - ''' + x, y = discretize(np.cos, 0, np.pi, method='adaptive') + assert_array_almost_equal( + x, + np.array( + [0., 0.19634954, 0.39269908, 0.58904862, 0.78539816, + 0.9817477, 1.17809725, 1.37444679, 1.57079633, 1.76714587, + 1.96349541, 2.15984495, 2.35619449, 2.55254403, 2.74889357, + 2.94524311, 3.14159265])) + assert_array_almost_equal( + y, + np.array( + [1.00000000e+00, 9.80785280e-01, 9.23879533e-01, + 8.31469612e-01, 7.07106781e-01, 5.55570233e-01, + 3.82683432e-01, 1.95090322e-01, 6.12323400e-17, + -1.95090322e-01, -3.82683432e-01, -5.55570233e-01, + -7.07106781e-01, -8.31469612e-01, -9.23879533e-01, + -9.80785280e-01, -1.00000000e+00])) + + +def test_polar2cart_n_cart2polar(): + r = 5 + t = linspace(0, pi, 20) + x, y = polar2cart(t, r) + assert_array_almost_equal( + x, + np.array( + [5., 4.93180652, 4.72908621, 4.39736876, 3.94570255, + 3.38640786, 2.73474079, 2.00847712, 1.22742744, 0.41289673, + -0.41289673, -1.22742744, -2.00847712, -2.73474079, -3.38640786, + -3.94570255, -4.39736876, -4.72908621, -4.93180652, -5.])) + assert_array_almost_equal( + y, + np.array( + [0.00000000e+00, 8.22972951e-01, 1.62349735e+00, + 2.37973697e+00, 3.07106356e+00, 3.67861955e+00, + 4.18583239e+00, 4.57886663e+00, 4.84700133e+00, + 4.98292247e+00, 4.98292247e+00, 4.84700133e+00, + 4.57886663e+00, 4.18583239e+00, 3.67861955e+00, + 3.07106356e+00, 2.37973697e+00, 1.62349735e+00, + 8.22972951e-01, 6.12323400e-16])) + ti, ri = cart2polar(x, y) + assert_array_almost_equal( + ti, + np.array( + [0., 0.16534698, 0.33069396, 0.49604095, 0.66138793, + 0.82673491, 0.99208189, 1.15742887, 1.32277585, 1.48812284, + 1.65346982, 1.8188168, 1.98416378, 2.14951076, 2.31485774, + 2.48020473, 2.64555171, 2.81089869, 2.97624567, 3.14159265])) + assert_array_almost_equal( + ri, + np.array( + [5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., + 5., 5., 5., 5., 5., 5., 5.])) def test_tranproc(): - ''' - >>> import wafo.transform.models as wtm - >>> tr = wtm.TrHermite() - >>> x = linspace(-5,5,501) - >>> g = tr(x) - >>> y0, y1 = tranproc(x, g, range(5), ones(5)) - >>> y0;y1 - array([ 0.02659612, 1.00115284, 1.92872532, 2.81453257, 3.66292878]) - array([ 1.00005295, 0.9501118 , 0.90589954, 0.86643821, 0.83096482]) - ''' + import wafo.transform.models as wtm + tr = wtm.TrHermite() + x = linspace(-5, 5, 501) + g = tr(x) + y0, y1 = tranproc(x, g, range(5), ones(5)) + assert_array_almost_equal( + y0, + np.array([0.02659612, 1.00115284, 1.92872532, + 2.81453257, 3.66292878])) + assert_array_almost_equal( + y1, + np.array([1.00005295, 0.9501118, 0.90589954, + 0.86643821, 0.83096482])) + + if __name__ == '__main__': - import doctest - doctest.testmod() + run_module_suite()
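
The rewritten test_misc.py above drops the doctest runner in favour of numpy.testing, and the module now calls run_module_suite() when executed directly. As a quick usage sketch only (it assumes wafo and its data files are importable on the current path and that nose is installed, which numpy.testing's runner requires; the driver script itself is hypothetical and not part of this patch), the same suite can also be collected programmatically:

    # Hypothetical driver: run the converted numpy.testing suite for the
    # patched module without invoking it as __main__.
    from numpy.testing import run_module_suite
    import wafo.test.test_misc as test_misc  # the module rewritten in the hunk above

    # file_to_run points the nose-based runner at the file whose test_*
    # functions should be collected; when omitted, run_module_suite()
    # inspects the caller's frame, which is why the module can simply call
    # run_module_suite() under its own __main__ guard.
    run_module_suite(file_to_run=test_misc.__file__)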