# -*- coding: utf-8 -*-
import numpy as np
from scipy.stats import rv_continuous, beta, binom
from scipy import special
[docs]class beinf_gen(rv_continuous):
r"""A zero- and one- inflated beta (BEINF) random variable
Notes
-----
The probability density function for BEINF is:
.. math::
f(x;a,b,p,q) = p(1-q)\delta(x)+(1-p)f_{\mathrm{beta}}(x;a,b) + pq\delta(x-1)
where
.. math::
f_{\mathrm{beta}}(x;a,b)=\frac{1}{\mathrm{B}(a,b)}x^{a-1}(1-x)^{b-1},
:math:`\delta(x)` is the delta function, and :math:`\mathrm{B}(a,b)` is the
beta function (:py:data:`scipy.special.beta`).
:class:`beinf` takes :math:`a>0`, :math:`b>0`, :math:`0\leq p \leq 1`, :math:`0\leq q \leq 1`
as shape parameters.
:class:`beinf` is an instance of a subclass of :py:class:`~scipy.stats.rv_continuous`, and therefore
inherits all of the methods within :py:class:`~scipy.stats.rv_continuous`. Some of those methods
have been subclassed here:
``_argcheck``
``_cdf``
``_pdf``
``_ppf``
Additional methods added to :class:`beinf` are:
``beta_moments(data_sub)``
Computes the parameters for the beta distribution using the method
of moments.
``cdf_eval(x,data_params,data)``
Chooses the appropriate method for evaluating the cumulative
distribution function for data at points x, and computes it.
``check_cases(data)``
Checks to see if data satisfies any of cases 1-4.
``ecdf(x, X_samp, p=None, q=None)``
The empirical distribution function X_samp at points x.
``fit(data)``
This replaces the ``fit`` method in :py:class:`~scipy.stats.rv_continuous`, but
is not subclassed. Computes the MLE parameters for the BEINF distribution.
``fit_beta(data_sub)``
Computes the MLE parameters for the beta distribution.
"""
#####################################################################
######################### Subclassed Methods ########################
#####################################################################
def _pdf(self, x, a, b, p, q):
'''
Subclass the _pdf method (returns the pdf of the
BEINF distribution)
'''
def logpdf_beta(x,a,b):
# log of the pdf for beta distribution (taken from scipy.stats.beta)
lPx = special.xlog1py(b-1.0, - x) + special.xlogy(a-1.0, x)
lPx -= special.betaln(a, b)
return lPx
def delta(x):
if isinstance(x,np.float):
#if x comes in as float, turn it into a numpy array
x = np.array([x])
y = np.zeros(x.shape)
y[x==0.0] = 1.0
return y
vals = delta(x)*p*(1-q) + (1.-p)*np.exp(logpdf_beta(x,a,b)) + p*q*delta(x-1.)
# boolean list for random variable
return vals
def _cdf(self, x, a, b, p, q):
if np.all(p==1.):
#cdf is only described by bernoulli distribution when p=1
return p*binom._cdf(x,1,q)
else:
# the ranges of x that break up the piecewise
# cdf
condlist = [x<0.0,
x==0.0,
np.logical_and(x>0.0, x<1.0),
x>=1.0]
# the piecewise cdf associated with the entries in condlist
choicelist = [0.0,
p*(1-q),
p*binom._cdf(x,1,q) + (1-p)*special.btdtr(a,b,x),
1.0]
return np.select(condlist, choicelist)
def _ppf(self, rho, a, b, p, q):
# subclass the _ppf method (returns the inverse cdf of the
# beinf distribution). NOTE: This is not a true inverse
# when p!=0 since the same SIC value can be returned
# for a range of probabilities at the endpoints
if np.all(p==1):
gamma_ = 0.0
else:
gamma_ = (rho - p*(1-q))/(1-p)
condlist=[np.logical_and(rho>=0., rho<=p*(1-q)),
np.logical_and(rho>p*(1-q),rho<(1-p*q)),
np.logical_and(rho>=1-p*q, rho<=1.)]
choicelist=[0.0, special.btdtri(a,b,gamma_), 1.0]
return np.select(condlist, choicelist)
def _argcheck(self,a,b,p,q):
#subclass the argcheck method to ensure parameters
#are constrained to their bounds
check1 = np.logical_and(a>=0.,b>=0.)
check2 = np.logical_and(p>=0.,p<=1.)
check3 = np.logical_and(q>=0.,q<=1.)
if check1==True and check2==True and check3==True:
return True
else:
return False
#####################################################################
######################### Methods Unique ############################
######################### to the beinf ############################
######################### distribution ############################
#####################################################################
[docs] def cdf_eval(self,x,data_params,data):
r'''
Evaluates the cumulative distribution function (empirically or parametrically)
for a random variable :math:`X\sim\mathrm{BEINF}(a,b,p,q)`. When
:math:`a` and :math:`b` are unkown (i.e. when :code:`a=np.inf`
and :code:`b=np.inf`),
the cdf is evaluated using :meth:`~beinf.beinf_gen.ecdf`. When :math:`a`
and :math:`b` are known `or` when :math:`p=1`, the cdf is
evaluated using :py:meth:`~scipy.stats.rv_continuous.cdf`. This is used
after applying TAQM (see the tutorial on using the :class:`taqm` class).
Args:
x (float or ndarray):
The value(s) at which the ecdf is evaluated.
data_params (ndarray):
An array containing the four parameters a, b, p, and q.
data (ndarray):
The data which defines the ecdf when :math:`a` and :math:`b`
are unknown (i.e. when :code:`a=np.inf` and :code:`b=np.inf`).
This array may contain the values :code:`np.inf`,
0, and 1.
Returns: cdf_vals (ndarry):
The cdf or ecdf for :math:`X\sim\mathrm{BEINF}(a,b,p,q)`
evaluated at x.
'''
a,b,p,q = data_params[0],data_params[1],data_params[2],data_params[3]
if a==np.inf and p!=1.0:
cdf_vals = beinf.ecdf(x,data,p,q)
else:
rv = beinf(a,b,p,q)
cdf_vals = rv.cdf(x)
return cdf_vals
[docs] def ecdf(self, x, X_samp, p=None, q=None):
r'''
For computing the empirical cumulative distribution function (ecdf)
for a random variable :math:`X\sim\mathrm{BEINF}(a,b,p,q)` when the parameters
:math:`a` and :math:`b` (or additionally :math:`p` and :math:`q`) are unkown.
Args:
x (float or ndarray):
The value(s) at which the ecdf is evaluated
X_samp (float or ndarray):
A sample that lies on either: the open
interval (0,1) when p and q **are** included as
arguments, `or` the closed interval
[0,1] when p and q **are not** included as arguments.
p (float, optional):
Shape parameter(s) for the beinf distribution.
q (float, optional):
Shape parameter(s) for the beinf distribution.
Returns: ecdf_vals (ndarray):
The ecdf for X_samp, evaluated at x.
'''
if isinstance(x,np.float):
#if x comes in as float, turn it into a numpy array
x = np.array([x])
if isinstance(X_samp,np.float):
#if X_samp comes in as float, turn it into a numpy array
X_samp = np.array([X_samp])
if p!=None and q!=None:
# sort the values of X_samp from smallest to largest
# but take out 0's and 1's because p and q take care of endpoints
xs = np.sort(X_samp[np.logical_and(X_samp!=0.0,X_samp!=1.0)])
# also take out any inf values
xs = np.sort(xs[xs!=np.inf])
# get the sample size of xs satisfying xs<=x for each x
ys = map(lambda vals: len(xs[xs<=vals]), x)
# the ranges of x that break up the piecewise
# cdf
condlist = [x<0.0,
x==0.0,
np.logical_and(x>0.0, x<1.0),
x>=1.0]
# the piecewise cdf associated with the entries in condlist
choicelist = [0.0,
p*(1-q),
p*(1-q) + (1-p)*np.array(ys)/float(len(xs)),
1.0]
else:
# sort the values of X_samp from smallest to largest
xs = np.sort(X_samp)
# get the sample size of xs satisfying xs<=x for each x
ys = map(lambda vals: len(xs[xs<=vals]), x)
# the ranges of x that break up the piecewise
# cdf
condlist = [x<0.0,
np.logical_and(x>=0.0,x<=1.0)]
# the piecewise cdf associated with the entries in condlist
choicelist = [0.0,
np.array(ys)/float(len(xs))]
ecdf_vals = np.select(condlist, choicelist)
return ecdf_vals
[docs] def fit(self, data):
'''
Computes the MLE parameters :math:`a`, :math:`b`, :math:`p`,
and :math:`q` for the BEINF distribution to the given data.
Args:
data (ndarray):
The data to be fit to the BEINF distribution.
Returns: a,b, p, q (floats):
The shape parameters for the BEINF distribution. When :math:`a`
and :math:`b` cannot be fit, they are returned as
:code:`a=np.inf` and :code:`b=np.inf`.
'''
data = np.copy(data)
n=len(data)
# Compute mle estimates p and q based on the complete sample
#indicator function
I = np.zeros(n)
I[np.logical_or(data==0.0,data==1.0)] = 1.0
# p is the fraction of the sample that contains either
# 0 or 1
p=I.sum()/float(n)
# Of those number of 0's and 1's in the sample, q
# is the fraction of 1's.
if p==0.0:
q = 0.0
else:
q=(data*I).sum()/I.sum()
# Compute estimates for a and b from the sub-sample of data
# neither zero nor one
# sub-set of data that lies on (0,1)
data_sub = data[np.logical_and(data!=0.0,data!=1.0)]
# see if any of the special cases are true for data
cases = self.check_cases(data)
if cases==False:
# when all of the cases are false for data,
# data_sub can be fit using beinf.fit_beta
a, b = self.fit_beta(data_sub)
else:
# if any of cases 1-4 are True, don't fit beta portion of
# the beinf distribution and set a and b equal to infinity
# (note that these values of a and b can still be passed
# into beinf... the resultant random variable will just
# be described by the bernoulli distribution
a,b = np.inf, np.inf
return a,b,p,q
[docs] def fit_beta(self,data_sub):
'''
Computes the MLE parameters :math:`a` and :math:`b` for the
beta distribution to the given data.
Args:
data_sub (ndarray):
The data to be fit to the beta distribution. The values in
data_sub should lie on the open interval (0,1).
Returns: a,b (floats):
The shape parameters for the beta distribution.
'''
# start out with a and b as arbitrary negative values.
# This is to handle instances when errors are raised using the
# beta.fit method. When errors are raised this function sets a
# and b equal to estimates obtained from the method of moments
a=-10
b=-10
while a<0.0 or b<0.0:
try:
# try to fit the data to the beta distribution
# using the built-in beta.fit method
a,b,loc,scale = beta.fit(data_sub,floc=0.0,fscale=1.0-0.0)
if a<0.0 or b<0.0:
# this is to deal with when a and b are found
# succesfully by beta.fit, but are non-sensical
# negative values. In this case use the
# method of moments to find a and b
a, b = self.beta_moments(data_sub)
except:
# this is to deal with when the beta.fit function
# does not converge. In this case use the method
# of moments to find a and b
a, b = self.beta_moments(data_sub)
return a, b
[docs] def beta_moments(self,data_sub):
'''
Computes the method of moments estimates of `a` and `b` for the
beta distribution.
Args:
data_sub (ndarray):
The data to be fit to the beta distribution. The values in
data_sub should lie on the open interval (0,1).
Returns: a,b (floats):
The shape parameters for the beta distribution.
'''
# geometric mean
data_bar = data_sub.mean()
#sample standard devation
data_var = data_sub.var(ddof=1)
#method-of-moments estimates of a and b
a = data_bar*(data_bar*(1.-data_bar)/data_var - 1.)
b = (1-data_bar)*(data_bar*(1.-data_bar)/data_var - 1.)
return a, b
[docs] def check_cases(self,data):
r'''
Checks whether data satisfies any of cases 1-4 described in
Section 3b of Dirkson et al, 2018. When True, the parameters a
and b for the beta-portion of the BEINF distribution cannot be fit.
Args:
data (ndarray):
The values for which to test if cases 1-4 apply.
Returns: `True` or `False`
Boolean value of `True` or `False`. `True` if any of the cases
are satisfied; `False` if all of the cases are
not satisfied.
'''
case1,case2,case3,case4 = False, False, False, False
n = len(data)
x_sub = data[np.logical_and(data!=0.0,data!=1.0)]
m = n-len(x_sub) #number of zeros and ones in data
s_bar = x_sub.mean()
v_bar = np.var(x_sub,ddof=1)
if n-m==0:
case1 = True
if n-m==1:
case2 = True
if n-m>1 and v_bar<1e-20:
case3 = True
if v_bar>=s_bar*(1.-s_bar):
case4 = True
if case1==False and case2==False and case3==False and case4==False:
return False
else:
return True
beinf = beinf_gen(a=np.nextafter(0,-1),b=1.0,name='beinf',
shapes='a,b,p,q')