Source code for beinf

# -*- coding: utf-8 -*-

import numpy as np
from scipy.stats import rv_continuous, beta, binom
from scipy import special

[docs]class beinf_gen(rv_continuous): r"""A zero- and one- inflated beta (BEINF) random variable Notes ----- The probability density function for BEINF is: .. math:: f(x;a,b,p,q) = p(1-q)\delta(x)+(1-p)f_{\mathrm{beta}}(x;a,b) + pq\delta(x-1) where .. math:: f_{\mathrm{beta}}(x;a,b)=\frac{1}{\mathrm{B}(a,b)}x^{a-1}(1-x)^{b-1}, :math:`\delta(x)` is the delta function, and :math:`\mathrm{B}(a,b)` is the beta function (:py:data:`scipy.special.beta`). :class:`beinf` takes :math:`a>0`, :math:`b>0`, :math:`0\leq p \leq 1`, :math:`0\leq q \leq 1` as shape parameters. :class:`beinf` is an instance of a subclass of :py:class:`~scipy.stats.rv_continuous`, and therefore inherits all of the methods within :py:class:`~scipy.stats.rv_continuous`. Some of those methods have been subclassed here: ``_argcheck`` ``_cdf`` ``_pdf`` ``_ppf`` Additional methods added to :class:`beinf` are: ``beta_moments(data_sub)`` Computes the parameters for the beta distribution using the method of moments. ``cdf_eval(x,data_params,data)`` Chooses the appropriate method for evaluating the cumulative distribution function for data at points x, and computes it. ``check_cases(data)`` Checks to see if data satisfies any of cases 1-4. ``ecdf(x, X_samp, p=None, q=None)`` The empirical distribution function X_samp at points x. ``fit(data)`` This replaces the ``fit`` method in :py:class:`~scipy.stats.rv_continuous`, but is not subclassed. Computes the MLE parameters for the BEINF distribution. ``fit_beta(data_sub)`` Computes the MLE parameters for the beta distribution. """ ##################################################################### ######################### Subclassed Methods ######################## ##################################################################### def _pdf(self, x, a, b, p, q): ''' Subclass the _pdf method (returns the pdf of the BEINF distribution) ''' def logpdf_beta(x,a,b): # log of the pdf for beta distribution (taken from scipy.stats.beta) lPx = special.xlog1py(b-1.0, - x) + special.xlogy(a-1.0, x) lPx -= special.betaln(a, b) return lPx def delta(x): if isinstance(x,np.float): #if x comes in as float, turn it into a numpy array x = np.array([x]) y = np.zeros(x.shape) y[x==0.0] = 1.0 return y vals = delta(x)*p*(1-q) + (1.-p)*np.exp(logpdf_beta(x,a,b)) + p*q*delta(x-1.) # boolean list for random variable return vals def _cdf(self, x, a, b, p, q): if np.all(p==1.): #cdf is only described by bernoulli distribution when p=1 return p*binom._cdf(x,1,q) else: # the ranges of x that break up the piecewise # cdf condlist = [x<0.0, x==0.0, np.logical_and(x>0.0, x<1.0), x>=1.0] # the piecewise cdf associated with the entries in condlist choicelist = [0.0, p*(1-q), p*binom._cdf(x,1,q) + (1-p)*special.btdtr(a,b,x), 1.0] return np.select(condlist, choicelist) def _ppf(self, rho, a, b, p, q): # subclass the _ppf method (returns the inverse cdf of the # beinf distribution). NOTE: This is not a true inverse # when p!=0 since the same SIC value can be returned # for a range of probabilities at the endpoints if np.all(p==1): gamma_ = 0.0 else: gamma_ = (rho - p*(1-q))/(1-p) condlist=[np.logical_and(rho>=0., rho<=p*(1-q)), np.logical_and(rho>p*(1-q),rho<(1-p*q)), np.logical_and(rho>=1-p*q, rho<=1.)] choicelist=[0.0, special.btdtri(a,b,gamma_), 1.0] return np.select(condlist, choicelist) def _argcheck(self,a,b,p,q): #subclass the argcheck method to ensure parameters #are constrained to their bounds check1 = np.logical_and(a>=0.,b>=0.) check2 = np.logical_and(p>=0.,p<=1.) check3 = np.logical_and(q>=0.,q<=1.) if check1==True and check2==True and check3==True: return True else: return False ##################################################################### ######################### Methods Unique ############################ ######################### to the beinf ############################ ######################### distribution ############################ #####################################################################
[docs] def cdf_eval(self,x,data_params,data): r''' Evaluates the cumulative distribution function (empirically or parametrically) for a random variable :math:`X\sim\mathrm{BEINF}(a,b,p,q)`. When :math:`a` and :math:`b` are unkown (i.e. when :code:`a=np.inf` and :code:`b=np.inf`), the cdf is evaluated using :meth:`~beinf.beinf_gen.ecdf`. When :math:`a` and :math:`b` are known `or` when :math:`p=1`, the cdf is evaluated using :py:meth:`~scipy.stats.rv_continuous.cdf`. This is used after applying TAQM (see the tutorial on using the :class:`taqm` class). Args: x (float or ndarray): The value(s) at which the ecdf is evaluated. data_params (ndarray): An array containing the four parameters a, b, p, and q. data (ndarray): The data which defines the ecdf when :math:`a` and :math:`b` are unknown (i.e. when :code:`a=np.inf` and :code:`b=np.inf`). This array may contain the values :code:`np.inf`, 0, and 1. Returns: cdf_vals (ndarry): The cdf or ecdf for :math:`X\sim\mathrm{BEINF}(a,b,p,q)` evaluated at x. ''' a,b,p,q = data_params[0],data_params[1],data_params[2],data_params[3] if a==np.inf and p!=1.0: cdf_vals = beinf.ecdf(x,data,p,q) else: rv = beinf(a,b,p,q) cdf_vals = rv.cdf(x) return cdf_vals
[docs] def ecdf(self, x, X_samp, p=None, q=None): r''' For computing the empirical cumulative distribution function (ecdf) for a random variable :math:`X\sim\mathrm{BEINF}(a,b,p,q)` when the parameters :math:`a` and :math:`b` (or additionally :math:`p` and :math:`q`) are unkown. Args: x (float or ndarray): The value(s) at which the ecdf is evaluated X_samp (float or ndarray): A sample that lies on either: the open interval (0,1) when p and q **are** included as arguments, `or` the closed interval [0,1] when p and q **are not** included as arguments. p (float, optional): Shape parameter(s) for the beinf distribution. q (float, optional): Shape parameter(s) for the beinf distribution. Returns: ecdf_vals (ndarray): The ecdf for X_samp, evaluated at x. ''' if isinstance(x,np.float): #if x comes in as float, turn it into a numpy array x = np.array([x]) if isinstance(X_samp,np.float): #if X_samp comes in as float, turn it into a numpy array X_samp = np.array([X_samp]) if p!=None and q!=None: # sort the values of X_samp from smallest to largest # but take out 0's and 1's because p and q take care of endpoints xs = np.sort(X_samp[np.logical_and(X_samp!=0.0,X_samp!=1.0)]) # also take out any inf values xs = np.sort(xs[xs!=np.inf]) # get the sample size of xs satisfying xs<=x for each x ys = map(lambda vals: len(xs[xs<=vals]), x) # the ranges of x that break up the piecewise # cdf condlist = [x<0.0, x==0.0, np.logical_and(x>0.0, x<1.0), x>=1.0] # the piecewise cdf associated with the entries in condlist choicelist = [0.0, p*(1-q), p*(1-q) + (1-p)*np.array(ys)/float(len(xs)), 1.0] else: # sort the values of X_samp from smallest to largest xs = np.sort(X_samp) # get the sample size of xs satisfying xs<=x for each x ys = map(lambda vals: len(xs[xs<=vals]), x) # the ranges of x that break up the piecewise # cdf condlist = [x<0.0, np.logical_and(x>=0.0,x<=1.0)] # the piecewise cdf associated with the entries in condlist choicelist = [0.0, np.array(ys)/float(len(xs))] ecdf_vals = np.select(condlist, choicelist) return ecdf_vals
[docs] def fit(self, data): ''' Computes the MLE parameters :math:`a`, :math:`b`, :math:`p`, and :math:`q` for the BEINF distribution to the given data. Args: data (ndarray): The data to be fit to the BEINF distribution. Returns: a,b, p, q (floats): The shape parameters for the BEINF distribution. When :math:`a` and :math:`b` cannot be fit, they are returned as :code:`a=np.inf` and :code:`b=np.inf`. ''' data = np.copy(data) n=len(data) # Compute mle estimates p and q based on the complete sample #indicator function I = np.zeros(n) I[np.logical_or(data==0.0,data==1.0)] = 1.0 # p is the fraction of the sample that contains either # 0 or 1 p=I.sum()/float(n) # Of those number of 0's and 1's in the sample, q # is the fraction of 1's. if p==0.0: q = 0.0 else: q=(data*I).sum()/I.sum() # Compute estimates for a and b from the sub-sample of data # neither zero nor one # sub-set of data that lies on (0,1) data_sub = data[np.logical_and(data!=0.0,data!=1.0)] # see if any of the special cases are true for data cases = self.check_cases(data) if cases==False: # when all of the cases are false for data, # data_sub can be fit using beinf.fit_beta a, b = self.fit_beta(data_sub) else: # if any of cases 1-4 are True, don't fit beta portion of # the beinf distribution and set a and b equal to infinity # (note that these values of a and b can still be passed # into beinf... the resultant random variable will just # be described by the bernoulli distribution a,b = np.inf, np.inf return a,b,p,q
[docs] def fit_beta(self,data_sub): ''' Computes the MLE parameters :math:`a` and :math:`b` for the beta distribution to the given data. Args: data_sub (ndarray): The data to be fit to the beta distribution. The values in data_sub should lie on the open interval (0,1). Returns: a,b (floats): The shape parameters for the beta distribution. ''' # start out with a and b as arbitrary negative values. # This is to handle instances when errors are raised using the # beta.fit method. When errors are raised this function sets a # and b equal to estimates obtained from the method of moments a=-10 b=-10 while a<0.0 or b<0.0: try: # try to fit the data to the beta distribution # using the built-in beta.fit method a,b,loc,scale = beta.fit(data_sub,floc=0.0,fscale=1.0-0.0) if a<0.0 or b<0.0: # this is to deal with when a and b are found # succesfully by beta.fit, but are non-sensical # negative values. In this case use the # method of moments to find a and b a, b = self.beta_moments(data_sub) except: # this is to deal with when the beta.fit function # does not converge. In this case use the method # of moments to find a and b a, b = self.beta_moments(data_sub) return a, b
[docs] def beta_moments(self,data_sub): ''' Computes the method of moments estimates of `a` and `b` for the beta distribution. Args: data_sub (ndarray): The data to be fit to the beta distribution. The values in data_sub should lie on the open interval (0,1). Returns: a,b (floats): The shape parameters for the beta distribution. ''' # geometric mean data_bar = data_sub.mean() #sample standard devation data_var = data_sub.var(ddof=1) #method-of-moments estimates of a and b a = data_bar*(data_bar*(1.-data_bar)/data_var - 1.) b = (1-data_bar)*(data_bar*(1.-data_bar)/data_var - 1.) return a, b
[docs] def check_cases(self,data): r''' Checks whether data satisfies any of cases 1-4 described in Section 3b of Dirkson et al, 2018. When True, the parameters a and b for the beta-portion of the BEINF distribution cannot be fit. Args: data (ndarray): The values for which to test if cases 1-4 apply. Returns: `True` or `False` Boolean value of `True` or `False`. `True` if any of the cases are satisfied; `False` if all of the cases are not satisfied. ''' case1,case2,case3,case4 = False, False, False, False n = len(data) x_sub = data[np.logical_and(data!=0.0,data!=1.0)] m = n-len(x_sub) #number of zeros and ones in data s_bar = x_sub.mean() v_bar = np.var(x_sub,ddof=1) if n-m==0: case1 = True if n-m==1: case2 = True if n-m>1 and v_bar<1e-20: case3 = True if v_bar>=s_bar*(1.-s_bar): case4 = True if case1==False and case2==False and case3==False and case4==False: return False else: return True
beinf = beinf_gen(a=np.nextafter(0,-1),b=1.0,name='beinf', shapes='a,b,p,q')