Source code for xpipe.tools.selector

"""
Manipulate arrays and parameter distributions
"""

import numpy as np
import itertools


[docs]def partition(lst, n):
    """
    Divides a list into N roughly equal chunks

    Examples
    --------
    Define some test list, and look at the obtained chunks with different :code:`n` values::

        >>> lst = np.arange(20)
        >>> lst
        array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19])
        >>> partition(lst, n=5)
        [array([0, 1, 2, 3]),
         array([4, 5, 6, 7]),
         array([ 8,  9, 10, 11]),
         array([12, 13, 14, 15]),
         array([16, 17, 18, 19])]
        >>> partition(lst, n=6)
        [array([0, 1, 2]),
         array([3, 4, 5, 6]),
         array([7, 8, 9]),
         array([10, 11, 12]),
         array([13, 14, 15, 16]),
         array([17, 18, 19])]

    As we can see, even when :code:`n` is not a divisor of :code:`len(lst)`, it returns
    roughly balanced chunks

    Parameters
    ----------
    lst : list
        list to split up
    n : int
        chunks to make

    Returns
    -------
    list of lists
        list of chunks
    """
    division = len(lst) / float(n)
    return [lst[int(round(division * i)): int(round(division * (i + 1)))]
            for i in range(n) ]


[docs]def safedivide(x, y, eps=1e-8):
    """
    Calculates x / y for arrays, setting result to zero if x ~ 0 OR y ~ 0

    Defined such that

        :math:`5 / 5 \\rightarrow 1`

    but
    
        :math:`5 / 0 \\rightarrow 0` and :math:`0 / 5 \\rightarrow 0` and :math:`0 / 0 \\rightarrow 0`


    Parameters
    ----------
    x : np.array
    y : np.array
    eps : float
        threshold for setting element to zero

    Returns
    -------
    np.array
        x / y where (xabs > eps) and (yabs > eps), 0 elsewhere
    """
    xabs = np.abs(x)
    yabs = np.abs(y)
    gind = np.where((xabs > eps) * (yabs > eps))

    res = np.zeros(shape=xabs.shape)
    res[gind] = x[gind] / y[gind]
    return res


[docs]def selector(pps, limits):
    """
    Applies selection to array based on the passed parameter limits

    Selection is defined like :math:`x\\in[x_0;\\;x_1)`

    Examples
    --------

    First make some mock data::

        np.random.seed(seed = 5)
        nrows = 100000
        ids = np.arange(nrows, dtype=int)
        ra = np.random.uniform(low=0., high=360., size=nrows)
        dec = np.random.uniform(low=-60., high=10., size=nrows)
        z = np.random.uniform(low=0.1, high=1.0, size=nrows)
        lamb = np.random.uniform(low=20, high=40, size=nrows)

        data = np.vstack((ra, dec, z, lamb)).T

    then define binning edges::

        ra_edges = (0., 100., 300.)
        dec_edges = (-60., -30., 10.)
        z_edges = (0.2, 0.4, 0.8)
        lamb_edges = (20., 30., 45., 60.)

        edges = (ra_edges, dec_edges, z_edges, lamb_edges)

    This selection divides the data into 24 bins in 4-D space, which we
    can obtain perform as::

        sinds, bounds, plpairs = sl.selector(data, edges)

    Let's inspect the output, :code:`sinds` contains the boolean indexing arrays to
    select rows by, while :code:`bounds` shows the corresponding parameter boundaries,
    for example::

        >>> bounds[13]
        ((0, (100.0, 300.0), 1), (1, (-60.0, -30.0), 0), (2, (0.2, 0.4), 0), (3, (30.0, 45.0), 1))

    where the first number indicates the column of the data file it corresponds to, and the
    tuple contains the parameter boundaries for that column. The last element is an index for the original parameter
    edge. Indeed we find::

        >>> data[sinds[13], :].min(axis=0)
        array([ 100.03881951,  -59.96803764,    0.20010631,   30.02361767])
        >>> data[sinds[13], :].max(axis=0)
        array([ 299.68649402,  -30.00117535,    0.39994843,   44.99060231])

    which are the boundaries we were expecting

    Parameters
    ----------
    pps : np.array N-D
        numpy array with N rows, containing the parameters to split the sample by
    limits : list of lists
        list of parameter limits, each element contains limits for a columna of pps

    Returns
    -------
    list, list, list
        list of boolean indices (one for each selection),
        list of param limits corresponding to each selection
        parameter limits expanded from sequence to list of tuples
    """

    parinds = np.arange(pps.shape[1], dtype=int)
    plpairs = []
    for pp, limit in zip(parinds, limits):
        plpair = []
        for i, lim in enumerate(limit[:-1]):
            plpair.append((pp, (limit[i], limit[i + 1]), i))
        plpairs.append(plpair)

    sinds = []
    bounds = []
    for pval in itertools.product(*plpairs):
        sval = np.ones(len(pps), dtype=bool)
        bounds.append(pval)
        for (pind, limit, tmp) in pval:
            sval *= (limit[0] <= pps[:, pind]) * (pps[:, pind] < limit[1])
        sinds.append(sval)

    sinds = np.array(sinds)
    return sinds, bounds, plpairs


[docs]def matchdd(pars, refpars, win=None, wref=None, bins=30):
    """
    Matches two D-dimensional distributions by reweighting individual objects

    New weights are assigned by comparing the normalized, weighted histograms
    of the two datasets

    Parameters
    ----------
    pars : np.ndarray
        data table to be reweighted, shape (N1, D)
    refpars : np.ndarray
        reference data table, shape (N2, D)
    win : np.array
        weights for the data table
    wref : np.array
        weights for the reference table
    bins : int or tuple
        number of bins, or tuple of bin edges

    Returns
    -------
    np.array
        new weights for the input catalog

    Notes
    ------
    *TODO: TO BE TESTED*

    """
    if win is None:
        win = np.ones(len(pars))
    if wref is None:
        wref = np.ones(len(refpars))

    refcounts, bin_edges = np.histogramdd(refpars, bins=bins, weights=wref, normed=True)
    counts = np.histogramdd(pars, bins=bin_edges, weights=win, normed=True)[0]
    wratio = safedivide(refcounts, counts)

    _digits = []
    for icol in np.arange(pars.shape[1]):
        _digits.append(np.digitize(pars[:, icol], bins=bin_edges[icol]))
    digits = np.vstack(_digits).T

    ww = np.zeros(len(pars))
    for i, dig in enumerate(digits):
        # checking that point is within bounds for each digitize...
        checkval = True
        for icol in np.arange(pars.shape[1]):
            if dig[icol] == 0 or dig[icol] == len(bin_edges[icol]):
                checkval *= False

        if checkval:
            val = tuple(np.array(dig) - 1)
            ww[i] = wratio[val] * win[i]

    return ww