Source code for xpipe.tools.selector

"""
Manipulate arrays and parameter distributions
"""

import numpy as np
import itertools


[docs]def partition(lst, n): """ Divides a list into N roughly equal chunks Examples -------- Define some test list, and look at the obtained chunks with different :code:`n` values:: >>> lst = np.arange(20) >>> lst array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]) >>> partition(lst, n=5) [array([0, 1, 2, 3]), array([4, 5, 6, 7]), array([ 8, 9, 10, 11]), array([12, 13, 14, 15]), array([16, 17, 18, 19])] >>> partition(lst, n=6) [array([0, 1, 2]), array([3, 4, 5, 6]), array([7, 8, 9]), array([10, 11, 12]), array([13, 14, 15, 16]), array([17, 18, 19])] As we can see, even when :code:`n` is not a divisor of :code:`len(lst)`, it returns roughly balanced chunks Parameters ---------- lst : list list to split up n : int chunks to make Returns ------- list of lists list of chunks """ division = len(lst) / float(n) return [lst[int(round(division * i)): int(round(division * (i + 1)))] for i in range(n) ]
[docs]def safedivide(x, y, eps=1e-8): """ Calculates x / y for arrays, setting result to zero if x ~ 0 OR y ~ 0 Defined such that :math:`5 / 5 \\rightarrow 1` but :math:`5 / 0 \\rightarrow 0` and :math:`0 / 5 \\rightarrow 0` and :math:`0 / 0 \\rightarrow 0` Parameters ---------- x : np.array y : np.array eps : float threshold for setting element to zero Returns ------- np.array x / y where (xabs > eps) and (yabs > eps), 0 elsewhere """ xabs = np.abs(x) yabs = np.abs(y) gind = np.where((xabs > eps) * (yabs > eps)) res = np.zeros(shape=xabs.shape) res[gind] = x[gind] / y[gind] return res
[docs]def selector(pps, limits): """ Applies selection to array based on the passed parameter limits Selection is defined like :math:`x\\in[x_0;\\;x_1)` Examples -------- First make some mock data:: np.random.seed(seed = 5) nrows = 100000 ids = np.arange(nrows, dtype=int) ra = np.random.uniform(low=0., high=360., size=nrows) dec = np.random.uniform(low=-60., high=10., size=nrows) z = np.random.uniform(low=0.1, high=1.0, size=nrows) lamb = np.random.uniform(low=20, high=40, size=nrows) data = np.vstack((ra, dec, z, lamb)).T then define binning edges:: ra_edges = (0., 100., 300.) dec_edges = (-60., -30., 10.) z_edges = (0.2, 0.4, 0.8) lamb_edges = (20., 30., 45., 60.) edges = (ra_edges, dec_edges, z_edges, lamb_edges) This selection divides the data into 24 bins in 4-D space, which we can obtain perform as:: sinds, bounds, plpairs = sl.selector(data, edges) Let's inspect the output, :code:`sinds` contains the boolean indexing arrays to select rows by, while :code:`bounds` shows the corresponding parameter boundaries, for example:: >>> bounds[13] ((0, (100.0, 300.0), 1), (1, (-60.0, -30.0), 0), (2, (0.2, 0.4), 0), (3, (30.0, 45.0), 1)) where the first number indicates the column of the data file it corresponds to, and the tuple contains the parameter boundaries for that column. The last element is an index for the original parameter edge. Indeed we find:: >>> data[sinds[13], :].min(axis=0) array([ 100.03881951, -59.96803764, 0.20010631, 30.02361767]) >>> data[sinds[13], :].max(axis=0) array([ 299.68649402, -30.00117535, 0.39994843, 44.99060231]) which are the boundaries we were expecting Parameters ---------- pps : np.array N-D numpy array with N rows, containing the parameters to split the sample by limits : list of lists list of parameter limits, each element contains limits for a columna of pps Returns ------- list, list, list list of boolean indices (one for each selection), list of param limits corresponding to each selection parameter limits expanded from sequence to list of tuples """ parinds = np.arange(pps.shape[1], dtype=int) plpairs = [] for pp, limit in zip(parinds, limits): plpair = [] for i, lim in enumerate(limit[:-1]): plpair.append((pp, (limit[i], limit[i + 1]), i)) plpairs.append(plpair) sinds = [] bounds = [] for pval in itertools.product(*plpairs): sval = np.ones(len(pps), dtype=bool) bounds.append(pval) for (pind, limit, tmp) in pval: sval *= (limit[0] <= pps[:, pind]) * (pps[:, pind] < limit[1]) sinds.append(sval) sinds = np.array(sinds) return sinds, bounds, plpairs
[docs]def matchdd(pars, refpars, win=None, wref=None, bins=30): """ Matches two D-dimensional distributions by reweighting individual objects New weights are assigned by comparing the normalized, weighted histograms of the two datasets Parameters ---------- pars : np.ndarray data table to be reweighted, shape (N1, D) refpars : np.ndarray reference data table, shape (N2, D) win : np.array weights for the data table wref : np.array weights for the reference table bins : int or tuple number of bins, or tuple of bin edges Returns ------- np.array new weights for the input catalog Notes ------ *TODO: TO BE TESTED* """ if win is None: win = np.ones(len(pars)) if wref is None: wref = np.ones(len(refpars)) refcounts, bin_edges = np.histogramdd(refpars, bins=bins, weights=wref, normed=True) counts = np.histogramdd(pars, bins=bin_edges, weights=win, normed=True)[0] wratio = safedivide(refcounts, counts) _digits = [] for icol in np.arange(pars.shape[1]): _digits.append(np.digitize(pars[:, icol], bins=bin_edges[icol])) digits = np.vstack(_digits).T ww = np.zeros(len(pars)) for i, dig in enumerate(digits): # checking that point is within bounds for each digitize... checkval = True for icol in np.arange(pars.shape[1]): if dig[icol] == 0 or dig[icol] == len(bin_edges[icol]): checkval *= False if checkval: val = tuple(np.array(dig) - 1) ww[i] = wratio[val] * win[i] return ww