import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform


# NOTE: the function name and signature below are reconstructed from context;
# the original definition was not preserved in the source.
def mask_correlated(features, threshold, chunksize, _bottom=False):
    ''' Mask out features (columns) which are near duplicates of some other
    feature(s), based on correlation. The threshold parameter determines how
    similar two features may be before one of them is dropped.
    Roughly based on https://stackoverflow.com/a/66326102
    '''
    if (features.shape[1] > chunksize) and not _bottom:
        # There are two options here, which boil down to a question of how
        # information propagates between chunks:
        # 1. Binary-split recursion down to the chunksize. This will filter
        #    features more towards the bottom of the tree, assuming it's
        #    more likely for similar features to be near each other.
        # 2. A single-level tree, with (nearly) all nodes having the same
        #    number of features (i.e. chunksize). This performs most of the
        #    filtering at the root of the tree, when all nodes are combined
        #    after the initial filter.
        #
        # Paradoxically, option 1 appears to return a higher number of final
        # features in spite of applying more filters. This could be a
        # consequence of chained features: e.g. features A and B have a
        # distance of 0.015, B and C have a distance of 0.015, and A and C
        # have 0.03; with a threshold of 0.025, B and C would both be removed
        # if all three were within the same filter. If instead A and B were in
        # one filter (with B then removed), but C was in another, then the
        # final result would be A and C (see _demo_hamming_chain below).
        #
        # These chains (referred to as "Hamming chains" in the stackoverflow
        # post) can be arbitrarily long, and breaking them should intuitively
        # offer a better representation for ML model features. Option 1
        # therefore seems to be the better choice, along with using the
        # smallest chunksize possible - but note that this hasn't been
        # rigorously tested.
        recursion_size = features.shape[-1] // 2
        # (reconstructed) the rest of this branch was missing from the source:
        # recursively mask each chunk, then filter the survivors together.
        mask = np.concatenate([
            mask_correlated(features[:, i:i + recursion_size], threshold, chunksize)
            for i in range(0, features.shape[1], recursion_size)
        ])
        mask[mask] = mask_correlated(features[:, mask], threshold, chunksize, _bottom=True)
        return mask

    # Pairwise correlation distances between features; only the lower triangle
    # is kept, so each feature is compared against earlier features only.
    dist = squareform(pdist(features.T, metric='correlation'))
    dist[np.triu_indices_from(dist)] = -1
    # Drop any feature within the threshold of an earlier feature.
    mask = (0 <= dist) & (dist <= threshold)
    return ~np.any(mask, axis=1)
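

# A minimal sketch (not part of the original module) of the "Hamming chain"
# example described in the comments above: with d(A, B) = 0.015,
# d(B, C) = 0.015, d(A, C) = 0.03 and a threshold of 0.025, a single filter
# over all three features keeps only A, because B and C are each within the
# threshold of some earlier feature. The function name is illustrative only.
def _demo_hamming_chain(threshold=0.025):
    # Hand-built pairwise correlation distances for features A, B and C.
    dist = np.array([
        [0.000, 0.015, 0.030],   # A
        [0.015, 0.000, 0.015],   # B
        [0.030, 0.015, 0.000],   # C
    ])
    # Same masking steps as the base case of mask_correlated above.
    dist[np.triu_indices_from(dist)] = -1
    mask = (0 <= dist) & (dist <= threshold)
    return ~np.any(mask, axis=1)   # -> array([ True, False, False])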


# NOTE: the wrapper definition below is reconstructed; the original name and
# signature were not preserved in the source. It computes the Spearman
# correlation between each row of x and the corresponding row of y.
def spearman(x, y):
    ''' Scipy's provided spearmanr is too slow for large matrices, so we use a
    custom implementation instead.
    Source: https://stackoverflow.com/questions/52371329/fast-spearman-correlation-between-two-pandas-dataframes/59072032#59072032
    '''
    def rankdata_average(data):
        ''' Row-based rankdata, averaging the ranks of ties (the row-wise
        equivalent of scipy.stats.rankdata with method='average'). '''
        dc = np.asarray(data).copy()
        sorter = np.apply_along_axis(np.argsort, 1, data)

        # Inverse permutation: inv[i, j] is the sorted position of data[i, j]
        # within its row.
        inv = np.empty(data.shape, np.intp)
        ranks = np.tile(np.arange(data.shape[1]), (len(data), 1))
        np.put_along_axis(inv, sorter, ranks, axis=1)

        # Mark the start of each run of equal values in the sorted rows, then
        # build dense (tie-collapsed) ranks via a cumulative sum.
        dc = np.take_along_axis(dc, sorter, 1)
        res = np.apply_along_axis(lambda r: r[1:] != r[:-1], 1, dc)
        obs = np.column_stack([np.ones(len(res), dtype=bool), res])

        dense = np.take_along_axis(np.apply_along_axis(np.cumsum, 1, obs), inv, 1)

        nonzero = np.count_nonzero(obs, axis=1)
        nonzero = pd.Series(nonzero)
        dense = pd.DataFrame(dense)
        obs = pd.DataFrame(obs)

        # (reconstructed) the row length (appended as the final run boundary)
        # and the result accumulator were missing from the source.
        len_r = obs.shape[1]
        ranks = []

        # Group rows by their number of distinct values so that each group
        # yields a rectangular array of run-start positions.
        for _nonzero, nzdf in obs.groupby(nonzero, sort=False):
            nz = np.apply_along_axis(lambda r: np.nonzero(r)[0], 1, nzdf)

            _count = np.column_stack([nz, np.ones(len(nz)) * len_r])
            _dense = dense.reindex(nzdf.index).values

            # Average rank of each element: midpoint of its run's (1-based)
            # start and end positions.
            _result = 0.5 * (np.take_along_axis(_count, _dense, 1)
                             + np.take_along_axis(_count, _dense - 1, 1) + 1)
            result = pd.DataFrame(_result, index=nzdf.index)
            ranks.append(result)  # (reconstructed) missing from the source

        return pd.concat(ranks).sort_index()

    def compute_corr(x, y):
        # (reconstructed) the helper definition, the centering lines and the
        # final division were missing from the source; this follows the usual
        # row-wise Pearson formula that the surviving lines imply.
        def ss(a, axis):
            return np.sum(a * a, axis=axis)

        x, y = np.asarray(x), np.asarray(y)
        mx, my = np.mean(x, axis=-1), np.mean(y, axis=-1)
        xm, ym = x - mx[..., None], y - my[..., None]

        r_num = np.add.reduce(xm * ym, axis=-1)
        r_den = np.sqrt(ss(xm, axis=-1) * ss(ym, axis=-1))

        with np.errstate(divide='ignore', invalid='ignore'):
            # Constant rows produce NaN rather than a warning.
            return r_num / r_den

    # Spearman correlation is the Pearson correlation of the ranks.
    rx = rankdata_average(x)
    ry = rankdata_average(y)
    return compute_corr(rx, ry)
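

# A small sketch (not part of the original module) that sanity-checks the
# reconstructed spearman() against scipy.stats.spearmanr on a few row pairs;
# the function name, shapes and seed here are illustrative only.
def _check_spearman(n_rows=5, n_cols=50, seed=0):
    from scipy import stats

    rng = np.random.default_rng(seed)
    x = rng.normal(size=(n_rows, n_cols))
    y = rng.normal(size=(n_rows, n_cols))

    fast = np.asarray(spearman(x, y)).ravel()
    slow = np.array([stats.spearmanr(xi, yi)[0] for xi, yi in zip(x, y)])
    return np.allclose(fast, slow)   # expected to be True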