// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stats

import (
	"math"
)

// A UDist is the discrete probability distribution of the
// Mann-Whitney U statistic for a pair of samples of sizes N1 and N2.
//
// The details of computing this distribution with no ties can be
// found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of
// Whether one of Two Random Variables is Stochastically Larger than
// the Other". Annals of Mathematical Statistics 18 (1): 50–60.
// Computing this distribution in the presence of ties is described in
// Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer".
// Journal of the American Statistical Association 61 (315): 772-787
// and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
// 805-813 (the former paper contains details that are glossed over in
// the latter paper but has mathematical typesetting issues, so it's
// easiest to get the context from the latter paper and the details
// from the former).
type UDist struct {
	N1, N2 int

	// T is the count of the number of ties at each rank in the
	// input distributions. T may be nil, in which case it is
	// assumed there are no ties (which is equivalent to an
	// (N1+N2)-length slice of 1s). It must be the case that
	// Sum(T) == N1+N2.
	T []int
}
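
// As an illustration of the T convention (a sketch, not used elsewhere
// in this package; the helper name is illustrative): pooling the
// samples {1, 2, 2} and {2, 3} gives the sorted values 1, 2, 2, 2, 3,
// so there are three distinct ranks with tie counts 1, 3, and 1, and
// Sum(T) == N1+N2 == 5.
func exampleTiedUDist() UDist {
	return UDist{N1: 3, N2: 2, T: []int{1, 3, 1}}
}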

// hasTies returns true if d has any tied samples.
func (d UDist) hasTies() bool {
	for _, t := range d.T {
		if t > 1 {
			return true
		}
	}
	return false
}

// p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947
// for values of U from 0 up to and including the U argument.
//
// This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite
// fast for small values of N1 and N2. However, it does not handle ties.
func (d UDist) p(U int) []float64 {
	// This is a dynamic programming implementation of the
	// recurrence relation given by Mann and Whitney:
	//
	//   p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m)
	//   p_{n,m}(U) = 0                             if U < 0
	//   p_{0,m}(U) = p_{n,0}(U) = 1 / nCr(m+n, n)  if U == 0
	//                           = 0                if U > 0
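	//
	// For example, with n = m = 1, p_{1,1}(0) = (1*p_{0,1}(-1) +
	// 1*p_{1,0}(0)) / 2 = 1/2 and p_{1,1}(1) = (1*p_{0,1}(0) +
	// 1*p_{1,0}(1)) / 2 = 1/2, matching the two equally likely
	// orderings of one X and one Y.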
	//
	// (Note that there is a typo in the original paper. The first
	// recursive application of p should be for U-m, not U-M.)
	//
	// Since p_{n,m} only depends on p_{n-1,m} and p_{n,m-1}, we
	// only need to store one "plane" of the three dimensional
	// space at a time.
	//
	// Furthermore, p_{n,m} = p_{m,n}, so we only construct values
	// for n <= m and obtain the rest through symmetry.
	//
	// We organize the computed values of p as follows:
	//
	//       n →   N
	//     m *
	//     ↓ * *
	//       * * *
	//       * * * *
	//       * * * *
	//     M * * * *
	//
	// where each * is a slice indexed by U. The code below
	// computes these left-to-right, top-to-bottom, so it only
	// stores one row of this matrix at a time. Furthermore,
	// computing an element in a given U slice only depends on the
	// same and smaller values of U, so we can overwrite the U
	// slice we're computing in place as long as we start with the
	// largest value of U. Finally, even though the recurrence
	// depends on (n,m) above the diagonal and we use symmetry to
	// mirror those across the diagonal to (m,n), the mirrored
	// indexes are always available in the current row, so this
	// mirroring does not interfere with our ability to recycle
	// state.

	N, M := d.N1, d.N2
	if N > M {
		N, M = M, N
	}

	memo := make([][]float64, N+1)
	for n := range memo {
		memo[n] = make([]float64, U+1)
	}

	for m := 0; m <= M; m++ {
		// Compute p_{0,m}. This is zero except for U=0.
		memo[0][0] = 1

		// Compute the remainder of this row.
		nlim := N
		if m < nlim {
			nlim = m
		}
		for n := 1; n <= nlim; n++ {
			lp := memo[n-1] // p_{n-1,m}
			var rp []float64
			if n <= m-1 {
				rp = memo[n] // p_{n,m-1}
			} else {
				rp = memo[m-1] // p_{m-1,n} and m == n
			}

			// For a given n,m, U is at most n*m.
			//
			// TODO: Actually, it's at most ⌈n*m/2⌉, but
			// then we need to use more complex symmetries
			// in the inner loop below.
			ulim := n * m
			if U < ulim {
				ulim = U
			}

			out := memo[n] // p_{n,m}
			nplusm := float64(n + m)
			for U1 := ulim; U1 >= 0; U1-- {
				l := 0.0
				if U1-m >= 0 {
					l = float64(n) * lp[U1-m]
				}
				r := float64(m) * rp[U1]
				out[U1] = (l + r) / nplusm
			}
		}
	}
	return memo[N]
}
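
// A small illustrative check of p (a sketch, not used elsewhere in
// this package; the helper name is illustrative): with N1 = N2 = 2 and
// no ties there are nCr(4, 2) = 6 equally likely orderings, U ranges
// over 0..4 with counts 1, 1, 2, 1, 1, so p returns
// [1/6, 1/6, 2/6, 1/6, 1/6].
func examplePNoTies() []float64 {
	d := UDist{N1: 2, N2: 2}
	return d.p(4)
}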

type ukey struct {
	n1   int // size of first sample
	twoU int // 2*U statistic for this permutation (doubled so it stays integral when ties make U half-integral)
}

// This computes the cumulative counts of the Mann-Whitney U
// distribution in the presence of ties using the computation from
// Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
// 805-813, with much guidance from appendix L of Klotz, A
// Computational Approach to Statistics.
//
// makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the
// number of ranks (up to len(t)), n1 is the size of the first sample
// (up to the n1 argument), and U is the U statistic (up to the
// argument twoU/2). The value of an entry in the memo table is the
// number of permutations of a sample of size n1 in a ranking with tie
// vector t[:K] having a U statistic <= U.
func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 {
	// Another candidate for a fast implementation is van de Wiel,
	// "The split-up algorithm: a fast symbolic method for
	// computing p-values of distribution-free statistics". This
	// is what's used by R's coin package. It's a comparatively
	// recent publication, so it's presumably faster (or perhaps
	// just more general) than previous techniques, but I can't
	// get my hands on the paper.
	//
	// TODO: ~40% of this function's time is spent in mapassign on
	// the assignment lines in the two loops and another ~20% in
	// map access and iteration. Improving map behavior or
	// replacing the maps altogether with some other constant-time
	// structure could double performance.
	//
	// TODO: The worst case for this function is when there are
	// few ties. Yet the best case overall is when there are *no*
	// ties. Can we get the best of both worlds? Use the fast
	// algorithm for the most part when there are few ties and mix
	// in the general algorithm just where we need it? That's
	// certainly possible for sub-problems where t[:k] has no
	// ties, but that doesn't help if t[0] has a tie but nothing
	// else does. Is it possible to rearrange the ranks without
	// messing up our computation of the U statistic for
	// sub-problems?

	K := len(t)

	// Compute a coefficients. The a slice is indexed by k (a[0]
	// is unused).
	a := make([]int, K+1)
	a[1] = t[0]
	for k := 2; k <= K; k++ {
		a[k] = a[k-1] + t[k-2] + t[k-1]
	}
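	// For example, for the tie vector t = {1, 3, 1} this gives
	// a = [_, 1, 5, 9]: each a[k] equals twice the midrank shared
	// by the k-th group of tied values, minus one (midranks 1, 3,
	// and 5).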

	// Create the memo table for the counts function, A. The A
	// slice is indexed by k (A[0] is unused).
	//
	// In "The Mann Whitney Wilcoxon Distribution Using Linked
	// Lists", they use linked lists (*gasp*) for this, but within
	// each K it's really just a memoization table, so it's faster
	// to use a map. The outer structure is a slice indexed by k
	// because we need to find all memo entries with certain
	// values of k.
	//
	// TODO: The n1 and twoU values in the ukeys follow strict
	// patterns. For each K value, the n1 values are every integer
	// between two bounds. For each (K, n1) value, the twoU values
	// are every integer multiple of a certain base between two
	// bounds. It might be worth turning these into directly
	// indexable slices.
	A := make([]map[ukey]float64, K+1)
	A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0}

	// Compute memo table (k, n1, twoU) triples from high K values
	// to low K values. This drives the recurrence relation
	// downward to figure out all of the needed argument triples.
	//
	// TODO: Is it possible to generate this table bottom-up? If
	// so, this could be a pure dynamic programming algorithm and
	// we could discard the K dimension. We could at least store
	// the inputs in a more compact representation that replaces
	// the twoU dimension with an interval and a step size (as
	// suggested by Cheung, Klotz, not that they make it at all
	// clear *why* they're suggesting this).
	tsum := sumint(t) // always ∑ t[0:k]
	for k := K - 1; k >= 2; k-- {
		tsum -= t[k]
		A[k] = make(map[ukey]float64)

		// Construct A[k] from A[k+1].
		for A_kplus1 := range A[k+1] {
			rkLow := maxint(0, A_kplus1.n1-tsum)
			rkHigh := minint(A_kplus1.n1, t[k])
			for rk := rkLow; rk <= rkHigh; rk++ {
				twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk)
				n1_k := A_kplus1.n1 - rk
				if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) {
					key := ukey{n1: n1_k, twoU: twoU_k}
					A[k][key] = 0
				}
			}
		}
	}

	// Fill counts in memo table from low K values to high K
	// values. This unwinds the recurrence relation.

	// Start with K==2 base case.
	//
	// TODO: Later computations depend on these, but these don't
	// depend on anything (including each other), so if K==2, we
	// can skip the memo table altogether.
	if K < 2 {
		panic("K < 2")
	}
	N_2 := t[0] + t[1]
	for A_2i := range A[2] {
		Asum := 0.0
		r2Low := maxint(0, A_2i.n1-t[0])
		r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2
		for r2 := r2Low; r2 <= r2High; r2++ {
			Asum += mathChoose(t[0], A_2i.n1-r2) *
				mathChoose(t[1], r2)
		}
		A[2][A_2i] = Asum
	}

	// Derive counts for the rest of the memo table.
	tsum = t[0] // always ∑ t[0:k-1]
	for k := 3; k <= K; k++ {
		tsum += t[k-2]

		// Compute A[k] counts from A[k-1] counts.
		for A_ki := range A[k] {
			Asum := 0.0
			rkLow := maxint(0, A_ki.n1-tsum)
			rkHigh := minint(A_ki.n1, t[k-1])
			for rk := rkLow; rk <= rkHigh; rk++ {
				twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk)
				n1_kminus1 := A_ki.n1 - rk
				x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}]
				if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 {
					x = mathChoose(tsum, n1_kminus1)
				}
				Asum += x * mathChoose(t[k-1], rk)
			}
			A[k][A_ki] = Asum
		}
	}

	return A
}
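
// A minimal sketch of how the memo table is consumed (assumed usage,
// mirroring the tied branch of UDist.CDF below; the function name is
// illustrative and unused): the count of permutations with statistic
// <= U is read from the final level K == len(t) and normalized by the
// total number of permutations, nCr(N1+N2, N1).
func exampleTiedCDF(twoU, n1 int, t []int) float64 {
	A := makeUmemo(twoU, n1, t)
	count, ok := A[len(t)][ukey{n1: n1, twoU: twoU}]
	if !ok {
		panic("makeUmemo did not return expected memoization table")
	}
	return count / mathChoose(sumint(t), n1)
}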

// twoUmin returns the minimum value of 2*U over all assignments of a
// sample of size n1 to a ranking with tie vector t, where a is the
// coefficient slice computed by makeUmemo.
func twoUmin(n1 int, t, a []int) int {
	K := len(t)
	twoU := -n1 * n1
	n1_k := n1
	for k := 1; k <= K; k++ {
		twoU_k := minint(n1_k, t[k-1])
		twoU += twoU_k * a[k]
		n1_k -= twoU_k
	}
	return twoU
}

// twoUmax returns the maximum value of 2*U over all assignments of a
// sample of size n1 to a ranking with tie vector t, where a is the
// coefficient slice computed by makeUmemo.
func twoUmax(n1 int, t, a []int) int {
	K := len(t)
	twoU := -n1 * n1
	n1_k := n1
	for k := K; k > 0; k-- {
		twoU_k := minint(n1_k, t[k-1])
		twoU += twoU_k * a[k]
		n1_k -= twoU_k
	}
	return twoU
}
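
// An illustrative sanity check of the bounds (a sketch, not used
// elsewhere in this package; the helper name is illustrative): with no
// ties, t is all 1s, a[k] == 2k-1, and the bounds collapse to the
// untied range of 2*U, namely 0 .. 2*N1*N2.
func exampleTwoUBounds() (int, int) {
	t := []int{1, 1, 1, 1, 1}    // K == N1+N2 == 5
	a := []int{0, 1, 3, 5, 7, 9} // a[0] is unused
	return twoUmin(2, t, a), twoUmax(2, t, a) // 0 and 2*2*3 == 12
}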

func (d UDist) PMF(U float64) float64 {
	if U < 0 || U >= 0.5+float64(d.N1*d.N2) {
		return 0
	}

	if d.hasTies() {
		// makeUmemo computes the CDF directly. Take its
		// difference to get the PMF.
		p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}]
		p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}]
		if !ok1 || !ok2 {
			panic("makeUmemo did not return expected memoization table")
		}
		return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1)
	}

	// There are no ties. Use the fast algorithm. U must be integral.
	Ui := int(math.Floor(U))
	// TODO: Use symmetry to minimize U
	return d.p(Ui)[Ui]
}

func (d UDist) CDF(U float64) float64 {
	if U < 0 {
		return 0
	} else if U >= float64(d.N1*d.N2) {
		return 1
	}

	if d.hasTies() {
		// TODO: Minimize U?
		p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}]
		if !ok {
			panic("makeUmemo did not return expected memoization table")
		}
		return p / mathChoose(d.N1+d.N2, d.N1)
	}

	// There are no ties. Use the fast algorithm. U must be integral.
	Ui := int(math.Floor(U))
	// The distribution is symmetric around U = m * n / 2. Sum up
	// whichever tail is smaller.
	flip := Ui >= (d.N1*d.N2+1)/2
	if flip {
		Ui = d.N1*d.N2 - Ui - 1
	}
	pdfs := d.p(Ui)
	p := 0.0
	for _, pdf := range pdfs[:Ui+1] {
		p += pdf
	}
	if flip {
		p = 1 - p
	}
	return p
}
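
// Illustrative use of the distribution (a sketch, not used elsewhere
// in this package; the helper name is illustrative): for sample sizes
// 3 and 2 with no ties there are nCr(5, 2) = 10 orderings with U
// counts 1, 1, 2, 2, 2, 1, 1 over U = 0..6, so PMF(1) is 1/10 and
// CDF(1) is 2/10.
func exampleUDistValues() (pmf, cdf float64) {
	d := UDist{N1: 3, N2: 2}
	return d.PMF(1), d.CDF(1)
}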

func (d UDist) Step() float64 {
	return 0.5
}

func (d UDist) Bounds() (float64, float64) {
	// TODO: More precise bounds when there are ties.
	return 0, float64(d.N1 * d.N2)
}