// Copyright 2015 The Go Authors. All rights reserved. | |

// Use of this source code is governed by a BSD-style | |

// license that can be found in the LICENSE file. | |

package stats | |

import ( | |

"math" | |

) | |

// A UDist is the discrete probability distribution of the | |

// Mann-Whitney U statistic for a pair of samples of sizes N1 and N2. | |

// | |

// The details of computing this distribution with no ties can be | |

// found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of | |

// Whether one of Two Random Variables is Stochastically Larger than | |

// the Other". Annals of Mathematical Statistics 18 (1): 50–60. | |

// Computing this distribution in the presence of ties is described in | |

// Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". | |

// Journal of the American Statistical Association 61 (315): 772-787 | |

// and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney | |

// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: | |

// 805-813 (the former paper contains details that are glossed over in | |

// the latter paper but has mathematical typesetting issues, so it's | |

// easiest to get the context from the former paper and the details | |

// from the latter). | |

type UDist struct { | |

N1, N2 int | |

// T is the count of the number of ties at each rank in the | |

// input distributions. T may be nil, in which case it is | |

// assumed there are no ties (which is equivalent to an M+N | |

// slice of 1s). It must be the case that Sum(T) == M+N. | |

T []int | |

} | |

// hasTies returns true if d has any tied samples. | |

func (d UDist) hasTies() bool { | |

for _, t := range d.T { | |

if t > 1 { | |

return true | |

} | |

} | |

return false | |

} | |

// p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947 | |

// for values of U from 0 up to and including the U argument. | |

// | |

// This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite | |

// fast for small values of N1 and N2. However, it does not handle ties. | |

func (d UDist) p(U int) []float64 { | |

// This is a dynamic programming implementation of the | |

// recursive recurrence definition given by Mann and Whitney: | |

// | |

// p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m) | |

// p_{n,m}(U) = 0 if U < 0 | |

// p_{0,m}(U) = p{n,0}(U) = 1 / nCr(m+n, n) if U = 0 | |

// = 0 if U > 0 | |

// | |

// (Note that there is a typo in the original paper. The first | |

// recursive application of p should be for U-m, not U-M.) | |

// | |

// Since p{n,m} only depends on p{n-1,m} and p{n,m-1}, we only | |

// need to store one "plane" of the three dimensional space at | |

// a time. | |

// | |

// Furthermore, p_{n,m} = p_{m,n}, so we only construct values | |

// for n <= m and obtain the rest through symmetry. | |

// | |

// We organize the computed values of p as followed: | |

// | |

// n → N | |

// m * | |

// ↓ * * | |

// * * * | |

// * * * * | |

// * * * * | |

// M * * * * | |

// | |

// where each * is a slice indexed by U. The code below | |

// computes these left-to-right, top-to-bottom, so it only | |

// stores one row of this matrix at a time. Furthermore, | |

// computing an element in a given U slice only depends on the | |

// same and smaller values of U, so we can overwrite the U | |

// slice we're computing in place as long as we start with the | |

// largest value of U. Finally, even though the recurrence | |

// depends on (n,m) above the diagonal and we use symmetry to | |

// mirror those across the diagonal to (m,n), the mirrored | |

// indexes are always available in the current row, so this | |

// mirroring does not interfere with our ability to recycle | |

// state. | |

N, M := d.N1, d.N2 | |

if N > M { | |

N, M = M, N | |

} | |

memo := make([][]float64, N+1) | |

for n := range memo { | |

memo[n] = make([]float64, U+1) | |

} | |

for m := 0; m <= M; m++ { | |

// Compute p_{0,m}. This is zero except for U=0. | |

memo[0][0] = 1 | |

// Compute the remainder of this row. | |

nlim := N | |

if m < nlim { | |

nlim = m | |

} | |

for n := 1; n <= nlim; n++ { | |

lp := memo[n-1] // p_{n-1,m} | |

var rp []float64 | |

if n <= m-1 { | |

rp = memo[n] // p_{n,m-1} | |

} else { | |

rp = memo[m-1] // p{m-1,n} and m==n | |

} | |

// For a given n,m, U is at most n*m. | |

// | |

// TODO: Actually, it's at most ⌈n*m/2⌉, but | |

// then we need to use more complex symmetries | |

// in the inner loop below. | |

ulim := n * m | |

if U < ulim { | |

ulim = U | |

} | |

out := memo[n] // p_{n,m} | |

nplusm := float64(n + m) | |

for U1 := ulim; U1 >= 0; U1-- { | |

l := 0.0 | |

if U1-m >= 0 { | |

l = float64(n) * lp[U1-m] | |

} | |

r := float64(m) * rp[U1] | |

out[U1] = (l + r) / nplusm | |

} | |

} | |

} | |

return memo[N] | |

} | |

type ukey struct { | |

n1 int // size of first sample | |

twoU int // 2*U statistic for this permutation | |

} | |

// This computes the cumulative counts of the Mann-Whitney U | |

// distribution in the presence of ties using the computation from | |

// Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney | |

// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: | |

// 805-813, with much guidance from appendix L of Klotz, A | |

// Computational Approach to Statistics. | |

// | |

// makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the | |

// number of ranks (up to len(t)), n1 is the size of the first sample | |

// (up to the n1 argument), and U is the U statistic (up to the | |

// argument twoU/2). The value of an entry in the memo table is the | |

// number of permutations of a sample of size n1 in a ranking with tie | |

// vector t[:K] having a U statistic <= U. | |

func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 { | |

// Another candidate for a fast implementation is van de Wiel, | |

// "The split-up algorithm: a fast symbolic method for | |

// computing p-values of distribution-free statistics". This | |

// is what's used by R's coin package. It's a comparatively | |

// recent publication, so it's presumably faster (or perhaps | |

// just more general) than previous techniques, but I can't | |

// get my hands on the paper. | |

// | |

// TODO: ~40% of this function's time is spent in mapassign on | |

// the assignment lines in the two loops and another ~20% in | |

// map access and iteration. Improving map behavior or | |

// replacing the maps altogether with some other constant-time | |

// structure could double performance. | |

// | |

// TODO: The worst case for this function is when there are | |

// few ties. Yet the best case overall is when there are *no* | |

// ties. Can we get the best of both worlds? Use the fast | |

// algorithm for the most part when there are few ties and mix | |

// in the general algorithm just where we need it? That's | |

// certainly possible for sub-problems where t[:k] has no | |

// ties, but that doesn't help if t[0] has a tie but nothing | |

// else does. Is it possible to rearrange the ranks without | |

// messing up our computation of the U statistic for | |

// sub-problems? | |

K := len(t) | |

// Compute a coefficients. The a slice is indexed by k (a[0] | |

// is unused). | |

a := make([]int, K+1) | |

a[1] = t[0] | |

for k := 2; k <= K; k++ { | |

a[k] = a[k-1] + t[k-2] + t[k-1] | |

} | |

// Create the memo table for the counts function, A. The A | |

// slice is indexed by k (A[0] is unused). | |

// | |

// In "The Mann Whitney Distribution Using Linked Lists", they | |

// use linked lists (*gasp*) for this, but within each K it's | |

// really just a memoization table, so it's faster to use a | |

// map. The outer structure is a slice indexed by k because we | |

// need to find all memo entries with certain values of k. | |

// | |

// TODO: The n1 and twoU values in the ukeys follow strict | |

// patterns. For each K value, the n1 values are every integer | |

// between two bounds. For each (K, n1) value, the twoU values | |

// are every integer multiple of a certain base between two | |

// bounds. It might be worth turning these into directly | |

// indexible slices. | |

A := make([]map[ukey]float64, K+1) | |

A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0} | |

// Compute memo table (k, n1, twoU) triples from high K values | |

// to low K values. This drives the recurrence relation | |

// downward to figure out all of the needed argument triples. | |

// | |

// TODO: Is it possible to generate this table bottom-up? If | |

// so, this could be a pure dynamic programming algorithm and | |

// we could discard the K dimension. We could at least store | |

// the inputs in a more compact representation that replaces | |

// the twoU dimension with an interval and a step size (as | |

// suggested by Cheung, Klotz, not that they make it at all | |

// clear *why* they're suggesting this). | |

tsum := sumint(t) // always ∑ t[0:k] | |

for k := K - 1; k >= 2; k-- { | |

tsum -= t[k] | |

A[k] = make(map[ukey]float64) | |

// Construct A[k] from A[k+1]. | |

for A_kplus1 := range A[k+1] { | |

rkLow := maxint(0, A_kplus1.n1-tsum) | |

rkHigh := minint(A_kplus1.n1, t[k]) | |

for rk := rkLow; rk <= rkHigh; rk++ { | |

twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk) | |

n1_k := A_kplus1.n1 - rk | |

if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) { | |

key := ukey{n1: n1_k, twoU: twoU_k} | |

A[k][key] = 0 | |

} | |

} | |

} | |

} | |

// Fill counts in memo table from low K values to high K | |

// values. This unwinds the recurrence relation. | |

// Start with K==2 base case. | |

// | |

// TODO: Later computations depend on these, but these don't | |

// depend on anything (including each other), so if K==2, we | |

// can skip the memo table altogether. | |

if K < 2 { | |

panic("K < 2") | |

} | |

N_2 := t[0] + t[1] | |

for A_2i := range A[2] { | |

Asum := 0.0 | |

r2Low := maxint(0, A_2i.n1-t[0]) | |

r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2 | |

for r2 := r2Low; r2 <= r2High; r2++ { | |

Asum += mathChoose(t[0], A_2i.n1-r2) * | |

mathChoose(t[1], r2) | |

} | |

A[2][A_2i] = Asum | |

} | |

// Derive counts for the rest of the memo table. | |

tsum = t[0] // always ∑ t[0:k-1] | |

for k := 3; k <= K; k++ { | |

tsum += t[k-2] | |

// Compute A[k] counts from A[k-1] counts. | |

for A_ki := range A[k] { | |

Asum := 0.0 | |

rkLow := maxint(0, A_ki.n1-tsum) | |

rkHigh := minint(A_ki.n1, t[k-1]) | |

for rk := rkLow; rk <= rkHigh; rk++ { | |

twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk) | |

n1_kminus1 := A_ki.n1 - rk | |

x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}] | |

if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 { | |

x = mathChoose(tsum, n1_kminus1) | |

} | |

Asum += x * mathChoose(t[k-1], rk) | |

} | |

A[k][A_ki] = Asum | |

} | |

} | |

return A | |

} | |

func twoUmin(n1 int, t, a []int) int { | |

K := len(t) | |

twoU := -n1 * n1 | |

n1_k := n1 | |

for k := 1; k <= K; k++ { | |

twoU_k := minint(n1_k, t[k-1]) | |

twoU += twoU_k * a[k] | |

n1_k -= twoU_k | |

} | |

return twoU | |

} | |

func twoUmax(n1 int, t, a []int) int { | |

K := len(t) | |

twoU := -n1 * n1 | |

n1_k := n1 | |

for k := K; k > 0; k-- { | |

twoU_k := minint(n1_k, t[k-1]) | |

twoU += twoU_k * a[k] | |

n1_k -= twoU_k | |

} | |

return twoU | |

} | |

func (d UDist) PMF(U float64) float64 { | |

if U < 0 || U >= 0.5+float64(d.N1*d.N2) { | |

return 0 | |

} | |

if d.hasTies() { | |

// makeUmemo computes the CDF directly. Take its | |

// difference to get the PMF. | |

p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}] | |

p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] | |

if !ok1 || !ok2 { | |

panic("makeUmemo did not return expected memoization table") | |

} | |

return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1) | |

} | |

// There are no ties. Use the fast algorithm. U must be integral. | |

Ui := int(math.Floor(U)) | |

// TODO: Use symmetry to minimize U | |

return d.p(Ui)[Ui] | |

} | |

func (d UDist) CDF(U float64) float64 { | |

if U < 0 { | |

return 0 | |

} else if U >= float64(d.N1*d.N2) { | |

return 1 | |

} | |

if d.hasTies() { | |

// TODO: Minimize U? | |

p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] | |

if !ok { | |

panic("makeUmemo did not return expected memoization table") | |

} | |

return p / mathChoose(d.N1+d.N2, d.N1) | |

} | |

// There are no ties. Use the fast algorithm. U must be integral. | |

Ui := int(math.Floor(U)) | |

// The distribution is symmetric around U = m * n / 2. Sum up | |

// whichever tail is smaller. | |

flip := Ui >= (d.N1*d.N2+1)/2 | |

if flip { | |

Ui = d.N1*d.N2 - Ui - 1 | |

} | |

pdfs := d.p(Ui) | |

p := 0.0 | |

for _, pdf := range pdfs[:Ui+1] { | |

p += pdf | |

} | |

if flip { | |

p = 1 - p | |

} | |

return p | |

} | |

func (d UDist) Step() float64 { | |

return 0.5 | |

} | |

func (d UDist) Bounds() (float64, float64) { | |

// TODO: More precise bounds when there are ties. | |

return 0, float64(d.N1 * d.N2) | |

} |