src/image/jpeg/dct.go - go.git - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package jpeg

 // Discrete Cosine Transformation (DCT) implementations using the algorithm from
 // Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz,
 // “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989.
 // https://ieeexplore.ieee.org/document/266596
 //
 // Since the paper is paywalled, the rest of this comment gives a summary.
 //
 // A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7
 // and transforms them in place into the result values.
 //
 // The mathematical definition of the N-point 1D FDCT is:
 //
 //	X[k] = α_k Σ_n x[n] * cos (2n+1)*k*π/2N
 //
 // where α₀ = √2 and α_k = 1 for k > 0.
 //
 // For our purposes, N=8, so the angles end up being multiples of π/16.
 // The most direct implementation of this definition would require 64 multiplications.
 //
 // Loeffler's paper presents a more efficient computation that requires only
 // 11 multiplications and works in terms of three basic operations:
 //
 //  - A “butterfly” x0, x1 = x0+x1, x0-x1.
 //    The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2.
 //
 //  - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k.
 //
 //  - A rotation of x0, x1 by θ, defined as:
 //    x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ.
 //    The inverse is rotation by -θ.
 //
 // The algorithm proceeds in four stages:
 //
 // Stage 1:
 //  - butterfly x0, x7; x1, x6; x2, x5; x3, x4.
 //
 // Stage 2:
 //  - butterfly x0, x3; x1, x2
 //  - rotate x4, x7 by 3π/16
 //  - rotate x5, x6 by π/16.
 //
 // Stage 3:
 //  - butterfly x0, x1; x4, x6; x7, x5
 //  - rotate x2, x3 by 6π/16 and scale by √2.
 //
 // Stage 4:
 //  - butterfly x7, x4
 //  - scale x5, x6 by √2.
 //
 // Finally, the values are permuted. The permutation can be read as either:
 //  - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form)
 //  - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS)
 // The code below uses the second form to make it easier to merge adjacent stores.
 // (Note that unlike in recursive FFT implementations, the permutation here is
 // not always mapping indexes to their bit reversals.)
 //
 // As written above, the rotation requires four multiplications, but it can be
 // reduced to three by refactoring (see [dctBox] below), and the scaling in
 // stage 3 can be merged into the rotation constants, so the overall cost
 // of a 1D FDCT is 11 multiplies.
 //
 // The 1D inverse DCT (IDCT) is the 1D FDCT run backward
 // with all the basic operations inverted.

 // dctBox implements a 3-multiply, 3-add rotation+scaling.
 // Given x0, x1, k*cos θ, and k*sin θ, dctBox returns the
 // rotated and scaled coordinates.
 // (It is called dctBox because the rotate+scale operation
 // is drawn as a box in Figures 1 and 2 in the paper.)
 func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) {
 	// y0 = x0*kcos + x1*ksin
 	// y1 = -x0*ksin + x1*kcos
 	ksum := kcos * (x0 + x1)
 	y0 = ksum + (ksin-kcos)*x1
 	y1 = ksum - (kcos+ksin)*x0
 	return y0, y1
 }

 // A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT).
 // The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16,
 // but it is convenient to use int32s for intermediate storage,
 // so we define only a single block type of [8*8]int32.
 //
 // A 2D DCT is implemented as 1D DCTs over the rows and columns.
 //
 // dct_test.go defines a String method for nice printing in tests.
 type block [blockSize]int32

 const blockSize = 8 * 8

 // Note on Numerical Precision
 //
 // The inputs to both the FDCT and IDCT are uint8 values stored in a block,
 // and the outputs are int16s in the same block, but the overall operation
 // uses int32 values as fixed-point intermediate values.
 // In the code comments below, the notation “QN.M” refers to a
 // signed value of 1+N+M significant bits, one of which is the sign bit,
 // and M of which hold fractional (sub-integer) precision.
 // For example, 255 as a Q8.0 value is stored as int32(255),
 // while 255 as a Q8.1 value is stored as int32(510),
 // and 255.5 as a Q8.1 value is int32(511).
 // The notation UQN.M refers to an unsigned value of N+M significant bits.
 // See https://en.wikipedia.org/wiki/Q_(number_format) for more.
 //
 // In general we only need to keep about 16 significant bits, but it is more
 // efficient and somewhat more precise to let unnecessary fractional bits
 // accumulate and shift them away in bulk rather than after every operation.
 // As such, it is important to keep track of the number of fractional bits
 // in each variable at different points in the code, to avoid mistakes like
 // adding numbers with different fractional precisions, as well as to keep
 // track of the total number of bits, to avoid overflow. A comment like:
 //
 //	// x[123] now Q8.2.
 //
 // means that x1, x2, and x3 are all Q8.2 (11-bit) values.
 // Keeping extra precision bits also reduces the size of the errors introduced
 // by using right shift to approximate rounded division.

 // Constants needed for the implementation.
 // These are all 60-bit precision fixed-point constants.
 // The function c(val, b) rounds the constant to b bits.
 // c is simple enough that calls to it with constant args
 // are inlined and constant-propagated down to an inline constant.
 // Each constant is commented with its Ivy definition (see robpike.io/ivy),
 // using this scaling helper function:
 //
 //	op fix x = floor 0.5 + x * 2**60
 const (
 	cos1          = 1130768441178740757 // fix cos 1*pi/16
 	sin1          = 224923827593068887  // fix sin 1*pi/16
 	cos3          = 958619196450722178  // fix cos 3*pi/16
 	sin3          = 640528868967736374  // fix sin 3*pi/16
 	sqrt2         = 1630477228166597777 // fix sqrt 2
 	sqrt2_cos6    = 623956622067911264  // fix (sqrt 2)*cos 6*pi/16
 	sqrt2_sin6    = 1506364539328854985 // fix (sqrt 2)*sin 6*pi/16
 	sqrt2inv      = 815238614083298888  // fix 1/sqrt 2
 	sqrt2inv_cos6 = 311978311033955632  // fix (1/sqrt 2)*cos 6*pi/16
 	sqrt2inv_sin6 = 753182269664427492  // fix (1/sqrt 2)*sin 6*pi/16
 )

 func c(x uint64, bits int) int32 {
 	return int32((x + (1 << (59 - bits))) >> (60 - bits))
 }

 // fdct implements the forward DCT.
 // Inputs are UQ8.0; outputs are Q13.0.
 func fdct(b *block) {
 	fdctCols(b)
 	fdctRows(b)
 }

 // fdctCols applies the 1D DCT to the columns of b.
 // Inputs are UQ8.0 in [0,255] but interpreted as [-128,127].
 // Outputs are Q10.18.
 func fdctCols(b *block) {
 	for i := range 8 {
 		x0 := b[0*8+i]
 		x1 := b[1*8+i]
 		x2 := b[2*8+i]
 		x3 := b[3*8+i]
 		x4 := b[4*8+i]
 		x5 := b[5*8+i]
 		x6 := b[6*8+i]
 		x7 := b[7*8+i]

 		// x[01234567] are UQ8.0 in [0,255].

 		// Stage 1: four butterflies.
 		// In general a butterfly of QN.M inputs produces Q(N+1).M outputs.
 		// A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference.

 		x0, x7 = x0+x7, x0-x7
 		x1, x6 = x1+x6, x1-x6
 		x2, x5 = x2+x5, x2-x5
 		x3, x4 = x3+x4, x3-x4
 		// x[0123] now UQ9.0 in [0, 510].
 		// x[4567] now Q8.0 in [-255,255].

 		// Stage 2: two boxes and two butterflies.
 		// A box on QN.M inputs with B-bit constants
 		// produces Q(N+1).(M+B) outputs.
 		// (The +1 is from the addition.)

 		x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18))
 		x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18))
 		// x[47] now Q9.18 in [-354, 354].
 		// x[56] now Q9.18 in [-300, 300].

 		x0, x3 = x0+x3, x0-x3
 		x1, x2 = x1+x2, x1-x2
 		// x[01] now UQ10.0 in [0, 1020].
 		// x[23] now Q9.0 in [-510, 510].

 		// Stage 3: one box and three butterflies.

 		x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18))
 		// x[23] now Q10.18 in [-943, 943].

 		x0, x1 = x0+x1, x0-x1
 		// x0 now UQ11.0 in [0, 2040].
 		// x1 now Q10.0 in [-1020, 1020].

 		// Store x0, x1, x2, x3 to their permuted targets.
 		// The original +128 in every input value
 		// has cancelled out except in the “DC signal” x0.
 		// Subtracting 128*8 here is equivalent to subtracting 128
 		// from every input before we started, but cheaper.
 		// It also converts x0 from UQ11.18 to Q10.18.
 		b[0*8+i] = (x0 - 128*8) << 18
 		b[4*8+i] = x1 << 18
 		b[2*8+i] = x2
 		b[6*8+i] = x3

 		x4, x6 = x4+x6, x4-x6
 		x7, x5 = x7+x5, x7-x5
 		// x[4567] now Q10.18 in [-654, 654].

 		// Stage 4: two √2 scalings and one butterfly.

 		x5 = (x5 >> 12) * c(sqrt2, 12)
 		x6 = (x6 >> 12) * c(sqrt2, 12)
 		// x[56] still Q10.18 in [-925, 925] (= 654√2).
 		x7, x4 = x7+x4, x7-x4
 		// x[47] still Q10.18 in [-925, 925] (not Q11.18!).
 		// This is not obvious at all! See “Note on 925” below.

 		// Store x4 x5 x6 x7 to their permuted targets.
 		b[1*8+i] = x7
 		b[3*8+i] = x5
 		b[5*8+i] = x6
 		b[7*8+i] = x4
 	}
 }

 // fdctRows applies the 1D DCT to the rows of b.
 // Inputs are Q10.18; outputs are Q13.0.
 func fdctRows(b *block) {
 	for i := range 8 {
 		x := b[8*i : 8*i+8 : 8*i+8]
 		x0 := x[0]
 		x1 := x[1]
 		x2 := x[2]
 		x3 := x[3]
 		x4 := x[4]
 		x5 := x[5]
 		x6 := x[6]
 		x7 := x[7]

 		// x[01234567] are Q10.18 [-1020, 1020].

 		// Stage 1: four butterflies.

 		x0, x7 = x0+x7, x0-x7
 		x1, x6 = x1+x6, x1-x6
 		x2, x5 = x2+x5, x2-x5
 		x3, x4 = x3+x4, x3-x4
 		// x[01234567] now Q11.18 in [-2040, 2040].

 		// Stage 2: two boxes and two butterflies.

 		x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14))
 		x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14))
 		// x[47] now Q12.18 in [-2830, 2830].
 		// x[56] now Q12.18 in [-2400, 2400].
 		x0, x3 = x0+x3, x0-x3
 		x1, x2 = x1+x2, x1-x2
 		// x[01234567] now Q12.18 in [-4080, 4080].

 		// Stage 3: one box and three butterflies.

 		x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14))
 		// x[23] now Q13.18 in [-7539, 7539].
 		x0, x1 = x0+x1, x0-x1
 		// x[01] now Q13.18 in [-8160, 8160].
 		x4, x6 = x4+x6, x4-x6
 		x7, x5 = x7+x5, x7-x5
 		// x[4567] now Q13.18 in [-5230, 5230].

 		// Stage 4: two √2 scalings and one butterfly.

 		x5 = (x5 >> 14) * c(sqrt2, 14)
 		x6 = (x6 >> 14) * c(sqrt2, 14)
 		// x[56] still Q13.18 in [-7397, 7397] (= 5230√2).
 		x7, x4 = x7+x4, x7-x4
 		// x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246).
 		// See “Note on 925” below.

 		// Cut from Q13.18 to Q13.0.
 		x0 = (x0 + 1<<17) >> 18
 		x1 = (x1 + 1<<17) >> 18
 		x2 = (x2 + 1<<17) >> 18
 		x3 = (x3 + 1<<17) >> 18
 		x4 = (x4 + 1<<17) >> 18
 		x5 = (x5 + 1<<17) >> 18
 		x6 = (x6 + 1<<17) >> 18
 		x7 = (x7 + 1<<17) >> 18

 		// Note: Unlike in fdctCols, saved all stores for the end
 		// because they are adjacent memory locations and some systems
 		// can use multiword stores.
 		x[0] = x0
 		x[1] = x7
 		x[2] = x2
 		x[3] = x5
 		x[4] = x1
 		x[5] = x6
 		x[6] = x3
 		x[7] = x4
 	}
 }

 // “Note on 925”, deferred from above to avoid interrupting code.
 //
 // In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255].
 // Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve:
 //
 // Stage 2:
 //	x4 = b4*cos3 + b7*sin3
 //	x7 = -b4*sin3 + b7*cos3
 //	x5 = b5*cos1 + b6*sin1
 //	x6 = -b5*sin1 + b6*cos1
 //
 // Stage 3:
 //
 //	x4 = x4+x6 =  b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
 //	x6 = x4-x6 =  b4*cos3 + b7*sin3 + b5*sin1 - b6*cos1
 //	x7 = x7+x5 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1
 //	x5 = x7-x5 = -b4*sin3 + b7*cos3 - b5*cos1 - b6*sin1
 //
 // Stage 4:
 //
 //	x7 = x7+x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 + b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
 //	   = b4*(cos3-sin3) + b5*(cos1-sin1) + b6*(cos1+sin1) + b7*(cos3+sin3)
 //	   < 255*(0.2759 + 0.7857 + 1.1759 + 1.3871) = 255*3.6246 < 925.
 //
 //	x4 = x7-x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 - b4*cos3 - b7*sin3 + b5*sin1 - b6*cos1
 //	   = -b4*(cos3+sin3) + b5*(cos1+sin1) + b6*(sin1-cos1) + b7*(cos3-sin3)
 //	   < same 925.
 //
 // The fact that x5, x6 are also at most 925 is not a coincidence: we are computing
 // the same kinds of numbers for all four, just with different paths to them.
 //
 // In fdctRows, the same analysis applies, but the initial values are
 // in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395.

 // idct implements the inverse DCT.
 // Inputs are UQ8.0; outputs are Q10.3.
 func idct(b *block) {
 	// A 2D IDCT is a 1D IDCT on rows followed by columns.
 	idctRows(b)
 	idctCols(b)
 }

 // idctRows applies the 1D IDCT to the rows of b.
 // Inputs are UQ8.0; outputs are Q9.20.
 func idctRows(b *block) {
 	for i := range 8 {
 		x := b[8*i : 8*i+8 : 8*i+8]
 		x0 := x[0]
 		x7 := x[1]
 		x2 := x[2]
 		x5 := x[3]
 		x1 := x[4]
 		x6 := x[5]
 		x3 := x[6]
 		x4 := x[7]

 		// Run FDCT backward.
 		// Independent operations have been reordered somewhat
 		// to make precision tracking easier.
 		//
 		// Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly
 		// and carries with it an implicit divide by two: the extra bit
 		// is added to the precision, not the value size.

 		// x[01234567] are UQ8.0 in [0, 255].

 		// Stages 4, 3, 2: x0, x1, x2, x3.

 		x0 <<= 17
 		x1 <<= 17
 		// x0, x1 now UQ8.17.
 		x0, x1 = x0+x1, x0-x1
 		// x0 now UQ8.18 in [0, 255].
 		// x1 now Q7.18 in [-127½, 127½].

 		// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit.
 		x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18))
 		// x[23] now Q8.18 in [-236, 236].
 		x1, x2 = x1+x2, x1-x2
 		x0, x3 = x0+x3, x0-x3
 		// x[0123] now Q8.19 in [-246, 246].

 		// Stages 4, 3, 2: x4, x5, x6, x7.

 		x4 <<= 7
 		x7 <<= 7
 		// x[47] now UQ8.7
 		x7, x4 = x7+x4, x7-x4
 		// x7 now UQ8.8 in [0, 255].
 		// x4 now Q7.8 in [-127½, 127½].

 		x6 = x6 * c(sqrt2inv, 8)
 		x5 = x5 * c(sqrt2inv, 8)
 		// x[56] now UQ8.8 in [0, 181].
 		// Note that 1/√2 has five 0s in its binary representation after
 		// the 8th bit, so this multipliy is actually producing 12 bits of precision.

 		x7, x5 = x7+x5, x7-x5
 		x4, x6 = x4+x6, x4-x6
 		// x[4567] now Q8.9 in [-218, 218].

 		x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12))
 		x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12))
 		// x[4567] now Q9.19 in [-303, 303].

 		// Stage 1.

 		x0, x7 = x0+x7, x0-x7
 		x1, x6 = x1+x6, x1-x6
 		x2, x5 = x2+x5, x2-x5
 		x3, x4 = x3+x4, x3-x4
 		// x[01234567] now Q9.20 in [-275, 275].

 		// Note: we don't need all 20 bits of “precision”,
 		// but it is faster to let idctCols shift it away as part
 		// of other operations rather than downshift here.

 		x[0] = x0
 		x[1] = x1
 		x[2] = x2
 		x[3] = x3
 		x[4] = x4
 		x[5] = x5
 		x[6] = x6
 		x[7] = x7
 	}
 }

 // idctCols applies the 1D IDCT to the columns of b.
 // Inputs are Q9.20.
 // Outputs are Q10.3. That is, the result is the IDCT*8.
 func idctCols(b *block) {
 	for i := range 8 {
 		x0 := b[0*8+i]
 		x7 := b[1*8+i]
 		x2 := b[2*8+i]
 		x5 := b[3*8+i]
 		x1 := b[4*8+i]
 		x6 := b[5*8+i]
 		x3 := b[6*8+i]
 		x4 := b[7*8+i]

 		// x[012345678] are Q9.20.

 		// Start by adding 0.5 to x0 (the incoming DC signal).
 		// The butterflies will add it to all the other values,
 		// and then the final shifts will round properly.
 		x0 += 1 << 19

 		// Stages 4, 3, 2: x0, x1, x2, x3.

 		x0, x1 = (x0+x1)>>2, (x0-x1)>>2
 		// x[01] now Q9.19.
 		// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 1, so no new high bit.
 		x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12))
 		// x[0123] now Q9.19.

 		x1, x2 = x1+x2, x1-x2
 		x0, x3 = x0+x3, x0-x3
 		// x[0123] now Q9.20.

 		// Stages 4, 3, 2: x4, x5, x6, x7.

 		x7, x4 = x7+x4, x7-x4
 		// x[47] now Q9.21.

 		x5 = (x5 >> 13) * c(sqrt2inv, 14)
 		x6 = (x6 >> 13) * c(sqrt2inv, 14)
 		// x[56] now Q9.21.

 		x7, x5 = x7+x5, x7-x5
 		x4, x6 = x4+x6, x4-x6
 		// x[4567] now Q9.22.

 		x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12))
 		x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12))
 		// x[4567] now Q10.20.

 		x0, x7 = x0+x7, x0-x7
 		x1, x6 = x1+x6, x1-x6
 		x2, x5 = x2+x5, x2-x5
 		x3, x4 = x3+x4, x3-x4
 		// x[01234567] now Q10.21.

 		x0 >>= 18
 		x1 >>= 18
 		x2 >>= 18
 		x3 >>= 18
 		x4 >>= 18
 		x5 >>= 18
 		x6 >>= 18
 		x7 >>= 18
 		// x[01234567] now Q10.3.

 		b[0*8+i] = x0
 		b[1*8+i] = x1
 		b[2*8+i] = x2
 		b[3*8+i] = x3
 		b[4*8+i] = x4
 		b[5*8+i] = x5
 		b[6*8+i] = x6
 		b[7*8+i] = x7
 	}
 }
	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package jpeg

	// Discrete Cosine Transformation (DCT) implementations using the algorithm from
	// Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz,
	// “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989.
	// https://ieeexplore.ieee.org/document/266596
	//
	// Since the paper is paywalled, the rest of this comment gives a summary.
	//
	// A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7
	// and transforms them in place into the result values.
	//
	// The mathematical definition of the N-point 1D FDCT is:
	//
	// X[k] = α_k Σ_n x[n] * cos (2n+1)kπ/2N
	//
	// where α₀ = √2 and α_k = 1 for k > 0.
	//
	// For our purposes, N=8, so the angles end up being multiples of π/16.
	// The most direct implementation of this definition would require 64 multiplications.
	//
	// Loeffler's paper presents a more efficient computation that requires only
	// 11 multiplications and works in terms of three basic operations:
	//
	// - A “butterfly” x0, x1 = x0+x1, x0-x1.
	// The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2.
	//
	// - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k.
	//
	// - A rotation of x0, x1 by θ, defined as:
	// x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ.
	// The inverse is rotation by -θ.
	//
	// The algorithm proceeds in four stages:
	//
	// Stage 1:
	// - butterfly x0, x7; x1, x6; x2, x5; x3, x4.
	//
	// Stage 2:
	// - butterfly x0, x3; x1, x2
	// - rotate x4, x7 by 3π/16
	// - rotate x5, x6 by π/16.
	//
	// Stage 3:
	// - butterfly x0, x1; x4, x6; x7, x5
	// - rotate x2, x3 by 6π/16 and scale by √2.
	//
	// Stage 4:
	// - butterfly x7, x4
	// - scale x5, x6 by √2.
	//
	// Finally, the values are permuted. The permutation can be read as either:
	// - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form)
	// - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS)
	// The code below uses the second form to make it easier to merge adjacent stores.
	// (Note that unlike in recursive FFT implementations, the permutation here is
	// not always mapping indexes to their bit reversals.)
	//
	// As written above, the rotation requires four multiplications, but it can be
	// reduced to three by refactoring (see [dctBox] below), and the scaling in
	// stage 3 can be merged into the rotation constants, so the overall cost
	// of a 1D FDCT is 11 multiplies.
	//
	// The 1D inverse DCT (IDCT) is the 1D FDCT run backward
	// with all the basic operations inverted.

	// dctBox implements a 3-multiply, 3-add rotation+scaling.
	// Given x0, x1, kcos θ, and ksin θ, dctBox returns the
	// rotated and scaled coordinates.
	// (It is called dctBox because the rotate+scale operation
	// is drawn as a box in Figures 1 and 2 in the paper.)
	func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) {
	// y0 = x0kcos + x1ksin
	// y1 = -x0ksin + x1kcos
	ksum := kcos * (x0 + x1)
	y0 = ksum + (ksin-kcos)*x1
	y1 = ksum - (kcos+ksin)*x0
	return y0, y1
	}

	// A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT).
	// The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16,
	// but it is convenient to use int32s for intermediate storage,
	// so we define only a single block type of [8*8]int32.
	//
	// A 2D DCT is implemented as 1D DCTs over the rows and columns.
	//
	// dct_test.go defines a String method for nice printing in tests.
	type block [blockSize]int32

	const blockSize = 8 * 8

	// Note on Numerical Precision
	//
	// The inputs to both the FDCT and IDCT are uint8 values stored in a block,
	// and the outputs are int16s in the same block, but the overall operation
	// uses int32 values as fixed-point intermediate values.
	// In the code comments below, the notation “QN.M” refers to a
	// signed value of 1+N+M significant bits, one of which is the sign bit,
	// and M of which hold fractional (sub-integer) precision.
	// For example, 255 as a Q8.0 value is stored as int32(255),
	// while 255 as a Q8.1 value is stored as int32(510),
	// and 255.5 as a Q8.1 value is int32(511).
	// The notation UQN.M refers to an unsigned value of N+M significant bits.
	// See https://en.wikipedia.org/wiki/Q_(number_format) for more.
	//
	// In general we only need to keep about 16 significant bits, but it is more
	// efficient and somewhat more precise to let unnecessary fractional bits
	// accumulate and shift them away in bulk rather than after every operation.
	// As such, it is important to keep track of the number of fractional bits
	// in each variable at different points in the code, to avoid mistakes like
	// adding numbers with different fractional precisions, as well as to keep
	// track of the total number of bits, to avoid overflow. A comment like:
	//
	// // x[123] now Q8.2.
	//
	// means that x1, x2, and x3 are all Q8.2 (11-bit) values.
	// Keeping extra precision bits also reduces the size of the errors introduced
	// by using right shift to approximate rounded division.

	// Constants needed for the implementation.
	// These are all 60-bit precision fixed-point constants.
	// The function c(val, b) rounds the constant to b bits.
	// c is simple enough that calls to it with constant args
	// are inlined and constant-propagated down to an inline constant.
	// Each constant is commented with its Ivy definition (see robpike.io/ivy),
	// using this scaling helper function:
	//
	// op fix x = floor 0.5 + x * 2**60
	const (
	cos1 = 1130768441178740757 // fix cos 1*pi/16
	sin1 = 224923827593068887 // fix sin 1*pi/16
	cos3 = 958619196450722178 // fix cos 3*pi/16
	sin3 = 640528868967736374 // fix sin 3*pi/16
	sqrt2 = 1630477228166597777 // fix sqrt 2
	sqrt2_cos6 = 623956622067911264 // fix (sqrt 2)cos 6pi/16
	sqrt2_sin6 = 1506364539328854985 // fix (sqrt 2)sin 6pi/16
	sqrt2inv = 815238614083298888 // fix 1/sqrt 2
	sqrt2inv_cos6 = 311978311033955632 // fix (1/sqrt 2)cos 6pi/16
	sqrt2inv_sin6 = 753182269664427492 // fix (1/sqrt 2)sin 6pi/16
	)

	func c(x uint64, bits int) int32 {
	return int32((x + (1 << (59 - bits))) >> (60 - bits))
	}

	// fdct implements the forward DCT.
	// Inputs are UQ8.0; outputs are Q13.0.
	func fdct(b *block) {
	fdctCols(b)
	fdctRows(b)
	}

	// fdctCols applies the 1D DCT to the columns of b.
	// Inputs are UQ8.0 in [0,255] but interpreted as [-128,127].
	// Outputs are Q10.18.
	func fdctCols(b *block) {
	for i := range 8 {
	x0 := b[0*8+i]
	x1 := b[1*8+i]
	x2 := b[2*8+i]
	x3 := b[3*8+i]
	x4 := b[4*8+i]
	x5 := b[5*8+i]
	x6 := b[6*8+i]
	x7 := b[7*8+i]

	// x[01234567] are UQ8.0 in [0,255].

	// Stage 1: four butterflies.
	// In general a butterfly of QN.M inputs produces Q(N+1).M outputs.
	// A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference.

	x0, x7 = x0+x7, x0-x7
	x1, x6 = x1+x6, x1-x6
	x2, x5 = x2+x5, x2-x5
	x3, x4 = x3+x4, x3-x4
	// x[0123] now UQ9.0 in [0, 510].
	// x[4567] now Q8.0 in [-255,255].

	// Stage 2: two boxes and two butterflies.
	// A box on QN.M inputs with B-bit constants
	// produces Q(N+1).(M+B) outputs.
	// (The +1 is from the addition.)

	x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18))
	x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18))
	// x[47] now Q9.18 in [-354, 354].
	// x[56] now Q9.18 in [-300, 300].

	x0, x3 = x0+x3, x0-x3
	x1, x2 = x1+x2, x1-x2
	// x[01] now UQ10.0 in [0, 1020].
	// x[23] now Q9.0 in [-510, 510].

	// Stage 3: one box and three butterflies.

	x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18))
	// x[23] now Q10.18 in [-943, 943].

	x0, x1 = x0+x1, x0-x1
	// x0 now UQ11.0 in [0, 2040].
	// x1 now Q10.0 in [-1020, 1020].

	// Store x0, x1, x2, x3 to their permuted targets.
	// The original +128 in every input value
	// has cancelled out except in the “DC signal” x0.
	// Subtracting 128*8 here is equivalent to subtracting 128
	// from every input before we started, but cheaper.
	// It also converts x0 from UQ11.18 to Q10.18.
	b[08+i] = (x0 - 1288) << 18
	b[4*8+i] = x1 << 18
	b[2*8+i] = x2
	b[6*8+i] = x3

	x4, x6 = x4+x6, x4-x6
	x7, x5 = x7+x5, x7-x5
	// x[4567] now Q10.18 in [-654, 654].

	// Stage 4: two √2 scalings and one butterfly.

	x5 = (x5 >> 12) * c(sqrt2, 12)
	x6 = (x6 >> 12) * c(sqrt2, 12)
	// x[56] still Q10.18 in [-925, 925] (= 654√2).
	x7, x4 = x7+x4, x7-x4
	// x[47] still Q10.18 in [-925, 925] (not Q11.18!).
	// This is not obvious at all! See “Note on 925” below.

	// Store x4 x5 x6 x7 to their permuted targets.
	b[1*8+i] = x7
	b[3*8+i] = x5
	b[5*8+i] = x6
	b[7*8+i] = x4
	}
	}

	// fdctRows applies the 1D DCT to the rows of b.
	// Inputs are Q10.18; outputs are Q13.0.
	func fdctRows(b *block) {
	for i := range 8 {
	x := b[8i : 8i+8 : 8*i+8]
	x0 := x[0]
	x1 := x[1]
	x2 := x[2]
	x3 := x[3]
	x4 := x[4]
	x5 := x[5]
	x6 := x[6]
	x7 := x[7]

	// x[01234567] are Q10.18 [-1020, 1020].

	// Stage 1: four butterflies.

	x0, x7 = x0+x7, x0-x7
	x1, x6 = x1+x6, x1-x6
	x2, x5 = x2+x5, x2-x5
	x3, x4 = x3+x4, x3-x4
	// x[01234567] now Q11.18 in [-2040, 2040].

	// Stage 2: two boxes and two butterflies.

	x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14))
	x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14))
	// x[47] now Q12.18 in [-2830, 2830].
	// x[56] now Q12.18 in [-2400, 2400].
	x0, x3 = x0+x3, x0-x3
	x1, x2 = x1+x2, x1-x2
	// x[01234567] now Q12.18 in [-4080, 4080].

	// Stage 3: one box and three butterflies.

	x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14))
	// x[23] now Q13.18 in [-7539, 7539].
	x0, x1 = x0+x1, x0-x1
	// x[01] now Q13.18 in [-8160, 8160].
	x4, x6 = x4+x6, x4-x6
	x7, x5 = x7+x5, x7-x5
	// x[4567] now Q13.18 in [-5230, 5230].

	// Stage 4: two √2 scalings and one butterfly.

	x5 = (x5 >> 14) * c(sqrt2, 14)
	x6 = (x6 >> 14) * c(sqrt2, 14)
	// x[56] still Q13.18 in [-7397, 7397] (= 5230√2).
	x7, x4 = x7+x4, x7-x4
	// x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246).
	// See “Note on 925” below.

	// Cut from Q13.18 to Q13.0.
	x0 = (x0 + 1<<17) >> 18
	x1 = (x1 + 1<<17) >> 18
	x2 = (x2 + 1<<17) >> 18
	x3 = (x3 + 1<<17) >> 18
	x4 = (x4 + 1<<17) >> 18
	x5 = (x5 + 1<<17) >> 18
	x6 = (x6 + 1<<17) >> 18
	x7 = (x7 + 1<<17) >> 18

	// Note: Unlike in fdctCols, saved all stores for the end
	// because they are adjacent memory locations and some systems
	// can use multiword stores.
	x[0] = x0
	x[1] = x7
	x[2] = x2
	x[3] = x5
	x[4] = x1
	x[5] = x6
	x[6] = x3
	x[7] = x4
	}
	}

	// “Note on 925”, deferred from above to avoid interrupting code.
	//
	// In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255].
	// Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve:
	//
	// Stage 2:
	// x4 = b4cos3 + b7sin3
	// x7 = -b4sin3 + b7cos3
	// x5 = b5cos1 + b6sin1
	// x6 = -b5sin1 + b6cos1
	//
	// Stage 3:
	//
	// x4 = x4+x6 = b4cos3 + b7sin3 - b5sin1 + b6cos1
	// x6 = x4-x6 = b4cos3 + b7sin3 + b5sin1 - b6cos1
	// x7 = x7+x5 = -b4sin3 + b7cos3 + b5cos1 + b6sin1
	// x5 = x7-x5 = -b4sin3 + b7cos3 - b5cos1 - b6sin1
	//
	// Stage 4:
	//
	// x7 = x7+x4 = -b4sin3 + b7cos3 + b5cos1 + b6sin1 + b4cos3 + b7sin3 - b5sin1 + b6cos1
	// = b4(cos3-sin3) + b5(cos1-sin1) + b6(cos1+sin1) + b7(cos3+sin3)
	// < 255(0.2759 + 0.7857 + 1.1759 + 1.3871) = 2553.6246 < 925.
	//
	// x4 = x7-x4 = -b4sin3 + b7cos3 + b5cos1 + b6sin1 - b4cos3 - b7sin3 + b5sin1 - b6cos1
	// = -b4(cos3+sin3) + b5(cos1+sin1) + b6(sin1-cos1) + b7(cos3-sin3)
	// < same 925.
	//
	// The fact that x5, x6 are also at most 925 is not a coincidence: we are computing
	// the same kinds of numbers for all four, just with different paths to them.
	//
	// In fdctRows, the same analysis applies, but the initial values are
	// in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395.

	// idct implements the inverse DCT.
	// Inputs are UQ8.0; outputs are Q10.3.
	func idct(b *block) {
	// A 2D IDCT is a 1D IDCT on rows followed by columns.
	idctRows(b)
	idctCols(b)
	}

	// idctRows applies the 1D IDCT to the rows of b.
	// Inputs are UQ8.0; outputs are Q9.20.
	func idctRows(b *block) {
	for i := range 8 {
	x := b[8i : 8i+8 : 8*i+8]
	x0 := x[0]
	x7 := x[1]
	x2 := x[2]
	x5 := x[3]
	x1 := x[4]
	x6 := x[5]
	x3 := x[6]
	x4 := x[7]

	// Run FDCT backward.
	// Independent operations have been reordered somewhat
	// to make precision tracking easier.
	//
	// Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly
	// and carries with it an implicit divide by two: the extra bit
	// is added to the precision, not the value size.

	// x[01234567] are UQ8.0 in [0, 255].

	// Stages 4, 3, 2: x0, x1, x2, x3.

	x0 <<= 17
	x1 <<= 17
	// x0, x1 now UQ8.17.
	x0, x1 = x0+x1, x0-x1
	// x0 now UQ8.18 in [0, 255].
	// x1 now Q7.18 in [-127½, 127½].

	// Note: (1/sqrt 2)((cos 6pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit.
	x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18))
	// x[23] now Q8.18 in [-236, 236].
	x1, x2 = x1+x2, x1-x2
	x0, x3 = x0+x3, x0-x3
	// x[0123] now Q8.19 in [-246, 246].

	// Stages 4, 3, 2: x4, x5, x6, x7.

	x4 <<= 7
	x7 <<= 7
	// x[47] now UQ8.7
	x7, x4 = x7+x4, x7-x4
	// x7 now UQ8.8 in [0, 255].
	// x4 now Q7.8 in [-127½, 127½].

	x6 = x6 * c(sqrt2inv, 8)
	x5 = x5 * c(sqrt2inv, 8)
	// x[56] now UQ8.8 in [0, 181].
	// Note that 1/√2 has five 0s in its binary representation after
	// the 8th bit, so this multipliy is actually producing 12 bits of precision.

	x7, x5 = x7+x5, x7-x5
	x4, x6 = x4+x6, x4-x6
	// x[4567] now Q8.9 in [-218, 218].

	x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12))
	x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12))
	// x[4567] now Q9.19 in [-303, 303].

	// Stage 1.

	x0, x7 = x0+x7, x0-x7
	x1, x6 = x1+x6, x1-x6
	x2, x5 = x2+x5, x2-x5
	x3, x4 = x3+x4, x3-x4
	// x[01234567] now Q9.20 in [-275, 275].

	// Note: we don't need all 20 bits of “precision”,
	// but it is faster to let idctCols shift it away as part
	// of other operations rather than downshift here.

	x[0] = x0
	x[1] = x1
	x[2] = x2
	x[3] = x3
	x[4] = x4
	x[5] = x5
	x[6] = x6
	x[7] = x7
	}
	}

	// idctCols applies the 1D IDCT to the columns of b.
	// Inputs are Q9.20.
	// Outputs are Q10.3. That is, the result is the IDCT*8.
	func idctCols(b *block) {
	for i := range 8 {
	x0 := b[0*8+i]
	x7 := b[1*8+i]
	x2 := b[2*8+i]
	x5 := b[3*8+i]
	x1 := b[4*8+i]
	x6 := b[5*8+i]
	x3 := b[6*8+i]
	x4 := b[7*8+i]

	// x[012345678] are Q9.20.

	// Start by adding 0.5 to x0 (the incoming DC signal).
	// The butterflies will add it to all the other values,
	// and then the final shifts will round properly.
	x0 += 1 << 19

	// Stages 4, 3, 2: x0, x1, x2, x3.

	x0, x1 = (x0+x1)>>2, (x0-x1)>>2
	// x[01] now Q9.19.
	// Note: (1/sqrt 2)((cos 6pi/16)+(sin 6*pi/16)) < 1, so no new high bit.
	x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12))
	// x[0123] now Q9.19.

	x1, x2 = x1+x2, x1-x2
	x0, x3 = x0+x3, x0-x3
	// x[0123] now Q9.20.

	// Stages 4, 3, 2: x4, x5, x6, x7.

	x7, x4 = x7+x4, x7-x4
	// x[47] now Q9.21.

	x5 = (x5 >> 13) * c(sqrt2inv, 14)
	x6 = (x6 >> 13) * c(sqrt2inv, 14)
	// x[56] now Q9.21.

	x7, x5 = x7+x5, x7-x5
	x4, x6 = x4+x6, x4-x6
	// x[4567] now Q9.22.

	x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12))
	x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12))
	// x[4567] now Q10.20.

	x0, x7 = x0+x7, x0-x7
	x1, x6 = x1+x6, x1-x6
	x2, x5 = x2+x5, x2-x5
	x3, x4 = x3+x4, x3-x4
	// x[01234567] now Q10.21.

	x0 >>= 18
	x1 >>= 18
	x2 >>= 18
	x3 >>= 18
	x4 >>= 18
	x5 >>= 18
	x6 >>= 18
	x7 >>= 18
	// x[01234567] now Q10.3.

	b[0*8+i] = x0
	b[1*8+i] = x1
	b[2*8+i] = x2
	b[3*8+i] = x3
	b[4*8+i] = x4
	b[5*8+i] = x5
	b[6*8+i] = x6
	b[7*8+i] = x7
	}
	}