http2: optimization of AppendHuffmanString

This uses a uint64 bit buffer so can append 4 bytes at a time, instead
of using append every byte.

GOAMD64=v1
name                   old time/op    new time/op    delta
AppendHuffmanString-4     316ns ± 3%     169ns ± 1%  -46.60%  (p=0.000 n=10+8)

GOAMD64=v3
name                   old time/op    new time/op    delta
AppendHuffmanString-4     263ns ± 4%     114ns ± 1%  -56.68%  (p=0.000 n=10+8)

Change-Id: Ib48a0e0711589dd1b311c377f61e30d7c18c2953
GitHub-Last-Rev: 0a3eb27feefe920ffd511f0c4f93dfad1cbe69ce
GitHub-Pull-Request: golang/net#131
Reviewed-on: https://go-review.googlesource.com/c/net/+/403434
Reviewed-by: Damien Neil <dneil@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
diff --git a/http2/hpack/hpack_test.go b/http2/hpack/hpack_test.go
index a361a2a..84f8b83 100644
--- a/http2/hpack/hpack_test.go
+++ b/http2/hpack/hpack_test.go
@@ -509,6 +509,25 @@
 	}
 }
 
+func BenchmarkAppendHuffmanString(b *testing.B) {
+	b.StopTimer()
+	expected, err := hex.DecodeString(strings.Replace("94e7 821d d7f2 e6c7 b335 dfdf cd5b 3960 d5af 2708 7f36 72c1 ab27 0fb5 291f 9587 3160 65c0 03ed 4ee5 b106 3d50 07",
+		" ", "", -1))
+	if err != nil {
+		b.Fatal(err)
+	}
+	buf := make([]byte, 0, len(expected))
+	b.ReportAllocs()
+	b.StartTimer()
+
+	for i := 0; i < b.N; i++ {
+		enc := AppendHuffmanString(buf, "foo=ASDJKHQKBZXOQWEOPIUAXQWEOIU; max-age=3600; version=1")
+		if string(enc) != string(expected) {
+			b.Fatalf("bogus output %q", enc)
+		}
+	}
+}
+
 func TestHuffmanMaxStrLen(t *testing.T) {
 	const msg = "Some string"
 	huff := AppendHuffmanString(nil, msg)
diff --git a/http2/hpack/huffman.go b/http2/hpack/huffman.go
index fe0b84c..20d083a 100644
--- a/http2/hpack/huffman.go
+++ b/http2/hpack/huffman.go
@@ -169,25 +169,50 @@
 // AppendHuffmanString appends s, as encoded in Huffman codes, to dst
 // and returns the extended buffer.
 func AppendHuffmanString(dst []byte, s string) []byte {
-	rembits := uint8(8)
-
+	// This relies on the maximum huffman code length being 30 (See tables.go huffmanCodeLen array)
+	// So if a uint64 buffer has less than 32 valid bits can always accommodate another huffmanCode.
+	var (
+		x uint64 // buffer
+		n uint   // number valid of bits present in x
+	)
 	for i := 0; i < len(s); i++ {
-		if rembits == 8 {
-			dst = append(dst, 0)
+		c := s[i]
+		n += uint(huffmanCodeLen[c])
+		x <<= huffmanCodeLen[c] % 64
+		x |= uint64(huffmanCodes[c])
+		if n >= 32 {
+			n %= 32             // Normally would be -= 32 but %= 32 informs compiler 0 <= n <= 31 for upcoming shift
+			y := uint32(x >> n) // Compiler doesn't combine memory writes if y isn't uint32
+			dst = append(dst, byte(y>>24), byte(y>>16), byte(y>>8), byte(y))
 		}
-		dst, rembits = appendByteToHuffmanCode(dst, rembits, s[i])
 	}
-
-	if rembits < 8 {
-		// special EOS symbol
-		code := uint32(0x3fffffff)
-		nbits := uint8(30)
-
-		t := uint8(code >> (nbits - rembits))
-		dst[len(dst)-1] |= t
+	// Add padding bits if necessary
+	if over := n % 8; over > 0 {
+		const (
+			eosCode    = 0x3fffffff
+			eosNBits   = 30
+			eosPadByte = eosCode >> (eosNBits - 8)
+		)
+		pad := 8 - over
+		x = (x << pad) | (eosPadByte >> over)
+		n += pad // 8 now divides into n exactly
 	}
-
-	return dst
+	// n in (0, 8, 16, 24, 32)
+	switch n / 8 {
+	case 0:
+		return dst
+	case 1:
+		return append(dst, byte(x))
+	case 2:
+		y := uint16(x)
+		return append(dst, byte(y>>8), byte(y))
+	case 3:
+		y := uint16(x >> 8)
+		return append(dst, byte(y>>8), byte(y), byte(x))
+	}
+	//	case 4:
+	y := uint32(x)
+	return append(dst, byte(y>>24), byte(y>>16), byte(y>>8), byte(y))
 }
 
 // HuffmanEncodeLength returns the number of bytes required to encode
@@ -199,35 +224,3 @@
 	}
 	return (n + 7) / 8
 }
-
-// appendByteToHuffmanCode appends Huffman code for c to dst and
-// returns the extended buffer and the remaining bits in the last
-// element. The appending is not byte aligned and the remaining bits
-// in the last element of dst is given in rembits.
-func appendByteToHuffmanCode(dst []byte, rembits uint8, c byte) ([]byte, uint8) {
-	code := huffmanCodes[c]
-	nbits := huffmanCodeLen[c]
-
-	for {
-		if rembits > nbits {
-			t := uint8(code << (rembits - nbits))
-			dst[len(dst)-1] |= t
-			rembits -= nbits
-			break
-		}
-
-		t := uint8(code >> (nbits - rembits))
-		dst[len(dst)-1] |= t
-
-		nbits -= rembits
-		rembits = 8
-
-		if nbits == 0 {
-			break
-		}
-
-		dst = append(dst, 0)
-	}
-
-	return dst, rembits
-}