ccitt: add the AutoDetectHeight decoding option Before this commit, the decodeEOL method would usually not advance the bitReader, if the expected EOL code was not found. But in some circumstances, it would still advance if it saw a valid decoding mode, since EOL-decoding and mode-decoding was combined into a single table, even though EOL was not listed in Table 1 of the "ITU-T Recommendation T.6" spec. After this commit, EOL's and modes are separate. Change-Id: If4b83baf8083abdf14462576c56aaeb7cc8ff0f9 Reviewed-on: https://go-review.googlesource.com/c/image/+/211238 Reviewed-by: Benny Siegert <bsiegert@gmail.com>

commit: 58c23975cae11f062d4b3b0c143fe248faac195d [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Sat Dec 14 13:27:13 2019 +1100
committer: Nigel Tao <nigeltao@golang.org> Sun Jan 19 04:44:24 2020 +0000
tree: 9078269fb436afc24ce051ec1dd0676b4a354a2f
parent: 9130b4cfad522142c86367afe5e34ce811a85a4b [diff]
diff --git a/ccitt/gen.go b/ccitt/gen.go
index 39f81c2..b181428 100644
--- a/ccitt/gen.go
+++ b/ccitt/gen.go

@@ -307,7 +307,6 @@
 	modeVL2         // Vertical-Left-2
 	modeVL3         // Vertical-Left-3
 	modeExt         // Extension
-	modeEOL         // End-of-Line
 )
 
 // COPY PASTE table.go END
@@ -333,10 +332,6 @@
 	{modeVL2, "000010"},
 	{modeVL3, "0000010"},
 	{modeExt, "0000001"},
-
-	// End-of-Line is not in Table 1, but we look for it at the same time that
-	// we look for other mode codes.
-	{modeEOL, "000000000001"},
 }
 
 var whiteCodes = []code{

diff --git a/ccitt/reader.go b/ccitt/reader.go
index 9326e1c..340de05 100644
--- a/ccitt/reader.go
+++ b/ccitt/reader.go

@@ -49,6 +49,10 @@
 	Group4
 )
 
+// AutoDetectHeight is passed as the height argument to NewReader to indicate
+// that the image height (the number of rows) is not known in advance.
+const AutoDetectHeight = -1
+
 // Options are optional parameters.
 type Options struct {
 	// Align means that some variable-bit-width codes are byte-aligned.
@@ -213,6 +217,7 @@
 		}
 		bitsRead |= bit << (63 - nBitsRead)
 		nBitsRead++
+
 		// The "&1" is redundant, but can eliminate a bounds check.
 		state = int32(decodeTable[state][bit&1])
 		if state < 0 {
@@ -226,6 +231,35 @@
 	}
 }
 
+// decodeEOL decodes the 12-bit EOL code 0000_0000_0001.
+func decodeEOL(b *bitReader) error {
+	nBitsRead, bitsRead := uint32(0), uint64(0)
+	for {
+		bit, err := b.nextBit()
+		if err != nil {
+			if err == io.EOF {
+				err = errMissingEOL
+			}
+			return err
+		}
+		bitsRead |= bit << (63 - nBitsRead)
+		nBitsRead++
+
+		if nBitsRead < 12 {
+			if bit&1 == 0 {
+				continue
+			}
+		} else if bit&1 != 0 {
+			return nil
+		}
+
+		// Unread the bits we've read, then return errMissingEOL.
+		b.bits = (b.bits >> nBitsRead) | bitsRead
+		b.nBits += nBitsRead
+		return errMissingEOL
+	}
+}
+
 type reader struct {
 	br        bitReader
 	subFormat SubFormat
@@ -235,7 +269,10 @@
 
 	// rowsRemaining starts at the image height in pixels, when the reader is
 	// driven through the io.Reader interface, and decrements to zero as rows
-	// are decoded. When driven through DecodeIntoGray, this field is unused.
+	// are decoded. Alternatively, it may be negative if the image height is
+	// not known in advance at the time of the NewReader call.
+	//
+	// When driven through DecodeIntoGray, this field is unused.
 	rowsRemaining int
 
 	// curr and prev hold the current and previous rows. Each element is either
@@ -278,7 +315,12 @@
 	// (but otherwise padding to a byte boundary, with either all 0 bits or all
 	// 1 bits) is invalid according to the spec, but happens in practice when
 	// exporting from Adobe Acrobat to TIFF + CCITT. This package silently
-	// ignores CCITT input that has been truncated in that fashion.
+	// ignores the format error for CCITT input that has been truncated in that
+	// fashion, returning the full decoded image.
+	//
+	// Detecting trailer truncation (just after the final row of pixels)
+	// requires knowing which row is the final row, and therefore does not
+	// trigger if the image height is not known in advance.
 	truncated bool
 
 	// readErr is a sticky error for the Read method.
@@ -306,17 +348,50 @@
 
 		// Decode the next row, if necessary.
 		if z.atStartOfRow {
-			if z.rowsRemaining <= 0 {
-				if z.readErr = z.finishDecode(); z.readErr != nil {
+			if z.rowsRemaining < 0 {
+				// We do not know the image height in advance. See if the next
+				// code is an EOL. If it is, it is consumed. If it isn't, the
+				// bitReader shouldn't advance along the bit stream, and we
+				// simply decode another row of pixel data.
+				//
+				// For the Group4 subFormat, we may need to align to a byte
+				// boundary. For the Group3 subFormat, the previous z.decodeRow
+				// call (or z.startDecode call) has already consumed one of the
+				// 6 consecutive EOL's. The next EOL is actually the second of
+				// 6, in the middle, and we shouldn't align at that point.
+				if z.align && (z.subFormat == Group4) {
+					z.br.alignToByteBoundary()
+				}
+
+				if err := z.decodeEOL(); err == errMissingEOL {
+					// No-op. It's another row of pixel data.
+				} else if err != nil {
+					z.readErr = err
+					break
+				} else {
+					if z.readErr = z.finishDecode(true); z.readErr != nil {
+						break
+					}
+					z.readErr = io.EOF
+					break
+				}
+
+			} else if z.rowsRemaining == 0 {
+				// We do know the image height in advance, and we have already
+				// decoded exactly that many rows.
+				if z.readErr = z.finishDecode(false); z.readErr != nil {
 					break
 				}
 				z.readErr = io.EOF
 				break
+
+			} else {
+				z.rowsRemaining--
 			}
-			if z.readErr = z.decodeRow(z.rowsRemaining == 1); z.readErr != nil {
+
+			if z.readErr = z.decodeRow(z.rowsRemaining == 0); z.readErr != nil {
 				break
 			}
-			z.rowsRemaining--
 		}
 
 		// Pack from z.curr (1 byte per pixel) to p (1 bit per pixel).
@@ -363,7 +438,7 @@
 	return nil
 }
 
-func (z *reader) finishDecode() error {
+func (z *reader) finishDecode(alreadySeenEOL bool) error {
 	numberOfEOLs := 0
 	switch z.subFormat {
 	case Group3:
@@ -376,25 +451,31 @@
 		numberOfEOLs = 5
 
 	case Group4:
-		// The stream ends with two EOL's, the first of which is possibly
-		// byte-aligned.
-		numberOfEOLs = 2
-		if err := z.decodeEOL(); err == nil {
-			numberOfEOLs--
-		} else if err == errInvalidCode {
-			// Try again, this time starting from a byte boundary.
+		autoDetectHeight := z.rowsRemaining < 0
+		if autoDetectHeight {
+			// Aligning to a byte boundary was already handled by reader.Read.
+		} else if z.align {
 			z.br.alignToByteBoundary()
-		} else if err == errMissingEOL {
-			z.truncated = true
-			return nil
-		} else {
+		}
+		// The stream ends with two EOL's. If the first one is missing, and we
+		// had an explicit image height, we just assume that the trailing two
+		// EOL's were truncated and return a nil error.
+		if err := z.decodeEOL(); err != nil {
+			if (err == errMissingEOL) && !autoDetectHeight {
+				z.truncated = true
+				return nil
+			}
 			return err
 		}
+		numberOfEOLs = 1
 
 	default:
 		return errUnsupportedSubFormat
 	}
 
+	if alreadySeenEOL {
+		numberOfEOLs--
+	}
 	for ; numberOfEOLs > 0; numberOfEOLs-- {
 		if err := z.decodeEOL(); err != nil {
 			return err
@@ -404,19 +485,7 @@
 }
 
 func (z *reader) decodeEOL() error {
-	// TODO: EOL doesn't have to be in the modeDecodeTable. It could be in its
-	// own table, or we could just hard-code it, especially if we might need to
-	// cater for optional byte-alignment, or an arbitrary number (potentially
-	// more than 8) of 0-valued padding bits.
-	if mode, err := decode(&z.br, modeDecodeTable[:]); err != nil {
-		if err == errIncompleteCode {
-			return errMissingEOL
-		}
-		return err
-	} else if mode != modeEOL {
-		return errMissingEOL
-	}
-	return nil
+	return decodeEOL(&z.br)
 }
 
 func (z *reader) decodeRow(finalRow bool) error {
@@ -686,7 +755,7 @@
 		z.curr, z.prev = nil, z.curr
 	}
 
-	if err := z.finishDecode(); err != nil {
+	if err := z.finishDecode(false); err != nil {
 		return err
 	}
 
@@ -703,9 +772,12 @@
 // NewReader returns an io.Reader that decodes the CCITT-formatted data in r.
 // The resultant byte stream is one bit per pixel (MSB first), with 1 meaning
 // white and 0 meaning black. Each row in the result is byte-aligned.
+//
+// A negative height, such as passing AutoDetectHeight, means that the image
+// height is not known in advance. A negative width is invalid.
 func NewReader(r io.Reader, order Order, sf SubFormat, width int, height int, opts *Options) io.Reader {
 	readErr := error(nil)
-	if (width < 0) || (height < 0) {
+	if width < 0 {
 		readErr = errInvalidBounds
 	} else if width > maxWidth {
 		readErr = errUnsupportedWidth

diff --git a/ccitt/reader_test.go b/ccitt/reader_test.go
index d9b56fc..8de1c3d 100644
--- a/ccitt/reader_test.go
+++ b/ccitt/reader_test.go

@@ -222,7 +222,10 @@
 		modeH,
 		modeVL1,
 		modeVL1,
-		modeEOL,
+		// The exact value of this final slice element doesn't really matter,
+		// except that testDecodeTable assumes that it (the finalValue) is
+		// different from every previous element.
+		modeVL3,
 	})
 }
 
@@ -282,7 +285,7 @@
 	}
 }
 
-func testRead(t *testing.T, fileName string, sf SubFormat, align, invert bool) {
+func testRead(t *testing.T, fileName string, sf SubFormat, align, invert, truncated bool) {
 	t.Helper()
 
 	const width, height = 153, 55
@@ -369,6 +372,26 @@
 	if got != want {
 		t.Fatalf("got:\n%s\nwant:\n%s", format(got), format(want))
 	}
+
+	// Passing AutoDetectHeight should produce the same output, provided that
+	// the input hasn't truncated the trailing sequence of consecutive EOL's
+	// that marks the end of the image.
+	if !truncated {
+		f, err := os.Open(fileName)
+		if err != nil {
+			t.Fatalf("Open: %v", err)
+		}
+		defer f.Close()
+		adhBytes, err := ioutil.ReadAll(NewReader(f, MSB, sf, width, AutoDetectHeight, opts))
+		if err != nil {
+			t.Fatalf("ReadAll: %v", err)
+		}
+		if s := string(adhBytes); s != want {
+			t.Fatalf("AutoDetectHeight produced different output.\n"+
+				"height=%d:\n%s\nheight=%d:\n%s",
+				AutoDetectHeight, format(s), height, format(want))
+		}
+	}
 }
 
 func TestRead(t *testing.T) {
@@ -392,7 +415,8 @@
 		}
 		align := strings.Contains(fileName, "aligned")
 		invert := strings.Contains(fileName, "inverted")
-		testRead(t, fileName, subFormat, align, invert)
+		truncated := strings.Contains(fileName, "truncated")
+		testRead(t, fileName, subFormat, align, invert, truncated)
 	}
 }
 

diff --git a/ccitt/table.go b/ccitt/table.go
index f01cc12..ef7ea9d 100644
--- a/ccitt/table.go
+++ b/ccitt/table.go

@@ -31,17 +31,7 @@
 
 // modeDecodeTable represents Table 1 and the End-of-Line code.
 //
-//                              +=XXXXX
-// b015                       +-+
-//                            | +=v0010
-// b014                     +-+
-//                          | +=XXXXX
-// b013                   +-+
-//                        | +=XXXXX
-// b012                 +-+
-//                      | +=XXXXX
-// b011               +-+
-//                    | +=XXXXX
+//                    +=XXXXX
 // b009             +-+
 //                  | +=v0009
 // b007           +-+
@@ -72,13 +62,8 @@
 	6:  {7, 8},
 	7:  {9, 10},
 	8:  {^7, ^4},
-	9:  {11, ^9},
+	9:  {0, ^9},
 	10: {^8, ^5},
-	11: {12, 0},
-	12: {13, 0},
-	13: {14, 0},
-	14: {15, 0},
-	15: {0, ^10},
 }
 
 // whiteDecodeTable represents Tables 2 and 3 for a white run.
@@ -733,17 +718,16 @@
 
 // modeEncodeTable represents Table 1 and the End-of-Line code.
 var modeEncodeTable = [...]bitString{
-	0:  {0x0001, 4},  // "0001"
-	1:  {0x0001, 3},  // "001"
-	2:  {0x0001, 1},  // "1"
-	3:  {0x0003, 3},  // "011"
-	4:  {0x0003, 6},  // "000011"
-	5:  {0x0003, 7},  // "0000011"
-	6:  {0x0002, 3},  // "010"
-	7:  {0x0002, 6},  // "000010"
-	8:  {0x0002, 7},  // "0000010"
-	9:  {0x0001, 7},  // "0000001"
-	10: {0x0001, 12}, // "000000000001"
+	0: {0x0001, 4}, // "0001"
+	1: {0x0001, 3}, // "001"
+	2: {0x0001, 1}, // "1"
+	3: {0x0003, 3}, // "011"
+	4: {0x0003, 6}, // "000011"
+	5: {0x0003, 7}, // "0000011"
+	6: {0x0002, 3}, // "010"
+	7: {0x0002, 6}, // "000010"
+	8: {0x0002, 7}, // "0000010"
+	9: {0x0001, 7}, // "0000001"
 }
 
 // whiteEncodeTable2 represents Table 2 for a white run.
@@ -983,7 +967,6 @@
 	modeVL2         // Vertical-Left-2
 	modeVL3         // Vertical-Left-3
 	modeExt         // Extension
-	modeEOL         // End-of-Line
 )
 
 // COPY PASTE table.go END

diff --git a/ccitt/table_test.go b/ccitt/table_test.go
index f37077a..c9ff31f 100644
--- a/ccitt/table_test.go
+++ b/ccitt/table_test.go

@@ -20,10 +20,6 @@
 	{modeVL2, "000010"},
 	{modeVL3, "0000010"},
 	{modeExt, "0000001"},
-
-	// End-of-Line is not in Table 1, but we look for it at the same time that
-	// we look for other mode codes.
-	{modeEOL, "000000000001"},
 }
 
 var whiteCodes = []code{
commit	58c23975cae11f062d4b3b0c143fe248faac195d	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Sat Dec 14 13:27:13 2019 +1100
committer	Nigel Tao <nigeltao@golang.org>	Sun Jan 19 04:44:24 2020 +0000
tree	9078269fb436afc24ce051ec1dd0676b4a354a2f
parent	9130b4cfad522142c86367afe5e34ce811a85a4b [diff]