arm/armspec/spec.go - arch - Git at Google

 // Copyright 2014 The Go Authors.  All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // +build !386 go1.8
 // ... see golang.org/issue/12840

 // Armspec reads the ``ARM Architecture Reference Manual''
 // to collect instruction encoding details and writes those details to standard output
 // in JSON format.
 //
 // Warning Warning Warning
 //
 // This program is unfinished. It is being published in this incomplete form
 // for interested readers, but do not expect it to be runnable or useful.
 //
 package main

 import (
 	"bufio"
 	"bytes"
 	"encoding/json"
 	"fmt"
 	"log"
 	"math"
 	"os"
 	"regexp"
 	"sort"
 	"strconv"
 	"strings"

 	"rsc.io/pdf"
 )

 // Inst is one instruction encoding extracted from the manual. main writes
 // each Inst to standard output as a JSON object, so the exported field
 // names are the JSON keys.
 //
 // Name is the instruction headline, ID is the encoding label with its
 // "Encoding " prefix removed, Bits is the bit layout string produced by
 // readBitBox, Arch collects the notes printed to the right of the encoding
 // label, Syntax holds the assembler syntax lines and Code the pseudocode.
 type Inst struct {
 	Name, ID, Bits, Arch string
 	Syntax               []string
 	Code                 string
 }

 // debugPage, when greater than zero, restricts main to that single page
 // number and turns on the extra diagnostic printing in parsePage and
 // readBitBox. Zero means process the whole document normally.
 const debugPage = 0

 // stdout is the buffered writer around os.Stdout that receives the JSON
 // output; main assigns it before writing anything.
 var stdout *bufio.Writer

 // main reads the PDF named by the single command-line argument, extracts the
 // instruction encodings page by page, and writes them to standard output as
 // a JSON array with one Inst object per line. Diagnostics go to standard
 // error.
 //
 // The outline's alphabetical instruction list is used as a checklist: each
 // instruction found on a page is crossed off, and any that remain at the end
 // are reported as missing unless they appear in the list of known exceptions.
 func main() {
 	log.SetFlags(0)
 	log.SetPrefix("armspec: ")

 	if len(os.Args) != 2 {
 		fmt.Fprintf(os.Stderr, "usage: armspec file.pdf\n")
 		os.Exit(2)
 	}

 	f, err := pdf.Open(os.Args[1])
 	if err != nil {
 		log.Fatal(err)
 	}

 	// Find instruction set reference in outline, to build instruction list.
 	instList := instHeadings(f.Outline())
 	if len(instList) < 200 {
 		log.Fatalf("only found %d instructions in table of contents", len(instList))
 	}

 	stdout = bufio.NewWriter(os.Stdout)
 	fmt.Fprintf(stdout, "[")
 	numTable := 0

 	// Scan document looking for instructions.
 	// Must find exactly the ones in the outline.
 	n := f.NumPage()
 PageLoop:
 	for pageNum := 1; pageNum <= n; pageNum++ {
 		if debugPage > 0 && pageNum != debugPage {
 			continue
 		}
 		// Hard-coded cutoff: pages after 1127 are never scanned.
 		if pageNum > 1127 {
 			break
 		}
 		p := f.Page(pageNum)
 		name, table := parsePage(pageNum, p)
 		if name == "" {
 			continue
 		}
 		if len(table) < 1 {
 			// Diagnostic deliberately disabled.
 			if false {
 				fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum)
 			}
 			continue
 		}
 		for _, inst := range table {
 			if numTable > 0 {
 				fmt.Fprintf(stdout, ",")
 			}
 			numTable++
 			js, err := json.Marshal(inst)
 			if err != nil {
 				log.Fatalf("page %d: cannot encode instruction %q: %v", pageNum, name, err)
 			}
 			fmt.Fprintf(stdout, "\n%s", jsFix.Replace(string(js)))
 		}
 		// Cross the instruction off the checklist.
 		for j, headline := range instList {
 			if name == headline {
 				instList[j] = ""
 				continue PageLoop
 			}
 		}
 		fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum)
 	}

 	fmt.Fprintf(stdout, "\n]\n")
 	// bufio.Writer remembers the first write error, so checking Flush here
 	// reports any failure while producing the output.
 	if err := stdout.Flush(); err != nil {
 		log.Fatal(err)
 	}

 	if debugPage == 0 {
 		for _, headline := range instList {
 			if headline != "" {
 				// The empty cases are headings known to have no encodings
 				// of their own; only unknown leftovers are reported.
 				switch headline {
 				default:
 					fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline)
 				case "CHKA": // ThumbEE
 				case "CPS": // system instruction
 				case "CPY": // synonym for MOV
 				case "ENTERX": // ThumbEE
 				case "F* (former VFP instruction mnemonics)": // synonyms
 				case "HB, HBL, HBLP, HBP": // ThumbEE
 				case "LEAVEX": // ThumbEE
 				case "MOV (shifted register)": // pseudo instruction for ASR, LSL, LSR, ROR, and RRX
 				case "NEG": // synonym for RSB
 				case "RFE": // system instruction
 				case "SMC (previously SMI)": // system instruction
 				case "SRS": // system instruction
 				case "SUBS PC, LR and related instructions": // system instruction
 				case "VAND (immediate)": // pseudo instruction
 				case "VCLE (register)": // pseudo instruction
 				case "VCLT (register)": // pseudo instruction
 				case "VORN (immediate)": // pseudo instruction
 				}
 			}
 		}
 	}
 }

 // instHeadings returns the instruction headings collected from the PDF
 // outline by appendInstHeadings, starting from an empty list.
 func instHeadings(outline pdf.Outline) []string {
 	var headings []string
 	headings = appendInstHeadings(outline, headings)
 	return headings
 }

 // Patterns for recognizing section numbers and headings in the outline and
 // on instruction pages.
 var (
 	// instRE matches the outline entry whose children are the instructions.
 	instRE = regexp.MustCompile(`A[\d.]+ Alphabetical list of instructions`)
 	// childRE matches a numbered heading and captures the text after the number.
 	childRE = regexp.MustCompile(`A[\d.]+ (.+)`)
 	// sectionRE matches a bare section number with no title.
 	sectionRE = regexp.MustCompile(`^A[\d.]+$`)
 	// bitRE matches a string made only of spaces, 0, 1, (0) and (1).
 	// It is not referenced by the code visible in this file.
 	bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`)
 )

 // appendInstHeadings appends to list the instruction headings found in
 // outline and returns the extended list.
 //
 // When outline's own title matches instRE, the title of each direct child is
 // parsed with childRE and the text after the section number is appended;
 // titles that do not parse are reported on standard error and skipped.
 // Every child is then searched recursively, whether or not the title matched.
 func appendInstHeadings(outline pdf.Outline, list []string) []string {
 	if instRE.MatchString(outline.Title) {
 		for _, sub := range outline.Child {
 			if parts := childRE.FindStringSubmatch(sub.Title); parts != nil {
 				list = append(list, parts[1])
 			} else {
 				fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", sub.Title)
 			}
 		}
 	}
 	for _, sub := range outline.Child {
 		list = appendInstHeadings(sub, list)
 	}
 	return list
 }

 // inch is the number of page coordinate units in one inch (72), so that
 // layout thresholds can be written as fractions of an inch.
 const inch = 72.0

 // parsePage extracts the instruction described on page p, if any.
 //
 // num is the page number and is used only in diagnostics. The returned name
 // is the instruction headline with its section number removed, and table
 // holds one Inst per encoding found on the page (two when a single
 // "Encoding X / Y" label covers a pair of encodings). If the page does not
 // begin with an "Instruction Details" header followed by an instruction
 // headline, parsePage returns "", nil.
 //
 // Unexpected page content is reported on standard error; it never aborts
 // the parse.
 func parsePage(num int, p pdf.Page) (name string, table []Inst) {
 	content := p.Content()

 	var text []pdf.Text
 	for _, t := range content.Text {
 		// Treat 7.2pt Times-Roman as ordinary 9pt text.
 		if match(t, "Times-Roman", 7.2, "") {
 			t.FontSize = 9
 		}
 		// A 6.72pt Times-Roman digit becomes the corresponding Unicode
 		// superscript digit, resized and shifted down like 9pt text.
 		// The length check avoids indexing an empty string.
 		if match(t, "Times-Roman", 6.72, "") && len(t.S) > 0 && '0' <= t.S[0] && t.S[0] <= '9' {
 			t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0'])
 			t.FontSize = 9
 			t.Y -= 2.28
 		}
 		if t.Font == "Gen_Arial" {
 			continue
 		}
 		text = append(text, t)
 	}

 	text = findWords(text)

 	// findWords strips the Italic suffix from font names; map the bare
 	// "Times" name onto "Times-Roman" so later matches see one font.
 	for i, t := range text {
 		if t.Font == "Times" {
 			t.Font = "Times-Roman"
 			text[i] = t
 		}
 	}

 	if debugPage > 0 {
 		for _, t := range text {
 			fmt.Println(t)
 		}
 		for _, r := range content.Rect {
 			fmt.Println(r)
 		}
 	}

 	// Remove text we should ignore.
 	out := text[:0]
 	skip := false
 	for _, t := range text {
 		// skip page footer
 		if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") {
 			continue
 		}
 		// skip section header and body text
 		if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") {
 			skip = true
 			continue
 		}
 		if skip && match(t, "Times-Roman", 9, "") {
 			continue
 		}
 		skip = false
 		out = append(out, t)
 	}
 	text = out

 	// Page header must say Instruction Details.
 	if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") {
 		return "", nil
 	}
 	text = text[1:]

 	// isSection reports how many elements starting at text[i] form a
 	// headline: 2 for a bare section number followed by a separate title,
 	// 1 for a single "number title" element, 0 for no headline.
 	isSection := func(text []pdf.Text, i int) int {
 		if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") {
 			return 2
 		}
 		if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) {
 			return 1
 		}
 		return 0
 	}

 	// Skip dummy headlines and sections.
 	for d := isSection(text, 0); d != 0; d = isSection(text, 0) {
 		i := d
 		for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") {
 			i++
 		}
 		// Only drop this headline if another headline follows it.
 		if isSection(text, i) == 0 {
 			break
 		}
 		text = text[i:]
 	}

 	// Next line is headline. Can wrap to multiple lines.
 	d := isSection(text, 0)
 	if d == 0 {
 		if debugPage > 0 && len(text) > 0 {
 			fmt.Printf("non-inst-headline: %v\n", text[0])
 		}
 		checkNoEncodings(num, text)
 		return "", nil
 	}
 	if d == 2 {
 		name = text[1].S
 		text = text[2:]
 	} else if d == 1 {
 		m := childRE.FindStringSubmatch(text[0].S)
 		name = m[1]
 		text = text[1:]
 	}
 	for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") {
 		name += " " + text[0].S
 		text = text[1:]
 	}

 	// Skip description.
 	for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) {
 		text = text[1:]
 	}

 	// Encodings follow.
 	warned := false
 	for i := 0; i < len(text); {
 		// Any of these headings ends the encoding list for the page.
 		if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") ||
 			match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") ||
 			match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") ||
 			match(text[i], "Helvetica-Bold", 9, "Related encodings") ||
 			match(text[i], "Times-Roman", 9, "Figure A") ||
 			match(text[i], "Helvetica-Bold", 9, "Table A") ||
 			match(text[i], "Helvetica-Bold", 9, "VFP Instructions") ||
 			match(text[i], "Helvetica-Bold", 9, "VFP instructions") ||
 			match(text[i], "Helvetica-Bold", 9, "VFP vectors") ||
 			match(text[i], "Helvetica-Bold", 9, "FLDMX") ||
 			match(text[i], "Helvetica-Bold", 9, "FSTMX") ||
 			match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") {
 			checkNoEncodings(num, text[i:])
 			break
 		}
 		// Skip a bold figure caption together with the rest of its line.
 		if match(text[i], "Helvetica-Bold", 9, "Figure A") {
 			y := text[i].Y
 			i++
 			for i < len(text) && math.Abs(text[i].Y-y) < 2 {
 				i++
 			}
 			continue
 		}
 		if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
 			// Report only the first unexpected element per page.
 			if !warned {
 				warned = true
 				fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i])
 			}
 			i++
 			continue
 		}
 		inst := Inst{
 			Name: name,
 		}
 		enc := text[i].S
 		x := text[i].X
 		i++
 		// Possible subarchitecture notes.
 		for i < len(text) && text[i].X > x+36 {
 			if inst.Arch != "" {
 				inst.Arch += " "
 			}
 			inst.Arch += text[i].S
 			i++
 		}
 		// Encoding syntaxes.
 		for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) {
 			// A fragment near the left margin starts a new syntax line;
 			// anything further right continues the previous one. With no
 			// previous line to continue, start one instead of indexing
 			// an empty slice.
 			if text[i].X < x+0.25*inch || len(inst.Syntax) == 0 {
 				inst.Syntax = append(inst.Syntax, text[i].S)
 			} else {
 				s := inst.Syntax[len(inst.Syntax)-1]
 				// First continuation is tab-separated, later ones space-separated.
 				if !strings.Contains(s, "\t") {
 					s += "\t"
 				} else {
 					s += " "
 				}
 				s += text[i].S
 				inst.Syntax[len(inst.Syntax)-1] = s
 			}
 			i++
 		}

 		var bits, abits, aenc string
 		bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
 		// "Encoding X / Y" names two encodings. The second one has its own
 		// bit box if more bit headings follow; otherwise it shares the first.
 		if strings.Contains(enc, " / ") {
 			if i < len(text) && match(text[i], "Times-Roman", 8, "") {
 				abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
 			} else {
 				abits = bits
 			}
 			slash := strings.Index(enc, " / ")
 			aenc = "Encoding " + enc[slash+len(" / "):]
 			enc = enc[:slash]
 		}

 		// pseudocode
 		y0 := -1 * inch
 		tab := 0.0
 		for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") {
 			t := text[i]
 			i++
 			if math.Abs(t.Y-y0) < 3 {
 				// same line as last fragment, probably just two spaces
 				inst.Code += " " + t.S
 				continue
 			}
 			if inst.Code != "" {
 				inst.Code += "\n"
 			}
 			// Convert horizontal offset into tabs, using the first
 			// indented line after an unindented one as the tab width.
 			if t.X > x+0.1*inch {
 				if tab == 0 {
 					tab = t.X - x
 				}
 				inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5))
 			} else {
 				tab = 0
 			}
 			inst.Code += t.S
 			y0 = t.Y
 		}

 		inst.ID = strings.TrimPrefix(enc, "Encoding ")
 		inst.Bits = bits
 		table = append(table, inst)
 		if abits != "" {
 			// Second encoding of a pair: same syntax and code, own ID and bits.
 			inst.ID = strings.TrimPrefix(aenc, "Encoding ")
 			inst.Bits = abits
 			table = append(table, inst)
 		}

 	}
 	return name, table
 }

 // readBitBox parses the encoding diagram that starts at text[i] and returns
 // its bit layout together with the index of the first text element after
 // the diagram.
 //
 // The diagram appears in text as a run of Times-Roman 8pt bit headings
 // followed by a run of Times-Roman 9pt field labels. The box itself is
 // located among the thin rectangles in content.Rect: a top and a bottom
 // horizontal rule, and the vertical bars between them that separate fields.
 //
 // In the returned string, fields are separated by "|". A field whose label
 // has exactly as many space-separated tokens as the field has bits is
 // expanded to one token per bit; any other field is written "label:width".
 // name and syntax are used only in diagnostics written to standard error.
 func readBitBox(name string, syntax []string, content pdf.Content, text []pdf.Text, i int) (string, int) {
 	// Bit headings: remember the position of the first one.
 	var x1, x2, y2 float64
 	for ; i < len(text) && match(text[i], "Times-Roman", 8, ""); i++ {
 		if y2 == 0 {
 			y2 = text[i].Y
 		}
 		if x1 == 0 {
 			x1 = text[i].X
 		}
 	}

 	// Field labels in the box: track the right-most edge, and the position
 	// and height of the last label seen.
 	var y1, dy1 float64
 	for ; i < len(text) && match(text[i], "Times-Roman", 9, ""); i++ {
 		label := text[i]
 		if x2 < label.X+label.W {
 			x2 = label.X + label.W
 		}
 		y1 = label.Y
 		dy1 = label.FontSize
 	}

 	if debugPage > 0 {
 		fmt.Println("encoding box", x1, y1, x2, y2)
 	}

 	// Find the horizontal rules (thin rectangles) bounding the box.
 	const (
 		yMargin = 0.25 * 72
 		xMargin = 2 * 72
 	)
 	var bottom, top pdf.Rect
 	for _, r := range content.Rect {
 		thin := r.Max.Y-r.Min.Y < 2
 		spansBox := x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin
 		if !thin || !spansBox {
 			continue
 		}
 		if y1-yMargin < r.Min.Y && r.Min.Y < y1 {
 			bottom = r
 		}
 		if y1+dy1 < r.Min.Y && r.Min.Y < y2 {
 			top = r
 		}
 	}

 	if debugPage > 0 {
 		fmt.Println("top", top, "bottom", bottom)
 	}

 	// Collect the vertical bars that run from the bottom rule to the top rule.
 	const ε = 0.1 * 72
 	var bars []pdf.Rect
 	for _, r := range content.Rect {
 		narrow := r.Max.X-r.Min.X < 2
 		fromBottom := math.Abs(r.Min.Y-bottom.Min.Y) < ε
 		toTop := math.Abs(r.Max.Y-top.Min.Y) < ε
 		if narrow && fromBottom && toTop {
 			bars = append(bars, r)
 		}
 	}
 	sort.Sort(RectHorizontal(bars))

 	// There are 16-bit and 32-bit encodings.
 	// In practice, they are about 2.65 and 5.3 inches wide, respectively.
 	// Use 4 inches as a cutoff.
 	nbit := 32
 	dx := top.Max.X - top.Min.X
 	if dx < 4*72 {
 		nbit = 16
 	}

 	total := 0
 	var buf bytes.Buffer
 	for k := 0; k+1 < len(bars); k++ {
 		if k > 0 {
 			buf.WriteString("|")
 		}
 		// Gather the text whose center lies between this bar and the next.
 		left, right := bars[k].Min.X, bars[k+1].Min.X
 		var sub []pdf.Text
 		for _, t := range content.Text {
 			cx := t.X + t.W/2
 			cy := t.Y + t.FontSize/2
 			if left < cx && cx < right && y1 < cy && cy < y2 {
 				sub = append(sub, t)
 			}
 		}
 		var parts []string
 		for _, w := range findWords(sub) {
 			parts = append(parts, w.S)
 		}
 		s := strings.Replace(strings.Join(parts, " "), ")(", ") (", -1)
 		fields := strings.Fields(s)
 		n := len(fields)
 		// Field width in bits, from its share of the box width.
 		b := int(float64(nbit)*(right-left)/dx + 0.5)
 		if n == b {
 			buf.WriteString(strings.Join(fields, "|"))
 		} else {
 			if n != 1 {
 				fmt.Fprintf(os.Stderr, "%s - %s - multi-field %d-bit encoding: %s\n", name, syntax, n, s)
 			}
 			fmt.Fprintf(&buf, "%s:%d", s, b)
 		}
 		total += b
 	}

 	if total != nbit || total == 0 {
 		fmt.Fprintf(os.Stderr, "%s - %s - %d-bit encoding\n", name, syntax, total)
 	}
 	return buf.String(), i
 }

 // RectHorizontal implements sort.Interface, ordering rectangles by
 // increasing Min.X.
 type RectHorizontal []pdf.Rect

 // Len returns the number of rectangles.
 func (r RectHorizontal) Len() int { return len(r) }

 // Less reports whether rectangle i has a smaller Min.X than rectangle j.
 func (r RectHorizontal) Less(i, j int) bool { return r[i].Min.X < r[j].Min.X }

 // Swap exchanges rectangles i and j.
 func (r RectHorizontal) Swap(i, j int) { r[i], r[j] = r[j], r[i] }

 // checkNoEncodings reports on standard error every element of text that
 // looks like an encoding label (Helvetica-Bold 9pt containing "Encoding").
 // It is called on text that is not expected to contain encodings; num is the
 // page number used in the message.
 func checkNoEncodings(num int, text []pdf.Text) {
 	for i := range text {
 		if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
 			continue
 		}
 		fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, text[i].S)
 	}
 }

 // match reports whether t is set in the named font, at a size within 0.1 of
 // size, and its string contains substr. An empty substr matches any string.
 func match(t pdf.Text, font string, size float64, substr string) bool {
 	if t.Font != font {
 		return false
 	}
 	if !(math.Abs(t.FontSize-size) < 0.1) {
 		return false
 	}
 	return strings.Contains(t.S, substr)
 }

 // findWords groups the text fragments in chars into phrases and returns one
 // pdf.Text per phrase.
 //
 // Fragments whose Y coordinates differ by less than one unit are snapped to
 // the same line. Within a line, a fragment is glued directly onto the
 // current phrase when it starts within FontSize/6 of the phrase's end, and
 // joined with a single space when it starts within 2*FontSize/3; it must also
 // pass sameFont and be within 0.1 of the phrase's font size. Each phrase
 // takes the font (with any ",Italic" or "-Italic" suffix removed), size and
 // position of its first fragment.
 //
 // chars is sorted and its Y values are adjusted in place.
 func findWords(chars []pdf.Text) (words []pdf.Text) {
 	// Sort by Y coordinate and normalize.
 	const nudge = 1
 	sort.Sort(pdf.TextVertical(chars))
 	prevY := -100000.0
 	for i := range chars {
 		y := chars[i].Y
 		if y != prevY && math.Abs(prevY-y) < nudge {
 			chars[i].Y = prevY
 			continue
 		}
 		prevY = y
 	}

 	// Sort again now that nearby Y values have been made equal, so that
 	// fragments on the same line end up next to each other.
 	sort.Sort(pdf.TextVertical(chars))

 	for lineStart := 0; lineStart < len(chars); {
 		// Find the end of the run of fragments sharing this Y.
 		lineEnd := lineStart + 1
 		for lineEnd < len(chars) && chars[lineEnd].Y == chars[lineStart].Y {
 			lineEnd++
 		}

 		// Split line into words (really, phrases).
 		for k := lineStart; k < lineEnd; {
 			first := &chars[k]
 			phrase := first.S
 			end := first.X + first.W
 			charSpace := first.FontSize / 6
 			wordSpace := first.FontSize * 2 / 3

 			next := k + 1
 			for ; next < lineEnd; next++ {
 				c := &chars[next]
 				if !sameFont(c.Font, first.Font) || !(math.Abs(c.FontSize-first.FontSize) < 0.1) {
 					break
 				}
 				if c.X <= end+charSpace {
 					// Grow word.
 					phrase += c.S
 				} else if c.X <= end+wordSpace {
 					// Add space to phrase before next word.
 					phrase += " " + c.S
 				} else {
 					break
 				}
 				end = c.X + c.W
 			}

 			font := strings.TrimSuffix(strings.TrimSuffix(first.Font, ",Italic"), "-Italic")
 			words = append(words, pdf.Text{font, first.FontSize, first.X, first.Y, end - first.X, phrase})
 			k = next
 		}
 		lineStart = lineEnd
 	}

 	return words
 }

 // sameFont reports whether text in fonts f1 and f2 may be merged into one
 // phrase. Font names are compared after removing a trailing ",Italic" or
 // "-Italic", and the fonts "Symbol" and "TimesNewRoman" are compatible with
 // every other font.
 //
 // The previous version derived f2 from f1, so the comparison was always
 // true and fragments in unrelated fonts were merged.
 func sameFont(f1, f2 string) bool {
 	f1 = strings.TrimSuffix(f1, ",Italic")
 	f1 = strings.TrimSuffix(f1, "-Italic")
 	f2 = strings.TrimSuffix(f2, ",Italic")
 	f2 = strings.TrimSuffix(f2, "-Italic")
 	return f1 == f2 || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman"
 }

 // jsFix is applied by main to the JSON text of each instruction before it
 // is written. Every replacement pair below is commented out, so the replacer
 // is currently built with no pairs at all.
 var jsFix = strings.NewReplacer(
 //	`\u003c`, `<`,
 //	`\u003e`, `>`,
 //	`\u0026`, `&`,
 //	`\u0009`, `\t`,
 )

 // printTable is an unfinished stub: it ignores both arguments. Its only
 // statement references strconv.Atoi, which keeps the strconv import in use.
 func printTable(name string, table []Inst) {
 	_ = strconv.Atoi
 }
	// Copyright 2014 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// +build !386 go1.8
	// ... see golang.org/issue/12840

	// Armspec reads the ``ARM Architecture Reference Manual''
	// to collect instruction encoding details and writes those details to standard output
	// in JSON format.
	//
	// Warning Warning Warning
	//
	// This program is unfinished. It is being published in this incomplete form
	// for interested readers, but do not expect it to be runnable or useful.
	//
	package main

	import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"math"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"rsc.io/pdf"
	)

	// Inst is one instruction encoding extracted from the manual. main writes
	// each Inst to standard output as a JSON object, so the exported field
	// names are the JSON keys.
	//
	// Name is the instruction headline, ID is the encoding label with its
	// "Encoding " prefix removed, Bits is the bit layout string produced by
	// readBitBox, Arch collects the notes printed to the right of the encoding
	// label, Syntax holds the assembler syntax lines and Code the pseudocode.
	type Inst struct {
		Name, ID, Bits, Arch string
		Syntax               []string
		Code                 string
	}

	// debugPage, when greater than zero, restricts main to that single page
	// number and turns on the extra diagnostic printing in parsePage and
	// readBitBox. Zero means process the whole document normally.
	const debugPage = 0

	// stdout is the buffered writer around os.Stdout that receives the JSON
	// output; main assigns it before writing anything.
	var stdout *bufio.Writer

	// main reads the PDF named by the single command-line argument, extracts the
	// instruction encodings page by page, and writes them to standard output as
	// a JSON array with one Inst object per line. Diagnostics go to standard
	// error.
	//
	// The outline's alphabetical instruction list is used as a checklist: each
	// instruction found on a page is crossed off, and any that remain at the end
	// are reported as missing unless they appear in the list of known exceptions.
	func main() {
		log.SetFlags(0)
		log.SetPrefix("armspec: ")

		if len(os.Args) != 2 {
			fmt.Fprintf(os.Stderr, "usage: armspec file.pdf\n")
			os.Exit(2)
		}

		f, err := pdf.Open(os.Args[1])
		if err != nil {
			log.Fatal(err)
		}

		// Find instruction set reference in outline, to build instruction list.
		instList := instHeadings(f.Outline())
		if len(instList) < 200 {
			log.Fatalf("only found %d instructions in table of contents", len(instList))
		}

		stdout = bufio.NewWriter(os.Stdout)
		fmt.Fprintf(stdout, "[")
		numTable := 0

		// Scan document looking for instructions.
		// Must find exactly the ones in the outline.
		n := f.NumPage()
	PageLoop:
		for pageNum := 1; pageNum <= n; pageNum++ {
			if debugPage > 0 && pageNum != debugPage {
				continue
			}
			// Hard-coded cutoff: pages after 1127 are never scanned.
			if pageNum > 1127 {
				break
			}
			p := f.Page(pageNum)
			name, table := parsePage(pageNum, p)
			if name == "" {
				continue
			}
			if len(table) < 1 {
				// Diagnostic deliberately disabled.
				if false {
					fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum)
				}
				continue
			}
			for _, inst := range table {
				if numTable > 0 {
					fmt.Fprintf(stdout, ",")
				}
				numTable++
				js, err := json.Marshal(inst)
				if err != nil {
					log.Fatalf("page %d: cannot encode instruction %q: %v", pageNum, name, err)
				}
				fmt.Fprintf(stdout, "\n%s", jsFix.Replace(string(js)))
			}
			// Cross the instruction off the checklist.
			for j, headline := range instList {
				if name == headline {
					instList[j] = ""
					continue PageLoop
				}
			}
			fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum)
		}

		fmt.Fprintf(stdout, "\n]\n")
		// bufio.Writer remembers the first write error, so checking Flush here
		// reports any failure while producing the output.
		if err := stdout.Flush(); err != nil {
			log.Fatal(err)
		}

		if debugPage == 0 {
			for _, headline := range instList {
				if headline != "" {
					// The empty cases are headings known to have no encodings
					// of their own; only unknown leftovers are reported.
					switch headline {
					default:
						fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline)
					case "CHKA": // ThumbEE
					case "CPS": // system instruction
					case "CPY": // synonym for MOV
					case "ENTERX": // ThumbEE
					case "F* (former VFP instruction mnemonics)": // synonyms
					case "HB, HBL, HBLP, HBP": // ThumbEE
					case "LEAVEX": // ThumbEE
					case "MOV (shifted register)": // pseudo instruction for ASR, LSL, LSR, ROR, and RRX
					case "NEG": // synonym for RSB
					case "RFE": // system instruction
					case "SMC (previously SMI)": // system instruction
					case "SRS": // system instruction
					case "SUBS PC, LR and related instructions": // system instruction
					case "VAND (immediate)": // pseudo instruction
					case "VCLE (register)": // pseudo instruction
					case "VCLT (register)": // pseudo instruction
					case "VORN (immediate)": // pseudo instruction
					}
				}
			}
		}
	}

	// instHeadings returns the instruction headings collected from the PDF
	// outline by appendInstHeadings, starting from an empty list.
	func instHeadings(outline pdf.Outline) []string {
		var headings []string
		headings = appendInstHeadings(outline, headings)
		return headings
	}

	// Patterns for recognizing section numbers and headings in the outline and
	// on instruction pages.
	var (
		// instRE matches the outline entry whose children are the instructions.
		instRE = regexp.MustCompile(`A[\d.]+ Alphabetical list of instructions`)
		// childRE matches a numbered heading and captures the text after the number.
		childRE = regexp.MustCompile(`A[\d.]+ (.+)`)
		// sectionRE matches a bare section number with no title.
		sectionRE = regexp.MustCompile(`^A[\d.]+$`)
		// bitRE matches a string made only of spaces, 0, 1, (0) and (1).
		// The alternation bars had been backslash-escaped, which made them
		// literal pipe characters; they are plain alternation again here.
		// It is not referenced by the code visible in this file.
		bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`)
	)

	// appendInstHeadings appends to list the instruction headings found in
	// outline and returns the extended list.
	//
	// When outline's own title matches instRE, the title of each direct child is
	// parsed with childRE and the text after the section number is appended;
	// titles that do not parse are reported on standard error and skipped.
	// Every child is then searched recursively, whether or not the title matched.
	func appendInstHeadings(outline pdf.Outline, list []string) []string {
		if instRE.MatchString(outline.Title) {
			for _, sub := range outline.Child {
				if parts := childRE.FindStringSubmatch(sub.Title); parts != nil {
					list = append(list, parts[1])
				} else {
					fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", sub.Title)
				}
			}
		}
		for _, sub := range outline.Child {
			list = appendInstHeadings(sub, list)
		}
		return list
	}

	// inch is the number of page coordinate units in one inch (72), so that
	// layout thresholds can be written as fractions of an inch.
	const inch = 72.0

	func parsePage(num int, p pdf.Page) (name string, table []Inst) {
	content := p.Content()

	var text []pdf.Text
	for _, t := range content.Text {
	if match(t, "Times-Roman", 7.2, "") {
	t.FontSize = 9
	}
	if match(t, "Times-Roman", 6.72, "") && '0' <= t.S[0] && t.S[0] <= '9' {
	t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0'])
	t.FontSize = 9
	t.Y -= 2.28
	}
	if t.Font == "Gen_Arial" {
	continue
	}
	text = append(text, t)
	}

	text = findWords(text)

	for i, t := range text {
	if t.Font == "Times" {
	t.Font = "Times-Roman"
	text[i] = t
	}
	}

	if debugPage > 0 {
	for _, t := range text {
	fmt.Println(t)
	}
	for _, r := range content.Rect {
	fmt.Println(r)
	}
	}

	// Remove text we should ignore.
	out := text[:0]
	skip := false
	for _, t := range text {
	// skip page footer
	if match(t, "Helvetica", 8, "A") \|\| match(t, "Helvetica", 8, "ARM DDI") \|\| match(t, "Helvetica-Oblique", 8, "Copyright") {
	continue
	}
	// skip section header and body text
	if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) \|\| t.S == "Alphabetical list of instructions") {
	skip = true
	continue
	}
	if skip && match(t, "Times-Roman", 9, "") {
	continue
	}
	skip = false
	out = append(out, t)
	}
	text = out

	// Page header must say Instruction Details.
	if len(text) == 0 \|\| !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") {
	return "", nil
	}
	text = text[1:]

	isSection := func(text []pdf.Text, i int) int {
	if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") {
	return 2
	}
	if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) {
	return 1
	}
	return 0
	}

	// Skip dummy headlines and sections.
	for d := isSection(text, 0); d != 0; d = isSection(text, 0) {
	i := d
	for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") {
	i++
	}
	if isSection(text, i) == 0 {
	break
	}
	text = text[i:]
	}

	// Next line is headline. Can wrap to multiple lines.
	d := isSection(text, 0)
	if d == 0 {
	if debugPage > 0 {
	fmt.Printf("non-inst-headline: %v\n", text[0])
	}
	checkNoEncodings(num, text)
	return "", nil
	}
	if d == 2 {
	name = text[1].S
	text = text[2:]
	} else if d == 1 {
	m := childRE.FindStringSubmatch(text[0].S)
	name = m[1]
	text = text[1:]
	}
	for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") {
	name += " " + text[0].S
	text = text[1:]
	}

	// Skip description.
	for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") \|\| match(text[0], "LucidaSansTypewriteX", 6.48, "") \|\| match(text[0], "Times-Bold", 10, "Note")) {
	text = text[1:]
	}

	// Encodings follow.
	warned := false
	for i := 0; i < len(text); {
	if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") \|\|
	match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") \|\|
	match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") \|\|
	match(text[i], "Helvetica-Bold", 9, "Related encodings") \|\|
	match(text[i], "Times-Roman", 9, "Figure A") \|\|
	match(text[i], "Helvetica-Bold", 9, "Table A") \|\|
	match(text[i], "Helvetica-Bold", 9, "VFP Instructions") \|\|
	match(text[i], "Helvetica-Bold", 9, "VFP instructions") \|\|
	match(text[i], "Helvetica-Bold", 9, "VFP vectors") \|\|
	match(text[i], "Helvetica-Bold", 9, "FLDMX") \|\|
	match(text[i], "Helvetica-Bold", 9, "FSTMX") \|\|
	match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") {
	checkNoEncodings(num, text[i:])
	break
	}
	if match(text[i], "Helvetica-Bold", 9, "Figure A") {
	y := text[i].Y
	i++
	for i < len(text) && math.Abs(text[i].Y-y) < 2 {
	i++
	}
	continue
	}
	if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
	if !warned {
	warned = true
	fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i])
	}
	i++
	continue
	}
	inst := Inst{
	Name: name,
	}
	enc := text[i].S
	x := text[i].X
	i++
	// Possible subarchitecture notes.
	for i < len(text) && text[i].X > x+36 {
	if inst.Arch != "" {
	inst.Arch += " "
	}
	inst.Arch += text[i].S
	i++
	}
	// Encoding syntaxes.
	for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") \|\| text[i].X > x+36) {
	if text[i].X < x+0.25*inch {
	inst.Syntax = append(inst.Syntax, text[i].S)
	} else {
	s := inst.Syntax[len(inst.Syntax)-1]
	if !strings.Contains(s, "\t") {
	s += "\t"
	} else {
	s += " "
	}
	s += text[i].S
	inst.Syntax[len(inst.Syntax)-1] = s
	}
	i++
	}

	var bits, abits, aenc string
	bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
	if strings.Contains(enc, " / ") {
	if i < len(text) && match(text[i], "Times-Roman", 8, "") {
	abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
	} else {
	abits = bits
	}
	slash := strings.Index(enc, " / ")
	aenc = "Encoding " + enc[slash+len(" / "):]
	enc = enc[:slash]
	}

	// pseudocode
	y0 := -1 * inch
	tab := 0.0
	for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") {
	t := text[i]
	i++
	if math.Abs(t.Y-y0) < 3 {
	// same line as last fragment, probably just two spaces
	inst.Code += " " + t.S
	continue
	}
	if inst.Code != "" {
	inst.Code += "\n"
	}
	if t.X > x+0.1*inch {
	if tab == 0 {
	tab = t.X - x
	}
	inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5))
	} else {
	tab = 0
	}
	inst.Code += t.S
	y0 = t.Y
	}

	inst.ID = strings.TrimPrefix(enc, "Encoding ")
	inst.Bits = bits
	table = append(table, inst)
	if abits != "" {
	inst.ID = strings.TrimPrefix(aenc, "Encoding ")
	inst.Bits = abits
	table = append(table, inst)
	}

	}
	return name, table
	}

	func readBitBox(name string, syntax []string, content pdf.Content, text []pdf.Text, i int) (string, int) {
	// bit headings
	y2 := 0.0
	x1 := 0.0
	x2 := 0.0
	for i < len(text) && match(text[i], "Times-Roman", 8, "") {
	if y2 == 0 {
	y2 = text[i].Y
	}
	if x1 == 0 {
	x1 = text[i].X
	}
	i++
	}
	// bit fields in box
	y1 := 0.0
	dy1 := 0.0
	for i < len(text) && match(text[i], "Times-Roman", 9, "") {
	if x2 < text[i].X+text[i].W {
	x2 = text[i].X + text[i].W
	}
	y1 = text[i].Y
	dy1 = text[i].FontSize
	i++
	}

	if debugPage > 0 {
	fmt.Println("encoding box", x1, y1, x2, y2)
	}

	// Find lines (thin rectangles) separating bit fields.
	var bottom, top pdf.Rect
	const (
	yMargin = 0.25 * 72
	xMargin = 2 * 72
	)
	for _, r := range content.Rect {
	if r.Max.Y-r.Min.Y < 2 && x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin {
	if y1-yMargin < r.Min.Y && r.Min.Y < y1 {
	bottom = r
	}
	if y1+dy1 < r.Min.Y && r.Min.Y < y2 {
	top = r
	}
	}
	}

	if debugPage > 0 {
	fmt.Println("top", top, "bottom", bottom)
	}

	const ε = 0.1 * 72
	var bars []pdf.Rect
	for _, r := range content.Rect {
	if r.Max.X-r.Min.X < 2 && math.Abs(r.Min.Y-bottom.Min.Y) < ε && math.Abs(r.Max.Y-top.Min.Y) < ε {
	bars = append(bars, r)
	}
	}
	sort.Sort(RectHorizontal(bars))

	// There are 16-bit and 32-bit encodings.
	// In practice, they are about 2.65 and 5.3 inches wide, respectively.
	// Use 4 inches as a cutoff.
	nbit := 32
	dx := top.Max.X - top.Min.X
	if top.Max.X-top.Min.X < 4*72 {
	nbit = 16
	}

	total := 0
	var buf bytes.Buffer
	for i := 0; i < len(bars)-1; i++ {
	if i > 0 {
	fmt.Fprintf(&buf, "\|")
	}
	var sub []pdf.Text
	x1, x2 := bars[i].Min.X, bars[i+1].Min.X
	for _, t := range content.Text {
	tx := t.X + t.W/2
	ty := t.Y + t.FontSize/2
	if x1 < tx && tx < x2 && y1 < ty && ty < y2 {
	sub = append(sub, t)
	}
	}
	var str []string
	for _, t := range findWords(sub) {
	str = append(str, t.S)
	}
	s := strings.Join(str, " ")
	s = strings.Replace(s, ")(", ") (", -1)
	n := len(strings.Fields(s))
	b := int(float64(nbit)*(x2-x1)/dx + 0.5)
	if n == b {
	for j, f := range strings.Fields(s) {
	if j > 0 {
	fmt.Fprintf(&buf, "\|")
	}
	fmt.Fprintf(&buf, "%s", f)
	}
	} else {
	if n != 1 {
	fmt.Fprintf(os.Stderr, "%s - %s - multi-field %d-bit encoding: %s\n", name, syntax, n, s)
	}
	fmt.Fprintf(&buf, "%s:%d", s, b)
	}
	total += b
	}

	if total != nbit \|\| total == 0 {
	fmt.Fprintf(os.Stderr, "%s - %s - %d-bit encoding\n", name, syntax, total)
	}
	return buf.String(), i
	}

	// RectHorizontal implements sort.Interface, ordering rectangles by
	// increasing Min.X.
	type RectHorizontal []pdf.Rect

	// Len returns the number of rectangles.
	func (r RectHorizontal) Len() int { return len(r) }

	// Less reports whether rectangle i has a smaller Min.X than rectangle j.
	func (r RectHorizontal) Less(i, j int) bool { return r[i].Min.X < r[j].Min.X }

	// Swap exchanges rectangles i and j.
	func (r RectHorizontal) Swap(i, j int) { r[i], r[j] = r[j], r[i] }

	// checkNoEncodings reports on standard error every element of text that
	// looks like an encoding label (Helvetica-Bold 9pt containing "Encoding").
	// It is called on text that is not expected to contain encodings; num is the
	// page number used in the message.
	func checkNoEncodings(num int, text []pdf.Text) {
		for i := range text {
			if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
				continue
			}
			fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, text[i].S)
		}
	}

	// match reports whether t is set in the named font, at a size within 0.1 of
	// size, and its string contains substr. An empty substr matches any string.
	func match(t pdf.Text, font string, size float64, substr string) bool {
		if t.Font != font {
			return false
		}
		if !(math.Abs(t.FontSize-size) < 0.1) {
			return false
		}
		return strings.Contains(t.S, substr)
	}

	// findWords groups the text fragments in chars into phrases and returns one
	// pdf.Text per phrase.
	//
	// Fragments whose Y coordinates differ by less than one unit are snapped to
	// the same line. Within a line, a fragment is glued directly onto the
	// current phrase when it starts within FontSize/6 of the phrase's end, and
	// joined with a single space when it starts within 2*FontSize/3; it must also
	// pass sameFont and be within 0.1 of the phrase's font size. Each phrase
	// takes the font (with any ",Italic" or "-Italic" suffix removed), size and
	// position of its first fragment.
	//
	// chars is sorted and its Y values are adjusted in place.
	func findWords(chars []pdf.Text) (words []pdf.Text) {
		// Sort by Y coordinate and normalize.
		const nudge = 1
		sort.Sort(pdf.TextVertical(chars))
		prevY := -100000.0
		for i := range chars {
			y := chars[i].Y
			if y != prevY && math.Abs(prevY-y) < nudge {
				chars[i].Y = prevY
				continue
			}
			prevY = y
		}

		// Sort again now that nearby Y values have been made equal, so that
		// fragments on the same line end up next to each other.
		sort.Sort(pdf.TextVertical(chars))

		for lineStart := 0; lineStart < len(chars); {
			// Find the end of the run of fragments sharing this Y.
			lineEnd := lineStart + 1
			for lineEnd < len(chars) && chars[lineEnd].Y == chars[lineStart].Y {
				lineEnd++
			}

			// Split line into words (really, phrases).
			for k := lineStart; k < lineEnd; {
				first := &chars[k]
				phrase := first.S
				end := first.X + first.W
				charSpace := first.FontSize / 6
				wordSpace := first.FontSize * 2 / 3

				next := k + 1
				for ; next < lineEnd; next++ {
					c := &chars[next]
					if !sameFont(c.Font, first.Font) || !(math.Abs(c.FontSize-first.FontSize) < 0.1) {
						break
					}
					if c.X <= end+charSpace {
						// Grow word.
						phrase += c.S
					} else if c.X <= end+wordSpace {
						// Add space to phrase before next word.
						phrase += " " + c.S
					} else {
						break
					}
					end = c.X + c.W
				}

				font := strings.TrimSuffix(strings.TrimSuffix(first.Font, ",Italic"), "-Italic")
				words = append(words, pdf.Text{font, first.FontSize, first.X, first.Y, end - first.X, phrase})
				k = next
			}
			lineStart = lineEnd
		}

		return words
	}

	// sameFont reports whether text in fonts f1 and f2 may be merged into one
	// phrase. Font names are compared after removing a trailing ",Italic" or
	// "-Italic", and the fonts "Symbol" and "TimesNewRoman" are compatible with
	// every other font.
	//
	// The previous version derived f2 from f1, so the comparison was always
	// true and fragments in unrelated fonts were merged; its "or" operators
	// were also backslash-escaped, which is not valid Go.
	func sameFont(f1, f2 string) bool {
		f1 = strings.TrimSuffix(f1, ",Italic")
		f1 = strings.TrimSuffix(f1, "-Italic")
		f2 = strings.TrimSuffix(f2, ",Italic")
		f2 = strings.TrimSuffix(f2, "-Italic")
		return f1 == f2 || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman"
	}

	// jsFix is applied by main to the JSON text of each instruction before it
	// is written. Every replacement pair below is commented out, so the replacer
	// is currently built with no pairs at all.
	var jsFix = strings.NewReplacer(
	// `\u003c`, `<`,
	// `\u003e`, `>`,
	// `\u0026`, `&`,
	// `\u0009`, `\t`,
	)

	// printTable is an unfinished stub: it ignores both arguments. Its only
	// statement references strconv.Atoi, which keeps the strconv import in use.
	func printTable(name string, table []Inst) {
	_ = strconv.Atoi
	}