// Copyright 2014 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.6 && (!386 || go1.8)
// +build go1.6
// +build !386 go1.8

// ... see golang.org/issue/12840

// Armspec reads the "ARM Architecture Reference Manual"
// to collect instruction encoding details and writes those details to standard output
// in JSON format.
//
// Warning Warning Warning
//
// This program is unfinished. It is being published in this incomplete form
// for interested readers, but do not expect it to be runnable or useful.
//
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"math"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"rsc.io/pdf"
)

// Inst describes one encoding of one instruction, as collected by parsePage.
// Values of this type are marshaled to JSON by main, so the field names and
// their order determine the output format.
type Inst struct {
	Name   string   // instruction headline taken from the page
	ID     string   // encoding label with the "Encoding " prefix removed
	Bits   string   // bit layout produced by readBitBox, fields separated by "|"
	Arch   string   // subarchitecture notes found to the right of the encoding label, space-joined
	Syntax []string // assembler syntax lines; continuation text is joined with a tab, then spaces
	Code   string   // pseudocode, one line per "\n", indented with tabs
}

// debugPage, when nonzero, restricts processing to that single page number,
// enables debug printing of the page's text and rectangles, and suppresses
// the final "missing instruction" report.
const debugPage = 0

// stdout is the buffered writer around os.Stdout that main creates and
// writes the JSON output to.
var stdout *bufio.Writer

// main parses the PDF named by the single command-line argument and writes
// the instruction encodings it finds to standard output as a JSON array,
// one Inst per element.
//
// Diagnostics go to standard error: instructions found on a page but absent
// from the PDF outline are reported as "unexpected", and outline entries for
// which no encodings were emitted are reported as "missing" unless they are
// in the list of known exceptions at the end of the function.
// The program exits with status 2 on a usage error and via log.Fatal when
// the PDF cannot be opened, the outline yields fewer than 200 instructions,
// or the output cannot be encoded or written.
func main() {
	log.SetFlags(0)
	log.SetPrefix("armspec: ")

	if len(os.Args) != 2 {
		fmt.Fprintf(os.Stderr, "usage: armspec file.pdf\n")
		os.Exit(2)
	}

	f, err := pdf.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}

	// Find instruction set reference in outline, to build instruction list.
	instList := instHeadings(f.Outline())
	if len(instList) < 200 {
		log.Fatalf("only found %d instructions in table of contents", len(instList))
	}

	stdout = bufio.NewWriter(os.Stdout)
	fmt.Fprintf(stdout, "[")
	numTable := 0

	// Scan document looking for instructions.
	// Must find exactly the ones in the outline.
	n := f.NumPage()
PageLoop:
	for pageNum := 1; pageNum <= n; pageNum++ {
		if debugPage > 0 && pageNum != debugPage {
			continue
		}
		// Hard-coded cutoff; presumably the last page of the instruction
		// listing in the manual edition this was written for — TODO confirm.
		if pageNum > 1127 {
			break
		}
		p := f.Page(pageNum)
		name, table := parsePage(pageNum, p)
		if name == "" {
			continue
		}
		if len(table) < 1 {
			// Disabled diagnostic; flip to true when debugging pages
			// that have a headline but no encodings.
			if false {
				fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum)
			}
			continue
		}
		for _, inst := range table {
			js, err := json.Marshal(inst)
			if err != nil {
				// Best effort: keep what was produced so far, then stop.
				stdout.Flush()
				log.Fatalf("page %d: cannot encode instruction %q: %v", pageNum, name, err)
			}
			if numTable > 0 {
				fmt.Fprintf(stdout, ",")
			}
			numTable++
			fmt.Fprintf(stdout, "\n%s", jsFix.Replace(string(js)))
		}
		// Mark the matching outline entry as seen by blanking it.
		for j, headline := range instList {
			if name == headline {
				instList[j] = ""
				continue PageLoop
			}
		}
		fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum)
	}

	fmt.Fprintf(stdout, "\n]\n")
	// bufio.Writer remembers the first write error, so checking Flush
	// covers every earlier Fprintf as well.
	if err := stdout.Flush(); err != nil {
		log.Fatalf("writing output: %v", err)
	}

	if debugPage == 0 {
		for _, headline := range instList {
			if headline != "" {
				// Outline entries that are expected to produce no encodings.
				switch headline {
				default:
					fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline)
				case "CHKA": // ThumbEE
				case "CPS": // system instruction
				case "CPY": // synonym for MOV
				case "ENTERX": // ThumbEE
				case "F* (former VFP instruction mnemonics)": // synonyms
				case "HB, HBL, HBLP, HBP": // ThumbEE
				case "LEAVEX": // ThumbEE
				case "MOV (shifted register)": // pseudo instruction for ASR, LSL, LSR, ROR, and RRX
				case "NEG": // synonym for RSB
				case "RFE": // system instruction
				case "SMC (previously SMI)": // system instruction
				case "SRS": // system instruction
				case "SUBS PC, LR and related instructions": // system instruction
				case "VAND (immediate)": // pseudo instruction
				case "VCLE (register)": // pseudo instruction
				case "VCLT (register)": // pseudo instruction
				case "VORN (immediate)": // pseudo instruction
				}
			}
		}
	}
}

// instHeadings returns the instruction headlines listed in the PDF outline,
// as collected by appendInstHeadings starting from an empty list.
func instHeadings(outline pdf.Outline) []string {
	var headings []string
	headings = appendInstHeadings(outline, headings)
	return headings
}

// Patterns used to recognize outline entries and page text.
var (
	// instRE matches the title of the outline node whose children are the
	// individual instructions.
	instRE = regexp.MustCompile(`A[\d.]+ Alphabetical list of instructions`)

	// childRE matches a numbered section title and captures the text
	// after the section number.
	childRE = regexp.MustCompile(`A[\d.]+ (.+)`)

	// sectionRE matches a bare section number such as "A8.8.1".
	sectionRE = regexp.MustCompile(`^A[\d.]+$`)

	// bitRE matches a string made only of spaces, 0/1 digits, and
	// parenthesized 0/1 digits.
	bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`)
)

// appendInstHeadings walks the outline tree rooted at outline and appends to
// list the headline of every direct child of any node whose title matches
// instRE. A child title that childRE cannot parse is reported on standard
// error and skipped. The extended list is returned.
func appendInstHeadings(outline pdf.Outline, list []string) []string {
	if instRE.MatchString(outline.Title) {
		for i := range outline.Child {
			title := outline.Child[i].Title
			if m := childRE.FindStringSubmatch(title); m != nil {
				list = append(list, m[1])
			} else {
				fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", title)
			}
		}
	}
	// Recurse only after this node's own children have been recorded,
	// so that headings keep their order.
	for i := range outline.Child {
		list = appendInstHeadings(outline.Child[i], list)
	}
	return list
}

// inch is the length of one inch in the page coordinate units used by
// this file (72 units per inch).
const inch = 72.0

// parsePage extracts the instruction described on page p, if any.
//
// num is the page number and is used only in diagnostics. The page text is
// normalized, grouped into words by findWords, and stripped of footers and
// section headers; the remainder is then read as: page header, instruction
// headline, description, and a sequence of encodings, each consisting of an
// encoding label, optional architecture notes, syntax lines, one or two bit
// boxes (see readBitBox), and pseudocode.
//
// It returns name == "" and a nil table when the page header is not
// "Instruction Details" or when no instruction headline is found. Otherwise
// it returns the headline and one Inst per encoding; an "Encoding X / Y"
// label produces two entries. Unexpected text is reported on standard error.
func parsePage(num int, p pdf.Page) (name string, table []Inst) {
	content := p.Content()

	var text []pdf.Text
	for _, t := range content.Text {
		// Treat slightly smaller Times-Roman text as ordinary 9-point text.
		if match(t, "Times-Roman", 7.2, "") {
			t.FontSize = 9
		}
		// Small raised digits become Unicode superscripts on the baseline.
		// The emptiness check guards the t.S[0] index.
		if match(t, "Times-Roman", 6.72, "") && t.S != "" && '0' <= t.S[0] && t.S[0] <= '9' {
			t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0'])
			t.FontSize = 9
			t.Y -= 2.28
		}
		if t.Font == "Gen_Arial" {
			continue
		}
		text = append(text, t)
	}

	text = findWords(text)

	for i, t := range text {
		if t.Font == "Times" {
			t.Font = "Times-Roman"
			text[i] = t
		}
	}

	if debugPage > 0 {
		for _, t := range text {
			fmt.Println(t)
		}
		for _, r := range content.Rect {
			fmt.Println(r)
		}
	}

	// Remove text we should ignore.
	out := text[:0]
	skip := false
	for _, t := range text {
		// skip page footer
		if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") {
			continue
		}
		// skip section header and body text
		if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") {
			skip = true
			continue
		}
		if skip && match(t, "Times-Roman", 9, "") {
			continue
		}
		skip = false
		out = append(out, t)
	}
	text = out

	// Page header must say Instruction Details.
	if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") {
		return "", nil
	}
	text = text[1:]

	// isSection reports how many text items starting at index i form a
	// section headline: 2 when the section number and title are separate
	// items, 1 when they are a single item, 0 when there is no headline.
	isSection := func(text []pdf.Text, i int) int {
		if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") {
			return 2
		}
		if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) {
			return 1
		}
		return 0
	}

	// Skip dummy headlines and sections.
	for d := isSection(text, 0); d != 0; d = isSection(text, 0) {
		i := d
		for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") {
			i++
		}
		if isSection(text, i) == 0 {
			break
		}
		text = text[i:]
	}

	// Next line is headline. Can wrap to multiple lines.
	d := isSection(text, 0)
	if d == 0 {
		// text may be empty here, so check before indexing.
		if debugPage > 0 && len(text) > 0 {
			fmt.Printf("non-inst-headline: %v\n", text[0])
		}
		checkNoEncodings(num, text)
		return "", nil
	}
	if d == 2 {
		name = text[1].S
		text = text[2:]
	} else if d == 1 {
		// isSection returned 1, so childRE is known to match.
		m := childRE.FindStringSubmatch(text[0].S)
		name = m[1]
		text = text[1:]
	}
	for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") {
		name += " " + text[0].S
		text = text[1:]
	}

	// Skip description.
	for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) {
		text = text[1:]
	}

	// Encodings follow.
	warned := false
	for i := 0; i < len(text); {
		// Any of these headings ends the encodings part of the page.
		if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") ||
			match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") ||
			match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") ||
			match(text[i], "Helvetica-Bold", 9, "Related encodings") ||
			match(text[i], "Times-Roman", 9, "Figure A") ||
			match(text[i], "Helvetica-Bold", 9, "Table A") ||
			match(text[i], "Helvetica-Bold", 9, "VFP Instructions") ||
			match(text[i], "Helvetica-Bold", 9, "VFP instructions") ||
			match(text[i], "Helvetica-Bold", 9, "VFP vectors") ||
			match(text[i], "Helvetica-Bold", 9, "FLDMX") ||
			match(text[i], "Helvetica-Bold", 9, "FSTMX") ||
			match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") {
			checkNoEncodings(num, text[i:])
			break
		}
		// Skip a figure caption: everything on the same line as "Figure A".
		if match(text[i], "Helvetica-Bold", 9, "Figure A") {
			y := text[i].Y
			i++
			for i < len(text) && math.Abs(text[i].Y-y) < 2 {
				i++
			}
			continue
		}
		if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
			// Report only the first stray item per page.
			if !warned {
				warned = true
				fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i])
			}
			i++
			continue
		}
		inst := Inst{
			Name: name,
		}
		enc := text[i].S
		x := text[i].X
		i++
		// Possible subarchitecture notes.
		for i < len(text) && text[i].X > x+36 {
			if inst.Arch != "" {
				inst.Arch += " "
			}
			inst.Arch += text[i].S
			i++
		}
		// Encoding syntaxes.
		for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) {
			// Text near the left margin starts a new syntax line; text
			// further right continues the previous one. A continuation
			// with no previous line starts a new line instead of
			// indexing an empty slice.
			if text[i].X < x+0.25*inch || len(inst.Syntax) == 0 {
				inst.Syntax = append(inst.Syntax, text[i].S)
			} else {
				s := inst.Syntax[len(inst.Syntax)-1]
				if !strings.Contains(s, "\t") {
					s += "\t"
				} else {
					s += " "
				}
				s += text[i].S
				inst.Syntax[len(inst.Syntax)-1] = s
			}
			i++
		}

		// A label of the form "Encoding X / Y" describes two encodings.
		// The second has its own bit box if one follows; otherwise it
		// shares the first one's bits.
		var bits, abits, aenc string
		bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
		if strings.Contains(enc, " / ") {
			if i < len(text) && match(text[i], "Times-Roman", 8, "") {
				abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
			} else {
				abits = bits
			}
			slash := strings.Index(enc, " / ")
			aenc = "Encoding " + enc[slash+len(" / "):]
			enc = enc[:slash]
		}

		// pseudocode
		y0 := -1 * inch
		tab := 0.0
		for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") {
			t := text[i]
			i++
			if math.Abs(t.Y-y0) < 3 {
				// same line as last fragment, probably just two spaces
				inst.Code += " " + t.S
				continue
			}
			if inst.Code != "" {
				inst.Code += "\n"
			}
			if t.X > x+0.1*inch {
				// The first indented line after an unindented one
				// defines the width of one indentation level.
				if tab == 0 {
					tab = t.X - x
				}
				inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5))
			} else {
				tab = 0
			}
			inst.Code += t.S
			y0 = t.Y
		}

		inst.ID = strings.TrimPrefix(enc, "Encoding ")
		inst.Bits = bits
		table = append(table, inst)
		if abits != "" {
			inst.ID = strings.TrimPrefix(aenc, "Encoding ")
			inst.Bits = abits
			table = append(table, inst)
		}

	}
	return name, table
}

// readBitBox reads one encoding bit diagram starting at text[i].
//
// It consumes the run of 8-point Times-Roman items (bit headings) followed
// by the run of 9-point Times-Roman items (field contents), then uses the
// thin rectangles in content to locate the box and the vertical bars that
// separate its fields. The result describes the fields left to right,
// separated by "|": a field whose word count equals its bit width is written
// as one word per bit, any other field as "text:width".
//
// name and syntax are used only in diagnostics written to standard error.
// The second result is the index of the first text item not consumed.
func readBitBox(name string, syntax []string, content pdf.Content, text []pdf.Text, i int) (string, int) {
	// Bit headings: remember where the heading row starts.
	var headY, leftX, rightX float64
	for ; i < len(text) && match(text[i], "Times-Roman", 8, ""); i++ {
		if headY == 0 {
			headY = text[i].Y
		}
		if leftX == 0 {
			leftX = text[i].X
		}
	}

	// Bit fields in box: track the rightmost edge and the last item's
	// baseline and font size.
	var fieldY, fieldH float64
	for ; i < len(text) && match(text[i], "Times-Roman", 9, ""); i++ {
		if edge := text[i].X + text[i].W; rightX < edge {
			rightX = edge
		}
		fieldY = text[i].Y
		fieldH = text[i].FontSize
	}

	if debugPage > 0 {
		fmt.Println("encoding box", leftX, fieldY, rightX, headY)
	}

	// Find lines (thin rectangles) separating bit fields.
	const (
		yMargin = 0.25 * 72
		xMargin = 2 * 72
	)
	var bottom, top pdf.Rect
	for _, r := range content.Rect {
		flat := r.Max.Y-r.Min.Y < 2
		startsLeft := leftX-xMargin < r.Min.X && r.Min.X < leftX
		endsRight := rightX < r.Max.X && r.Max.X < rightX+xMargin
		if !(flat && startsLeft && endsRight) {
			continue
		}
		if fieldY-yMargin < r.Min.Y && r.Min.Y < fieldY {
			bottom = r
		}
		if fieldY+fieldH < r.Min.Y && r.Min.Y < headY {
			top = r
		}
	}

	if debugPage > 0 {
		fmt.Println("top", top, "bottom", bottom)
	}

	// Vertical bars are narrow rectangles running from the bottom line
	// to the top line.
	const ε = 0.1 * 72
	var bars []pdf.Rect
	for _, r := range content.Rect {
		narrow := r.Max.X-r.Min.X < 2
		atBottom := math.Abs(r.Min.Y-bottom.Min.Y) < ε
		atTop := math.Abs(r.Max.Y-top.Min.Y) < ε
		if narrow && atBottom && atTop {
			bars = append(bars, r)
		}
	}
	sort.Sort(RectHorizontal(bars))

	// There are 16-bit and 32-bit encodings.
	// In practice, they are about 2.65 and 5.3 inches wide, respectively.
	// Use 4 inches as a cutoff.
	dx := top.Max.X - top.Min.X
	nbit := 32
	if dx < 4*72 {
		nbit = 16
	}

	total := 0
	var buf bytes.Buffer
	for k := 0; k+1 < len(bars); k++ {
		if k > 0 {
			buf.WriteString("|")
		}
		// Collect the text whose center lies between this bar and the next.
		lo, hi := bars[k].Min.X, bars[k+1].Min.X
		var inside []pdf.Text
		for _, t := range content.Text {
			cx := t.X + t.W/2
			cy := t.Y + t.FontSize/2
			if lo < cx && cx < hi && fieldY < cy && cy < headY {
				inside = append(inside, t)
			}
		}
		var parts []string
		for _, w := range findWords(inside) {
			parts = append(parts, w.S)
		}
		s := strings.Replace(strings.Join(parts, " "), ")(", ") (", -1)
		fields := strings.Fields(s)
		// Field width in bits, from its share of the box width.
		width := int(float64(nbit)*(hi-lo)/dx + 0.5)
		if len(fields) == width {
			buf.WriteString(strings.Join(fields, "|"))
		} else {
			if len(fields) != 1 {
				fmt.Fprintf(os.Stderr, "%s - %s - multi-field %d-bit encoding: %s\n", name, syntax, len(fields), s)
			}
			fmt.Fprintf(&buf, "%s:%d", s, width)
		}
		total += width
	}

	if total != nbit || total == 0 {
		fmt.Fprintf(os.Stderr, "%s - %s - %d-bit encoding\n", name, syntax, total)
	}
	return buf.String(), i
}

// RectHorizontal implements sort.Interface, ordering rectangles by
// increasing Min.X.
type RectHorizontal []pdf.Rect

// Len returns the number of rectangles.
func (x RectHorizontal) Len() int {
	return len(x)
}

// Less reports whether rectangle i starts to the left of rectangle j.
func (x RectHorizontal) Less(i, j int) bool {
	return x[i].Min.X < x[j].Min.X
}

// Swap exchanges rectangles i and j.
func (x RectHorizontal) Swap(i, j int) {
	x[i], x[j] = x[j], x[i]
}

// checkNoEncodings reports on standard error every item in text that looks
// like an encoding label (9-point Helvetica-Bold containing "Encoding").
// num is the page number shown in the message.
func checkNoEncodings(num int, text []pdf.Text) {
	for i := range text {
		if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
			continue
		}
		fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, text[i].S)
	}
}

// match reports whether t is set in the given font, has a font size within
// 0.1 of size, and contains substr. An empty substr matches any text.
func match(t pdf.Text, font string, size float64, substr string) bool {
	if t.Font != font {
		return false
	}
	if !(math.Abs(t.FontSize-size) < 0.1) {
		return false
	}
	return strings.Contains(t.S, substr)
}

// findWords groups the text fragments in chars into words (really, phrases).
//
// Fragments whose Y coordinates differ by less than one unit are placed on
// the same line. Within a line, a fragment is merged into the current word
// when it has a compatible font (see sameFont) and nearly the same size and
// starts within FontSize/6 of the word's end; it is merged with a separating
// space when it starts within 2*FontSize/3. Anything else starts a new word.
//
// Each returned word carries the font (italic suffix removed), size and
// position of its first fragment, with W set to the full word width.
// chars is sorted and its Y values are normalized in place.
func findWords(chars []pdf.Text) (words []pdf.Text) {
	// Sort by Y coordinate and normalize.
	const nudge = 1
	sort.Sort(pdf.TextVertical(chars))
	old := -100000.0
	for i, c := range chars {
		if c.Y != old && math.Abs(old-c.Y) < nudge {
			chars[i].Y = old
		} else {
			old = c.Y
		}
	}

	// Sort by Y coordinate, breaking ties with X.
	// This will bring letters in a single word together.
	sort.Sort(pdf.TextVertical(chars))

	// Loop over chars.
	for i := 0; i < len(chars); {
		// Find all chars on line.
		j := i + 1
		for j < len(chars) && chars[j].Y == chars[i].Y {
			j++
		}
		// Split line into words (really, phrases).
		for k := i; k < j; {
			ck := &chars[k]
			s := ck.S
			end := ck.X + ck.W
			charSpace := ck.FontSize / 6
			wordSpace := ck.FontSize * 2 / 3
			l := k + 1
		grow:
			for ; l < j; l++ {
				cl := &chars[l]
				// Only fragments in a compatible font and size can join.
				if !sameFont(cl.Font, ck.Font) || !(math.Abs(cl.FontSize-ck.FontSize) < 0.1) {
					break
				}
				switch {
				case cl.X <= end+charSpace:
					// Grow word.
					s += cl.S
				case cl.X <= end+wordSpace:
					// Add space to phrase before next word.
					s += " " + cl.S
				default:
					break grow
				}
				end = cl.X + cl.W
			}
			f := ck.Font
			f = strings.TrimSuffix(f, ",Italic")
			f = strings.TrimSuffix(f, "-Italic")
			// Keyed fields keep this literal valid if pdf.Text changes
			// and satisfy go vet's composites check.
			words = append(words, pdf.Text{
				Font:     f,
				FontSize: ck.FontSize,
				X:        ck.X,
				Y:        ck.Y,
				W:        end - ck.X,
				S:        s,
			})
			k = l
		}
		i = j
	}

	return words
}

// sameFont reports whether text in fonts f1 and f2 may be joined into one
// word. An italic suffix (",Italic" or "-Italic") is ignored on both names,
// and the fonts "Symbol" and "TimesNewRoman" are compatible with any font.
func sameFont(f1, f2 string) bool {
	f1 = strings.TrimSuffix(f1, ",Italic")
	f1 = strings.TrimSuffix(f1, "-Italic")
	// Normalize f2 from f2 itself; deriving it from f1 would make the
	// comparison below always succeed.
	f2 = strings.TrimSuffix(f2, ",Italic")
	f2 = strings.TrimSuffix(f2, "-Italic")
	switch {
	case f1 == f2:
		return true
	case f1 == "Symbol", f2 == "Symbol":
		return true
	case f1 == "TimesNewRoman", f2 == "TimesNewRoman":
		return true
	}
	return false
}

// jsFix is applied by main to the JSON text of each instruction before it
// is written. All of its replacement pairs are commented out, so it is
// built with no arguments. The disabled pairs would turn \uXXXX escapes
// for <, >, & and tab back into the literal character or the short escape.
var jsFix = strings.NewReplacer(
//	`\u003c`, `<`,
//	`\u003e`, `>`,
//	`\u0026`, `&`,
//	`\u0009`, `\t`,
)

// printTable is a stub: it ignores both arguments and produces no output.
func printTable(name string, table []Inst) {
	// Reference strconv so that its import is used; nothing else in the
	// visible file refers to the package.
	_ = strconv.Atoi
}
