| // Copyright 2014 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build !386 go1.8 |
| // ... see golang.org/issue/12840 |
| |
| // Armspec reads the ``ARM Architecture Reference Manual'' |
| // to collect instruction encoding details and writes those details to standard output |
| // in JSON format. |
| // |
| // Warning Warning Warning |
| // |
| // This program is unfinished. It is being published in this incomplete form |
| // for interested readers, but do not expect it to be runnable or useful. |
| // |
| package main |
| |
| import ( |
| "bufio" |
| "bytes" |
| "encoding/json" |
| "fmt" |
| "log" |
| "math" |
| "os" |
| "regexp" |
| "sort" |
| "strconv" |
| "strings" |
| |
| "rsc.io/pdf" |
| ) |
| |
| type Inst struct { |
| Name string |
| ID string |
| Bits string |
| Arch string |
| Syntax []string |
| Code string |
| } |
| |
| const debugPage = 0 |
| |
| var stdout *bufio.Writer |
| |
| func main() { |
| log.SetFlags(0) |
| log.SetPrefix("armspec: ") |
| |
| if len(os.Args) != 2 { |
| fmt.Fprintf(os.Stderr, "usage: armspec file.pdf\n") |
| os.Exit(2) |
| } |
| |
| f, err := pdf.Open(os.Args[1]) |
| if err != nil { |
| log.Fatal(err) |
| } |
| |
| // Find instruction set reference in outline, to build instruction list. |
| instList := instHeadings(f.Outline()) |
| if len(instList) < 200 { |
| log.Fatalf("only found %d instructions in table of contents", len(instList)) |
| } |
| |
| stdout = bufio.NewWriter(os.Stdout) |
| fmt.Fprintf(stdout, "[") |
| numTable := 0 |
| defer stdout.Flush() |
| |
| // Scan document looking for instructions. |
| // Must find exactly the ones in the outline. |
| n := f.NumPage() |
| PageLoop: |
| for pageNum := 1; pageNum <= n; pageNum++ { |
| if debugPage > 0 && pageNum != debugPage { |
| continue |
| } |
| if pageNum > 1127 { |
| break |
| } |
| p := f.Page(pageNum) |
| name, table := parsePage(pageNum, p) |
| if name == "" { |
| continue |
| } |
| if len(table) < 1 { |
| if false { |
| fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum) |
| } |
| continue |
| } |
| for _, inst := range table { |
| if numTable > 0 { |
| fmt.Fprintf(stdout, ",") |
| } |
| numTable++ |
| js, _ := json.Marshal(inst) |
| fmt.Fprintf(stdout, "\n%s", jsFix.Replace(string(js))) |
| } |
| for j, headline := range instList { |
| if name == headline { |
| instList[j] = "" |
| continue PageLoop |
| } |
| } |
| fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum) |
| } |
| |
| fmt.Fprintf(stdout, "\n]\n") |
| stdout.Flush() |
| |
| if debugPage == 0 { |
| for _, headline := range instList { |
| if headline != "" { |
| switch headline { |
| default: |
| fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline) |
| case "CHKA": // ThumbEE |
| case "CPS": // system instruction |
| case "CPY": // synonym for MOV |
| case "ENTERX": // ThumbEE |
| case "F* (former VFP instruction mnemonics)": // synonyms |
| case "HB, HBL, HBLP, HBP": // ThumbEE |
| case "LEAVEX": // ThumbEE |
| case "MOV (shifted register)": // pseudo instruction for ASR, LSL, LSR, ROR, and RRX |
| case "NEG": // synonym for RSB |
| case "RFE": // system instruction |
| case "SMC (previously SMI)": // system instruction |
| case "SRS": // system instruction |
| case "SUBS PC, LR and related instructions": // system instruction |
| case "VAND (immediate)": // pseudo instruction |
| case "VCLE (register)": // pseudo instruction |
| case "VCLT (register)": // pseudo instruction |
| case "VORN (immediate)": // pseudo instruction |
| } |
| } |
| } |
| } |
| } |
| |
| func instHeadings(outline pdf.Outline) []string { |
| return appendInstHeadings(outline, nil) |
| } |
| |
| var instRE = regexp.MustCompile(`A[\d.]+ Alphabetical list of instructions`) |
| var childRE = regexp.MustCompile(`A[\d.]+ (.+)`) |
| var sectionRE = regexp.MustCompile(`^A[\d.]+$`) |
| var bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`) |
| |
| func appendInstHeadings(outline pdf.Outline, list []string) []string { |
| if instRE.MatchString(outline.Title) { |
| for _, child := range outline.Child { |
| m := childRE.FindStringSubmatch(child.Title) |
| if m == nil { |
| fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", child.Title) |
| continue |
| } |
| list = append(list, m[1]) |
| } |
| } |
| for _, child := range outline.Child { |
| list = appendInstHeadings(child, list) |
| } |
| return list |
| } |
| |
| const inch = 72.0 |
| |
| func parsePage(num int, p pdf.Page) (name string, table []Inst) { |
| content := p.Content() |
| |
| var text []pdf.Text |
| for _, t := range content.Text { |
| if match(t, "Times-Roman", 7.2, "") { |
| t.FontSize = 9 |
| } |
| if match(t, "Times-Roman", 6.72, "") && '0' <= t.S[0] && t.S[0] <= '9' { |
| t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0']) |
| t.FontSize = 9 |
| t.Y -= 2.28 |
| } |
| if t.Font == "Gen_Arial" { |
| continue |
| } |
| text = append(text, t) |
| } |
| |
| text = findWords(text) |
| |
| for i, t := range text { |
| if t.Font == "Times" { |
| t.Font = "Times-Roman" |
| text[i] = t |
| } |
| } |
| |
| if debugPage > 0 { |
| for _, t := range text { |
| fmt.Println(t) |
| } |
| for _, r := range content.Rect { |
| fmt.Println(r) |
| } |
| } |
| |
| // Remove text we should ignore. |
| out := text[:0] |
| skip := false |
| for _, t := range text { |
| // skip page footer |
| if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") { |
| continue |
| } |
| // skip section header and body text |
| if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") { |
| skip = true |
| continue |
| } |
| if skip && match(t, "Times-Roman", 9, "") { |
| continue |
| } |
| skip = false |
| out = append(out, t) |
| } |
| text = out |
| |
| // Page header must say Instruction Details. |
| if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") { |
| return "", nil |
| } |
| text = text[1:] |
| |
| isSection := func(text []pdf.Text, i int) int { |
| if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") { |
| return 2 |
| } |
| if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) { |
| return 1 |
| } |
| return 0 |
| } |
| |
| // Skip dummy headlines and sections. |
| for d := isSection(text, 0); d != 0; d = isSection(text, 0) { |
| i := d |
| for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") { |
| i++ |
| } |
| if isSection(text, i) == 0 { |
| break |
| } |
| text = text[i:] |
| } |
| |
| // Next line is headline. Can wrap to multiple lines. |
| d := isSection(text, 0) |
| if d == 0 { |
| if debugPage > 0 { |
| fmt.Printf("non-inst-headline: %v\n", text[0]) |
| } |
| checkNoEncodings(num, text) |
| return "", nil |
| } |
| if d == 2 { |
| name = text[1].S |
| text = text[2:] |
| } else if d == 1 { |
| m := childRE.FindStringSubmatch(text[0].S) |
| name = m[1] |
| text = text[1:] |
| } |
| for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") { |
| name += " " + text[0].S |
| text = text[1:] |
| } |
| |
| // Skip description. |
| for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) { |
| text = text[1:] |
| } |
| |
| // Encodings follow. |
| warned := false |
| for i := 0; i < len(text); { |
| if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") || |
| match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") || |
| match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") || |
| match(text[i], "Helvetica-Bold", 9, "Related encodings") || |
| match(text[i], "Times-Roman", 9, "Figure A") || |
| match(text[i], "Helvetica-Bold", 9, "Table A") || |
| match(text[i], "Helvetica-Bold", 9, "VFP Instructions") || |
| match(text[i], "Helvetica-Bold", 9, "VFP instructions") || |
| match(text[i], "Helvetica-Bold", 9, "VFP vectors") || |
| match(text[i], "Helvetica-Bold", 9, "FLDMX") || |
| match(text[i], "Helvetica-Bold", 9, "FSTMX") || |
| match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") { |
| checkNoEncodings(num, text[i:]) |
| break |
| } |
| if match(text[i], "Helvetica-Bold", 9, "Figure A") { |
| y := text[i].Y |
| i++ |
| for i < len(text) && math.Abs(text[i].Y-y) < 2 { |
| i++ |
| } |
| continue |
| } |
| if !match(text[i], "Helvetica-Bold", 9, "Encoding") { |
| if !warned { |
| warned = true |
| fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i]) |
| } |
| i++ |
| continue |
| } |
| inst := Inst{ |
| Name: name, |
| } |
| enc := text[i].S |
| x := text[i].X |
| i++ |
| // Possible subarchitecture notes. |
| for i < len(text) && text[i].X > x+36 { |
| if inst.Arch != "" { |
| inst.Arch += " " |
| } |
| inst.Arch += text[i].S |
| i++ |
| } |
| // Encoding syntaxes. |
| for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) { |
| if text[i].X < x+0.25*inch { |
| inst.Syntax = append(inst.Syntax, text[i].S) |
| } else { |
| s := inst.Syntax[len(inst.Syntax)-1] |
| if !strings.Contains(s, "\t") { |
| s += "\t" |
| } else { |
| s += " " |
| } |
| s += text[i].S |
| inst.Syntax[len(inst.Syntax)-1] = s |
| } |
| i++ |
| } |
| |
| var bits, abits, aenc string |
| bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i) |
| if strings.Contains(enc, " / ") { |
| if i < len(text) && match(text[i], "Times-Roman", 8, "") { |
| abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i) |
| } else { |
| abits = bits |
| } |
| slash := strings.Index(enc, " / ") |
| aenc = "Encoding " + enc[slash+len(" / "):] |
| enc = enc[:slash] |
| } |
| |
| // pseudocode |
| y0 := -1 * inch |
| tab := 0.0 |
| for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") { |
| t := text[i] |
| i++ |
| if math.Abs(t.Y-y0) < 3 { |
| // same line as last fragment, probably just two spaces |
| inst.Code += " " + t.S |
| continue |
| } |
| if inst.Code != "" { |
| inst.Code += "\n" |
| } |
| if t.X > x+0.1*inch { |
| if tab == 0 { |
| tab = t.X - x |
| } |
| inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5)) |
| } else { |
| tab = 0 |
| } |
| inst.Code += t.S |
| y0 = t.Y |
| } |
| |
| inst.ID = strings.TrimPrefix(enc, "Encoding ") |
| inst.Bits = bits |
| table = append(table, inst) |
| if abits != "" { |
| inst.ID = strings.TrimPrefix(aenc, "Encoding ") |
| inst.Bits = abits |
| table = append(table, inst) |
| } |
| |
| } |
| return name, table |
| } |
| |
| func readBitBox(name string, syntax []string, content pdf.Content, text []pdf.Text, i int) (string, int) { |
| // bit headings |
| y2 := 0.0 |
| x1 := 0.0 |
| x2 := 0.0 |
| for i < len(text) && match(text[i], "Times-Roman", 8, "") { |
| if y2 == 0 { |
| y2 = text[i].Y |
| } |
| if x1 == 0 { |
| x1 = text[i].X |
| } |
| i++ |
| } |
| // bit fields in box |
| y1 := 0.0 |
| dy1 := 0.0 |
| for i < len(text) && match(text[i], "Times-Roman", 9, "") { |
| if x2 < text[i].X+text[i].W { |
| x2 = text[i].X + text[i].W |
| } |
| y1 = text[i].Y |
| dy1 = text[i].FontSize |
| i++ |
| } |
| |
| if debugPage > 0 { |
| fmt.Println("encoding box", x1, y1, x2, y2) |
| } |
| |
| // Find lines (thin rectangles) separating bit fields. |
| var bottom, top pdf.Rect |
| const ( |
| yMargin = 0.25 * 72 |
| xMargin = 2 * 72 |
| ) |
| for _, r := range content.Rect { |
| if r.Max.Y-r.Min.Y < 2 && x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin { |
| if y1-yMargin < r.Min.Y && r.Min.Y < y1 { |
| bottom = r |
| } |
| if y1+dy1 < r.Min.Y && r.Min.Y < y2 { |
| top = r |
| } |
| } |
| } |
| |
| if debugPage > 0 { |
| fmt.Println("top", top, "bottom", bottom) |
| } |
| |
| const ε = 0.1 * 72 |
| var bars []pdf.Rect |
| for _, r := range content.Rect { |
| if r.Max.X-r.Min.X < 2 && math.Abs(r.Min.Y-bottom.Min.Y) < ε && math.Abs(r.Max.Y-top.Min.Y) < ε { |
| bars = append(bars, r) |
| } |
| } |
| sort.Sort(RectHorizontal(bars)) |
| |
| // There are 16-bit and 32-bit encodings. |
| // In practice, they are about 2.65 and 5.3 inches wide, respectively. |
| // Use 4 inches as a cutoff. |
| nbit := 32 |
| dx := top.Max.X - top.Min.X |
| if top.Max.X-top.Min.X < 4*72 { |
| nbit = 16 |
| } |
| |
| total := 0 |
| var buf bytes.Buffer |
| for i := 0; i < len(bars)-1; i++ { |
| if i > 0 { |
| fmt.Fprintf(&buf, "|") |
| } |
| var sub []pdf.Text |
| x1, x2 := bars[i].Min.X, bars[i+1].Min.X |
| for _, t := range content.Text { |
| tx := t.X + t.W/2 |
| ty := t.Y + t.FontSize/2 |
| if x1 < tx && tx < x2 && y1 < ty && ty < y2 { |
| sub = append(sub, t) |
| } |
| } |
| var str []string |
| for _, t := range findWords(sub) { |
| str = append(str, t.S) |
| } |
| s := strings.Join(str, " ") |
| s = strings.Replace(s, ")(", ") (", -1) |
| n := len(strings.Fields(s)) |
| b := int(float64(nbit)*(x2-x1)/dx + 0.5) |
| if n == b { |
| for j, f := range strings.Fields(s) { |
| if j > 0 { |
| fmt.Fprintf(&buf, "|") |
| } |
| fmt.Fprintf(&buf, "%s", f) |
| } |
| } else { |
| if n != 1 { |
| fmt.Fprintf(os.Stderr, "%s - %s - multi-field %d-bit encoding: %s\n", name, syntax, n, s) |
| } |
| fmt.Fprintf(&buf, "%s:%d", s, b) |
| } |
| total += b |
| } |
| |
| if total != nbit || total == 0 { |
| fmt.Fprintf(os.Stderr, "%s - %s - %d-bit encoding\n", name, syntax, total) |
| } |
| return buf.String(), i |
| } |
| |
| type RectHorizontal []pdf.Rect |
| |
| func (x RectHorizontal) Swap(i, j int) { x[i], x[j] = x[j], x[i] } |
| func (x RectHorizontal) Less(i, j int) bool { return x[i].Min.X < x[j].Min.X } |
| func (x RectHorizontal) Len() int { return len(x) } |
| |
| func checkNoEncodings(num int, text []pdf.Text) { |
| for _, t := range text { |
| if match(t, "Helvetica-Bold", 9, "Encoding") { |
| fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, t.S) |
| } |
| } |
| } |
| |
| func match(t pdf.Text, font string, size float64, substr string) bool { |
| return t.Font == font && math.Abs(t.FontSize-size) < 0.1 && strings.Contains(t.S, substr) |
| } |
| |
| func findWords(chars []pdf.Text) (words []pdf.Text) { |
| // Sort by Y coordinate and normalize. |
| const nudge = 1 |
| sort.Sort(pdf.TextVertical(chars)) |
| old := -100000.0 |
| for i, c := range chars { |
| if c.Y != old && math.Abs(old-c.Y) < nudge { |
| chars[i].Y = old |
| } else { |
| old = c.Y |
| } |
| } |
| |
| // Sort by Y coordinate, breaking ties with X. |
| // This will bring letters in a single word together. |
| sort.Sort(pdf.TextVertical(chars)) |
| |
| // Loop over chars. |
| for i := 0; i < len(chars); { |
| // Find all chars on line. |
| j := i + 1 |
| for j < len(chars) && chars[j].Y == chars[i].Y { |
| j++ |
| } |
| var end float64 |
| // Split line into words (really, phrases). |
| for k := i; k < j; { |
| ck := &chars[k] |
| s := ck.S |
| end = ck.X + ck.W |
| charSpace := ck.FontSize / 6 |
| wordSpace := ck.FontSize * 2 / 3 |
| l := k + 1 |
| for l < j { |
| // Grow word. |
| cl := &chars[l] |
| if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+charSpace { |
| s += cl.S |
| end = cl.X + cl.W |
| l++ |
| continue |
| } |
| // Add space to phrase before next word. |
| if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+wordSpace { |
| s += " " + cl.S |
| end = cl.X + cl.W |
| l++ |
| continue |
| } |
| break |
| } |
| f := ck.Font |
| f = strings.TrimSuffix(f, ",Italic") |
| f = strings.TrimSuffix(f, "-Italic") |
| words = append(words, pdf.Text{f, ck.FontSize, ck.X, ck.Y, end - ck.X, s}) |
| k = l |
| } |
| i = j |
| } |
| |
| return words |
| } |
| |
| func sameFont(f1, f2 string) bool { |
| f1 = strings.TrimSuffix(f1, ",Italic") |
| f1 = strings.TrimSuffix(f1, "-Italic") |
| f2 = strings.TrimSuffix(f1, ",Italic") |
| f2 = strings.TrimSuffix(f1, "-Italic") |
| return strings.TrimSuffix(f1, ",Italic") == strings.TrimSuffix(f2, ",Italic") || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman" |
| } |
| |
| var jsFix = strings.NewReplacer( |
| // `\u003c`, `<`, |
| // `\u003e`, `>`, |
| // `\u0026`, `&`, |
| // `\u0009`, `\t`, |
| ) |
| |
| func printTable(name string, table []Inst) { |
| _ = strconv.Atoi |
| } |