font/sfnt: parse the cmap table.
Change-Id: I757d42c9caf419f549696543f0f156cfe3dbfe1a
Reviewed-on: https://go-review.googlesource.com/35512
Reviewed-by: David Crawshaw <crawshaw@golang.org>
diff --git a/font/sfnt/cmap.go b/font/sfnt/cmap.go
new file mode 100644
index 0000000..19b0a9d
--- /dev/null
+++ b/font/sfnt/cmap.go
@@ -0,0 +1,198 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sfnt
+
+import (
+ "unicode/utf8"
+
+ "golang.org/x/text/encoding/charmap"
+)
+
+// Platform IDs and Platform Specific IDs as per
+// https://www.microsoft.com/typography/otspec/name.htm
+const (
+ pidUnicode = 0
+ pidMacintosh = 1
+ pidWindows = 3
+
+ psidUnicode2BMPOnly = 3
+ psidUnicode2FullRepertoire = 4
+
+ psidMacintoshRoman = 0
+
+ psidWindowsUCS2 = 1
+ psidWindowsUCS4 = 10
+)
+
+// platformEncodingWidth returns the number of bytes per character assumed by
+// the given Platform ID and Platform Specific ID.
+//
+// Very old fonts, from before Unicode was widely adopted, assume only 1 byte
+// per character: a character map.
+//
+// Old fonts, from when Unicode meant the Basic Multilingual Plane (BMP),
+// assume that 2 bytes per character is sufficient.
+//
+// Recent fonts naturally support the full range of Unicode code points, which
+// can take up to 4 bytes per character. Such fonts might still choose one of
+// the legacy encodings if e.g. their repertoire is limited to the BMP, for
+// greater compatibility with older software, or because the resultant file
+// size can be smaller.
+func platformEncodingWidth(pid, psid uint16) int {
+ switch pid {
+ case pidUnicode:
+ switch psid {
+ case psidUnicode2BMPOnly:
+ return 2
+ case psidUnicode2FullRepertoire:
+ return 4
+ }
+
+ case pidMacintosh:
+ switch psid {
+ case psidMacintoshRoman:
+ return 1
+ }
+
+ case pidWindows:
+ switch psid {
+ case psidWindowsUCS2:
+ return 2
+ case psidWindowsUCS4:
+ return 4
+ }
+ }
+ return 0
+}
+
+// The various cmap formats are described at
+// https://www.microsoft.com/typography/otspec/cmap.htm
+
+var supportedCmapFormat = func(format, pid, psid uint16) bool {
+ switch format {
+ case 0:
+ return pid == pidMacintosh && psid == psidMacintoshRoman
+ case 4:
+ return true
+ case 12:
+ // TODO: implement.
+ }
+ return false
+}
+
+func (f *Font) makeCachedGlyphIndex(buf []byte, offset, length uint32, format uint16) ([]byte, error) {
+ switch format {
+ case 0:
+ return f.makeCachedGlyphIndexFormat0(buf, offset, length)
+ case 4:
+ return f.makeCachedGlyphIndexFormat4(buf, offset, length)
+ case 12:
+ // TODO: implement, including a cmapEntry32 type (32, not 16).
+ }
+ panic("unreachable")
+}
+
+func (f *Font) makeCachedGlyphIndexFormat0(buf []byte, offset, length uint32) ([]byte, error) {
+ if length != 6+256 || offset+length > f.cmap.length {
+ return nil, errInvalidCmapTable
+ }
+ var err error
+ buf, err = f.src.view(buf, int(f.cmap.offset+offset), int(length))
+ if err != nil {
+ return nil, err
+ }
+ var table [256]byte
+ copy(table[:], buf[6:])
+ f.cached.glyphIndex = func(f *Font, b *Buffer, r rune) (GlyphIndex, error) {
+ // TODO: for this closure to be goroutine-safe, the
+ // golang.org/x/text/encoding/charmap API needs to allocate a new
+ // Encoder and new []byte buffers, for every call to this closure, even
+ // though all we want to do is to encode one rune as one byte. We could
+ // possibly add some fields in the Buffer struct to re-use these
+ // allocations, but a better solution is to improve the charmap API.
+ var dst, src [utf8.UTFMax]byte
+ n := utf8.EncodeRune(src[:], r)
+ _, _, err = charmap.Macintosh.NewEncoder().Transform(dst[:], src[:n], true)
+ if err != nil {
+ // The source rune r is not representable in the Macintosh-Roman encoding.
+ return 0, nil
+ }
+ return GlyphIndex(table[dst[0]]), nil
+ }
+ return buf, nil
+}
+
+func (f *Font) makeCachedGlyphIndexFormat4(buf []byte, offset, length uint32) ([]byte, error) {
+ const headerSize = 14
+ if offset+headerSize > f.cmap.length {
+ return nil, errInvalidCmapTable
+ }
+ var err error
+ buf, err = f.src.view(buf, int(f.cmap.offset+offset), headerSize)
+ if err != nil {
+ return nil, err
+ }
+ offset += headerSize
+
+ segCount := u16(buf[6:])
+ if segCount&1 != 0 {
+ return nil, errInvalidCmapTable
+ }
+ segCount /= 2
+ if segCount > maxCmapSegments {
+ return nil, errUnsupportedNumberOfCmapSegments
+ }
+
+ eLength := 8*uint32(segCount) + 2
+ if offset+eLength > f.cmap.length {
+ return nil, errInvalidCmapTable
+ }
+ buf, err = f.src.view(buf, int(f.cmap.offset+offset), int(eLength))
+ if err != nil {
+ return nil, err
+ }
+ offset += eLength
+
+ entries := make([]cmapEntry16, segCount)
+ for i := range entries {
+ entries[i] = cmapEntry16{
+ end: u16(buf[0*len(entries)+0+2*i:]),
+ start: u16(buf[2*len(entries)+2+2*i:]),
+ delta: u16(buf[4*len(entries)+2+2*i:]),
+ offset: u16(buf[6*len(entries)+2+2*i:]),
+ }
+ }
+
+ f.cached.glyphIndex = func(f *Font, b *Buffer, r rune) (GlyphIndex, error) {
+ if uint32(r) > 0xffff {
+ return 0, nil
+ }
+
+ c := uint16(r)
+ for i, j := 0, len(entries); i < j; {
+ h := i + (j-i)/2
+ entry := &entries[h]
+ if c < entry.start {
+ j = h
+ } else if entry.end < c {
+ i = h + 1
+ } else if entry.offset == 0 {
+ return GlyphIndex(c + entry.delta), nil
+ } else {
+ // TODO: support the glyphIdArray as per
+ // https://www.microsoft.com/typography/OTSPEC/cmap.htm
+ //
+ // This will probably use the *Font and *Buffer arguments.
+ return 0, errUnsupportedCmapFormat
+ }
+ }
+ return 0, nil
+ }
+ return buf, nil
+}
+
+type cmapEntry16 struct {
+ end, start, delta, offset uint16
+}
diff --git a/font/sfnt/sfnt.go b/font/sfnt/sfnt.go
index d4929a8..83065b1 100644
--- a/font/sfnt/sfnt.go
+++ b/font/sfnt/sfnt.go
@@ -13,6 +13,9 @@
//
// The pyftinspect tool from https://github.com/fonttools/fonttools is useful
// for inspecting SFNT fonts.
+//
+// The ttfdump tool is also useful. For example:
+// ttfdump -t cmap ../testdata/CFFTest.otf dump.txt
import (
"errors"
@@ -25,6 +28,7 @@
// These constants are not part of the specifications, but are limitations used
// by this implementation.
const (
+ maxCmapSegments = 1024
maxGlyphDataLength = 64 * 1024
maxHintBits = 256
maxNumTables = 256
@@ -41,6 +45,7 @@
errInvalidBounds = errors.New("sfnt: invalid bounds")
errInvalidCFFTable = errors.New("sfnt: invalid CFF table")
+ errInvalidCmapTable = errors.New("sfnt: invalid cmap table")
errInvalidGlyphData = errors.New("sfnt: invalid glyph data")
errInvalidHeadTable = errors.New("sfnt: invalid head table")
errInvalidLocaTable = errors.New("sfnt: invalid loca table")
@@ -53,15 +58,18 @@
errInvalidUCS2String = errors.New("sfnt: invalid UCS-2 string")
errInvalidVersion = errors.New("sfnt: invalid version")
- errUnsupportedCFFVersion = errors.New("sfnt: unsupported CFF version")
- errUnsupportedCompoundGlyph = errors.New("sfnt: unsupported compound glyph")
- errUnsupportedGlyphDataLength = errors.New("sfnt: unsupported glyph data length")
- errUnsupportedRealNumberEncoding = errors.New("sfnt: unsupported real number encoding")
- errUnsupportedNumberOfHints = errors.New("sfnt: unsupported number of hints")
- errUnsupportedNumberOfTables = errors.New("sfnt: unsupported number of tables")
- errUnsupportedPlatformEncoding = errors.New("sfnt: unsupported platform encoding")
- errUnsupportedTableOffsetLength = errors.New("sfnt: unsupported table offset or length")
- errUnsupportedType2Charstring = errors.New("sfnt: unsupported Type 2 Charstring")
+ errUnsupportedCFFVersion = errors.New("sfnt: unsupported CFF version")
+ errUnsupportedCmapEncodings = errors.New("sfnt: unsupported cmap encodings")
+ errUnsupportedCmapFormat = errors.New("sfnt: unsupported cmap format")
+ errUnsupportedCompoundGlyph = errors.New("sfnt: unsupported compound glyph")
+ errUnsupportedGlyphDataLength = errors.New("sfnt: unsupported glyph data length")
+ errUnsupportedRealNumberEncoding = errors.New("sfnt: unsupported real number encoding")
+ errUnsupportedNumberOfCmapSegments = errors.New("sfnt: unsupported number of cmap segments")
+ errUnsupportedNumberOfHints = errors.New("sfnt: unsupported number of hints")
+ errUnsupportedNumberOfTables = errors.New("sfnt: unsupported number of tables")
+ errUnsupportedPlatformEncoding = errors.New("sfnt: unsupported platform encoding")
+ errUnsupportedTableOffsetLength = errors.New("sfnt: unsupported table offset or length")
+ errUnsupportedType2Charstring = errors.New("sfnt: unsupported Type 2 Charstring")
)
// GlyphIndex is a glyph index in a Font.
@@ -107,17 +115,6 @@
// display resolution (DPI) and font size (e.g. a 12 point font).
type Units int32
-// Platform IDs and Platform Specific IDs as per
-// https://www.microsoft.com/typography/otspec/name.htm
-const (
- pidMacintosh = 1
- pidWindows = 3
-
- psidMacintoshRoman = 0
-
- psidWindowsUCS2 = 1
-)
-
func u16(b []byte) uint16 {
_ = b[1] // Bounds check hint to compiler.
return uint16(b[0])<<8 | uint16(b[1])<<0
@@ -286,6 +283,7 @@
// TODO: hdmx, kern, vmtx? Others?
cached struct {
+ glyphIndex func(f *Font, b *Buffer, r rune) (GlyphIndex, error)
indexToLocFormat bool // false means short, true means long.
isPostScript bool
unitsPerEm Units
@@ -306,18 +304,36 @@
if !f.src.valid() {
return errInvalidSourceData
}
- var buf []byte
+ buf, err := f.initializeTables(nil)
+ if err != nil {
+ return err
+ }
+ buf, err = f.parseHead(buf)
+ if err != nil {
+ return err
+ }
+ buf, err = f.parseMaxp(buf)
+ if err != nil {
+ return err
+ }
+ buf, err = f.parseCmap(buf)
+ if err != nil {
+ return err
+ }
+ return nil
+}
+func (f *Font) initializeTables(buf []byte) ([]byte, error) {
// https://www.microsoft.com/typography/otspec/otff.htm "Organization of an
// OpenType Font" says that "The OpenType font starts with the Offset
// Table", which is 12 bytes.
buf, err := f.src.view(buf, 0, 12)
if err != nil {
- return err
+ return nil, err
}
switch u32(buf) {
default:
- return errInvalidVersion
+ return nil, errInvalidVersion
case 0x00010000:
// No-op.
case 0x4f54544f: // "OTTO".
@@ -325,32 +341,32 @@
}
numTables := int(u16(buf[4:]))
if numTables > maxNumTables {
- return errUnsupportedNumberOfTables
+ return nil, errUnsupportedNumberOfTables
}
// "The Offset Table is followed immediately by the Table Record entries...
// sorted in ascending order by tag", 16 bytes each.
buf, err = f.src.view(buf, 12, 16*numTables)
if err != nil {
- return err
+ return nil, err
}
for b, first, prevTag := buf, true, uint32(0); len(b) > 0; b = b[16:] {
tag := u32(b)
if first {
first = false
} else if tag <= prevTag {
- return errInvalidTableTagOrder
+ return nil, errInvalidTableTagOrder
}
prevTag = tag
o, n := u32(b[8:12]), u32(b[12:16])
if o > maxTableOffset || n > maxTableLength {
- return errUnsupportedTableOffsetLength
+ return nil, errUnsupportedTableOffsetLength
}
// We ignore the checksums, but "all tables must begin on four byte
// boundries [sic]".
if o&3 != 0 {
- return errInvalidTableOffset
+ return nil, errInvalidTableOffset
}
// Match the 4-byte tag as a uint32. For example, "OS/2" is 0x4f532f32.
@@ -379,40 +395,109 @@
f.post = table{o, n}
}
}
+ return buf, nil
+}
- var u uint16
+func (f *Font) parseCmap(buf []byte) ([]byte, error) {
+ // https://www.microsoft.com/typography/OTSPEC/cmap.htm
- // https://www.microsoft.com/typography/otspec/head.htm
- if f.head.length != 54 {
- return errInvalidHeadTable
+ const headerSize, entrySize = 4, 8
+ if f.cmap.length < headerSize {
+ return nil, errInvalidCmapTable
}
- u, err = f.src.u16(buf, f.head, 18)
+ u, err := f.src.u16(buf, f.cmap, 2)
if err != nil {
- return err
+ return nil, err
+ }
+ numSubtables := int(u)
+ if f.cmap.length < headerSize+entrySize*uint32(numSubtables) {
+ return nil, errInvalidCmapTable
+ }
+
+ var (
+ bestWidth int
+ bestOffset uint32
+ bestLength uint32
+ bestFormat uint16
+ )
+
+ // Scan all of the subtables, picking the widest supported one. See the
+ // platformEncodingWidth comment for more discussion of width.
+ for i := 0; i < numSubtables; i++ {
+ buf, err = f.src.view(buf, int(f.cmap.offset)+headerSize+entrySize*i, entrySize)
+ if err != nil {
+ return nil, err
+ }
+ pid := u16(buf)
+ psid := u16(buf[2:])
+ width := platformEncodingWidth(pid, psid)
+ if width <= bestWidth {
+ continue
+ }
+ offset := u32(buf[4:])
+
+ if offset > f.cmap.length-4 {
+ return nil, errInvalidCmapTable
+ }
+ buf, err = f.src.view(buf, int(f.cmap.offset+offset), 4)
+ if err != nil {
+ return nil, err
+ }
+ format := u16(buf)
+ if !supportedCmapFormat(format, pid, psid) {
+ continue
+ }
+ length := uint32(u16(buf[2:]))
+
+ bestWidth = width
+ bestOffset = offset
+ bestLength = length
+ bestFormat = format
+ }
+
+ if bestWidth == 0 {
+ return nil, errUnsupportedCmapEncodings
+ }
+ return f.makeCachedGlyphIndex(buf, bestOffset, bestLength, bestFormat)
+}
+
+func (f *Font) parseHead(buf []byte) ([]byte, error) {
+ // https://www.microsoft.com/typography/otspec/head.htm
+
+ if f.head.length != 54 {
+ return nil, errInvalidHeadTable
+ }
+ u, err := f.src.u16(buf, f.head, 18)
+ if err != nil {
+ return nil, err
}
if u == 0 {
- return errInvalidHeadTable
+ return nil, errInvalidHeadTable
}
f.cached.unitsPerEm = Units(u)
u, err = f.src.u16(buf, f.head, 50)
if err != nil {
- return err
+ return nil, err
}
f.cached.indexToLocFormat = u != 0
+ return buf, nil
+}
+func (f *Font) parseMaxp(buf []byte) ([]byte, error) {
// https://www.microsoft.com/typography/otspec/maxp.htm
+
if f.cached.isPostScript {
if f.maxp.length != 6 {
- return errInvalidMaxpTable
+ return nil, errInvalidMaxpTable
}
} else {
if f.maxp.length != 32 {
- return errInvalidMaxpTable
+ return nil, errInvalidMaxpTable
}
}
- u, err = f.src.u16(buf, f.maxp, 4)
+ u, err := f.src.u16(buf, f.maxp, 4)
if err != nil {
- return err
+ return nil, err
}
numGlyphs := int(u)
@@ -425,23 +510,36 @@
}
f.cached.locations, err = p.parse()
if err != nil {
- return err
+ return nil, err
}
} else {
f.cached.locations, err = parseLoca(
&f.src, f.loca, f.glyf.offset, f.cached.indexToLocFormat, numGlyphs)
if err != nil {
- return err
+ return nil, err
}
}
if len(f.cached.locations) != numGlyphs+1 {
- return errInvalidLocationData
+ return nil, errInvalidLocationData
}
- return nil
+
+ return buf, nil
}
-// TODO: func (f *Font) GlyphIndex(r rune) (x GlyphIndex, ok bool)
-// This will require parsing the cmap table.
+// TODO: API for looking up glyph variants?? For example, some fonts may
+// provide both slashed and dotted zero glyphs ('0'), or regular and 'old
+// style' numerals, and users can direct software to choose a variant.
+
+// GlyphIndex returns the glyph index for the given rune.
+//
+// It returns (0, nil) if there is no glyph for r.
+// https://www.microsoft.com/typography/OTSPEC/cmap.htm says that "Character
+// codes that do not correspond to any glyph in the font should be mapped to
+// glyph index 0. The glyph at this location must be a special glyph
+// representing a missing character, commonly known as .notdef."
+func (f *Font) GlyphIndex(b *Buffer, r rune) (GlyphIndex, error) {
+ return f.cached.glyphIndex(f, b, r)
+}
func (f *Font) viewGlyphData(b *Buffer, x GlyphIndex) ([]byte, error) {
xx := int(x)
@@ -512,14 +610,14 @@
if err != nil {
return "", err
}
- nSubtables := u16(buf[2:])
- if f.name.length < headerSize+entrySize*uint32(nSubtables) {
+ numSubtables := u16(buf[2:])
+ if f.name.length < headerSize+entrySize*uint32(numSubtables) {
return "", errInvalidNameTable
}
stringOffset := u16(buf[4:])
seen := false
- for i, n := 0, int(nSubtables); i < n; i++ {
+ for i, n := 0, int(numSubtables); i < n; i++ {
buf, err := b.view(&f.src, int(f.name.offset)+headerSize+entrySize*i, entrySize)
if err != nil {
return "", err
diff --git a/font/sfnt/sfnt_test.go b/font/sfnt/sfnt_test.go
index 4ffd72f..4b72221 100644
--- a/font/sfnt/sfnt_test.go
+++ b/font/sfnt/sfnt_test.go
@@ -88,6 +88,74 @@
}
}
+func TestGlyphIndex(t *testing.T) {
+ data, err := ioutil.ReadFile(filepath.FromSlash("../testdata/CFFTest.otf"))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ for _, format := range []int{-1, 0, 4} {
+ testGlyphIndex(t, data, format)
+ }
+}
+
+func testGlyphIndex(t *testing.T, data []byte, cmapFormat int) {
+ if cmapFormat >= 0 {
+ originalSupportedCmapFormat := supportedCmapFormat
+ defer func() {
+ supportedCmapFormat = originalSupportedCmapFormat
+ }()
+ supportedCmapFormat = func(format, pid, psid uint16) bool {
+ return int(format) == cmapFormat && originalSupportedCmapFormat(format, pid, psid)
+ }
+ }
+
+ f, err := Parse(data)
+ if err != nil {
+ t.Errorf("cmapFormat=%d: %v", cmapFormat, err)
+ return
+ }
+
+ testCases := []struct {
+ r rune
+ want GlyphIndex
+ }{
+ {'0', 1},
+ {'1', 2},
+ {'Q', 3},
+ // TODO: add the U+00E0 non-ASCII Latin-1 Supplement rune to
+ // CFFTest.otf and change 0 to something non-zero.
+ {'\u00e0', 0},
+ {'\u4e2d', 4},
+ // TODO: add a rune >= U+00010000 to CFFTest.otf?
+
+ // Glyphs that aren't present in CFFTest.otf.
+ {'?', 0},
+ {'\ufffd', 0},
+ {'\U0001f4a9', 0},
+ }
+
+ var b Buffer
+ for _, tc := range testCases {
+ want := tc.want
+ // cmap format 0, with the Macintosh Roman encoding, can't represent
+ // U+4E2D.
+ if cmapFormat == 0 && tc.r == '\u4e2d' {
+ want = 0
+ }
+
+ got, err := f.GlyphIndex(&b, tc.r)
+ if err != nil {
+ t.Errorf("cmapFormat=%d, r=%q: %v", cmapFormat, tc.r, err)
+ continue
+ }
+ if got != want {
+ t.Errorf("cmapFormat=%d, r=%q: got %d, want %d", cmapFormat, tc.r, got, want)
+ continue
+ }
+ }
+}
+
func TestPostScriptSegments(t *testing.T) {
// wants' vectors correspond 1-to-1 to what's in the CFFTest.sfd file,
// although OpenType/CFF and FontForge's SFD have reversed orders.
@@ -226,7 +294,7 @@
}
func testSegments(t *testing.T, filename string, wants [][]Segment) {
- data, err := ioutil.ReadFile(filepath.Join("..", "testdata", filename))
+ data, err := ioutil.ReadFile(filepath.FromSlash("../testdata/" + filename))
if err != nil {
t.Fatal(err)
}