| // Copyright 2014 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build go1.6 |
| // +build !386 go1.8 |
| // ... see golang.org/issue/12840 |
| |
| // Package pdf implements reading of PDF files. |
| // |
| // Overview |
| // |
| // PDF is Adobe's Portable Document Format, ubiquitous on the internet. |
| // A PDF document is a complex data format built on a fairly simple structure. |
| // This package exposes the simple structure along with some wrappers to |
| // extract basic information. If more complex information is needed, it is |
| // possible to extract that information by interpreting the structure exposed |
| // by this package. |
| // |
| // Specifically, a PDF is a data structure built from Values, each of which has |
| // one of the following Kinds: |
| // |
| // Null, for the null object. |
| // Integer, for an integer. |
| // Real, for a floating-point number. |
| // Bool, for a boolean value. |
| // Name, for a name constant (as in /Helvetica). |
| // String, for a string constant. |
| // Dict, for a dictionary of name-value pairs. |
| // Array, for an array of values. |
| // Stream, for an opaque data stream and associated header dictionary. |
| // |
| // The accessors on Value—Int64, Float64, Bool, Name, and so on—return |
| // a view of the data as the given type. When there is no appropriate view, |
| // the accessor returns a zero result. For example, the Name accessor returns |
| // the empty string if called on a Value v for which v.Kind() != Name. |
| // Returning zero values this way, especially from the Dict and Array accessors, |
| // which themselves return Values, makes it possible to traverse a PDF quickly |
| // without writing any error checking. On the other hand, it means that mistakes |
| // can go unreported. |
| // |
| // The basic structure of the PDF file is exposed as the graph of Values. |
| // |
| // Most richer data structures in a PDF file are dictionaries with specific interpretations |
| // of the name-value pairs. The Font and Page wrappers make the interpretation |
| // of a specific Value as the corresponding type easier. They are only helpers, though: |
| // they are implemented only in terms of the Value API and could be moved outside |
| // the package. Equally important, traversal of other PDF data structures can be implemented |
| // in other packages as needed. |
| // |
| package pdf // import "rsc.io/pdf" |
| |
| // BUG(rsc): The package is incomplete, although it has been used successfully on some |
| // large real-world PDF files. |
| |
| // BUG(rsc): There is no support for closing open PDF files. If you drop all references to a Reader, |
| // the underlying reader will eventually be garbage collected. |
| |
| // BUG(rsc): The library makes no attempt at efficiency. A value cache maintained in the Reader |
| // would probably help significantly. |
| |
| // BUG(rsc): The support for reading encrypted files is weak. |
| |
| // BUG(rsc): The Value API does not support error reporting. The intent is to allow users to |
| // set an error reporting callback in Reader, but that code has not been implemented. |
| |
| import ( |
| "bytes" |
| "compress/zlib" |
| "crypto/aes" |
| "crypto/cipher" |
| "crypto/md5" |
| "crypto/rc4" |
| "fmt" |
| "io" |
| "io/ioutil" |
| "os" |
| "sort" |
| "strconv" |
| ) |
| |
// A Reader is a single PDF file open for reading.
type Reader struct {
	f          io.ReaderAt // underlying file data
	end        int64       // total size of the data in f, as given to NewReader
	xref       []xref      // cross-reference table, indexed by object id
	trailer    dict        // trailer dictionary, or the header of the xref stream
	trailerptr objptr      // pointer to the xref stream object; zero for a classic xref table
	key        []byte      // decryption key set by initEncrypt; nil when no key is in use
	useAES     bool        // set by initEncrypt when the encryption version V is 4
}
| |
// An xref is a single cross-reference entry describing where the object
// identified by ptr can be found.
type xref struct {
	ptr      objptr // object id and generation this entry describes
	inStream bool   // whether the object lives inside an object stream
	stream   objptr // the object stream holding the object, when inStream is set
	offset   int64  // byte offset of the object in the file; for in-stream entries, the third field of the xref stream record
}
| |
| func (r *Reader) errorf(format string, args ...interface{}) { |
| panic(fmt.Errorf(format, args...)) |
| } |
| |
| // Open opens a file for reading. |
| func Open(file string) (*Reader, error) { |
| // TODO: Deal with closing file. |
| f, err := os.Open(file) |
| if err != nil { |
| return nil, err |
| } |
| fi, err := f.Stat() |
| if err != nil { |
| f.Close() |
| return nil, err |
| } |
| return NewReader(f, fi.Size()) |
| } |
| |
| // NewReader opens a file for reading, using the data in f with the given total size. |
| func NewReader(f io.ReaderAt, size int64) (*Reader, error) { |
| return NewReaderEncrypted(f, size, nil) |
| } |
| |
| // NewReaderEncrypted opens a file for reading, using the data in f with the given total size. |
| // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords |
| // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt |
| // the file and returns an error. |
| func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { |
| buf := make([]byte, 10) |
| f.ReadAt(buf, 0) |
| if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' { |
| return nil, fmt.Errorf("not a PDF file: invalid header") |
| } |
| end := size |
| const endChunk = 100 |
| buf = make([]byte, endChunk) |
| f.ReadAt(buf, end-endChunk) |
| for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' { |
| buf = buf[:len(buf)-1] |
| } |
| buf = bytes.TrimRight(buf, "\r\n\t ") |
| if !bytes.HasSuffix(buf, []byte("%%EOF")) { |
| return nil, fmt.Errorf("not a PDF file: missing %%%%EOF") |
| } |
| i := findLastLine(buf, "startxref") |
| if i < 0 { |
| return nil, fmt.Errorf("malformed PDF file: missing final startxref") |
| } |
| |
| r := &Reader{ |
| f: f, |
| end: end, |
| } |
| pos := end - endChunk + int64(i) |
| b := newBuffer(io.NewSectionReader(f, pos, end-pos), pos) |
| if b.readToken() != keyword("startxref") { |
| return nil, fmt.Errorf("malformed PDF file: missing startxref") |
| } |
| startxref, ok := b.readToken().(int64) |
| if !ok { |
| return nil, fmt.Errorf("malformed PDF file: startxref not followed by integer") |
| } |
| b = newBuffer(io.NewSectionReader(r.f, startxref, r.end-startxref), startxref) |
| xref, trailerptr, trailer, err := readXref(r, b) |
| if err != nil { |
| return nil, err |
| } |
| r.xref = xref |
| r.trailer = trailer |
| r.trailerptr = trailerptr |
| if trailer["Encrypt"] == nil { |
| return r, nil |
| } |
| err = r.initEncrypt("") |
| if err == nil { |
| return r, nil |
| } |
| if pw == nil || err != ErrInvalidPassword { |
| return nil, err |
| } |
| for { |
| next := pw() |
| if next == "" { |
| break |
| } |
| if r.initEncrypt(next) == nil { |
| return r, nil |
| } |
| } |
| return nil, err |
| } |
| |
| // Trailer returns the file's Trailer value. |
| func (r *Reader) Trailer() Value { |
| return Value{r, r.trailerptr, r.trailer} |
| } |
| |
| func readXref(r *Reader, b *buffer) ([]xref, objptr, dict, error) { |
| tok := b.readToken() |
| if tok == keyword("xref") { |
| return readXrefTable(r, b) |
| } |
| if _, ok := tok.(int64); ok { |
| b.unreadToken(tok) |
| return readXrefStream(r, b) |
| } |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", tok) |
| } |
| |
| func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) { |
| obj1 := b.readObject() |
| obj, ok := obj1.(objdef) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj1)) |
| } |
| strmptr := obj.ptr |
| strm, ok := obj.obj.(stream) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj)) |
| } |
| if strm.hdr["Type"] != name("XRef") { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream does not have type XRef") |
| } |
| size, ok := strm.hdr["Size"].(int64) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream missing Size") |
| } |
| table := make([]xref, size) |
| |
| table, err := readXrefStreamData(r, strm, table, size) |
| if err != nil { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) |
| } |
| |
| for prevoff := strm.hdr["Prev"]; prevoff != nil; { |
| off, ok := prevoff.(int64) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff) |
| } |
| b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off) |
| obj1 := b.readObject() |
| obj, ok := obj1.(objdef) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj1)) |
| } |
| prevstrm, ok := obj.obj.(stream) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj)) |
| } |
| prevoff = prevstrm.hdr["Prev"] |
| prev := Value{r, objptr{}, prevstrm} |
| if prev.Kind() != Stream { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream is not stream: %v", prev) |
| } |
| if prev.Key("Type").Name() != "XRef" { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream does not have type XRef") |
| } |
| psize := prev.Key("Size").Int64() |
| if psize > size { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream larger than last stream") |
| } |
| if table, err = readXrefStreamData(r, prev.data.(stream), table, psize); err != nil { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: reading xref prev stream: %v", err) |
| } |
| } |
| |
| return table, strmptr, strm.hdr, nil |
| } |
| |
| func readXrefStreamData(r *Reader, strm stream, table []xref, size int64) ([]xref, error) { |
| index, _ := strm.hdr["Index"].(array) |
| if index == nil { |
| index = array{int64(0), size} |
| } |
| if len(index)%2 != 0 { |
| return nil, fmt.Errorf("invalid Index array %v", objfmt(index)) |
| } |
| ww, ok := strm.hdr["W"].(array) |
| if !ok { |
| return nil, fmt.Errorf("xref stream missing W array") |
| } |
| |
| var w []int |
| for _, x := range ww { |
| i, ok := x.(int64) |
| if !ok || int64(int(i)) != i { |
| return nil, fmt.Errorf("invalid W array %v", objfmt(ww)) |
| } |
| w = append(w, int(i)) |
| } |
| if len(w) < 3 { |
| return nil, fmt.Errorf("invalid W array %v", objfmt(ww)) |
| } |
| |
| v := Value{r, objptr{}, strm} |
| wtotal := 0 |
| for _, wid := range w { |
| wtotal += wid |
| } |
| buf := make([]byte, wtotal) |
| data := v.Reader() |
| for len(index) > 0 { |
| start, ok1 := index[0].(int64) |
| n, ok2 := index[1].(int64) |
| if !ok1 || !ok2 { |
| return nil, fmt.Errorf("malformed Index pair %v %v %T %T", objfmt(index[0]), objfmt(index[1]), index[0], index[1]) |
| } |
| index = index[2:] |
| for i := 0; i < int(n); i++ { |
| _, err := io.ReadFull(data, buf) |
| if err != nil { |
| return nil, fmt.Errorf("error reading xref stream: %v", err) |
| } |
| v1 := decodeInt(buf[0:w[0]]) |
| if w[0] == 0 { |
| v1 = 1 |
| } |
| v2 := decodeInt(buf[w[0] : w[0]+w[1]]) |
| v3 := decodeInt(buf[w[0]+w[1] : w[0]+w[1]+w[2]]) |
| x := int(start) + i |
| for cap(table) <= x { |
| table = append(table[:cap(table)], xref{}) |
| } |
| if table[x].ptr != (objptr{}) { |
| continue |
| } |
| switch v1 { |
| case 0: |
| table[x] = xref{ptr: objptr{0, 65535}} |
| case 1: |
| table[x] = xref{ptr: objptr{uint32(x), uint16(v3)}, offset: int64(v2)} |
| case 2: |
| table[x] = xref{ptr: objptr{uint32(x), 0}, inStream: true, stream: objptr{uint32(v2), 0}, offset: int64(v3)} |
| default: |
| fmt.Printf("invalid xref stream type %d: %x\n", v1, buf) |
| } |
| } |
| } |
| return table, nil |
| } |
| |
// decodeInt interprets b as a big-endian unsigned integer.
// An empty slice decodes to 0.
func decodeInt(b []byte) int {
	var n int
	for i := 0; i < len(b); i++ {
		n = (n << 8) | int(b[i])
	}
	return n
}
| |
| func readXrefTable(r *Reader, b *buffer) ([]xref, objptr, dict, error) { |
| var table []xref |
| |
| table, err := readXrefTableData(b, table) |
| if err != nil { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) |
| } |
| |
| trailer, ok := b.readObject().(dict) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref table not followed by trailer dictionary") |
| } |
| |
| for prevoff := trailer["Prev"]; prevoff != nil; { |
| off, ok := prevoff.(int64) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff) |
| } |
| b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off) |
| tok := b.readToken() |
| if tok != keyword("xref") { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev does not point to xref") |
| } |
| table, err = readXrefTableData(b, table) |
| if err != nil { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) |
| } |
| |
| trailer, ok := b.readObject().(dict) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev table not followed by trailer dictionary") |
| } |
| prevoff = trailer["Prev"] |
| } |
| |
| size, ok := trailer[name("Size")].(int64) |
| if !ok { |
| return nil, objptr{}, nil, fmt.Errorf("malformed PDF: trailer missing /Size entry") |
| } |
| |
| if size < int64(len(table)) { |
| table = table[:size] |
| } |
| |
| return table, objptr{}, trailer, nil |
| } |
| |
| func readXrefTableData(b *buffer, table []xref) ([]xref, error) { |
| for { |
| tok := b.readToken() |
| if tok == keyword("trailer") { |
| break |
| } |
| start, ok1 := tok.(int64) |
| n, ok2 := b.readToken().(int64) |
| if !ok1 || !ok2 { |
| return nil, fmt.Errorf("malformed xref table") |
| } |
| for i := 0; i < int(n); i++ { |
| off, ok1 := b.readToken().(int64) |
| gen, ok2 := b.readToken().(int64) |
| alloc, ok3 := b.readToken().(keyword) |
| if !ok1 || !ok2 || !ok3 || alloc != keyword("f") && alloc != keyword("n") { |
| return nil, fmt.Errorf("malformed xref table") |
| } |
| x := int(start) + i |
| for cap(table) <= x { |
| table = append(table[:cap(table)], xref{}) |
| } |
| if len(table) <= x { |
| table = table[:x+1] |
| } |
| if alloc == "n" && table[x].offset == 0 { |
| table[x] = xref{ptr: objptr{uint32(x), uint16(gen)}, offset: int64(off)} |
| } |
| } |
| } |
| return table, nil |
| } |
| |
// findLastLine returns the index in buf of the last occurrence of s that
// forms a complete line: it must be immediately preceded and immediately
// followed by a '\n' or '\r'. Occurrences that are not complete lines,
// including one that runs up to the very end of buf, are skipped and the
// search continues with earlier ones. It returns -1 if there is no such
// line or if s is empty.
func findLastLine(buf []byte, s string) int {
	bs := []byte(s)
	if len(bs) == 0 {
		return -1
	}
	isEOL := func(c byte) bool { return c == '\n' || c == '\r' }
	for max := len(buf); ; {
		i := bytes.LastIndex(buf[:max], bs)
		if i <= 0 {
			// Not found, or found at offset 0 where no line break can precede it.
			return -1
		}
		end := i + len(bs)
		if end < len(buf) && isEOL(buf[i-1]) && isEOL(buf[end]) {
			return i
		}
		max = i
	}
}
| |
// A Value is a single PDF value, such as an integer, dictionary, or array.
// The zero Value is a PDF null (Kind() == Null, IsNull() == true).
type Value struct {
	r    *Reader     // reader used to resolve indirect references found inside the value
	ptr  objptr      // indirect object the value was loaded from, inherited from the parent for direct values
	data interface{} // underlying data: nil, bool, int64, float64, string, name, dict, array, or stream
}
| |
| // IsNull reports whether the value is a null. It is equivalent to Kind() == Null. |
| func (v Value) IsNull() bool { |
| return v.data == nil |
| } |
| |
// A ValueKind specifies the kind of data underlying a Value.
type ValueKind int

// The PDF value kinds.
const (
	Null    ValueKind = iota // the null object; also reported for unrecognized data
	Bool                     // a boolean value
	Integer                  // an integer
	Real                     // a floating-point number
	String                   // a string constant
	Name                     // a name constant, as in /Helvetica
	Dict                     // a dictionary of name-value pairs
	Array                    // an array of values
	Stream                   // a data stream with its header dictionary
)
| |
| // Kind reports the kind of value underlying v. |
| func (v Value) Kind() ValueKind { |
| switch v.data.(type) { |
| default: |
| return Null |
| case bool: |
| return Bool |
| case int64: |
| return Integer |
| case float64: |
| return Real |
| case string: |
| return String |
| case name: |
| return Name |
| case dict: |
| return Dict |
| case array: |
| return Array |
| case stream: |
| return Stream |
| } |
| } |
| |
| // String returns a textual representation of the value v. |
| // Note that String is not the accessor for values with Kind() == String. |
| // To access such values, see RawString, Text, and TextFromUTF16. |
| func (v Value) String() string { |
| return objfmt(v.data) |
| } |
| |
| func objfmt(x interface{}) string { |
| switch x := x.(type) { |
| default: |
| return fmt.Sprint(x) |
| case string: |
| if isPDFDocEncoded(x) { |
| return strconv.Quote(pdfDocDecode(x)) |
| } |
| if isUTF16(x) { |
| return strconv.Quote(utf16Decode(x[2:])) |
| } |
| return strconv.Quote(x) |
| case name: |
| return "/" + string(x) |
| case dict: |
| var keys []string |
| for k := range x { |
| keys = append(keys, string(k)) |
| } |
| sort.Strings(keys) |
| var buf bytes.Buffer |
| buf.WriteString("<<") |
| for i, k := range keys { |
| elem := x[name(k)] |
| if i > 0 { |
| buf.WriteString(" ") |
| } |
| buf.WriteString("/") |
| buf.WriteString(k) |
| buf.WriteString(" ") |
| buf.WriteString(objfmt(elem)) |
| } |
| buf.WriteString(">>") |
| return buf.String() |
| |
| case array: |
| var buf bytes.Buffer |
| buf.WriteString("[") |
| for i, elem := range x { |
| if i > 0 { |
| buf.WriteString(" ") |
| } |
| buf.WriteString(objfmt(elem)) |
| } |
| buf.WriteString("]") |
| return buf.String() |
| |
| case stream: |
| return fmt.Sprintf("%v@%d", objfmt(x.hdr), x.offset) |
| |
| case objptr: |
| return fmt.Sprintf("%d %d R", x.id, x.gen) |
| |
| case objdef: |
| return fmt.Sprintf("{%d %d obj}%v", x.ptr.id, x.ptr.gen, objfmt(x.obj)) |
| } |
| } |
| |
| // Bool returns v's boolean value. |
| // If v.Kind() != Bool, Bool returns false. |
| func (v Value) Bool() bool { |
| x, ok := v.data.(bool) |
| if !ok { |
| return false |
| } |
| return x |
| } |
| |
| // Int64 returns v's int64 value. |
| // If v.Kind() != Int64, Int64 returns 0. |
| func (v Value) Int64() int64 { |
| x, ok := v.data.(int64) |
| if !ok { |
| return 0 |
| } |
| return x |
| } |
| |
| // Float64 returns v's float64 value, converting from integer if necessary. |
| // If v.Kind() != Float64 and v.Kind() != Int64, Float64 returns 0. |
| func (v Value) Float64() float64 { |
| x, ok := v.data.(float64) |
| if !ok { |
| x, ok := v.data.(int64) |
| if ok { |
| return float64(x) |
| } |
| return 0 |
| } |
| return x |
| } |
| |
| // RawString returns v's string value. |
| // If v.Kind() != String, RawString returns the empty string. |
| func (v Value) RawString() string { |
| x, ok := v.data.(string) |
| if !ok { |
| return "" |
| } |
| return x |
| } |
| |
| // Text returns v's string value interpreted as a ``text string'' (defined in the PDF spec) |
| // and converted to UTF-8. |
| // If v.Kind() != String, Text returns the empty string. |
| func (v Value) Text() string { |
| x, ok := v.data.(string) |
| if !ok { |
| return "" |
| } |
| if isPDFDocEncoded(x) { |
| return pdfDocDecode(x) |
| } |
| if isUTF16(x) { |
| return utf16Decode(x[2:]) |
| } |
| return x |
| } |
| |
| // TextFromUTF16 returns v's string value interpreted as big-endian UTF-16 |
| // and then converted to UTF-8. |
| // If v.Kind() != String or if the data is not valid UTF-16, TextFromUTF16 returns |
| // the empty string. |
| func (v Value) TextFromUTF16() string { |
| x, ok := v.data.(string) |
| if !ok { |
| return "" |
| } |
| if len(x)%2 == 1 { |
| return "" |
| } |
| if x == "" { |
| return "" |
| } |
| return utf16Decode(x) |
| } |
| |
| // Name returns v's name value. |
| // If v.Kind() != Name, Name returns the empty string. |
| // The returned name does not include the leading slash: |
| // if v corresponds to the name written using the syntax /Helvetica, |
| // Name() == "Helvetica". |
| func (v Value) Name() string { |
| x, ok := v.data.(name) |
| if !ok { |
| return "" |
| } |
| return string(x) |
| } |
| |
| // Key returns the value associated with the given name key in the dictionary v. |
| // Like the result of the Name method, the key should not include a leading slash. |
| // If v is a stream, Key applies to the stream's header dictionary. |
| // If v.Kind() != Dict and v.Kind() != Stream, Key returns a null Value. |
| func (v Value) Key(key string) Value { |
| x, ok := v.data.(dict) |
| if !ok { |
| strm, ok := v.data.(stream) |
| if !ok { |
| return Value{} |
| } |
| x = strm.hdr |
| } |
| return v.r.resolve(v.ptr, x[name(key)]) |
| } |
| |
| // Keys returns a sorted list of the keys in the dictionary v. |
| // If v is a stream, Keys applies to the stream's header dictionary. |
| // If v.Kind() != Dict and v.Kind() != Stream, Keys returns nil. |
| func (v Value) Keys() []string { |
| x, ok := v.data.(dict) |
| if !ok { |
| strm, ok := v.data.(stream) |
| if !ok { |
| return nil |
| } |
| x = strm.hdr |
| } |
| keys := []string{} // not nil |
| for k := range x { |
| keys = append(keys, string(k)) |
| } |
| sort.Strings(keys) |
| return keys |
| } |
| |
| // Index returns the i'th element in the array v. |
| // If v.Kind() != Array or if i is outside the array bounds, |
| // Index returns a null Value. |
| func (v Value) Index(i int) Value { |
| x, ok := v.data.(array) |
| if !ok || i < 0 || i >= len(x) { |
| return Value{} |
| } |
| return v.r.resolve(v.ptr, x[i]) |
| } |
| |
| // Len returns the length of the array v. |
| // If v.Kind() != Array, Len returns a null Value. |
| func (v Value) Len() int { |
| x, ok := v.data.(array) |
| if !ok { |
| return 0 |
| } |
| return len(x) |
| } |
| |
| func (r *Reader) resolve(parent objptr, x interface{}) Value { |
| if ptr, ok := x.(objptr); ok { |
| if ptr.id >= uint32(len(r.xref)) { |
| return Value{} |
| } |
| xref := r.xref[ptr.id] |
| if xref.ptr != ptr || !xref.inStream && xref.offset == 0 { |
| return Value{} |
| } |
| var obj object |
| if xref.inStream { |
| strm := r.resolve(parent, xref.stream) |
| Search: |
| for { |
| if strm.Kind() != Stream { |
| panic("not a stream") |
| } |
| if strm.Key("Type").Name() != "ObjStm" { |
| panic("not an object stream") |
| } |
| n := int(strm.Key("N").Int64()) |
| first := strm.Key("First").Int64() |
| if first == 0 { |
| panic("missing First") |
| } |
| b := newBuffer(strm.Reader(), 0) |
| b.allowEOF = true |
| for i := 0; i < n; i++ { |
| id, _ := b.readToken().(int64) |
| off, _ := b.readToken().(int64) |
| if uint32(id) == ptr.id { |
| b.seekForward(first + off) |
| x = b.readObject() |
| break Search |
| } |
| } |
| ext := strm.Key("Extends") |
| if ext.Kind() != Stream { |
| panic("cannot find object in stream") |
| } |
| strm = ext |
| } |
| } else { |
| b := newBuffer(io.NewSectionReader(r.f, xref.offset, r.end-xref.offset), xref.offset) |
| b.key = r.key |
| b.useAES = r.useAES |
| obj = b.readObject() |
| def, ok := obj.(objdef) |
| if !ok { |
| panic(fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj)) |
| return Value{} |
| } |
| if def.ptr != ptr { |
| panic(fmt.Errorf("loading %v: found %v", ptr, def.ptr)) |
| } |
| x = def.obj |
| } |
| parent = ptr |
| } |
| |
| switch x := x.(type) { |
| case nil, bool, int64, float64, name, dict, array, stream: |
| return Value{r, parent, x} |
| case string: |
| return Value{r, parent, x} |
| default: |
| panic(fmt.Errorf("unexpected value type %T in resolve", x)) |
| } |
| } |
| |
// An errorReadCloser is a read-closer that fails every operation
// with a fixed error.
type errorReadCloser struct {
	err error
}

// Read reads nothing and reports the stored error.
func (rc *errorReadCloser) Read(p []byte) (int, error) {
	return 0, rc.err
}

// Close reports the stored error.
func (rc *errorReadCloser) Close() error {
	return rc.err
}
| |
| // Reader returns the data contained in the stream v. |
| // If v.Kind() != Stream, Reader returns a ReadCloser that |
| // responds to all reads with a ``stream not present'' error. |
| func (v Value) Reader() io.ReadCloser { |
| x, ok := v.data.(stream) |
| if !ok { |
| return &errorReadCloser{fmt.Errorf("stream not present")} |
| } |
| var rd io.Reader |
| rd = io.NewSectionReader(v.r.f, x.offset, v.Key("Length").Int64()) |
| if v.r.key != nil { |
| rd = decryptStream(v.r.key, v.r.useAES, x.ptr, rd) |
| } |
| filter := v.Key("Filter") |
| param := v.Key("DecodeParms") |
| switch filter.Kind() { |
| default: |
| panic(fmt.Errorf("unsupported filter %v", filter)) |
| case Null: |
| // ok |
| case Name: |
| rd = applyFilter(rd, filter.Name(), param) |
| case Array: |
| for i := 0; i < filter.Len(); i++ { |
| rd = applyFilter(rd, filter.Index(i).Name(), param.Index(i)) |
| } |
| } |
| |
| return ioutil.NopCloser(rd) |
| } |
| |
| func applyFilter(rd io.Reader, name string, param Value) io.Reader { |
| switch name { |
| default: |
| panic("unknown filter " + name) |
| case "FlateDecode": |
| zr, err := zlib.NewReader(rd) |
| if err != nil { |
| panic(err) |
| } |
| pred := param.Key("Predictor") |
| if pred.Kind() == Null { |
| return zr |
| } |
| columns := param.Key("Columns").Int64() |
| switch pred.Int64() { |
| default: |
| fmt.Println("unknown predictor", pred) |
| panic("pred") |
| case 12: |
| return &pngUpReader{r: zr, hist: make([]byte, 1+columns), tmp: make([]byte, 1+columns)} |
| } |
| } |
| } |
| |
// A pngUpReader decodes rows in which every byte is stored as the
// difference from the byte above it in the previous row. Each encoded row
// begins with a tag byte that must be 2.
type pngUpReader struct {
	r    io.Reader // source of encoded rows
	hist []byte    // running sum of all rows so far, tag byte included
	tmp  []byte    // scratch space for one encoded row
	pend []byte    // decoded bytes of the current row not yet handed out
}

// Read fills b with decoded row data, decoding further rows as needed.
// It stops early, returning the bytes decoded so far, when the source
// fails or a row carries a tag other than 2.
func (r *pngUpReader) Read(b []byte) (int, error) {
	total := 0
	for len(b) > 0 {
		if len(r.pend) == 0 {
			if _, err := io.ReadFull(r.r, r.tmp); err != nil {
				return total, err
			}
			if r.tmp[0] != 2 {
				return total, fmt.Errorf("malformed PNG-Up encoding")
			}
			// Adding the row to the history reconstructs the actual bytes.
			for i := range r.tmp {
				r.hist[i] += r.tmp[i]
			}
			r.pend = r.hist[1:]
		}
		m := copy(b, r.pend)
		total += m
		b = b[m:]
		r.pend = r.pend[m:]
	}
	return total, nil
}
| |
// passwordPad is the fixed 32-byte string used by initEncrypt to pad
// passwords shorter than 32 bytes and as the known plaintext when
// checking a password (see PDF 32000-1:2008, §7.6).
var passwordPad = []byte{
	0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01, 0x08,
	0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53, 0x69, 0x7A,
}
| |
| func (r *Reader) initEncrypt(password string) error { |
| // See PDF 32000-1:2008, §7.6. |
| encrypt, _ := r.resolve(objptr{}, r.trailer["Encrypt"]).data.(dict) |
| if encrypt["Filter"] != name("Standard") { |
| return fmt.Errorf("unsupported PDF: encryption filter %v", objfmt(encrypt["Filter"])) |
| } |
| n, _ := encrypt["Length"].(int64) |
| if n == 0 { |
| n = 40 |
| } |
| if n%8 != 0 || n > 128 || n < 40 { |
| return fmt.Errorf("malformed PDF: %d-bit encryption key", n) |
| } |
| V, _ := encrypt["V"].(int64) |
| if V != 1 && V != 2 && (V != 4 || !okayV4(encrypt)) { |
| return fmt.Errorf("unsupported PDF: encryption version V=%d; %v", V, objfmt(encrypt)) |
| } |
| |
| ids, ok := r.trailer["ID"].(array) |
| if !ok || len(ids) < 1 { |
| return fmt.Errorf("malformed PDF: missing ID in trailer") |
| } |
| idstr, ok := ids[0].(string) |
| if !ok { |
| return fmt.Errorf("malformed PDF: missing ID in trailer") |
| } |
| ID := []byte(idstr) |
| |
| R, _ := encrypt["R"].(int64) |
| if R < 2 { |
| return fmt.Errorf("malformed PDF: encryption revision R=%d", R) |
| } |
| if R > 4 { |
| return fmt.Errorf("unsupported PDF: encryption revision R=%d", R) |
| } |
| O, _ := encrypt["O"].(string) |
| U, _ := encrypt["U"].(string) |
| if len(O) != 32 || len(U) != 32 { |
| return fmt.Errorf("malformed PDF: missing O= or U= encryption parameters") |
| } |
| p, _ := encrypt["P"].(int64) |
| P := uint32(p) |
| |
| // TODO: Password should be converted to Latin-1. |
| pw := []byte(password) |
| h := md5.New() |
| if len(pw) >= 32 { |
| h.Write(pw[:32]) |
| } else { |
| h.Write(pw) |
| h.Write(passwordPad[:32-len(pw)]) |
| } |
| h.Write([]byte(O)) |
| h.Write([]byte{byte(P), byte(P >> 8), byte(P >> 16), byte(P >> 24)}) |
| h.Write([]byte(ID)) |
| key := h.Sum(nil) |
| |
| if R >= 3 { |
| for i := 0; i < 50; i++ { |
| h.Reset() |
| h.Write(key[:n/8]) |
| key = h.Sum(key[:0]) |
| } |
| key = key[:n/8] |
| } else { |
| key = key[:40/8] |
| } |
| |
| c, err := rc4.NewCipher(key) |
| if err != nil { |
| return fmt.Errorf("malformed PDF: invalid RC4 key: %v", err) |
| } |
| |
| var u []byte |
| if R == 2 { |
| u = make([]byte, 32) |
| copy(u, passwordPad) |
| c.XORKeyStream(u, u) |
| } else { |
| h.Reset() |
| h.Write(passwordPad) |
| h.Write([]byte(ID)) |
| u = h.Sum(nil) |
| c.XORKeyStream(u, u) |
| |
| for i := 1; i <= 19; i++ { |
| key1 := make([]byte, len(key)) |
| copy(key1, key) |
| for j := range key1 { |
| key1[j] ^= byte(i) |
| } |
| c, _ = rc4.NewCipher(key1) |
| c.XORKeyStream(u, u) |
| } |
| } |
| |
| if !bytes.HasPrefix([]byte(U), u) { |
| return ErrInvalidPassword |
| } |
| |
| r.key = key |
| r.useAES = V == 4 |
| |
| return nil |
| } |
| |
// ErrInvalidPassword is the error reported when a password does not
// match the one required to decrypt an encrypted PDF.
var ErrInvalidPassword = fmt.Errorf("encrypted PDF: invalid password")
| |
| func okayV4(encrypt dict) bool { |
| cf, ok := encrypt["CF"].(dict) |
| if !ok { |
| return false |
| } |
| stmf, ok := encrypt["StmF"].(name) |
| if !ok { |
| return false |
| } |
| strf, ok := encrypt["StrF"].(name) |
| if !ok { |
| return false |
| } |
| if stmf != strf { |
| return false |
| } |
| cfparam, ok := cf[stmf].(dict) |
| if cfparam["AuthEvent"] != nil && cfparam["AuthEvent"] != name("DocOpen") { |
| return false |
| } |
| if cfparam["Length"] != nil && cfparam["Length"] != int64(16) { |
| return false |
| } |
| if cfparam["CFM"] != name("AESV2") { |
| return false |
| } |
| return true |
| } |
| |
| func cryptKey(key []byte, useAES bool, ptr objptr) []byte { |
| h := md5.New() |
| h.Write(key) |
| h.Write([]byte{byte(ptr.id), byte(ptr.id >> 8), byte(ptr.id >> 16), byte(ptr.gen), byte(ptr.gen >> 8)}) |
| if useAES { |
| h.Write([]byte("sAlT")) |
| } |
| return h.Sum(nil) |
| } |
| |
| func decryptString(key []byte, useAES bool, ptr objptr, x string) string { |
| key = cryptKey(key, useAES, ptr) |
| if useAES { |
| panic("AES not implemented") |
| } else { |
| c, _ := rc4.NewCipher(key) |
| data := []byte(x) |
| c.XORKeyStream(data, data) |
| x = string(data) |
| } |
| return x |
| } |
| |
| func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader { |
| key = cryptKey(key, useAES, ptr) |
| if useAES { |
| cb, err := aes.NewCipher(key) |
| if err != nil { |
| panic("AES: " + err.Error()) |
| } |
| iv := make([]byte, 16) |
| io.ReadFull(rd, iv) |
| cbc := cipher.NewCBCDecrypter(cb, iv) |
| rd = &cbcReader{cbc: cbc, rd: rd, buf: make([]byte, 16)} |
| } else { |
| c, _ := rc4.NewCipher(key) |
| rd = &cipher.StreamReader{c, rd} |
| } |
| return rd |
| } |
| |
// A cbcReader decrypts data from rd one buffer at a time using cbc.
type cbcReader struct {
	cbc  cipher.BlockMode // decrypter applied to each full buffer
	rd   io.Reader        // source of encrypted data
	buf  []byte           // holds one buffer's worth of data, decrypted in place
	pend []byte           // decrypted bytes not yet returned to the caller
}

// Read copies pending decrypted bytes into b. When none are pending it
// first reads and decrypts the next full buffer from the source, returning
// the source's error if that read fails.
func (r *cbcReader) Read(b []byte) (n int, err error) {
	if len(r.pend) == 0 {
		if _, err = io.ReadFull(r.rd, r.buf); err != nil {
			return 0, err
		}
		r.cbc.CryptBlocks(r.buf, r.buf)
		r.pend = r.buf
	}
	n = copy(b, r.pend)
	r.pend = r.pend[n:]
	return n, nil
}