// Copyright 2014 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Reading of PDF tokens and objects from a raw byte stream.

package pdf

import (
	"fmt"
	"io"
	"strconv"
)

// A token is a PDF token in the input stream, one of the following Go types:
//
//	bool, a PDF boolean
//	int64, a PDF integer
//	float64, a PDF real
//	string, a PDF string literal
//	keyword, a PDF keyword
//	name, a PDF name without the leading slash
//
// readToken additionally returns io.EOF as a token once the input is
// exhausted and the buffer's allowEOF flag is set.
type token interface{}

// A name is a PDF name, without the leading slash.
type name string

// A keyword is a PDF keyword.
// Delimiter tokens used in higher-level syntax,
// such as "<<", ">>", "[", "]", "{", "}", are also treated as keywords.
type keyword string

// A buffer holds buffered input bytes from the PDF file.
// It is both the tokenizer (readToken) and the object parser (readObject).
type buffer struct {
	r      io.Reader // source of data
	buf    []byte    // buffered data
	pos    int       // read index in buf
	offset int64     // offset at end of buf; aka offset of next read
	tmp    []byte    // scratch space for accumulating token
	unread []token   // queue of read but then unread tokens

	// allowEOF makes reload record end of input in eof instead of
	// reporting it as an error.
	allowEOF bool
	// allowObjptr lets readObject recognize the "id gen R" and
	// "id gen obj" forms; when false, integers are returned as plain tokens.
	allowObjptr bool
	// allowStream lets readDict look for a "stream" keyword after a
	// dictionary and return a stream instead of a dict.
	allowStream bool
	// eof is set by reload once the reader reports io.EOF with allowEOF set.
	eof bool

	// key and useAES are passed to decryptString by readObject for every
	// string token, provided key is non-nil and objptr.id is non-zero.
	key    []byte
	useAES bool
	// objptr identifies the indirect object currently being read; readObject
	// sets it while parsing an "obj" definition and readDict records it in
	// the streams it returns.
	objptr objptr
}

// newBuffer returns a new buffer reading from r at the given offset.
func newBuffer(r io.Reader, offset int64) *buffer { return &buffer{ r: r, offset: offset, buf: make([]byte, 0, 4096), allowObjptr: true, allowStream: true, } } func (b *buffer) seek(offset int64) { b.offset = offset b.buf = b.buf[:0] b.pos = 0 b.unread = b.unread[:0] } func (b *buffer) readByte() byte { if b.pos >= len(b.buf) { b.reload() if b.pos >= len(b.buf) { return '\n' } } c := b.buf[b.pos] b.pos++ return c } func (b *buffer) errorf(format string, args ...interface{}) { panic(fmt.Errorf(format, args...)) } func (b *buffer) reload() bool { n := cap(b.buf) - int(b.offset%int64(cap(b.buf))) n, err := b.r.Read(b.buf[:n]) if n == 0 && err != nil { b.buf = b.buf[:0] b.pos = 0 if b.allowEOF && err == io.EOF { b.eof = true return false } b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err) return false } b.offset += int64(n) b.buf = b.buf[:n] b.pos = 0 return true } func (b *buffer) seekForward(offset int64) { for b.offset < offset { if !b.reload() { return } } b.pos = len(b.buf) - int(b.offset-offset) } func (b *buffer) readOffset() int64 { return b.offset - int64(len(b.buf)) + int64(b.pos) } func (b *buffer) unreadByte() { if b.pos > 0 { b.pos-- } } func (b *buffer) unreadToken(t token) { b.unread = append(b.unread, t) } func (b *buffer) readToken() token { if n := len(b.unread); n > 0 { t := b.unread[n-1] b.unread = b.unread[:n-1] return t } // Find first non-space, non-comment byte. 
c := b.readByte() for { if isSpace(c) { if b.eof { return io.EOF } c = b.readByte() } else if c == '%' { for c != '\r' && c != '\n' { c = b.readByte() } } else { break } } switch c { case '<': if b.readByte() == '<' { return keyword("<<") } b.unreadByte() return b.readHexString() case '(': return b.readLiteralString() case '[', ']', '{', '}': return keyword(string(c)) case '/': return b.readName() case '>': if b.readByte() == '>' { return keyword(">>") } b.unreadByte() fallthrough default: if isDelim(c) { b.errorf("unexpected delimiter %#q", rune(c)) return nil } b.unreadByte() return b.readKeyword() } } func (b *buffer) readHexString() token { tmp := b.tmp[:0] for { Loop: c := b.readByte() if c == '>' { break } if isSpace(c) { goto Loop } Loop2: c2 := b.readByte() if isSpace(c2) { goto Loop2 } x := unhex(c)<<4 | unhex(c2) if x < 0 { b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]) break } tmp = append(tmp, byte(x)) } b.tmp = tmp return string(tmp) } func unhex(b byte) int { switch { case '0' <= b && b <= '9': return int(b) - '0' case 'a' <= b && b <= 'f': return int(b) - 'a' + 10 case 'A' <= b && b <= 'F': return int(b) - 'A' + 10 } return -1 } func (b *buffer) readLiteralString() token { tmp := b.tmp[:0] depth := 1 Loop: for { c := b.readByte() switch c { default: tmp = append(tmp, c) case '(': depth++ tmp = append(tmp, c) case ')': if depth--; depth == 0 { break Loop } tmp = append(tmp, c) case '\\': switch c = b.readByte(); c { default: b.errorf("invalid escape sequence \\%c", c) tmp = append(tmp, '\\', c) case 'n': tmp = append(tmp, '\n') case 'r': tmp = append(tmp, '\r') case 'b': tmp = append(tmp, '\b') case 't': tmp = append(tmp, '\t') case 'f': tmp = append(tmp, '\f') case '(', ')', '\\': tmp = append(tmp, c) case '\r': if b.readByte() != '\n' { b.unreadByte() } fallthrough case '\n': // no append case '0', '1', '2', '3', '4', '5', '6', '7': x := int(c - '0') for i := 0; i < 2; i++ { c = b.readByte() if c < '0' || c > '7' { b.unreadByte() 
break } x = x*8 + int(c-'0') } if x > 255 { b.errorf("invalid octal escape \\%03o", x) } tmp = append(tmp, byte(x)) } } } b.tmp = tmp return string(tmp) } func (b *buffer) readName() token { tmp := b.tmp[:0] for { c := b.readByte() if isDelim(c) || isSpace(c) { b.unreadByte() break } if c == '#' { x := unhex(b.readByte())<<4 | unhex(b.readByte()) if x < 0 { b.errorf("malformed name") } tmp = append(tmp, byte(x)) continue } tmp = append(tmp, c) } b.tmp = tmp return name(string(tmp)) } func (b *buffer) readKeyword() token { tmp := b.tmp[:0] for { c := b.readByte() if isDelim(c) || isSpace(c) { b.unreadByte() break } tmp = append(tmp, c) } b.tmp = tmp s := string(tmp) switch { case s == "true": return true case s == "false": return false case isInteger(s): x, err := strconv.ParseInt(s, 10, 64) if err != nil { b.errorf("invalid integer %s", s) } return x case isReal(s): x, err := strconv.ParseFloat(s, 64) if err != nil { b.errorf("invalid real %s", s) } return x } return keyword(string(tmp)) } func isInteger(s string) bool { if len(s) > 0 && (s[0] == '+' || s[0] == '-') { s = s[1:] } if len(s) == 0 { return false } for _, c := range s { if c < '0' || '9' < c { return false } } return true } func isReal(s string) bool { if len(s) > 0 && (s[0] == '+' || s[0] == '-') { s = s[1:] } if len(s) == 0 { return false } ndot := 0 for _, c := range s { if c == '.' { ndot++ continue } if c < '0' || '9' < c { return false } } return ndot == 1 } // An object is a PDF syntax object, one of the following Go types: // // bool, a PDF boolean // int64, a PDF integer // float64, a PDF real // string, a PDF string literal // name, a PDF name without the leading slash // dict, a PDF dictionary // array, a PDF array // stream, a PDF stream // objptr, a PDF object reference // objdef, a PDF object definition // // An object may also be nil, to represent the PDF null. 
type object interface{} type dict map[name]object type array []object type stream struct { hdr dict ptr objptr offset int64 } type objptr struct { id uint32 gen uint16 } type objdef struct { ptr objptr obj object } func (b *buffer) readObject() object { tok := b.readToken() if kw, ok := tok.(keyword); ok { switch kw { case "null": return nil case "<<": return b.readDict() case "[": return b.readArray() } b.errorf("unexpected keyword %q parsing object", kw) return nil } if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 { tok = decryptString(b.key, b.useAES, b.objptr, str) } if !b.allowObjptr { return tok } if t1, ok := tok.(int64); ok && int64(uint32(t1)) == t1 { tok2 := b.readToken() if t2, ok := tok2.(int64); ok && int64(uint16(t2)) == t2 { tok3 := b.readToken() switch tok3 { case keyword("R"): return objptr{uint32(t1), uint16(t2)} case keyword("obj"): old := b.objptr b.objptr = objptr{uint32(t1), uint16(t2)} obj := b.readObject() if _, ok := obj.(stream); !ok { tok4 := b.readToken() if tok4 != keyword("endobj") { b.errorf("missing endobj after indirect object definition") b.unreadToken(tok4) } } b.objptr = old return objdef{objptr{uint32(t1), uint16(t2)}, obj} } b.unreadToken(tok3) } b.unreadToken(tok2) } return tok } func (b *buffer) readArray() object { var x array for { tok := b.readToken() if tok == nil || tok == keyword("]") { break } b.unreadToken(tok) x = append(x, b.readObject()) } return x } func (b *buffer) readDict() object { x := make(dict) for { tok := b.readToken() if tok == nil || tok == keyword(">>") { break } n, ok := tok.(name) if !ok { b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok) continue } x[n] = b.readObject() } if !b.allowStream { return x } tok := b.readToken() if tok != keyword("stream") { b.unreadToken(tok) return x } switch b.readByte() { case '\r': if b.readByte() != '\n' { b.unreadByte() } case '\n': // ok default: b.errorf("stream keyword not followed by newline") } return stream{x, 
b.objptr, b.readOffset()} } func isSpace(b byte) bool { switch b { case '\x00', '\t', '\n', '\f', '\r', ' ': return true } return false } func isDelim(b byte) bool { switch b { case '<', '>', '(', ')', '[', ']', '{', '}', '/', '%': return true } return false }