mirror of https://github.com/perkeep/perkeep.git
530 lines
9.6 KiB
Go
530 lines
9.6 KiB
Go
// Copyright 2014 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Reading of PDF tokens and objects from a raw byte stream.
|
|
|
|
package pdf
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"strconv"
|
|
)
|
|
|
|
// token is a single lexical element read from a PDF input stream.
// Its dynamic type is one of:
//
//	bool, for a PDF boolean
//	int64, for a PDF integer
//	float64, for a PDF real
//	string, for a PDF string literal
//	keyword, for a PDF keyword
//	name, for a PDF name (leading slash removed)
type token interface{}
|
|
|
|
// name holds a PDF name object with its leading slash stripped.
type name string
|
|
|
|
// keyword holds a PDF keyword.
// The delimiter tokens that structure higher-level syntax
// ("<<", ">>", "[", "]", "{", "}") are represented as keywords too.
type keyword string
|
|
|
|
// A buffer holds buffered input bytes from the PDF file.
|
|
type buffer struct {
|
|
r io.Reader // source of data
|
|
buf []byte // buffered data
|
|
pos int // read index in buf
|
|
offset int64 // offset at end of buf; aka offset of next read
|
|
tmp []byte // scratch space for accumulating token
|
|
unread []token // queue of read but then unread tokens
|
|
allowEOF bool
|
|
allowObjptr bool
|
|
allowStream bool
|
|
eof bool
|
|
key []byte
|
|
useAES bool
|
|
objptr objptr
|
|
}
|
|
|
|
// newBuffer returns a new buffer reading from r at the given offset.
|
|
func newBuffer(r io.Reader, offset int64) *buffer {
|
|
return &buffer{
|
|
r: r,
|
|
offset: offset,
|
|
buf: make([]byte, 0, 4096),
|
|
allowObjptr: true,
|
|
allowStream: true,
|
|
}
|
|
}
|
|
|
|
func (b *buffer) seek(offset int64) {
|
|
b.offset = offset
|
|
b.buf = b.buf[:0]
|
|
b.pos = 0
|
|
b.unread = b.unread[:0]
|
|
}
|
|
|
|
func (b *buffer) readByte() byte {
|
|
if b.pos >= len(b.buf) {
|
|
b.reload()
|
|
if b.pos >= len(b.buf) {
|
|
return '\n'
|
|
}
|
|
}
|
|
c := b.buf[b.pos]
|
|
b.pos++
|
|
return c
|
|
}
|
|
|
|
func (b *buffer) errorf(format string, args ...interface{}) {
|
|
panic(fmt.Errorf(format, args...))
|
|
}
|
|
|
|
func (b *buffer) reload() bool {
|
|
n := cap(b.buf) - int(b.offset%int64(cap(b.buf)))
|
|
n, err := b.r.Read(b.buf[:n])
|
|
if n == 0 && err != nil {
|
|
b.buf = b.buf[:0]
|
|
b.pos = 0
|
|
if b.allowEOF && err == io.EOF {
|
|
b.eof = true
|
|
return false
|
|
}
|
|
b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err)
|
|
return false
|
|
}
|
|
b.offset += int64(n)
|
|
b.buf = b.buf[:n]
|
|
b.pos = 0
|
|
return true
|
|
}
|
|
|
|
func (b *buffer) seekForward(offset int64) {
|
|
for b.offset < offset {
|
|
if !b.reload() {
|
|
return
|
|
}
|
|
}
|
|
b.pos = len(b.buf) - int(b.offset-offset)
|
|
}
|
|
|
|
func (b *buffer) readOffset() int64 {
|
|
return b.offset - int64(len(b.buf)) + int64(b.pos)
|
|
}
|
|
|
|
func (b *buffer) unreadByte() {
|
|
if b.pos > 0 {
|
|
b.pos--
|
|
}
|
|
}
|
|
|
|
func (b *buffer) unreadToken(t token) {
|
|
b.unread = append(b.unread, t)
|
|
}
|
|
|
|
func (b *buffer) readToken() token {
|
|
if n := len(b.unread); n > 0 {
|
|
t := b.unread[n-1]
|
|
b.unread = b.unread[:n-1]
|
|
return t
|
|
}
|
|
|
|
// Find first non-space, non-comment byte.
|
|
c := b.readByte()
|
|
for {
|
|
if isSpace(c) {
|
|
if b.eof {
|
|
return io.EOF
|
|
}
|
|
c = b.readByte()
|
|
} else if c == '%' {
|
|
for c != '\r' && c != '\n' {
|
|
c = b.readByte()
|
|
}
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
|
|
switch c {
|
|
case '<':
|
|
if b.readByte() == '<' {
|
|
return keyword("<<")
|
|
}
|
|
b.unreadByte()
|
|
return b.readHexString()
|
|
|
|
case '(':
|
|
return b.readLiteralString()
|
|
|
|
case '[', ']', '{', '}':
|
|
return keyword(string(c))
|
|
|
|
case '/':
|
|
return b.readName()
|
|
|
|
case '>':
|
|
if b.readByte() == '>' {
|
|
return keyword(">>")
|
|
}
|
|
b.unreadByte()
|
|
fallthrough
|
|
|
|
default:
|
|
if isDelim(c) {
|
|
b.errorf("unexpected delimiter %#q", rune(c))
|
|
return nil
|
|
}
|
|
b.unreadByte()
|
|
return b.readKeyword()
|
|
}
|
|
}
|
|
|
|
func (b *buffer) readHexString() token {
|
|
tmp := b.tmp[:0]
|
|
for {
|
|
Loop:
|
|
c := b.readByte()
|
|
if c == '>' {
|
|
break
|
|
}
|
|
if isSpace(c) {
|
|
goto Loop
|
|
}
|
|
Loop2:
|
|
c2 := b.readByte()
|
|
if isSpace(c2) {
|
|
goto Loop2
|
|
}
|
|
x := unhex(c)<<4 | unhex(c2)
|
|
if x < 0 {
|
|
b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:])
|
|
break
|
|
}
|
|
tmp = append(tmp, byte(x))
|
|
}
|
|
b.tmp = tmp
|
|
return string(tmp)
|
|
}
|
|
|
|
// unhex returns the value of the hexadecimal digit b, accepting both
// upper- and lower-case letters, or -1 if b is not a hex digit.
func unhex(b byte) int {
	if b >= '0' && b <= '9' {
		return int(b - '0')
	}
	if b >= 'a' && b <= 'f' {
		return int(b-'a') + 10
	}
	if b >= 'A' && b <= 'F' {
		return int(b-'A') + 10
	}
	return -1
}
|
|
|
|
func (b *buffer) readLiteralString() token {
|
|
tmp := b.tmp[:0]
|
|
depth := 1
|
|
Loop:
|
|
for {
|
|
c := b.readByte()
|
|
switch c {
|
|
default:
|
|
tmp = append(tmp, c)
|
|
case '(':
|
|
depth++
|
|
tmp = append(tmp, c)
|
|
case ')':
|
|
if depth--; depth == 0 {
|
|
break Loop
|
|
}
|
|
tmp = append(tmp, c)
|
|
case '\\':
|
|
switch c = b.readByte(); c {
|
|
default:
|
|
b.errorf("invalid escape sequence \\%c", c)
|
|
tmp = append(tmp, '\\', c)
|
|
case 'n':
|
|
tmp = append(tmp, '\n')
|
|
case 'r':
|
|
tmp = append(tmp, '\r')
|
|
case 'b':
|
|
tmp = append(tmp, '\b')
|
|
case 't':
|
|
tmp = append(tmp, '\t')
|
|
case 'f':
|
|
tmp = append(tmp, '\f')
|
|
case '(', ')', '\\':
|
|
tmp = append(tmp, c)
|
|
case '\r':
|
|
if b.readByte() != '\n' {
|
|
b.unreadByte()
|
|
}
|
|
fallthrough
|
|
case '\n':
|
|
// no append
|
|
case '0', '1', '2', '3', '4', '5', '6', '7':
|
|
x := int(c - '0')
|
|
for i := 0; i < 2; i++ {
|
|
c = b.readByte()
|
|
if c < '0' || c > '7' {
|
|
b.unreadByte()
|
|
break
|
|
}
|
|
x = x*8 + int(c-'0')
|
|
}
|
|
if x > 255 {
|
|
b.errorf("invalid octal escape \\%03o", x)
|
|
}
|
|
tmp = append(tmp, byte(x))
|
|
}
|
|
}
|
|
}
|
|
b.tmp = tmp
|
|
return string(tmp)
|
|
}
|
|
|
|
func (b *buffer) readName() token {
|
|
tmp := b.tmp[:0]
|
|
for {
|
|
c := b.readByte()
|
|
if isDelim(c) || isSpace(c) {
|
|
b.unreadByte()
|
|
break
|
|
}
|
|
if c == '#' {
|
|
x := unhex(b.readByte())<<4 | unhex(b.readByte())
|
|
if x < 0 {
|
|
b.errorf("malformed name")
|
|
}
|
|
tmp = append(tmp, byte(x))
|
|
continue
|
|
}
|
|
tmp = append(tmp, c)
|
|
}
|
|
b.tmp = tmp
|
|
return name(string(tmp))
|
|
}
|
|
|
|
func (b *buffer) readKeyword() token {
|
|
tmp := b.tmp[:0]
|
|
for {
|
|
c := b.readByte()
|
|
if isDelim(c) || isSpace(c) {
|
|
b.unreadByte()
|
|
break
|
|
}
|
|
tmp = append(tmp, c)
|
|
}
|
|
b.tmp = tmp
|
|
s := string(tmp)
|
|
switch {
|
|
case s == "true":
|
|
return true
|
|
case s == "false":
|
|
return false
|
|
case isInteger(s):
|
|
x, err := strconv.ParseInt(s, 10, 64)
|
|
if err != nil {
|
|
b.errorf("invalid integer %s", s)
|
|
}
|
|
return x
|
|
case isReal(s):
|
|
x, err := strconv.ParseFloat(s, 64)
|
|
if err != nil {
|
|
b.errorf("invalid real %s", s)
|
|
}
|
|
return x
|
|
}
|
|
return keyword(string(tmp))
|
|
}
|
|
|
|
// isInteger reports whether s is an optional '+' or '-' sign followed by
// one or more decimal digits and nothing else.
func isInteger(s string) bool {
	digits := s
	if digits != "" && (digits[0] == '+' || digits[0] == '-') {
		digits = digits[1:]
	}
	if digits == "" {
		return false
	}
	for i := 0; i < len(digits); i++ {
		if d := digits[i]; d < '0' || d > '9' {
			return false
		}
	}
	return true
}
|
|
|
|
// isReal reports whether s is an optional '+' or '-' sign followed by a
// non-empty run of decimal digits and dots that contains exactly one dot.
// Digits are not required on either side of the dot.
func isReal(s string) bool {
	body := s
	if len(body) > 0 {
		switch body[0] {
		case '+', '-':
			body = body[1:]
		}
	}
	if len(body) == 0 {
		return false
	}
	dots := 0
	for i := 0; i < len(body); i++ {
		switch ch := body[i]; {
		case ch == '.':
			dots++
		case ch < '0' || ch > '9':
			return false
		}
	}
	return dots == 1
}
|
|
|
|
// object is a parsed PDF syntax object. Its dynamic type is one of:
//
//	bool, for a PDF boolean
//	int64, for a PDF integer
//	float64, for a PDF real
//	string, for a PDF string literal
//	name, for a PDF name (leading slash removed)
//	dict, for a PDF dictionary
//	array, for a PDF array
//	stream, for a PDF stream
//	objptr, for a PDF object reference
//	objdef, for a PDF object definition
//
// The PDF null object is represented by a nil object.
type object interface{}
|
|
|
|
// dict is a PDF dictionary: a mapping from names to objects.
type dict map[name]object
|
|
|
|
// array is a PDF array: an ordered list of objects.
type array []object
|
|
|
|
type stream struct {
|
|
hdr dict
|
|
ptr objptr
|
|
offset int64
|
|
}
|
|
|
|
// objptr identifies an indirect PDF object by the two integers that
// precede "R" or "obj" in the input.
type objptr struct {
	id  uint32 // first integer of the pair
	gen uint16 // second integer of the pair
}
|
|
|
|
type objdef struct {
|
|
ptr objptr
|
|
obj object
|
|
}
|
|
|
|
func (b *buffer) readObject() object {
|
|
tok := b.readToken()
|
|
if kw, ok := tok.(keyword); ok {
|
|
switch kw {
|
|
case "null":
|
|
return nil
|
|
case "<<":
|
|
return b.readDict()
|
|
case "[":
|
|
return b.readArray()
|
|
}
|
|
b.errorf("unexpected keyword %q parsing object", kw)
|
|
return nil
|
|
}
|
|
|
|
if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 {
|
|
tok = decryptString(b.key, b.useAES, b.objptr, str)
|
|
}
|
|
|
|
if !b.allowObjptr {
|
|
return tok
|
|
}
|
|
|
|
if t1, ok := tok.(int64); ok && int64(uint32(t1)) == t1 {
|
|
tok2 := b.readToken()
|
|
if t2, ok := tok2.(int64); ok && int64(uint16(t2)) == t2 {
|
|
tok3 := b.readToken()
|
|
switch tok3 {
|
|
case keyword("R"):
|
|
return objptr{uint32(t1), uint16(t2)}
|
|
case keyword("obj"):
|
|
old := b.objptr
|
|
b.objptr = objptr{uint32(t1), uint16(t2)}
|
|
obj := b.readObject()
|
|
if _, ok := obj.(stream); !ok {
|
|
tok4 := b.readToken()
|
|
if tok4 != keyword("endobj") {
|
|
b.errorf("missing endobj after indirect object definition")
|
|
b.unreadToken(tok4)
|
|
}
|
|
}
|
|
b.objptr = old
|
|
return objdef{objptr{uint32(t1), uint16(t2)}, obj}
|
|
}
|
|
b.unreadToken(tok3)
|
|
}
|
|
b.unreadToken(tok2)
|
|
}
|
|
return tok
|
|
}
|
|
|
|
func (b *buffer) readArray() object {
|
|
var x array
|
|
for {
|
|
tok := b.readToken()
|
|
if tok == nil || tok == keyword("]") {
|
|
break
|
|
}
|
|
b.unreadToken(tok)
|
|
x = append(x, b.readObject())
|
|
}
|
|
return x
|
|
}
|
|
|
|
func (b *buffer) readDict() object {
|
|
x := make(dict)
|
|
for {
|
|
tok := b.readToken()
|
|
if tok == nil || tok == keyword(">>") {
|
|
break
|
|
}
|
|
n, ok := tok.(name)
|
|
if !ok {
|
|
b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok)
|
|
continue
|
|
}
|
|
x[n] = b.readObject()
|
|
}
|
|
|
|
if !b.allowStream {
|
|
return x
|
|
}
|
|
|
|
tok := b.readToken()
|
|
if tok != keyword("stream") {
|
|
b.unreadToken(tok)
|
|
return x
|
|
}
|
|
|
|
switch b.readByte() {
|
|
case '\r':
|
|
if b.readByte() != '\n' {
|
|
b.unreadByte()
|
|
}
|
|
case '\n':
|
|
// ok
|
|
default:
|
|
b.errorf("stream keyword not followed by newline")
|
|
}
|
|
|
|
return stream{x, b.objptr, b.readOffset()}
|
|
}
|
|
|
|
// isSpace reports whether b is one of the bytes the lexer treats as
// whitespace: NUL, tab, line feed, form feed, carriage return or space.
func isSpace(b byte) bool {
	return b == ' ' || b == '\n' || b == '\r' || b == '\t' || b == '\f' || b == '\x00'
}
|
|
|
|
// isDelim reports whether b is one of the delimiter bytes
// < > ( ) [ ] { } / and %.
func isDelim(b byte) bool {
	for _, d := range []byte("<>()[]{}/%") {
		if b == d {
			return true
		}
	}
	return false
}
|