perkeep/vendor/rsc.io/pdf/lex.go

530 lines
9.6 KiB
Go

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Reading of PDF tokens and objects from a raw byte stream.
package pdf
import (
"fmt"
"io"
"strconv"
)
// A token is a PDF token in the input stream, one of the following Go types:
//
//	bool, a PDF boolean
//	int64, a PDF integer
//	float64, a PDF real
//	string, a PDF string literal
//	keyword, a PDF keyword
//	name, a PDF name without the leading slash
//
// readToken additionally returns io.EOF as a token when the input is
// exhausted and the buffer's allowEOF flag is set.
type token interface{}
// A name is a PDF name, without the leading slash.
// Any #xx hex escapes in the input have already been decoded by readName.
type name string
// A keyword is a PDF keyword.
// Delimiter tokens used in higher-level syntax,
// such as "<<", ">>", "[", "]", "{", "}", are also treated as keywords.
// readKeyword returns as a keyword any bare token that is not "true",
// "false", an integer, or a real.
type keyword string
// A buffer holds buffered input bytes from the PDF file,
// together with the lexer state used by the read* methods.
type buffer struct {
	r           io.Reader // source of data
	buf         []byte    // buffered data
	pos         int       // read index in buf
	offset      int64     // offset at end of buf; aka offset of next read
	tmp         []byte    // scratch space for accumulating token
	unread      []token   // queue of read but then unread tokens; readToken pops the most recent first
	allowEOF    bool      // if set, reload records io.EOF in eof instead of panicking
	allowObjptr bool      // if set, readObject recognizes "id gen R" and "id gen obj"
	allowStream bool      // if set, readDict looks for a "stream" keyword after ">>"
	eof         bool      // set by reload when io.EOF is hit and allowEOF is set
	key         []byte    // key passed to decryptString; nil disables string decryption in readObject
	useAES      bool      // passed through to decryptString
	objptr      objptr    // object currently being defined; zero id disables string decryption
}
// newBuffer returns a new buffer reading from r, treating the first byte
// delivered by r as being at the given offset. The buffer starts empty with
// a 4096-byte capacity and has object-pointer and stream recognition enabled.
func newBuffer(r io.Reader, offset int64) *buffer {
	b := new(buffer)
	b.r = r
	b.offset = offset
	b.buf = make([]byte, 0, 4096)
	b.allowObjptr = true
	b.allowStream = true
	return b
}
// seek resets the buffer's bookkeeping so that the next byte obtained from
// the reader is considered to be at offset. It drops all buffered bytes and
// all unread tokens; it does not reposition the underlying reader.
func (b *buffer) seek(offset int64) {
	b.unread = b.unread[:0]
	b.buf, b.pos = b.buf[:0], 0
	b.offset = offset
}
// readByte returns the next input byte, refilling the buffer when it has
// been fully consumed. If no byte is available even after a reload, it
// returns '\n' so that callers see white space at end of input.
func (b *buffer) readByte() byte {
	if b.pos >= len(b.buf) {
		b.reload()
	}
	if b.pos >= len(b.buf) {
		return '\n'
	}
	ch := b.buf[b.pos]
	b.pos++
	return ch
}
// errorf reports a lexing or parsing failure by panicking with an error
// built from format and args. It never returns normally.
func (b *buffer) errorf(format string, args ...interface{}) {
	err := fmt.Errorf(format, args...)
	panic(err)
}
// reload replaces the buffered data with the next chunk from the reader and
// reports whether any data was obtained.
//
// On success it advances b.offset by the number of bytes read and resets
// b.pos to 0. If the reader fails without delivering data, the buffer is
// emptied; an io.EOF is recorded in b.eof when b.allowEOF is set, and any
// other failure is reported through errorf (which panics).
//
// A Read that returns (0, nil) means "nothing happened" under the io.Reader
// contract, so it is retried a bounded number of times rather than being
// mistaken for a successful, empty reload.
func (b *buffer) reload() bool {
	// Upper bound on consecutive empty reads before giving up.
	const maxEmptyReads = 100

	// Read at most up to the next multiple of cap(b.buf), so that a full
	// read leaves b.offset on a multiple of the buffer capacity.
	want := cap(b.buf) - int(b.offset%int64(cap(b.buf)))

	var (
		n   int
		err error
	)
	for i := 0; i < maxEmptyReads; i++ {
		n, err = b.r.Read(b.buf[:want])
		if n > 0 || err != nil {
			break
		}
	}

	if n == 0 {
		if err == nil {
			// The reader kept returning (0, nil).
			err = io.ErrNoProgress
		}
		b.buf = b.buf[:0]
		b.pos = 0
		if b.allowEOF && err == io.EOF {
			b.eof = true
			return false
		}
		b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err)
		return false
	}

	// Data was delivered; an accompanying error, if any, is expected to
	// show up again on the next Read.
	b.offset += int64(n)
	b.buf = b.buf[:n]
	b.pos = 0
	return true
}
// seekForward advances through the input by reloading until the end of the
// buffered data reaches at least offset, then positions the read index so
// that the next byte read is the one at offset. If a reload reports failure
// the method returns early and leaves the read index as reload set it.
func (b *buffer) seekForward(offset int64) {
	for b.offset < offset {
		if ok := b.reload(); !ok {
			return
		}
	}
	// b.offset is the offset just past the buffer; step back by the overshoot.
	overshoot := int(b.offset - offset)
	b.pos = len(b.buf) - overshoot
}
// readOffset returns the input offset of the next byte readByte will return:
// the offset of the start of the buffered data plus the read index.
func (b *buffer) readOffset() int64 {
	bufStart := b.offset - int64(len(b.buf))
	return bufStart + int64(b.pos)
}
// unreadByte steps the read index back by one byte within the current
// buffer. It does nothing when the index is already at the buffer start.
func (b *buffer) unreadByte() {
	if b.pos <= 0 {
		return
	}
	b.pos--
}
// unreadToken pushes t back so that the next readToken call returns it.
// Tokens pushed back are returned in last-in, first-out order.
func (b *buffer) unreadToken(t token) {
	pending := append(b.unread, t)
	b.unread = pending
}
// readToken returns the next token from the input.
//
// Tokens previously pushed back with unreadToken are returned first, most
// recent first. Otherwise white space and %-comments are skipped and the
// token is dispatched on its first byte. When white space is encountered
// after end of input has been recorded in b.eof, io.EOF is returned as the
// token. A stray delimiter is reported through errorf.
func (b *buffer) readToken() token {
	if last := len(b.unread) - 1; last >= 0 {
		tok := b.unread[last]
		b.unread = b.unread[:last]
		return tok
	}

	// Find first non-space, non-comment byte.
	ch := b.readByte()
skip:
	for {
		switch {
		case isSpace(ch):
			if b.eof {
				return io.EOF
			}
			ch = b.readByte()
		case ch == '%':
			// A comment runs to end of line; the terminator is then
			// consumed as white space on the next iteration.
			for ch != '\r' && ch != '\n' {
				ch = b.readByte()
			}
		default:
			break skip
		}
	}

	switch ch {
	case '<':
		if b.readByte() == '<' {
			return keyword("<<")
		}
		b.unreadByte()
		return b.readHexString()
	case '>':
		if b.readByte() == '>' {
			return keyword(">>")
		}
		// A lone '>' is a delimiter and is rejected below.
		b.unreadByte()
	case '(':
		return b.readLiteralString()
	case '/':
		return b.readName()
	case '[', ']', '{', '}':
		return keyword(string(ch))
	}

	if isDelim(ch) {
		b.errorf("unexpected delimiter %#q", rune(ch))
		return nil
	}
	// Put the first byte back so readKeyword sees the whole token.
	b.unreadByte()
	return b.readKeyword()
}
// readHexString reads the body of a hexadecimal string, the opening '<'
// having already been consumed, and returns the decoded bytes as a string.
//
// White space between hex digits is ignored. If the string holds an odd
// number of digits, the missing final digit is taken to be 0, as the PDF
// specification requires. A non-hex character, or end of input before the
// closing '>', is reported through errorf.
func (b *buffer) readHexString() token {
	// next returns the next non-space byte. readByte yields '\n' forever
	// once input is exhausted, so EOF must be detected here or the scan
	// would never terminate.
	next := func() byte {
		for {
			c := b.readByte()
			if !isSpace(c) {
				return c
			}
			if b.eof {
				b.errorf("malformed PDF: unexpected EOF in hex string")
				return '>' // only reached if errorf ever stops panicking
			}
		}
	}

	tmp := b.tmp[:0]
	for {
		c := next()
		if c == '>' {
			break
		}
		c2 := next()
		last := c2 == '>'
		if last {
			// Odd digit count: pad the final byte with a zero low nibble.
			c2 = '0'
		}
		// unhex returns -1 for a non-hex byte, which makes x negative.
		x := unhex(c)<<4 | unhex(c2)
		if x < 0 {
			b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:])
			break
		}
		tmp = append(tmp, byte(x))
		if last {
			break
		}
	}
	b.tmp = tmp
	return string(tmp)
}
// unhex returns the value (0-15) of the hexadecimal digit b,
// accepting both upper and lower case, or -1 if b is not a hex digit.
func unhex(b byte) int {
	d := int(b)
	if '0' <= d && d <= '9' {
		return d - '0'
	}
	if 'a' <= d && d <= 'f' {
		return d - 'a' + 10
	}
	if 'A' <= d && d <= 'F' {
		return d - 'A' + 10
	}
	return -1
}
// readLiteralString reads the body of a parenthesized literal string, the
// opening '(' having already been consumed, and returns its decoded bytes
// as a string.
//
// Balanced unescaped parentheses are kept as part of the string. Backslash
// escapes \n \r \b \t \f \( \) \\ and up to three octal digits are decoded,
// and a backslash followed by an end-of-line is a line continuation that
// contributes nothing. An unknown escape, an octal value above 255, or end
// of input before the closing ')' is reported through errorf.
func (b *buffer) readLiteralString() token {
	tmp := b.tmp[:0]
	depth := 1
Loop:
	for {
		c := b.readByte()
		if b.eof {
			// readByte yields '\n' forever once input is exhausted; without
			// this check an unterminated string would grow without bound.
			b.errorf("malformed PDF: unexpected EOF in literal string")
			break Loop
		}
		switch c {
		default:
			tmp = append(tmp, c)
		case '(':
			depth++
			tmp = append(tmp, c)
		case ')':
			if depth--; depth == 0 {
				break Loop
			}
			tmp = append(tmp, c)
		case '\\':
			switch c = b.readByte(); c {
			default:
				b.errorf("invalid escape sequence \\%c", c)
				tmp = append(tmp, '\\', c)
			case 'n':
				tmp = append(tmp, '\n')
			case 'r':
				tmp = append(tmp, '\r')
			case 'b':
				tmp = append(tmp, '\b')
			case 't':
				tmp = append(tmp, '\t')
			case 'f':
				tmp = append(tmp, '\f')
			case '(', ')', '\\':
				tmp = append(tmp, c)
			case '\r':
				// Treat \r\n after the backslash as a single line ending.
				if b.readByte() != '\n' {
					b.unreadByte()
				}
				fallthrough
			case '\n':
				// no append
			case '0', '1', '2', '3', '4', '5', '6', '7':
				// One to three octal digits; stop at the first non-octal byte.
				x := int(c - '0')
				for i := 0; i < 2; i++ {
					c = b.readByte()
					if c < '0' || c > '7' {
						b.unreadByte()
						break
					}
					x = x*8 + int(c-'0')
				}
				if x > 255 {
					b.errorf("invalid octal escape \\%03o", x)
				}
				tmp = append(tmp, byte(x))
			}
		}
	}
	b.tmp = tmp
	return string(tmp)
}
// readName reads a PDF name, the leading '/' having already been consumed,
// and returns it as a name. The name ends at the first white-space or
// delimiter byte, which is left unread. A '#' introduces two hex digits
// giving one byte of the name; invalid digits are reported through errorf.
func (b *buffer) readName() token {
	out := b.tmp[:0]
	for {
		ch := b.readByte()
		if isSpace(ch) || isDelim(ch) {
			b.unreadByte()
			break
		}
		if ch != '#' {
			out = append(out, ch)
			continue
		}
		hi := unhex(b.readByte())
		lo := unhex(b.readByte())
		// A -1 from unhex on either digit leaves v negative.
		v := hi<<4 | lo
		if v < 0 {
			b.errorf("malformed name")
		}
		out = append(out, byte(v))
	}
	b.tmp = out
	return name(string(out))
}
// readKeyword reads a bare token up to the next white-space or delimiter
// byte (which is left unread) and classifies it: "true" and "false" become
// bool, integers become int64, reals become float64, and anything else is
// returned as a keyword. A numeric token that strconv cannot parse is
// reported through errorf.
func (b *buffer) readKeyword() token {
	word := b.tmp[:0]
	for c := b.readByte(); !isDelim(c) && !isSpace(c); c = b.readByte() {
		word = append(word, c)
	}
	// Give back the terminating byte.
	b.unreadByte()
	b.tmp = word

	s := string(word)
	if s == "true" {
		return true
	}
	if s == "false" {
		return false
	}
	if isInteger(s) {
		v, err := strconv.ParseInt(s, 10, 64)
		if err != nil {
			b.errorf("invalid integer %s", s)
		}
		return v
	}
	if isReal(s) {
		v, err := strconv.ParseFloat(s, 64)
		if err != nil {
			b.errorf("invalid real %s", s)
		}
		return v
	}
	return keyword(s)
}
// isInteger reports whether s is an optional '+' or '-' sign followed by
// one or more ASCII decimal digits and nothing else.
func isInteger(s string) bool {
	digits := s
	if digits != "" && (digits[0] == '+' || digits[0] == '-') {
		digits = digits[1:]
	}
	if digits == "" {
		return false
	}
	for i := 0; i < len(digits); i++ {
		if d := digits[i]; d < '0' || d > '9' {
			return false
		}
	}
	return true
}
// isReal reports whether s is a PDF real: an optional '+' or '-' sign
// followed by one or more ASCII decimal digits containing exactly one '.',
// which may be leading, trailing, or embedded ("-.5", "4.", "3.14").
//
// At least one digit is required, so a bare "." (with or without a sign) is
// not a real; such a token cannot be parsed as a number.
func isReal(s string) bool {
	if len(s) > 0 && (s[0] == '+' || s[0] == '-') {
		s = s[1:]
	}
	ndot, ndigit := 0, 0
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case c == '.':
			ndot++
		case '0' <= c && c <= '9':
			ndigit++
		default:
			return false
		}
	}
	return ndot == 1 && ndigit > 0
}
// An object is a PDF syntax object, one of the following Go types:
//
//	bool, a PDF boolean
//	int64, a PDF integer
//	float64, a PDF real
//	string, a PDF string literal
//	name, a PDF name without the leading slash
//	dict, a PDF dictionary
//	array, a PDF array
//	stream, a PDF stream
//	objptr, a PDF object reference
//	objdef, a PDF object definition
//
// An object may also be nil, to represent the PDF null.
type object interface{}

// A dict is a PDF dictionary, mapping names to objects.
type dict map[name]object

// An array is a PDF array of objects.
type array []object

// A stream is a PDF stream as produced by readDict: the stream's header
// dictionary, the object being defined when it was read, and the input
// offset at which the stream data begins (just past the end-of-line that
// follows the "stream" keyword). The data itself is not read.
type stream struct {
	hdr    dict
	ptr    objptr
	offset int64
}

// An objptr identifies an indirect object by the two integers that precede
// "R" or "obj" in the input: the object id and the generation.
type objptr struct {
	id  uint32
	gen uint16
}

// An objdef is an indirect object definition ("id gen obj ..."): the
// pointer naming the object together with the object itself.
type objdef struct {
	ptr objptr
	obj object
}
// readObject reads and returns the next complete object.
//
// The keywords "null", "<<" and "[" produce nil, a dictionary (or stream)
// and an array respectively; any other keyword is reported through errorf.
// String tokens are passed through decryptString when a key is set and an
// object is currently being defined. When b.allowObjptr is set, an integer
// followed by a second integer and "R" yields an objptr, and one followed
// by a second integer and "obj" yields an objdef whose body is read
// recursively; otherwise the look-ahead tokens are pushed back and the
// first token is returned unchanged.
func (b *buffer) readObject() object {
	tok := b.readToken()

	switch v := tok.(type) {
	case keyword:
		switch v {
		case "null":
			return nil
		case "<<":
			return b.readDict()
		case "[":
			return b.readArray()
		}
		b.errorf("unexpected keyword %q parsing object", v)
		return nil
	case string:
		if b.key != nil && b.objptr.id != 0 {
			tok = decryptString(b.key, b.useAES, b.objptr, v)
		}
	}

	if !b.allowObjptr {
		return tok
	}

	// An object id must be an integer that fits in 32 bits.
	id, isInt := tok.(int64)
	if !isInt || int64(uint32(id)) != id {
		return tok
	}

	// A generation must be an integer that fits in 16 bits.
	tok2 := b.readToken()
	gen, isInt := tok2.(int64)
	if !isInt || int64(uint16(gen)) != gen {
		b.unreadToken(tok2)
		return tok
	}

	ptr := objptr{uint32(id), uint16(gen)}
	tok3 := b.readToken()
	switch tok3 {
	case keyword("R"):
		return ptr
	case keyword("obj"):
		// Record the object being defined while its body is read, then
		// restore the previous one.
		saved := b.objptr
		b.objptr = ptr
		obj := b.readObject()
		if _, isStream := obj.(stream); !isStream {
			if end := b.readToken(); end != keyword("endobj") {
				b.errorf("missing endobj after indirect object definition")
				b.unreadToken(end)
			}
		}
		b.objptr = saved
		return objdef{ptr, obj}
	}

	// Just two plain integers: push the look-ahead back so that tok2 is
	// the next token read, followed by tok3.
	b.unreadToken(tok3)
	b.unreadToken(tok2)
	return tok
}
// readArray reads the elements of an array, the opening "[" having already
// been consumed, and returns them as an array. Reading stops at the closing
// "]" or at end of input.
//
// At end of input readToken returns io.EOF rather than nil, so io.EOF must
// be checked explicitly: otherwise it would be pushed back, returned again
// by readObject, and appended to the array forever.
func (b *buffer) readArray() object {
	var x array
	for {
		tok := b.readToken()
		if tok == nil || tok == io.EOF || tok == keyword("]") {
			break
		}
		// Not the end: let readObject parse a full element starting here.
		b.unreadToken(tok)
		x = append(x, b.readObject())
	}
	return x
}
// readDict reads the entries of a dictionary, the opening "<<" having
// already been consumed. Each key must be a name; anything else is reported
// through errorf. After the closing ">>", if b.allowStream is set and the
// next token is the keyword "stream", the end-of-line after it is consumed
// and a stream is returned that records the dictionary, the current object
// pointer and the offset of the stream data. Otherwise the dictionary
// itself is returned.
func (b *buffer) readDict() object {
	d := dict{}
	for tok := b.readToken(); tok != nil && tok != keyword(">>"); tok = b.readToken() {
		key, isName := tok.(name)
		if !isName {
			b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok)
			continue
		}
		d[key] = b.readObject()
	}

	if !b.allowStream {
		return d
	}
	if tok := b.readToken(); tok != keyword("stream") {
		b.unreadToken(tok)
		return d
	}

	// The keyword must be followed by \n, \r\n, or a lone \r.
	if c := b.readByte(); c == '\r' {
		if b.readByte() != '\n' {
			b.unreadByte()
		}
	} else if c != '\n' {
		b.errorf("stream keyword not followed by newline")
	}
	return stream{hdr: d, ptr: b.objptr, offset: b.readOffset()}
}
// isSpace reports whether b is one of the bytes the lexer treats as white
// space: NUL, tab, line feed, form feed, carriage return, or space.
func isSpace(b byte) bool {
	return b == '\x00' || b == '\t' || b == '\n' || b == '\f' || b == '\r' || b == ' '
}
// isDelim reports whether b is one of the delimiter bytes that end a token:
// < > ( ) [ ] { } / %
func isDelim(b byte) bool {
	const delims = "<>()[]{}/%"
	for i := 0; i < len(delims); i++ {
		if delims[i] == b {
			return true
		}
	}
	return false
}