mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Implement pure python solution for reading PDF metadata
This commit is contained in:
parent
76af4c11d0
commit
f7332494ae
@ -14,83 +14,41 @@
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
'''Read meta information from PDF files'''
|
||||
|
||||
import sys, os, copy
|
||||
import sys, os
|
||||
|
||||
from libprs500.ebooks.metadata import MetaInformation, get_parser
|
||||
from libprs500.ptempfile import PersistentTemporaryFile
|
||||
from libprs500.ebooks.metadata import MetaInformation
|
||||
from libprs500.ebooks.pyPdf import PdfFileReader
|
||||
|
||||
def get_metadata(stream):
    """Return the metadata of a PDF as a L{MetaInformation} object.

    The default title is the base name of ``stream.name`` without its
    extension when the stream has a ``name`` attribute, otherwise
    ``'Unknown'``.  Title, author(s) and subject found in the PDF's
    document information override the defaults.

    @param stream: seekable file-like object positioned anywhere in the PDF.
    @return: a populated L{MetaInformation} instance.
    """
    if hasattr(stream, 'name'):
        title = os.path.splitext(os.path.basename(stream.name))[0]
    else:
        title = 'Unknown'
    mi = MetaInformation(title, 'Unknown')

    stream.seek(0)
    info = PdfFileReader(stream).getDocumentInfo()
    # NOTE(review): presumably getDocumentInfo() can return None when the
    # file has no /Info dictionary -- confirm against pyPdf/pdf.py.
    if info is None:
        return mi
    if info.title:
        # Use the title stored in the PDF, not the filename-derived default.
        mi.title = info.title
    if info.author:
        # Authors may be separated by '&' and/or ','.
        authors = []
        for group in info.author.split('&'):
            authors += group.split(',')
        mi.authors = authors
        mi.author = info.author
    if info.subject:
        mi.category = info.subject
    return mi
|
||||
|
||||
|
||||
def main(args=sys.argv):
    """Command line entry point: print the metadata of one PDF file.

    @param args: argument vector; ``args[1]`` must be the path of the PDF.
    @return: process exit status, 0 on success and 1 on a usage error.
    """
    if len(args) != 2:
        sys.stderr.write('No filename specified.\n')
        return 1

    path = os.path.abspath(os.path.expanduser(args[1]))
    stream = open(path, 'rb')
    try:
        print(get_metadata(stream))
    finally:
        # Always release the file handle, even if parsing fails.
        stream.close()
    return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
2
src/libprs500/ebooks/pyPdf/__init__.py
Normal file
2
src/libprs500/ebooks/pyPdf/__init__.py
Normal file
@ -0,0 +1,2 @@
|
||||
from pdf import PdfFileReader, PdfFileWriter
|
||||
__all__ = ["pdf"]
|
239
src/libprs500/ebooks/pyPdf/filters.py
Normal file
239
src/libprs500/ebooks/pyPdf/filters.py
Normal file
@ -0,0 +1,239 @@
|
||||
# vim: sw=4:expandtab:foldmethod=marker
|
||||
#
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
"""
|
||||
Implementation of stream filters for PDF.
|
||||
"""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "mfenniak@pobox.com"
|
||||
|
||||
from generic import NameObject
|
||||
|
||||
try:
    import zlib

    def decompress(data):
        """Return ``data`` inflated with zlib."""
        return zlib.decompress(data)

    def compress(data):
        """Return ``data`` deflated with zlib."""
        return zlib.compress(data)

except ImportError:
    # zlib is unavailable.  Fall back to System.IO.Compression from the
    # .NET framework (IronPython only).
    import System
    from System import IO, Collections, Array

    def _string_to_bytearr(buf):
        """Copy a Python string into a new .NET byte array."""
        converted = Array.CreateInstance(System.Byte, len(buf))
        for position, char in enumerate(buf):
            converted[position] = ord(char)
        return converted

    def _bytearr_to_string(bytes):
        """Convert a .NET byte array into a Python string."""
        return "".join([chr(bytes[i]) for i in range(bytes.Length)])

    def _read_bytes(stream):
        """Drain a .NET stream in 2048 byte chunks and return a byte array."""
        sink = IO.MemoryStream()
        chunk = Array.CreateInstance(System.Byte, 2048)
        count = stream.Read(chunk, 0, chunk.Length)
        while count != 0:
            sink.Write(chunk, 0, count)
            count = stream.Read(chunk, 0, chunk.Length)
        drained = sink.ToArray()
        sink.Close()
        return drained

    def decompress(data):
        """Return ``data`` inflated through a .NET DeflateStream."""
        raw = _string_to_bytearr(data)
        source = IO.MemoryStream()
        source.Write(raw, 0, raw.Length)
        source.Position = 0 # fseek 0
        inflater = IO.Compression.DeflateStream(source, IO.Compression.CompressionMode.Decompress)
        inflated = _bytearr_to_string(_read_bytes(inflater))
        inflater.Close()
        return inflated

    def compress(data):
        """Return ``data`` deflated through a .NET DeflateStream."""
        raw = _string_to_bytearr(data)
        target = IO.MemoryStream()
        deflater = IO.Compression.DeflateStream(target, IO.Compression.CompressionMode.Compress, True)
        deflater.Write(raw, 0, raw.Length)
        deflater.Close()
        target.Position = 0 # fseek 0
        deflated = _bytearr_to_string(target.ToArray())
        target.Close()
        return deflated
|
||||
|
||||
|
||||
class FlateDecode(object):
    """Implementation of the /FlateDecode stream filter."""

    @staticmethod
    def decode(data, decodeParms):
        """Inflate ``data`` and undo the predictor named in ``decodeParms``.

        Only "no predictor" (1) and the PNG predictors (>= 10) with row
        filter types 0 (None), 1 (Sub) and 2 (Up) are supported.

        @param data: compressed stream data.
        @param decodeParms: mapping that may hold "/Predictor" and
            "/Columns", or a false value when there are no parameters.
        @return: the decoded data as a byte string.
        @raise AssertionError: for an unsupported predictor or PNG filter,
            or when the data is not a whole number of rows.  Raised
            explicitly so the checks survive ``python -O``.
        """
        data = decompress(data)
        predictor = 1
        if decodeParms:
            predictor = decodeParms.get("/Predictor", 1)
        # predictor 1 == no predictor
        if predictor == 1:
            return data

        columns = decodeParms["/Columns"]
        if predictor < 10:
            raise AssertionError("unsupported predictor: %r" % (predictor,))

        # PNG prediction can vary from row to row; every row starts with a
        # one byte filter type.  bytearray requires Python 2.6 or newer.
        rowlength = columns + 1
        if len(data) % rowlength != 0:
            raise AssertionError("stream length is not a multiple of the row length")
        decoded_rows = []
        prev_row = bytearray(rowlength)
        for start in range(0, len(data), rowlength):
            row = bytearray(data[start:start + rowlength])
            filter_byte = row[0]
            if filter_byte == 0:
                pass
            elif filter_byte == 1:
                # Sub: add the byte to the left.  The first data byte
                # (index 1) has no left neighbour, so start at index 2.
                for i in range(2, rowlength):
                    row[i] = (row[i] + row[i - 1]) % 256
            elif filter_byte == 2:
                # Up: add the byte directly above in the previous row.
                for i in range(1, rowlength):
                    row[i] = (row[i] + prev_row[i]) % 256
            else:
                raise AssertionError("unsupported PNG filter: %r" % (filter_byte,))
            prev_row = row
            decoded_rows.append(bytes(row[1:]))
        return bytes().join(decoded_rows)

    @staticmethod
    def encode(data):
        """Return ``data`` compressed with deflate."""
        return compress(data)
|
||||
|
||||
class ASCIIHexDecode(object):
    """Implementation of the /ASCIIHexDecode stream filter."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Decode hexadecimal text into a string of characters.

        Whitespace is ignored and decoding stops at the ">" end-of-data
        marker, or at the end of ``data`` when the marker is missing.  A
        trailing unpaired digit is treated as if it were followed by "0",
        as the PDF specification requires.

        @param data: the hexadecimal text.
        @param decodeParms: unused; accepted for interface compatibility.
        @return: the decoded string.
        """
        decoded = []
        pair = ""
        for c in data:
            if c == ">":
                break
            if c.isspace():
                continue
            pair += c
            if len(pair) == 2:
                decoded.append(chr(int(pair, 16)))
                pair = ""
        if pair:
            decoded.append(chr(int(pair + "0", 16)))
        return "".join(decoded)
|
||||
|
||||
class ASCII85Decode(object):
    """Implementation of the /ASCII85Decode stream filter."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Decode ASCII base-85 text into a string of characters.

        An optional leading "<~" is skipped, whitespace is ignored, "z"
        stands for four zero bytes and "~>" marks the end of the data.

        @param data: the base-85 text, including the "~>" terminator.
        @param decodeParms: unused; accepted for interface compatibility.
        @return: the decoded string.
        @raise AssertionError: on malformed input (a character outside the
            base-85 alphabet, "z" inside a group, a one character final
            group or a group above 2**32 - 1).  Raised explicitly so the
            checks survive ``python -O``.
        @raise IndexError: if the "~>" terminator is missing.
        """
        pieces = []
        group = []
        x = 0
        hitEod = False
        # remove all whitespace from data
        data = [y for y in data if not (y in ' \n\r\t')]
        while not hitEod:
            c = data[x]
            if not pieces and c == "<" and data[x+1] == "~":
                x += 2
                continue
            elif c == 'z':
                if group:
                    raise AssertionError("'z' inside an ASCII85 group")
                pieces.append('\x00\x00\x00\x00')
                # Step past the 'z'; without this the loop never advances.
                x += 1
                continue
            elif c == "~" and data[x+1] == ">":
                if len(group) != 0:
                    # cannot have a final group of just 1 char
                    if len(group) <= 1:
                        raise AssertionError("final ASCII85 group has a single character")
                    cnt = len(group) - 1
                    group += [ 85, 85, 85 ]
                    hitEod = cnt
                else:
                    break
            else:
                c = ord(c) - 33
                if not (0 <= c < 85):
                    raise AssertionError("character outside the ASCII85 alphabet")
                group += [ c ]
            if len(group) >= 5:
                b = group[0] * (85**4) + \
                    group[1] * (85**3) + \
                    group[2] * (85**2) + \
                    group[3] * 85 + \
                    group[4]
                # 0xFFFFFFFF ("s8W-!") is the largest legal group value.
                if b > 2**32 - 1:
                    raise AssertionError("ASCII85 group value out of range")
                c4 = chr((b >> 0) % 256)
                c3 = chr((b >> 8) % 256)
                c2 = chr((b >> 16) % 256)
                c1 = chr(b >> 24)
                chunk = c1 + c2 + c3 + c4
                if hitEod:
                    # A padded final group only carries hitEod real bytes.
                    chunk = chunk[:hitEod]
                pieces.append(chunk)
                group = []
            x += 1
        return "".join(pieces)
|
||||
|
||||
def decodeStreamData(stream):
    """Apply every filter listed in a stream's "/Filter" entry to its data.

    @param stream: stream object providing ``get`` and a ``_data`` attribute.
    @return: the decoded data.
    @raise AssertionError: if a filter is not supported.  Raised
        explicitly so that ``python -O`` cannot turn an unsupported filter
        into silently returned, still-encoded data.
    """
    filters = stream.get("/Filter", ())
    if len(filters) and not isinstance(filters[0], NameObject):
        # we have a single filter instance
        filters = (filters,)
    data = stream._data
    for filterType in filters:
        if filterType == "/FlateDecode":
            data = FlateDecode.decode(data, stream.get("/DecodeParms"))
        elif filterType == "/ASCIIHexDecode":
            data = ASCIIHexDecode.decode(data)
        elif filterType == "/ASCII85Decode":
            data = ASCII85Decode.decode(data)
        else:
            raise AssertionError("unsupported filter: %r" % (filterType,))
    return data
|
||||
|
||||
if __name__ == "__main__":
    # Self-test for the ASCII hex decoder.
    assert ASCIIHexDecode.decode('61\n626\n3>') == "abc"

    # Self-test for the ASCII base-85 decoder.
    ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure."
    ascii85Test = """
<~9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
"""
    assert ascii85_originalText == ASCII85Decode.decode(ascii85Test)
|
542
src/libprs500/ebooks/pyPdf/generic.py
Normal file
542
src/libprs500/ebooks/pyPdf/generic.py
Normal file
@ -0,0 +1,542 @@
|
||||
# vim: sw=4:expandtab:foldmethod=marker
|
||||
#
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
"""
|
||||
Implementation of generic PDF objects (dictionary, number, string, and so on)
|
||||
"""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "mfenniak@pobox.com"
|
||||
|
||||
import re
|
||||
from utils import readNonWhitespace, RC4_encrypt
|
||||
import filters
|
||||
|
||||
def readObject(stream, pdf):
    """Read the next PDF object from ``stream``.

    The first character is peeked at to select the matching object
    reader.  Comments are skipped before reading the object after them.

    @param stream: seekable stream positioned at the start of an object.
    @param pdf: owning reader, handed to the readers that need it.
    @return: the parsed object.
    @raise ValueError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1) # reset to start
    if tok == 't' or tok == 'f':
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == '(':
        # string object
        return StringObject.readFromStream(stream)
    elif tok == '/':
        # name object
        return NameObject.readFromStream(stream)
    elif tok == '[':
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == 'n':
        # null object
        return NullObject.readFromStream(stream)
    elif tok == '<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1) # reset to start
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return StringObject.readHexStringFromStream(stream)
    elif tok == '%':
        # comment: skip to the end of the line
        while tok not in ('\r', '\n'):
            tok = stream.read(1)
            if not tok:
                # read() keeps returning '' at end of file; without this
                # check the loop would never terminate.
                raise ValueError("Unexpected end of stream inside comment")
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == '+' or tok == '-':
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1) # reset to start
        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
|
||||
|
||||
class PdfObject(object):
    """Common base class of the PDF object types in this module."""

    def getObject(self):
        """Return this object itself (indirect references override this)."""
        return self
|
||||
|
||||
|
||||
class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def writeToStream(self, stream, encryption_key):
        """Write the literal ``null``; ``encryption_key`` is not used."""
        stream.write("null")

    @staticmethod
    def readFromStream(stream):
        """Consume the four characters ``null`` and return a NullObject.

        @raise AssertionError: if the next four characters are not "null".
        """
        # Read outside of any assert statement: under ``python -O`` an
        # assert (and a read inside it) is removed, which would leave the
        # stream position unchanged.
        word = stream.read(4)
        if word != "null":
            raise AssertionError("expected 'null', found %r" % (word,))
        return NullObject()
|
||||
|
||||
|
||||
class BooleanObject(PdfObject):
    """A PDF boolean; the Python value is kept in ``self.value``."""

    def __init__(self, value):
        self.value = value

    def writeToStream(self, stream, encryption_key):
        """Write ``true`` or ``false``; ``encryption_key`` is not used."""
        if self.value:
            stream.write("true")
        else:
            stream.write("false")

    @staticmethod
    def readFromStream(stream):
        """Read ``true`` or ``false`` from ``stream``.

        @raise AssertionError: if neither keyword is found.  Raised
            explicitly so that ``python -O`` cannot turn the failure into
            a silent ``None`` return value.
        """
        word = stream.read(4)
        if word == "true":
            return BooleanObject(True)
        if word == "fals":
            # consume the fifth character of "false"
            stream.read(1)
            return BooleanObject(False)
        raise AssertionError("expected a boolean, found %r" % (word,))
|
||||
|
||||
|
||||
class ArrayObject(list, PdfObject):
    """A PDF array, stored as a Python list of PDF objects."""

    def writeToStream(self, stream, encryption_key):
        """Write the array as ``[ item item ... ]``."""
        stream.write("[")
        for data in self:
            stream.write(" ")
            data.writeToStream(stream, encryption_key)
        stream.write(" ]")

    @staticmethod
    def readFromStream(stream, pdf):
        """Read an array, from its opening "[" to its closing "]".

        @param stream: seekable stream positioned at the "[".
        @param pdf: owning reader, handed to readObject for every item.
        @raise AssertionError: if the stream is not positioned at "[".
        @raise ValueError: if the stream ends before the closing "]".
        """
        arr = ArrayObject()
        # Read outside of any assert statement so that the character is
        # consumed under ``python -O`` as well.
        opener = stream.read(1)
        if opener != "[":
            raise AssertionError("expected '[', found %r" % (opener,))
        while True:
            # skip leading whitespace
            tok = stream.read(1)
            while tok.isspace():
                tok = stream.read(1)
            if not tok:
                raise ValueError("Unexpected end of stream inside array")
            # check for array ending
            if tok == "]":
                break
            # not the end: push the character back and read the item
            stream.seek(-1, 1)
            arr.append(readObject(stream, pdf))
        return arr
|
||||
|
||||
|
||||
class IndirectObject(PdfObject):
    """A reference ("<id> <generation> R") to an object owned by ``pdf``."""

    def __init__(self, idnum, generation, pdf):
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def getObject(self):
        """Resolve the reference through ``self.pdf`` and return the target."""
        return self.pdf.getObject(self).getObject()

    def __repr__(self):
        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)

    def __eq__(self, other):
        """Equal when id, generation and owning pdf (by identity) match."""
        return (
            isinstance(other, IndirectObject) and
            self.idnum == other.idnum and
            self.generation == other.generation and
            self.pdf is other.pdf
            )

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Keep hashing consistent with __eq__ so that equal references
        # behave as one key in dicts and sets.
        return hash((self.idnum, self.generation, id(self.pdf)))

    def writeToStream(self, stream, encryption_key):
        """Write the reference as ``<id> <generation> R``."""
        stream.write("%s %s R" % (self.idnum, self.generation))

    @staticmethod
    def readFromStream(stream, pdf):
        """Read ``<id> <generation> R`` from ``stream``.

        @raise AssertionError: if the "R" keyword is missing; this is also
            what happens when the stream ends early.
        @raise ValueError: if id or generation is not an integer.
        """
        fields = []
        for _ in range(2):
            digits = ""
            while True:
                tok = stream.read(1)
                # An empty read means end of file; stop instead of looping.
                if not tok or tok.isspace():
                    break
                digits += tok
            fields.append(digits)
        r = stream.read(1)
        if r != "R":
            raise AssertionError("expected 'R' after %r %r" % (fields[0], fields[1]))
        return IndirectObject(int(fields[0]), int(fields[1]), pdf)
|
||||
|
||||
|
||||
class FloatObject(float, PdfObject):
    """A PDF real number."""

    def writeToStream(self, stream, encryption_key):
        """Write the number using its repr(); ``encryption_key`` is unused."""
        text = repr(self)
        stream.write(text)
|
||||
|
||||
|
||||
class NumberObject(int, PdfObject):
    """A PDF integer.

    No ``__init__`` is defined: an int takes its value in ``__new__``, and
    forwarding the value to ``int.__init__`` is an error on Python 3.
    """

    def writeToStream(self, stream, encryption_key):
        """Write the number using its repr(); ``encryption_key`` is unused."""
        stream.write(repr(self))

    @staticmethod
    def readFromStream(stream):
        """Read a number and return a NumberObject or a FloatObject.

        Characters are consumed while they are digits, "+", "-" or ".".
        A FloatObject is returned when the text contains a ".".

        @raise ValueError: if the collected text is not a valid number.
        """
        name = ""
        while True:
            tok = stream.read(1)
            if not tok:
                # End of file: nothing was read, so nothing to push back.
                break
            if tok not in "+-." and not tok.isdigit():
                stream.seek(-1, 1)
                break
            name += tok
        if "." in name:
            return FloatObject(name)
        return NumberObject(name)
|
||||
|
||||
|
||||
class StringObject(str, PdfObject):
    """A PDF string, read from literal "(...)" or hexadecimal "<...>" form."""

    # Single character escapes that may follow a backslash in a literal string.
    _escapes = {
        "n": "\n", "r": "\r", "t": "\t", "b": "\b", "f": "\f",
        "(": "(", ")": ")", "\\": "\\",
    }

    def writeToStream(self, stream, encryption_key):
        """Write the string in literal form.

        The text is RC4 encrypted first when ``encryption_key`` is true.
        Characters that are neither alphanumeric nor whitespace are written
        as three digit octal escapes.
        """
        string = self
        if encryption_key:
            string = RC4_encrypt(encryption_key, string)
        stream.write("(")
        for c in string:
            if not c.isalnum() and not c.isspace():
                stream.write("\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(")")

    @staticmethod
    def readHexStringFromStream(stream):
        """Read a hexadecimal string from its opening "<" to the ">".

        Whitespace is skipped; a trailing unpaired digit is padded with "0".

        @raise ValueError: if the stream ends before the closing ">".
        """
        stream.read(1)  # consume the opening "<"
        decoded = []
        x = ""
        while True:
            tok = readNonWhitespace(stream)
            if not tok:
                raise ValueError("Unexpected end of stream inside hexadecimal string")
            if tok == ">":
                break
            x += tok
            if len(x) == 2:
                decoded.append(chr(int(x, base=16)))
                x = ""
        if len(x) == 1:
            x += "0"
        if len(x) == 2:
            decoded.append(chr(int(x, base=16)))
        return StringObject("".join(decoded))

    @staticmethod
    def readFromStream(stream):
        """Read a literal string from its opening "(" to the matching ")".

        Balanced parentheses are kept.  Backslash escapes are decoded:
        the single character escapes, and octal escapes of one to three
        digits.  A backslash before any other character is dropped.

        @raise ValueError: if the stream ends before the closing ")".
        """
        stream.read(1)  # consume the opening "("
        parens = 1
        pieces = []
        # Character read while scanning an octal escape that belongs to the
        # next loop iteration; kept here so that no seek is needed.
        pending = ""
        while True:
            if pending:
                tok, pending = pending, ""
            else:
                tok = stream.read(1)
            if not tok:
                raise ValueError("Unexpected end of stream inside string object")
            if tok == "(":
                parens += 1
            elif tok == ")":
                parens -= 1
                if parens == 0:
                    break
            elif tok == "\\":
                tok = stream.read(1)
                if tok in StringObject._escapes:
                    tok = StringObject._escapes[tok]
                elif tok and tok in "01234567":
                    digits = tok
                    while len(digits) < 3:
                        nxt = stream.read(1)
                        if nxt and nxt in "01234567":
                            digits += nxt
                        else:
                            pending = nxt
                            break
                    # High-order overflow is ignored, as in the PDF spec.
                    tok = chr(int(digits, 8) % 256)
            pieces.append(tok)
        return StringObject("".join(pieces))
|
||||
|
||||
|
||||
class NameObject(str, PdfObject):
    """A PDF name such as ``/Type``; the leading "/" is part of the value.

    No ``__init__`` is defined: a str takes its value in ``__new__``, and
    forwarding the value to ``str.__init__`` is an error on Python 3.
    """

    delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"

    def writeToStream(self, stream, encryption_key):
        """Write the name as is; ``encryption_key`` is not used."""
        stream.write(self)

    @staticmethod
    def readFromStream(stream):
        """Read a name, stopping before whitespace or a delimiter.

        @raise AssertionError: if the stream is not positioned at "/".
        """
        name = stream.read(1)
        if name != "/":
            raise AssertionError("expected '/', found %r" % (name,))
        while True:
            tok = stream.read(1)
            if not tok:
                # End of file ends the name; there is nothing to push back.
                break
            if tok.isspace() or tok in NameObject.delimiterCharacters:
                stream.seek(-1, 1)
                break
            name += tok
        return NameObject(name)
|
||||
|
||||
|
||||
class DictionaryObject(dict, PdfObject):
    """A PDF dictionary; ``readFromStream`` also recognises stream objects."""

    def __init__(self):
        pass

    def writeToStream(self, stream, encryption_key):
        """Write the dictionary as ``<<``, one "key value" line per entry, ``>>``."""
        stream.write("<<\n")
        for key, value in self.items():
            key.writeToStream(stream, encryption_key)
            stream.write(" ")
            value.writeToStream(stream, encryption_key)
            stream.write("\n")
        stream.write(">>")

    @staticmethod
    def readFromStream(stream, pdf):
        """Read a dictionary and, if one follows it, the stream data.

        @param stream: seekable stream positioned at the opening "<<".
        @param pdf: owning reader; used to read nested objects and to
            resolve an indirect "/Length".
        @return: a DictionaryObject, or the result of
            StreamObject.initializeFromDictionary when the dictionary is
            followed by the ``stream`` keyword.
        @raise AssertionError: on a missing "<<", a duplicated key, a bad
            end of line after ``stream`` or a stream without "/Length".
            Raised explicitly so the checks survive ``python -O``.
        @raise ValueError: if the data ends inside the dictionary or the
            ``endstream`` marker cannot be found.
        """
        # Read outside of any assert statement so that the characters are
        # consumed under ``python -O`` as well.
        opener = stream.read(2)
        if opener != "<<":
            raise AssertionError("expected '<<', found %r" % (opener,))
        data = {}
        while True:
            tok = readNonWhitespace(stream)
            if not tok:
                raise ValueError("Unexpected end of stream inside dictionary")
            if tok == ">":
                stream.read(1)
                break
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            # skip the whitespace between key and value
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            if key in data:
                # multiple definitions of key not permitted
                raise AssertionError("multiple definitions of key %r" % (key,))
            data[key] = value
        pos = stream.tell()
        s = readNonWhitespace(stream)
        if s == 's' and stream.read(5) == 'tream':
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but before EOL.
            # patch provided by Danial Sandler
            while eol == ' ':
                eol = stream.read(1)
            if eol not in ("\n", "\r"):
                raise AssertionError("expected end of line after 'stream' keyword")
            if eol == "\r":
                # read \n after
                stream.read(1)
            # this is a stream object, not a dictionary
            if "/Length" not in data:
                raise AssertionError("stream object without /Length")
            length = data["/Length"]
            if isinstance(length, IndirectObject):
                # Resolving the reference moves the stream position, so
                # remember it and restore it afterwards.
                t = stream.tell()
                length = pdf.getObject(length)
                stream.seek(t, 0)
            data["__streamdata__"] = stream.read(length)
            e = readNonWhitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != "endstream":
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                pos = stream.tell()
                stream.seek(-10, 1)
                end = stream.read(9)
                if end == "endstream":
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    stream.seek(pos, 0)
                    # String exceptions are not supported by Python 2.6+.
                    raise ValueError("Unable to find 'endstream' marker after stream.")
        else:
            stream.seek(pos, 0)
        if "__streamdata__" in data:
            return StreamObject.initializeFromDictionary(data)
        else:
            retval = DictionaryObject()
            retval.update(data)
            return retval
|
||||
|
||||
|
||||
class StreamObject(DictionaryObject):
    """A PDF stream: a dictionary plus the stream data kept in ``_data``."""

    def __init__(self):
        self._data = None
        self.decodedSelf = None

    def writeToStream(self, stream, encryption_key):
        """Write the dictionary (with "/Length") followed by the stream data.

        The data is RC4 encrypted first when ``encryption_key`` is true.
        """
        self[NameObject("/Length")] = NumberObject(len(self._data))
        try:
            DictionaryObject.writeToStream(self, stream, encryption_key)
        finally:
            # "/Length" is only needed while writing; remove it again even
            # when writing the dictionary fails.
            del self["/Length"]
        stream.write("\nstream\n")
        data = self._data
        if encryption_key:
            data = RC4_encrypt(encryption_key, data)
        stream.write(data)
        stream.write("\nendstream")

    @staticmethod
    def initializeFromDictionary(data):
        """Build a stream object from a parsed dictionary.

        ``data`` must hold "__streamdata__" and "/Length"; both entries are
        removed from it.  An EncodedStreamObject is returned when "/Filter"
        is present, otherwise a DecodedStreamObject.
        """
        if "/Filter" in data:
            retval = EncodedStreamObject()
        else:
            retval = DecodedStreamObject()
        retval._data = data["__streamdata__"]
        del data["__streamdata__"]
        del data["/Length"]
        retval.update(data)
        return retval

    def flateEncode(self):
        """Return a new EncodedStreamObject holding this data deflated.

        "/FlateDecode" is placed in front of any filters this stream
        already has.  This stream itself is left unchanged.
        """
        if "/Filter" in self:
            f = self["/Filter"]
            newf = ArrayObject()
            newf.append(NameObject("/FlateDecode"))
            if isinstance(f, ArrayObject):
                # Build a new array rather than inserting into ``f``, which
                # would also change the filter list of this stream.
                newf.extend(f)
            else:
                newf.append(f)
            f = newf
        else:
            f = NameObject("/FlateDecode")
        retval = EncodedStreamObject()
        retval[NameObject("/Filter")] = f
        retval._data = filters.FlateDecode.encode(self._data)
        return retval
|
||||
|
||||
|
||||
class DecodedStreamObject(StreamObject):
    """A stream whose ``_data`` attribute holds the data as is."""

    def getData(self):
        """Return the stream data."""
        return self._data

    def setData(self, data):
        """Replace the stream data with ``data``."""
        self._data = data
|
||||
|
||||
|
||||
class EncodedStreamObject(StreamObject):
    """A stream whose ``_data`` is still encoded by its "/Filter" chain."""

    def __init__(self):
        self._data = None
        # DecodedStreamObject created by the first getData() call.
        self.decodedSelf = None

    def getData(self):
        """Return the decoded stream data, decoding only on the first call."""
        # Compare with None: the cached object is a dictionary and is
        # therefore false whenever it has no entries.
        if self.decodedSelf is not None:
            # cached version of decoded object
            return self.decodedSelf.getData()
        # create decoded object; it must provide getData() for the cached
        # path above, which the plain StreamObject does not.
        decoded = DecodedStreamObject()
        decoded._data = filters.decodeStreamData(self)
        for key, value in self.items():
            if key not in ("/Length", "/Filter", "/DecodeParms"):
                decoded[key] = value
        self.decodedSelf = decoded
        return decoded._data

    def setData(self, data):
        """Not supported.

        @raise NotImplementedError: always.
        """
        # String exceptions are not supported by Python 2.6+.
        raise NotImplementedError("Creating EncodedStreamObject is not currently supported")
|
||||
|
||||
|
||||
class RectangleObject(ArrayObject):
    """A PDF rectangle stored as [lower-left x, lower-left y, upper-right x, upper-right y]."""

    def __init__(self, arr):
        """Create the rectangle from a sequence of exactly four numbers.

        @raise AssertionError: if ``arr`` does not have four items.
        """
        # must have four points
        if len(arr) != 4:
            raise AssertionError("a rectangle needs exactly four values")
        # automatically convert arr[x] into a PDF number if necessary
        ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])

    def ensureIsNumber(self, value):
        """Return ``value`` as a PDF number object.

        NumberObject and FloatObject values are returned unchanged and
        plain floats become FloatObject, so fractional coordinates are not
        truncated.  Anything else is converted with NumberObject.
        """
        if isinstance(value, (NumberObject, FloatObject)):
            return value
        if isinstance(value, float):
            return FloatObject(value)
        return NumberObject(value)

    def __repr__(self):
        return "RectangleObject(%s)" % repr(list(self))

    def getLowerLeft_x(self):
        return self[0]

    def getLowerLeft_y(self):
        return self[1]

    def getUpperRight_x(self):
        return self[2]

    def getUpperRight_y(self):
        return self[3]

    def getUpperLeft_x(self):
        return self.getLowerLeft_x()

    def getUpperLeft_y(self):
        return self.getUpperRight_y()

    def getLowerRight_x(self):
        return self.getUpperRight_x()

    def getLowerRight_y(self):
        return self.getLowerLeft_y()

    def getLowerLeft(self):
        return self.getLowerLeft_x(), self.getLowerLeft_y()

    def getLowerRight(self):
        return self.getLowerRight_x(), self.getLowerRight_y()

    def getUpperLeft(self):
        return self.getUpperLeft_x(), self.getUpperLeft_y()

    def getUpperRight(self):
        return self.getUpperRight_x(), self.getUpperRight_y()

    # Each setter takes an (x, y) pair and stores it in the two slots that
    # define that corner.
    def setLowerLeft(self, value):
        self[0], self[1] = [self.ensureIsNumber(x) for x in value]

    def setLowerRight(self, value):
        self[2], self[1] = [self.ensureIsNumber(x) for x in value]

    def setUpperLeft(self, value):
        self[0], self[3] = [self.ensureIsNumber(x) for x in value]

    def setUpperRight(self, value):
        self[2], self[3] = [self.ensureIsNumber(x) for x in value]

    lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
    lowerRight = property(getLowerRight, setLowerRight, None, None)
    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
    upperRight = property(getUpperRight, setUpperRight, None, None)
|
||||
|
1162
src/libprs500/ebooks/pyPdf/pdf.py
Normal file
1162
src/libprs500/ebooks/pyPdf/pdf.py
Normal file
File diff suppressed because it is too large
Load Diff
94
src/libprs500/ebooks/pyPdf/utils.py
Normal file
94
src/libprs500/ebooks/pyPdf/utils.py
Normal file
@ -0,0 +1,94 @@
|
||||
# vim: sw=4:expandtab:foldmethod=marker
|
||||
#
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
"""
|
||||
Utility functions for PDF library.
|
||||
"""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "mfenniak@pobox.com"
|
||||
|
||||
def readUntilWhitespace(stream, maxchars=None):
    """Read characters up to the next whitespace or the end of the stream.

    The whitespace character that ends the read is consumed but is not
    part of the result.  Reading also stops once ``maxchars`` characters
    have been collected.
    """
    collected = []
    while True:
        char = stream.read(1)
        if not char or char.isspace():
            break
        collected.append(char)
        if len(collected) == maxchars:
            break
    return "".join(collected)
|
||||
|
||||
def readNonWhitespace(stream):
    """Return the next character that is not a newline, carriage return,
    space or tab; returns whatever read(1) gives at the end of the stream.
    """
    char = stream.read(1)
    while char in ('\n', '\r', ' ', '\t'):
        char = stream.read(1)
    return char
|
||||
|
||||
class ConvertFunctionsToVirtualList(object):
    """Read-only sequence backed by a length function and an item function."""

    def __init__(self, lengthFunction, getFunction):
        self.lengthFunction = lengthFunction
        self.getFunction = getFunction

    def __len__(self):
        return self.lengthFunction()

    def __getitem__(self, index):
        """Return item ``index``; negative indexes count from the end."""
        if not isinstance(index, int):
            raise TypeError("sequence indices must be integers")
        size = len(self)
        position = index
        if position < 0:
            # support negative indexes
            position += size
        if not (0 <= position < size):
            raise IndexError("sequence index out of range")
        return self.getFunction(position)
|
||||
|
||||
def RC4_encrypt(key, plaintext):
    """Return ``plaintext`` encrypted with RC4 under ``key``.

    RC4 is symmetric, so applying the function to its own output with the
    same key gives the plaintext back.
    """
    # Key scheduling: permute the state table according to the key.
    state = list(range(256))
    key_length = len(key)
    j = 0
    for i in range(256):
        j = (j + state[i] + ord(key[i % key_length])) % 256
        state[i], state[j] = state[j], state[i]
    # Keystream generation, XORed into the text one character at a time.
    i = j = 0
    encrypted = []
    for char in plaintext:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        keystream = state[(state[i] + state[j]) % 256]
        encrypted.append(chr(ord(char) ^ keystream))
    return "".join(encrypted)
|
||||
|
||||
if __name__ == "__main__":
    # test RC4: encrypt a known text, then decrypt it with the same key
    ciphertext = RC4_encrypt("Key", "Plaintext")
    print(repr(ciphertext))
    roundtrip = RC4_encrypt("Key", ciphertext)
    print(repr(roundtrip))
|
Loading…
x
Reference in New Issue
Block a user