diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 58390a314a..bfe23cee67 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -444,23 +444,6 @@ class CurrentDir(object): pass -class StreamReadWrapper(object): - ''' - Used primarily with pyPdf to ensure the stream is properly closed. - ''' - - def __init__(self, stream): - for x in ('read', 'seek', 'tell'): - setattr(self, x, getattr(stream, x)) - - def __exit__(self, *args): - for x in ('read', 'seek', 'tell'): - setattr(self, x, None) - - def __enter__(self): - return self - - def detect_ncpus(): """Detects the number of effective CPUs in the system""" import multiprocessing diff --git a/src/calibre/ebooks/pdf/manipulate/__init__.py b/src/calibre/ebooks/pdf/manipulate/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/calibre/ebooks/pdf/manipulate/cli.py b/src/calibre/ebooks/pdf/manipulate/cli.py deleted file mode 100644 index c6e52f85d3..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/cli.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import with_statement -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Command line interface to run pdf manipulation commands. -''' - -import string, sys - -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.ebooks.pdf.manipulate import crop, decrypt, encrypt, \ - info, merge, reverse, rotate, split - -COMMANDS = { - 'crop' : crop, - 'decrypt' : decrypt, - 'encrypt' : encrypt, - 'info' : info, - 'merge' : merge, - 'reverse' : reverse, - 'rotate' : rotate, - 'split' : split, - } - -USAGE = '%prog ' + _('''command ... - -command can be one of the following: -[%%commands] - -Use %prog command --help to get more information about a specific command - -Manipulate a PDF. -''').replace('%%commands', string.join(sorted(COMMANDS.keys()), ', ')) - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(): - return OptionParser(usage=USAGE) - -def main(args=sys.argv): - log = Log() - parser = option_parser() - - if len(args) < 2: - print 'Error: No command sepecified.\n' - print_help(parser, log) - return 1 - - command = args[1].lower().strip() - - if command in COMMANDS.keys(): - del args[1] - return COMMANDS[command].main(args, command) - else: - parser.parse_args(args) - print 'Unknown command %s.\n' % command - print_help(parser, log) - return 1 - - # We should never get here. - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/crop.py b/src/calibre/ebooks/pdf/manipulate/crop.py deleted file mode 100644 index 8957320280..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/crop.py +++ /dev/null @@ -1,150 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, James Beal , ' \ - '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Crop a pdf file -''' - -import sys -import re -from decimal import Decimal -from optparse import OptionGroup, Option - -from calibre.ebooks.metadata.meta import metadata_from_formats -from calibre.ebooks.metadata import authors_to_string -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted - -from pyPdf import PdfFileWriter, PdfFileReader - -DEFAULT_CROP = 10 - -USAGE = '\n%prog %%name ' + _('''\ -[options] file.pdf - -Crop a PDF file. -''') - -OPTIONS = set([ - OptionRecommendation(name='output', recommended_value='cropped.pdf', - level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', - help=_('Path to output file. By default a file is created in the current directory.')), - OptionRecommendation(name='bottom_left_x', recommended_value=DEFAULT_CROP, - level=OptionRecommendation.LOW, long_switch='left-x', short_switch='x', - help=_('Number of pixels to crop from the left most x (default is %s)') % DEFAULT_CROP), - OptionRecommendation(name='bottom_left_y', recommended_value=DEFAULT_CROP, - level=OptionRecommendation.LOW, long_switch='left-y', short_switch='y', - help=_('Number of pixels to crop from the left most y (default is %s)') % DEFAULT_CROP), - OptionRecommendation(name='top_right_x', recommended_value=DEFAULT_CROP, - level=OptionRecommendation.LOW, long_switch='right-x', short_switch='v', - help=_('Number of pixels to crop from the right most x (default is %s)') % DEFAULT_CROP), - OptionRecommendation(name='top_right_y', recommended_value=DEFAULT_CROP, - level=OptionRecommendation.LOW, long_switch='right-y', short_switch='w', - help=_('Number of pixels to crop from the right most y (default is %s)') % DEFAULT_CROP), - OptionRecommendation(name='bounding', recommended_value=None, - level=OptionRecommendation.LOW, long_switch='bounding', short_switch='b', - help=_('A file generated by ghostscript which allows each page to be individually cropped `gs -dSAFER -dNOPAUSE -dBATCH -sDEVICE=bbox file.pdf 2> bounding`')), -]) - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(name): - usage = USAGE.replace('%%name', name) - return OptionParser(usage=usage) - -def option_recommendation_to_cli_option(add_option, rec): - opt = rec.option - switches = ['-'+opt.short_switch] if opt.short_switch else [] - switches.append('--'+opt.long_switch) - attrs = dict(dest=opt.name, help=opt.help, - choices=opt.choices, default=rec.recommended_value) - add_option(Option(*switches, **attrs)) - -def add_options(parser): - group = OptionGroup(parser, _('Crop Options:'), _('Options to control the transformation of pdf')) - parser.add_option_group(group) - add_option = group.add_option - - for rec in OPTIONS: - option_recommendation_to_cli_option(add_option, rec) - -def crop_pdf(pdf_path, opts, metadata=None): - if metadata == None: - title = _('Unknown') - author = _('Unknown') - else: - title = metadata.title - author = authors_to_string(metadata.authors) - - input_pdf = PdfFileReader(open(pdf_path, 'rb')) - - bounding_lines = [] - if opts.bounding != None: - try: - bounding = open(opts.bounding , 'r') - bounding_regex = re.compile('%%BoundingBox: (?P\d+) (?P\d+) (?P\d+) (?P\d+)') - except: - raise Exception('Error reading %s' % opts.bounding) - - lines = bounding.readlines() - for line in lines: - if line.startswith('%%BoundingBox:'): - bounding_lines.append(line) - if len(bounding_lines) != input_pdf.numPages: - raise Exception('Error bounding file %s page count does not correspond to specified pdf' % opts.bounding) - - output_pdf = PdfFileWriter(title=title,author=author) - blines = iter(bounding_lines) - for page in input_pdf.pages: - if bounding_lines != []: - mo = bounding_regex.search(blines.next()) - if mo == None: - raise Exception('Error in bounding file %s' % opts.bounding) - page.mediaBox.upperRight = (float(mo.group('top_x')), Decimal(mo.group('top_y'))) - page.mediaBox.lowerLeft = (float(mo.group('bottom_x')), Decimal(mo.group('bottom_y'))) - else: - page.mediaBox.upperRight = (page.bleedBox.getUpperRight_x() - Decimal(opts.top_right_x), page.bleedBox.getUpperRight_y() - Decimal(opts.top_right_y)) - page.mediaBox.lowerLeft = (page.bleedBox.getLowerLeft_x() + Decimal(opts.bottom_left_x), page.bleedBox.getLowerLeft_y() + Decimal(opts.bottom_left_y)) - output_pdf.addPage(page) - - with open(opts.output, 'wb') as output_file: - output_pdf.write(output_file) - -def main(args=sys.argv, name=''): - log = Log() - parser = option_parser(name) - add_options(parser) - - opts, args = parser.parse_args(args) - args = args[1:] - - if len(args) < 1: - print 'Error: A PDF file is required.\n' - print_help(parser, log) - return 1 - - if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`.' % args[0] - return 1 - - if is_encrypted(args[0]): - print 'Error: file `%s` is encrypted.' % args[0] - return 1 - - mi = metadata_from_formats([args[0]]) - - crop_pdf(args[0], opts, mi) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/decrypt.py b/src/calibre/ebooks/pdf/manipulate/decrypt.py deleted file mode 100644 index fd8510efc7..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/decrypt.py +++ /dev/null @@ -1,113 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Decrypt content of PDF. -''' - -import os, sys -from optparse import OptionGroup, Option - -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted - -from pyPdf import PdfFileWriter, PdfFileReader - -USAGE = '\n%prog %%name ' + _('''\ -[options] file.pdf password - -Decrypt a PDF. -''') - -OPTIONS = set([ - OptionRecommendation(name='output', recommended_value='decrypted.pdf', - level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', - help=_('Path to output file. By default a file is created in the current directory.')), -]) - -class DecryptionError(Exception): - def __init__(self, pdf_path): - self.value = 'Unable to decrypt file `%s`.' % pdf_path - - def __str__(self): - return repr(self.value) - - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(name): - usage = USAGE.replace('%%name', name) - return OptionParser(usage=usage) - -def option_recommendation_to_cli_option(add_option, rec): - opt = rec.option - switches = ['-'+opt.short_switch] if opt.short_switch else [] - switches.append('--'+opt.long_switch) - attrs = dict(dest=opt.name, help=opt.help, - choices=opt.choices, default=rec.recommended_value) - add_option(Option(*switches, **attrs)) - -def add_options(parser): - group = OptionGroup(parser, _('Decrypt Options:'), _('Options to control the transformation of pdf')) - parser.add_option_group(group) - add_option = group.add_option - - for rec in OPTIONS: - option_recommendation_to_cli_option(add_option, rec) - -def decrypt(pdf_path, out_path, password): - pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) - - if pdf.decrypt(str(password)) == 0: - raise DecryptionError(pdf_path) - - title = pdf.documentInfo.title if pdf.documentInfo.title else _('Unknown') - author = pdf.documentInfo.author if pdf.documentInfo.author else _('Unknown') - out_pdf = PdfFileWriter(title=title, author=author) - - for page in pdf.pages: - out_pdf.addPage(page) - - with open(out_path, 'wb') as out_file: - out_pdf.write(out_file) - -def main(args=sys.argv, name=''): - log = Log() - parser = option_parser(name) - add_options(parser) - - opts, args = parser.parse_args(args) - args = args[1:] - - if len(args) < 2: - print 'Error: A PDF file and decryption password is required.\n' - print_help(parser, log) - return 1 - - if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`.' % args[0] - return 1 - - if not is_encrypted(args[0]): - print 'Error: file `%s` is not encrypted.' % args[0] - return 1 - - try: - decrypt(args[0], opts.output, args[1]) - except DecryptionError as e: - print e.value - return 1 - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/encrypt.py b/src/calibre/ebooks/pdf/manipulate/encrypt.py deleted file mode 100644 index ff3b47b11a..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/encrypt.py +++ /dev/null @@ -1,107 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Encrypt a PDF. -''' - -import os, sys -from optparse import OptionGroup, Option - -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted -from calibre.ebooks.metadata import authors_to_string -from calibre.ebooks.metadata.meta import metadata_from_formats - -from pyPdf import PdfFileWriter, PdfFileReader - -USAGE = '\n%prog %%name ' + _('''\ -[options] file.pdf password - -Encrypt a PDF. -''') - -OPTIONS = set([ - OptionRecommendation(name='output', recommended_value='encrypted.pdf', - level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', - help=_('Path to output file. By default a file is created in the current directory.')), -]) - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(name): - usage = USAGE.replace('%%name', name) - return OptionParser(usage=usage) - -def option_recommendation_to_cli_option(add_option, rec): - opt = rec.option - switches = ['-'+opt.short_switch] if opt.short_switch else [] - switches.append('--'+opt.long_switch) - attrs = dict(dest=opt.name, help=opt.help, - choices=opt.choices, default=rec.recommended_value) - add_option(Option(*switches, **attrs)) - -def add_options(parser): - group = OptionGroup(parser, _('Encrypt Options:'), _('Options to control the transformation of pdf')) - parser.add_option_group(group) - add_option = group.add_option - - for rec in OPTIONS: - option_recommendation_to_cli_option(add_option, rec) - -def encrypt(pdf_path, out_path, password, metadata=None): - if metadata == None: - title = _('Unknown') - author = _('Unknown') - else: - title = metadata.title - author = authors_to_string(metadata.authors) - - out_pdf = PdfFileWriter(title=title, author=author) - - pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) - for page in pdf.pages: - out_pdf.addPage(page) - - with open(out_path, 'wb') as out_file: - out_pdf.encrypt(str(password)) - out_pdf.write(out_file) - -def main(args=sys.argv, name=''): - log = Log() - parser = option_parser(name) - add_options(parser) - - opts, args = parser.parse_args(args) - args = args[1:] - - if len(args) < 2: - print 'Error: A PDF file and decryption password is required.\n' - print_help(parser, log) - return 1 - - if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`.' % args[0] - return 1 - - if is_encrypted(args[0]): - print 'Error: file `%s` is already encrypted.' % args[0] - return 1 - - mi = metadata_from_formats([args[0]]) - - encrypt(args[0], opts.output, args[1], mi) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/info.py b/src/calibre/ebooks/pdf/manipulate/info.py deleted file mode 100644 index ee71dac71d..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/info.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import with_statement -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Merge PDF files into a single PDF document. -''' - -import os, sys - -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.ebooks.pdf.verify import is_valid_pdfs, is_encrypted -from calibre.utils.podofo import get_podofo -from calibre import prints - -USAGE = '\n%prog %%name ' + _('''\ -file.pdf ... - -Get info about a PDF. -''') - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(name): - usage = USAGE.replace('%%name', name) - return OptionParser(usage=usage) - -def print_info(pdf_path): - podofo = get_podofo() - p = podofo.PDFDoc() - p.open(pdf_path) - - fmt = lambda x, y: '%-20s: %s'%(x, y) - - print - - prints(fmt(_('Title'), p.title)) - prints(fmt(_('Author'), p.author)) - prints(fmt(_('Subject'), p.subject)) - prints(fmt(_('Creator'), p.creator)) - prints(fmt(_('Producer'), p.producer)) - prints(fmt(_('Pages'), p.pages)) - prints(fmt(_('File Size'), os.stat(pdf_path).st_size)) - prints(fmt(_('PDF Version'), p.version if p.version else _('Unknown'))) - -def main(args=sys.argv, name=''): - log = Log() - parser = option_parser(name) - - opts, args = parser.parse_args(args) - args = args[1:] - - if len(args) < 1: - print 'Error: No PDF sepecified.\n' - print_help(parser, log) - return 1 - - bad_pdfs = is_valid_pdfs(args) - if bad_pdfs != []: - for pdf in bad_pdfs: - print 'Error: Could not read file `%s`.' % pdf - return 1 - - enc = False - for pdf in args: - if is_encrypted(pdf): - enc = True - print 'Error: file `%s` is encrypted. Please decrypt first.' % pdf - if enc: - return 1 - - for pdf in args: - print_info(pdf) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/merge.py b/src/calibre/ebooks/pdf/manipulate/merge.py deleted file mode 100644 index e300136d16..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/merge.py +++ /dev/null @@ -1,115 +0,0 @@ -from __future__ import with_statement -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Merge PDF files into a single PDF document. -''' - -import os, sys -from optparse import OptionGroup, Option - -from calibre.ebooks.metadata.meta import metadata_from_formats -from calibre.ebooks.metadata import authors_to_string -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdfs, is_encrypted - -from pyPdf import PdfFileWriter, PdfFileReader - -USAGE = '\n%prog %%name ' + _('''\ -[options] file1.pdf file2.pdf ... - -Metadata will be used from the first PDF specified. - -Merges individual PDFs. -''') - -OPTIONS = set([ - OptionRecommendation(name='output', recommended_value='merged.pdf', - level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', - help=_('Path to output file. By default a file is created in the current directory.')), -]) - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(name): - usage = USAGE.replace('%%name', name) - return OptionParser(usage=usage) - -def option_recommendation_to_cli_option(add_option, rec): - opt = rec.option - switches = ['-'+opt.short_switch] if opt.short_switch else [] - switches.append('--'+opt.long_switch) - attrs = dict(dest=opt.name, help=opt.help, - choices=opt.choices, default=rec.recommended_value) - add_option(Option(*switches, **attrs)) - -def add_options(parser): - group = OptionGroup(parser, _('Merge Options:'), _('Options to control the transformation of pdf')) - parser.add_option_group(group) - add_option = group.add_option - - for rec in OPTIONS: - option_recommendation_to_cli_option(add_option, rec) - -def merge_files(in_paths, out_path, metadata=None): - if metadata == None: - title = _('Unknown') - author = _('Unknown') - else: - title = metadata.title - author = authors_to_string(metadata.authors) - - out_pdf = PdfFileWriter(title=title, author=author) - - for pdf_path in in_paths: - pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) - for page in pdf.pages: - out_pdf.addPage(page) - - with open(out_path, 'wb') as out_file: - out_pdf.write(out_file) - -def main(args=sys.argv, name=''): - log = Log() - parser = option_parser(name) - add_options(parser) - - opts, args = parser.parse_args(args) - args = args[1:] - - if len(args) < 2: - print 'Error: Two or more PDF files are required.\n' - print_help(parser, log) - return 1 - - bad_pdfs = is_valid_pdfs(args) - if bad_pdfs != []: - for pdf in bad_pdfs: - print 'Error: Could not read file `%s`.' % pdf - return 1 - - enc = False - for pdf in args: - if is_encrypted(pdf): - enc = True - print 'Error: file `%s` is encrypted.' % pdf - if enc: - return 1 - - mi = metadata_from_formats([args[0]]) - - merge_files(args, opts.output, mi) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/reverse.py b/src/calibre/ebooks/pdf/manipulate/reverse.py deleted file mode 100644 index b4bbe27a40..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/reverse.py +++ /dev/null @@ -1,106 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Reverse content of PDF. -''' - -import os, sys -from optparse import OptionGroup, Option - -from calibre.ebooks.metadata.meta import metadata_from_formats -from calibre.ebooks.metadata import authors_to_string -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted - -from pyPdf import PdfFileWriter, PdfFileReader - -USAGE = '\n%prog %%name ' + _('''\ -[options] file.pdf - -Reverse a PDF. -''') - -OPTIONS = set([ - OptionRecommendation(name='output', recommended_value='reversed.pdf', - level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', - help=_('Path to output file. By default a file is created in the current directory.')), -]) - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(name): - usage = USAGE.replace('%%name', name) - return OptionParser(usage=usage) - -def option_recommendation_to_cli_option(add_option, rec): - opt = rec.option - switches = ['-'+opt.short_switch] if opt.short_switch else [] - switches.append('--'+opt.long_switch) - attrs = dict(dest=opt.name, help=opt.help, - choices=opt.choices, default=rec.recommended_value) - add_option(Option(*switches, **attrs)) - -def add_options(parser): - group = OptionGroup(parser, _('Reverse Options:'), _('Options to control the transformation of pdf')) - parser.add_option_group(group) - add_option = group.add_option - - for rec in OPTIONS: - option_recommendation_to_cli_option(add_option, rec) - -def reverse(pdf_path, out_path, metadata=None): - if metadata == None: - title = _('Unknown') - author = _('Unknown') - else: - title = metadata.title - author = authors_to_string(metadata.authors) - - out_pdf = PdfFileWriter(title=title, author=author) - - pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) - for page in reversed(pdf.pages): - out_pdf.addPage(page) - - with open(out_path, 'wb') as out_file: - out_pdf.write(out_file) - -def main(args=sys.argv, name=''): - log = Log() - parser = option_parser(name) - add_options(parser) - - opts, args = parser.parse_args(args) - args = args[1:] - - if len(args) < 1: - print 'Error: A PDF file is required.\n' - print_help(parser, log) - return 1 - - if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`.' % args[0] - return 1 - - if is_encrypted(args[0]): - print 'Error: file `%s` is encrypted.' % args[0] - return 1 - - mi = metadata_from_formats([args[0]]) - - reverse(args[0], opts.output, mi) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/rotate.py b/src/calibre/ebooks/pdf/manipulate/rotate.py deleted file mode 100644 index ac46a8e0c8..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/rotate.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Rotate pages of a PDF. -''' - -import os, sys -from optparse import OptionGroup, Option - -from calibre.ebooks.metadata.meta import metadata_from_formats -from calibre.ebooks.metadata import authors_to_string -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted - -from pyPdf import PdfFileWriter, PdfFileReader - -USAGE = '\n%prog %%name ' + _('''\ -file.pdf degrees - -Rotate pages of a PDF clockwise. -''') - -OPTIONS = set([ - OptionRecommendation(name='output', recommended_value='rotated.pdf', - level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', - help=_('Path to output file. By default a file is created in the current directory.')), -]) - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(name): - usage = USAGE.replace('%%name', name) - return OptionParser(usage=usage) - -def option_recommendation_to_cli_option(add_option, rec): - opt = rec.option - switches = ['-'+opt.short_switch] if opt.short_switch else [] - switches.append('--'+opt.long_switch) - attrs = dict(dest=opt.name, help=opt.help, - choices=opt.choices, default=rec.recommended_value) - add_option(Option(*switches, **attrs)) - -def add_options(parser): - group = OptionGroup(parser, _('Rotate Options:'), _('Options to control the transformation of pdf')) - parser.add_option_group(group) - add_option = group.add_option - - for rec in OPTIONS: - option_recommendation_to_cli_option(add_option, rec) - -def rotate(pdf_path, out_path, degrees, metadata=None): - if metadata == None: - title = _('Unknown') - author = _('Unknown') - else: - title = metadata.title - author = authors_to_string(metadata.authors) - - out_pdf = PdfFileWriter(title=title, author=author) - - pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) - for page in pdf.pages: - out_pdf.addPage(page.rotateClockwise(int(degrees))) - - with open(out_path, 'wb') as out_file: - out_pdf.write(out_file) - -def main(args=sys.argv, name=''): - log = Log() - parser = option_parser(name) - add_options(parser) - - opts, args = parser.parse_args(args) - args = args[1:] - - if len(args) < 2: - print 'Error: A PDF file and how many degrees to rotate is required.\n' - print_help(parser, log) - return 1 - - if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`.' % args[0] - return 1 - - if is_encrypted(args[0]): - print 'Error: file `%s` is encrypted.' % args[0] - return 1 - - mi = metadata_from_formats([args[0]]) - - rotate(args[0], opts.output, args[1], mi) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/split.py b/src/calibre/ebooks/pdf/manipulate/split.py deleted file mode 100644 index 3ef2549a62..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/split.py +++ /dev/null @@ -1,204 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Split PDF file into multiple PDF documents. -''' - -import os, sys, re -from optparse import OptionGroup, Option - -from calibre.ebooks.metadata.meta import metadata_from_formats -from calibre.ebooks.metadata import authors_to_string -from calibre.utils.config import OptionParser -from calibre.utils.logging import Log -from calibre.constants import preferred_encoding -from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted - -from pyPdf import PdfFileWriter, PdfFileReader - -USAGE = _(''' -%prog %%name [options] file.pdf page_to_split_on ... -%prog %%name [options] file.pdf page_range_to_split_on ... - -Ex. - -%prog %%name file.pdf 6 -%prog %%name file.pdf 6-12 -%prog %%name file.pdf 6-12 8 10 9-20 - -Split a PDF. -''') - -OPTIONS = set([ - OptionRecommendation(name='output', recommended_value='split.pdf', - level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', - help=_('Path to output file. By default a file is created in the current directory.')), -]) - -def print_help(parser, log): - help = parser.format_help().encode(preferred_encoding, 'replace') - log(help) - -def option_parser(name): - usage = USAGE.replace('%%name', name) - return OptionParser(usage=usage) - -def option_recommendation_to_cli_option(add_option, rec): - opt = rec.option - switches = ['-'+opt.short_switch] if opt.short_switch else [] - switches.append('--'+opt.long_switch) - attrs = dict(dest=opt.name, help=opt.help, - choices=opt.choices, default=rec.recommended_value) - add_option(Option(*switches, **attrs)) - -def add_options(parser): - group = OptionGroup(parser, _('Split Options:'), _('Options to control the transformation of pdf')) - parser.add_option_group(group) - add_option = group.add_option - - for rec in OPTIONS: - option_recommendation_to_cli_option(add_option, rec) - -def split_pdf(in_path, pages, page_ranges, out_name, metadata=None): - pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb')) - total_pages = pdf.numPages - 1 - - for index in pages+page_ranges: - if index in pages: - write_pdf(pdf, out_name, '%s' % (index + 1), index, total_pages, metadata) - else: - - write_pdf(pdf, out_name, '%s-%s' % (index[0] + 1, index[1] + 1), index[0], index[1], metadata) - -def write_pdf(pdf, name, suffix, start, end, metadata=None): - if metadata == None: - title = _('Unknown') - author = _('Unknown') - else: - title = metadata.title - author = authors_to_string(metadata.authors) - - out_pdf = PdfFileWriter(title=title, author=author) - for page_num in range(start, end + 1): - out_pdf.addPage(pdf.getPage(page_num)) - with open('%s%s.pdf' % (name, suffix), 'wb') as out_file: - out_pdf.write(out_file) - -def split_args(args): - pdf = '' - pages = [] - page_ranges = [] - bad = [] - - for arg in args: - arg = arg.strip() - # Find the pdf input - if re.search('(?iu)^.*?\.pdf[ ]*$', arg) != None: - if pdf == '': - pdf = arg - else: - bad.append(arg) - # Find single indexes - elif re.search('^[ ]*\d+[ ]*$', arg) != None: - pages.append(arg) - # Find index ranges - elif re.search('^[ ]*\d+[ ]*-[ ]*\d+[ ]*$', arg) != None: - mo = re.search('^[ ]*(?P\d+)[ ]*-[ ]*(?P\d+)[ ]*$', arg) - start = mo.group('start') - end = mo.group('end') - - # check to see if the range is really a single index - if start == end: - pages.append(start) - else: - page_ranges.append([start, end]) - else: - bad.append(arg) - - bad = sorted(list(set(bad))) - - return pdf, pages, page_ranges, bad - -# Remove duplicates from pages and page_ranges. -# Set pages higher than the total number of pages in the pdf to the last page. -# Return pages and page_ranges as lists of ints. -def clean_page_list(pdf_path, pages, page_ranges): - pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) - - total_pages = pdf.numPages - sorted_pages = [] - sorted_ranges = [] - - for index in pages: - index = int(index) - if index > total_pages: - sorted_pages.append(total_pages - 1) - else: - sorted_pages.append(index - 1) - - for start, end in page_ranges: - start = int(start) - end = int(end) - - if start > total_pages and end > total_pages: - sorted_pages.append(total_pages - 1) - continue - - if start > total_pages: - start = total_pages - if end > total_pages: - end = total_pages - page_range = sorted([start - 1, end - 1]) - if page_range not in sorted_ranges: - sorted_ranges.append(page_range) - - # Remove duplicates and sort - pages = sorted(list(set(sorted_pages))) - page_ranges = sorted(sorted_ranges) - - return pages, page_ranges - -def main(args=sys.argv, name=''): - log = Log() - parser = option_parser(name) - add_options(parser) - - opts, args = parser.parse_args(args) - - pdf, pages, page_ranges, unknown = split_args(args[1:]) - - if pdf == '' and (pages == [] or page_ranges == []): - print 'Error: PDF and where to split is required.\n' - print_help(parser, log) - return 1 - - if unknown != []: - for arg in unknown: - print 'Error: Unknown argument `%s`' % arg - print_help(parser, log) - return 1 - - if not is_valid_pdf(pdf): - print 'Error: Could not read file `%s`.' % pdf - return 1 - - if is_encrypted(pdf): - print 'Error: file `%s` is encrypted.' % args[0] - return 1 - - pages, page_ranges = clean_page_list(pdf, pages, page_ranges) - - mi = metadata_from_formats([pdf]) - - split_pdf(pdf, pages, page_ranges, os.path.splitext(opts.output)[0], mi) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/verify.py b/src/calibre/ebooks/pdf/verify.py deleted file mode 100644 index dea2e15ce1..0000000000 --- a/src/calibre/ebooks/pdf/verify.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import with_statement -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Verify PDF files. -''' - -import os - -from pyPdf import PdfFileReader - -def is_valid_pdf(pdf_path): - ''' - Returns True if the pdf file is valid. - ''' - - try: - with open(os.path.abspath(pdf_path), 'rb') as pdf_file: - PdfFileReader(pdf_file) - except: - return False - return True - -def is_valid_pdfs(pdf_paths): - ''' - Returns a list of invalid pdf files. - ''' - - invalid = [] - for pdf_path in pdf_paths: - if not is_valid_pdf(pdf_path): - invalid.append(pdf_path) - return invalid - -def is_encrypted(pdf_path): - with open(os.path.abspath(pdf_path), 'rb') as pdf_file: - pdf = PdfFileReader(pdf_file) - if pdf.isEncrypted: - return True - return False diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 1a66324d9f..a50cedb001 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -29,7 +29,6 @@ entry_points = { 'calibre-parallel = calibre.utils.ipc.worker:main', 'calibre-customize = calibre.customize.ui:main', 'calibre-complete = calibre.utils.complete:main', - 'pdfmanipulate = calibre.ebooks.pdf.manipulate.cli:main', 'fetch-ebook-metadata = calibre.ebooks.metadata.sources.cli:main', 'epub-fix = calibre.ebooks.epub.fix.main:main', 'calibre-smtp = calibre.utils.smtp:main', diff --git a/src/pyPdf/__init__.py b/src/pyPdf/__init__.py deleted file mode 100644 index af02553da6..0000000000 --- a/src/pyPdf/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from pdf import PdfFileReader, PdfFileWriter -__all__ = ["pdf"] diff --git a/src/pyPdf/filters.py b/src/pyPdf/filters.py deleted file mode 100644 index 7fe10fb481..0000000000 --- a/src/pyPdf/filters.py +++ /dev/null @@ -1,252 +0,0 @@ -# vim: sw=4:expandtab:foldmethod=marker -# -# Copyright (c) 2006, Mathieu Fenniak -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -""" -Implementation of stream filters for PDF. -""" -__author__ = "Mathieu Fenniak" -__author_email__ = "biziqe@mathieu.fenniak.net" - -from utils import PdfReadError -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO - -try: - import zlib - def decompress(data): - return zlib.decompress(data) - def compress(data): - return zlib.compress(data) -except ImportError: - # Unable to import zlib. Attempt to use the System.IO.Compression - # library from the .NET framework. (IronPython only) - import System - from System import IO, Collections, Array - def _string_to_bytearr(buf): - retval = Array.CreateInstance(System.Byte, len(buf)) - for i in range(len(buf)): - retval[i] = ord(buf[i]) - return retval - def _bytearr_to_string(bytes): - retval = "" - for i in range(bytes.Length): - retval += chr(bytes[i]) - return retval - def _read_bytes(stream): - ms = IO.MemoryStream() - buf = Array.CreateInstance(System.Byte, 2048) - while True: - bytes = stream.Read(buf, 0, buf.Length) - if bytes == 0: - break - else: - ms.Write(buf, 0, bytes) - retval = ms.ToArray() - ms.Close() - return retval - def decompress(data): - bytes = _string_to_bytearr(data) - ms = IO.MemoryStream() - ms.Write(bytes, 0, bytes.Length) - ms.Position = 0 # fseek 0 - gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress) - bytes = _read_bytes(gz) - retval = _bytearr_to_string(bytes) - gz.Close() - return retval - def compress(data): - bytes = _string_to_bytearr(data) - ms = IO.MemoryStream() - gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True) - gz.Write(bytes, 0, bytes.Length) - gz.Close() - ms.Position = 0 # fseek 0 - bytes = ms.ToArray() - retval = _bytearr_to_string(bytes) - ms.Close() - return retval - - -class FlateDecode(object): - def decode(data, decodeParms): - data = decompress(data) - predictor = 1 - if decodeParms: - predictor = decodeParms.get("/Predictor", 1) - # predictor 1 == no predictor - if predictor != 1: - columns = decodeParms["/Columns"] - # PNG prediction: - if predictor >= 10 and predictor <= 15: - output = StringIO() - # PNG prediction can vary from row to row - rowlength = columns + 1 - assert len(data) % rowlength == 0 - prev_rowdata = (0,) * rowlength - for row in xrange(len(data) / rowlength): - rowdata = [ord(x) for x in data[(row*rowlength):((row+1)*rowlength)]] - filterByte = rowdata[0] - if filterByte == 0: - pass - elif filterByte == 1: - for i in range(2, rowlength): - rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256 - elif filterByte == 2: - for i in range(1, rowlength): - rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 - else: - # unsupported PNG filter - raise PdfReadError("Unsupported PNG filter %r" % filterByte) - prev_rowdata = rowdata - output.write(''.join([chr(x) for x in rowdata[1:]])) - data = output.getvalue() - else: - # unsupported predictor - raise PdfReadError("Unsupported flatedecode predictor %r" % predictor) - return data - decode = staticmethod(decode) - - def encode(data): - return compress(data) - encode = staticmethod(encode) - -class ASCIIHexDecode(object): - def decode(data, decodeParms=None): - retval = "" - char = "" - x = 0 - while True: - c = data[x] - if c == ">": - break - elif c.isspace(): - x += 1 - continue - char += c - if len(char) == 2: - retval += chr(int(char, base=16)) - char = "" - x += 1 - assert char == "" - return retval - decode = staticmethod(decode) - -class ASCII85Decode(object): - def decode(data, decodeParms=None): - retval = "" - group = [] - x = 0 - hitEod = False - # remove all whitespace from data - data = [y for y in data if not (y in ' \n\r\t')] - while not hitEod: - c = data[x] - if len(retval) == 0 and c == "<" and data[x+1] == "~": - x += 2 - continue - #elif c.isspace(): - # x += 1 - # continue - elif c == 'z': - assert len(group) == 0 - retval += '\x00\x00\x00\x00' - continue - elif c == "~" and data[x+1] == ">": - if len(group) != 0: - # cannot have a final group of just 1 char - assert len(group) > 1 - cnt = len(group) - 1 - group += [ 85, 85, 85 ] - hitEod = cnt - else: - break - else: - c = ord(c) - 33 - assert c >= 0 and c < 85 - group += [ c ] - if len(group) >= 5: - b = group[0] * (85**4) + \ - group[1] * (85**3) + \ - group[2] * (85**2) + \ - group[3] * 85 + \ - group[4] - assert b < (2**32 - 1) - c4 = chr((b >> 0) % 256) - c3 = chr((b >> 8) % 256) - c2 = chr((b >> 16) % 256) - c1 = chr(b >> 24) - retval += (c1 + c2 + c3 + c4) - if hitEod: - retval = retval[:-4+hitEod] - group = [] - x += 1 - return retval - decode = staticmethod(decode) - -def decodeStreamData(stream): - from generic import NameObject - filters = stream.get("/Filter", ()) - if len(filters) and not isinstance(filters[0], NameObject): - # we have a single filter instance - filters = (filters,) - data = stream._data - for filterType in filters: - if filterType == "/FlateDecode": - data = FlateDecode.decode(data, stream.get("/DecodeParms")) - elif filterType == "/ASCIIHexDecode": - data = ASCIIHexDecode.decode(data) - elif filterType == "/ASCII85Decode": - data = ASCII85Decode.decode(data) - elif filterType == "/Crypt": - decodeParams = stream.get("/DecodeParams", {}) - if "/Name" not in decodeParams and "/Type" not in decodeParams: - pass - else: - raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet") - else: - # unsupported filter - raise NotImplementedError("unsupported filter %s" % filterType) - return data - -if __name__ == "__main__": - assert "abc" == ASCIIHexDecode.decode('61\n626\n3>') - - ascii85Test = """ - <~9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKFCj@.4Gp$d7F!,L7@<6@)/0JDEF@3BB/F*&OCAfu2/AKY - i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF-FD5W8ARlolDIa - l(DIduD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~> - """ - ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure." - assert ASCII85Decode.decode(ascii85Test) == ascii85_originalText - diff --git a/src/pyPdf/generic.py b/src/pyPdf/generic.py deleted file mode 100644 index 5447ef5fbc..0000000000 --- a/src/pyPdf/generic.py +++ /dev/null @@ -1,780 +0,0 @@ -# vim: sw=4:expandtab:foldmethod=marker -# -# Copyright (c) 2006, Mathieu Fenniak -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -""" -Implementation of generic PDF objects (dictionary, number, string, and so on) -""" -__author__ = "Mathieu Fenniak" -__author_email__ = "biziqe@mathieu.fenniak.net" - -import re -from utils import readNonWhitespace, RC4_encrypt -import filters -import utils -import decimal -import codecs - -def readObject(stream, pdf): - tok = stream.read(1) - stream.seek(-1, 1) # reset to start - if tok == 't' or tok == 'f': - # boolean object - return BooleanObject.readFromStream(stream) - elif tok == '(': - # string object - return readStringFromStream(stream) - elif tok == '/': - # name object - return NameObject.readFromStream(stream) - elif tok == '[': - # array object - return ArrayObject.readFromStream(stream, pdf) - elif tok == 'n': - # null object - return NullObject.readFromStream(stream) - elif tok == '<': - # hexadecimal string OR dictionary - peek = stream.read(2) - stream.seek(-2, 1) # reset to start - if peek == '<<': - return DictionaryObject.readFromStream(stream, pdf) - else: - return readHexStringFromStream(stream) - elif tok == '%': - # comment - while tok not in ('\r', '\n'): - tok = stream.read(1) - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - return readObject(stream, pdf) - else: - # number object OR indirect reference - if tok == '+' or tok == '-': - # number - return NumberObject.readFromStream(stream) - peek = stream.read(20) - stream.seek(-len(peek), 1) # reset to start - if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) != None: - return IndirectObject.readFromStream(stream, pdf) - else: - return NumberObject.readFromStream(stream) - -class PdfObject(object): - def getObject(self): - """Resolves indirect references.""" - return self - - -class NullObject(PdfObject): - def writeToStream(self, stream, encryption_key): - stream.write("null") - - def readFromStream(stream): - nulltxt = stream.read(4) - if nulltxt != "null": - raise utils.PdfReadError, "error reading null object" - return NullObject() - readFromStream = staticmethod(readFromStream) - - -class BooleanObject(PdfObject): - def __init__(self, value): - self.value = value - - def writeToStream(self, stream, encryption_key): - if self.value: - stream.write("true") - else: - stream.write("false") - - def readFromStream(stream): - word = stream.read(4) - if word == "true": - return BooleanObject(True) - elif word == "fals": - stream.read(1) - return BooleanObject(False) - assert False - readFromStream = staticmethod(readFromStream) - - -class ArrayObject(list, PdfObject): - def writeToStream(self, stream, encryption_key): - stream.write("[") - for data in self: - stream.write(" ") - data.writeToStream(stream, encryption_key) - stream.write(" ]") - - def readFromStream(stream, pdf): - arr = ArrayObject() - tmp = stream.read(1) - if tmp != "[": - raise utils.PdfReadError, "error reading array" - while True: - # skip leading whitespace - tok = stream.read(1) - while tok.isspace(): - tok = stream.read(1) - stream.seek(-1, 1) - # check for array ending - peekahead = stream.read(1) - if peekahead == "]": - break - stream.seek(-1, 1) - # read and append obj - arr.append(readObject(stream, pdf)) - return arr - readFromStream = staticmethod(readFromStream) - - -class IndirectObject(PdfObject): - def __init__(self, idnum, generation, pdf): - self.idnum = idnum - self.generation = generation - self.pdf = pdf - - def getObject(self): - return self.pdf.getObject(self).getObject() - - def __repr__(self): - return "IndirectObject(%r, %r)" % (self.idnum, self.generation) - - def __eq__(self, other): - return ( - other != None and - isinstance(other, IndirectObject) and - self.idnum == other.idnum and - self.generation == other.generation and - self.pdf is other.pdf - ) - - def __ne__(self, other): - return not self.__eq__(other) - - def writeToStream(self, stream, encryption_key): - stream.write("%s %s R" % (self.idnum, self.generation)) - - def readFromStream(stream, pdf): - idnum = "" - while True: - tok = stream.read(1) - if tok.isspace(): - break - idnum += tok - generation = "" - while True: - tok = stream.read(1) - if tok.isspace(): - break - generation += tok - r = stream.read(1) - if r != "R": - raise utils.PdfReadError("error reading indirect object reference") - return IndirectObject(int(idnum), int(generation), pdf) - readFromStream = staticmethod(readFromStream) - - -class FloatObject(decimal.Decimal, PdfObject): - def __new__(cls, value="0", context=None): - return decimal.Decimal.__new__(cls, str(value), context) - def __repr__(self): - return str(self) - def writeToStream(self, stream, encryption_key): - stream.write(str(self)) - - -class NumberObject(int, PdfObject): - def __init__(self, value): - int.__init__(self, value) - - def writeToStream(self, stream, encryption_key): - stream.write(repr(self)) - - def readFromStream(stream): - name = "" - while True: - tok = stream.read(1) - if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit(): - stream.seek(-1, 1) - break - name += tok - if name.find(".") != -1: - return FloatObject(name) - else: - return NumberObject(name) - readFromStream = staticmethod(readFromStream) - - -## -# Given a string (either a "str" or "unicode"), create a ByteStringObject or a -# TextStringObject to represent the string. -def createStringObject(string): - if isinstance(string, unicode): - return TextStringObject(string) - elif isinstance(string, str): - if string.startswith(codecs.BOM_UTF16_BE): - retval = TextStringObject(string.decode("utf-16")) - retval.autodetect_utf16 = True - return retval - else: - # This is probably a big performance hit here, but we need to - # convert string objects into the text/unicode-aware version if - # possible... and the only way to check if that's possible is - # to try. Some strings are strings, some are just byte arrays. - try: - retval = TextStringObject(decode_pdfdocencoding(string)) - retval.autodetect_pdfdocencoding = True - return retval - except UnicodeDecodeError: - return ByteStringObject(string) - else: - raise TypeError("createStringObject should have str or unicode arg") - - -def readHexStringFromStream(stream): - stream.read(1) - txt = "" - x = "" - while True: - tok = readNonWhitespace(stream) - if tok == ">": - break - x += tok - if len(x) == 2: - txt += chr(int(x, base=16)) - x = "" - if len(x) == 1: - x += "0" - if len(x) == 2: - txt += chr(int(x, base=16)) - return createStringObject(txt) - - -def readStringFromStream(stream): - tok = stream.read(1) - parens = 1 - txt = "" - while True: - tok = stream.read(1) - if tok == "(": - parens += 1 - elif tok == ")": - parens -= 1 - if parens == 0: - break - elif tok == "\\": - tok = stream.read(1) - if tok == "n": - tok = "\n" - elif tok == "r": - tok = "\r" - elif tok == "t": - tok = "\t" - elif tok == "b": - tok = "\b" - elif tok == "f": - tok = "\f" - elif tok == "(": - tok = "(" - elif tok == ")": - tok = ")" - elif tok == "\\": - tok = "\\" - elif tok.isdigit(): - tok += stream.read(2) - tok = chr(int(tok, base=8)) - elif tok in "\n\r": - # This case is hit when a backslash followed by a line - # break occurs. If it's a multi-char EOL, consume the - # second character: - tok = stream.read(1) - if not tok in "\n\r": - stream.seek(-1, 1) - # Then don't add anything to the actual string, since this - # line break was escaped: - tok = '' - else: - raise utils.PdfReadError("Unexpected escaped string") - txt += tok - return createStringObject(txt) - - -## -# Represents a string object where the text encoding could not be determined. -# This occurs quite often, as the PDF spec doesn't provide an alternate way to -# represent strings -- for example, the encryption data stored in files (like -# /O) is clearly not text, but is still stored in a "String" object. -class ByteStringObject(str, PdfObject): - - ## - # For compatibility with TextStringObject.original_bytes. This method - # returns self. - original_bytes = property(lambda self: self) - - def writeToStream(self, stream, encryption_key): - bytearr = self - if encryption_key: - bytearr = RC4_encrypt(encryption_key, bytearr) - stream.write("<") - stream.write(bytearr.encode("hex")) - stream.write(">") - - -## -# Represents a string object that has been decoded into a real unicode string. -# If read from a PDF document, this string appeared to match the -# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to -# occur. -class TextStringObject(unicode, PdfObject): - autodetect_pdfdocencoding = False - autodetect_utf16 = False - - ## - # It is occasionally possible that a text string object gets created where - # a byte string object was expected due to the autodetection mechanism -- - # if that occurs, this "original_bytes" property can be used to - # back-calculate what the original encoded bytes were. - original_bytes = property(lambda self: self.get_original_bytes()) - - def get_original_bytes(self): - # We're a text string object, but the library is trying to get our raw - # bytes. This can happen if we auto-detected this string as text, but - # we were wrong. It's pretty common. Return the original bytes that - # would have been used to create this object, based upon the autodetect - # method. - if self.autodetect_utf16: - return codecs.BOM_UTF16_BE + self.encode("utf-16be") - elif self.autodetect_pdfdocencoding: - return encode_pdfdocencoding(self) - else: - raise Exception("no information about original bytes") - - def writeToStream(self, stream, encryption_key): - # Try to write the string out as a PDFDocEncoding encoded string. It's - # nicer to look at in the PDF file. Sadly, we take a performance hit - # here for trying... - try: - bytearr = encode_pdfdocencoding(self) - except UnicodeEncodeError: - bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") - if encryption_key: - bytearr = RC4_encrypt(encryption_key, bytearr) - obj = ByteStringObject(bytearr) - obj.writeToStream(stream, None) - else: - stream.write("(") - for c in bytearr: - if not c.isalnum() and c != ' ': - stream.write("\\%03o" % ord(c)) - else: - stream.write(c) - stream.write(")") - - -class NameObject(str, PdfObject): - delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%" - - def __init__(self, data): - str.__init__(self, data) - - def writeToStream(self, stream, encryption_key): - stream.write(self) - - def readFromStream(stream): - name = stream.read(1) - if name != "/": - raise utils.PdfReadError, "name read error" - while True: - tok = stream.read(1) - if tok.isspace() or tok in NameObject.delimiterCharacters: - stream.seek(-1, 1) - break - name += tok - return NameObject(name) - readFromStream = staticmethod(readFromStream) - - -class DictionaryObject(dict, PdfObject): - - def __init__(self, *args, **kwargs): - if len(args) == 0: - self.update(kwargs) - elif len(args) == 1: - arr = args[0] - # If we're passed a list/tuple, make a dict out of it - if not hasattr(arr, "iteritems"): - newarr = {} - for k, v in arr: - newarr[k] = v - arr = newarr - self.update(arr) - else: - raise TypeError("dict expected at most 1 argument, got 3") - - def update(self, arr): - # note, a ValueError halfway through copying values - # will leave half the values in this dict. - for k, v in arr.iteritems(): - self.__setitem__(k, v) - - def raw_get(self, key): - return dict.__getitem__(self, key) - - def __setitem__(self, key, value): - if not isinstance(key, PdfObject): - raise ValueError("key must be PdfObject") - if not isinstance(value, PdfObject): - raise ValueError("value must be PdfObject") - return dict.__setitem__(self, key, value) - - def setdefault(self, key, value=None): - if not isinstance(key, PdfObject): - raise ValueError("key must be PdfObject") - if not isinstance(value, PdfObject): - raise ValueError("value must be PdfObject") - return dict.setdefault(self, key, value) - - def __getitem__(self, key): - return dict.__getitem__(self, key).getObject() - - ## - # Retrieves XMP (Extensible Metadata Platform) data relevant to the - # this object, if available. - #

- # Stability: Added in v1.12, will exist for all future v1.x releases. - # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance - # that can be used to access XMP metadata from the document. Can also - # return None if no metadata was found on the document root. - def getXmpMetadata(self): - metadata = self.get("/Metadata", None) - if metadata == None: - return None - metadata = metadata.getObject() - import xmp - if not isinstance(metadata, xmp.XmpInformation): - metadata = xmp.XmpInformation(metadata) - self[NameObject("/Metadata")] = metadata - return metadata - - ## - # Read-only property that accesses the {@link - # #DictionaryObject.getXmpData getXmpData} function. - #

- # Stability: Added in v1.12, will exist for all future v1.x releases. - xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) - - def writeToStream(self, stream, encryption_key): - stream.write("<<\n") - for key, value in self.items(): - key.writeToStream(stream, encryption_key) - stream.write(" ") - value.writeToStream(stream, encryption_key) - stream.write("\n") - stream.write(">>") - - def readFromStream(stream, pdf): - tmp = stream.read(2) - if tmp != "<<": - raise utils.PdfReadError, "dictionary read error" - data = {} - while True: - tok = readNonWhitespace(stream) - if tok == ">": - stream.read(1) - break - stream.seek(-1, 1) - key = readObject(stream, pdf) - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - value = readObject(stream, pdf) - if data.has_key(key): - # multiple definitions of key not permitted - raise utils.PdfReadError, "multiple definitions in dictionary" - data[key] = value - pos = stream.tell() - s = readNonWhitespace(stream) - if s == 's' and stream.read(5) == 'tream': - eol = stream.read(1) - # odd PDF file output has spaces after 'stream' keyword but before EOL. - # patch provided by Danial Sandler - while eol == ' ': - eol = stream.read(1) - assert eol in ("\n", "\r") - if eol == "\r": - # read \n after - stream.read(1) - # this is a stream object, not a dictionary - assert data.has_key("/Length") - length = data["/Length"] - if isinstance(length, IndirectObject): - t = stream.tell() - length = pdf.getObject(length) - stream.seek(t, 0) - data["__streamdata__"] = stream.read(length) - e = readNonWhitespace(stream) - ndstream = stream.read(8) - if (e + ndstream) != "endstream": - # (sigh) - the odd PDF file has a length that is too long, so - # we need to read backwards to find the "endstream" ending. - # ReportLab (unknown version) generates files with this bug, - # and Python users into PDF files tend to be our audience. - # we need to do this to correct the streamdata and chop off - # an extra character. - pos = stream.tell() - stream.seek(-10, 1) - end = stream.read(9) - if end == "endstream": - # we found it by looking back one character further. - data["__streamdata__"] = data["__streamdata__"][:-1] - else: - stream.seek(pos, 0) - raise utils.PdfReadError, "Unable to find 'endstream' marker after stream." - else: - stream.seek(pos, 0) - if data.has_key("__streamdata__"): - return StreamObject.initializeFromDictionary(data) - else: - retval = DictionaryObject() - retval.update(data) - return retval - readFromStream = staticmethod(readFromStream) - - -class StreamObject(DictionaryObject): - def __init__(self): - self._data = None - self.decodedSelf = None - - def writeToStream(self, stream, encryption_key): - self[NameObject("/Length")] = NumberObject(len(self._data)) - DictionaryObject.writeToStream(self, stream, encryption_key) - del self["/Length"] - stream.write("\nstream\n") - data = self._data - if encryption_key: - data = RC4_encrypt(encryption_key, data) - stream.write(data) - stream.write("\nendstream") - - def initializeFromDictionary(data): - if data.has_key("/Filter"): - retval = EncodedStreamObject() - else: - retval = DecodedStreamObject() - retval._data = data["__streamdata__"] - del data["__streamdata__"] - del data["/Length"] - retval.update(data) - return retval - initializeFromDictionary = staticmethod(initializeFromDictionary) - - def flateEncode(self): - if self.has_key("/Filter"): - f = self["/Filter"] - if isinstance(f, ArrayObject): - f.insert(0, NameObject("/FlateDecode")) - else: - newf = ArrayObject() - newf.append(NameObject("/FlateDecode")) - newf.append(f) - f = newf - else: - f = NameObject("/FlateDecode") - retval = EncodedStreamObject() - retval[NameObject("/Filter")] = f - retval._data = filters.FlateDecode.encode(self._data) - return retval - - -class DecodedStreamObject(StreamObject): - def getData(self): - return self._data - - def setData(self, data): - self._data = data - - -class EncodedStreamObject(StreamObject): - def __init__(self): - self.decodedSelf = None - - def getData(self): - if self.decodedSelf: - # cached version of decoded object - return self.decodedSelf.getData() - else: - # create decoded object - decoded = DecodedStreamObject() - decoded._data = filters.decodeStreamData(self) - for key, value in self.items(): - if not key in ("/Length", "/Filter", "/DecodeParms"): - decoded[key] = value - self.decodedSelf = decoded - return decoded._data - - def setData(self, data): - raise utils.PdfReadError, "Creating EncodedStreamObject is not currently supported" - - -class RectangleObject(ArrayObject): - def __init__(self, arr): - # must have four points - assert len(arr) == 4 - # automatically convert arr[x] into NumberObject(arr[x]) if necessary - ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) - - def ensureIsNumber(self, value): - if not isinstance(value, (NumberObject, FloatObject)): - value = FloatObject(value) - return value - - def __repr__(self): - return "RectangleObject(%s)" % repr(list(self)) - - def getLowerLeft_x(self): - return self[0] - - def getLowerLeft_y(self): - return self[1] - - def getUpperRight_x(self): - return self[2] - - def getUpperRight_y(self): - return self[3] - - def getUpperLeft_x(self): - return self.getLowerLeft_x() - - def getUpperLeft_y(self): - return self.getUpperRight_y() - - def getLowerRight_x(self): - return self.getUpperRight_x() - - def getLowerRight_y(self): - return self.getLowerLeft_y() - - def getLowerLeft(self): - return self.getLowerLeft_x(), self.getLowerLeft_y() - - def getLowerRight(self): - return self.getLowerRight_x(), self.getLowerRight_y() - - def getUpperLeft(self): - return self.getUpperLeft_x(), self.getUpperLeft_y() - - def getUpperRight(self): - return self.getUpperRight_x(), self.getUpperRight_y() - - def setLowerLeft(self, value): - self[0], self[1] = [self.ensureIsNumber(x) for x in value] - - def setLowerRight(self, value): - self[2], self[1] = [self.ensureIsNumber(x) for x in value] - - def setUpperLeft(self, value): - self[0], self[3] = [self.ensureIsNumber(x) for x in value] - - def setUpperRight(self, value): - self[2], self[3] = [self.ensureIsNumber(x) for x in value] - - lowerLeft = property(getLowerLeft, setLowerLeft, None, None) - lowerRight = property(getLowerRight, setLowerRight, None, None) - upperLeft = property(getUpperLeft, setUpperLeft, None, None) - upperRight = property(getUpperRight, setUpperRight, None, None) - - -def encode_pdfdocencoding(unicode_string): - retval = '' - for c in unicode_string: - try: - retval += chr(_pdfDocEncoding_rev[c]) - except KeyError: - raise UnicodeEncodeError("pdfdocencoding", c, -1, -1, - "does not exist in translation table") - return retval - -def decode_pdfdocencoding(byte_array): - retval = u'' - for b in byte_array: - c = _pdfDocEncoding[ord(b)] - if c == u'\u0000': - raise UnicodeDecodeError("pdfdocencoding", b, -1, -1, - "does not exist in translation table") - retval += c - return retval - -_pdfDocEncoding = ( - u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', - u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', - u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', - u'\u02d8', u'\u02c7', u'\u02c6', u'\u02d9', u'\u02dd', u'\u02db', u'\u02da', u'\u02dc', - u'\u0020', u'\u0021', u'\u0022', u'\u0023', u'\u0024', u'\u0025', u'\u0026', u'\u0027', - u'\u0028', u'\u0029', u'\u002a', u'\u002b', u'\u002c', u'\u002d', u'\u002e', u'\u002f', - u'\u0030', u'\u0031', u'\u0032', u'\u0033', u'\u0034', u'\u0035', u'\u0036', u'\u0037', - u'\u0038', u'\u0039', u'\u003a', u'\u003b', u'\u003c', u'\u003d', u'\u003e', u'\u003f', - u'\u0040', u'\u0041', u'\u0042', u'\u0043', u'\u0044', u'\u0045', u'\u0046', u'\u0047', - u'\u0048', u'\u0049', u'\u004a', u'\u004b', u'\u004c', u'\u004d', u'\u004e', u'\u004f', - u'\u0050', u'\u0051', u'\u0052', u'\u0053', u'\u0054', u'\u0055', u'\u0056', u'\u0057', - u'\u0058', u'\u0059', u'\u005a', u'\u005b', u'\u005c', u'\u005d', u'\u005e', u'\u005f', - u'\u0060', u'\u0061', u'\u0062', u'\u0063', u'\u0064', u'\u0065', u'\u0066', u'\u0067', - u'\u0068', u'\u0069', u'\u006a', u'\u006b', u'\u006c', u'\u006d', u'\u006e', u'\u006f', - u'\u0070', u'\u0071', u'\u0072', u'\u0073', u'\u0074', u'\u0075', u'\u0076', u'\u0077', - u'\u0078', u'\u0079', u'\u007a', u'\u007b', u'\u007c', u'\u007d', u'\u007e', u'\u0000', - u'\u2022', u'\u2020', u'\u2021', u'\u2026', u'\u2014', u'\u2013', u'\u0192', u'\u2044', - u'\u2039', u'\u203a', u'\u2212', u'\u2030', u'\u201e', u'\u201c', u'\u201d', u'\u2018', - u'\u2019', u'\u201a', u'\u2122', u'\ufb01', u'\ufb02', u'\u0141', u'\u0152', u'\u0160', - u'\u0178', u'\u017d', u'\u0131', u'\u0142', u'\u0153', u'\u0161', u'\u017e', u'\u0000', - u'\u20ac', u'\u00a1', u'\u00a2', u'\u00a3', u'\u00a4', u'\u00a5', u'\u00a6', u'\u00a7', - u'\u00a8', u'\u00a9', u'\u00aa', u'\u00ab', u'\u00ac', u'\u0000', u'\u00ae', u'\u00af', - u'\u00b0', u'\u00b1', u'\u00b2', u'\u00b3', u'\u00b4', u'\u00b5', u'\u00b6', u'\u00b7', - u'\u00b8', u'\u00b9', u'\u00ba', u'\u00bb', u'\u00bc', u'\u00bd', u'\u00be', u'\u00bf', - u'\u00c0', u'\u00c1', u'\u00c2', u'\u00c3', u'\u00c4', u'\u00c5', u'\u00c6', u'\u00c7', - u'\u00c8', u'\u00c9', u'\u00ca', u'\u00cb', u'\u00cc', u'\u00cd', u'\u00ce', u'\u00cf', - u'\u00d0', u'\u00d1', u'\u00d2', u'\u00d3', u'\u00d4', u'\u00d5', u'\u00d6', u'\u00d7', - u'\u00d8', u'\u00d9', u'\u00da', u'\u00db', u'\u00dc', u'\u00dd', u'\u00de', u'\u00df', - u'\u00e0', u'\u00e1', u'\u00e2', u'\u00e3', u'\u00e4', u'\u00e5', u'\u00e6', u'\u00e7', - u'\u00e8', u'\u00e9', u'\u00ea', u'\u00eb', u'\u00ec', u'\u00ed', u'\u00ee', u'\u00ef', - u'\u00f0', u'\u00f1', u'\u00f2', u'\u00f3', u'\u00f4', u'\u00f5', u'\u00f6', u'\u00f7', - u'\u00f8', u'\u00f9', u'\u00fa', u'\u00fb', u'\u00fc', u'\u00fd', u'\u00fe', u'\u00ff' -) - -assert len(_pdfDocEncoding) == 256 - -_pdfDocEncoding_rev = {} -for i in xrange(256): - char = _pdfDocEncoding[i] - if char == u"\u0000": - continue - assert char not in _pdfDocEncoding_rev - _pdfDocEncoding_rev[char] = i - diff --git a/src/pyPdf/pdf.py b/src/pyPdf/pdf.py deleted file mode 100644 index 1a2b8709b2..0000000000 --- a/src/pyPdf/pdf.py +++ /dev/null @@ -1,1530 +0,0 @@ -# vim: sw=4:expandtab:foldmethod=marker -# -# Copyright (c) 2006, Mathieu Fenniak -# Copyright (c) 2007, Ashish Kulkarni -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -""" -A pure-Python PDF library with very minimal capabilities. It was designed to -be able to split and merge PDF files by page, and that's about all it can do. -It may be a solid base for future PDF file work in Python. -""" -__author__ = "Mathieu Fenniak" -__author_email__ = "biziqe@mathieu.fenniak.net" - -import struct -from cStringIO import StringIO - -from generic import DictionaryObject, NameObject, NumberObject, \ -createStringObject, ArrayObject, ByteStringObject, StreamObject, \ -IndirectObject, utils, readObject, TextStringObject, BooleanObject, \ -RectangleObject, DecodedStreamObject -from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList - - -## -# This class supports writing PDF files out, given pages produced by another -# class (typically {@link #PdfFileReader PdfFileReader}). -class PdfFileWriter(object): - def __init__(self,title=u"Unknown",author=u"Unknown"): - self.killed = False - self._header = "%PDF-1.3" - self._objects = [] # array of indirect objects - - # The root of our page tree node. - pages = DictionaryObject() - pages.update({ - NameObject("/Type"): NameObject("/Pages"), - NameObject("/Count"): NumberObject(0), - NameObject("/Kids"): ArrayObject(), - }) - self._pages = self._addObject(pages) - - # info object - info = DictionaryObject() - info.update({ - NameObject("/Producer"): createStringObject(u"Python PDF Library - http://pybrary.net/pyPdf/"), - NameObject("/Author"): createStringObject(author), - NameObject("/Title"): createStringObject(title), - }) - self._info = self._addObject(info) - - # root object - root = DictionaryObject() - root.update({ - NameObject("/Type"): NameObject("/Catalog"), - NameObject("/Pages"): self._pages, - }) - self._root = self._addObject(root) - - def _addObject(self, obj): - self._objects.append(obj) - return IndirectObject(len(self._objects), 0, self) - - def getObject(self, ido): - if ido.pdf != self: - raise ValueError("pdf must be self") - return self._objects[ido.idnum - 1] - - ## - # Adds a page to this PDF file. The page is usually acquired from a - # {@link #PdfFileReader PdfFileReader} instance. - #

- # Stability: Added in v1.0, will exist for all v1.x releases. - # - # @param page The page to add to the document. This argument should be - # an instance of {@link #PageObject PageObject}. - def addPage(self, page): - assert page["/Type"] == "/Page" - page[NameObject("/Parent")] = self._pages - page = self._addObject(page) - pages = self.getObject(self._pages) - pages["/Kids"].append(page) - pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) - - ## - # Encrypt this PDF file with the PDF Standard encryption handler. - # @param user_pwd The "user password", which allows for opening and reading - # the PDF file with the restrictions provided. - # @param owner_pwd The "owner password", which allows for opening the PDF - # files without any restrictions. By default, the owner password is the - # same as the user password. - # @param use_128bit Boolean argument as to whether to use 128bit - # encryption. When false, 40bit encryption will be used. By default, this - # flag is on. - def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): - import md5, time, random - if owner_pwd == None: - owner_pwd = user_pwd - if use_128bit: - V = 2 - rev = 3 - keylen = 128 / 8 - else: - V = 1 - rev = 2 - keylen = 40 / 8 - # permit everything: - P = -1 - O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) - ID_1 = md5.new(repr(time.time())).digest() - ID_2 = md5.new(repr(random.random())).digest() - self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2))) - if rev == 2: - U, key = _alg34(user_pwd, O, P, ID_1) - else: - assert rev == 3 - U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) - encrypt = DictionaryObject() - encrypt[NameObject("/Filter")] = NameObject("/Standard") - encrypt[NameObject("/V")] = NumberObject(V) - if V == 2: - encrypt[NameObject("/Length")] = NumberObject(keylen * 8) - encrypt[NameObject("/R")] = NumberObject(rev) - encrypt[NameObject("/O")] = ByteStringObject(O) - encrypt[NameObject("/U")] = ByteStringObject(U) - encrypt[NameObject("/P")] = NumberObject(P) - self._encrypt = self._addObject(encrypt) - self._encrypt_key = key - - ## - # Writes the collection of pages added to this object out as a PDF file. - #

- # Stability: Added in v1.0, will exist for all v1.x releases. - # @param stream An object to write the file to. The object must support - # the write method, and the tell method, similar to a file object. - def write(self, stream): - import md5 - - externalReferenceMap = {} - self.stack = [] - self._sweepIndirectReferences(externalReferenceMap, self._root) - del self.stack - - # Begin writing: - object_positions = [] - stream.write(self._header + "\n") - for i in range(len(self._objects)): - idnum = (i + 1) - obj = self._objects[i] - object_positions.append(stream.tell()) - stream.write(str(idnum) + " 0 obj\n") - key = None - if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: - pack1 = struct.pack(" -# Stability: Added in v1.0, will exist for all v1.x releases. -# -# @param stream An object that supports the standard read and seek methods -# similar to a file object. -class PdfFileReader(object): - def __init__(self, stream): - self.flattenedPages = None - self.resolvedObjects = {} - self.read(stream) - self.stream = stream - self._override_encryption = False - - ## - # Retrieves the PDF file's document information dictionary, if it exists. - # Note that some PDF files use metadata streams instead of docinfo - # dictionaries, and these metadata streams will not be accessed by this - # function. - #

- # Stability: Added in v1.6, will exist for all future v1.x releases. - # @return Returns a {@link #DocumentInformation DocumentInformation} - # instance, or None if none exists. - def getDocumentInfo(self): - if not self.trailer.has_key("/Info"): - return None - obj = self.trailer['/Info'] - retval = DocumentInformation() - retval.update(obj) - return retval - - ## - # Read-only property that accesses the {@link - # #PdfFileReader.getDocumentInfo getDocumentInfo} function. - #

- # Stability: Added in v1.7, will exist for all future v1.x releases. - documentInfo = property(lambda self: self.getDocumentInfo(), None, None) - - ## - # Retrieves XMP (Extensible Metadata Platform) data from the PDF document - # root. - #

- # Stability: Added in v1.12, will exist for all future v1.x releases. - # @return Returns a {@link #generic.XmpInformation XmlInformation} - # instance that can be used to access XMP metadata from the document. - # Can also return None if no metadata was found on the document root. - def getXmpMetadata(self): - try: - self._override_encryption = True - return self.trailer["/Root"].getXmpMetadata() - finally: - self._override_encryption = False - - ## - # Read-only property that accesses the {@link #PdfFileReader.getXmpData - # getXmpData} function. - #

- # Stability: Added in v1.12, will exist for all future v1.x releases. - xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) - - ## - # Calculates the number of pages in this PDF file. - #

- # Stability: Added in v1.0, will exist for all v1.x releases. - # @return Returns an integer. - def getNumPages(self): - if self.flattenedPages == None: - self._flatten() - return len(self.flattenedPages) - - ## - # Read-only property that accesses the {@link #PdfFileReader.getNumPages - # getNumPages} function. - #

- # Stability: Added in v1.7, will exist for all future v1.x releases. - numPages = property(lambda self: self.getNumPages(), None, None) - - ## - # Retrieves a page by number from this PDF file. - #

- # Stability: Added in v1.0, will exist for all v1.x releases. - # @return Returns a {@link #PageObject PageObject} instance. - def getPage(self, pageNumber): - ## ensure that we're not trying to access an encrypted PDF - #assert not self.trailer.has_key("/Encrypt") - if self.flattenedPages == None: - self._flatten() - return self.flattenedPages[pageNumber] - - ## - # Read-only property that accesses the - # {@link #PdfFileReader.getNamedDestinations - # getNamedDestinations} function. - #

- # Stability: Added in v1.10, will exist for all future v1.x releases. - namedDestinations = property(lambda self: - self.getNamedDestinations(), None, None) - - ## - # Retrieves the named destinations present in the document. - #

- # Stability: Added in v1.10, will exist for all future v1.x releases. - # @return Returns a dict which maps names to {@link #Destination - # destinations}. - def getNamedDestinations(self, tree=None, retval=None): - if retval == None: - retval = {} - catalog = self.trailer["/Root"] - - # get the name tree - if catalog.has_key("/Dests"): - tree = catalog["/Dests"] - elif catalog.has_key("/Names"): - names = catalog['/Names'] - if names.has_key("/Dests"): - tree = names['/Dests'] - - if tree == None: - return retval - - if tree.has_key("/Kids"): - # recurse down the tree - for kid in tree["/Kids"]: - self.getNamedDestinations(kid.getObject(), retval) - - if tree.has_key("/Names"): - names = tree["/Names"] - for i in range(0, len(names), 2): - key = names[i].getObject() - val = names[i+1].getObject() - if isinstance(val, DictionaryObject) and val.has_key('/D'): - val = val['/D'] - dest = self._buildDestination(key, val) - if dest != None: - retval[key] = dest - - return retval - - ## - # Read-only property that accesses the {@link #PdfFileReader.getOutlines - # getOutlines} function. - #

- # Stability: Added in v1.10, will exist for all future v1.x releases. - outlines = property(lambda self: self.getOutlines(), None, None) - - ## - # Retrieves the document outline present in the document. - #

- # Stability: Added in v1.10, will exist for all future v1.x releases. - # @return Returns a nested list of {@link #Destination destinations}. - def getOutlines(self, node=None, outlines=None): - if outlines == None: - outlines = [] - catalog = self.trailer["/Root"] - - # get the outline dictionary and named destinations - if catalog.has_key("/Outlines"): - lines = catalog["/Outlines"] - if lines.has_key("/First"): - node = lines["/First"] - self._namedDests = self.getNamedDestinations() - - if node == None: - return outlines - - # see if there are any more outlines - while 1: - outline = self._buildOutline(node) - if outline: - outlines.append(outline) - - # check for sub-outlines - if node.has_key("/First"): - subOutlines = [] - self.getOutlines(node["/First"], subOutlines) - if subOutlines: - outlines.append(subOutlines) - - if not node.has_key("/Next"): - break - node = node["/Next"] - - return outlines - - def _buildDestination(self, title, array): - page, typ = array[0:2] - array = array[2:] - return Destination(title, page, typ, *array) - - def _buildOutline(self, node): - dest, title, outline = None, None, None - - if node.has_key("/A") and node.has_key("/Title"): - # Action, section 8.5 (only type GoTo supported) - title = node["/Title"] - action = node["/A"] - if action["/S"] == "/GoTo": - dest = action["/D"] - elif node.has_key("/Dest") and node.has_key("/Title"): - # Destination, section 8.2.1 - title = node["/Title"] - dest = node["/Dest"] - - # if destination found, then create outline - if dest: - if isinstance(dest, ArrayObject): - outline = self._buildDestination(title, dest) - elif isinstance(dest, unicode) and self._namedDests.has_key(dest): - outline = self._namedDests[dest] - outline[NameObject("/Title")] = title - else: - raise utils.PdfReadError("Unexpected destination %r" % dest) - return outline - - ## - # Read-only property that emulates a list based upon the {@link - # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage - # getPage} functions. - #

- # Stability: Added in v1.7, and will exist for all future v1.x releases. - pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), - None, None) - - def _flatten(self, pages=None, inherit=None): - inheritablePageAttributes = ( - NameObject("/Resources"), NameObject("/MediaBox"), - NameObject("/CropBox"), NameObject("/Rotate") - ) - if inherit == None: - inherit = dict() - if pages == None: - self.flattenedPages = [] - catalog = self.trailer["/Root"].getObject() - pages = catalog["/Pages"].getObject() - t = pages["/Type"] - if t == "/Pages": - for attr in inheritablePageAttributes: - if pages.has_key(attr): - inherit[attr] = pages[attr] - for page in pages["/Kids"]: - self._flatten(page.getObject(), inherit) - elif t == "/Page": - for attr,value in inherit.items(): - # if the page has it's own value, it does not inherit the - # parent's value: - if not pages.has_key(attr): - pages[attr] = value - pageObj = PageObject(self) - pageObj.update(pages) - self.flattenedPages.append(pageObj) - - def getObject(self, indirectReference): - retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None) - if retval != None: - return retval - if indirectReference.generation == 0 and \ - self.xref_objStm.has_key(indirectReference.idnum): - # indirect reference to object in object stream - # read the entire object stream into memory - stmnum,idx = self.xref_objStm[indirectReference.idnum] - objStm = IndirectObject(stmnum, 0, self).getObject() - assert objStm['/Type'] == '/ObjStm' - assert idx < objStm['/N'] - streamData = StringIO(objStm.getData()) - for i in range(objStm['/N']): - objnum = NumberObject.readFromStream(streamData) - readNonWhitespace(streamData) - streamData.seek(-1, 1) - offset = NumberObject.readFromStream(streamData) - readNonWhitespace(streamData) - streamData.seek(-1, 1) - t = streamData.tell() - streamData.seek(objStm['/First']+offset, 0) - obj = readObject(streamData, self) - self.resolvedObjects[0][objnum] = obj - streamData.seek(t, 0) - return self.resolvedObjects[0][indirectReference.idnum] - start = self.xref[indirectReference.generation][indirectReference.idnum] - self.stream.seek(start, 0) - idnum, generation = self.readObjectHeader(self.stream) - assert idnum == indirectReference.idnum - assert generation == indirectReference.generation - retval = readObject(self.stream, self) - - # override encryption is used for the /Encrypt dictionary - if not self._override_encryption and self.isEncrypted: - # if we don't have the encryption key: - if not hasattr(self, '_decryption_key'): - from calibre.ebooks import DRMError - raise DRMError('File contents are encrypted') - # otherwise, decrypt here... - import struct, md5 - pack1 = struct.pack("= len(array): - break - - def readNextEndLine(self, stream): - line = "" - while True: - x = stream.read(1) - stream.seek(-2, 1) - if x == '\n' or x == '\r': - while x == '\n' or x == '\r': - x = stream.read(1) - stream.seek(-2, 1) - stream.seek(1, 1) - break - else: - line = x + line - return line - - ## - # When using an encrypted / secured PDF file with the PDF Standard - # encryption handler, this function will allow the file to be decrypted. - # It checks the given password against the document's user password and - # owner password, and then stores the resulting decryption key if either - # password is correct. - #

- # It does not matter which password was matched. Both passwords provide - # the correct decryption key that will allow the document to be used with - # this library. - #

- # Stability: Added in v1.8, will exist for all future v1.x releases. - # - # @return 0 if the password failed, 1 if the password matched the user - # password, and 2 if the password matched the owner password. - # - # @exception NotImplementedError Document uses an unsupported encryption - # method. - def decrypt(self, password): - self._override_encryption = True - try: - return self._decrypt(password) - finally: - self._override_encryption = False - - def _decrypt(self, password): - encrypt = self.trailer['/Encrypt'].getObject() - if encrypt['/Filter'] != '/Standard': - raise NotImplementedError, "only Standard PDF encryption handler is available" - if not (encrypt['/V'] in (1, 2)): - raise NotImplementedError, "only algorithm code 1 and 2 are supported" - user_password, key = self._authenticateUserPassword(password) - if user_password: - self._decryption_key = key - return 1 - else: - rev = encrypt['/R'].getObject() - if rev == 2: - keylen = 5 - else: - keylen = encrypt['/Length'].getObject() / 8 - key = _alg33_1(password, rev, keylen) - real_O = encrypt["/O"].getObject() - if rev == 2: - userpass = utils.RC4_encrypt(key, real_O) - else: - val = real_O - for i in range(19, -1, -1): - new_key = '' - for l in range(len(key)): - new_key += chr(ord(key[l]) ^ i) - val = utils.RC4_encrypt(new_key, val) - userpass = val - owner_password, key = self._authenticateUserPassword(userpass) - if owner_password: - self._decryption_key = key - return 2 - return 0 - - def _authenticateUserPassword(self, password): - encrypt = self.trailer['/Encrypt'].getObject() - rev = encrypt['/R'].getObject() - owner_entry = encrypt['/O'].getObject().original_bytes - p_entry = encrypt['/P'].getObject() - id_entry = self.trailer['/ID'].getObject() - id1_entry = id_entry[0].getObject() - if rev == 2: - U, key = _alg34(password, owner_entry, p_entry, id1_entry) - elif rev >= 3: - U, key = _alg35(password, rev, - encrypt["/Length"].getObject() / 8, owner_entry, - p_entry, id1_entry, - encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) - real_U = encrypt['/U'].getObject().original_bytes - return U == real_U, key - - def getIsEncrypted(self): - return self.trailer.has_key("/Encrypt") - - ## - # Read-only boolean property showing whether this PDF file is encrypted. - # Note that this property, if true, will remain true even after the {@link - # #PdfFileReader.decrypt decrypt} function is called. - isEncrypted = property(lambda self: self.getIsEncrypted(), None, None) - - -def getRectangle(self, name, defaults): - retval = self.get(name) - if isinstance(retval, RectangleObject): - return retval - if retval == None: - for d in defaults: - retval = self.get(d) - if retval != None: - break - if isinstance(retval, IndirectObject): - retval = self.pdf.getObject(retval) - retval = RectangleObject(retval) - setRectangle(self, name, retval) - return retval - -def setRectangle(self, name, value): - if not isinstance(name, NameObject): - name = NameObject(name) - self[name] = value - -def deleteRectangle(self, name): - del self[name] - -def createRectangleAccessor(name, fallback): - return \ - property( - lambda self: getRectangle(self, name, fallback), - lambda self, value: setRectangle(self, name, value), - lambda self: deleteRectangle(self, name) - ) - -## -# This class represents a single page within a PDF file. Typically this object -# will be created by accessing the {@link #PdfFileReader.getPage getPage} -# function of the {@link #PdfFileReader PdfFileReader} class. -class PageObject(DictionaryObject): - def __init__(self, pdf): - DictionaryObject.__init__(self) - self.pdf = pdf - - ## - # Rotates a page clockwise by increments of 90 degrees. - #

- # Stability: Added in v1.1, will exist for all future v1.x releases. - # @param angle Angle to rotate the page. Must be an increment of 90 deg. - def rotateClockwise(self, angle): - assert angle % 90 == 0 - self._rotate(angle) - return self - - ## - # Rotates a page counter-clockwise by increments of 90 degrees. - #

- # Stability: Added in v1.1, will exist for all future v1.x releases. - # @param angle Angle to rotate the page. Must be an increment of 90 deg. - def rotateCounterClockwise(self, angle): - assert angle % 90 == 0 - self._rotate(-angle) - return self - - def _rotate(self, angle): - currentAngle = self.get("/Rotate", 0) - self[NameObject("/Rotate")] = NumberObject(currentAngle + angle) - - def _mergeResources(res1, res2, resource): - newRes = DictionaryObject() - newRes.update(res1.get(resource, DictionaryObject()).getObject()) - page2Res = res2.get(resource, DictionaryObject()).getObject() - renameRes = {} - for key in page2Res.keys(): - if newRes.has_key(key) and newRes[key] != page2Res[key]: - newname = NameObject(key + "renamed") - renameRes[key] = newname - newRes[newname] = page2Res[key] - elif not newRes.has_key(key): - newRes[key] = page2Res[key] - return newRes, renameRes - _mergeResources = staticmethod(_mergeResources) - - def _contentStreamRename(stream, rename, pdf): - if not rename: - return stream - stream = ContentStream(stream, pdf) - for operands,operator in stream.operations: - for i in range(len(operands)): - op = operands[i] - if isinstance(op, NameObject): - operands[i] = rename.get(op, op) - return stream - _contentStreamRename = staticmethod(_contentStreamRename) - - def _pushPopGS(contents, pdf): - # adds a graphics state "push" and "pop" to the beginning and end - # of a content stream. This isolates it from changes such as - # transformation matricies. - stream = ContentStream(contents, pdf) - stream.operations.insert(0, [[], "q"]) - stream.operations.append([[], "Q"]) - return stream - _pushPopGS = staticmethod(_pushPopGS) - - ## - # Merges the content streams of two pages into one. Resource references - # (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc - # of this page are not altered. The parameter page's content stream will - # be added to the end of this page's content stream, meaning that it will - # be drawn after, or "on top" of this page. - #

- # Stability: Added in v1.4, will exist for all future 1.x releases. - # @param page2 An instance of {@link #PageObject PageObject} to be merged - # into this one. - def mergePage(self, page2): - - # First we work on merging the resource dictionaries. This allows us - # to find out what symbols in the content streams we might need to - # rename. - - newResources = DictionaryObject() - rename = {} - originalResources = self["/Resources"].getObject() - page2Resources = page2["/Resources"].getObject() - - for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading": - new, newrename = PageObject._mergeResources(originalResources, page2Resources, res) - if new: - newResources[NameObject(res)] = new - rename.update(newrename) - - # Combine /ProcSet sets. - newResources[NameObject("/ProcSet")] = ArrayObject( - frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union( - frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject()) - ) - ) - - newContentArray = ArrayObject() - - originalContent = self["/Contents"].getObject() - newContentArray.append(PageObject._pushPopGS(originalContent, self.pdf)) - - page2Content = page2['/Contents'].getObject() - page2Content = PageObject._contentStreamRename(page2Content, rename, self.pdf) - page2Content = PageObject._pushPopGS(page2Content, self.pdf) - newContentArray.append(page2Content) - - self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf) - self[NameObject('/Resources')] = newResources - - ## - # Compresses the size of this page by joining all content streams and - # applying a FlateDecode filter. - #

- # Stability: Added in v1.6, will exist for all future v1.x releases. - # However, it is possible that this function will perform no action if - # content stream compression becomes "automatic" for some reason. - def compressContentStreams(self): - content = self["/Contents"].getObject() - if not isinstance(content, ContentStream): - content = ContentStream(content, self.pdf) - self[NameObject("/Contents")] = content.flateEncode() - - ## - # Locate all text drawing commands, in the order they are provided in the - # content stream, and extract the text. This works well for some PDF - # files, but poorly for others, depending on the generator used. This will - # be refined in the future. Do not rely on the order of text coming out of - # this function, as it will change if this function is made more - # sophisticated. - #

- # Stability: Added in v1.7, will exist for all future v1.x releases. May - # be overhauled to provide more ordered text in the future. - # @return a unicode string object - def extractText(self): - text = u"" - content = self["/Contents"].getObject() - if not isinstance(content, ContentStream): - content = ContentStream(content, self.pdf) - # Note: we check all strings are TextStringObjects. ByteStringObjects - # are strings where the byte->string encoding was unknown, so adding - # them to the text here would be gibberish. - for operands,operator in content.operations: - if operator == "Tj": - _text = operands[0] - if isinstance(_text, TextStringObject): - text += _text - elif operator == "T*": - text += "\n" - elif operator == "'": - text += "\n" - _text = operands[0] - if isinstance(_text, TextStringObject): - text += operands[0] - elif operator == '"': - _text = operands[2] - if isinstance(_text, TextStringObject): - text += "\n" - text += _text - elif operator == "TJ": - for i in operands[0]: - if isinstance(i, TextStringObject): - text += i - return text - - ## - # A rectangle (RectangleObject), expressed in default user space units, - # defining the boundaries of the physical medium on which the page is - # intended to be displayed or printed. - #

- # Stability: Added in v1.4, will exist for all future v1.x releases. - mediaBox = createRectangleAccessor("/MediaBox", ()) - - ## - # A rectangle (RectangleObject), expressed in default user space units, - # defining the visible region of default user space. When the page is - # displayed or printed, its contents are to be clipped (cropped) to this - # rectangle and then imposed on the output medium in some - # implementation-defined manner. Default value: same as MediaBox. - #

- # Stability: Added in v1.4, will exist for all future v1.x releases. - cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",)) - - ## - # A rectangle (RectangleObject), expressed in default user space units, - # defining the region to which the contents of the page should be clipped - # when output in a production enviroment. - #

- # Stability: Added in v1.4, will exist for all future v1.x releases. - bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox")) - - ## - # A rectangle (RectangleObject), expressed in default user space units, - # defining the intended dimensions of the finished page after trimming. - #

- # Stability: Added in v1.4, will exist for all future v1.x releases. - trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox")) - - ## - # A rectangle (RectangleObject), expressed in default user space units, - # defining the extent of the page's meaningful content as intended by the - # page's creator. - #

- # Stability: Added in v1.4, will exist for all future v1.x releases. - artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox")) - - -class ContentStream(DecodedStreamObject): - def __init__(self, stream, pdf): - self.pdf = pdf - self.operations = [] - # stream may be a StreamObject or an ArrayObject containing - # multiple StreamObjects to be cat'd together. - stream = stream.getObject() - if isinstance(stream, ArrayObject): - data = "" - for s in stream: - data += s.getObject().getData() - stream = StringIO(data) - else: - stream = StringIO(stream.getData()) - self.__parseContentStream(stream) - - def __parseContentStream(self, stream): - # file("f:\\tmp.txt", "w").write(stream.read()) - stream.seek(0, 0) - operands = [] - while True: - peek = readNonWhitespace(stream) - if peek == '': - break - stream.seek(-1, 1) - if peek.isalpha() or peek == "'" or peek == '"': - operator = "" - while True: - tok = stream.read(1) - if tok.isspace() or tok in NameObject.delimiterCharacters: - stream.seek(-1, 1) - break - elif tok == '': - break - operator += tok - if operator == "BI": - # begin inline image - a completely different parsing - # mechanism is required, of course... thanks buddy... - assert operands == [] - ii = self._readInlineImage(stream) - self.operations.append((ii, "INLINE IMAGE")) - else: - self.operations.append((operands, operator)) - operands = [] - elif peek == '%': - # If we encounter a comment in the content stream, we have to - # handle it here. Typically, readObject will handle - # encountering a comment -- but readObject assumes that - # following the comment must be the object we're trying to - # read. In this case, it could be an operator instead. - while peek not in ('\r', '\n'): - peek = stream.read(1) - else: - operands.append(readObject(stream, None)) - - def _readInlineImage(self, stream): - # begin reading just after the "BI" - begin image - # first read the dictionary of settings. - settings = DictionaryObject() - while True: - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - if tok == "I": - # "ID" - begin of image data - break - key = readObject(stream, self.pdf) - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - value = readObject(stream, self.pdf) - settings[key] = value - # left at beginning of ID - tmp = stream.read(3) - assert tmp[:2] == "ID" - data = "" - while True: - tok = stream.read(1) - if tok == "E": - next = stream.read(1) - if next == "I": - break - else: - stream.seek(-1, 1) - data += tok - else: - data += tok - x = readNonWhitespace(stream) - stream.seek(-1, 1) - return {"settings": settings, "data": data} - - def _getData(self): - newdata = StringIO() - for operands,operator in self.operations: - if operator == "INLINE IMAGE": - newdata.write("BI") - dicttext = StringIO() - operands["settings"].writeToStream(dicttext, None) - newdata.write(dicttext.getvalue()[2:-2]) - newdata.write("ID ") - newdata.write(operands["data"]) - newdata.write("EI") - else: - for op in operands: - op.writeToStream(newdata, None) - newdata.write(" ") - newdata.write(operator) - newdata.write("\n") - return newdata.getvalue() - - def _setData(self, value): - self.__parseContentStream(StringIO(value)) - - _data = property(_getData, _setData) - - -## -# A class representing the basic document metadata provided in a PDF File. -#

-# As of pyPdf v1.10, all text properties of the document metadata have two -# properties, eg. author and author_raw. The non-raw property will always -# return a TextStringObject, making it ideal for a case where the metadata is -# being displayed. The raw property can sometimes return a ByteStringObject, -# if pyPdf was unable to decode the string's text encoding; this requires -# additional safety in the caller and therefore is not as commonly accessed. -class DocumentInformation(DictionaryObject): - def __init__(self): - DictionaryObject.__init__(self) - - def getText(self, key): - retval = self.get(key, None) - if isinstance(retval, TextStringObject): - return retval - return None - - ## - # Read-only property accessing the document's title. Added in v1.6, will - # exist for all future v1.x releases. Modified in v1.10 to always return a - # unicode string (TextStringObject). - # @return A unicode string, or None if the title is not provided. - title = property(lambda self: self.getText("/Title")) - title_raw = property(lambda self: self.get("/Title")) - - ## - # Read-only property accessing the document's author. Added in v1.6, will - # exist for all future v1.x releases. Modified in v1.10 to always return a - # unicode string (TextStringObject). - # @return A unicode string, or None if the author is not provided. - author = property(lambda self: self.getText("/Author")) - author_raw = property(lambda self: self.get("/Author")) - - ## - # Read-only property accessing the subject of the document. Added in v1.6, - # will exist for all future v1.x releases. Modified in v1.10 to always - # return a unicode string (TextStringObject). - # @return A unicode string, or None if the subject is not provided. - subject = property(lambda self: self.getText("/Subject")) - subject_raw = property(lambda self: self.get("/Subject")) - - ## - # Read-only property accessing the document's creator. If the document was - # converted to PDF from another format, the name of the application (for - # example, OpenOffice) that created the original document from which it was - # converted. Added in v1.6, will exist for all future v1.x releases. - # Modified in v1.10 to always return a unicode string (TextStringObject). - # @return A unicode string, or None if the creator is not provided. - creator = property(lambda self: self.getText("/Creator")) - creator_raw = property(lambda self: self.get("/Creator")) - - ## - # Read-only property accessing the document's producer. If the document - # was converted to PDF from another format, the name of the application - # (for example, OSX Quartz) that converted it to PDF. Added in v1.6, will - # exist for all future v1.x releases. Modified in v1.10 to always return a - # unicode string (TextStringObject). - # @return A unicode string, or None if the producer is not provided. - producer = property(lambda self: self.getText("/Producer")) - producer_raw = property(lambda self: self.get("/Producer")) - - -## -# A class representing a destination within a PDF file. -# See section 8.2.1 of the PDF 1.6 reference. -# Stability: Added in v1.10, will exist for all v1.x releases. -class Destination(DictionaryObject): - def __init__(self, title, page, typ, *args): - DictionaryObject.__init__(self) - self[NameObject("/Title")] = title - self[NameObject("/Page")] = page - self[NameObject("/Type")] = typ - - # from table 8.2 of the PDF 1.6 reference. - if typ == "/XYZ": - (self[NameObject("/Left")], self[NameObject("/Top")], - self[NameObject("/Zoom")]) = args - elif typ == "/FitR": - (self[NameObject("/Left")], self[NameObject("/Bottom")], - self[NameObject("/Right")], self[NameObject("/Top")]) = args - elif typ in ["/FitH", "FitBH"]: - self[NameObject("/Top")], = args - elif typ in ["/FitV", "FitBV"]: - self[NameObject("/Left")], = args - elif typ in ["/Fit", "FitB"]: - pass - else: - raise utils.PdfReadError("Unknown Destination Type: %r" % typ) - - ## - # Read-only property accessing the destination title. - # @return A string. - title = property(lambda self: self.get("/Title")) - - ## - # Read-only property accessing the destination page. - # @return An integer. - page = property(lambda self: self.get("/Page")) - - ## - # Read-only property accessing the destination type. - # @return A string. - typ = property(lambda self: self.get("/Type")) - - ## - # Read-only property accessing the zoom factor. - # @return A number, or None if not available. - zoom = property(lambda self: self.get("/Zoom", None)) - - ## - # Read-only property accessing the left horizontal coordinate. - # @return A number, or None if not available. - left = property(lambda self: self.get("/Left", None)) - - ## - # Read-only property accessing the right horizontal coordinate. - # @return A number, or None if not available. - right = property(lambda self: self.get("/Right", None)) - - ## - # Read-only property accessing the top vertical coordinate. - # @return A number, or None if not available. - top = property(lambda self: self.get("/Top", None)) - - ## - # Read-only property accessing the bottom vertical coordinate. - # @return A number, or None if not available. - bottom = property(lambda self: self.get("/Bottom", None)) - -def convertToInt(d, size): - if size > 8: - raise utils.PdfReadError("invalid size in convertToInt") - d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d - d = d[-8:] - return struct.unpack(">q", d)[0] - -# ref: pdf1.8 spec section 3.5.2 algorithm 3.2 -_encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \ - '\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c' + \ - '\xa9\xfe\x64\x53\x69\x7a' - -# Implementation of algorithm 3.2 of the PDF standard security handler, -# section 3.5.2 of the PDF 1.6 reference. -def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True): - # 1. Pad or truncate the password string to exactly 32 bytes. If the - # password string is more than 32 bytes long, use only its first 32 bytes; - # if it is less than 32 bytes long, pad it by appending the required number - # of additional bytes from the beginning of the padding string - # (_encryption_padding). - password = (password + _encryption_padding)[:32] - # 2. Initialize the MD5 hash function and pass the result of step 1 as - # input to this function. - import md5, struct - m = md5.new(password) - # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash - # function. - m.update(owner_entry) - # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass - # these bytes to the MD5 hash function, low-order byte first. - p_entry = struct.pack('= 3 and not metadata_encrypt: - m.update("\xff\xff\xff\xff") - # 7. Finish the hash. - md5_hash = m.digest() - # 8. (Revision 3 or greater) Do the following 50 times: Take the output - # from the previous MD5 hash and pass the first n bytes of the output as - # input into a new MD5 hash, where n is the number of bytes of the - # encryption key as defined by the value of the encryption dictionary's - # /Length entry. - if rev >= 3: - for i in range(50): - md5_hash = md5.new(md5_hash[:keylen]).digest() - # 9. Set the encryption key to the first n bytes of the output from the - # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or - # greater, depends on the value of the encryption dictionary's /Length - # entry. - return md5_hash[:keylen] - -# Implementation of algorithm 3.3 of the PDF standard security handler, -# section 3.5.2 of the PDF 1.6 reference. -def _alg33(owner_pwd, user_pwd, rev, keylen): - # steps 1 - 4 - key = _alg33_1(owner_pwd, rev, keylen) - # 5. Pad or truncate the user password string as described in step 1 of - # algorithm 3.2. - user_pwd = (user_pwd + _encryption_padding)[:32] - # 6. Encrypt the result of step 5, using an RC4 encryption function with - # the encryption key obtained in step 4. - val = utils.RC4_encrypt(key, user_pwd) - # 7. (Revision 3 or greater) Do the following 19 times: Take the output - # from the previous invocation of the RC4 function and pass it as input to - # a new invocation of the function; use an encryption key generated by - # taking each byte of the encryption key obtained in step 4 and performing - # an XOR operation between that byte and the single-byte value of the - # iteration counter (from 1 to 19). - if rev >= 3: - for i in range(1, 20): - new_key = '' - for l in range(len(key)): - new_key += chr(ord(key[l]) ^ i) - val = utils.RC4_encrypt(new_key, val) - # 8. Store the output from the final invocation of the RC4 as the value of - # the /O entry in the encryption dictionary. - return val - -# Steps 1-4 of algorithm 3.3 -def _alg33_1(password, rev, keylen): - # 1. Pad or truncate the owner password string as described in step 1 of - # algorithm 3.2. If there is no owner password, use the user password - # instead. - password = (password + _encryption_padding)[:32] - # 2. Initialize the MD5 hash function and pass the result of step 1 as - # input to this function. - import md5 - m = md5.new(password) - # 3. (Revision 3 or greater) Do the following 50 times: Take the output - # from the previous MD5 hash and pass it as input into a new MD5 hash. - md5_hash = m.digest() - if rev >= 3: - for i in range(50): - md5_hash = md5.new(md5_hash).digest() - # 4. Create an RC4 encryption key using the first n bytes of the output - # from the final MD5 hash, where n is always 5 for revision 2 but, for - # revision 3 or greater, depends on the value of the encryption - # dictionary's /Length entry. - key = md5_hash[:keylen] - return key - -# Implementation of algorithm 3.4 of the PDF standard security handler, -# section 3.5.2 of the PDF 1.6 reference. -def _alg34(password, owner_entry, p_entry, id1_entry): - # 1. Create an encryption key based on the user password string, as - # described in algorithm 3.2. - key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) - # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, - # using an RC4 encryption function with the encryption key from the - # preceding step. - U = utils.RC4_encrypt(key, _encryption_padding) - # 3. Store the result of step 2 as the value of the /U entry in the - # encryption dictionary. - return U, key - -# Implementation of algorithm 3.4 of the PDF standard security handler, -# section 3.5.2 of the PDF 1.6 reference. -def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): - # 1. Create an encryption key based on the user password string, as - # described in Algorithm 3.2. - key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) - # 2. Initialize the MD5 hash function and pass the 32-byte padding string - # shown in step 1 of Algorithm 3.2 as input to this function. - import md5 - m = md5.new() - m.update(_encryption_padding) - # 3. Pass the first element of the file's file identifier array (the value - # of the ID entry in the document's trailer dictionary; see Table 3.13 on - # page 73) to the hash function and finish the hash. (See implementation - # note 25 in Appendix H.) - m.update(id1_entry) - md5_hash = m.digest() - # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption - # function with the encryption key from step 1. - val = utils.RC4_encrypt(key, md5_hash) - # 5. Do the following 19 times: Take the output from the previous - # invocation of the RC4 function and pass it as input to a new invocation - # of the function; use an encryption key generated by taking each byte of - # the original encryption key (obtained in step 2) and performing an XOR - # operation between that byte and the single-byte value of the iteration - # counter (from 1 to 19). - for i in range(1, 20): - new_key = '' - for l in range(len(key)): - new_key += chr(ord(key[l]) ^ i) - val = utils.RC4_encrypt(new_key, val) - # 6. Append 16 bytes of arbitrary padding to the output from the final - # invocation of the RC4 function and store the 32-byte result as the value - # of the U entry in the encryption dictionary. - # (implementator note: I don't know what "arbitrary padding" is supposed to - # mean, so I have used null bytes. This seems to match a few other - # people's implementations) - return val + ('\x00' * 16), key - -#if __name__ == "__main__": -# output = PdfFileWriter() -# -# input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb")) -# page1 = input1.getPage(0) -# -# input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb")) -# page2 = input2.getPage(0) -# page3 = input2.getPage(1) -# page1.mergePage(page2) -# page1.mergePage(page3) -# -# input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb")) -# page1.mergePage(input3.getPage(0)) -# -# page1.compressContentStreams() -# -# output.addPage(page1) -# output.write(file("test\\merge-test.pdf", "wb")) - - diff --git a/src/pyPdf/utils.py b/src/pyPdf/utils.py deleted file mode 100644 index dd0a3d002a..0000000000 --- a/src/pyPdf/utils.py +++ /dev/null @@ -1,110 +0,0 @@ -# vim: sw=4:expandtab:foldmethod=marker -# -# Copyright (c) 2006, Mathieu Fenniak -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -""" -Utility functions for PDF library. -""" -__author__ = "Mathieu Fenniak" -__author_email__ = "biziqe@mathieu.fenniak.net" - -#ENABLE_PSYCO = False -#if ENABLE_PSYCO: -# try: -# import psyco -# except ImportError: -# ENABLE_PSYCO = False -# -#if not ENABLE_PSYCO: -# class psyco: -# def proxy(func): -# return func -# proxy = staticmethod(proxy) - -def readUntilWhitespace(stream, maxchars=None): - txt = "" - while True: - tok = stream.read(1) - if tok.isspace() or not tok: - break - txt += tok - if len(txt) == maxchars: - break - return txt - -def readNonWhitespace(stream): - tok = ' ' - while tok == '\n' or tok == '\r' or tok == ' ' or tok == '\t': - tok = stream.read(1) - return tok - -class ConvertFunctionsToVirtualList(object): - def __init__(self, lengthFunction, getFunction): - self.lengthFunction = lengthFunction - self.getFunction = getFunction - - def __len__(self): - return self.lengthFunction() - - def __getitem__(self, index): - if not isinstance(index, int): - raise TypeError, "sequence indices must be integers" - len_self = len(self) - if index < 0: - # support negative indexes - index = len_self + index - if index < 0 or index >= len_self: - raise IndexError, "sequence index out of range" - return self.getFunction(index) - -def RC4_encrypt(key, plaintext): - S = [i for i in range(256)] - j = 0 - for i in range(256): - j = (j + S[i] + ord(key[i % len(key)])) % 256 - S[i], S[j] = S[j], S[i] - i, j = 0, 0 - retval = "" - for x in range(len(plaintext)): - i = (i + 1) % 256 - j = (j + S[i]) % 256 - S[i], S[j] = S[j], S[i] - t = S[(S[i] + S[j]) % 256] - retval += chr(ord(plaintext[x]) ^ t) - return retval - -class PdfReadError(Exception): - pass - -if __name__ == "__main__": - # test RC4 - out = RC4_encrypt("Key", "Plaintext") - print repr(out) - pt = RC4_encrypt("Key", out) - print repr(pt) diff --git a/src/pyPdf/xmp.py b/src/pyPdf/xmp.py deleted file mode 100644 index b070df9093..0000000000 --- a/src/pyPdf/xmp.py +++ /dev/null @@ -1,355 +0,0 @@ -import re -import datetime -import decimal -from generic import PdfObject -from xml.dom import getDOMImplementation -from xml.dom.minidom import parseString - -RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" -DC_NAMESPACE = "http://purl.org/dc/elements/1.1/" -XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/" -PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/" -XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/" - -# What is the PDFX namespace, you might ask? I might ask that too. It's -# a completely undocumented namespace used to place "custom metadata" -# properties, which are arbitrary metadata properties with no semantic or -# documented meaning. Elements in the namespace are key/value-style storage, -# where the element name is the key and the content is the value. The keys -# are transformed into valid XML identifiers by substituting an invalid -# identifier character with \u2182 followed by the unicode hex ID of the -# original character. A key like "my car" is therefore "my\u21820020car". -# -# \u2182, in case you're wondering, is the unicode character -# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for -# escaping characters. -# -# Intentional users of the pdfx namespace should be shot on sight. A -# custom data schema and sensical XML elements could be used instead, as is -# suggested by Adobe's own documentation on XMP (under "Extensibility of -# Schemas"). -# -# Information presented here on the /pdfx/ schema is a result of limited -# reverse engineering, and does not constitute a full specification. -PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/" - -iso8601 = re.compile(""" - (?P[0-9]{4}) - (- - (?P[0-9]{2}) - (- - (?P[0-9]+) - (T - (?P[0-9]{2}): - (?P[0-9]{2}) - (:(?P[0-9]{2}(.[0-9]+)?))? - (?PZ|[-+][0-9]{2}:[0-9]{2}) - )? - )? - )? - """, re.VERBOSE) - -## -# An object that represents Adobe XMP metadata. -class XmpInformation(PdfObject): - - def __init__(self, stream): - self.stream = stream - docRoot = parseString(self.stream.getData()) - self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0] - self.cache = {} - - def writeToStream(self, stream, encryption_key): - self.stream.writeToStream(stream, encryption_key) - - def getElement(self, aboutUri, namespace, name): - for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): - if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri: - attr = desc.getAttributeNodeNS(namespace, name) - if attr != None: - yield attr - for element in desc.getElementsByTagNameNS(namespace, name): - yield element - - def getNodesInNamespace(self, aboutUri, namespace): - for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): - if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri: - for i in range(desc.attributes.length): - attr = desc.attributes.item(i) - if attr.namespaceURI == namespace: - yield attr - for child in desc.childNodes: - if child.namespaceURI == namespace: - yield child - - def _getText(self, element): - text = "" - for child in element.childNodes: - if child.nodeType == child.TEXT_NODE: - text += child.data - return text - - def _converter_string(value): - return value - - def _converter_date(value): - m = iso8601.match(value) - year = int(m.group("year")) - month = int(m.group("month") or "1") - day = int(m.group("day") or "1") - hour = int(m.group("hour") or "0") - minute = int(m.group("minute") or "0") - second = decimal.Decimal(m.group("second") or "0") - seconds = second.to_integral(decimal.ROUND_FLOOR) - milliseconds = (second - seconds) * 1000000 - tzd = m.group("tzd") or "Z" - dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds) - if tzd != "Z": - tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")] - tzd_hours *= -1 - if tzd_hours < 0: - tzd_minutes *= -1 - dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes) - return dt - _test_converter_date = staticmethod(_converter_date) - - def _getter_bag(namespace, name, converter): - def get(self): - cached = self.cache.get(namespace, {}).get(name) - if cached: - return cached - retval = [] - for element in self.getElement("", namespace, name): - bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag") - if len(bags): - for bag in bags: - for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"): - value = self._getText(item) - value = converter(value) - retval.append(value) - ns_cache = self.cache.setdefault(namespace, {}) - ns_cache[name] = retval - return retval - return get - - def _getter_seq(namespace, name, converter): - def get(self): - cached = self.cache.get(namespace, {}).get(name) - if cached: - return cached - retval = [] - for element in self.getElement("", namespace, name): - seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq") - if len(seqs): - for seq in seqs: - for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"): - value = self._getText(item) - value = converter(value) - retval.append(value) - else: - value = converter(self._getText(element)) - retval.append(value) - ns_cache = self.cache.setdefault(namespace, {}) - ns_cache[name] = retval - return retval - return get - - def _getter_langalt(namespace, name, converter): - def get(self): - cached = self.cache.get(namespace, {}).get(name) - if cached: - return cached - retval = {} - for element in self.getElement("", namespace, name): - alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt") - if len(alts): - for alt in alts: - for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"): - value = self._getText(item) - value = converter(value) - retval[item.getAttribute("xml:lang")] = value - else: - retval["x-default"] = converter(self._getText(element)) - ns_cache = self.cache.setdefault(namespace, {}) - ns_cache[name] = retval - return retval - return get - - def _getter_single(namespace, name, converter): - def get(self): - cached = self.cache.get(namespace, {}).get(name) - if cached: - return cached - value = None - for element in self.getElement("", namespace, name): - if element.nodeType == element.ATTRIBUTE_NODE: - value = element.nodeValue - else: - value = self._getText(element) - break - if value != None: - value = converter(value) - ns_cache = self.cache.setdefault(namespace, {}) - ns_cache[name] = value - return value - return get - - ## - # Contributors to the resource (other than the authors). An unsorted - # array of names. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string)) - - ## - # Text describing the extent or scope of the resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string)) - - ## - # A sorted array of names of the authors of the resource, listed in order - # of precedence. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string)) - - ## - # A sorted array of dates (datetime.datetime instances) of signifigance to - # the resource. The dates and times are in UTC. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date)) - - ## - # A language-keyed dictionary of textual descriptions of the content of the - # resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string)) - - ## - # The mime-type of the resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string)) - - ## - # Unique identifier of the resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string)) - - ## - # An unordered array specifying the languages used in the resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string)) - - ## - # An unordered array of publisher names. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string)) - - ## - # An unordered array of text descriptions of relationships to other - # documents. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string)) - - ## - # A language-keyed dictionary of textual descriptions of the rights the - # user has to this resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string)) - - ## - # Unique identifier of the work from which this resource was derived. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string)) - - ## - # An unordered array of descriptive phrases or keywrods that specify the - # topic of the content of the resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string)) - - ## - # A language-keyed dictionary of the title of the resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string)) - - ## - # An unordered array of textual descriptions of the document type. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string)) - - ## - # An unformatted text string representing document keywords. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string)) - - ## - # The PDF file version, for example 1.0, 1.3. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string)) - - ## - # The name of the tool that created the PDF document. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string)) - - ## - # The date and time the resource was originally created. The date and - # time are returned as a UTC datetime.datetime object. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)) - - ## - # The date and time the resource was last modified. The date and time - # are returned as a UTC datetime.datetime object. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)) - - ## - # The date and time that any metadata for this resource was last - # changed. The date and time are returned as a UTC datetime.datetime - # object. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)) - - ## - # The name of the first known tool used to create the resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string)) - - ## - # The common identifier for all versions and renditions of this resource. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string)) - - ## - # An identifier for a specific incarnation of a document, updated each - # time a file is saved. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string)) - - def custom_properties(self): - if not hasattr(self, "_custom_properties"): - self._custom_properties = {} - for node in self.getNodesInNamespace("", PDFX_NAMESPACE): - key = node.localName - while True: - # see documentation about PDFX_NAMESPACE earlier in file - idx = key.find(u"\u2182") - if idx == -1: - break - key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:] - if node.nodeType == node.ATTRIBUTE_NODE: - value = node.nodeValue - else: - value = self._getText(node) - self._custom_properties[key] = value - return self._custom_properties - - ## - # Retrieves custom metadata properties defined in the undocumented pdfx - # metadata schema. - #

Stability: Added in v1.12, will exist for all future v1.x releases. - # @return Returns a dictionary of key/value items for custom metadata - # properties. - custom_properties = property(custom_properties) - -