mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Remove the unmaintained pdfmanipulate command line utility
This commit is contained in:
parent
0a22c291b7
commit
b69fb230c5
@ -444,23 +444,6 @@ class CurrentDir(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class StreamReadWrapper(object):
|
|
||||||
'''
|
|
||||||
Used primarily with pyPdf to ensure the stream is properly closed.
|
|
||||||
'''
|
|
||||||
|
|
||||||
def __init__(self, stream):
|
|
||||||
for x in ('read', 'seek', 'tell'):
|
|
||||||
setattr(self, x, getattr(stream, x))
|
|
||||||
|
|
||||||
def __exit__(self, *args):
|
|
||||||
for x in ('read', 'seek', 'tell'):
|
|
||||||
setattr(self, x, None)
|
|
||||||
|
|
||||||
def __enter__(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
def detect_ncpus():
|
def detect_ncpus():
|
||||||
"""Detects the number of effective CPUs in the system"""
|
"""Detects the number of effective CPUs in the system"""
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
@ -1,72 +0,0 @@
|
|||||||
from __future__ import with_statement
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Command line interface to run pdf manipulation commands.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import string, sys
|
|
||||||
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.ebooks.pdf.manipulate import crop, decrypt, encrypt, \
|
|
||||||
info, merge, reverse, rotate, split
|
|
||||||
|
|
||||||
COMMANDS = {
|
|
||||||
'crop' : crop,
|
|
||||||
'decrypt' : decrypt,
|
|
||||||
'encrypt' : encrypt,
|
|
||||||
'info' : info,
|
|
||||||
'merge' : merge,
|
|
||||||
'reverse' : reverse,
|
|
||||||
'rotate' : rotate,
|
|
||||||
'split' : split,
|
|
||||||
}
|
|
||||||
|
|
||||||
USAGE = '%prog ' + _('''command ...
|
|
||||||
|
|
||||||
command can be one of the following:
|
|
||||||
[%%commands]
|
|
||||||
|
|
||||||
Use %prog command --help to get more information about a specific command
|
|
||||||
|
|
||||||
Manipulate a PDF.
|
|
||||||
''').replace('%%commands', string.join(sorted(COMMANDS.keys()), ', '))
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser():
|
|
||||||
return OptionParser(usage=USAGE)
|
|
||||||
|
|
||||||
def main(args=sys.argv):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser()
|
|
||||||
|
|
||||||
if len(args) < 2:
|
|
||||||
print 'Error: No command sepecified.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
command = args[1].lower().strip()
|
|
||||||
|
|
||||||
if command in COMMANDS.keys():
|
|
||||||
del args[1]
|
|
||||||
return COMMANDS[command].main(args, command)
|
|
||||||
else:
|
|
||||||
parser.parse_args(args)
|
|
||||||
print 'Unknown command %s.\n' % command
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
# We should never get here.
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,150 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, James Beal <james_@catbus.co.uk>, ' \
|
|
||||||
'2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Crop a pdf file
|
|
||||||
'''
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import re
|
|
||||||
from decimal import Decimal
|
|
||||||
from optparse import OptionGroup, Option
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata.meta import metadata_from_formats
|
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
|
||||||
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
|
|
||||||
|
|
||||||
from pyPdf import PdfFileWriter, PdfFileReader
|
|
||||||
|
|
||||||
DEFAULT_CROP = 10
|
|
||||||
|
|
||||||
USAGE = '\n%prog %%name ' + _('''\
|
|
||||||
[options] file.pdf
|
|
||||||
|
|
||||||
Crop a PDF file.
|
|
||||||
''')
|
|
||||||
|
|
||||||
OPTIONS = set([
|
|
||||||
OptionRecommendation(name='output', recommended_value='cropped.pdf',
|
|
||||||
level=OptionRecommendation.HIGH, long_switch='output', short_switch='o',
|
|
||||||
help=_('Path to output file. By default a file is created in the current directory.')),
|
|
||||||
OptionRecommendation(name='bottom_left_x', recommended_value=DEFAULT_CROP,
|
|
||||||
level=OptionRecommendation.LOW, long_switch='left-x', short_switch='x',
|
|
||||||
help=_('Number of pixels to crop from the left most x (default is %s)') % DEFAULT_CROP),
|
|
||||||
OptionRecommendation(name='bottom_left_y', recommended_value=DEFAULT_CROP,
|
|
||||||
level=OptionRecommendation.LOW, long_switch='left-y', short_switch='y',
|
|
||||||
help=_('Number of pixels to crop from the left most y (default is %s)') % DEFAULT_CROP),
|
|
||||||
OptionRecommendation(name='top_right_x', recommended_value=DEFAULT_CROP,
|
|
||||||
level=OptionRecommendation.LOW, long_switch='right-x', short_switch='v',
|
|
||||||
help=_('Number of pixels to crop from the right most x (default is %s)') % DEFAULT_CROP),
|
|
||||||
OptionRecommendation(name='top_right_y', recommended_value=DEFAULT_CROP,
|
|
||||||
level=OptionRecommendation.LOW, long_switch='right-y', short_switch='w',
|
|
||||||
help=_('Number of pixels to crop from the right most y (default is %s)') % DEFAULT_CROP),
|
|
||||||
OptionRecommendation(name='bounding', recommended_value=None,
|
|
||||||
level=OptionRecommendation.LOW, long_switch='bounding', short_switch='b',
|
|
||||||
help=_('A file generated by ghostscript which allows each page to be individually cropped `gs -dSAFER -dNOPAUSE -dBATCH -sDEVICE=bbox file.pdf 2> bounding`')),
|
|
||||||
])
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser(name):
|
|
||||||
usage = USAGE.replace('%%name', name)
|
|
||||||
return OptionParser(usage=usage)
|
|
||||||
|
|
||||||
def option_recommendation_to_cli_option(add_option, rec):
|
|
||||||
opt = rec.option
|
|
||||||
switches = ['-'+opt.short_switch] if opt.short_switch else []
|
|
||||||
switches.append('--'+opt.long_switch)
|
|
||||||
attrs = dict(dest=opt.name, help=opt.help,
|
|
||||||
choices=opt.choices, default=rec.recommended_value)
|
|
||||||
add_option(Option(*switches, **attrs))
|
|
||||||
|
|
||||||
def add_options(parser):
|
|
||||||
group = OptionGroup(parser, _('Crop Options:'), _('Options to control the transformation of pdf'))
|
|
||||||
parser.add_option_group(group)
|
|
||||||
add_option = group.add_option
|
|
||||||
|
|
||||||
for rec in OPTIONS:
|
|
||||||
option_recommendation_to_cli_option(add_option, rec)
|
|
||||||
|
|
||||||
def crop_pdf(pdf_path, opts, metadata=None):
|
|
||||||
if metadata == None:
|
|
||||||
title = _('Unknown')
|
|
||||||
author = _('Unknown')
|
|
||||||
else:
|
|
||||||
title = metadata.title
|
|
||||||
author = authors_to_string(metadata.authors)
|
|
||||||
|
|
||||||
input_pdf = PdfFileReader(open(pdf_path, 'rb'))
|
|
||||||
|
|
||||||
bounding_lines = []
|
|
||||||
if opts.bounding != None:
|
|
||||||
try:
|
|
||||||
bounding = open(opts.bounding , 'r')
|
|
||||||
bounding_regex = re.compile('%%BoundingBox: (?P<bottom_x>\d+) (?P<bottom_y>\d+) (?P<top_x>\d+) (?P<top_y>\d+)')
|
|
||||||
except:
|
|
||||||
raise Exception('Error reading %s' % opts.bounding)
|
|
||||||
|
|
||||||
lines = bounding.readlines()
|
|
||||||
for line in lines:
|
|
||||||
if line.startswith('%%BoundingBox:'):
|
|
||||||
bounding_lines.append(line)
|
|
||||||
if len(bounding_lines) != input_pdf.numPages:
|
|
||||||
raise Exception('Error bounding file %s page count does not correspond to specified pdf' % opts.bounding)
|
|
||||||
|
|
||||||
output_pdf = PdfFileWriter(title=title,author=author)
|
|
||||||
blines = iter(bounding_lines)
|
|
||||||
for page in input_pdf.pages:
|
|
||||||
if bounding_lines != []:
|
|
||||||
mo = bounding_regex.search(blines.next())
|
|
||||||
if mo == None:
|
|
||||||
raise Exception('Error in bounding file %s' % opts.bounding)
|
|
||||||
page.mediaBox.upperRight = (float(mo.group('top_x')), Decimal(mo.group('top_y')))
|
|
||||||
page.mediaBox.lowerLeft = (float(mo.group('bottom_x')), Decimal(mo.group('bottom_y')))
|
|
||||||
else:
|
|
||||||
page.mediaBox.upperRight = (page.bleedBox.getUpperRight_x() - Decimal(opts.top_right_x), page.bleedBox.getUpperRight_y() - Decimal(opts.top_right_y))
|
|
||||||
page.mediaBox.lowerLeft = (page.bleedBox.getLowerLeft_x() + Decimal(opts.bottom_left_x), page.bleedBox.getLowerLeft_y() + Decimal(opts.bottom_left_y))
|
|
||||||
output_pdf.addPage(page)
|
|
||||||
|
|
||||||
with open(opts.output, 'wb') as output_file:
|
|
||||||
output_pdf.write(output_file)
|
|
||||||
|
|
||||||
def main(args=sys.argv, name=''):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser(name)
|
|
||||||
add_options(parser)
|
|
||||||
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
args = args[1:]
|
|
||||||
|
|
||||||
if len(args) < 1:
|
|
||||||
print 'Error: A PDF file is required.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if not is_valid_pdf(args[0]):
|
|
||||||
print 'Error: Could not read file `%s`.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if is_encrypted(args[0]):
|
|
||||||
print 'Error: file `%s` is encrypted.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
mi = metadata_from_formats([args[0]])
|
|
||||||
|
|
||||||
crop_pdf(args[0], opts, mi)
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,113 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
from __future__ import with_statement
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Decrypt content of PDF.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os, sys
|
|
||||||
from optparse import OptionGroup, Option
|
|
||||||
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
|
||||||
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
|
|
||||||
|
|
||||||
from pyPdf import PdfFileWriter, PdfFileReader
|
|
||||||
|
|
||||||
USAGE = '\n%prog %%name ' + _('''\
|
|
||||||
[options] file.pdf password
|
|
||||||
|
|
||||||
Decrypt a PDF.
|
|
||||||
''')
|
|
||||||
|
|
||||||
OPTIONS = set([
|
|
||||||
OptionRecommendation(name='output', recommended_value='decrypted.pdf',
|
|
||||||
level=OptionRecommendation.HIGH, long_switch='output', short_switch='o',
|
|
||||||
help=_('Path to output file. By default a file is created in the current directory.')),
|
|
||||||
])
|
|
||||||
|
|
||||||
class DecryptionError(Exception):
|
|
||||||
def __init__(self, pdf_path):
|
|
||||||
self.value = 'Unable to decrypt file `%s`.' % pdf_path
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return repr(self.value)
|
|
||||||
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser(name):
|
|
||||||
usage = USAGE.replace('%%name', name)
|
|
||||||
return OptionParser(usage=usage)
|
|
||||||
|
|
||||||
def option_recommendation_to_cli_option(add_option, rec):
|
|
||||||
opt = rec.option
|
|
||||||
switches = ['-'+opt.short_switch] if opt.short_switch else []
|
|
||||||
switches.append('--'+opt.long_switch)
|
|
||||||
attrs = dict(dest=opt.name, help=opt.help,
|
|
||||||
choices=opt.choices, default=rec.recommended_value)
|
|
||||||
add_option(Option(*switches, **attrs))
|
|
||||||
|
|
||||||
def add_options(parser):
|
|
||||||
group = OptionGroup(parser, _('Decrypt Options:'), _('Options to control the transformation of pdf'))
|
|
||||||
parser.add_option_group(group)
|
|
||||||
add_option = group.add_option
|
|
||||||
|
|
||||||
for rec in OPTIONS:
|
|
||||||
option_recommendation_to_cli_option(add_option, rec)
|
|
||||||
|
|
||||||
def decrypt(pdf_path, out_path, password):
|
|
||||||
pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
|
|
||||||
|
|
||||||
if pdf.decrypt(str(password)) == 0:
|
|
||||||
raise DecryptionError(pdf_path)
|
|
||||||
|
|
||||||
title = pdf.documentInfo.title if pdf.documentInfo.title else _('Unknown')
|
|
||||||
author = pdf.documentInfo.author if pdf.documentInfo.author else _('Unknown')
|
|
||||||
out_pdf = PdfFileWriter(title=title, author=author)
|
|
||||||
|
|
||||||
for page in pdf.pages:
|
|
||||||
out_pdf.addPage(page)
|
|
||||||
|
|
||||||
with open(out_path, 'wb') as out_file:
|
|
||||||
out_pdf.write(out_file)
|
|
||||||
|
|
||||||
def main(args=sys.argv, name=''):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser(name)
|
|
||||||
add_options(parser)
|
|
||||||
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
args = args[1:]
|
|
||||||
|
|
||||||
if len(args) < 2:
|
|
||||||
print 'Error: A PDF file and decryption password is required.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if not is_valid_pdf(args[0]):
|
|
||||||
print 'Error: Could not read file `%s`.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if not is_encrypted(args[0]):
|
|
||||||
print 'Error: file `%s` is not encrypted.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
try:
|
|
||||||
decrypt(args[0], opts.output, args[1])
|
|
||||||
except DecryptionError as e:
|
|
||||||
print e.value
|
|
||||||
return 1
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,107 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
from __future__ import with_statement
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Encrypt a PDF.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os, sys
|
|
||||||
from optparse import OptionGroup, Option
|
|
||||||
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
|
||||||
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
|
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
|
||||||
from calibre.ebooks.metadata.meta import metadata_from_formats
|
|
||||||
|
|
||||||
from pyPdf import PdfFileWriter, PdfFileReader
|
|
||||||
|
|
||||||
USAGE = '\n%prog %%name ' + _('''\
|
|
||||||
[options] file.pdf password
|
|
||||||
|
|
||||||
Encrypt a PDF.
|
|
||||||
''')
|
|
||||||
|
|
||||||
OPTIONS = set([
|
|
||||||
OptionRecommendation(name='output', recommended_value='encrypted.pdf',
|
|
||||||
level=OptionRecommendation.HIGH, long_switch='output', short_switch='o',
|
|
||||||
help=_('Path to output file. By default a file is created in the current directory.')),
|
|
||||||
])
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser(name):
|
|
||||||
usage = USAGE.replace('%%name', name)
|
|
||||||
return OptionParser(usage=usage)
|
|
||||||
|
|
||||||
def option_recommendation_to_cli_option(add_option, rec):
|
|
||||||
opt = rec.option
|
|
||||||
switches = ['-'+opt.short_switch] if opt.short_switch else []
|
|
||||||
switches.append('--'+opt.long_switch)
|
|
||||||
attrs = dict(dest=opt.name, help=opt.help,
|
|
||||||
choices=opt.choices, default=rec.recommended_value)
|
|
||||||
add_option(Option(*switches, **attrs))
|
|
||||||
|
|
||||||
def add_options(parser):
|
|
||||||
group = OptionGroup(parser, _('Encrypt Options:'), _('Options to control the transformation of pdf'))
|
|
||||||
parser.add_option_group(group)
|
|
||||||
add_option = group.add_option
|
|
||||||
|
|
||||||
for rec in OPTIONS:
|
|
||||||
option_recommendation_to_cli_option(add_option, rec)
|
|
||||||
|
|
||||||
def encrypt(pdf_path, out_path, password, metadata=None):
|
|
||||||
if metadata == None:
|
|
||||||
title = _('Unknown')
|
|
||||||
author = _('Unknown')
|
|
||||||
else:
|
|
||||||
title = metadata.title
|
|
||||||
author = authors_to_string(metadata.authors)
|
|
||||||
|
|
||||||
out_pdf = PdfFileWriter(title=title, author=author)
|
|
||||||
|
|
||||||
pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
|
|
||||||
for page in pdf.pages:
|
|
||||||
out_pdf.addPage(page)
|
|
||||||
|
|
||||||
with open(out_path, 'wb') as out_file:
|
|
||||||
out_pdf.encrypt(str(password))
|
|
||||||
out_pdf.write(out_file)
|
|
||||||
|
|
||||||
def main(args=sys.argv, name=''):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser(name)
|
|
||||||
add_options(parser)
|
|
||||||
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
args = args[1:]
|
|
||||||
|
|
||||||
if len(args) < 2:
|
|
||||||
print 'Error: A PDF file and decryption password is required.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if not is_valid_pdf(args[0]):
|
|
||||||
print 'Error: Could not read file `%s`.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if is_encrypted(args[0]):
|
|
||||||
print 'Error: file `%s` is already encrypted.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
mi = metadata_from_formats([args[0]])
|
|
||||||
|
|
||||||
encrypt(args[0], opts.output, args[1], mi)
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,85 +0,0 @@
|
|||||||
from __future__ import with_statement
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Merge PDF files into a single PDF document.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os, sys
|
|
||||||
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.ebooks.pdf.verify import is_valid_pdfs, is_encrypted
|
|
||||||
from calibre.utils.podofo import get_podofo
|
|
||||||
from calibre import prints
|
|
||||||
|
|
||||||
USAGE = '\n%prog %%name ' + _('''\
|
|
||||||
file.pdf ...
|
|
||||||
|
|
||||||
Get info about a PDF.
|
|
||||||
''')
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser(name):
|
|
||||||
usage = USAGE.replace('%%name', name)
|
|
||||||
return OptionParser(usage=usage)
|
|
||||||
|
|
||||||
def print_info(pdf_path):
|
|
||||||
podofo = get_podofo()
|
|
||||||
p = podofo.PDFDoc()
|
|
||||||
p.open(pdf_path)
|
|
||||||
|
|
||||||
fmt = lambda x, y: '%-20s: %s'%(x, y)
|
|
||||||
|
|
||||||
print
|
|
||||||
|
|
||||||
prints(fmt(_('Title'), p.title))
|
|
||||||
prints(fmt(_('Author'), p.author))
|
|
||||||
prints(fmt(_('Subject'), p.subject))
|
|
||||||
prints(fmt(_('Creator'), p.creator))
|
|
||||||
prints(fmt(_('Producer'), p.producer))
|
|
||||||
prints(fmt(_('Pages'), p.pages))
|
|
||||||
prints(fmt(_('File Size'), os.stat(pdf_path).st_size))
|
|
||||||
prints(fmt(_('PDF Version'), p.version if p.version else _('Unknown')))
|
|
||||||
|
|
||||||
def main(args=sys.argv, name=''):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser(name)
|
|
||||||
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
args = args[1:]
|
|
||||||
|
|
||||||
if len(args) < 1:
|
|
||||||
print 'Error: No PDF sepecified.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
bad_pdfs = is_valid_pdfs(args)
|
|
||||||
if bad_pdfs != []:
|
|
||||||
for pdf in bad_pdfs:
|
|
||||||
print 'Error: Could not read file `%s`.' % pdf
|
|
||||||
return 1
|
|
||||||
|
|
||||||
enc = False
|
|
||||||
for pdf in args:
|
|
||||||
if is_encrypted(pdf):
|
|
||||||
enc = True
|
|
||||||
print 'Error: file `%s` is encrypted. Please decrypt first.' % pdf
|
|
||||||
if enc:
|
|
||||||
return 1
|
|
||||||
|
|
||||||
for pdf in args:
|
|
||||||
print_info(pdf)
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,115 +0,0 @@
|
|||||||
from __future__ import with_statement
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Merge PDF files into a single PDF document.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os, sys
|
|
||||||
from optparse import OptionGroup, Option
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata.meta import metadata_from_formats
|
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
|
||||||
from calibre.ebooks.pdf.verify import is_valid_pdfs, is_encrypted
|
|
||||||
|
|
||||||
from pyPdf import PdfFileWriter, PdfFileReader
|
|
||||||
|
|
||||||
USAGE = '\n%prog %%name ' + _('''\
|
|
||||||
[options] file1.pdf file2.pdf ...
|
|
||||||
|
|
||||||
Metadata will be used from the first PDF specified.
|
|
||||||
|
|
||||||
Merges individual PDFs.
|
|
||||||
''')
|
|
||||||
|
|
||||||
OPTIONS = set([
|
|
||||||
OptionRecommendation(name='output', recommended_value='merged.pdf',
|
|
||||||
level=OptionRecommendation.HIGH, long_switch='output', short_switch='o',
|
|
||||||
help=_('Path to output file. By default a file is created in the current directory.')),
|
|
||||||
])
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser(name):
|
|
||||||
usage = USAGE.replace('%%name', name)
|
|
||||||
return OptionParser(usage=usage)
|
|
||||||
|
|
||||||
def option_recommendation_to_cli_option(add_option, rec):
|
|
||||||
opt = rec.option
|
|
||||||
switches = ['-'+opt.short_switch] if opt.short_switch else []
|
|
||||||
switches.append('--'+opt.long_switch)
|
|
||||||
attrs = dict(dest=opt.name, help=opt.help,
|
|
||||||
choices=opt.choices, default=rec.recommended_value)
|
|
||||||
add_option(Option(*switches, **attrs))
|
|
||||||
|
|
||||||
def add_options(parser):
|
|
||||||
group = OptionGroup(parser, _('Merge Options:'), _('Options to control the transformation of pdf'))
|
|
||||||
parser.add_option_group(group)
|
|
||||||
add_option = group.add_option
|
|
||||||
|
|
||||||
for rec in OPTIONS:
|
|
||||||
option_recommendation_to_cli_option(add_option, rec)
|
|
||||||
|
|
||||||
def merge_files(in_paths, out_path, metadata=None):
|
|
||||||
if metadata == None:
|
|
||||||
title = _('Unknown')
|
|
||||||
author = _('Unknown')
|
|
||||||
else:
|
|
||||||
title = metadata.title
|
|
||||||
author = authors_to_string(metadata.authors)
|
|
||||||
|
|
||||||
out_pdf = PdfFileWriter(title=title, author=author)
|
|
||||||
|
|
||||||
for pdf_path in in_paths:
|
|
||||||
pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
|
|
||||||
for page in pdf.pages:
|
|
||||||
out_pdf.addPage(page)
|
|
||||||
|
|
||||||
with open(out_path, 'wb') as out_file:
|
|
||||||
out_pdf.write(out_file)
|
|
||||||
|
|
||||||
def main(args=sys.argv, name=''):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser(name)
|
|
||||||
add_options(parser)
|
|
||||||
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
args = args[1:]
|
|
||||||
|
|
||||||
if len(args) < 2:
|
|
||||||
print 'Error: Two or more PDF files are required.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
bad_pdfs = is_valid_pdfs(args)
|
|
||||||
if bad_pdfs != []:
|
|
||||||
for pdf in bad_pdfs:
|
|
||||||
print 'Error: Could not read file `%s`.' % pdf
|
|
||||||
return 1
|
|
||||||
|
|
||||||
enc = False
|
|
||||||
for pdf in args:
|
|
||||||
if is_encrypted(pdf):
|
|
||||||
enc = True
|
|
||||||
print 'Error: file `%s` is encrypted.' % pdf
|
|
||||||
if enc:
|
|
||||||
return 1
|
|
||||||
|
|
||||||
mi = metadata_from_formats([args[0]])
|
|
||||||
|
|
||||||
merge_files(args, opts.output, mi)
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,106 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
from __future__ import with_statement
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Reverse content of PDF.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os, sys
|
|
||||||
from optparse import OptionGroup, Option
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata.meta import metadata_from_formats
|
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
|
||||||
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
|
|
||||||
|
|
||||||
from pyPdf import PdfFileWriter, PdfFileReader
|
|
||||||
|
|
||||||
USAGE = '\n%prog %%name ' + _('''\
|
|
||||||
[options] file.pdf
|
|
||||||
|
|
||||||
Reverse a PDF.
|
|
||||||
''')
|
|
||||||
|
|
||||||
OPTIONS = set([
|
|
||||||
OptionRecommendation(name='output', recommended_value='reversed.pdf',
|
|
||||||
level=OptionRecommendation.HIGH, long_switch='output', short_switch='o',
|
|
||||||
help=_('Path to output file. By default a file is created in the current directory.')),
|
|
||||||
])
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser(name):
|
|
||||||
usage = USAGE.replace('%%name', name)
|
|
||||||
return OptionParser(usage=usage)
|
|
||||||
|
|
||||||
def option_recommendation_to_cli_option(add_option, rec):
|
|
||||||
opt = rec.option
|
|
||||||
switches = ['-'+opt.short_switch] if opt.short_switch else []
|
|
||||||
switches.append('--'+opt.long_switch)
|
|
||||||
attrs = dict(dest=opt.name, help=opt.help,
|
|
||||||
choices=opt.choices, default=rec.recommended_value)
|
|
||||||
add_option(Option(*switches, **attrs))
|
|
||||||
|
|
||||||
def add_options(parser):
|
|
||||||
group = OptionGroup(parser, _('Reverse Options:'), _('Options to control the transformation of pdf'))
|
|
||||||
parser.add_option_group(group)
|
|
||||||
add_option = group.add_option
|
|
||||||
|
|
||||||
for rec in OPTIONS:
|
|
||||||
option_recommendation_to_cli_option(add_option, rec)
|
|
||||||
|
|
||||||
def reverse(pdf_path, out_path, metadata=None):
|
|
||||||
if metadata == None:
|
|
||||||
title = _('Unknown')
|
|
||||||
author = _('Unknown')
|
|
||||||
else:
|
|
||||||
title = metadata.title
|
|
||||||
author = authors_to_string(metadata.authors)
|
|
||||||
|
|
||||||
out_pdf = PdfFileWriter(title=title, author=author)
|
|
||||||
|
|
||||||
pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
|
|
||||||
for page in reversed(pdf.pages):
|
|
||||||
out_pdf.addPage(page)
|
|
||||||
|
|
||||||
with open(out_path, 'wb') as out_file:
|
|
||||||
out_pdf.write(out_file)
|
|
||||||
|
|
||||||
def main(args=sys.argv, name=''):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser(name)
|
|
||||||
add_options(parser)
|
|
||||||
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
args = args[1:]
|
|
||||||
|
|
||||||
if len(args) < 1:
|
|
||||||
print 'Error: A PDF file is required.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if not is_valid_pdf(args[0]):
|
|
||||||
print 'Error: Could not read file `%s`.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if is_encrypted(args[0]):
|
|
||||||
print 'Error: file `%s` is encrypted.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
mi = metadata_from_formats([args[0]])
|
|
||||||
|
|
||||||
reverse(args[0], opts.output, mi)
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,105 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Rotate pages of a PDF.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os, sys
|
|
||||||
from optparse import OptionGroup, Option
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata.meta import metadata_from_formats
|
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
|
||||||
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
|
|
||||||
|
|
||||||
from pyPdf import PdfFileWriter, PdfFileReader
|
|
||||||
|
|
||||||
USAGE = '\n%prog %%name ' + _('''\
|
|
||||||
file.pdf degrees
|
|
||||||
|
|
||||||
Rotate pages of a PDF clockwise.
|
|
||||||
''')
|
|
||||||
|
|
||||||
OPTIONS = set([
|
|
||||||
OptionRecommendation(name='output', recommended_value='rotated.pdf',
|
|
||||||
level=OptionRecommendation.HIGH, long_switch='output', short_switch='o',
|
|
||||||
help=_('Path to output file. By default a file is created in the current directory.')),
|
|
||||||
])
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser(name):
|
|
||||||
usage = USAGE.replace('%%name', name)
|
|
||||||
return OptionParser(usage=usage)
|
|
||||||
|
|
||||||
def option_recommendation_to_cli_option(add_option, rec):
|
|
||||||
opt = rec.option
|
|
||||||
switches = ['-'+opt.short_switch] if opt.short_switch else []
|
|
||||||
switches.append('--'+opt.long_switch)
|
|
||||||
attrs = dict(dest=opt.name, help=opt.help,
|
|
||||||
choices=opt.choices, default=rec.recommended_value)
|
|
||||||
add_option(Option(*switches, **attrs))
|
|
||||||
|
|
||||||
def add_options(parser):
|
|
||||||
group = OptionGroup(parser, _('Rotate Options:'), _('Options to control the transformation of pdf'))
|
|
||||||
parser.add_option_group(group)
|
|
||||||
add_option = group.add_option
|
|
||||||
|
|
||||||
for rec in OPTIONS:
|
|
||||||
option_recommendation_to_cli_option(add_option, rec)
|
|
||||||
|
|
||||||
def rotate(pdf_path, out_path, degrees, metadata=None):
|
|
||||||
if metadata == None:
|
|
||||||
title = _('Unknown')
|
|
||||||
author = _('Unknown')
|
|
||||||
else:
|
|
||||||
title = metadata.title
|
|
||||||
author = authors_to_string(metadata.authors)
|
|
||||||
|
|
||||||
out_pdf = PdfFileWriter(title=title, author=author)
|
|
||||||
|
|
||||||
pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
|
|
||||||
for page in pdf.pages:
|
|
||||||
out_pdf.addPage(page.rotateClockwise(int(degrees)))
|
|
||||||
|
|
||||||
with open(out_path, 'wb') as out_file:
|
|
||||||
out_pdf.write(out_file)
|
|
||||||
|
|
||||||
def main(args=sys.argv, name=''):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser(name)
|
|
||||||
add_options(parser)
|
|
||||||
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
args = args[1:]
|
|
||||||
|
|
||||||
if len(args) < 2:
|
|
||||||
print 'Error: A PDF file and how many degrees to rotate is required.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if not is_valid_pdf(args[0]):
|
|
||||||
print 'Error: Could not read file `%s`.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if is_encrypted(args[0]):
|
|
||||||
print 'Error: file `%s` is encrypted.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
mi = metadata_from_formats([args[0]])
|
|
||||||
|
|
||||||
rotate(args[0], opts.output, args[1], mi)
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,204 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
from __future__ import with_statement
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Split PDF file into multiple PDF documents.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os, sys, re
|
|
||||||
from optparse import OptionGroup, Option
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata.meta import metadata_from_formats
|
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
|
||||||
from calibre.utils.config import OptionParser
|
|
||||||
from calibre.utils.logging import Log
|
|
||||||
from calibre.constants import preferred_encoding
|
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
|
||||||
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
|
|
||||||
|
|
||||||
from pyPdf import PdfFileWriter, PdfFileReader
|
|
||||||
|
|
||||||
USAGE = _('''
|
|
||||||
%prog %%name [options] file.pdf page_to_split_on ...
|
|
||||||
%prog %%name [options] file.pdf page_range_to_split_on ...
|
|
||||||
|
|
||||||
Ex.
|
|
||||||
|
|
||||||
%prog %%name file.pdf 6
|
|
||||||
%prog %%name file.pdf 6-12
|
|
||||||
%prog %%name file.pdf 6-12 8 10 9-20
|
|
||||||
|
|
||||||
Split a PDF.
|
|
||||||
''')
|
|
||||||
|
|
||||||
OPTIONS = set([
|
|
||||||
OptionRecommendation(name='output', recommended_value='split.pdf',
|
|
||||||
level=OptionRecommendation.HIGH, long_switch='output', short_switch='o',
|
|
||||||
help=_('Path to output file. By default a file is created in the current directory.')),
|
|
||||||
])
|
|
||||||
|
|
||||||
def print_help(parser, log):
|
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
|
||||||
log(help)
|
|
||||||
|
|
||||||
def option_parser(name):
|
|
||||||
usage = USAGE.replace('%%name', name)
|
|
||||||
return OptionParser(usage=usage)
|
|
||||||
|
|
||||||
def option_recommendation_to_cli_option(add_option, rec):
|
|
||||||
opt = rec.option
|
|
||||||
switches = ['-'+opt.short_switch] if opt.short_switch else []
|
|
||||||
switches.append('--'+opt.long_switch)
|
|
||||||
attrs = dict(dest=opt.name, help=opt.help,
|
|
||||||
choices=opt.choices, default=rec.recommended_value)
|
|
||||||
add_option(Option(*switches, **attrs))
|
|
||||||
|
|
||||||
def add_options(parser):
|
|
||||||
group = OptionGroup(parser, _('Split Options:'), _('Options to control the transformation of pdf'))
|
|
||||||
parser.add_option_group(group)
|
|
||||||
add_option = group.add_option
|
|
||||||
|
|
||||||
for rec in OPTIONS:
|
|
||||||
option_recommendation_to_cli_option(add_option, rec)
|
|
||||||
|
|
||||||
def split_pdf(in_path, pages, page_ranges, out_name, metadata=None):
|
|
||||||
pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb'))
|
|
||||||
total_pages = pdf.numPages - 1
|
|
||||||
|
|
||||||
for index in pages+page_ranges:
|
|
||||||
if index in pages:
|
|
||||||
write_pdf(pdf, out_name, '%s' % (index + 1), index, total_pages, metadata)
|
|
||||||
else:
|
|
||||||
|
|
||||||
write_pdf(pdf, out_name, '%s-%s' % (index[0] + 1, index[1] + 1), index[0], index[1], metadata)
|
|
||||||
|
|
||||||
def write_pdf(pdf, name, suffix, start, end, metadata=None):
|
|
||||||
if metadata == None:
|
|
||||||
title = _('Unknown')
|
|
||||||
author = _('Unknown')
|
|
||||||
else:
|
|
||||||
title = metadata.title
|
|
||||||
author = authors_to_string(metadata.authors)
|
|
||||||
|
|
||||||
out_pdf = PdfFileWriter(title=title, author=author)
|
|
||||||
for page_num in range(start, end + 1):
|
|
||||||
out_pdf.addPage(pdf.getPage(page_num))
|
|
||||||
with open('%s%s.pdf' % (name, suffix), 'wb') as out_file:
|
|
||||||
out_pdf.write(out_file)
|
|
||||||
|
|
||||||
def split_args(args):
|
|
||||||
pdf = ''
|
|
||||||
pages = []
|
|
||||||
page_ranges = []
|
|
||||||
bad = []
|
|
||||||
|
|
||||||
for arg in args:
|
|
||||||
arg = arg.strip()
|
|
||||||
# Find the pdf input
|
|
||||||
if re.search('(?iu)^.*?\.pdf[ ]*$', arg) != None:
|
|
||||||
if pdf == '':
|
|
||||||
pdf = arg
|
|
||||||
else:
|
|
||||||
bad.append(arg)
|
|
||||||
# Find single indexes
|
|
||||||
elif re.search('^[ ]*\d+[ ]*$', arg) != None:
|
|
||||||
pages.append(arg)
|
|
||||||
# Find index ranges
|
|
||||||
elif re.search('^[ ]*\d+[ ]*-[ ]*\d+[ ]*$', arg) != None:
|
|
||||||
mo = re.search('^[ ]*(?P<start>\d+)[ ]*-[ ]*(?P<end>\d+)[ ]*$', arg)
|
|
||||||
start = mo.group('start')
|
|
||||||
end = mo.group('end')
|
|
||||||
|
|
||||||
# check to see if the range is really a single index
|
|
||||||
if start == end:
|
|
||||||
pages.append(start)
|
|
||||||
else:
|
|
||||||
page_ranges.append([start, end])
|
|
||||||
else:
|
|
||||||
bad.append(arg)
|
|
||||||
|
|
||||||
bad = sorted(list(set(bad)))
|
|
||||||
|
|
||||||
return pdf, pages, page_ranges, bad
|
|
||||||
|
|
||||||
# Remove duplicates from pages and page_ranges.
|
|
||||||
# Set pages higher than the total number of pages in the pdf to the last page.
|
|
||||||
# Return pages and page_ranges as lists of ints.
|
|
||||||
def clean_page_list(pdf_path, pages, page_ranges):
|
|
||||||
pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
|
|
||||||
|
|
||||||
total_pages = pdf.numPages
|
|
||||||
sorted_pages = []
|
|
||||||
sorted_ranges = []
|
|
||||||
|
|
||||||
for index in pages:
|
|
||||||
index = int(index)
|
|
||||||
if index > total_pages:
|
|
||||||
sorted_pages.append(total_pages - 1)
|
|
||||||
else:
|
|
||||||
sorted_pages.append(index - 1)
|
|
||||||
|
|
||||||
for start, end in page_ranges:
|
|
||||||
start = int(start)
|
|
||||||
end = int(end)
|
|
||||||
|
|
||||||
if start > total_pages and end > total_pages:
|
|
||||||
sorted_pages.append(total_pages - 1)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if start > total_pages:
|
|
||||||
start = total_pages
|
|
||||||
if end > total_pages:
|
|
||||||
end = total_pages
|
|
||||||
page_range = sorted([start - 1, end - 1])
|
|
||||||
if page_range not in sorted_ranges:
|
|
||||||
sorted_ranges.append(page_range)
|
|
||||||
|
|
||||||
# Remove duplicates and sort
|
|
||||||
pages = sorted(list(set(sorted_pages)))
|
|
||||||
page_ranges = sorted(sorted_ranges)
|
|
||||||
|
|
||||||
return pages, page_ranges
|
|
||||||
|
|
||||||
def main(args=sys.argv, name=''):
|
|
||||||
log = Log()
|
|
||||||
parser = option_parser(name)
|
|
||||||
add_options(parser)
|
|
||||||
|
|
||||||
opts, args = parser.parse_args(args)
|
|
||||||
|
|
||||||
pdf, pages, page_ranges, unknown = split_args(args[1:])
|
|
||||||
|
|
||||||
if pdf == '' and (pages == [] or page_ranges == []):
|
|
||||||
print 'Error: PDF and where to split is required.\n'
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if unknown != []:
|
|
||||||
for arg in unknown:
|
|
||||||
print 'Error: Unknown argument `%s`' % arg
|
|
||||||
print_help(parser, log)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if not is_valid_pdf(pdf):
|
|
||||||
print 'Error: Could not read file `%s`.' % pdf
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if is_encrypted(pdf):
|
|
||||||
print 'Error: file `%s` is encrypted.' % args[0]
|
|
||||||
return 1
|
|
||||||
|
|
||||||
pages, page_ranges = clean_page_list(pdf, pages, page_ranges)
|
|
||||||
|
|
||||||
mi = metadata_from_formats([pdf])
|
|
||||||
|
|
||||||
split_pdf(pdf, pages, page_ranges, os.path.splitext(opts.output)[0], mi)
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main())
|
|
@ -1,44 +0,0 @@
|
|||||||
from __future__ import with_statement
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
'''
|
|
||||||
Verify PDF files.
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
from pyPdf import PdfFileReader
|
|
||||||
|
|
||||||
def is_valid_pdf(pdf_path):
|
|
||||||
'''
|
|
||||||
Returns True if the pdf file is valid.
|
|
||||||
'''
|
|
||||||
|
|
||||||
try:
|
|
||||||
with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
|
|
||||||
PdfFileReader(pdf_file)
|
|
||||||
except:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def is_valid_pdfs(pdf_paths):
|
|
||||||
'''
|
|
||||||
Returns a list of invalid pdf files.
|
|
||||||
'''
|
|
||||||
|
|
||||||
invalid = []
|
|
||||||
for pdf_path in pdf_paths:
|
|
||||||
if not is_valid_pdf(pdf_path):
|
|
||||||
invalid.append(pdf_path)
|
|
||||||
return invalid
|
|
||||||
|
|
||||||
def is_encrypted(pdf_path):
|
|
||||||
with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
|
|
||||||
pdf = PdfFileReader(pdf_file)
|
|
||||||
if pdf.isEncrypted:
|
|
||||||
return True
|
|
||||||
return False
|
|
@ -29,7 +29,6 @@ entry_points = {
|
|||||||
'calibre-parallel = calibre.utils.ipc.worker:main',
|
'calibre-parallel = calibre.utils.ipc.worker:main',
|
||||||
'calibre-customize = calibre.customize.ui:main',
|
'calibre-customize = calibre.customize.ui:main',
|
||||||
'calibre-complete = calibre.utils.complete:main',
|
'calibre-complete = calibre.utils.complete:main',
|
||||||
'pdfmanipulate = calibre.ebooks.pdf.manipulate.cli:main',
|
|
||||||
'fetch-ebook-metadata = calibre.ebooks.metadata.sources.cli:main',
|
'fetch-ebook-metadata = calibre.ebooks.metadata.sources.cli:main',
|
||||||
'epub-fix = calibre.ebooks.epub.fix.main:main',
|
'epub-fix = calibre.ebooks.epub.fix.main:main',
|
||||||
'calibre-smtp = calibre.utils.smtp:main',
|
'calibre-smtp = calibre.utils.smtp:main',
|
||||||
|
@ -1,2 +0,0 @@
|
|||||||
from pdf import PdfFileReader, PdfFileWriter
|
|
||||||
__all__ = ["pdf"]
|
|
@ -1,252 +0,0 @@
|
|||||||
# vim: sw=4:expandtab:foldmethod=marker
|
|
||||||
#
|
|
||||||
# Copyright (c) 2006, Mathieu Fenniak
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# Redistribution and use in source and binary forms, with or without
|
|
||||||
# modification, are permitted provided that the following conditions are
|
|
||||||
# met:
|
|
||||||
#
|
|
||||||
# * Redistributions of source code must retain the above copyright notice,
|
|
||||||
# this list of conditions and the following disclaimer.
|
|
||||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
||||||
# this list of conditions and the following disclaimer in the documentation
|
|
||||||
# and/or other materials provided with the distribution.
|
|
||||||
# * The name of the author may not be used to endorse or promote products
|
|
||||||
# derived from this software without specific prior written permission.
|
|
||||||
#
|
|
||||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
||||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
||||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
||||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
||||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
Implementation of stream filters for PDF.
|
|
||||||
"""
|
|
||||||
__author__ = "Mathieu Fenniak"
|
|
||||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
|
||||||
|
|
||||||
from utils import PdfReadError
|
|
||||||
try:
|
|
||||||
from cStringIO import StringIO
|
|
||||||
except ImportError:
|
|
||||||
from StringIO import StringIO
|
|
||||||
|
|
||||||
try:
|
|
||||||
import zlib
|
|
||||||
def decompress(data):
|
|
||||||
return zlib.decompress(data)
|
|
||||||
def compress(data):
|
|
||||||
return zlib.compress(data)
|
|
||||||
except ImportError:
|
|
||||||
# Unable to import zlib. Attempt to use the System.IO.Compression
|
|
||||||
# library from the .NET framework. (IronPython only)
|
|
||||||
import System
|
|
||||||
from System import IO, Collections, Array
|
|
||||||
def _string_to_bytearr(buf):
|
|
||||||
retval = Array.CreateInstance(System.Byte, len(buf))
|
|
||||||
for i in range(len(buf)):
|
|
||||||
retval[i] = ord(buf[i])
|
|
||||||
return retval
|
|
||||||
def _bytearr_to_string(bytes):
|
|
||||||
retval = ""
|
|
||||||
for i in range(bytes.Length):
|
|
||||||
retval += chr(bytes[i])
|
|
||||||
return retval
|
|
||||||
def _read_bytes(stream):
|
|
||||||
ms = IO.MemoryStream()
|
|
||||||
buf = Array.CreateInstance(System.Byte, 2048)
|
|
||||||
while True:
|
|
||||||
bytes = stream.Read(buf, 0, buf.Length)
|
|
||||||
if bytes == 0:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
ms.Write(buf, 0, bytes)
|
|
||||||
retval = ms.ToArray()
|
|
||||||
ms.Close()
|
|
||||||
return retval
|
|
||||||
def decompress(data):
|
|
||||||
bytes = _string_to_bytearr(data)
|
|
||||||
ms = IO.MemoryStream()
|
|
||||||
ms.Write(bytes, 0, bytes.Length)
|
|
||||||
ms.Position = 0 # fseek 0
|
|
||||||
gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)
|
|
||||||
bytes = _read_bytes(gz)
|
|
||||||
retval = _bytearr_to_string(bytes)
|
|
||||||
gz.Close()
|
|
||||||
return retval
|
|
||||||
def compress(data):
|
|
||||||
bytes = _string_to_bytearr(data)
|
|
||||||
ms = IO.MemoryStream()
|
|
||||||
gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
|
|
||||||
gz.Write(bytes, 0, bytes.Length)
|
|
||||||
gz.Close()
|
|
||||||
ms.Position = 0 # fseek 0
|
|
||||||
bytes = ms.ToArray()
|
|
||||||
retval = _bytearr_to_string(bytes)
|
|
||||||
ms.Close()
|
|
||||||
return retval
|
|
||||||
|
|
||||||
|
|
||||||
class FlateDecode(object):
|
|
||||||
def decode(data, decodeParms):
|
|
||||||
data = decompress(data)
|
|
||||||
predictor = 1
|
|
||||||
if decodeParms:
|
|
||||||
predictor = decodeParms.get("/Predictor", 1)
|
|
||||||
# predictor 1 == no predictor
|
|
||||||
if predictor != 1:
|
|
||||||
columns = decodeParms["/Columns"]
|
|
||||||
# PNG prediction:
|
|
||||||
if predictor >= 10 and predictor <= 15:
|
|
||||||
output = StringIO()
|
|
||||||
# PNG prediction can vary from row to row
|
|
||||||
rowlength = columns + 1
|
|
||||||
assert len(data) % rowlength == 0
|
|
||||||
prev_rowdata = (0,) * rowlength
|
|
||||||
for row in xrange(len(data) / rowlength):
|
|
||||||
rowdata = [ord(x) for x in data[(row*rowlength):((row+1)*rowlength)]]
|
|
||||||
filterByte = rowdata[0]
|
|
||||||
if filterByte == 0:
|
|
||||||
pass
|
|
||||||
elif filterByte == 1:
|
|
||||||
for i in range(2, rowlength):
|
|
||||||
rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256
|
|
||||||
elif filterByte == 2:
|
|
||||||
for i in range(1, rowlength):
|
|
||||||
rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
|
|
||||||
else:
|
|
||||||
# unsupported PNG filter
|
|
||||||
raise PdfReadError("Unsupported PNG filter %r" % filterByte)
|
|
||||||
prev_rowdata = rowdata
|
|
||||||
output.write(''.join([chr(x) for x in rowdata[1:]]))
|
|
||||||
data = output.getvalue()
|
|
||||||
else:
|
|
||||||
# unsupported predictor
|
|
||||||
raise PdfReadError("Unsupported flatedecode predictor %r" % predictor)
|
|
||||||
return data
|
|
||||||
decode = staticmethod(decode)
|
|
||||||
|
|
||||||
def encode(data):
|
|
||||||
return compress(data)
|
|
||||||
encode = staticmethod(encode)
|
|
||||||
|
|
||||||
class ASCIIHexDecode(object):
|
|
||||||
def decode(data, decodeParms=None):
|
|
||||||
retval = ""
|
|
||||||
char = ""
|
|
||||||
x = 0
|
|
||||||
while True:
|
|
||||||
c = data[x]
|
|
||||||
if c == ">":
|
|
||||||
break
|
|
||||||
elif c.isspace():
|
|
||||||
x += 1
|
|
||||||
continue
|
|
||||||
char += c
|
|
||||||
if len(char) == 2:
|
|
||||||
retval += chr(int(char, base=16))
|
|
||||||
char = ""
|
|
||||||
x += 1
|
|
||||||
assert char == ""
|
|
||||||
return retval
|
|
||||||
decode = staticmethod(decode)
|
|
||||||
|
|
||||||
class ASCII85Decode(object):
|
|
||||||
def decode(data, decodeParms=None):
|
|
||||||
retval = ""
|
|
||||||
group = []
|
|
||||||
x = 0
|
|
||||||
hitEod = False
|
|
||||||
# remove all whitespace from data
|
|
||||||
data = [y for y in data if not (y in ' \n\r\t')]
|
|
||||||
while not hitEod:
|
|
||||||
c = data[x]
|
|
||||||
if len(retval) == 0 and c == "<" and data[x+1] == "~":
|
|
||||||
x += 2
|
|
||||||
continue
|
|
||||||
#elif c.isspace():
|
|
||||||
# x += 1
|
|
||||||
# continue
|
|
||||||
elif c == 'z':
|
|
||||||
assert len(group) == 0
|
|
||||||
retval += '\x00\x00\x00\x00'
|
|
||||||
continue
|
|
||||||
elif c == "~" and data[x+1] == ">":
|
|
||||||
if len(group) != 0:
|
|
||||||
# cannot have a final group of just 1 char
|
|
||||||
assert len(group) > 1
|
|
||||||
cnt = len(group) - 1
|
|
||||||
group += [ 85, 85, 85 ]
|
|
||||||
hitEod = cnt
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
c = ord(c) - 33
|
|
||||||
assert c >= 0 and c < 85
|
|
||||||
group += [ c ]
|
|
||||||
if len(group) >= 5:
|
|
||||||
b = group[0] * (85**4) + \
|
|
||||||
group[1] * (85**3) + \
|
|
||||||
group[2] * (85**2) + \
|
|
||||||
group[3] * 85 + \
|
|
||||||
group[4]
|
|
||||||
assert b < (2**32 - 1)
|
|
||||||
c4 = chr((b >> 0) % 256)
|
|
||||||
c3 = chr((b >> 8) % 256)
|
|
||||||
c2 = chr((b >> 16) % 256)
|
|
||||||
c1 = chr(b >> 24)
|
|
||||||
retval += (c1 + c2 + c3 + c4)
|
|
||||||
if hitEod:
|
|
||||||
retval = retval[:-4+hitEod]
|
|
||||||
group = []
|
|
||||||
x += 1
|
|
||||||
return retval
|
|
||||||
decode = staticmethod(decode)
|
|
||||||
|
|
||||||
def decodeStreamData(stream):
|
|
||||||
from generic import NameObject
|
|
||||||
filters = stream.get("/Filter", ())
|
|
||||||
if len(filters) and not isinstance(filters[0], NameObject):
|
|
||||||
# we have a single filter instance
|
|
||||||
filters = (filters,)
|
|
||||||
data = stream._data
|
|
||||||
for filterType in filters:
|
|
||||||
if filterType == "/FlateDecode":
|
|
||||||
data = FlateDecode.decode(data, stream.get("/DecodeParms"))
|
|
||||||
elif filterType == "/ASCIIHexDecode":
|
|
||||||
data = ASCIIHexDecode.decode(data)
|
|
||||||
elif filterType == "/ASCII85Decode":
|
|
||||||
data = ASCII85Decode.decode(data)
|
|
||||||
elif filterType == "/Crypt":
|
|
||||||
decodeParams = stream.get("/DecodeParams", {})
|
|
||||||
if "/Name" not in decodeParams and "/Type" not in decodeParams:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
|
|
||||||
else:
|
|
||||||
# unsupported filter
|
|
||||||
raise NotImplementedError("unsupported filter %s" % filterType)
|
|
||||||
return data
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
assert "abc" == ASCIIHexDecode.decode('61\n626\n3>')
|
|
||||||
|
|
||||||
ascii85Test = """
|
|
||||||
<~9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
|
|
||||||
O<DJ+*.@<*K0@<6L(Df-\\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
|
|
||||||
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
|
|
||||||
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
|
|
||||||
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
|
|
||||||
"""
|
|
||||||
ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure."
|
|
||||||
assert ASCII85Decode.decode(ascii85Test) == ascii85_originalText
|
|
||||||
|
|
@ -1,780 +0,0 @@
|
|||||||
# vim: sw=4:expandtab:foldmethod=marker
|
|
||||||
#
|
|
||||||
# Copyright (c) 2006, Mathieu Fenniak
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# Redistribution and use in source and binary forms, with or without
|
|
||||||
# modification, are permitted provided that the following conditions are
|
|
||||||
# met:
|
|
||||||
#
|
|
||||||
# * Redistributions of source code must retain the above copyright notice,
|
|
||||||
# this list of conditions and the following disclaimer.
|
|
||||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
||||||
# this list of conditions and the following disclaimer in the documentation
|
|
||||||
# and/or other materials provided with the distribution.
|
|
||||||
# * The name of the author may not be used to endorse or promote products
|
|
||||||
# derived from this software without specific prior written permission.
|
|
||||||
#
|
|
||||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
||||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
||||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
||||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
||||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
Implementation of generic PDF objects (dictionary, number, string, and so on)
|
|
||||||
"""
|
|
||||||
__author__ = "Mathieu Fenniak"
|
|
||||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
|
||||||
|
|
||||||
import re
|
|
||||||
from utils import readNonWhitespace, RC4_encrypt
|
|
||||||
import filters
|
|
||||||
import utils
|
|
||||||
import decimal
|
|
||||||
import codecs
|
|
||||||
|
|
||||||
def readObject(stream, pdf):
|
|
||||||
tok = stream.read(1)
|
|
||||||
stream.seek(-1, 1) # reset to start
|
|
||||||
if tok == 't' or tok == 'f':
|
|
||||||
# boolean object
|
|
||||||
return BooleanObject.readFromStream(stream)
|
|
||||||
elif tok == '(':
|
|
||||||
# string object
|
|
||||||
return readStringFromStream(stream)
|
|
||||||
elif tok == '/':
|
|
||||||
# name object
|
|
||||||
return NameObject.readFromStream(stream)
|
|
||||||
elif tok == '[':
|
|
||||||
# array object
|
|
||||||
return ArrayObject.readFromStream(stream, pdf)
|
|
||||||
elif tok == 'n':
|
|
||||||
# null object
|
|
||||||
return NullObject.readFromStream(stream)
|
|
||||||
elif tok == '<':
|
|
||||||
# hexadecimal string OR dictionary
|
|
||||||
peek = stream.read(2)
|
|
||||||
stream.seek(-2, 1) # reset to start
|
|
||||||
if peek == '<<':
|
|
||||||
return DictionaryObject.readFromStream(stream, pdf)
|
|
||||||
else:
|
|
||||||
return readHexStringFromStream(stream)
|
|
||||||
elif tok == '%':
|
|
||||||
# comment
|
|
||||||
while tok not in ('\r', '\n'):
|
|
||||||
tok = stream.read(1)
|
|
||||||
tok = readNonWhitespace(stream)
|
|
||||||
stream.seek(-1, 1)
|
|
||||||
return readObject(stream, pdf)
|
|
||||||
else:
|
|
||||||
# number object OR indirect reference
|
|
||||||
if tok == '+' or tok == '-':
|
|
||||||
# number
|
|
||||||
return NumberObject.readFromStream(stream)
|
|
||||||
peek = stream.read(20)
|
|
||||||
stream.seek(-len(peek), 1) # reset to start
|
|
||||||
if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) != None:
|
|
||||||
return IndirectObject.readFromStream(stream, pdf)
|
|
||||||
else:
|
|
||||||
return NumberObject.readFromStream(stream)
|
|
||||||
|
|
||||||
class PdfObject(object):
|
|
||||||
def getObject(self):
|
|
||||||
"""Resolves indirect references."""
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
class NullObject(PdfObject):
|
|
||||||
def writeToStream(self, stream, encryption_key):
|
|
||||||
stream.write("null")
|
|
||||||
|
|
||||||
def readFromStream(stream):
|
|
||||||
nulltxt = stream.read(4)
|
|
||||||
if nulltxt != "null":
|
|
||||||
raise utils.PdfReadError, "error reading null object"
|
|
||||||
return NullObject()
|
|
||||||
readFromStream = staticmethod(readFromStream)
|
|
||||||
|
|
||||||
|
|
||||||
class BooleanObject(PdfObject):
|
|
||||||
def __init__(self, value):
|
|
||||||
self.value = value
|
|
||||||
|
|
||||||
def writeToStream(self, stream, encryption_key):
|
|
||||||
if self.value:
|
|
||||||
stream.write("true")
|
|
||||||
else:
|
|
||||||
stream.write("false")
|
|
||||||
|
|
||||||
def readFromStream(stream):
|
|
||||||
word = stream.read(4)
|
|
||||||
if word == "true":
|
|
||||||
return BooleanObject(True)
|
|
||||||
elif word == "fals":
|
|
||||||
stream.read(1)
|
|
||||||
return BooleanObject(False)
|
|
||||||
assert False
|
|
||||||
readFromStream = staticmethod(readFromStream)
|
|
||||||
|
|
||||||
|
|
||||||
class ArrayObject(list, PdfObject):
|
|
||||||
def writeToStream(self, stream, encryption_key):
|
|
||||||
stream.write("[")
|
|
||||||
for data in self:
|
|
||||||
stream.write(" ")
|
|
||||||
data.writeToStream(stream, encryption_key)
|
|
||||||
stream.write(" ]")
|
|
||||||
|
|
||||||
def readFromStream(stream, pdf):
|
|
||||||
arr = ArrayObject()
|
|
||||||
tmp = stream.read(1)
|
|
||||||
if tmp != "[":
|
|
||||||
raise utils.PdfReadError, "error reading array"
|
|
||||||
while True:
|
|
||||||
# skip leading whitespace
|
|
||||||
tok = stream.read(1)
|
|
||||||
while tok.isspace():
|
|
||||||
tok = stream.read(1)
|
|
||||||
stream.seek(-1, 1)
|
|
||||||
# check for array ending
|
|
||||||
peekahead = stream.read(1)
|
|
||||||
if peekahead == "]":
|
|
||||||
break
|
|
||||||
stream.seek(-1, 1)
|
|
||||||
# read and append obj
|
|
||||||
arr.append(readObject(stream, pdf))
|
|
||||||
return arr
|
|
||||||
readFromStream = staticmethod(readFromStream)
|
|
||||||
|
|
||||||
|
|
||||||
class IndirectObject(PdfObject):
|
|
||||||
def __init__(self, idnum, generation, pdf):
|
|
||||||
self.idnum = idnum
|
|
||||||
self.generation = generation
|
|
||||||
self.pdf = pdf
|
|
||||||
|
|
||||||
def getObject(self):
|
|
||||||
return self.pdf.getObject(self).getObject()
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return "IndirectObject(%r, %r)" % (self.idnum, self.generation)
|
|
||||||
|
|
||||||
def __eq__(self, other):
|
|
||||||
return (
|
|
||||||
other != None and
|
|
||||||
isinstance(other, IndirectObject) and
|
|
||||||
self.idnum == other.idnum and
|
|
||||||
self.generation == other.generation and
|
|
||||||
self.pdf is other.pdf
|
|
||||||
)
|
|
||||||
|
|
||||||
def __ne__(self, other):
|
|
||||||
return not self.__eq__(other)
|
|
||||||
|
|
||||||
def writeToStream(self, stream, encryption_key):
|
|
||||||
stream.write("%s %s R" % (self.idnum, self.generation))
|
|
||||||
|
|
||||||
def readFromStream(stream, pdf):
|
|
||||||
idnum = ""
|
|
||||||
while True:
|
|
||||||
tok = stream.read(1)
|
|
||||||
if tok.isspace():
|
|
||||||
break
|
|
||||||
idnum += tok
|
|
||||||
generation = ""
|
|
||||||
while True:
|
|
||||||
tok = stream.read(1)
|
|
||||||
if tok.isspace():
|
|
||||||
break
|
|
||||||
generation += tok
|
|
||||||
r = stream.read(1)
|
|
||||||
if r != "R":
|
|
||||||
raise utils.PdfReadError("error reading indirect object reference")
|
|
||||||
return IndirectObject(int(idnum), int(generation), pdf)
|
|
||||||
readFromStream = staticmethod(readFromStream)
|
|
||||||
|
|
||||||
|
|
||||||
class FloatObject(decimal.Decimal, PdfObject):
|
|
||||||
def __new__(cls, value="0", context=None):
|
|
||||||
return decimal.Decimal.__new__(cls, str(value), context)
|
|
||||||
def __repr__(self):
|
|
||||||
return str(self)
|
|
||||||
def writeToStream(self, stream, encryption_key):
|
|
||||||
stream.write(str(self))
|
|
||||||
|
|
||||||
|
|
||||||
class NumberObject(int, PdfObject):
|
|
||||||
def __init__(self, value):
|
|
||||||
int.__init__(self, value)
|
|
||||||
|
|
||||||
def writeToStream(self, stream, encryption_key):
|
|
||||||
stream.write(repr(self))
|
|
||||||
|
|
||||||
def readFromStream(stream):
|
|
||||||
name = ""
|
|
||||||
while True:
|
|
||||||
tok = stream.read(1)
|
|
||||||
if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit():
|
|
||||||
stream.seek(-1, 1)
|
|
||||||
break
|
|
||||||
name += tok
|
|
||||||
if name.find(".") != -1:
|
|
||||||
return FloatObject(name)
|
|
||||||
else:
|
|
||||||
return NumberObject(name)
|
|
||||||
readFromStream = staticmethod(readFromStream)
|
|
||||||
|
|
||||||
|
|
||||||
##
|
|
||||||
# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
|
|
||||||
# TextStringObject to represent the string.
|
|
||||||
def createStringObject(string):
|
|
||||||
if isinstance(string, unicode):
|
|
||||||
return TextStringObject(string)
|
|
||||||
elif isinstance(string, str):
|
|
||||||
if string.startswith(codecs.BOM_UTF16_BE):
|
|
||||||
retval = TextStringObject(string.decode("utf-16"))
|
|
||||||
retval.autodetect_utf16 = True
|
|
||||||
return retval
|
|
||||||
else:
|
|
||||||
# This is probably a big performance hit here, but we need to
|
|
||||||
# convert string objects into the text/unicode-aware version if
|
|
||||||
# possible... and the only way to check if that's possible is
|
|
||||||
# to try. Some strings are strings, some are just byte arrays.
|
|
||||||
try:
|
|
||||||
retval = TextStringObject(decode_pdfdocencoding(string))
|
|
||||||
retval.autodetect_pdfdocencoding = True
|
|
||||||
return retval
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
return ByteStringObject(string)
|
|
||||||
else:
|
|
||||||
raise TypeError("createStringObject should have str or unicode arg")
|
|
||||||
|
|
||||||
|
|
||||||
def readHexStringFromStream(stream):
|
|
||||||
stream.read(1)
|
|
||||||
txt = ""
|
|
||||||
x = ""
|
|
||||||
while True:
|
|
||||||
tok = readNonWhitespace(stream)
|
|
||||||
if tok == ">":
|
|
||||||
break
|
|
||||||
x += tok
|
|
||||||
if len(x) == 2:
|
|
||||||
txt += chr(int(x, base=16))
|
|
||||||
x = ""
|
|
||||||
if len(x) == 1:
|
|
||||||
x += "0"
|
|
||||||
if len(x) == 2:
|
|
||||||
txt += chr(int(x, base=16))
|
|
||||||
return createStringObject(txt)
|
|
||||||
|
|
||||||
|
|
||||||
def readStringFromStream(stream):
    # Read a PDF literal string "(...)"; the opening '(' is consumed (and
    # discarded) by the first read.  Balanced nested parentheses are tracked
    # via *parens*; backslash escape sequences are expanded inline.
    tok = stream.read(1)
    parens = 1
    txt = ""
    while True:
        tok = stream.read(1)
        if tok == "(":
            parens += 1
        elif tok == ")":
            parens -= 1
            if parens == 0:
                # Matching close of the outermost '(' -- string complete.
                break
        elif tok == "\\":
            # Escape sequence: translate the character(s) after the backslash.
            tok = stream.read(1)
            if tok == "n":
                tok = "\n"
            elif tok == "r":
                tok = "\r"
            elif tok == "t":
                tok = "\t"
            elif tok == "b":
                tok = "\b"
            elif tok == "f":
                tok = "\f"
            elif tok == "(":
                tok = "("
            elif tok == ")":
                tok = ")"
            elif tok == "\\":
                tok = "\\"
            elif tok.isdigit():
                # Octal escape: two more digits are always consumed here.
                # NOTE(review): the PDF spec also permits 1- or 2-digit octal
                # escapes; this path assumes exactly three digits -- confirm.
                tok += stream.read(2)
                tok = chr(int(tok, base=8))
            elif tok in "\n\r":
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if not tok in "\n\r":
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = ''
            else:
                raise utils.PdfReadError("Unexpected escaped string")
        txt += tok
    return createStringObject(txt)
|
|
||||||
|
|
||||||
|
|
||||||
##
# Represents a string object where the text encoding could not be determined.
# This occurs quite often, as the PDF spec doesn't provide an alternate way to
# represent strings -- for example, the encryption data stored in files (like
# /O) is clearly not text, but is still stored in a "String" object.
class ByteStringObject(str, PdfObject):

    ##
    # For compatibility with TextStringObject.original_bytes.  This method
    # returns self.
    original_bytes = property(lambda self: self)

    def writeToStream(self, stream, encryption_key):
        # Serialize as a hex string "<...>"; encrypt first when a key is given.
        bytearr = self
        if encryption_key:
            bytearr = RC4_encrypt(encryption_key, bytearr)
        stream.write("<")
        stream.write(bytearr.encode("hex"))
        stream.write(">")
|
|
||||||
|
|
||||||
|
|
||||||
##
# Represents a string object that has been decoded into a real unicode string.
# If read from a PDF document, this string appeared to match the
# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
# occur.
class TextStringObject(unicode, PdfObject):
    # Flags recording how the string was auto-detected on read; consulted by
    # get_original_bytes to reconstruct the exact on-disk byte form.
    autodetect_pdfdocencoding = False
    autodetect_utf16 = False

    ##
    # It is occasionally possible that a text string object gets created where
    # a byte string object was expected due to the autodetection mechanism --
    # if that occurs, this "original_bytes" property can be used to
    # back-calculate what the original encoded bytes were.
    original_bytes = property(lambda self: self.get_original_bytes())

    def get_original_bytes(self):
        # We're a text string object, but the library is trying to get our raw
        # bytes.  This can happen if we auto-detected this string as text, but
        # we were wrong.  It's pretty common.  Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        elif self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        else:
            raise Exception("no information about original bytes")

    def writeToStream(self, stream, encryption_key):
        # Try to write the string out as a PDFDocEncoding encoded string.  It's
        # nicer to look at in the PDF file.  Sadly, we take a performance hit
        # here for trying...
        try:
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            # Fall back to UTF-16BE with a BOM for non-PDFDocEncoding text.
            bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
        if encryption_key:
            # Encrypted strings are emitted in hex form via ByteStringObject.
            bytearr = RC4_encrypt(encryption_key, bytearr)
            obj = ByteStringObject(bytearr)
            obj.writeToStream(stream, None)
        else:
            stream.write("(")
            for c in bytearr:
                # Escape everything that is not alphanumeric or a space as a
                # 3-digit octal escape so delimiters never leak through.
                if not c.isalnum() and c != ' ':
                    stream.write("\\%03o" % ord(c))
                else:
                    stream.write(c)
            stream.write(")")
|
|
||||||
|
|
||||||
|
|
||||||
class NameObject(str, PdfObject):
    # Characters that terminate a name token (the PDF delimiter set).
    delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"

    def __init__(self, data):
        str.__init__(self, data)

    def writeToStream(self, stream, encryption_key):
        # A name is written verbatim, including its leading '/'.
        stream.write(self)

    def readFromStream(stream):
        # Parse a '/Name' token: a leading '/' is mandatory; the name runs
        # until whitespace or a delimiter character.
        name = stream.read(1)
        if name != "/":
            raise utils.PdfReadError, "name read error"
        while True:
            tok = stream.read(1)
            if tok.isspace() or tok in NameObject.delimiterCharacters:
                # Push the terminator back for the caller to consume.
                stream.seek(-1, 1)
                break
            name += tok
        return NameObject(name)
    readFromStream = staticmethod(readFromStream)
|
|
||||||
|
|
||||||
|
|
||||||
class DictionaryObject(dict, PdfObject):
    # A PDF dictionary "<< ... >>".  Both keys and values must be PdfObject
    # instances (enforced in __setitem__/setdefault); __getitem__ resolves
    # indirect references via getObject().

    def __init__(self, *args, **kwargs):
        if len(args) == 0:
            self.update(kwargs)
        elif len(args) == 1:
            arr = args[0]
            # If we're passed a list/tuple, make a dict out of it
            if not hasattr(arr, "iteritems"):
                newarr = {}
                for k, v in arr:
                    newarr[k] = v
                arr = newarr
            self.update(arr)
        else:
            # NOTE(review): message hard-codes "got 3" regardless of the
            # actual number of positional arguments.
            raise TypeError("dict expected at most 1 argument, got 3")

    def update(self, arr):
        # note, a ValueError halfway through copying values
        # will leave half the values in this dict.
        for k, v in arr.iteritems():
            self.__setitem__(k, v)

    def raw_get(self, key):
        # Fetch the stored value without resolving indirect references.
        return dict.__getitem__(self, key)

    def __setitem__(self, key, value):
        if not isinstance(key, PdfObject):
            raise ValueError("key must be PdfObject")
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value=None):
        if not isinstance(key, PdfObject):
            raise ValueError("key must be PdfObject")
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        return dict.setdefault(self, key, value)

    def __getitem__(self, key):
        # Resolve IndirectObject values transparently on lookup.
        return dict.__getitem__(self, key).getObject()

    ##
    # Retrieves XMP (Extensible Metadata Platform) data relevant to the
    # this object, if available.
    # <p>
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance
    # that can be used to access XMP metadata from the document.  Can also
    # return None if no metadata was found on the document root.
    def getXmpMetadata(self):
        metadata = self.get("/Metadata", None)
        if metadata == None:
            return None
        metadata = metadata.getObject()
        import xmp
        if not isinstance(metadata, xmp.XmpInformation):
            # Wrap the raw stream once and cache the wrapper back into self.
            metadata = xmp.XmpInformation(metadata)
            self[NameObject("/Metadata")] = metadata
        return metadata

    ##
    # Read-only property that accesses the {@link
    # #DictionaryObject.getXmpData getXmpData} function.
    # <p>
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)

    def writeToStream(self, stream, encryption_key):
        stream.write("<<\n")
        for key, value in self.items():
            key.writeToStream(stream, encryption_key)
            stream.write(" ")
            value.writeToStream(stream, encryption_key)
            stream.write("\n")
        stream.write(">>")

    def readFromStream(stream, pdf):
        # Parse "<< ... >>" from *stream*; *pdf* is the reader used to resolve
        # an indirect /Length.  Returns a DictionaryObject, or a StreamObject
        # when the dictionary is followed by the "stream" keyword.
        tmp = stream.read(2)
        if tmp != "<<":
            raise utils.PdfReadError, "dictionary read error"
        data = {}
        while True:
            tok = readNonWhitespace(stream)
            if tok == ">":
                # Consume the second '>' of the '>>' terminator.
                stream.read(1)
                break
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            if data.has_key(key):
                # multiple definitions of key not permitted
                raise utils.PdfReadError, "multiple definitions in dictionary"
            data[key] = value
        pos = stream.tell()
        s = readNonWhitespace(stream)
        if s == 's' and stream.read(5) == 'tream':
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but before EOL.
            # patch provided by Danial Sandler
            while eol == ' ':
                eol = stream.read(1)
            assert eol in ("\n", "\r")
            if eol == "\r":
                # read \n after
                stream.read(1)
            # this is a stream object, not a dictionary
            assert data.has_key("/Length")
            length = data["/Length"]
            if isinstance(length, IndirectObject):
                # /Length may be an indirect object; resolve it, then restore
                # the current stream position.
                t = stream.tell()
                length = pdf.getObject(length)
                stream.seek(t, 0)
            data["__streamdata__"] = stream.read(length)
            e = readNonWhitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != "endstream":
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                pos = stream.tell()
                stream.seek(-10, 1)
                end = stream.read(9)
                if end == "endstream":
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    stream.seek(pos, 0)
                    raise utils.PdfReadError, "Unable to find 'endstream' marker after stream."
        else:
            # No "stream" keyword followed: rewind past what we peeked at.
            stream.seek(pos, 0)
        if data.has_key("__streamdata__"):
            return StreamObject.initializeFromDictionary(data)
        else:
            retval = DictionaryObject()
            retval.update(data)
            return retval
    readFromStream = staticmethod(readFromStream)
|
|
||||||
|
|
||||||
|
|
||||||
class StreamObject(DictionaryObject):
    # Base class for PDF stream objects: a dictionary plus a raw data payload
    # held in self._data.

    def __init__(self):
        self._data = None
        self.decodedSelf = None

    def writeToStream(self, stream, encryption_key):
        # /Length is derived from the current data, written with the
        # dictionary, then deleted again so self is left unchanged.
        self[NameObject("/Length")] = NumberObject(len(self._data))
        DictionaryObject.writeToStream(self, stream, encryption_key)
        del self["/Length"]
        stream.write("\nstream\n")
        data = self._data
        if encryption_key:
            data = RC4_encrypt(encryption_key, data)
        stream.write(data)
        stream.write("\nendstream")

    def initializeFromDictionary(data):
        # Factory used by DictionaryObject.readFromStream: a /Filter entry
        # means the payload is still encoded.  Consumes the bookkeeping keys
        # "__streamdata__" and "/Length" out of *data*.
        if data.has_key("/Filter"):
            retval = EncodedStreamObject()
        else:
            retval = DecodedStreamObject()
        retval._data = data["__streamdata__"]
        del data["__streamdata__"]
        del data["/Length"]
        retval.update(data)
        return retval
    initializeFromDictionary = staticmethod(initializeFromDictionary)

    def flateEncode(self):
        # Return a new EncodedStreamObject whose data is the flate-compressed
        # form of self._data, with /FlateDecode prepended to any existing
        # filter chain.  self is not modified.
        if self.has_key("/Filter"):
            f = self["/Filter"]
            if isinstance(f, ArrayObject):
                f.insert(0, NameObject("/FlateDecode"))
            else:
                newf = ArrayObject()
                newf.append(NameObject("/FlateDecode"))
                newf.append(f)
                f = newf
        else:
            f = NameObject("/FlateDecode")
        retval = EncodedStreamObject()
        retval[NameObject("/Filter")] = f
        retval._data = filters.FlateDecode.encode(self._data)
        return retval
|
|
||||||
|
|
||||||
|
|
||||||
class DecodedStreamObject(StreamObject):
    # Stream object whose payload is already in decoded (plain) form.

    def getData(self):
        # Data is stored decoded; return it directly.
        return self._data

    def setData(self, data):
        # Replace the decoded payload.
        self._data = data
|
|
||||||
|
|
||||||
|
|
||||||
class EncodedStreamObject(StreamObject):
    # Stream object whose payload is still compressed/encoded per /Filter.

    def __init__(self):
        # NOTE(review): unlike StreamObject.__init__, _data is not initialized
        # here; it is assigned externally by initializeFromDictionary.
        self.decodedSelf = None

    def getData(self):
        if self.decodedSelf:
            # cached version of decoded object
            return self.decodedSelf.getData()
        else:
            # create decoded object
            decoded = DecodedStreamObject()
            decoded._data = filters.decodeStreamData(self)
            for key, value in self.items():
                # Copy everything except the encoding bookkeeping entries.
                if not key in ("/Length", "/Filter", "/DecodeParms"):
                    decoded[key] = value
            self.decodedSelf = decoded
            return decoded._data

    def setData(self, data):
        raise utils.PdfReadError, "Creating EncodedStreamObject is not currently supported"
|
|
||||||
|
|
||||||
|
|
||||||
class RectangleObject(ArrayObject):
    # A PDF rectangle: [lower-left-x, lower-left-y, upper-right-x,
    # upper-right-y].  The other two corners are derived from these four
    # stored coordinates.

    def __init__(self, arr):
        # must have four points
        assert len(arr) == 4
        # automatically convert arr[x] into NumberObject(arr[x]) if necessary
        ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])

    def ensureIsNumber(self, value):
        # Wrap plain numbers as FloatObject; pass Number/FloatObject through.
        if not isinstance(value, (NumberObject, FloatObject)):
            value = FloatObject(value)
        return value

    def __repr__(self):
        return "RectangleObject(%s)" % repr(list(self))

    def getLowerLeft_x(self):
        return self[0]

    def getLowerLeft_y(self):
        return self[1]

    def getUpperRight_x(self):
        return self[2]

    def getUpperRight_y(self):
        return self[3]

    # The remaining corner coordinates are aliases of the stored ones.
    def getUpperLeft_x(self):
        return self.getLowerLeft_x()

    def getUpperLeft_y(self):
        return self.getUpperRight_y()

    def getLowerRight_x(self):
        return self.getUpperRight_x()

    def getLowerRight_y(self):
        return self.getLowerLeft_y()

    def getLowerLeft(self):
        return self.getLowerLeft_x(), self.getLowerLeft_y()

    def getLowerRight(self):
        return self.getLowerRight_x(), self.getLowerRight_y()

    def getUpperLeft(self):
        return self.getUpperLeft_x(), self.getUpperLeft_y()

    def getUpperRight(self):
        return self.getUpperRight_x(), self.getUpperRight_y()

    def setLowerLeft(self, value):
        self[0], self[1] = [self.ensureIsNumber(x) for x in value]

    def setLowerRight(self, value):
        self[2], self[1] = [self.ensureIsNumber(x) for x in value]

    def setUpperLeft(self, value):
        self[0], self[3] = [self.ensureIsNumber(x) for x in value]

    def setUpperRight(self, value):
        self[2], self[3] = [self.ensureIsNumber(x) for x in value]

    lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
    lowerRight = property(getLowerRight, setLowerRight, None, None)
    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
    upperRight = property(getUpperRight, setUpperRight, None, None)
|
|
||||||
|
|
||||||
|
|
||||||
def encode_pdfdocencoding(unicode_string):
    # Encode *unicode_string* into a PDFDocEncoding byte string using the
    # module-level reverse table.  Raises UnicodeEncodeError for characters
    # that have no PDFDocEncoding mapping.
    encoded = []
    for ch in unicode_string:
        try:
            encoded.append(chr(_pdfDocEncoding_rev[ch]))
        except KeyError:
            raise UnicodeEncodeError("pdfdocencoding", ch, -1, -1,
                    "does not exist in translation table")
    return ''.join(encoded)
|
|
||||||
|
|
||||||
def decode_pdfdocencoding(byte_array):
    # Decode a PDFDocEncoding byte string into a unicode string using the
    # module-level forward table.  Raises UnicodeDecodeError for bytes whose
    # table entry is the u'\u0000' placeholder (no mapping defined).
    decoded = []
    for byte in byte_array:
        mapped = _pdfDocEncoding[ord(byte)]
        if mapped == u'\u0000':
            raise UnicodeDecodeError("pdfdocencoding", byte, -1, -1,
                    "does not exist in translation table")
        decoded.append(mapped)
    return u''.join(decoded)
|
|
||||||
|
|
||||||
# Forward PDFDocEncoding table: index = byte value, value = unicode character.
# u'\u0000' entries mark byte values with no defined mapping.
_pdfDocEncoding = (
  u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
  u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
  u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
  u'\u02d8', u'\u02c7', u'\u02c6', u'\u02d9', u'\u02dd', u'\u02db', u'\u02da', u'\u02dc',
  u'\u0020', u'\u0021', u'\u0022', u'\u0023', u'\u0024', u'\u0025', u'\u0026', u'\u0027',
  u'\u0028', u'\u0029', u'\u002a', u'\u002b', u'\u002c', u'\u002d', u'\u002e', u'\u002f',
  u'\u0030', u'\u0031', u'\u0032', u'\u0033', u'\u0034', u'\u0035', u'\u0036', u'\u0037',
  u'\u0038', u'\u0039', u'\u003a', u'\u003b', u'\u003c', u'\u003d', u'\u003e', u'\u003f',
  u'\u0040', u'\u0041', u'\u0042', u'\u0043', u'\u0044', u'\u0045', u'\u0046', u'\u0047',
  u'\u0048', u'\u0049', u'\u004a', u'\u004b', u'\u004c', u'\u004d', u'\u004e', u'\u004f',
  u'\u0050', u'\u0051', u'\u0052', u'\u0053', u'\u0054', u'\u0055', u'\u0056', u'\u0057',
  u'\u0058', u'\u0059', u'\u005a', u'\u005b', u'\u005c', u'\u005d', u'\u005e', u'\u005f',
  u'\u0060', u'\u0061', u'\u0062', u'\u0063', u'\u0064', u'\u0065', u'\u0066', u'\u0067',
  u'\u0068', u'\u0069', u'\u006a', u'\u006b', u'\u006c', u'\u006d', u'\u006e', u'\u006f',
  u'\u0070', u'\u0071', u'\u0072', u'\u0073', u'\u0074', u'\u0075', u'\u0076', u'\u0077',
  u'\u0078', u'\u0079', u'\u007a', u'\u007b', u'\u007c', u'\u007d', u'\u007e', u'\u0000',
  u'\u2022', u'\u2020', u'\u2021', u'\u2026', u'\u2014', u'\u2013', u'\u0192', u'\u2044',
  u'\u2039', u'\u203a', u'\u2212', u'\u2030', u'\u201e', u'\u201c', u'\u201d', u'\u2018',
  u'\u2019', u'\u201a', u'\u2122', u'\ufb01', u'\ufb02', u'\u0141', u'\u0152', u'\u0160',
  u'\u0178', u'\u017d', u'\u0131', u'\u0142', u'\u0153', u'\u0161', u'\u017e', u'\u0000',
  u'\u20ac', u'\u00a1', u'\u00a2', u'\u00a3', u'\u00a4', u'\u00a5', u'\u00a6', u'\u00a7',
  u'\u00a8', u'\u00a9', u'\u00aa', u'\u00ab', u'\u00ac', u'\u0000', u'\u00ae', u'\u00af',
  u'\u00b0', u'\u00b1', u'\u00b2', u'\u00b3', u'\u00b4', u'\u00b5', u'\u00b6', u'\u00b7',
  u'\u00b8', u'\u00b9', u'\u00ba', u'\u00bb', u'\u00bc', u'\u00bd', u'\u00be', u'\u00bf',
  u'\u00c0', u'\u00c1', u'\u00c2', u'\u00c3', u'\u00c4', u'\u00c5', u'\u00c6', u'\u00c7',
  u'\u00c8', u'\u00c9', u'\u00ca', u'\u00cb', u'\u00cc', u'\u00cd', u'\u00ce', u'\u00cf',
  u'\u00d0', u'\u00d1', u'\u00d2', u'\u00d3', u'\u00d4', u'\u00d5', u'\u00d6', u'\u00d7',
  u'\u00d8', u'\u00d9', u'\u00da', u'\u00db', u'\u00dc', u'\u00dd', u'\u00de', u'\u00df',
  u'\u00e0', u'\u00e1', u'\u00e2', u'\u00e3', u'\u00e4', u'\u00e5', u'\u00e6', u'\u00e7',
  u'\u00e8', u'\u00e9', u'\u00ea', u'\u00eb', u'\u00ec', u'\u00ed', u'\u00ee', u'\u00ef',
  u'\u00f0', u'\u00f1', u'\u00f2', u'\u00f3', u'\u00f4', u'\u00f5', u'\u00f6', u'\u00f7',
  u'\u00f8', u'\u00f9', u'\u00fa', u'\u00fb', u'\u00fc', u'\u00fd', u'\u00fe', u'\u00ff'
)

# The table must cover every possible byte value exactly once.
assert len(_pdfDocEncoding) == 256
|
|
||||||
|
|
||||||
# Build the inverse mapping (unicode char -> byte value) used by
# encode_pdfdocencoding.  u"\u0000" placeholder entries are excluded; the
# assert guarantees the forward table maps no character twice.
_pdfDocEncoding_rev = {}
for i in xrange(256):
    char = _pdfDocEncoding[i]
    if char == u"\u0000":
        continue
    assert char not in _pdfDocEncoding_rev
    _pdfDocEncoding_rev[char] = i
|
|
||||||
|
|
1530
src/pyPdf/pdf.py
1530
src/pyPdf/pdf.py
File diff suppressed because it is too large
Load Diff
@ -1,110 +0,0 @@
|
|||||||
# vim: sw=4:expandtab:foldmethod=marker
|
|
||||||
#
|
|
||||||
# Copyright (c) 2006, Mathieu Fenniak
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# Redistribution and use in source and binary forms, with or without
|
|
||||||
# modification, are permitted provided that the following conditions are
|
|
||||||
# met:
|
|
||||||
#
|
|
||||||
# * Redistributions of source code must retain the above copyright notice,
|
|
||||||
# this list of conditions and the following disclaimer.
|
|
||||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
||||||
# this list of conditions and the following disclaimer in the documentation
|
|
||||||
# and/or other materials provided with the distribution.
|
|
||||||
# * The name of the author may not be used to endorse or promote products
|
|
||||||
# derived from this software without specific prior written permission.
|
|
||||||
#
|
|
||||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
||||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
||||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
||||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
||||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
Utility functions for PDF library.
|
|
||||||
"""
|
|
||||||
__author__ = "Mathieu Fenniak"
|
|
||||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
|
||||||
|
|
||||||
#ENABLE_PSYCO = False
|
|
||||||
#if ENABLE_PSYCO:
|
|
||||||
# try:
|
|
||||||
# import psyco
|
|
||||||
# except ImportError:
|
|
||||||
# ENABLE_PSYCO = False
|
|
||||||
#
|
|
||||||
#if not ENABLE_PSYCO:
|
|
||||||
# class psyco:
|
|
||||||
# def proxy(func):
|
|
||||||
# return func
|
|
||||||
# proxy = staticmethod(proxy)
|
|
||||||
|
|
||||||
def readUntilWhitespace(stream, maxchars=None):
    # Accumulate characters from *stream* until a whitespace character or EOF
    # is hit, or until *maxchars* characters have been collected.  The
    # terminating whitespace (if any) is consumed but not returned.
    chars = []
    while True:
        ch = stream.read(1)
        if not ch or ch.isspace():
            break
        chars.append(ch)
        if len(chars) == maxchars:
            break
    return "".join(chars)
|
|
||||||
|
|
||||||
def readNonWhitespace(stream):
    # Skip over '\n', '\r', ' ' and '\t' and return the first other character
    # read from *stream* ('' at EOF).
    skip = ('\n', '\r', ' ', '\t')
    ch = stream.read(1)
    while ch in skip:
        ch = stream.read(1)
    return ch
|
|
||||||
|
|
||||||
class ConvertFunctionsToVirtualList(object):
    # Adapts a pair of callables (length, item-getter) into a read-only,
    # list-like object supporting len() and integer indexing, including
    # negative indexes.

    def __init__(self, lengthFunction, getFunction):
        self.lengthFunction = lengthFunction
        self.getFunction = getFunction

    def __len__(self):
        return self.lengthFunction()

    def __getitem__(self, index):
        if not isinstance(index, int):
            raise TypeError, "sequence indices must be integers"
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index = len_self + index
        if index < 0 or index >= len_self:
            raise IndexError, "sequence index out of range"
        return self.getFunction(index)
|
|
||||||
|
|
||||||
def RC4_encrypt(key, plaintext):
    # RC4 stream cipher.  Symmetric: applying it twice with the same key
    # returns the original input.  *key* and *plaintext* are byte-per-char
    # strings; the result is built the same way.
    # --- key-scheduling algorithm (KSA) ---
    state = list(range(256))
    key_len = len(key)
    j = 0
    for i in range(256):
        j = (j + state[i] + ord(key[i % key_len])) % 256
        state[i], state[j] = state[j], state[i]
    # --- pseudo-random generation, XORed against the input ---
    i = j = 0
    out = []
    for ch in plaintext:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        keystream = state[(state[i] + state[j]) % 256]
        out.append(chr(ord(ch) ^ keystream))
    return "".join(out)
|
|
||||||
|
|
||||||
class PdfReadError(Exception):
    """Raised when a PDF file or stream cannot be parsed as expected."""
    pass
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # test RC4
    # RC4 is symmetric, so encrypting the ciphertext with the same key must
    # round-trip back to the original plaintext.
    out = RC4_encrypt("Key", "Plaintext")
    print repr(out)
    pt = RC4_encrypt("Key", out)
    print repr(pt)
|
|
355
src/pyPdf/xmp.py
355
src/pyPdf/xmp.py
@ -1,355 +0,0 @@
|
|||||||
import re
|
|
||||||
import datetime
|
|
||||||
import decimal
|
|
||||||
from generic import PdfObject
|
|
||||||
from xml.dom import getDOMImplementation
|
|
||||||
from xml.dom.minidom import parseString
|
|
||||||
|
|
||||||
# XML namespace URIs used when navigating the XMP packet's DOM.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?  I might ask that too.  It's
# a completely undocumented namespace used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.  Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value.  The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character.  A key like "my car" is therefore "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional users of the pdfx namespace should be shot on sight.  A
# custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
|
|
||||||
|
|
||||||
iso8601 = re.compile("""
|
|
||||||
(?P<year>[0-9]{4})
|
|
||||||
(-
|
|
||||||
(?P<month>[0-9]{2})
|
|
||||||
(-
|
|
||||||
(?P<day>[0-9]+)
|
|
||||||
(T
|
|
||||||
(?P<hour>[0-9]{2}):
|
|
||||||
(?P<minute>[0-9]{2})
|
|
||||||
(:(?P<second>[0-9]{2}(.[0-9]+)?))?
|
|
||||||
(?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
|
|
||||||
)?
|
|
||||||
)?
|
|
||||||
)?
|
|
||||||
""", re.VERBOSE)
|
|
||||||
|
|
||||||
##
|
|
||||||
# An object that represents Adobe XMP metadata.
|
|
||||||
class XmpInformation(PdfObject):
|
|
||||||
|
|
||||||
def __init__(self, stream):
|
|
||||||
self.stream = stream
|
|
||||||
docRoot = parseString(self.stream.getData())
|
|
||||||
self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
|
|
||||||
self.cache = {}
|
|
||||||
|
|
||||||
def writeToStream(self, stream, encryption_key):
|
|
||||||
self.stream.writeToStream(stream, encryption_key)
|
|
||||||
|
|
||||||
def getElement(self, aboutUri, namespace, name):
|
|
||||||
for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
|
|
||||||
if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
|
|
||||||
attr = desc.getAttributeNodeNS(namespace, name)
|
|
||||||
if attr != None:
|
|
||||||
yield attr
|
|
||||||
for element in desc.getElementsByTagNameNS(namespace, name):
|
|
||||||
yield element
|
|
||||||
|
|
||||||
def getNodesInNamespace(self, aboutUri, namespace):
|
|
||||||
for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
|
|
||||||
if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
|
|
||||||
for i in range(desc.attributes.length):
|
|
||||||
attr = desc.attributes.item(i)
|
|
||||||
if attr.namespaceURI == namespace:
|
|
||||||
yield attr
|
|
||||||
for child in desc.childNodes:
|
|
||||||
if child.namespaceURI == namespace:
|
|
||||||
yield child
|
|
||||||
|
|
||||||
def _getText(self, element):
|
|
||||||
text = ""
|
|
||||||
for child in element.childNodes:
|
|
||||||
if child.nodeType == child.TEXT_NODE:
|
|
||||||
text += child.data
|
|
||||||
return text
|
|
||||||
|
|
||||||
def _converter_string(value):
|
|
||||||
return value
|
|
||||||
|
|
||||||
def _converter_date(value):
|
|
||||||
m = iso8601.match(value)
|
|
||||||
year = int(m.group("year"))
|
|
||||||
month = int(m.group("month") or "1")
|
|
||||||
day = int(m.group("day") or "1")
|
|
||||||
hour = int(m.group("hour") or "0")
|
|
||||||
minute = int(m.group("minute") or "0")
|
|
||||||
second = decimal.Decimal(m.group("second") or "0")
|
|
||||||
seconds = second.to_integral(decimal.ROUND_FLOOR)
|
|
||||||
milliseconds = (second - seconds) * 1000000
|
|
||||||
tzd = m.group("tzd") or "Z"
|
|
||||||
dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds)
|
|
||||||
if tzd != "Z":
|
|
||||||
tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")]
|
|
||||||
tzd_hours *= -1
|
|
||||||
if tzd_hours < 0:
|
|
||||||
tzd_minutes *= -1
|
|
||||||
dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
|
|
||||||
return dt
|
|
||||||
_test_converter_date = staticmethod(_converter_date)
|
|
||||||
|
|
||||||
def _getter_bag(namespace, name, converter):
|
|
||||||
def get(self):
|
|
||||||
cached = self.cache.get(namespace, {}).get(name)
|
|
||||||
if cached:
|
|
||||||
return cached
|
|
||||||
retval = []
|
|
||||||
for element in self.getElement("", namespace, name):
|
|
||||||
bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
|
|
||||||
if len(bags):
|
|
||||||
for bag in bags:
|
|
||||||
for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
|
|
||||||
value = self._getText(item)
|
|
||||||
value = converter(value)
|
|
||||||
retval.append(value)
|
|
||||||
ns_cache = self.cache.setdefault(namespace, {})
|
|
||||||
ns_cache[name] = retval
|
|
||||||
return retval
|
|
||||||
return get
|
|
||||||
|
|
||||||
def _getter_seq(namespace, name, converter):
|
|
||||||
def get(self):
|
|
||||||
cached = self.cache.get(namespace, {}).get(name)
|
|
||||||
if cached:
|
|
||||||
return cached
|
|
||||||
retval = []
|
|
||||||
for element in self.getElement("", namespace, name):
|
|
||||||
seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
|
|
||||||
if len(seqs):
|
|
||||||
for seq in seqs:
|
|
||||||
for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
|
|
||||||
value = self._getText(item)
|
|
||||||
value = converter(value)
|
|
||||||
retval.append(value)
|
|
||||||
else:
|
|
||||||
value = converter(self._getText(element))
|
|
||||||
retval.append(value)
|
|
||||||
ns_cache = self.cache.setdefault(namespace, {})
|
|
||||||
ns_cache[name] = retval
|
|
||||||
return retval
|
|
||||||
return get
|
|
||||||
|
|
||||||
def _getter_langalt(namespace, name, converter):
|
|
||||||
def get(self):
|
|
||||||
cached = self.cache.get(namespace, {}).get(name)
|
|
||||||
if cached:
|
|
||||||
return cached
|
|
||||||
retval = {}
|
|
||||||
for element in self.getElement("", namespace, name):
|
|
||||||
alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
|
|
||||||
if len(alts):
|
|
||||||
for alt in alts:
|
|
||||||
for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
|
|
||||||
value = self._getText(item)
|
|
||||||
value = converter(value)
|
|
||||||
retval[item.getAttribute("xml:lang")] = value
|
|
||||||
else:
|
|
||||||
retval["x-default"] = converter(self._getText(element))
|
|
||||||
ns_cache = self.cache.setdefault(namespace, {})
|
|
||||||
ns_cache[name] = retval
|
|
||||||
return retval
|
|
||||||
return get
|
|
||||||
|
|
||||||
def _getter_single(namespace, name, converter):
|
|
||||||
def get(self):
|
|
||||||
cached = self.cache.get(namespace, {}).get(name)
|
|
||||||
if cached:
|
|
||||||
return cached
|
|
||||||
value = None
|
|
||||||
for element in self.getElement("", namespace, name):
|
|
||||||
if element.nodeType == element.ATTRIBUTE_NODE:
|
|
||||||
value = element.nodeValue
|
|
||||||
else:
|
|
||||||
value = self._getText(element)
|
|
||||||
break
|
|
||||||
if value != None:
|
|
||||||
value = converter(value)
|
|
||||||
ns_cache = self.cache.setdefault(namespace, {})
|
|
||||||
ns_cache[name] = value
|
|
||||||
return value
|
|
||||||
return get
|
|
||||||
|
|
||||||
##
# Contributors to the resource (other than the authors). An unsorted
# array of names.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))

##
# Text describing the extent or scope of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))

##
# A sorted array of names of the authors of the resource, listed in order
# of precedence.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))

##
# A sorted array of dates (datetime.datetime instances) of significance to
# the resource. The dates and times are in UTC.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))

##
# A language-keyed dictionary of textual descriptions of the content of the
# resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))

##
# The mime-type of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))

##
# Unique identifier of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))

##
# An unordered array specifying the languages used in the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))

##
# An unordered array of publisher names.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))

##
# An unordered array of text descriptions of relationships to other
# documents.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))

##
# A language-keyed dictionary of textual descriptions of the rights the
# user has to this resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))

##
# Unique identifier of the work from which this resource was derived.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))

##
# An unordered array of descriptive phrases or keywords that specify the
# topic of the content of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))

##
# A language-keyed dictionary of the title of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))

##
# An unordered array of textual descriptions of the document type.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))

##
# An unformatted text string representing document keywords.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))

##
# The PDF file version, for example 1.0, 1.3.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))

##
# The name of the tool that created the PDF document.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))

##
# The date and time the resource was originally created. The date and
# time are returned as a UTC datetime.datetime object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))

##
# The date and time the resource was last modified. The date and time
# are returned as a UTC datetime.datetime object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))

##
# The date and time that any metadata for this resource was last
# changed. The date and time are returned as a UTC datetime.datetime
# object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))

##
# The name of the first known tool used to create the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))

##
# The common identifier for all versions and renditions of this resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))

##
# An identifier for a specific incarnation of a document, updated each
# time a file is saved.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
|
|
||||||
|
|
||||||
def custom_properties(self):
    """Return custom metadata from the undocumented pdfx schema.

    The result is computed once and memoized on ``self._custom_properties``;
    subsequent accesses return the same dictionary.
    """
    if not hasattr(self, "_custom_properties"):
        self._custom_properties = {}
        for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
            raw = node.localName
            # Undo the pdfx key escaping (see the PDFX_NAMESPACE notes
            # earlier in this file): every U+2182 marker is followed by
            # four hex digits encoding the original character.
            while u"\u2182" in raw:
                pos = raw.index(u"\u2182")
                raw = raw[:pos] + chr(int(raw[pos + 1:pos + 5], base=16)) + raw[pos + 5:]
            if node.nodeType == node.ATTRIBUTE_NODE:
                self._custom_properties[raw] = node.nodeValue
            else:
                self._custom_properties[raw] = self._getText(node)
    return self._custom_properties
|
|
||||||
|
|
||||||
##
# Retrieves custom metadata properties defined in the undocumented pdfx
# metadata schema.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return Returns a dictionary of key/value items for custom metadata
# properties.
# NOTE: rebinds the name so the method above becomes a read-only property.
custom_properties = property(custom_properties)
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user