Add PDF splitting utility

This commit is contained in:
John Schember 2009-03-28 21:57:42 -04:00
parent 4579b10571
commit a5228d56d2
3 changed files with 193 additions and 3 deletions

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, sys, re import os, sys
from calibre.ebooks.metadata.meta import metadata_from_formats from calibre.ebooks.metadata.meta import metadata_from_formats
from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata import authors_to_string
@ -24,7 +24,7 @@ def config(defaults=None):
c = StringConfig(defaults, desc) c = StringConfig(defaults, desc)
c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
c.add_opt('output', ['-o', '--output'],default='merged.pdf', c.add_opt('output', ['-o', '--output'], default='merged.pdf',
help=_('Path to output file. By default a file is created in the current directory.')) help=_('Path to output file. By default a file is created in the current directory.'))
return c return c
@ -33,7 +33,7 @@ def option_parser():
return c.option_parser(usage=_('''\ return c.option_parser(usage=_('''\
%prog [options] file1.pdf file2.pdf ... %prog [options] file1.pdf file2.pdf ...
Merges individual pdfs. Metadata will be used from the first PDF specified. Merges individual PDFs. Metadata will be used from the first PDF specified.
''')) '''))
def merge_files(in_paths, out_path, metadata=None): def merge_files(in_paths, out_path, metadata=None):

View File

@ -0,0 +1,189 @@
'''
Split PDF file into multiple PDF documents.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, sys, re
from calibre.ebooks.metadata.meta import metadata_from_formats
from calibre.ebooks.metadata import authors_to_string
from calibre.utils.config import Config, StringConfig
from pyPdf import PdfFileWriter, PdfFileReader
def config(defaults=None):
desc = _('Options to control the transformation of pdf')
default_crop=10
if defaults is None:
c = Config('trimpdf', desc)
else:
c = StringConfig(defaults, desc)
c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
c.add_opt('output', ['-o', '--output'], default='split.pdf',
help=_('Path to output file. By default a file is created in the current directory. \
The file name will be the base name for the output.'))
return c
def option_parser():
c = config()
return c.option_parser(usage=_('''\
%prog [options] file.pdf page_to_split_on ...
%prog [options] file.pdf page_range_to_split_on ...
Ex.
%prog file.pdf 6
%prog file.pdf 6-12
%prog file.pdf 6-12 8 10 9-20
Split a PDF.
'''))
def split_pdf(in_path, pages, page_ranges, out_name, metadata=None):
pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb'))
total_pages = pdf.numPages - 1
for index in pages+page_ranges:
if index in pages:
write_pdf(pdf, out_name, '%s' % (index + 1), index, total_pages, metadata)
else:
write_pdf(pdf, out_name, '%s-%s' % (index[0] + 1, index[1] + 1), index[0], index[1], metadata)
def write_pdf(pdf, name, suffix, start, end, metadata=None):
if metadata == None:
title = _('Unknown')
author = _('Unknown')
else:
title = metadata.title
author = authors_to_string(metadata.authors)
out_pdf = PdfFileWriter(title=title, author=author)
for page_num in range(start, end + 1):
out_pdf.addPage(pdf.getPage(page_num))
with open('%s%s.pdf' % (name, suffix), 'wb') as out_file:
out_pdf.write(out_file)
def split_args(args):
pdf = ''
pages = []
page_ranges = []
bad = []
for arg in args:
arg = arg.strip()
# Find the pdf input
if re.search('(?iu)^.*?\.pdf[ ]*$', arg) != None:
if pdf == '':
pdf = arg
else:
bad.append(arg)
# Find single indexes
elif re.search('^[ ]*\d+[ ]*$', arg) != None:
pages.append(arg)
# Find index ranges
elif re.search('^[ ]*\d+[ ]*-[ ]*\d+[ ]*$', arg) != None:
mo = re.search('^[ ]*(?P<start>\d+)[ ]*-[ ]*(?P<end>\d+)[ ]*$', arg)
start = mo.group('start')
end = mo.group('end')
# check to see if the range is really a single index
if start == end:
pages.append(start)
else:
page_ranges.append([start, end])
else:
bad.append(arg)
bad = sorted(list(set(bad)))
return pdf, pages, page_ranges, bad
# Remove duplicates from pages and page_ranges.
# Set pages higher than the total number of pages in the pdf to the last page.
# Return pages and page_ranges as lists of ints.
def clean_page_list(pdf_path, pages, page_ranges):
pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
total_pages = pdf.numPages
sorted_pages = []
sorted_ranges = []
for index in pages:
index = int(index)
if index > total_pages:
sorted_pages.append(total_pages - 1)
else:
sorted_pages.append(index - 1)
for start, end in page_ranges:
start = int(start)
end = int(end)
if start > total_pages and end > total_pages:
sorted_pages.append(total_pages - 1)
continue
if start > total_pages:
start = total_pages
if end > total_pages:
end = total_pages
page_range = sorted([start - 1, end - 1])
if page_range not in sorted_ranges:
sorted_ranges.append(page_range)
# Remove duplicates and sort
pages = sorted(list(set(sorted_pages)))
page_ranges = sorted(sorted_ranges)
return pages, page_ranges
# Return True if the pdf is valid.
def valid_pdf(pdf_path):
try:
with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
pdf = PdfFileReader(pdf_file)
if pdf.isEncrypted or pdf.numPages <= 0:
raise Exception
except:
return False
return True
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
pdf, pages, page_ranges, unknown = split_args(args[1:])
if pdf == '' and (pages == [] or page_ranges == []):
print 'Error: PDF and where to split is required.\n\n'
print parser.get_usage()
return 2
if unknown != []:
for arg in unknown:
print 'Error: Unknown argument `%s`' % arg
print parser.get_usage()
return 2
if not valid_pdf(pdf):
print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf
return 2
pages, page_ranges = clean_page_list(pdf, pages, page_ranges)
mi = metadata_from_formats([pdf])
split_pdf(pdf, pages, page_ranges, os.path.splitext(opts.output)[0], mi)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -41,6 +41,7 @@ entry_points = {
'calibre-customize = calibre.customize.ui:main', 'calibre-customize = calibre.customize.ui:main',
'pdftrim = calibre.ebooks.pdf.pdftrim:main', 'pdftrim = calibre.ebooks.pdf.pdftrim:main',
'pdfmerge = calibre.ebooks.pdf.pdfmerge:main', 'pdfmerge = calibre.ebooks.pdf.pdfmerge:main',
'pdfsplit = calibre.ebooks.pdf.pdfsplit:main',
'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main', 'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main',
], ],
'gui_scripts' : [ 'gui_scripts' : [