diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/pdfmerge.py
index e8554dbc6b..4a741c4f5a 100644
--- a/src/calibre/ebooks/pdf/pdfmerge.py
+++ b/src/calibre/ebooks/pdf/pdfmerge.py
@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, John Schember '
 __docformat__ = 'restructuredtext en'
 
-import os, sys, re
+import os, sys
 
 from calibre.ebooks.metadata.meta import metadata_from_formats
 from calibre.ebooks.metadata import authors_to_string
@@ -24,7 +24,7 @@ def config(defaults=None):
         c = StringConfig(defaults, desc)
     c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
           help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
-    c.add_opt('output', ['-o', '--output'],default='merged.pdf',
+    c.add_opt('output', ['-o', '--output'], default='merged.pdf',
           help=_('Path to output file. By default a file is created in the current directory.'))
     return c
 
@@ -33,7 +33,7 @@ def option_parser():
     return c.option_parser(usage=_('''\
     %prog [options] file1.pdf file2.pdf ...
 
-    Merges individual pdfs. Metadata will be used from the first PDF specified.
+    Merges individual PDFs. Metadata will be used from the first PDF specified.
     '''))
 
 def merge_files(in_paths, out_path, metadata=None):
diff --git a/src/calibre/ebooks/pdf/pdfsplit.py b/src/calibre/ebooks/pdf/pdfsplit.py
new file mode 100644
index 0000000000..460dbef148
--- /dev/null
+++ b/src/calibre/ebooks/pdf/pdfsplit.py
@@ -0,0 +1,228 @@
+'''
+Split PDF file into multiple PDF documents.
+'''
+from __future__ import with_statement
+
+__license__ = 'GPL v3'
+__copyright__ = '2009, John Schember '
+__docformat__ = 'restructuredtext en'
+
+import os, sys, re
+
+from calibre.ebooks.metadata.meta import metadata_from_formats
+from calibre.ebooks.metadata import authors_to_string
+from calibre.utils.config import Config, StringConfig
+
+from pyPdf import PdfFileWriter, PdfFileReader
+
+# Patterns used by split_args to classify command line arguments.
+PDF_ARG = re.compile(r'(?iu)^.*?\.pdf[ ]*$')
+PAGE_ARG = re.compile(r'^[ ]*\d+[ ]*$')
+RANGE_ARG = re.compile(r'^[ ]*(\d+)[ ]*-[ ]*(\d+)[ ]*$')
+
+def config(defaults=None):
+    '''
+    Build the options understood by the pdfsplit command line tool.
+
+    A ``Config`` is created when ``defaults`` is None, otherwise a
+    ``StringConfig`` is built from ``defaults``.
+    '''
+    desc = _('Options to control the transformation of pdf')
+    if defaults is None:
+        # NOTE(review): was 'trimpdf', which looks copied from pdftrim.
+        c = Config('splitpdf', desc)
+    else:
+        c = StringConfig(defaults, desc)
+    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
+          help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
+    c.add_opt('output', ['-o', '--output'], default='split.pdf',
+          help=_('Path to output file. By default a file is created in the current directory. '
+                 'The file name will be the base name for the output.'))
+    return c
+
+def option_parser():
+    '''
+    Return the command line option parser with the usage text set.
+    '''
+    c = config()
+    return c.option_parser(usage=_('''\
+
+    %prog [options] file.pdf page_to_split_on ...
+    %prog [options] file.pdf page_range_to_split_on ...
+
+    Ex.
+
+    %prog file.pdf 6
+    %prog file.pdf 6-12
+    %prog file.pdf 6-12 8 10 9-20
+
+    Split a PDF.
+    '''))
+
+def split_pdf(in_path, pages, page_ranges, out_name, metadata=None):
+    '''
+    Write one output PDF for every entry of ``pages`` and ``page_ranges``.
+
+    :param in_path: Path of the PDF to split.
+    :param pages: 0-based page indexes. Each one produces a file holding
+                  that page through the last page of the document.
+    :param page_ranges: ``[start, end]`` pairs of 0-based, inclusive indexes.
+                  Each one produces a file holding exactly those pages.
+    :param out_name: Base name of the output files. The 1-based page number
+                  or range, then ``.pdf``, is appended to it.
+    :param metadata: Handed to ``write_pdf`` for the title and author.
+    '''
+    # write_pdf reads the pages through the reader, so the input stays open
+    # until every part has been written and is closed afterwards.
+    with open(os.path.abspath(in_path), 'rb') as in_file:
+        pdf = PdfFileReader(in_file)
+        last_page = pdf.numPages - 1
+
+        for index in pages:
+            write_pdf(pdf, out_name, '%s' % (index + 1), index, last_page, metadata)
+        for start, end in page_ranges:
+            write_pdf(pdf, out_name, '%s-%s' % (start + 1, end + 1), start, end, metadata)
+
+def write_pdf(pdf, name, suffix, start, end, metadata=None):
+    '''
+    Write pages ``start`` through ``end`` (0-based, inclusive) of ``pdf`` to
+    the file ``name + suffix + '.pdf'``.
+
+    The title and author come from ``metadata``; both are 'Unknown' when no
+    metadata is given.
+    '''
+    if metadata is None:
+        title = _('Unknown')
+        author = _('Unknown')
+    else:
+        title = metadata.title
+        author = authors_to_string(metadata.authors)
+
+    out_pdf = PdfFileWriter(title=title, author=author)
+    for page_num in range(start, end + 1):
+        out_pdf.addPage(pdf.getPage(page_num))
+    with open('%s%s.pdf' % (name, suffix), 'wb') as out_file:
+        out_pdf.write(out_file)
+
+def split_args(args):
+    '''
+    Sort the command line arguments into input PDF, pages and page ranges.
+
+    Returns ``(pdf, pages, page_ranges, bad)``. ``pdf`` is the first argument
+    ending in ``.pdf``, or '' if there is none. ``pages`` and ``page_ranges``
+    hold the numbers as strings. ``bad`` is the sorted, duplicate free list
+    of arguments that were not understood, including any PDF after the
+    first one.
+    '''
+    pdf = ''
+    pages = []
+    page_ranges = []
+    bad = []
+
+    for arg in args:
+        arg = arg.strip()
+        if PDF_ARG.search(arg) is not None:
+            # Only a single input PDF is accepted.
+            if pdf == '':
+                pdf = arg
+            else:
+                bad.append(arg)
+        elif PAGE_ARG.search(arg) is not None:
+            pages.append(arg)
+        else:
+            mo = RANGE_ARG.search(arg)
+            if mo is None:
+                bad.append(arg)
+                continue
+            start, end = mo.groups()
+            # A range such as 6-6 is really a single index.
+            if int(start) == int(end):
+                pages.append(start)
+            else:
+                page_ranges.append([start, end])
+
+    return pdf, pages, page_ranges, sorted(set(bad))
+
+def clean_page_list(pdf_path, pages, page_ranges):
+    '''
+    Turn the page numbers from the command line into sorted, duplicate free
+    lists of 0-based ints.
+
+    The numbers are 1-based on input. A number below the first page is set
+    to the first page and a number above the last page to the last page. A
+    range lying completely past the end of the document becomes the single
+    last page.
+
+    Returns ``(pages, page_ranges)``.
+    '''
+    with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
+        total_pages = PdfFileReader(pdf_file).numPages
+
+    def clamp(number):
+        # Page 0 would otherwise become index -1.
+        return min(max(int(number), 1), total_pages)
+
+    cleaned_pages = set()
+    cleaned_ranges = []
+
+    for index in pages:
+        cleaned_pages.add(clamp(index) - 1)
+
+    for start, end in page_ranges:
+        if int(start) > total_pages and int(end) > total_pages:
+            cleaned_pages.add(total_pages - 1)
+            continue
+        page_range = sorted([clamp(start) - 1, clamp(end) - 1])
+        if page_range not in cleaned_ranges:
+            cleaned_ranges.append(page_range)
+
+    return sorted(cleaned_pages), sorted(cleaned_ranges)
+
+def valid_pdf(pdf_path):
+    '''
+    Return True if ``pdf_path`` can be read as a PDF, is not encrypted and
+    has at least one page.
+    '''
+    try:
+        with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
+            pdf = PdfFileReader(pdf_file)
+            return not pdf.isEncrypted and pdf.numPages > 0
+    except Exception:
+        # Failing to open or parse the file means it cannot be split.
+        return False
+
+def main(args=sys.argv):
+    '''
+    Command line entry point. Returns 0 on success and 2 on bad input.
+    '''
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+
+    pdf, pages, page_ranges, unknown = split_args(args[1:])
+
+    # An input file and at least one page or range are both required.
+    if pdf == '' or not (pages or page_ranges):
+        print('Error: PDF and where to split is required.\n\n')
+        print(parser.get_usage())
+        return 2
+
+    if unknown:
+        for arg in unknown:
+            print('Error: Unknown argument `%s`' % arg)
+        print(parser.get_usage())
+        return 2
+
+    if not valid_pdf(pdf):
+        print('Error: Could not read file `%s`. Is it a valid PDF file or is it encrypted/DRMed?' % pdf)
+        return 2
+
+    pages, page_ranges = clean_page_list(pdf, pages, page_ranges)
+
+    mi = metadata_from_formats([pdf])
+
+    split_pdf(pdf, pages, page_ranges, os.path.splitext(opts.output)[0], mi)
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index c7a6099623..3ba6f55bc8 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -41,6 +41,7 @@ entry_points = {
                              'calibre-customize = calibre.customize.ui:main',
                              'pdftrim = calibre.ebooks.pdf.pdftrim:main',
                              'pdfmerge = calibre.ebooks.pdf.pdfmerge:main',
+                             'pdfsplit = calibre.ebooks.pdf.pdfsplit:main',
                              'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main',
                            ],
         'gui_scripts' : [