From 08971e831637122a1307d1aa8307775a887cba91 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 4 Apr 2009 10:37:00 -0400 Subject: [PATCH 1/7] New pdf manipulate commands. Remove old option parser from pdf metadata. --- src/calibre/ebooks/metadata/pdf.py | 39 +------ src/calibre/ebooks/pdf/manipulate/cli.py | 5 +- src/calibre/ebooks/pdf/manipulate/crop.py | 8 +- src/calibre/ebooks/pdf/manipulate/decrypt.py | 115 +++++++++++++++++++ src/calibre/ebooks/pdf/manipulate/encrypt.py | 105 +++++++++++++++++ src/calibre/ebooks/pdf/manipulate/info.py | 16 ++- src/calibre/ebooks/pdf/manipulate/merge.py | 14 ++- src/calibre/ebooks/pdf/manipulate/reverse.py | 10 +- src/calibre/ebooks/pdf/manipulate/split.py | 6 +- src/calibre/ebooks/pdf/verify.py | 7 ++ src/calibre/ebooks/txt/input.py | 3 +- 11 files changed, 274 insertions(+), 54 deletions(-) create mode 100644 src/calibre/ebooks/pdf/manipulate/decrypt.py create mode 100644 src/calibre/ebooks/pdf/manipulate/encrypt.py diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 8f73e04050..6b94b07275 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -7,7 +7,7 @@ __copyright__ = '2008, Kovid Goyal ' import sys, os, re, StringIO -from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser +from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory from pyPdf import PdfFileReader, PdfFileWriter import Image @@ -96,40 +96,3 @@ def get_cover(stream): traceback.print_exc() return data.getvalue() - -def option_parser(): - p = get_parser('pdf') - p.remove_option('--category') - p.remove_option('--comment') - p.add_option('--get-cover', default=False, action='store_true', - help=_('Extract the cover')) - return p - -def main(args=sys.argv): - p = option_parser() - opts, args = p.parse_args(args) - - with open(os.path.abspath(os.path.expanduser(args[1])), 'r+b') as stream: - mi = get_metadata(stream, extract_cover=opts.get_cover) - changed = False - if opts.title: - mi.title = opts.title - changed = True - if opts.authors: - mi.authors = opts.authors.split(',') - changed = True - - if changed: - set_metadata(stream, mi) - print unicode(get_metadata(stream, extract_cover=False)).encode('utf-8') - - if mi.cover_data[1] is not None: - cpath = os.path.splitext(os.path.basename(args[1]))[0] + '_cover.jpg' - with open(cpath, 'wb') as f: - f.write(mi.cover_data[1]) - print 'Cover saved to', f.name - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/cli.py b/src/calibre/ebooks/pdf/manipulate/cli.py index e3fcef559c..edbba54a8d 100644 --- a/src/calibre/ebooks/pdf/manipulate/cli.py +++ b/src/calibre/ebooks/pdf/manipulate/cli.py @@ -15,10 +15,13 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.manipulate import crop, info, merge, reverse, split +from calibre.ebooks.pdf.manipulate import crop, decrypt, encrypt, \ + info, merge, reverse, split COMMANDS = { 'crop' : crop, + 'decrypt' : decrypt, + 'encrypt' : encrypt, 'info' : info, 'merge' : merge, 'reverse' : reverse, diff --git a/src/calibre/ebooks/pdf/manipulate/crop.py b/src/calibre/ebooks/pdf/manipulate/crop.py index fa996b754f..7627823a89 100644 --- a/src/calibre/ebooks/pdf/manipulate/crop.py +++ b/src/calibre/ebooks/pdf/manipulate/crop.py @@ -25,7 +25,7 @@ from pyPdf import PdfFileWriter, PdfFileReader DEFAULT_CROP = '10' -USAGE = '%prog %%name ' + _(''' +USAGE = '\n%prog %%name ' + _('''\ [options] file.pdf Crop a PDF file. @@ -132,7 +132,11 @@ def main(args=sys.argv, name=''): return 1 if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is encrypted.' % args[0] return 1 mi = metadata_from_formats([args[0]]) diff --git a/src/calibre/ebooks/pdf/manipulate/decrypt.py b/src/calibre/ebooks/pdf/manipulate/decrypt.py new file mode 100644 index 0000000000..5f4265b5ed --- /dev/null +++ b/src/calibre/ebooks/pdf/manipulate/decrypt.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Decrypt content of PDF. +''' + +import os, sys +from optparse import OptionGroup, Option + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted + +from pyPdf import PdfFileWriter, PdfFileReader + +USAGE = '\n%prog %%name ' + _('''\ +[options] file.pdf password + +Decrypt a PDF. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='decrypted.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. By default a file is created in the current directory.')), +]) + +class DecryptionError(Exception): + def __init__(self, pdf_path): + self.value = 'Unable to decrypt file `%s`.' % value + + def __str__(self): + return repr(self.value) + + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) + +def option_parser(name): + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) + +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Decrypt Options:'), _('Options to control the transformation of pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) + +def decrypt(pdf_path, out_path, password): + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + + if pdf.decrypt(str(password)) == 0: + raise DecryptionError(pdf_path) + + title = pdf.documentInfo.title if pdf.documentInfo.title else _('Unknown') + author = pdf.documentInfo.author if pdf.documentInfo.author else _('Unknown') + out_pdf = PdfFileWriter(title=title, author=author) + + for page in pdf.pages: + out_pdf.addPage(page) + + with open(out_path, 'wb') as out_file: + out_pdf.write(out_file) + +def main(args=sys.argv, name=''): + log = Log() + parser = option_parser(name) + add_options(parser) + + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 2: + print 'Error: A PDF file and decryption password is required.\n' + print_help(parser, log) + return 1 + + if not is_valid_pdf(args[0]): + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if not is_encrypted(args[0]): + print 'Error: file `%s` is not encrypted.' % args[0] + return 1 + + try: + decrypt(args[0], opts.output, args[1]) + except DecryptionError, e: + print e.value + return 1 + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/encrypt.py b/src/calibre/ebooks/pdf/manipulate/encrypt.py new file mode 100644 index 0000000000..15600fb07c --- /dev/null +++ b/src/calibre/ebooks/pdf/manipulate/encrypt.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Encrypt a PDF. +''' + +import os, sys +from optparse import OptionGroup, Option + +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted + +from pyPdf import PdfFileWriter, PdfFileReader + +USAGE = '\n%prog %%name ' + _('''\ +[options] file.pdf password + +Encrypt a PDF. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='encrypted.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. By default a file is created in the current directory.')), +]) + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) + +def option_parser(name): + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) + +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Encrypt Options:'), _('Options to control the transformation of pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) + +def encrypt(pdf_path, out_path, password, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + out_pdf = PdfFileWriter(title=title, author=author) + + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + for page in pdf.pages: + out_pdf.addPage(page) + + with open(out_path, 'wb') as out_file: + out_pdf.encrypt(str(password)) + out_pdf.write(out_file) + +def main(args=sys.argv, name=''): + log = Log() + parser = option_parser(name) + add_options(parser) + + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 2: + print 'Error: A PDF file and decryption password is required.\n' + print_help(parser, log) + return 1 + + if not is_valid_pdf(args[0]): + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is already encrypted.' % args[0] + return 1 + + mi = metadata_from_formats([args[0]]) + + encrypt(args[0], opts.output, args[1], mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/info.py b/src/calibre/ebooks/pdf/manipulate/info.py index 21a07fdeff..d1b52a602c 100644 --- a/src/calibre/ebooks/pdf/manipulate/info.py +++ b/src/calibre/ebooks/pdf/manipulate/info.py @@ -16,11 +16,11 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdfs +from calibre.ebooks.pdf.verify import is_valid_pdfs, is_encrypted from pyPdf import PdfFileWriter, PdfFileReader -USAGE = '%prog %%name ' + _(''' +USAGE = '\n%prog %%name ' + _('''\ file.pdf ... Get info about a PDF. @@ -72,9 +72,17 @@ def main(args=sys.argv, name=''): bad_pdfs = is_valid_pdfs(args) if bad_pdfs != []: for pdf in bad_pdfs: - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf + print 'Error: Could not read file `%s`.' % pdf return 1 - + + enc = False + for pdf in args: + if is_encrypted(pdf): + enc = True + print 'Error: file `%s` is encrypted. Please decrypt first.' % pdf + if enc: + return 1 + for pdf in args: print_info(pdf) diff --git a/src/calibre/ebooks/pdf/manipulate/merge.py b/src/calibre/ebooks/pdf/manipulate/merge.py index 1e285e3bdf..fce7076e85 100644 --- a/src/calibre/ebooks/pdf/manipulate/merge.py +++ b/src/calibre/ebooks/pdf/manipulate/merge.py @@ -22,7 +22,7 @@ from calibre.ebooks.pdf.verify import is_valid_pdfs from pyPdf import PdfFileWriter, PdfFileReader -USAGE = '%prog %%name ' + _(''' +USAGE = '\n%prog %%name ' + _('''\ [options] file1.pdf file2.pdf ... Metadata will be used from the first PDF specified. @@ -94,9 +94,17 @@ def main(args=sys.argv, name=''): bad_pdfs = is_valid_pdfs(args) if bad_pdfs != []: for pdf in bad_pdfs: - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf + print 'Error: Could not read file `%s`.' % pdf return 1 - + + enc = False + for pdf in args: + if is_encrypted(pdf): + enc = True + print 'Error: file `%s` is encrypted.' % pdf + if enc: + return 1 + mi = metadata_from_formats([args[0]]) merge_files(args, opts.output, mi) diff --git a/src/calibre/ebooks/pdf/manipulate/reverse.py b/src/calibre/ebooks/pdf/manipulate/reverse.py index 564e523ae3..f2f3fa16da 100644 --- a/src/calibre/ebooks/pdf/manipulate/reverse.py +++ b/src/calibre/ebooks/pdf/manipulate/reverse.py @@ -22,10 +22,10 @@ from calibre.ebooks.pdf.verify import is_valid_pdf from pyPdf import PdfFileWriter, PdfFileReader -USAGE = '%prog %%name ' + _(''' +USAGE = '\n%prog %%name ' + _('''\ [options] file.pdf -Reverse PDF. +Reverse a PDF. ''') OPTIONS = set([ @@ -89,7 +89,11 @@ def main(args=sys.argv, name=''): return 1 if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is encrypted.' % args[0] return 1 mi = metadata_from_formats([args[0]]) diff --git a/src/calibre/ebooks/pdf/manipulate/split.py b/src/calibre/ebooks/pdf/manipulate/split.py index fb7e4d06d7..19012797ae 100644 --- a/src/calibre/ebooks/pdf/manipulate/split.py +++ b/src/calibre/ebooks/pdf/manipulate/split.py @@ -185,7 +185,11 @@ def main(args=sys.argv, name=''): return 1 if not is_valid_pdf(pdf): - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf + print 'Error: Could not read file `%s`.' % pdf + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is encrypted.' % args[0] return 1 pages, page_ranges = clean_page_list(pdf, pages, page_ranges) diff --git a/src/calibre/ebooks/pdf/verify.py b/src/calibre/ebooks/pdf/verify.py index 35f7edf0be..3a8a8073ce 100644 --- a/src/calibre/ebooks/pdf/verify.py +++ b/src/calibre/ebooks/pdf/verify.py @@ -35,3 +35,10 @@ def is_valid_pdfs(pdf_paths): if not is_valid_pdf(pdf_path): invalid.append(pdf_path) return invalid + +def is_encrypted(pdf_path): + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + if pdf.isEncrypted: + return True + return False diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index a42c72866f..fdc2851342 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,6 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.metadata import MetaInformation #from calibre.ebooks.metadata.meta import metadata_from_formats class TXTInput(InputFormatPlugin): @@ -32,7 +31,7 @@ class TXTInput(InputFormatPlugin): index.write(html.encode('utf-8')) #mi = metadata_from_formats([stream.name]) - mi = MetaInformation(_('Unknown'), _('Unknown')) + mi = None opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([('index.html', None)]) opf.create_spine(['index.html']) From a60cd4c5672dbbf0e3273a2612232a0b12c57403 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 4 Apr 2009 19:59:43 -0400 Subject: [PATCH 2/7] Auto convert in GUI started --- src/calibre/gui2/device.py | 49 ++++++++++++++++++++++++++++++++------ src/calibre/gui2/main.py | 1 - src/calibre/gui2/tools.py | 6 +---- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index de11366b3b..ed001c30ba 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -10,12 +10,14 @@ from binascii import unhexlify from PyQt4.Qt import QMenu, QAction, QActionGroup, QIcon, SIGNAL, QPixmap, \ Qt +from calibre.customize.ui import available_input_formats, available_output_formats from calibre.devices import devices from calibre.gui2.dialogs.choose_format import ChooseFormatDialog from calibre.parallel import Job from calibre.devices.scanner import DeviceScanner from calibre.gui2 import config, error_dialog, Dispatcher, dynamic, \ - pixmap_to_data, warning_dialog + pixmap_to_data, warning_dialog, \ + info_dialog from calibre.ebooks.metadata import authors_to_string from calibre.gui2.dialogs.conversion_error import ConversionErrorDialog from calibre.devices.interface import Device @@ -575,10 +577,17 @@ class DeviceGUI(object): def sync_to_device(self, on_card, delete_from_library, - specific_format=None): - rows = self.library_view.selectionModel().selectedRows() + specific_format=None, send_rows=None, auto_convert=True): + rows = self.library_view.selectionModel().selectedRows() if send_rows is None else send_rows if not self.device_manager or not rows or len(rows) == 0: return + + _files, _auto_rows = self.library_view.model().get_preferred_formats(rows, + self.device_manager.device_class.FORMATS, + paths=True, set_metadata=True, + specific_format=specific_format) + rows = list(set(rows).difference(_auto_rows)) + ids = iter(self.library_view.model().id(r) for r in rows) metadata = self.library_view.model().get_metadata(rows) for mi in metadata: @@ -586,10 +595,7 @@ class DeviceGUI(object): if cdata: mi['cover'] = self.cover_to_thumbnail(cdata) metadata = iter(metadata) - _files = self.library_view.model().get_preferred_formats(rows, - self.device_manager.device_class.FORMATS, - paths=True, set_metadata=True, - specific_format=specific_format) + files = [getattr(f, 'name', None) for f in _files] bad, good, gf, names, remove_ids = [], [], [], [], [] for f in files: @@ -615,6 +621,35 @@ class DeviceGUI(object): remove = remove_ids if delete_from_library else [] self.upload_books(gf, names, good, on_card, memory=(_files, remove)) self.status_bar.showMessage(_('Sending books to device.'), 5000) + + auto = [] + if _auto_rows != []: + for row in _auto_rows: + if specific_format == None: + formats = self.library_view.model().db.formats(row).split(',') + formats = formats if formats != None else [] + if set(formats).intersection(available_input_formats()) is not None and set(self.device_manager.device_class.FORMATS).intersection(available_output_formats()) is not None: + auto.append(row) + else: + bad.append(self.library_view.model().title(row)) + else: + if specific_format in available_output_formats(): + auto.append(row) + else: + bad.append(self.library_view.model().title(row)) + + if auto != []: + autos = [self.library_view.model().title(row) for row in auto] + autos = '\n'.join('
  • %s
  • '%(i,) for i in autos) + d = info_dialog(self, _('No suitable formats'), + _('Auto converting the following books before uploading to the device:
      %s
    ')%(autos,)) + for fmt in self.device_manager.device_class.FORMATS: + if fmt in list(set(self.device_manager.device_class.FORMATS).intersection(set(available_output_formats()))): + format = fmt + break + self.auto_convert(_auto_rows, on_card, format) + d.exec_() + if bad: bad = '\n'.join('
  • %s
  • '%(i,) for i in bad) d = warning_dialog(self, _('No suitable formats'), diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index f1f1e674b7..dcece08a3e 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -979,7 +979,6 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): comics.append(r) else: others.append(r) - jobs, changed, bad_rows = auto_convert_ebook(format, self, self.library_view.model().db, comics, others) for func, args, desc, fmt, id, temp_files in jobs: if id not in bad_rows: diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 0bf78ffaa7..07587d3c25 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -505,11 +505,7 @@ def fetch_scheduled_recipe(recipe, script): return 'feeds2'+fmt, [args], _('Fetch news from ')+recipe.title, fmt.upper(), [pt] def auto_convert_ebook(*args): - fmt = args[0] if args[0] else 'epub' - if fmt == 'lrf': - return auto_convert_lrf() - elif fmt in ('epub', 'mobi'): - return auto_convert(*args) + return auto_convert(*args) def convert_single_ebook(*args): fmt = prefs['output_format'].lower() From c1a37749a67e08a9b699298d644fd78a6810336e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 5 Apr 2009 00:01:37 -0400 Subject: [PATCH 3/7] Auto convert in GUI working --- src/calibre/gui2/device.py | 22 +++--- src/calibre/gui2/library.py | 6 +- src/calibre/gui2/main.py | 18 ++--- src/calibre/gui2/tools.py | 147 ++++++++++++++++-------------------- src/calibre/parallel.py | 3 + 5 files changed, 88 insertions(+), 108 deletions(-) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index ed001c30ba..46cf9895d4 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -577,7 +577,7 @@ class DeviceGUI(object): def sync_to_device(self, on_card, delete_from_library, - specific_format=None, send_rows=None, auto_convert=True): + specific_format=None, send_rows=None, do_auto_convert=True): rows = self.library_view.selectionModel().selectedRows() if send_rows is None else send_rows if not self.device_manager or not rows or len(rows) == 0: return @@ -585,8 +585,12 @@ class DeviceGUI(object): _files, _auto_rows = self.library_view.model().get_preferred_formats(rows, self.device_manager.device_class.FORMATS, paths=True, set_metadata=True, - specific_format=specific_format) - rows = list(set(rows).difference(_auto_rows)) + specific_format=specific_format, + exclude_auto=do_auto_convert) + if do_auto_convert: + rows = list(set(rows).difference(_auto_rows)) + else: + _auto_rows = [] ids = iter(self.library_view.model().id(r) for r in rows) metadata = self.library_view.model().get_metadata(rows) @@ -626,9 +630,9 @@ class DeviceGUI(object): if _auto_rows != []: for row in _auto_rows: if specific_format == None: - formats = self.library_view.model().db.formats(row).split(',') - formats = formats if formats != None else [] - if set(formats).intersection(available_input_formats()) is not None and set(self.device_manager.device_class.FORMATS).intersection(available_output_formats()) is not None: + formats = [f.lower() for f in self.library_view.model().db.formats(row).split(',')] + formats = formats if formats != None else [] + if list(set(formats).intersection(available_input_formats())) != [] and list(set(self.device_manager.device_class.FORMATS).intersection(available_output_formats())) != []: auto.append(row) else: bad.append(self.library_view.model().title(row)) @@ -646,10 +650,10 @@ class DeviceGUI(object): for fmt in self.device_manager.device_class.FORMATS: if fmt in list(set(self.device_manager.device_class.FORMATS).intersection(set(available_output_formats()))): format = fmt - break + break + d.exec_() self.auto_convert(_auto_rows, on_card, format) - d.exec_() - + if bad: bad = '\n'.join('
  • %s
  • '%(i,) for i in bad) d = warning_dialog(self, _('No suitable formats'), diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py index 1f3ed31478..c67f9bc1b0 100644 --- a/src/calibre/gui2/library.py +++ b/src/calibre/gui2/library.py @@ -420,7 +420,8 @@ class BooksModel(QAbstractTableModel): def get_preferred_formats(self, rows, formats, paths=False, - set_metadata=False, specific_format=None): + set_metadata=False, specific_format=None, + exclude_auto=False): ans = [] need_auto = [] if specific_format is not None: @@ -448,7 +449,8 @@ class BooksModel(QAbstractTableModel): ans.append(pt) else: need_auto.append(row) - ans.append(None) + if not exclude_auto: + ans.append(None) return ans, need_auto def id(self, row): diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index dcece08a3e..fee500bdb9 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -969,17 +969,9 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): def auto_convert(self, rows, on_card, format): previous = self.library_view.currentIndex() - comics, others = [], [] - db = self.library_view.model().db - for r in rows: - formats = db.formats(r) - if not formats: continue - formats = formats.lower().split(',') - if 'cbr' in formats or 'cbz' in formats: - comics.append(r) - else: - others.append(r) - jobs, changed, bad_rows = auto_convert_ebook(format, self, self.library_view.model().db, comics, others) + jobs, changed, bad_rows = auto_convert_ebook(format, self, self.library_view.model().db, rows) + if jobs is None: + return for func, args, desc, fmt, id, temp_files in jobs: if id not in bad_rows: job = self.job_manager.run_job(Dispatcher(self.book_auto_converted), @@ -1063,7 +1055,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): if job.exception is not None: self.job_exception(job) return - data = open(temp_files[-1].name, 'rb') + data = open(temp_files[0].name, 'rb') self.library_view.model().db.add_format(book_id, fmt, data, index_is_id=True) data.close() self.status_bar.showMessage(job.description + (' completed'), 2000) @@ -1080,7 +1072,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): self.library_view.model().current_changed(current, QModelIndex()) r = self.library_view.model().index(self.library_view.model().db.row(book_id), 0) - self.sync_to_device(on_card, False, specific_format=fmt, send_rows=[r], auto_convert=False) + self.sync_to_device(on_card, False, specific_format=fmt, send_rows=[r], do_auto_convert=False) def book_converted(self, job): temp_files, fmt, book_id = self.conversion_jobs.pop(job) diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 07587d3c25..e6bbf543e1 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -9,6 +9,7 @@ Logic for setting up conversion jobs import os from PyQt4.Qt import QDialog +from calibre.customize.ui import available_input_formats from calibre.utils.config import prefs from calibre.gui2.dialogs.lrf_single import LRFSingleDialog, LRFBulkDialog from calibre.gui2.dialogs.epub import Config as EPUBConvert @@ -22,6 +23,11 @@ from calibre.ebooks.epub.from_any import SOURCE_FORMATS as EPUB_PREFERRED_SOURCE from calibre.ebooks.mobi.from_any import config as mobiconfig from calibre.ebooks.lrf.comic.convert_from import config as comicconfig +# Ordered list of source formats. Items closer to the beginning are +# preferred for conversion over those toward the end. +PREFERRED_SOURCE_FORMATS = ['epub', 'lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', + 'txt', 'pdf', 'oebzip', 'htm', 'html'] + def get_dialog(fmt): return { 'epub':EPUBConvert, @@ -34,101 +40,77 @@ def get_config(fmt): 'mobi':mobiconfig, }[fmt] -def auto_convert(fmt, parent, db, comics, others): +def auto_convert(fmt, parent, db, rows): changed = False jobs = [] - total = sum(map(len, (others, comics))) + total = len(rows) if total == 0: - return + return None, None, None parent.status_bar.showMessage(_('Starting auto conversion of %d books')%total, 2000) i = 0 bad_rows = [] - for i, row in enumerate(others+comics): + for i, row in enumerate(rows): row_id = db.id(row) - if row in others: - temp_files = [] - - data = None - for _fmt in EPUB_PREFERRED_SOURCE_FORMATS: - try: - data = db.format(row, _fmt.upper()) - if data is not None: - break - except: - continue - if data is None: - bad_rows.append(row) - continue + temp_files = [] - defaults = db.conversion_options(db.id(row), fmt) - defaults = defaults if defaults else '' - options = get_config(fmt)(defaults=defaults).parse() - - mi = db.get_metadata(row) - opf = OPFCreator(os.getcwdu(), mi) - opf_file = PersistentTemporaryFile('.opf') - opf.render(opf_file) - opf_file.close() - pt = PersistentTemporaryFile('.'+_fmt.lower()) - pt.write(data) - pt.close() - of = PersistentTemporaryFile('.'+fmt) - of.close() - cover = db.cover(row) - cf = None - if cover: - cf = PersistentTemporaryFile('.jpeg') - cf.write(cover) - cf.close() - options.cover = cf.name - options.output = of.name - options.from_opf = opf_file.name - args = [options, pt.name] - desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) - temp_files = [cf] if cf is not None else [] - temp_files.extend([opf_file, pt, of]) - jobs.append(('any2'+fmt, args, desc, fmt.upper(), row_id, temp_files)) - - changed = True - else: - defaults = db.conversion_options(db.id(row), fmt) - defaults = defaults if defaults else '' - options = comicconfig(defaults=defaults).parse() - - mi = db.get_metadata(row) - if mi.title: - options.title = mi.title - if mi.authors: - options.author = ','.join(mi.authors) - data = None - for _fmt in ['cbz', 'cbr']: - try: - data = db.format(row, _fmt.upper()) - if data is not None: - break - except: - continue - - if data is None: + data = None + in_formats = [f.lower() for f in db.formats(row).split(',')] + in_formats = list(set(in_formats).intersection(available_input_formats())) + for _fmt in PREFERRED_SOURCE_FORMATS: + if _fmt in in_formats: + data = _fmt + break + if data is None: + if in_formats != []: + data = list(in_formats)[0] + else: bad_rows.append(row) continue - - pt = PersistentTemporaryFile('.'+_fmt.lower()) - pt.write(data) - pt.close() - of = PersistentTemporaryFile('.'+fmt) - of.close() - setattr(options, 'output', of.name) - options.verbose = 1 - args = [pt.name, options] - desc = _('Convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) - jobs.append(('comic2'+fmt, args, desc, fmt.upper(), row_id, [pt, of])) - - changed = True + +# defaults = db.conversion_options(db.id(row), fmt) +# defaults = defaults if defaults else '' +# options = get_config(fmt)(defaults=defaults).parse() + +# mi = db.get_metadata(row) +# opf = OPFCreator(os.getcwdu(), mi) +# opf_file = PersistentTemporaryFile('.opf') +# opf.render(opf_file) +# opf_file.close() +# pt = PersistentTemporaryFile('.'+_fmt.lower()) +# pt.write(data) +# pt.close() +# of = PersistentTemporaryFile('.'+fmt) +# of.close() +# cover = db.cover(row) +# cf = None +# if cover: +# cf = PersistentTemporaryFile('.jpeg') +# cf.write(cover) +# cf.close() +# options.cover = cf.name +# options.output = of.name +# options.from_opf = opf_file.name +# args = [options, pt.name] +# desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) +# temp_files = [cf] if cf is not None else [] +# temp_files.extend([opf_file, pt, of]) +# jobs.append(('any2'+fmt, args, desc, fmt.upper(), row_id, temp_files)) + + mi = db.get_metadata(row) + in_file = db.format_abspath(row, data) + out_file = PersistentTemporaryFile('.'+fmt.lower()) + out_file.write(data) + out_file.close() + desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) + args = [['', in_file, out_file.name]] + temp_files = [out_file] + jobs.append(('ebook-convert', args, desc, fmt.upper(), row_id, temp_files)) + + changed = True if bad_rows: res = [] @@ -141,9 +123,6 @@ def auto_convert(fmt, parent, db, comics, others): return jobs, changed, bad_rows -def auto_convert_lrf(fmt, parent, db, comics, others): - pass - def convert_single(fmt, parent, db, comics, others): changed = False jobs = [] diff --git a/src/calibre/parallel.py b/src/calibre/parallel.py index 4969877da9..90a2969c86 100644 --- a/src/calibre/parallel.py +++ b/src/calibre/parallel.py @@ -79,6 +79,9 @@ PARALLEL_FUNCS = { 'comic2mobi' : ('calibre.ebooks.mobi.from_comic', 'convert', {}, 'notification'), + + 'ebook-convert' : + ('calibre.ebooks.conversion.cli', 'main', {}, None), } From 011e2811d28c991e5a4a4715999a6b15827c8a39 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 5 Apr 2009 09:08:43 -0400 Subject: [PATCH 4/7] Metadata reading and writing for TXT/PDF input/output. --- src/calibre/ebooks/pdf/input.py | 6 ++---- src/calibre/ebooks/pdf/output.py | 4 ++-- src/calibre/ebooks/pdf/writer.py | 20 +++++++++++++++++--- src/calibre/ebooks/txt/input.py | 7 +++---- src/calibre/ebooks/txt/output.py | 2 +- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 6f55b71dd5..edbc2d6b30 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -10,8 +10,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdf.pdftohtml import pdftohtml from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.metadata import MetaInformation -#from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.customize.builtins import PDFMetadataReader class PDFInput(InputFormatPlugin): @@ -27,8 +26,7 @@ class PDFInput(InputFormatPlugin): with open('index.html', 'wb') as index: index.write(html) - #mi = metadata_from_formats([stream.name]) - mi = MetaInformation(_('Unknown'), _('Unknown')) + mi = PDFMetadataReader(None).get_metadata(stream, 'pdf') opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([('index.html', None)]) opf.create_spine(['index.html']) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 230beed9ae..65af40dc51 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -17,7 +17,7 @@ from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation from calibre.ebooks.oeb.output import OEBOutput from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.pdf.writer import PDFWriter +from calibre.ebooks.pdf.writer import PDFWriter, PDFMetadata from calibre.ebooks.pdf.pageoptions import UNITS, unit, PAPER_SIZES, \ paper_size, ORIENTATIONS, orientation, PageOptions @@ -88,7 +88,7 @@ class PDFOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - writer.dump(opf, out_stream) + writer.dump(opf, out_stream, PDFMetadata(oeb_book.metadata)) if close: out_stream.close() diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 2aebd7322c..7d0a690856 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -13,6 +13,7 @@ import os, shutil, sys from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ebooks.pdf.pageoptions import PageOptions +from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata.opf2 import OPF from PyQt4 import QtCore @@ -22,6 +23,18 @@ from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader +class PDFMetadata(object): + def __init__(self, oeb_metadata=None): + self.title = _('Unknown') + self.author = _('Unknown') + + if oeb_metadata != None: + if len(oeb_metadata.title) >= 1: + self.title = oeb_metadata.title[0].value + if len(oeb_metadata.creator) >= 1: + self.author = authors_to_string([x.value for x in oeb_metadata.creator]) + + class PDFWriter(QObject): def __init__(self, log, popts=PageOptions()): if QApplication.instance() is None: @@ -37,8 +50,9 @@ class PDFWriter(QObject): self.combine_queue = [] self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') self.popts = popts - - def dump(self, opfpath, out_stream): + + def dump(self, opfpath, out_stream, pdf_metadata): + self.metadata = pdf_metadata self._delete_tmpdir() opf = OPF(opfpath, os.path.dirname(opfpath)) @@ -88,7 +102,7 @@ class PDFWriter(QObject): self.logger.info('Combining individual PDF parts...') try: - outPDF = PdfFileWriter() + outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) for item in self.combine_queue: inputPDF = PdfFileReader(file(item, 'rb')) for page in inputPDF.pages: diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index fdc2851342..69d9c09da5 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf import OPFCreator -#from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.customize.builtins import TXTMetadataReader class TXTInput(InputFormatPlugin): @@ -26,12 +26,11 @@ class TXTInput(InputFormatPlugin): md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], safe_mode=False,) - html = ''+md.convert(txt)+'' + html = '</head><body>'+md.convert(txt)+'</body></html>' with open('index.html', 'wb') as index: index.write(html.encode('utf-8')) - #mi = metadata_from_formats([stream.name]) - mi = None + mi = TXTMetadataReader(None).get_metadata(stream, 'txt') opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([('index.html', None)]) opf.create_spine(['index.html']) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 2d1ef98662..423e668a56 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -34,7 +34,7 @@ class TXTOutput(OutputFormatPlugin): def convert(self, oeb_book, output_path, input_plugin, opts, log): metadata = TxtMetadata() if opts.prepend_metadata.lower() == 'true': - metadata.author = opts.authors if opts.authors else authors_to_string(oeb_book.metadata.authors.value) if oeb_book.metadata.authors != [] else _('Unknown') + metadata.author = opts.authors if opts.authors else authors_to_string([x.value for x in oeb_book.metadata.creator]) if oeb_book.metadata.creator != [] else _('Unknown') metadata.title = opts.title if opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') writer = TxtWriter(TxtNewlines(opts.newline).newline, log) From 3b09d017016ab40685b32232f32294ca26d701c8 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 5 Apr 2009 19:43:59 -0400 Subject: [PATCH 5/7] TXT input encoding option honored --- src/calibre/ebooks/txt/input.py | 5 ++++- src/calibre/gui2/tools.py | 29 ----------------------------- 2 files changed, 4 insertions(+), 30 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 69d9c09da5..e161f6b9bd 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -21,7 +21,10 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): - txt = stream.read() + ienc = stream.encoding if stream.encoding else 'utf-8' + if options.input_encoding: + ienc = options.input_encoding + txt = stream.read().decode(ienc) md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index e6bbf543e1..d004dcb502 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -70,35 +70,6 @@ def auto_convert(fmt, parent, db, rows): else: bad_rows.append(row) continue - -# defaults = db.conversion_options(db.id(row), fmt) -# defaults = defaults if defaults else '' -# options = get_config(fmt)(defaults=defaults).parse() - -# mi = db.get_metadata(row) -# opf = OPFCreator(os.getcwdu(), mi) -# opf_file = PersistentTemporaryFile('.opf') -# opf.render(opf_file) -# opf_file.close() -# pt = PersistentTemporaryFile('.'+_fmt.lower()) -# pt.write(data) -# pt.close() -# of = PersistentTemporaryFile('.'+fmt) -# of.close() -# cover = db.cover(row) -# cf = None -# if cover: -# cf = PersistentTemporaryFile('.jpeg') -# cf.write(cover) -# cf.close() -# options.cover = cf.name -# options.output = of.name -# options.from_opf = opf_file.name -# args = [options, pt.name] -# desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) -# temp_files = [cf] if cf is not None else [] -# temp_files.extend([opf_file, pt, of]) -# jobs.append(('any2'+fmt, args, desc, fmt.upper(), row_id, temp_files)) mi = db.get_metadata(row) in_file = db.format_abspath(row, data) From 383fe33adb0921f5355b901a1039b7848262b406 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 8 Apr 2009 19:51:56 -0400 Subject: [PATCH 6/7] process pdf input html output a bit. --- src/calibre/ebooks/pdf/pdftohtml.py | 35 +++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index e7707479c3..0f6581dea6 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -6,7 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \ '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import errno, os, sys, subprocess +import errno, os, re, sys, subprocess from functools import partial from calibre.ebooks import ConversionError, DRMError @@ -24,6 +24,32 @@ if iswindows and hasattr(sys, 'frozen'): if islinux and getattr(sys, 'frozen_path', False): PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') +# Fix pdftohtml markup +PDFTOHTML_RULES = [ + # Remove <hr> tags + (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), + # Remove page numbers + (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''), + # Remove <br> and replace <br><br> with <p> + (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'), + (re.compile(r'(.*)<br.*?>', re.IGNORECASE), + lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 + else match.group(1)), + # Remove hyphenation + (re.compile(r'-\n\r?'), lambda match: ''), + + # Remove gray background + (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), + + # Remove non breaking spaces + (re.compile(ur'\u00a0'), lambda match : ' '), + + # Add second <br /> after first to allow paragraphs to show better + (re.compile(r'<br.*?>'), lambda match : '<br /><br />'), + + ] + + def pdftohtml(pdf_path): ''' Convert the pdf into html using the pdftohtml app. @@ -72,4 +98,9 @@ def pdftohtml(pdf_path): if not '<br' in raw[:4000]: raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True) - return '<!-- created by calibre\'s pdftohtml -->\n' + raw + return '<!-- created by calibre\'s pdftohtml -->\n' + processed_html(raw) + +def processed_html(html): + for rule in PDFTOHTML_RULES: + html = rule[0].sub(rule[1], html) + return html From 902272b6bc6f0767241692543fff417fdc0174d6 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 8 Apr 2009 20:53:45 -0400 Subject: [PATCH 7/7] a bit of preprocessing work --- src/calibre/ebooks/conversion/preprocess.py | 4 ++- src/calibre/ebooks/pdf/pdftohtml.py | 33 +-------------------- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f544a331d8..bb8ee90364 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -66,7 +66,9 @@ class HTMLPreProcessor(object): # Remove non breaking spaces (re.compile(ur'\u00a0'), lambda match : ' '), - + + # Have paragraphs show better + (re.compile(r'<br.*?>'), lambda match : '<p>'), ] # Fix Book Designer markup diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 0f6581dea6..e03d7d0647 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -24,32 +24,6 @@ if iswindows and hasattr(sys, 'frozen'): if islinux and getattr(sys, 'frozen_path', False): PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') -# Fix pdftohtml markup -PDFTOHTML_RULES = [ - # Remove <hr> tags - (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), - # Remove page numbers - (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''), - # Remove <br> and replace <br><br> with <p> - (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'), - (re.compile(r'(.*)<br.*?>', re.IGNORECASE), - lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 - else match.group(1)), - # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), - - # Remove gray background - (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), - - # Remove non breaking spaces - (re.compile(ur'\u00a0'), lambda match : ' '), - - # Add second <br /> after first to allow paragraphs to show better - (re.compile(r'<br.*?>'), lambda match : '<br /><br />'), - - ] - - def pdftohtml(pdf_path): ''' Convert the pdf into html using the pdftohtml app. @@ -98,9 +72,4 @@ def pdftohtml(pdf_path): if not '<br' in raw[:4000]: raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True) - return '<!-- created by calibre\'s pdftohtml -->\n' + processed_html(raw) - -def processed_html(html): - for rule in PDFTOHTML_RULES: - html = rule[0].sub(rule[1], html) - return html + return '<!-- created by calibre\'s pdftohtml -->\n' + raw