From a747879afc91c88e96993740852e4ea19c0e569d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 20 May 2010 22:29:35 -0600 Subject: [PATCH] Conversion pipeline: calibre will now automatically replace all ligatures in the input document with the normal character sequence they are meant to represent. This can be turned off via an option under Look & Feel, in the Conversion settings. Fixes #5553 (Text errors converting from .pdf) --- src/calibre/ebooks/conversion/plumber.py | 12 ++ src/calibre/ebooks/conversion/preprocess.py | 21 ++++ src/calibre/gui2/convert/look_and_feel.py | 2 +- src/calibre/gui2/convert/look_and_feel.ui | 126 +++++++++----------- 4 files changed, 92 insertions(+), 69 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 1034511016..e227ad2c8e 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -411,6 +411,18 @@ OptionRecommendation(name='asciiize', ) ), +OptionRecommendation(name='keep_ligatures', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Preserve ligatures present in the input document. ' + 'A ligature is a special rendering of a pair of ' + 'characters like ff, fi, fl et cetera. ' + 'Most readers do not have support for ' + 'ligatures in their default fonts, so they are ' + 'unlikely to render correctly. By default, calibre ' + 'will turn a ligature into the corresponding pair of normal ' + 'characters. 
This option will preserve them instead.') + ), + OptionRecommendation(name='title', recommended_value=None, level=OptionRecommendation.LOW, help=_('Set the title.')), diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index ada4f1a3af..a42f0fc73b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -18,6 +18,24 @@ convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp', '#60', '#62']) _span_pat = re.compile('', re.DOTALL|re.IGNORECASE) +LIGATURES = { + u'\u00c6': u'AE', + u'\u00e6': u'ae', + u'\u0152': u'OE', + u'\u0153': u'oe', + u'\u0132': u'IJ', + u'\u0133': u'ij', + u'\u1D6B': u'ue', + u'\uFB00': u'ff', + u'\uFB01': u'fi', + u'\uFB02': u'fl', + u'\uFB03': u'ffi', + u'\uFB04': u'ffl', + u'\uFB05': u'ft', + u'\uFB06': u'st', + } + +_ligpat = re.compile(u'|'.join(LIGATURES)) def sanitize_head(match): x = match.group(1) @@ -228,6 +246,9 @@ class HTMLPreProcessor(object): else: rules = [] + if not self.extra_opts.keep_ligatures: + html = _ligpat.sub(lambda m:LIGATURES[m.group()], html) + end_rules = [] if getattr(self.extra_opts, 'remove_header', None): try: diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py index 8ef1f77351..e18657cf69 100644 --- a/src/calibre/gui2/convert/look_and_feel.py +++ b/src/calibre/gui2/convert/look_and_feel.py @@ -24,7 +24,7 @@ class LookAndFeelWidget(Widget, Ui_Form): 'linearize_tables', 'disable_font_rescaling', 'insert_blank_line', 'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding', - 'asciiize'] + 'asciiize', 'keep_ligatures'] ) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index 6fbf4e11cd..764226012b 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui 
+++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -31,7 +31,7 @@ - + pt @@ -63,7 +63,7 @@ - + @@ -84,7 +84,7 @@ ... - + :/images/wizard.svg:/images/wizard.svg @@ -107,7 +107,7 @@ - + pt @@ -127,60 +127,50 @@ - + - - + + + + Remove &spacing between paragraphs + + + + + - + - Remove &spacing between paragraphs + Indent size: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - Qt::Horizontal + + + <p>When calibre removes inter paragraph spacing, it automatically sets a paragraph indent, to ensure that paragraphs can be easily distinguished. This option controls the width of that indent. - - - 40 - 20 - + + em - - - - - - - - Indent size: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - - - - - <p>When calibre removes inter paragraph spacing, it automatically sets a paragraph indent, to ensure that paragraphs can be easily distinguished. This option controls the width of that indent. - - - em - - - 1 - - - - + + 1 + + + + + + Text justification: + + + @@ -188,14 +178,7 @@ - - - - &Transliterate unicode characters to ASCII. - - - - + Extra &CSS @@ -207,21 +190,7 @@ - - - - Insert &blank line - - - - - - - Text justification: - - - - + 2 @@ -243,6 +212,27 @@ + + + + &Transliterate unicode characters to ASCII + + + + + + + Insert &blank line + + + + + + + Keep &ligatures + + +