From fec88e8d4e732822be549a9c332233c4743f80e9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 9 Oct 2008 11:57:11 -0700 Subject: [PATCH] Fix #1145 (Soft Hyphens are not removed when placed after a "-") --- src/calibre/ebooks/epub/__init__.py | 6 +++--- src/calibre/ebooks/html.py | 9 ++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 66712b98b4..6132505dc2 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' ''' Conversion to EPUB. ''' -import sys, textwrap +import sys, textwrap, re from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED from calibre.ebooks.html import config as common_config, tostring @@ -15,13 +15,13 @@ class DefaultProfile(object): flow_size = sys.maxint screen_size = None - remove_soft_hyphens = False + remove_special_chars = False class PRS505(DefaultProfile): flow_size = 300000 screen_size = (600, 775) - remove_soft_hyphens = True + remove_special_chars = re.compile(u'[\u200b\u00ad]') PROFILES = { diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index e3f8f516e1..548e5874e4 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -316,7 +316,10 @@ def opf_traverse(opf_reader, verbose=0, encoding=None): class PreProcessor(object): - PREPROCESS = [(re.compile(r'&(\S+?);'), entity_to_unicode)] + PREPROCESS = [ + # Convert all entities, since lxml doesn't handle them well + (re.compile(r'&(\S+?);'), entity_to_unicode), + ] # Fix pdftohtml markup PDFTOHTML = [ @@ -365,8 +368,8 @@ class PreProcessor(object): def preprocess(self, html): opts = getattr(self, 'opts', False) - if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_soft_hyphens', False): - html = html.replace(u'\u00ad', '') + if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False): + html = opts.profile.remove_special_chars.sub('', html) if self.is_baen(html): rules = [] elif self.is_book_designer(html):