Fix #1145 (Soft Hyphens are not removed when placed after a "-")

This commit is contained in:
Kovid Goyal 2008-10-09 11:57:11 -07:00
parent 92ba788069
commit fec88e8d4e
2 changed files with 9 additions and 6 deletions

View File

@@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
''' '''
Conversion to EPUB. Conversion to EPUB.
''' '''
import sys, textwrap import sys, textwrap, re
from calibre.utils.config import Config, StringConfig from calibre.utils.config import Config, StringConfig
from calibre.utils.zipfile import ZipFile, ZIP_STORED from calibre.utils.zipfile import ZipFile, ZIP_STORED
from calibre.ebooks.html import config as common_config, tostring from calibre.ebooks.html import config as common_config, tostring
@@ -15,13 +15,13 @@ class DefaultProfile(object):
flow_size = sys.maxint flow_size = sys.maxint
screen_size = None screen_size = None
remove_soft_hyphens = False remove_special_chars = False
class PRS505(DefaultProfile): class PRS505(DefaultProfile):
flow_size = 300000 flow_size = 300000
screen_size = (600, 775) screen_size = (600, 775)
remove_soft_hyphens = True remove_special_chars = re.compile(u'[\u200b\u00ad]')
PROFILES = { PROFILES = {

View File

@@ -316,7 +316,10 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
class PreProcessor(object): class PreProcessor(object):
PREPROCESS = [(re.compile(r'&(\S+?);'), entity_to_unicode)] PREPROCESS = [
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), entity_to_unicode),
]
# Fix pdftohtml markup # Fix pdftohtml markup
PDFTOHTML = [ PDFTOHTML = [
@@ -365,8 +368,8 @@ class PreProcessor(object):
def preprocess(self, html): def preprocess(self, html):
opts = getattr(self, 'opts', False) opts = getattr(self, 'opts', False)
if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_soft_hyphens', False): if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False):
html = html.replace(u'\u00ad', '') html = opts.profile.remove_special_chars.sub('', html)
if self.is_baen(html): if self.is_baen(html):
rules = [] rules = []
elif self.is_book_designer(html): elif self.is_book_designer(html):