TXT Output: When using preserve spaces, output tab characters as a sequence of four non-breaking spaces, as some readers don't handle the \x09 char code. Fix #6770 (Problem converting pmlz to epub). PDF Input: More Unicode character matching.

This commit is contained in:
Kovid Goyal 2010-09-11 11:58:32 -06:00
commit 1e77e6538f
3 changed files with 15 additions and 3 deletions

View File

@ -166,6 +166,17 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
# ` with letter before
(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
(re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
(re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
(re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
(re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
(re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),
# ´ # ´
(re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),

View File

@ -207,6 +207,7 @@ class PML_HTMLizer(object):
while html != old: while html != old:
old = html old = html
html = self.cleanup_html_remove_redundant(html) html = self.cleanup_html_remove_redundant(html)
html = re.sub(r'(?imu)^\s*', '', html)
return html return html
def cleanup_html_remove_redundant(self, html): def cleanup_html_remove_redundant(self, html):
@ -216,7 +217,7 @@ class PML_HTMLizer(object):
html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html) html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
else: else:
html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html) html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
html = re.sub(r'<p>\s*</p>', '', html) html = re.sub(r'(?imu)<p>\s*</p>', '', html)
return html return html
def start_line(self): def start_line(self):
@ -556,7 +557,7 @@ class PML_HTMLizer(object):
text = t text = t
else: else:
self.toc.add_item(os.path.basename(self.file_name), id, value) self.toc.add_item(os.path.basename(self.file_name), id, value)
text = '<span id="%s"></span>%s' % (id, t) text = '%s<span id="%s"></span>' % (t, id)
elif c == 'm': elif c == 'm':
empty = False empty = False
src = self.code_value(line) src = self.code_value(line)

View File

@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt):
def preserve_spaces(txt): def preserve_spaces(txt):
txt = txt.replace(' ', '&nbsp;') txt = txt.replace(' ', '&nbsp;')
txt = txt.replace('\t', '&#09;') txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
return txt return txt
def opf_writer(path, opf_name, manifest, spine, mi): def opf_writer(path, opf_name, manifest, spine, mi):