From 86e68579f32972a2424771a7f3e84d046d630283 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Sep 2010 08:39:40 -0400 Subject: [PATCH 1/5] PDF Input: Fix bug #6734, add additional matching for unicode characters. --- src/calibre/ebooks/conversion/preprocess.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f7b803974f..256bcce6fc 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -166,6 +166,17 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*()*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*()*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*()*\s*U', re.UNICODE), lambda match: u'Ù'), + # ` with letter before + (re.compile(u'a\s*()*\s*`', re.UNICODE), lambda match: u'à'), + (re.compile(u'A\s*()*\s*`', re.UNICODE), lambda match: u'À'), + (re.compile(u'e\s*()*\s*`', re.UNICODE), lambda match: u'è'), + (re.compile(u'E\s*()*\s*`', re.UNICODE), lambda match: u'È'), + (re.compile(u'i\s*()*\s*`', re.UNICODE), lambda match: u'ì'), + (re.compile(u'I\s*()*\s*`', re.UNICODE), lambda match: u'Ì'), + (re.compile(u'o\s*()*\s*`', re.UNICODE), lambda match: u'ò'), + (re.compile(u'O\s*()*\s*`', re.UNICODE), lambda match: u'Ò'), + (re.compile(u'u\s*()*\s*`', re.UNICODE), lambda match: u'ù'), + (re.compile(u'U\s*()*\s*`', re.UNICODE), lambda match: u'Ù'), # ´ (re.compile(u'´\s*()*\s*a', re.UNICODE), lambda match: u'á'), From 96478da323e642febb94c2c1a2c9826a6b3dddb7 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Sep 2010 08:48:47 -0400 Subject: [PATCH 2/5] PML Input: Fix cleanup code. 
--- src/calibre/ebooks/pml/pmlconverter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 166695ff5c..3a4454725a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -216,7 +216,7 @@ class PML_HTMLizer(object): html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html) else: html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html) - html = re.sub(r'

\s*

', '', html) + html = re.sub(r'(?imu)

\s*

', '', html) return html def start_line(self): From dc7bc5dd5d890278d7f43377e9df944675888fc6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Sep 2010 09:01:34 -0400 Subject: [PATCH 3/5] PML Input: Fix bug #6770, put toc link after header so toc link goes to correct page. --- src/calibre/ebooks/pml/pmlconverter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 3a4454725a..6e479a71ef 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -556,7 +556,7 @@ class PML_HTMLizer(object): text = t else: self.toc.add_item(os.path.basename(self.file_name), id, value) - text = '%s' % (id, t) + text = '%s' % (t, id) elif c == 'm': empty = False src = self.code_value(line) From c2b3c445e17a38b5599393c943036c6c448886da Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Sep 2010 09:09:08 -0400 Subject: [PATCH 4/5] PML Input: Remove empty lines. --- src/calibre/ebooks/pml/pmlconverter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 6e479a71ef..b0fc15197a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -207,6 +207,7 @@ class PML_HTMLizer(object): while html != old: old = html html = self.cleanup_html_remove_redundant(html) + html = re.sub(r'(?imu)^\s*', '', html) return html def cleanup_html_remove_redundant(self, html): From ef8408869cebac380474deb971c4b6910680c895 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Sep 2010 09:13:23 -0400 Subject: [PATCH 5/5] TXT Output: preserve spaces, handle tab character correctly. A tab character is reduced to a single space by many renderers. 
--- src/calibre/ebooks/txt/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index a12e8a0761..dac1e34df7 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt): def preserve_spaces(txt): txt = txt.replace(' ', ' ') - txt = txt.replace('\t', ' ') + txt = txt.replace('\t', '    ') return txt def opf_writer(path, opf_name, manifest, spine, mi):