From 2bd8cb905902a53e571e006cbf1504a00668acb5 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 30 Jul 2009 22:12:14 -0400 Subject: [PATCH] PML to HTML conversion fixes. --- src/calibre/ebooks/conversion/preprocess.py | 12 ++++++ src/calibre/ebooks/pdb/ereader/reader132.py | 11 +++-- src/calibre/ebooks/pdb/ereader/reader202.py | 12 +++--- src/calibre/ebooks/pml/pmlconverter.py | 47 +++++++++++---------- 4 files changed, 50 insertions(+), 32 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 739f8ff30c..6dc4b9143c 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -114,6 +114,18 @@ class HTMLPreProcessor(object): (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), + # Fix French accents + (re.compile(u'`\s*()*\s*o', re.UNICODE), lambda match: u'ò'), + (re.compile(u'`\s*()*\s*O', re.UNICODE), lambda match: u'Ò'), + (re.compile(u'`\s*()*\s*u', re.UNICODE), lambda match: u'ù'), + (re.compile(u'`\s*()*\s*U', re.UNICODE), lambda match: u'Ù'), + (re.compile(u'`\s*()*\s*e', re.UNICODE), lambda match: u'è'), + (re.compile(u'`\s*()*\s*E', re.UNICODE), lambda match: u'È'), + (re.compile(u'`\s*()*\s*i', re.UNICODE), lambda match: u'ì'), + (re.compile(u'`\s*()*\s*I', re.UNICODE), lambda match: u'Ì'), + (re.compile(u'`\s*()*\s*a', re.UNICODE), lambda match: u'à'), + (re.compile(u'`\s*()*\s*A', re.UNICODE), lambda match: u'À'), + # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py index 52d4778561..a1d1f4294d 100644 --- a/src/calibre/ebooks/pdb/ereader/reader132.py +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -19,8 +19,6 @@ from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html -from calibre.ebooks.pml.pmlconverter import pml_to_html class HeaderRecord(object): ''' @@ -99,16 +97,21 @@ class Reader132(FormatReader): return self.decompress_text(number) def extract_content(self, output_dir): + from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html + from calibre.ebooks.pml.pmlconverter import pml_to_html + output_dir = os.path.abspath(output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) - html = u'' + html = u'%s' % self.mi.title + pml = u'' for i in range(1, self.header_record.num_text_pages + 1): self.log.debug('Extracting text page %i' % i) - html += pml_to_html(self.get_text_page(i)) + pml += self.get_text_page(i) + html += pml_to_html(pml) if self.header_record.footnote_rec > 0: html += '

%s

' % _('Footnotes') diff --git a/src/calibre/ebooks/pdb/ereader/reader202.py b/src/calibre/ebooks/pdb/ereader/reader202.py index 0b394fb765..5057df363e 100644 --- a/src/calibre/ebooks/pdb/ereader/reader202.py +++ b/src/calibre/ebooks/pdb/ereader/reader202.py @@ -12,7 +12,6 @@ import struct from calibre import CurrentDir from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.pml.pmlconverter import pml_to_html from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError @@ -81,19 +80,20 @@ class Reader202(FormatReader): return self.decompress_text(number) def extract_content(self, output_dir): + from calibre.ebooks.pml.pmlconverter import pml_to_html + output_dir = os.path.abspath(output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) - html = u'' - + pml = u'' for i in range(1, self.header_record.num_text_pages + 1): self.log.debug('Extracting text page %i' % i) - html += pml_to_html(self.get_text_page(i)) + pml += self.get_text_page(i) - - html += '' + html = u'%s%s' % \ + (self.mi.title, pml_to_html(pml)) with CurrentDir(output_dir): with open('index.html', 'wb') as index: diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 19e0522e23..c07c01a110 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -15,44 +15,47 @@ from calibre.ebooks.pdb.ereader import image_name PML_HTML_RULES = [ (re.compile(r'\\p'), lambda match: '

'), - (re.compile(r'\\x(?P.+?)\\x', re.DOTALL), lambda match: '

%s

' % match.group('text')), - (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), - (re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry - (re.compile(r'\\c(?P.+?)\\c', re.DOTALL), lambda match: '
%s
' % match.group('text')), - (re.compile(r'\\r(?P.+?)\\r', re.DOTALL), lambda match: '
%s
' % match.group('text')), - (re.compile(r'\\i(?P.+?)\\i', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\u(?P.+?)\\u', re.DOTALL), lambda match: '
%s
' % match.group('text')), - (re.compile(r'\\o(?P.+?)\\o', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\v(?P.+?)\\v', re.DOTALL), lambda match: '' % match.group('text')), - (re.compile(r'\\t(?P.+?)\\t', re.DOTALL), lambda match: '
%s
' % match.group('text')), - (re.compile(r'\\T="(?P\d+)%*"(?P.+?)$', re.MULTILINE), lambda match: r'
%s
' % (match.group('val'), match.group('text'))), + (re.compile(r'\\x(?P.*?)\\x', re.DOTALL), lambda match: '

%s

' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\X(?P[0-4])(?P.*?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''), + (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry + (re.compile(r'\\c(?P.*?)\\c', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\r(?P.*?)\\r', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\i(?P.*?)\\i', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\u(?P.*?)\\u', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\o(?P.*?)\\o', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\v(?P.*?)\\v', re.DOTALL), lambda match: '' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\t(?P.*?)\\t', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\T="(?P\d+)%*"(?P.*?)$', re.MULTILINE), lambda match: r'
%s
' % (match.group('val'), match.group('text')) if match.group('text') else ''), (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')), (re.compile(r'\\n'), lambda match: ''), (re.compile(r'\\s'), lambda match: ''), - (re.compile(r'\\b(?P.+?)\\b', re.DOTALL), lambda match: '%s' % match.group('text')), # \b is deprecated; \B should be used instead. - (re.compile(r'\\l(?P.+?)\\l', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\B(?P.+?)\\B', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\Sp(?P.+?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\Sb(?P.+?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile(r'\\b(?P.*?)\\b', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead. + (re.compile(r'\\l(?P.*?)\\l', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\B(?P.*?)\\B', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\Sp(?P.*?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\Sb(?P.*?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + (re.compile(r'\\k(?P.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), - (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile(r'\\q="(?P#.+?)"(?P.*?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), (re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), - (re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), - (re.compile(r'\\Sd="(?P.+?)"(?P.+?)\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile(r'\\Fn="(?P.+?)"(?P.*?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), + (re.compile(r'\\Sd="(?P.+?)"(?P.*?)\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), (re.compile(r'\\I'), lambda match: ''), # Sidebar and Footnotes - (re.compile(r'.+?)">\s*(?P.+?)\s*', re.DOTALL), lambda match: '' % (match.group('target'), match.group('text'))), - (re.compile(r'.+?)">\s*(?P.+?)\s*', re.DOTALL), lambda match: '
%s
' % (match.group('target'), match.group('text'))), + (re.compile(r'.+?)">\s*(?P.*?)\s*', re.DOTALL), lambda match: '' % (match.group('target'), match.group('text')) if match.group('text') else ''), + (re.compile(r'.+?)">\s*(?P.*?)\s*', re.DOTALL), lambda match: '
%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''), # eReader files are one paragraph per line. # This forces the lines to wrap properly. (re.compile('^(?P.+)$', re.MULTILINE), lambda match: '

%s

' % match.group('text')), + # Remove empty

's. (re.compile('

[ ]*

'), lambda match: ''), + # Ensure empty lines carry over. + (re.compile('^$', re.MULTILINE), lambda match: '
'), # Remove unmatched plm codes. (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),