diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 8ff30e9349..61a91febda 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -24,8 +24,8 @@ PML_HTML_RULES = [ (re.compile(r'\\o(?P.+?)\\o', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\v(?P.+?)\\v', re.DOTALL), lambda match: '' % match.group('text')), (re.compile(r'\\t(?P.+?)\\t', re.DOTALL), lambda match: '
%s
' % match.group('text')), - (re.compile(r'\\T="(?P\d+)%%*"(?P.+?)$', re.MULTILINE), lambda match: '
%s
' % (match.group('val'), match.group('text'))), - (re.compile(r'\\w="(?P\d+)%%"'), lambda match: '
' % match.group('val')), + (re.compile(r'\\T="(?P\d+)%*"(?P.+?)$', re.MULTILINE), lambda match: r'
%s
' % (match.group('val'), match.group('text'))), + (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')), (re.compile(r'\\n'), lambda match: ''), (re.compile(r'\\s'), lambda match: ''), (re.compile(r'\\b(?P.+?)\\b', re.DOTALL), lambda match: '%s' % match.group('text')), # \b is deprecated; \B should be used instead. @@ -58,14 +58,6 @@ PML_HTML_RULES = [ (re.compile(r'\\\\'), lambda match: '\\'), ] -FOOTNOTE_HTML_RULES = [ - (re.compile('(?P.+?)', re.DOTALL), lambda match: '
%s
') -] - -SIDEBAR_HTML_RULES = [ - (re.compile('(?P.+?)', re.DOTALL), lambda match: '') -] - HTML_PML_RULES = [ (re.compile(r'\\'), lambda match: '\\\\'), (re.compile('(?<=[^\n])[ ]*'), lambda match: '\n

'), @@ -109,23 +101,9 @@ def pml_to_html(pml): return html -def footnote_to_html(footnotes): - html = footnotes - for rule in FOOTNOTE_HTML_RULES: - html = rule[0].sub(rule[1], html) - - html = pml_to_html(html) - - return html - -def sidebar_to_html(sidebars): - html = sidebars - for rule in FOOTNOTE_HTML_RULES: - html = rule[0].sub(rule[1], html) - - html = pml_to_html(html) - - return html +def footnote_sidebar_to_html(id, pml): + html = '

' % (id, pml_to_html(pml)) + return html def html_to_pml(html): pml = html diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 8a0abb970e..b47dac1af0 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, sys, struct, zlib +import os, re, sys, struct, zlib from calibre import CurrentDir from calibre.ebooks import DRMError @@ -16,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ - footnote_to_html, sidebar_to_html + footnote_sidebar_to_html from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator @@ -42,14 +42,6 @@ class HeaderRecord(object): self.num_text_pages = self.non_text_offset -1 self.num_image_pages = self.metadata_offset - self.image_data_offset - - # Can't tell which is sidebar and footnote if they have same offset. - # They don't exist if offset is larget than last_record. - # Todo: Determine if the subtraction is necessary and find out - # what _rec means. - end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset - self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 - self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 class Reader(FormatReader): @@ -94,44 +86,10 @@ class Reader(FormatReader): assumed to be encoded as Windows-1252. The encoding is part of the eReader file spec and should always be this encoding. ''' - if number not in range(1, self.header_record.num_text_pages): + if number not in range(1, self.header_record.num_text_pages + 1): return '' return self.decompress_text(number) - - def get_footnote_page(self, number): - if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages): - return '' - - return self.decompress_text(number) - - def get_sidebar_page(self, number): - if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1): - return '' - - return self.decompress_text(number) - - def has_footnotes(self): - if self.header_record.num_footnote_pages > 1: - try: - content = self.decompress_text(self.header_record.footnote_offset) - - if content.contains(''): - return True - except: - pass - return False - - def has_sidebar(self): - if self.header_record.num_sidebar_pages > 1: - try: - content = self.decompress_text(self.header_record.sidebar_offset) - - if content.contains(''): - return True - except: - pass - return False def extract_content(self, output_dir): output_dir = os.path.abspath(output_dir) @@ -144,22 +102,20 @@ class Reader(FormatReader): for i in range(1, self.header_record.num_text_pages + 1): self.log.debug('Extracting text page %i' % i) html += pml_to_html(self.get_text_page(i)) - - # Untested: The num_.._pages variable may not be correct! - # Possibly use .._rec instead? - ''' - if has_footnotes(): + + if self.header_record.footnote_rec > 0: html += '

%s

' % _('Footnotes') - for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages): + footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): self.log.debug('Extracting footnote page %i' % i) - html += footnote_to_html(self.get_footnote_page(i)) + html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) - if has_sidebar(): + if self.header_record.sidebar_rec > 0: html += '

%s

' % _('Sidebar') - for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages): + sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): self.log.debug('Extracting sidebar page %i' % i) - html += sidebar_to_html(self.get_sidebar_page(i)) - ''' + html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) html += ''