ereader reader work

This commit is contained in:
John Schember 2009-04-24 20:21:01 -04:00
parent 19ba43153b
commit 06aa8f8361
2 changed files with 17 additions and 83 deletions

View File

@ -24,8 +24,8 @@ PML_HTML_RULES = [
(re.compile(r'\\o(?P<text>.+?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')), (re.compile(r'\\o(?P<text>.+?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
(re.compile(r'\\v(?P<text>.+?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')), (re.compile(r'\\v(?P<text>.+?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
(re.compile(r'\\t(?P<text>.+?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text')), (re.compile(r'\\t(?P<text>.+?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text')),
(re.compile(r'\\T="(?P<val>\d+)%%*"(?P<text>.+?)$', re.MULTILINE), lambda match: '<div style="margin-left: %i%%;">%s</div>' % (match.group('val'), match.group('text'))), (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.+?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%;">%s</div>' % (match.group('val'), match.group('text'))),
(re.compile(r'\\w="(?P<val>\d+)%%"'), lambda match: '<hr width="%s%%" />' % match.group('val')), (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
(re.compile(r'\\n'), lambda match: ''), (re.compile(r'\\n'), lambda match: ''),
(re.compile(r'\\s'), lambda match: ''), (re.compile(r'\\s'), lambda match: ''),
(re.compile(r'\\b(?P<text>.+?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead. (re.compile(r'\\b(?P<text>.+?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead.
@ -58,14 +58,6 @@ PML_HTML_RULES = [
(re.compile(r'\\\\'), lambda match: '\\'), (re.compile(r'\\\\'), lambda match: '\\'),
] ]
FOOTNOTE_HTML_RULES = [
(re.compile('<footnote id="(?P<id>.+?)">(?P<text>.+?)</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>')
]
SIDEBAR_HTML_RULES = [
(re.compile('<sidebar id="(?P<id>.+?)">(?P<text>.+?)</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>')
]
HTML_PML_RULES = [ HTML_PML_RULES = [
(re.compile(r'\\'), lambda match: '\\\\'), (re.compile(r'\\'), lambda match: '\\\\'),
(re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'), (re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
@ -109,23 +101,9 @@ def pml_to_html(pml):
return html return html
def footnote_to_html(footnotes): def footnote_sidebar_to_html(id, pml):
html = footnotes html = '<div id="sidebar-%s">%s</div>' % (id, pml_to_html(pml))
for rule in FOOTNOTE_HTML_RULES: return html
html = rule[0].sub(rule[1], html)
html = pml_to_html(html)
return html
def sidebar_to_html(sidebars):
html = sidebars
for rule in FOOTNOTE_HTML_RULES:
html = rule[0].sub(rule[1], html)
html = pml_to_html(html)
return html
def html_to_pml(html): def html_to_pml(html):
pml = html pml = html

View File

@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, sys, struct, zlib import os, re, sys, struct, zlib
from calibre import CurrentDir from calibre import CurrentDir
from calibre.ebooks import DRMError from calibre.ebooks import DRMError
@ -16,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
footnote_to_html, sidebar_to_html footnote_sidebar_to_html
from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
@ -42,14 +42,6 @@ class HeaderRecord(object):
self.num_text_pages = self.non_text_offset -1 self.num_text_pages = self.non_text_offset -1
self.num_image_pages = self.metadata_offset - self.image_data_offset self.num_image_pages = self.metadata_offset - self.image_data_offset
# Can't tell which is sidebar and footnote if they have same offset.
# They don't exist if offset is larger than last_record.
# Todo: Determine if the subtraction is necessary and find out
# what _rec means.
end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset
self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0
self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0
class Reader(FormatReader): class Reader(FormatReader):
@ -94,44 +86,10 @@ class Reader(FormatReader):
assumed to be encoded as Windows-1252. The encoding is part of assumed to be encoded as Windows-1252. The encoding is part of
the eReader file spec and should always be this encoding. the eReader file spec and should always be this encoding.
''' '''
if number not in range(1, self.header_record.num_text_pages): if number not in range(1, self.header_record.num_text_pages + 1):
return '' return ''
return self.decompress_text(number) return self.decompress_text(number)
def get_footnote_page(self, number):
if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages):
return ''
return self.decompress_text(number)
def get_sidebar_page(self, number):
if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1):
return ''
return self.decompress_text(number)
def has_footnotes(self):
if self.header_record.num_footnote_pages > 1:
try:
content = self.decompress_text(self.header_record.footnote_offset)
if content.contains('</footnote>'):
return True
except:
pass
return False
def has_sidebar(self):
if self.header_record.num_sidebar_pages > 1:
try:
content = self.decompress_text(self.header_record.sidebar_offset)
if content.contains('</sidebar>'):
return True
except:
pass
return False
def extract_content(self, output_dir): def extract_content(self, output_dir):
output_dir = os.path.abspath(output_dir) output_dir = os.path.abspath(output_dir)
@ -144,22 +102,20 @@ class Reader(FormatReader):
for i in range(1, self.header_record.num_text_pages + 1): for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i) self.log.debug('Extracting text page %i' % i)
html += pml_to_html(self.get_text_page(i)) html += pml_to_html(self.get_text_page(i))
# Untested: The num_.._pages variable may not be correct! if self.header_record.footnote_rec > 0:
# Possibly use .._rec instead?
'''
if has_footnotes():
html += '<br /><h1>%s</h1>' % _('Footnotes') html += '<br /><h1>%s</h1>' % _('Footnotes')
for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages): footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
self.log.debug('Extracting footnote page %i' % i) self.log.debug('Extracting footnote page %i' % i)
html += footnote_to_html(self.get_footnote_page(i)) html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
if has_sidebar(): if self.header_record.sidebar_rec > 0:
html += '<br /><h1>%s</h1>' % _('Sidebar') html += '<br /><h1>%s</h1>' % _('Sidebar')
for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages): sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
self.log.debug('Extracting sidebar page %i' % i) self.log.debug('Extracting sidebar page %i' % i)
html += sidebar_to_html(self.get_sidebar_page(i)) html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
'''
html += '</body></html>' html += '</body></html>'