mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
ereader reader work
This commit is contained in:
parent
19ba43153b
commit
06aa8f8361
@ -24,8 +24,8 @@ PML_HTML_RULES = [
|
|||||||
(re.compile(r'\\o(?P<text>.+?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
|
(re.compile(r'\\o(?P<text>.+?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
|
||||||
(re.compile(r'\\v(?P<text>.+?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
|
(re.compile(r'\\v(?P<text>.+?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
|
||||||
(re.compile(r'\\t(?P<text>.+?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text')),
|
(re.compile(r'\\t(?P<text>.+?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text')),
|
||||||
(re.compile(r'\\T="(?P<val>\d+)%%*"(?P<text>.+?)$', re.MULTILINE), lambda match: '<div style="margin-left: %i%%;">%s</div>' % (match.group('val'), match.group('text'))),
|
(re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.+?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%;">%s</div>' % (match.group('val'), match.group('text'))),
|
||||||
(re.compile(r'\\w="(?P<val>\d+)%%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
|
(re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
|
||||||
(re.compile(r'\\n'), lambda match: ''),
|
(re.compile(r'\\n'), lambda match: ''),
|
||||||
(re.compile(r'\\s'), lambda match: ''),
|
(re.compile(r'\\s'), lambda match: ''),
|
||||||
(re.compile(r'\\b(?P<text>.+?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead.
|
(re.compile(r'\\b(?P<text>.+?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead.
|
||||||
@ -58,14 +58,6 @@ PML_HTML_RULES = [
|
|||||||
(re.compile(r'\\\\'), lambda match: '\\'),
|
(re.compile(r'\\\\'), lambda match: '\\'),
|
||||||
]
|
]
|
||||||
|
|
||||||
FOOTNOTE_HTML_RULES = [
|
|
||||||
(re.compile('<footnote id="(?P<id>.+?)">(?P<text>.+?)</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>')
|
|
||||||
]
|
|
||||||
|
|
||||||
SIDEBAR_HTML_RULES = [
|
|
||||||
(re.compile('<sidebar id="(?P<id>.+?)">(?P<text>.+?)</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>')
|
|
||||||
]
|
|
||||||
|
|
||||||
HTML_PML_RULES = [
|
HTML_PML_RULES = [
|
||||||
(re.compile(r'\\'), lambda match: '\\\\'),
|
(re.compile(r'\\'), lambda match: '\\\\'),
|
||||||
(re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
|
(re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
|
||||||
@ -109,23 +101,9 @@ def pml_to_html(pml):
|
|||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def footnote_to_html(footnotes):
|
def footnote_sidebar_to_html(id, pml):
|
||||||
html = footnotes
|
html = '<div id="sidebar-%s">%s</div>' % (id, pml_to_html(pml))
|
||||||
for rule in FOOTNOTE_HTML_RULES:
|
return html
|
||||||
html = rule[0].sub(rule[1], html)
|
|
||||||
|
|
||||||
html = pml_to_html(html)
|
|
||||||
|
|
||||||
return html
|
|
||||||
|
|
||||||
def sidebar_to_html(sidebars):
|
|
||||||
html = sidebars
|
|
||||||
for rule in FOOTNOTE_HTML_RULES:
|
|
||||||
html = rule[0].sub(rule[1], html)
|
|
||||||
|
|
||||||
html = pml_to_html(html)
|
|
||||||
|
|
||||||
return html
|
|
||||||
|
|
||||||
def html_to_pml(html):
|
def html_to_pml(html):
|
||||||
pml = html
|
pml = html
|
||||||
|
@ -8,7 +8,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, sys, struct, zlib
|
import os, re, sys, struct, zlib
|
||||||
|
|
||||||
from calibre import CurrentDir
|
from calibre import CurrentDir
|
||||||
from calibre.ebooks import DRMError
|
from calibre.ebooks import DRMError
|
||||||
@ -16,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation
|
|||||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||||
from calibre.ebooks.pdb.ereader import EreaderError
|
from calibre.ebooks.pdb.ereader import EreaderError
|
||||||
from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
|
from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
|
||||||
footnote_to_html, sidebar_to_html
|
footnote_sidebar_to_html
|
||||||
from calibre.ebooks.mobi.palmdoc import decompress_doc
|
from calibre.ebooks.mobi.palmdoc import decompress_doc
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
|
|
||||||
@ -42,14 +42,6 @@ class HeaderRecord(object):
|
|||||||
|
|
||||||
self.num_text_pages = self.non_text_offset -1
|
self.num_text_pages = self.non_text_offset -1
|
||||||
self.num_image_pages = self.metadata_offset - self.image_data_offset
|
self.num_image_pages = self.metadata_offset - self.image_data_offset
|
||||||
|
|
||||||
# Can't tell which is sidebar and footnote if they have same offset.
|
|
||||||
# They don't exist if offset is larger than last_record.
|
|
||||||
# Todo: Determine if the subtraction is necessary and find out
|
|
||||||
# what _rec means.
|
|
||||||
end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset
|
|
||||||
self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0
|
|
||||||
self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0
|
|
||||||
|
|
||||||
|
|
||||||
class Reader(FormatReader):
|
class Reader(FormatReader):
|
||||||
@ -94,44 +86,10 @@ class Reader(FormatReader):
|
|||||||
assumed to be encoded as Windows-1252. The encoding is part of
|
assumed to be encoded as Windows-1252. The encoding is part of
|
||||||
the eReader file spec and should always be this encoding.
|
the eReader file spec and should always be this encoding.
|
||||||
'''
|
'''
|
||||||
if number not in range(1, self.header_record.num_text_pages):
|
if number not in range(1, self.header_record.num_text_pages + 1):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
return self.decompress_text(number)
|
return self.decompress_text(number)
|
||||||
|
|
||||||
def get_footnote_page(self, number):
|
|
||||||
if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages):
|
|
||||||
return ''
|
|
||||||
|
|
||||||
return self.decompress_text(number)
|
|
||||||
|
|
||||||
def get_sidebar_page(self, number):
|
|
||||||
if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1):
|
|
||||||
return ''
|
|
||||||
|
|
||||||
return self.decompress_text(number)
|
|
||||||
|
|
||||||
def has_footnotes(self):
|
|
||||||
if self.header_record.num_footnote_pages > 1:
|
|
||||||
try:
|
|
||||||
content = self.decompress_text(self.header_record.footnote_offset)
|
|
||||||
|
|
||||||
if content.contains('</footnote>'):
|
|
||||||
return True
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
return False
|
|
||||||
|
|
||||||
def has_sidebar(self):
|
|
||||||
if self.header_record.num_sidebar_pages > 1:
|
|
||||||
try:
|
|
||||||
content = self.decompress_text(self.header_record.sidebar_offset)
|
|
||||||
|
|
||||||
if content.contains('</sidebar>'):
|
|
||||||
return True
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
return False
|
|
||||||
|
|
||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
output_dir = os.path.abspath(output_dir)
|
output_dir = os.path.abspath(output_dir)
|
||||||
@ -144,22 +102,20 @@ class Reader(FormatReader):
|
|||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
for i in range(1, self.header_record.num_text_pages + 1):
|
||||||
self.log.debug('Extracting text page %i' % i)
|
self.log.debug('Extracting text page %i' % i)
|
||||||
html += pml_to_html(self.get_text_page(i))
|
html += pml_to_html(self.get_text_page(i))
|
||||||
|
|
||||||
# Untested: The num_.._pages variable may not be correct!
|
if self.header_record.footnote_rec > 0:
|
||||||
# Possibly use .._rec instead?
|
|
||||||
'''
|
|
||||||
if has_footnotes():
|
|
||||||
html += '<br /><h1>%s</h1>' % _('Footnotes')
|
html += '<br /><h1>%s</h1>' % _('Footnotes')
|
||||||
for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages):
|
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
||||||
|
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
|
||||||
self.log.debug('Extracting footnote page %i' % i)
|
self.log.debug('Extracting footnote page %i' % i)
|
||||||
html += footnote_to_html(self.get_footnote_page(i))
|
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
|
||||||
|
|
||||||
if has_sidebar():
|
if self.header_record.sidebar_rec > 0:
|
||||||
html += '<br /><h1>%s</h1>' % _('Sidebar')
|
html += '<br /><h1>%s</h1>' % _('Sidebar')
|
||||||
for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages):
|
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
||||||
|
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
|
||||||
self.log.debug('Extracting sidebar page %i' % i)
|
self.log.debug('Extracting sidebar page %i' % i)
|
||||||
html += sidebar_to_html(self.get_sidebar_page(i))
|
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
|
||||||
'''
|
|
||||||
|
|
||||||
html += '</body></html>'
|
html += '</body></html>'
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user