mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
ereader writer mostly working.
This commit is contained in:
parent
188f630c35
commit
0c858e43bc
@ -5,5 +5,21 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
class EreaderError(Exception):
    '''
    Exception type declared by the eReader PDB package.
    '''
|
||||||
|
|
||||||
|
def image_name(name):
    '''
    Return the base name of ``name`` as a fixed-width, 32 character string.

    Names longer than 32 characters keep their first 10 characters and as
    much of their tail as fits (so a trailing extension survives); shorter
    names are right-padded with NUL characters.
    '''
    base = os.path.basename(name)

    overflow = len(base) - 32
    if overflow > 0:
        # Drop the surplus characters from just after the 10 character prefix.
        base = base[:10] + base[10 + overflow:]

    return base.ljust(32, '\x00')[:32]
|
||||||
|
|
||||||
|
@ -5,9 +5,8 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
from calibre.customize.conversion import OutputFormatPlugin
|
||||||
OptionRecommendation
|
from calibre.ebooks.pdb.ereader.writer import Writer
|
||||||
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata
|
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
from calibre.ebooks.metadata import authors_to_string
|
||||||
|
|
||||||
class EREADEROutput(OutputFormatPlugin):
|
class EREADEROutput(OutputFormatPlugin):
|
||||||
@ -17,7 +16,22 @@ class EREADEROutput(OutputFormatPlugin):
|
|||||||
file_type = 'erpdb'
|
file_type = 'erpdb'
|
||||||
|
|
||||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` to ``output_path`` using the eReader ``Writer``.

    ``output_path`` is either a filesystem path or an object with a
    ``write`` method. A path is opened in binary mode (creating its parent
    directory when needed) and closed again before returning; a stream
    supplied by the caller is left open. ``input_plugin`` and ``opts`` are
    not used here.
    '''
    writer = Writer(log)

    close = False
    if not hasattr(output_path, 'write'):
        close = True
        out_dir = os.path.dirname(output_path)
        if out_dir != '' and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_stream = open(output_path, 'wb')
    else:
        out_stream = output_path

    try:
        # Start from an empty stream so stale content is never left behind.
        out_stream.seek(0)
        out_stream.truncate()

        writer.dump(oeb_book, out_stream)
    finally:
        # Only close what was opened here, even when dump() fails.
        if close:
            out_stream.close()
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from calibre.ebooks.pdb.ereader import image_name
|
||||||
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup
|
||||||
@ -61,35 +62,69 @@ PML_HTML_RULES = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
# Ordered (compiled pattern, replacement callable) pairs used to turn HTML
# into PML. The rules are applied one after another to the whole text, so
# their order matters: later rules see the output of earlier ones.
HTML_PML_RULES = [
    # Escape literal backslashes before any PML codes are generated.
    (re.compile(r'\\'), lambda match: '\\\\'),
    (re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
    (re.compile('</p>(?=^\n|^\r\n)'), lambda match: '\n'),

    # Clean up HTML
    (re.compile('@page.*?}'), lambda match: ''),
    (re.compile('<script.*?>.*?</script>', re.DOTALL), lambda match: ''),
    (re.compile('<style.*?>.*?</style>', re.DOTALL), lambda match: ''),

    # Reflow paragraphs
    (re.compile('<p.*?>(?P<text>.*?)</p>', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')),

    # HTML to PML
    # The target groups stop at the closing quote; a lazy '.+?' followed by
    # '.*?' would only ever capture a single character.
    (re.compile('<a.*?href="#sidebar-(?P<target>[^"]+)"[^>]*>(?P<text>.+?)</a>'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))),
    (re.compile('<a.*?href="#footnote-(?P<target>[^"]+)"[^>]*>(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
    (re.compile('<div.*?id="(?P<target>[^"]+)"[^>]*></div>'), lambda match: '\\Q="%s"' % match.group('target')),
    (re.compile('<a.*?href="(?P<target>#[^"]+)"[^>]*>(?P<text>.+?)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
    # image_name() pads with NULs for the binary record; strip them for text.
    (re.compile('<img.*?src="(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')),
    # TODO: four digit numeric entities (&#dddd;) are not converted to \U codes yet.
    (re.compile(r'&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
    # Tags are matched both with attributes ('<b ...>') and bare ('<b>') so
    # that '<b' does not also swallow '<big>', '<br>' and similar.
    (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
    (re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
    (re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
    (re.compile('<sub>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
    (re.compile('<sup .*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
    (re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
    (re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
    (re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
    (re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
    (re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
    (re.compile(r'<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
    (re.compile(r'<div.*?style.*?margin-left: (?P<val>\d+)%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
    (re.compile(r'<div.*?style.*?margin-left: \d{1,3}%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
    (re.compile('<!--(?P<text>.+?)-->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
    (re.compile('<del .*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
    (re.compile('<del>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
    (re.compile('<div.*?style.*?text-decoration: underline;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
    (re.compile('<i .*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\i%s\\i' % match.group('text')),
    (re.compile('<i>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\i%s\\i' % match.group('text')),
    (re.compile('<div.*?style.*?text-align: right;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
    (re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
    # NOTE(review): this rule also matches <h1>, so the <h1> rule below it
    # never fires — confirm which mapping is intended.
    (re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
    (re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
    (re.compile('<br .*?>'), lambda match: '\\p'),
    (re.compile('<br/*>'), lambda match: '\\p'),

    # Remove remaining HTML tags
    (re.compile('<.*?>'), lambda match: ''),

    # Remove redundant page break markers
    (re.compile(r'(\\p){2,}'), lambda match: r'\p'),

    # Remove whitespace on empty lines
    (re.compile('^[\t\r ]+$', re.MULTILINE), lambda match: ''),

    # Remove excess newlines at the beginning and end
    (re.compile('^(\r\n){1,}'), lambda match: ''),
    (re.compile('^\n{1,}'), lambda match: ''),
    (re.compile('(\r\n){3,}$'), lambda match: ''),
    (re.compile('\n{3,}$'), lambda match: ''),
]
|
||||||
|
|
||||||
def pml_to_html(pml):
|
def pml_to_html(pml):
|
||||||
@ -111,13 +146,13 @@ def html_to_pml(html):
|
|||||||
pml = ''
|
pml = ''
|
||||||
|
|
||||||
for dom_tree in BeautifulSoup(html).findAll('body'):
|
for dom_tree in BeautifulSoup(html).findAll('body'):
|
||||||
body = unicode(dom_tree.pretty_print())
|
body = unicode(dom_tree.prettify())
|
||||||
|
|
||||||
for rule in HTML_PML_RULES:
|
for rule in HTML_PML_RULES:
|
||||||
body = rule[0].sub(rule[1], pml)
|
body = rule[0].sub(rule[1], body)
|
||||||
|
|
||||||
pml += body
|
pml += body
|
||||||
|
|
||||||
# Replace symbols outside of cp1252 with \Uxxxx
|
# Replace symbols outside of cp1252 with \Uxxxx
|
||||||
|
|
||||||
return pml
|
return pml
|
||||||
|
@ -40,7 +40,7 @@ class HeaderRecord(object):
|
|||||||
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
|
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
|
||||||
self.last_data_offset, = struct.unpack('>H', raw[52:54])
|
self.last_data_offset, = struct.unpack('>H', raw[52:54])
|
||||||
|
|
||||||
self.num_text_pages = self.non_text_offset -1
|
self.num_text_pages = self.non_text_offset - 1
|
||||||
self.num_image_pages = self.metadata_offset - self.image_data_offset
|
self.num_image_pages = self.metadata_offset - self.image_data_offset
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,17 +4,90 @@ from __future__ import with_statement
|
|||||||
Write content to ereader pdb file.
|
Write content to ereader pdb file.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
import struct, zlib
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||||
|
from calibre.ebooks.pdb.header import PdbHeaderBuilder
|
||||||
|
from calibre.ebooks.pdb.ereader import image_name
|
||||||
from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml
|
from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml
|
||||||
|
|
||||||
|
IDENTITY = 'PNPdPPrs'
|
||||||
|
|
||||||
class Writer(object):
    '''
    Writes an OEB book out as an eReader PDB file.

    dump() emits the sections in this order: header record, compressed
    text pages, image records, metadata record.
    '''

    def __init__(self, log):
        self.log = log

    def dump(self, oeb_book, out_stream, metadata=None):
        '''
        Convert ``oeb_book`` and write the complete PDB file to ``out_stream``.

        ``out_stream`` must accept byte strings. ``metadata`` is passed
        through to _metadata().
        '''
        text = self._text(oeb_book.spine)
        images = self._images(oeb_book.manifest)
        metadata = [self._metadata(metadata)]

        hr = [self._header_record(len(text), len(images))]

        sections = hr + text + images + metadata

        # The PDB header needs every section length to compute the offsets.
        lengths = [len(section) for section in sections]

        header_builder = PdbHeaderBuilder(IDENTITY, '')
        header_builder.build_header(lengths, out_stream)

        for section in sections:
            out_stream.write(section)

    def _text(self, pages):
        '''
        Return one zlib compressed PML byte string per item in ``pages``.
        '''
        pml_pages = []

        for page in pages:
            pml = html_to_pml(unicode(page))
            # zlib needs bytes; encode explicitly so non-ASCII text does not
            # fail on an implicit ASCII conversion.
            # NOTE(review): cp1252 is assumed to be the PML text encoding - confirm.
            pml_pages.append(zlib.compress(pml.encode('cp1252', 'replace')))

        return pml_pages

    def _images(self, manifest):
        '''
        Return one image record per manifest item whose media type is in
        OEB_IMAGES: a 62 byte NUL padded header holding the 32 character
        image name at offset 4, followed by the raw image data.
        '''
        images = []

        for item in manifest:
            if item.media_type in OEB_IMAGES:
                name = image_name(item.href)
                # A unicode name must not be mixed with binary image data.
                if not isinstance(name, bytes):
                    name = name.encode('cp1252', 'replace')

                image = (b'\x00\x00\x00\x00' + name).ljust(62, b'\x00')
                image += item.data

                images.append(image)

        return images

    def _metadata(self, metadata):
        '''
        Return the metadata record. Currently always empty.
        '''
        return b''

    def _header_record(self, text_items, image_items):
        '''
        text_items = the number of text pages
        image_items = the number of images

        Returns the 54 byte header record as a byte string.
        '''
        version = 10

        # Record 0 is this header, so the text pages occupy records
        # 1..text_items and the first non-text record follows them. The
        # reader derives the page count as non_text_offset - 1.
        non_text_offset = text_items + 1

        # Images (if any) follow the text; the metadata follows the images.
        image_data_offset = non_text_offset
        meta_data_offset = image_data_offset + image_items

        # Fields are big-endian unsigned shorts at byte offsets 0, 12, 40,
        # 44 and 52; the 'x' entries are NUL padding between them.
        return struct.pack('>H10xH26xH2xH6xH', version, non_text_offset,
                image_data_offset, meta_data_offset, meta_data_offset)
|
||||||
|
|
||||||
|
@ -8,7 +8,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, struct
|
import os, re, struct, time
|
||||||
|
|
||||||
class PdbHeaderReader(object):
|
class PdbHeaderReader(object):
|
||||||
|
|
||||||
@ -60,18 +60,26 @@ class PdbHeaderReader(object):
|
|||||||
return self.stream.read(end - start)
|
return self.stream.read(end - start)
|
||||||
|
|
||||||
|
|
||||||
class PdbHeaderBuilder(object):
    '''
    Writes the PDB file header and record index to a stream.
    '''

    def __init__(self, identity, title):
        '''
        identity = 8 character file identity (padded/truncated to 8).
        title = book title; every run of characters other than letters,
        digits and '-' becomes '_', then padded/truncated to 32.
        '''
        # Both fields are fixed width; a short identity would otherwise shift
        # everything after it and break the 78 byte header size assumed below.
        self.identity = identity.ljust(8, '\x00')[:8]
        self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32]

    @staticmethod
    def _raw(value):
        '''
        Return ``value`` as a byte string suitable for a binary stream.
        '''
        if isinstance(value, bytes):
            return value
        return value.encode('ascii')

    def build_header(self, section_lengths, out_stream):
        '''
        section_lengths = Length of each section in file.

        Writes the 78 byte header, one 8 byte index entry per section and
        2 bytes of padding to out_stream. Nothing is returned.
        '''
        now = int(time.time())
        nrecords = len(section_lengths)

        out_stream.write(self._raw(self.title) + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
        out_stream.write(self._raw(self.identity) + struct.pack('>IIH', nrecords, 0, nrecords))

        # The first section starts right after the header, index and padding.
        offset = 78 + (8 * nrecords) + 2
        for length in section_lengths:
            out_stream.write(struct.pack('>LBBBB', offset, 0, 0, 0, 0))
            offset += length
        out_stream.write(b'\x00\x00')
|
|
||||||
|
@ -55,3 +55,4 @@ class TXTOutput(OutputFormatPlugin):
|
|||||||
|
|
||||||
if close:
|
if close:
|
||||||
out_stream.close()
|
out_stream.close()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user