Pull from driver-dev

This commit is contained in:
Kovid Goyal 2009-04-26 12:06:07 -07:00
commit 95f90b845a
8 changed files with 239 additions and 67 deletions

View File

@ -26,11 +26,17 @@ def sanitize_head(match):
def chap_head(match): def chap_head(match):
chap = match.group('chap') chap = match.group('chap')
title = match.group('title') title = match.group('title')
if not title: if not title:
return '<h1>'+chap+'</h1><br/>' return '<h1>'+chap+'</h1><br/>\n'
else: else:
return '<h1>'+chap+'<br/>'+title+'</h1><br/>' return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
def wrap_lines(match):
    '''
    Regex replacement callback used when un-wrapping lines.

    The match is expected to expose an optional named group ``ital``. The
    matched text is replaced by the contents of that group (when it took
    part in the match and is non-empty) followed by a single space,
    otherwise by just a single space.
    '''
    # group() yields None when the optional group did not participate.
    return (match.group('ital') or '') + ' '
def line_length(raw, percent): def line_length(raw, percent):
''' '''
@ -93,17 +99,11 @@ class HTMLPreProcessor(object):
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
# Remove <hr> tags # Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
# Remove page numbers
(re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
# Replace <br><br> with <p> # Replace <br><br> with <p>
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'), (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
# Remove <br>
(re.compile(r'(.*)<br.*?>', re.IGNORECASE),
lambda match: match.group() if \
re.match('<', match.group(1).lstrip()) or \
len(match.group(1)) < 40 else match.group(1)),
# Remove hyphenation # Remove hyphenation
(re.compile(r'-\n\r?'), lambda match: ''), (re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
# Remove gray background # Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
@ -112,19 +112,15 @@ class HTMLPreProcessor(object):
(re.compile(ur'\u00a0'), lambda match : ' '), (re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI # Detect Chapters to match default XPATH in GUI
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
# Have paragraphs show better # Have paragraphs show better
(re.compile(r'<br.*?>'), lambda match : '<p>'), (re.compile(r'<br.*?>'), lambda match : '<p>'),
# Un wrap lines
(re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
# Clean up spaces # Clean up spaces
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics # Add space before and after italics
(re.compile(r'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '), (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
] ]
@ -163,12 +159,12 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html): elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html): elif self.is_pdftohtml(html):
# Add rules that require matching line length here line_length_rules = [
#line_length_rules = [ # Un wrap using punctuation
# (re.compile('%i' % line_length(html, .85)), lambda match:) (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
#] ]
rules = self.PDFTOHTML # + line_length_rules rules = self.PDFTOHTML + line_length_rules
else: else:
rules = [] rules = []
for rule in self.PREPROCESS + rules: for rule in self.PREPROCESS + rules:

View File

@ -5,5 +5,21 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os
class EreaderError(Exception): class EreaderError(Exception):
pass pass
def image_name(name):
    '''
    Return a fixed width, 32 character image name derived from ``name``.

    Only the final path component is used. When it is longer than 32
    characters the first 10 characters are kept and the surplus is cut out
    right after them, so the tail of the name is preserved. Shorter names
    are right padded with null characters.
    '''
    base = os.path.basename(name)
    surplus = len(base) - 32
    if surplus > 0:
        # Keep the 10 leading characters and the last 22.
        base = base[:10] + base[10 + surplus:]
    return base.ljust(32, '\x00')[:32]

View File

@ -5,9 +5,8 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.customize.conversion import OutputFormatPlugin, \ from calibre.customize.conversion import OutputFormatPlugin
OptionRecommendation from calibre.ebooks.pdb.ereader.writer import Writer
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata
from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata import authors_to_string
class EREADEROutput(OutputFormatPlugin): class EREADEROutput(OutputFormatPlugin):
@ -17,7 +16,22 @@ class EREADEROutput(OutputFormatPlugin):
file_type = 'erpdb' file_type = 'erpdb'
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` to ``output_path`` using the eReader ``Writer``.

    ``output_path`` may be a file system path or an object with a
    ``write`` attribute. For a path, a missing parent directory is created
    and the file is opened in binary mode. The stream is rewound and
    truncated before the book is written.

    Only a stream opened here is closed, and it is closed even when
    writing fails. A stream passed in by the caller is left open.
    '''
    writer = Writer(log)

    # Anything without a write() method is treated as a path.
    close = not hasattr(output_path, 'write')
    if close:
        out_dir = os.path.dirname(output_path)
        if out_dir != '' and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_stream = open(output_path, 'wb')
    else:
        out_stream = output_path

    try:
        out_stream.seek(0)
        out_stream.truncate()
        writer.dump(oeb_book, out_stream)
    finally:
        # Do not leak the file handle if seek/truncate/dump raises.
        if close:
            out_stream.close()

View File

@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
import re import re
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
@ -38,7 +39,7 @@ PML_HTML_RULES = [
(re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')), (re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')),
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')), (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
(re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')), (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))), (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')), (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
(re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\-'), lambda match: ''),
@ -49,6 +50,7 @@ PML_HTML_RULES = [
# eReader files are one paragraph per line. # eReader files are one paragraph per line.
# This forces the lines to wrap properly. # This forces the lines to wrap properly.
(re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')), (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
(re.compile('<p>[ ]*</p>'), lambda match: ''),
# Remove unmatched pml codes. # Remove unmatched pml codes.
(re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''), (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
@ -61,35 +63,73 @@ PML_HTML_RULES = [
] ]
HTML_PML_RULES = [ HTML_PML_RULES = [
(re.compile(r'\\'), lambda match: '\\\\'), (re.compile(r'\\'), lambda match: '\\\\'),
(re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'), (re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
(re.compile('</p>(^\n|\r\n)'), lambda match: '\n'), (re.compile('</p>(?=^\n|^\r\n)'), lambda match: '\n'),
# Clean up HTML
(re.compile('@page.*?}'), lambda match: ''),
(re.compile('<script.*?>.*?</script>', re.DOTALL), lambda match: ''),
(re.compile('<style.*?>.*?</style>', re.DOTALL), lambda match: ''),
# Reflow paragraphs
(re.compile('<p.*?>(?P<text>.*?)</p>', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')),
# HTML to PML
(re.compile('<a.*?href="#sidebar-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))), (re.compile('<a.*?href="#sidebar-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))),
(re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))), (re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), #(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))),
#(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), #(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
(re.compile('<small.*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
(re.compile('<sub.*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), (re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
(re.compile('<sup.*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), (re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
(re.compile('<b.*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('<sub>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
(re.compile('<big.*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('<sup .*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
(re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
(re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<strong .*?>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<strong>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
(re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
(re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), (re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
(re.compile('<div.*?style.*?margin-left: (?P<val>\d+)%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))), (re.compile('<div.*?style.*?margin-left: (?P<val>\d+)%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
(re.compile('<div.*?style.*?margin-left: \d{1,3}%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), (re.compile('<div.*?style.*?margin-left: \d{1,3}%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
(re.compile('<!-- (?P<text>.+?) -->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), (re.compile('<!--(?P<text>.+?)-->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
(re.compile('<del.*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), (re.compile('<del .*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
(re.compile('<del>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
(re.compile('<div.*?style.*?text-decoration: underline;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), (re.compile('<div.*?style.*?text-decoration: underline;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
(re.compile('<i.*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), (re.compile('<i .*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
(re.compile('<i>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
(re.compile('<div.*?style.*?text-align: right;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), (re.compile('<div.*?style.*?text-align: right;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
(re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), (re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
(re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), (re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
(re.compile('<br.*?>'), lambda match: '\\p'), (re.compile('<br .*?>'), lambda match: '\n'),
(re.compile('<br/*>'), lambda match: '\n'),
# Remove remaining HTML tags
(re.compile('<.*?>'), lambda match: ''), (re.compile('<.*?>'), lambda match: ''),
# Remove redundant page break markers
(re.compile(r'(\\p){2,}'), lambda match: r'\p'), (re.compile(r'(\\p){2,}'), lambda match: r'\p'),
# Remove whitespace on empty lines
(re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
# Remove excess whitespace in lines
(re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
# Remove excess newlines at the beginning and end
(re.compile('^(\r\n){1,}'), lambda match: ''),
(re.compile('^\n{1,}'), lambda match: ''),
(re.compile('(\r\n){3,}$'), lambda match: ''),
(re.compile('\n{3,}$'), lambda match: ''),
] ]
def pml_to_html(pml): def pml_to_html(pml):
@ -111,13 +151,13 @@ def html_to_pml(html):
pml = '' pml = ''
for dom_tree in BeautifulSoup(html).findAll('body'): for dom_tree in BeautifulSoup(html).findAll('body'):
body = unicode(dom_tree.pretty_print()) body = unicode(dom_tree.prettify())
for rule in HTML_PML_RULES: for rule in HTML_PML_RULES:
body = rule[0].sub(rule[1], pml) body = rule[0].sub(rule[1], body)
pml += body pml += body
# Replace symbols outside of cp1252 with \Uxxxx # Replace symbols outside of cp1252 with \Uxxxx
return pml return pml

View File

@ -40,7 +40,7 @@ class HeaderRecord(object):
self.sidebar_offset, = struct.unpack('>H', raw[50:52]) self.sidebar_offset, = struct.unpack('>H', raw[50:52])
self.last_data_offset, = struct.unpack('>H', raw[52:54]) self.last_data_offset, = struct.unpack('>H', raw[52:54])
self.num_text_pages = self.non_text_offset -1 self.num_text_pages = self.non_text_offset - 1
self.num_image_pages = self.metadata_offset - self.image_data_offset self.num_image_pages = self.metadata_offset - self.image_data_offset
@ -76,7 +76,7 @@ class Reader(FormatReader):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', '' return 'empty', ''
data = self.section_data(number) data = self.section_data(number)
name = data[4:4+32].strip('\0') name = data[4:4+32].strip('\x00')
img = data[62:] img = data[62:]
return name, img return name, img
@ -97,7 +97,7 @@ class Reader(FormatReader):
if not os.path.exists(output_dir): if not os.path.exists(output_dir):
os.makedirs(output_dir) os.makedirs(output_dir)
html = '<html><head><title></title></head><body>' html = u'<html><head><title></title></head><body>'
for i in range(1, self.header_record.num_text_pages + 1): for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i) self.log.debug('Extracting text page %i' % i)
@ -110,8 +110,7 @@ class Reader(FormatReader):
self.log.debug('Extracting footnote page %i' % i) self.log.debug('Extracting footnote page %i' % i)
html += '<dl>' html += '<dl>'
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
html += '</dl>' html += '</dl>'
if self.header_record.sidebar_rec > 0: if self.header_record.sidebar_rec > 0:
html += '<br /><h1>%s</h1>' % _('Sidebar') html += '<br /><h1>%s</h1>' % _('Sidebar')
@ -127,7 +126,8 @@ class Reader(FormatReader):
with CurrentDir(output_dir): with CurrentDir(output_dir):
with open('index.html', 'wb') as index: with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html') self.log.debug('Writing text to index.html')
index.write(html.encode('utf-8')) index.write(html)
# print html
if not os.path.exists(os.path.join(output_dir, 'images/')): if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/')) os.makedirs(os.path.join(output_dir, 'images/'))
@ -154,7 +154,7 @@ class Reader(FormatReader):
for i in images: for i in images:
manifest.append((os.path.join('images/', i), None)) manifest.append((os.path.join('images/', i), None))
opf.create_manifest(manifest) opf.create_manifest(manifest)
opf.create_spine(['index.html']) opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile: with open('metadata.opf', 'wb') as opffile:

View File

@ -4,17 +4,114 @@ from __future__ import with_statement
Write content to ereader pdb file. Write content to ereader pdb file.
''' '''
import struct, zlib
import Image, cStringIO
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml
IDENTITY = 'PNPdPPrs'
class Writer(object):
    '''
    Turn an OEB book into the sections of an eReader PDB file and write
    them, preceded by a PDB header, to a stream.
    '''

    def __init__(self, log):
        self.log = log

    def dump(self, oeb_book, out_stream, metadata=None):
        '''
        Write ``oeb_book`` to ``out_stream``.

        The sections are written in this order: header record, text pages,
        images, metadata. The PDB header, built from the length of every
        section, is written first.
        '''
        text = self._text(oeb_book.spine)
        images = self._images(oeb_book.manifest)
        metadata = [self._metadata(metadata)]

        hr = [self._header_record(len(text), len(images))]

        sections = hr + text + images + metadata
        lengths = [len(section) for section in sections]

        header_builder = PdbHeaderBuilder(IDENTITY, '')
        header_builder.build_header(lengths, out_stream)

        for section in sections:
            out_stream.write(section)

    def _text(self, pages):
        '''
        Convert every page to PML, encode it as UTF-8 and zlib compress it.

        Returns a list with one compressed string per page.
        '''
        return [zlib.compress(html_to_pml(unicode(page)).encode('utf-8'))
                for page in pages]

    def _images(self, manifest):
        '''
        Build one image record for every manifest item whose media type is
        in OEB_IMAGES.

        A record is 4 null bytes, the 32 character image name, null padding
        up to byte 62 and then the image re-saved as PNG. Records of 65505
        bytes or more are left out.
        '''
        images = []

        for item in manifest:
            if item.media_type not in OEB_IMAGES:
                continue

            record_head = '\x00\x00\x00\x00' + image_name(item.href)
            record_head = record_head.ljust(62, '\x00')

            png = cStringIO.StringIO()
            Image.open(cStringIO.StringIO(item.data)).save(png, 'PNG')

            record = record_head + png.getvalue()
            if len(record) < 65505:
                images.append(record)

        return images

    def _metadata(self, metadata):
        '''
        Return the metadata section.

        The ``metadata`` argument is currently ignored; a fixed section of
        five null bytes is returned.
        '''
        return '\x00\x00\x00\x00\x00'

    def _header_record(self, text_items, image_items):
        '''
        Build the 54 byte header record (record 0).

        text_items = the number of text pages
        image_items = the number of images

        All values are big-endian unsigned shorts and every byte not listed
        below is null:

            0   version (10)
            12  non-text offset; the records between record 0 and this
                offset are text pages
            28  footnote rec (0)
            30  sidebar rec (0)
            32  last data offset
            40  image data offset
            44  metadata offset
            48  footnote offset
            50  sidebar offset
            52  last data offset

        The record is packed in one call so that it is always a byte
        string. Starting from a unicode string and appending packed values
        fails with UnicodeDecodeError under Python 2 as soon as a packed
        byte is >= 0x80, e.g. for books with 127 or more text pages.
        '''
        version = 10
        non_text_offset = text_items + 1

        if image_items > 0:
            image_data_offset = text_items + 1
            meta_data_offset = image_data_offset + image_items
            last_data_offset = meta_data_offset + 1
        else:
            meta_data_offset = text_items + 1
            last_data_offset = meta_data_offset + 1
            image_data_offset = last_data_offset

        # 'x' is a null pad byte; '>' disables alignment padding.
        return struct.pack(
            '>H10xH14xHHH6xH2xH2xHHH',
            version,
            non_text_offset,
            0,                  # footnote rec
            0,                  # sidebar rec
            last_data_offset,
            image_data_offset,
            meta_data_offset,
            last_data_offset,   # footnote offset
            last_data_offset,   # sidebar offset
            last_data_offset
        )

View File

@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, struct import os, re, struct, time
class PdbHeaderReader(object): class PdbHeaderReader(object):
@ -60,18 +60,26 @@ class PdbHeaderReader(object):
return self.stream.read(end - start) return self.stream.read(end - start)
class PdbHeaderBuilder(object):
    '''
    Write the header of a PDB file: 78 fixed bytes, one 8 byte entry per
    section and 2 trailing null bytes.
    '''

    def __init__(self, identity, title):
        '''
        identity = identifier stored in the 8 byte identity field.
        title = database title; every run of characters other than ASCII
                letters, digits and ``-`` is replaced by ``_``.

        Both values are null padded / truncated to their fixed field width
        (8 and 32 bytes).
        '''
        self.identity = self._fixed_field(identity, 8)
        self.title = self._fixed_field(
                re.sub('[^-A-Za-z0-9]+', '_', title), 32)

    @staticmethod
    def _fixed_field(value, width):
        '''
        Return ``value`` as a byte string of exactly ``width`` bytes.
        '''
        # Unicode text has to be encoded before it is joined with the
        # output of struct.pack.
        if isinstance(value, type(u'')):
            value = value.encode('ascii', 'replace')
        # The 's' format pads with null bytes and truncates as needed.
        return struct.pack('>%ds' % width, value)

    def build_header(self, section_lengths, out_stream):
        '''
        Write the header to out_stream. Nothing is returned.

        section_lengths = Length of each section in file.
        '''
        now = int(time.time())
        nrecords = len(section_lengths)

        out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
        out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords))

        # The first section starts after the 78 fixed bytes, the 8 byte
        # entries and the 2 trailing null bytes.
        offset = 78 + (8 * nrecords) + 2
        for length in section_lengths:
            out_stream.write(struct.pack('>LBBBB', offset, 0, 0, 0, 0))
            offset += length

        # Two null pad bytes, as a byte string.
        out_stream.write(struct.pack('>2x'))

View File

@ -55,3 +55,4 @@ class TXTOutput(OutputFormatPlugin):
if close: if close:
out_stream.close() out_stream.close()