diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 42e6654127..dad77ea3aa 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -26,11 +26,17 @@ def sanitize_head(match):
def chap_head(match):
chap = match.group('chap')
title = match.group('title')
- if not title:
- return '
', re.IGNORECASE), lambda match: '
'),
- # Remove page numbers
- (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''),
# Replace
with
(re.compile(r'\s*', re.IGNORECASE), lambda match: ''),
- # Remove
- (re.compile(r'(.*)', re.IGNORECASE),
- lambda match: match.group() if \
- re.match('<', match.group(1).lstrip()) or \
- len(match.group(1)) < 40 else match.group(1)),
+
# Remove hyphenation
- (re.compile(r'-\n\r?'), lambda match: ''),
+ (re.compile(r'-\n\r?'), lambda match: ''),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
@@ -112,19 +112,15 @@ class HTMLPreProcessor(object):
(re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(
]*>)?(?p[^>]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?', re.IGNORECASE), chap_head),
- (re.compile(r'(
]*>)?(?p[^>]*>)?s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(||)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(||)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P()?\s*\w+(\s+\w+)?()?)(
]*>|?p[^>]*>)))?', re.IGNORECASE), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
# Have paragraphs show better
(re.compile(r''), lambda match : ''),
-
- # Un wrap lines
- (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*((i|b|u)>)*\s*
\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
-
# Clean up spaces
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
- (re.compile(r'(?'), lambda match: ' '),
+ (re.compile(u'(?'), lambda match: ' '),
(re.compile(r'(?=\w)'), lambda match: ' '),
]
@@ -163,12 +159,12 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html):
- # Add rules that require matching line length here
- #line_length_rules = [
- # (re.compile('%i' % line_length(html, .85)), lambda match:)
- #]
-
- rules = self.PDFTOHTML # + line_length_rules
+ line_length_rules = [
+ # Un wrap using punctuation
+ (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
+ ]
+
+ rules = self.PDFTOHTML + line_length_rules
else:
rules = []
for rule in self.PREPROCESS + rules:
diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py
index 89d9dfdd35..b39467c6e3 100644
--- a/src/calibre/ebooks/pdb/ereader/__init__.py
+++ b/src/calibre/ebooks/pdb/ereader/__init__.py
@@ -5,5 +5,21 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember '
__docformat__ = 'restructuredtext en'
+import os
+
class EreaderError(Exception):
pass
+
+def image_name(name):
+    '''
+    Build the fixed-width image name for ``name``.
+
+    Only the final path component of ``name`` is used. A name longer than
+    32 characters is shortened to 32 by keeping its first 10 characters
+    and its last 22; a shorter one is padded on the right with NUL
+    characters. The result is always 32 characters long.
+    '''
+    width = 32
+    head = 10
+
+    base = os.path.basename(name)
+    excess = len(base) - width
+    if excess > 0:
+        # Cut the surplus characters out right after the leading ``head`` ones.
+        base = base[:head] + base[head + excess:]
+
+    return base.ljust(width, '\x00')[:width]
+
diff --git a/src/calibre/ebooks/pdb/ereader/output.py b/src/calibre/ebooks/pdb/ereader/output.py
index 034508b0da..4b188ae2f1 100644
--- a/src/calibre/ebooks/pdb/ereader/output.py
+++ b/src/calibre/ebooks/pdb/ereader/output.py
@@ -5,9 +5,8 @@ __docformat__ = 'restructuredtext en'
import os
-from calibre.customize.conversion import OutputFormatPlugin, \
- OptionRecommendation
-from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata
+from calibre.customize.conversion import OutputFormatPlugin
+from calibre.ebooks.pdb.ereader.writer import Writer
from calibre.ebooks.metadata import authors_to_string
class EREADEROutput(OutputFormatPlugin):
@@ -17,7 +16,22 @@ class EREADEROutput(OutputFormatPlugin):
file_type = 'erpdb'
def convert(self, oeb_book, output_path, input_plugin, opts, log):
- from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml
+        '''
+        Write ``oeb_book`` to ``output_path`` using the eReader ``Writer``.
+
+        output_path = either a file system path or an object that has a
+                      ``write`` attribute. A path is opened in binary
+                      mode (missing parent directories are created) and
+                      closed again before returning; an object passed in
+                      by the caller is left open.
+        log = passed on to ``Writer``.
+
+        ``input_plugin`` and ``opts`` are not used here.
+        '''
+        writer = Writer(log)
-# print html_to_pml(' “A hundred kisses from the Princess,” said he, “or else let everyone keep his own!”
')
- print html_to_pml(str(oeb_book.spine[3]))
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            out_dir = os.path.dirname(output_path)
+            if out_dir != '' and not os.path.exists(out_dir):
+                os.makedirs(out_dir)
+            out_stream = open(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        try:
+            # Start from an empty stream so stale content never survives.
+            out_stream.seek(0)
+            out_stream.truncate()
+
+            writer.dump(oeb_book, out_stream)
+        finally:
+            # Only close what was opened here, and do so even when writing
+            # fails so the file handle is not leaked.
+            if close:
+                out_stream.close()
+
diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
index 347bde951c..391f70a504 100644
--- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py
+++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
@@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
import re
+from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
from BeautifulSoup import BeautifulSoup
@@ -38,7 +39,7 @@ PML_HTML_RULES = [
(re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')),
(re.compile(r'\\a(?P\d\d\d)'), lambda match: '%s;' % match.group('num')),
(re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
- (re.compile(r'\\m="(?P.+?)"'), lambda match: '
' % match.group('name')),
+ (re.compile(r'\\m="(?P.+?)"'), lambda match: '
' % image_name(match.group('name')).strip('\x00')),
(re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))),
(re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')),
(re.compile(r'\\-'), lambda match: ''),
@@ -49,6 +50,7 @@ PML_HTML_RULES = [
# eReader files are one paragraph per line.
# This forces the lines to wrap properly.
(re.compile('^(?P.+)$', re.MULTILINE), lambda match: '%s
' % match.group('text')),
+ (re.compile('[ ]*
'), lambda match: ''),
# Remove unmatched plm codes.
(re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
@@ -61,35 +63,73 @@ PML_HTML_RULES = [
]
HTML_PML_RULES = [
+
(re.compile(r'\\'), lambda match: '\\\\'),
(re.compile('(?<=[^\n])[ ]*'), lambda match: '\n'),
- (re.compile('
(^\n|\r\n)'), lambda match: '\n'),
+ (re.compile('
(?=^\n|^\r\n)'), lambda match: '\n'),
+
+
+ # Clean up HTML
+ (re.compile('@page.*?}'), lambda match: ''),
+ (re.compile('.*?', re.DOTALL), lambda match: ''),
+ (re.compile('.*?', re.DOTALL), lambda match: ''),
+
+ # Reflow paragraphs
+ (re.compile('(?P.*?)
', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')),
+
+ # HTML to PML
(re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))),
(re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
(re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')),
(re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
- (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
+ #(re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
+ (re.compile('.+?)".*?>(.*?)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))),
#(re.compile('(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
(re.compile('(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
(re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
(re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
- (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
+ (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
(re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
- (re.compile(''), lambda match: '\\p'),
+ (re.compile('
'), lambda match: '\n'),
+ (re.compile('
'), lambda match: '\n'),
+
+ # Remove remaining HTML tags
(re.compile('<.*?>'), lambda match: ''),
+
+ # Remove redundant page break markers
(re.compile(r'(\\p){2,}'), lambda match: r'\p'),
+
+ # Remove whitespace on empty lines
+ (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
+ # Remove excess whitespace in lines
+ (re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
+
+ # Remove excess newlines at the beginning and end
+ (re.compile('^(\r\n){1,}'), lambda match: ''),
+ (re.compile('^\n{1,}'), lambda match: ''),
+ (re.compile('(\r\n){3,}$'), lambda match: ''),
+ (re.compile('\n{3,}$'), lambda match: ''),
]
def pml_to_html(pml):
@@ -111,13 +151,13 @@ def html_to_pml(html):
pml = ''
for dom_tree in BeautifulSoup(html).findAll('body'):
- body = unicode(dom_tree.pretty_print())
+ body = unicode(dom_tree.prettify())
for rule in HTML_PML_RULES:
- body = rule[0].sub(rule[1], pml)
+ body = rule[0].sub(rule[1], body)
pml += body
-
+
# Replace symbols outside of cp1512 wtih \Uxxxx
return pml
diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py
index e0e42e40fd..d36e01ed69 100644
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@@ -40,7 +40,7 @@ class HeaderRecord(object):
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
self.last_data_offset, = struct.unpack('>H', raw[52:54])
- self.num_text_pages = self.non_text_offset -1
+ self.num_text_pages = self.non_text_offset - 1
self.num_image_pages = self.metadata_offset - self.image_data_offset
@@ -76,7 +76,7 @@ class Reader(FormatReader):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', ''
data = self.section_data(number)
- name = data[4:4+32].strip('\0')
+ name = data[4:4+32].strip('\x00')
img = data[62:]
return name, img
@@ -97,7 +97,7 @@ class Reader(FormatReader):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
- html = ''
+ html = u''
for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i)
@@ -110,8 +110,7 @@ class Reader(FormatReader):
self.log.debug('Extracting footnote page %i' % i)
html += ''
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
- html += '
'
-
+ html += ''
if self.header_record.sidebar_rec > 0:
html += '
%s
' % _('Sidebar')
@@ -127,7 +126,8 @@ class Reader(FormatReader):
with CurrentDir(output_dir):
with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html')
- index.write(html.encode('utf-8'))
+ index.write(html)
+# print html
if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/'))
@@ -154,7 +154,7 @@ class Reader(FormatReader):
for i in images:
manifest.append((os.path.join('images/', i), None))
-
+
opf.create_manifest(manifest)
opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile:
diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index c9493d2915..65eb35157e 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -4,17 +4,114 @@ from __future__ import with_statement
Write content to ereader pdb file.
'''
+import struct, zlib
+
+import Image, cStringIO
+
+from calibre.ebooks.oeb.base import OEB_IMAGES
+from calibre.ebooks.pdb.header import PdbHeaderBuilder
+from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml
+IDENTITY = 'PNPdPPrs'
+
class Writer(object):
def __init__(self, log):
- self.oeb_book = oeb_book
+ # Keep the logger handed in by the caller on the instance.
+ self.log = log
- def dump(oeb_book):
+    def dump(self, oeb_book, out_stream, metadata=None):
+        '''
+        Write ``oeb_book`` to ``out_stream``.
+
+        The PDB header is written first, followed by the sections in this
+        order: the header record, the text pages built from
+        ``oeb_book.spine``, the image records built from
+        ``oeb_book.manifest`` and the metadata record.
+
+        metadata = passed on to ``_metadata``.
+        '''
+        text_pages = self._text(oeb_book.spine)
+        image_pages = self._images(oeb_book.manifest)
+        metadata_page = self._metadata(metadata)
+        header_page = self._header_record(len(text_pages), len(image_pages))
+
+        sections = [header_page] + text_pages + image_pages + [metadata_page]
+
+        # The PDB header needs the size of every section to compute offsets.
+        builder = PdbHeaderBuilder(IDENTITY, '')
+        builder.build_header([len(section) for section in sections], out_stream)
+
+        for section in sections:
+            out_stream.write(section)
+
+ def _text(self, pages):
+ '''
+ Convert every item of ``pages`` to PML with html_to_pml, encode the
+ result as UTF-8 and compress it with zlib.
+
+ Returns the compressed pages as a list, in the order ``pages``
+ yields them.
+ '''
pml_pages = []
- for page in oeb_book.spine:
- pml_pages.append(html_to_pml(page))
+ for page in pages:
+ # html_to_pml is given the unicode form of the page.
+ pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8')))
+
+ return pml_pages
-
\ No newline at end of file
+    def _images(self, manifest):
+        '''
+        Build one image record for every item of ``manifest`` whose
+        ``media_type`` is in OEB_IMAGES.
+
+        A record is four NUL bytes, the name returned by ``image_name``
+        for the item's href, NUL padding up to 62 bytes and then the image
+        data re-saved as PNG. Records of 65505 bytes or more are left out.
+
+        Returns the list of records.
+        '''
+        records = []
+
+        for item in manifest:
+            if item.media_type not in OEB_IMAGES:
+                continue
+
+            header = ('\x00\x00\x00\x00' + image_name(item.href)).ljust(62, '\x00')
+
+            # Re-encode whatever format the source image has as PNG.
+            png = cStringIO.StringIO()
+            Image.open(cStringIO.StringIO(item.data)).save(png, 'PNG')
+
+            record = header + png.getvalue()
+            if len(record) < 65505:
+                records.append(record)
+
+        return records
+
+    def _metadata(self, metadata):
+        '''
+        Return the metadata record.
+
+        ``metadata`` is ignored; the record is always five NUL bytes.
+        '''
+        return '\x00' * 5
+
+    def _header_record(self, text_items, image_items):
+        '''
+        Build the 54 byte header record (record 0).
+
+        text_items = the number of text pages
+        image_items = the number of images
+
+        Text pages are the records directly after record 0. When there are
+        images they follow the text pages and the metadata record follows
+        the images; without images the metadata record follows the text
+        pages and the image offset is set to the last data offset.
+
+        Returns the record as a byte string.
+        '''
+        version = 10
+        non_text_offset = text_items + 1
+
+        if image_items > 0:
+            image_data_offset = text_items + 1
+            meta_data_offset = image_data_offset + image_items
+            last_data_offset = meta_data_offset + 1
+        else:
+            meta_data_offset = text_items + 1
+            last_data_offset = meta_data_offset + 1
+            image_data_offset = last_data_offset
+
+        # Pack the whole record in one call so the result is a plain byte
+        # string. Accumulating the packed fields on a unicode string forces
+        # an implicit ASCII decode, which fails as soon as any offset
+        # contains a byte >= 0x80. ``x`` emits NUL pad bytes.
+        return struct.pack(
+            '>H10x'   # 0:  version
+            'H14x'    # 12: non-text offset; records between 0 and this are text pages
+            'HHH6x'   # 28: footnote rec, sidebar rec, last data offset
+            'H2x'     # 40: image pages
+            'H2x'     # 44: metadata string
+            'HHH',    # 48: footnote offset, sidebar offset, last data offset
+            version,
+            non_text_offset,
+            0, 0, last_data_offset,
+            image_data_offset,
+            meta_data_offset,
+            last_data_offset, last_data_offset, last_data_offset)
+
diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py
index d270c0ef71..8a9b7b105c 100644
--- a/src/calibre/ebooks/pdb/header.py
+++ b/src/calibre/ebooks/pdb/header.py
@@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember '
__docformat__ = 'restructuredtext en'
-import os, struct
+import os, re, struct, time
class PdbHeaderReader(object):
@@ -60,18 +60,26 @@ class PdbHeaderReader(object):
return self.stream.read(end - start)
-class PdbHeaderWriter(object):
+class PdbHeaderBuilder(object):
+    '''
+    Writes the start of a PDB file: the 78 byte database header, one
+    8 byte entry per record and two trailing NUL bytes.
+    '''
+
def __init__(self, identity, title):
- self.identity = identity.ljust(3, '\x00')[:8]
+        # build_header assumes the identity occupies exactly 8 bytes (the
+        # record offsets start at 78), so pad short values up to 8.
+        self.identity = identity.ljust(8, '\x00')[:8]
- self.title = title.ljust(32, '\x00')[:32]
+        # Every run of characters other than letters, digits and '-' is
+        # replaced by one '_'; the result is padded/cut to 32 characters.
+        self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32]
- def build_header(self, offsets):
+    def build_header(self, section_lengths, out_stream):
'''
- Offsets is a list of section offsets
+        section_lengths = Length of each section in file.
+        out_stream = Object with a ``write`` method that receives the
+                     header.
'''
+
+        now = int(time.time())
+        nrecords = len(section_lengths)
+
+        out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
+        out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords))
+
+        # The first section starts after the 78 byte header, the record
+        # entries (8 bytes each) and the 2 trailing pad bytes.
+        offset = 78 + (8 * nrecords) + 2
+        for length in section_lengths:
+            out_stream.write(struct.pack('>LBBBB', offset, 0, 0, 0, 0))
+            offset += length
+        out_stream.write('\x00\x00')
-
-
-
- return header
diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py
index dd87394507..62c07c3d04 100644
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -55,3 +55,4 @@ class TXTOutput(OutputFormatPlugin):
if close:
out_stream.close()
+