From 188f630c35e127e2ab3964b322b8f181b18480af Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 25 Apr 2009 08:14:22 -0400 Subject: [PATCH 1/5] New pdftohtml processing rules. Best yet. --- src/calibre/ebooks/conversion/preprocess.py | 33 ++++++++++----------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index fb55ee74fb..0421534f65 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -31,6 +31,12 @@ def chap_head(match): else: return '

'+chap+'
'+title+'


' +def wrap_lines(match): + ital = match.group('ital') + if not ital: + return ' ' + else: + return ital+' ' def line_length(raw, percent): ''' @@ -93,17 +99,11 @@ class HTMLPreProcessor(object): (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags (re.compile(r'', re.IGNORECASE), lambda match: '
'), - # Remove page numbers - (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), # Replace

with

(re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), - # Remove
- (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if \ - re.match('<', match.group(1).lstrip()) or \ - len(match.group(1)) < 40 else match.group(1)), + # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), + (re.compile(r'-\n\r?'), lambda match: ''), # Remove gray background (re.compile(r']+>'), lambda match : ''), @@ -112,15 +112,12 @@ class HTMLPreProcessor(object): (re.compile(ur'\u00a0'), lambda match : ' '), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(]*>)?(]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), - (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), - # Un wrap lines - (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '), - # Clean up spaces (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -162,12 +159,12 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - # Add rules that require matching line length here - #line_length_rules = [ - # (re.compile('%i' % line_length(html, .85)), lambda match:) - #] + line_length_rules = [ + # Un wrap using punctuation + (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines), + ] - rules = self.PDFTOHTML # + line_length_rules + rules = self.PDFTOHTML + line_length_rules else: rules = [] for rule in self.PREPROCESS + rules: From 0c858e43bcd7a0774c5555e6a8df8496df30c894 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 25 Apr 2009 13:55:45 -0400 Subject: [PATCH 2/5] ereader writer mostly working. --- src/calibre/ebooks/pdb/ereader/__init__.py | 16 ++++ src/calibre/ebooks/pdb/ereader/output.py | 26 ++++-- .../ebooks/pdb/ereader/pmlconverter.py | 63 ++++++++++---- src/calibre/ebooks/pdb/ereader/reader.py | 2 +- src/calibre/ebooks/pdb/ereader/writer.py | 83 +++++++++++++++++-- src/calibre/ebooks/pdb/header.py | 26 ++++-- src/calibre/ebooks/txt/output.py | 1 + 7 files changed, 182 insertions(+), 35 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py index 89d9dfdd35..b39467c6e3 100644 --- a/src/calibre/ebooks/pdb/ereader/__init__.py +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -5,5 +5,21 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' +import os + class EreaderError(Exception): pass + +def image_name(name): + name = os.path.basename(name) + + if len(name) > 32: + cut = len(name) - 32 + names = name[:10] + namee = name[10+cut:] + name = names + namee + + name = name.ljust(32, '\x00')[:32] + + return name + diff --git a/src/calibre/ebooks/pdb/ereader/output.py b/src/calibre/ebooks/pdb/ereader/output.py index 034508b0da..4b188ae2f1 100644 --- a/src/calibre/ebooks/pdb/ereader/output.py +++ b/src/calibre/ebooks/pdb/ereader/output.py @@ -5,9 +5,8 @@ __docformat__ = 'restructuredtext en' import os -from calibre.customize.conversion import OutputFormatPlugin, \ - OptionRecommendation -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata +from calibre.customize.conversion import OutputFormatPlugin +from calibre.ebooks.pdb.ereader.writer import Writer from calibre.ebooks.metadata import authors_to_string class EREADEROutput(OutputFormatPlugin): @@ -17,7 +16,22 @@ class EREADEROutput(OutputFormatPlugin): file_type = 'erpdb' def convert(self, oeb_book, output_path, input_plugin, opts, log): - from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml + writer = Writer(log) -# print html_to_pml('<p class="calibre1"> “A hundred kisses from the Princess,” said he, “or else let everyone keep his own!”</p>') - print html_to_pml(str(oeb_book.spine[3])) + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.truncate() + + writer.dump(oeb_book, out_stream) + + if close: + out_stream.close() + diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 347bde951c..88c841b81f 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en' import re +from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from BeautifulSoup import BeautifulSoup @@ -61,35 +62,69 @@ PML_HTML_RULES = [ ] HTML_PML_RULES = [ + (re.compile(r'\\'), lambda match: '\\\\'), (re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'), - (re.compile('</p>(^\n|\r\n)'), lambda match: '\n'), + (re.compile('</p>(?=^\n|^\r\n)'), lambda match: '\n'), + + + # Clean up HTML + (re.compile('@page.*?}'), lambda match: ''), + (re.compile('<script.*?>.*?</script>', re.DOTALL), lambda match: ''), + (re.compile('<style.*?>.*?</style>', re.DOTALL), lambda match: ''), + + # Reflow paragraphs + (re.compile('<p.*?>(?P<text>.*?)</p>', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')), + + # HTML to PML (re.compile('<a.*?href="#sidebar-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))), (re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))), (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), - (re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), + #(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), + (re.compile('<img.*?src="(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))), #(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), (re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')), - (re.compile('<small.*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), - (re.compile('<sub.*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), - (re.compile('<sup.*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), - (re.compile('<b.*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), - (re.compile('<big.*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), + (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), + (re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), + (re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), + (re.compile('<sub>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), + (re.compile('<sup .*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), + (re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), + (re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), + (re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), (re.compile('<div.*?style.*?margin-left: (?P<val>\d+)%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))), (re.compile('<div.*?style.*?margin-left: \d{1,3}%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), - (re.compile('<!-- (?P<text>.+?) -->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), - (re.compile('<del.*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), + (re.compile('<!--(?P<text>.+?)-->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), + (re.compile('<del .*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), + (re.compile('<del>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), (re.compile('<div.*?style.*?text-decoration: underline;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), - (re.compile('<i.*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), + (re.compile('<i .*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), + (re.compile('<i>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), (re.compile('<div.*?style.*?text-align: right;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), (re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), (re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), - (re.compile('<br.*?>'), lambda match: '\\p'), + (re.compile('<br .*?>'), lambda match: '\\p'), + (re.compile('<br/*>'), lambda match: '\\p'), + + # Remove remaining HTML tags (re.compile('<.*?>'), lambda match: ''), + + # Remove redundant page break markers (re.compile(r'(\\p){2,}'), lambda match: r'\p'), + + # Remove whitespace on empty lines + (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''), + + # Remove excess newlines at the beginning and end + (re.compile('^(\r\n){1,}'), lambda match: ''), + (re.compile('^\n{1,}'), lambda match: ''), + (re.compile('(\r\n){3,}$'), lambda match: ''), + (re.compile('\n{3,}$'), lambda match: ''), ] def pml_to_html(pml): @@ -111,13 +146,13 @@ def html_to_pml(html): pml = '' for dom_tree in BeautifulSoup(html).findAll('body'): - body = unicode(dom_tree.pretty_print()) + body = unicode(dom_tree.prettify()) for rule in HTML_PML_RULES: - body = rule[0].sub(rule[1], pml) + body = rule[0].sub(rule[1], body) pml += body - + # Replace symbols outside of cp1512 wtih \Uxxxx return pml diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index e0e42e40fd..c6f520ecb2 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -40,7 +40,7 @@ class HeaderRecord(object): self.sidebar_offset, = struct.unpack('>H', raw[50:52]) self.last_data_offset, = struct.unpack('>H', raw[52:54]) - self.num_text_pages = self.non_text_offset -1 + self.num_text_pages = self.non_text_offset - 1 self.num_image_pages = self.metadata_offset - self.image_data_offset diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index c9493d2915..1605e15f32 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -4,17 +4,90 @@ from __future__ import with_statement Write content to ereader pdb file. ''' +import struct, zlib + +from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml +IDENTITY = 'PNPdPPrs' + class Writer(object): def __init__(self, log): - self.oeb_book = oeb_book + self.log = log - def dump(oeb_book): + def dump(self, oeb_book, out_stream, metadata=None): + text = self._text(oeb_book.spine) + images = self._images(oeb_book.manifest) + metadata = [self._metadata(metadata)] + + hr = [self._header_record(len(text), len(images))] + + sections = hr+text+images+metadata + + lengths = [len(i) for i in sections] + + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '') + pdbHeaderBuilder.build_header(lengths, out_stream) + + for item in sections: + out_stream.write(item) + + def _text(self, pages): pml_pages = [] - for page in oeb_book.spine: - pml_pages.append(html_to_pml(page)) + for page in pages: + pml_pages.append(zlib.compress(html_to_pml(unicode(page)))) + + return pml_pages - \ No newline at end of file + def _images(self, manifest): + images = [] + + for item in manifest: + if item.media_type in OEB_IMAGES: + image = '\x00\x00\x00\x00' + + image += image_name(item.href) + image = image.ljust(62, '\x00') + image += item.data + + images.append(image) + + return images + + def _metadata(self, metadata): + return '' + + def _header_record(self, text_items, image_items): + ''' + text_items = the number of text pages + image_items = the number of images + ''' + version = 10 + non_text_offset = text_items + + if image_items > 0: + image_data_offset = text_items + 1 + meta_data_offset = image_data_offset + image_items + else: + meta_data_offset = text_items + 1 + image_data_offset = meta_data_offset + + record = u'' + + # Version + record += struct.pack('>H', version) + record = record.ljust(12, '\x00') + record += struct.pack('>H', non_text_offset) + record = record.ljust(40, '\x00') + record += struct.pack('>H', image_data_offset) + record = record.ljust(44, '\x00') + record += struct.pack('>H', meta_data_offset) + record = record.ljust(52, '\x00') + record += struct.pack('>H', meta_data_offset) + + return record + diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index d270c0ef71..8a9b7b105c 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import os, struct +import os, re, struct, time class PdbHeaderReader(object): @@ -60,18 +60,26 @@ class PdbHeaderReader(object): return self.stream.read(end - start) -class PdbHeaderWriter(object): +class PdbHeaderBuilder(object): def __init__(self, identity, title): self.identity = identity.ljust(3, '\x00')[:8] - self.title = title.ljust(32, '\x00')[:32] + self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32] - def build_header(self, offsets): + def build_header(self, section_lengths, out_stream): ''' - Offsets is a list of section offsets + section_lengths = Lenght of each section in file. ''' + + now = int(time.time()) + nrecords = len(section_lengths) + + out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0)) + out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords)) + + offset = 78 + (8 * nrecords) + 2 + for id, record in enumerate(section_lengths): + out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0)) + offset += record + out_stream.write('\x00\x00') - - - - return header diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index dd87394507..62c07c3d04 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -55,3 +55,4 @@ class TXTOutput(OutputFormatPlugin): if close: out_stream.close() + From 6ee829ff794bba0820f6c21ed44d62760df5eca3 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 25 Apr 2009 15:50:52 -0400 Subject: [PATCH 3/5] ereader output work --- src/calibre/ebooks/conversion/preprocess.py | 8 ++++---- src/calibre/ebooks/pdb/ereader/pmlconverter.py | 11 ++++++++--- src/calibre/ebooks/pdb/ereader/reader.py | 2 +- src/calibre/ebooks/pdb/ereader/writer.py | 4 ++-- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 0421534f65..43f1f619d0 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -27,9 +27,9 @@ def chap_head(match): chap = match.group('chap') title = match.group('title') if not title: - return '<h1>'+chap+'</h1><br/>' + return '<h1>'+chap+'</h1><br/>\n' else: - return '<h1>'+chap+'<br/>'+title+'</h1><br/>' + return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n' def wrap_lines(match): ital = match.group('ital') @@ -121,7 +121,7 @@ class HTMLPreProcessor(object): # Clean up spaces (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics - (re.compile(r'(?<!“)<i>'), lambda match: ' <i>'), + (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), ] @@ -161,7 +161,7 @@ class HTMLPreProcessor(object): elif self.is_pdftohtml(html): line_length_rules = [ # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), ] rules = self.PDFTOHTML + line_length_rules diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 88c841b81f..a9c9d2f7a7 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -50,6 +50,7 @@ PML_HTML_RULES = [ # eReader files are one paragraph per line. # This forces the lines to wrap properly. (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')), + (re.compile('<p>[ ]*</p>'), lambda match: ''), # Remove unmatched plm codes. (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''), @@ -82,7 +83,7 @@ HTML_PML_RULES = [ (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), #(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('<img.*?src="(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))), + (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))), #(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), (re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), @@ -93,6 +94,8 @@ HTML_PML_RULES = [ (re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), (re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('<strong .*?>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('<strong>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), @@ -108,8 +111,8 @@ HTML_PML_RULES = [ (re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), (re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), - (re.compile('<br .*?>'), lambda match: '\\p'), - (re.compile('<br/*>'), lambda match: '\\p'), + (re.compile('<br .*?>'), lambda match: '\n'), + (re.compile('<br/*>'), lambda match: '\n'), # Remove remaining HTML tags (re.compile('<.*?>'), lambda match: ''), @@ -119,6 +122,8 @@ HTML_PML_RULES = [ # Remove whitespace on empty lines (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''), + # Remove excess whitespace in lines + (re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '), # Remove excess newlines at the beginning and end (re.compile('^(\r\n){1,}'), lambda match: ''), diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index c6f520ecb2..e0953753f4 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -154,7 +154,7 @@ class Reader(FormatReader): for i in images: manifest.append((os.path.join('images/', i), None)) - + opf.create_manifest(manifest) opf.create_spine(['index.html']) with open('metadata.opf', 'wb') as opffile: diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 1605e15f32..cc90b41fb6 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -39,7 +39,7 @@ class Writer(object): pml_pages = [] for page in pages: - pml_pages.append(zlib.compress(html_to_pml(unicode(page)))) + pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8'))) return pml_pages @@ -67,7 +67,7 @@ class Writer(object): image_items = the number of images ''' version = 10 - non_text_offset = text_items + non_text_offset = text_items + 1 if image_items > 0: image_data_offset = text_items + 1 From e7ec12575d51ad4e2a645fdb74295e5d4cbc0058 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 25 Apr 2009 16:57:29 -0400 Subject: [PATCH 4/5] ereader writer working --- src/calibre/ebooks/pdb/ereader/pmlconverter.py | 4 ++-- src/calibre/ebooks/pdb/ereader/reader.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index a9c9d2f7a7..391f70a504 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -39,7 +39,7 @@ PML_HTML_RULES = [ (re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')), (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')), (re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), - (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')), + (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')), (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))), (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), @@ -83,7 +83,7 @@ HTML_PML_RULES = [ (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), #(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))), + (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))), #(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), (re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index e0953753f4..d36e01ed69 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -76,7 +76,7 @@ class Reader(FormatReader): if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: return 'empty', '' data = self.section_data(number) - name = data[4:4+32].strip('\0') + name = data[4:4+32].strip('\x00') img = data[62:] return name, img @@ -97,7 +97,7 @@ class Reader(FormatReader): if not os.path.exists(output_dir): os.makedirs(output_dir) - html = '<html><head><title>' + html = u'' for i in range(1, self.header_record.num_text_pages + 1): self.log.debug('Extracting text page %i' % i) @@ -110,8 +110,7 @@ class Reader(FormatReader): self.log.debug('Extracting footnote page %i' % i) html += '

' html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) - html += '
' - + html += '' if self.header_record.sidebar_rec > 0: html += '

%s

' % _('Sidebar') @@ -127,7 +126,8 @@ class Reader(FormatReader): with CurrentDir(output_dir): with open('index.html', 'wb') as index: self.log.debug('Writing text to index.html') - index.write(html.encode('utf-8')) + index.write(html) +# print html if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) From 1daf7bd86a950e7b21676341098d8c3f01e00f39 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 25 Apr 2009 19:57:21 -0400 Subject: [PATCH 5/5] ereader writer changes --- src/calibre/ebooks/conversion/preprocess.py | 7 ----- src/calibre/ebooks/pdb/ereader/writer.py | 34 ++++++++++++++++++--- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 230d759755..dad77ea3aa 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -26,17 +26,10 @@ def sanitize_head(match): def chap_head(match): chap = match.group('chap') title = match.group('title') -<<<<<<< TREE if not title: return '

'+chap+'


\n' else: return '

'+chap+'
\n'+title+'


\n' -======= - if not title: - return '

'+chap+'


' - else: - return '

'+chap+'
'+title+'


' ->>>>>>> MERGE-SOURCE def wrap_lines(match): ital = match.group('ital') diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index cc90b41fb6..65eb35157e 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -6,6 +6,8 @@ Write content to ereader pdb file. import struct, zlib +import Image, cStringIO + from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.ereader import image_name @@ -52,14 +54,22 @@ class Writer(object): image += image_name(item.href) image = image.ljust(62, '\x00') - image += item.data - images.append(image) + im = Image.open(cStringIO.StringIO(item.data)) + + data = cStringIO.StringIO() + im.save(data, 'PNG') + data = data.getvalue() + + image += data + + if len(image) < 65505: + images.append(image) return images def _metadata(self, metadata): - return '' + return '\x00\x00\x00\x00\x00' def _header_record(self, text_items, image_items): ''' @@ -72,22 +82,36 @@ class Writer(object): if image_items > 0: image_data_offset = text_items + 1 meta_data_offset = image_data_offset + image_items + last_data_offset = meta_data_offset + 1 else: meta_data_offset = text_items + 1 - image_data_offset = meta_data_offset + last_data_offset = meta_data_offset + 1 + image_data_offset = last_data_offset record = u'' # Version record += struct.pack('>H', version) record = record.ljust(12, '\x00') + # Non-text offset, everything between record 0 and non_text_offset is text pages record += struct.pack('>H', non_text_offset) + record = record.ljust(28, '\x00') + # Footnote and Sidebar rec + record += struct.pack('>H', 0) + record += struct.pack('>H', 0) + record += struct.pack('>H', last_data_offset) record = record.ljust(40, '\x00') + # image pages record += struct.pack('>H', image_data_offset) record = record.ljust(44, '\x00') + # metadata string record += struct.pack('>H', meta_data_offset) + record = record.ljust(48, '\x00') + # footnote and sidebar offsets + record += struct.pack('>H', last_data_offset) + record += struct.pack('>H', last_data_offset) record = record.ljust(52, '\x00') - record += struct.pack('>H', meta_data_offset) + record += struct.pack('>H', last_data_offset) return record