diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index b105a6c042..fb55ee74fb 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -32,6 +32,39 @@ def chap_head(match):
return '
'+chap+'
'+title+'
'
+def line_length(raw, percent):
+    '''
+    Return a representative line length for raw, for use when wrapping.
+
+    raw is the raw text to find the line length to use for wrapping.
+    percent is a decimal number, 0 - 1, which is used to determine
+    how far into the sorted list of line lengths to look. Values
+    outside that range are clamped to it.
+
+    Empty lines are not counted, and lines more than twice as long as
+    the average are discarded before the lookup. Returns 0 when raw
+    contains no measurable lines.
+    '''
+    # NOTE(review): the '&nbsp;' entity and the '<br>' delimiters below
+    # were reconstructed from a damaged copy of this patch -- confirm
+    # them against the upstream file before applying.
+    raw = raw.replace('&nbsp;', ' ')
+    linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+    lengths = [len(line) for line in linere.findall(raw) if line]
+    if not lengths:
+        # Nothing to measure; the average below would divide by zero.
+        return 0
+
+    # Floor division, as plain '/' does on ints in Python 2.
+    avg = sum(lengths) // len(lengths)
+    max_line = avg * 2
+    lengths = sorted(length for length in lengths if length <= max_line)
+
+    percent = min(max(percent, 0), 1)
+
+    # Clamp at 0: a small percent would otherwise give index -1, which
+    # selects the longest line instead of the shortest.
+    index = max(int(len(lengths) * percent) - 1, 0)
+
+    return lengths[index]
+
+
class CSSPreProcessor(object):
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@@ -129,7 +162,12 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html):
- rules = self.PDFTOHTML
+ # Add rules that require matching line length here
+ #line_length_rules = [
+ # (re.compile('%i' % line_length(html, .85)), lambda match:)
+ #]
+
+ rules = self.PDFTOHTML # + line_length_rules
else:
rules = []
for rule in self.PREPROCESS + rules:
diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
index 8ff30e9349..347bde951c 100644
--- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py
+++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
@@ -12,10 +12,12 @@ import re
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
+from BeautifulSoup import BeautifulSoup
+
PML_HTML_RULES = [
(re.compile(r'\\p'), lambda match: '
'),
(re.compile(r'\\x(?P.+?)\\x', re.DOTALL), lambda match: '%s
' % match.group('text')),
- (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
+ (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry
(re.compile(r'\\c(?P.+?)\\c', re.DOTALL), lambda match: '%s
' % match.group('text')),
(re.compile(r'\\r(?P.+?)\\r', re.DOTALL), lambda match: '%s
' % match.group('text')),
@@ -24,8 +26,8 @@ PML_HTML_RULES = [
(re.compile(r'\\o(?P.+?)\\o', re.DOTALL), lambda match: '%s' % match.group('text')),
(re.compile(r'\\v(?P.+?)\\v', re.DOTALL), lambda match: '' % match.group('text')),
(re.compile(r'\\t(?P.+?)\\t', re.DOTALL), lambda match: '%s
' % match.group('text')),
- (re.compile(r'\\T="(?P\d+)%%*"(?P.+?)$', re.MULTILINE), lambda match: '%s
' % (match.group('val'), match.group('text'))),
- (re.compile(r'\\w="(?P\d+)%%"'), lambda match: '
' % match.group('val')),
+ (re.compile(r'\\T="(?P\d+)%*"(?P.+?)$', re.MULTILINE), lambda match: r'%s
' % (match.group('val'), match.group('text'))),
+ (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')),
(re.compile(r'\\n'), lambda match: ''),
(re.compile(r'\\s'), lambda match: ''),
(re.compile(r'\\b(?P.+?)\\b', re.DOTALL), lambda match: '%s' % match.group('text')), # \b is deprecated; \B should be used instead.
@@ -34,10 +36,10 @@ PML_HTML_RULES = [
(re.compile(r'\\Sp(?P.+?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')),
(re.compile(r'\\Sb(?P.+?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')),
(re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')),
- (re.compile(r'\\a(?P\d\d\d)'), lambda match: '%i;' % match.group('num')),
- (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '%i;' % int(match.group('num'))),
+ (re.compile(r'\\a(?P\d\d\d)'), lambda match: '%s;' % match.group('num')),
+ (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
(re.compile(r'\\m="(?P.+?)"'), lambda match: '
' % match.group('name')),
- (re.compile(r'\\q="(?P#.+?)"(?P)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))),
+ (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))),
(re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')),
(re.compile(r'\\-'), lambda match: ''),
(re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))),
@@ -58,14 +60,6 @@ PML_HTML_RULES = [
(re.compile(r'\\\\'), lambda match: '\\'),
]
-FOOTNOTE_HTML_RULES = [
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '')
-]
-
-SIDEBAR_HTML_RULES = [
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '')
-]
-
HTML_PML_RULES = [
(re.compile(r'\\'), lambda match: '\\\\'),
(re.compile('(?<=[^\n])[ ]*'), lambda match: '\n'),
@@ -75,23 +69,23 @@ HTML_PML_RULES = [
(re.compile('
.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')),
(re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
(re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
- (re.compile('(?P\d\d\d\d);'), lambda match: '\\U%i' % int(match.group('num'))),
- (re.compile('(?P\d\d\d);'), lambda match: '\\a%i' % match.group('num')),
+ #(re.compile('(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
+ (re.compile('(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
- (re.compile('\d+)%%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
- (re.compile('\d+)%%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%i%%"%s$' % (match.group('val'), match.group('text'))),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
+ (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
+ (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
(re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
- (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%i%s\\X%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
+ (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
(re.compile(''), lambda match: '\\p'),
(re.compile('<.*?>'), lambda match: ''),
@@ -109,28 +103,20 @@ def pml_to_html(pml):
return html
-def footnote_to_html(footnotes):
- html = footnotes
- for rule in FOOTNOTE_HTML_RULES:
- html = rule[0].sub(rule[1], html)
-
- html = pml_to_html(html)
-
- return html
-
-def sidebar_to_html(sidebars):
- html = sidebars
- for rule in FOOTNOTE_HTML_RULES:
- html = rule[0].sub(rule[1], html)
-
- html = pml_to_html(html)
-
- return html
+def footnote_sidebar_to_html(id, pml):
+ '''
+ Convert the PML of one footnote or sidebar entry to HTML.
+
+ pml is passed through pml_to_html and the result is formatted into
+ the returned string together with id.
+ '''
+ # NOTE(review): the format string holds one %s but is given three
+ # values; the markup that carried the other placeholders looks to have
+ # been lost from this copy of the patch -- restore it from upstream.
+ html = '%s' % (id, id, pml_to_html(pml))
+ return html
def html_to_pml(html):
- pml = html
- for rule in HTML_PML_RULES:
- pml = rule[0].sub(rule[1], pml)
+    pml = ''
+
+    # Only the contents of <body> elements are converted.
+    for dom_tree in BeautifulSoup(html).findAll('body'):
+        # NOTE(review): pretty_print() does not look like a BeautifulSoup
+        # tag method (prettify() is the documented one) -- confirm.
+        body = unicode(dom_tree.pretty_print())
+
+        # Each rule must transform the current body. Substituting on pml
+        # here would throw the body text away and re-process the output
+        # accumulated so far instead.
+        for rule in HTML_PML_RULES:
+            body = rule[0].sub(rule[1], body)
+
+        pml += body
# Replace symbols outside of cp1512 wtih \Uxxxx
diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py
index 8a0abb970e..e0e42e40fd 100644
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember '
__docformat__ = 'restructuredtext en'
-import os, sys, struct, zlib
+import os, re, sys, struct, zlib
from calibre import CurrentDir
from calibre.ebooks import DRMError
@@ -16,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
- footnote_to_html, sidebar_to_html
+ footnote_sidebar_to_html
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator
@@ -42,14 +42,6 @@ class HeaderRecord(object):
self.num_text_pages = self.non_text_offset -1
self.num_image_pages = self.metadata_offset - self.image_data_offset
-
- # Can't tell which is sidebar and footnote if they have same offset.
- # They don't exist if offset is larget than last_record.
- # Todo: Determine if the subtraction is necessary and find out
- # what _rec means.
- end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset
- self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0
- self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0
class Reader(FormatReader):
@@ -94,44 +86,10 @@ class Reader(FormatReader):
assumed to be encoded as Windows-1252. The encoding is part of
the eReader file spec and should always be this encoding.
'''
- if number not in range(1, self.header_record.num_text_pages):
+ if number not in range(1, self.header_record.num_text_pages + 1):
return ''
return self.decompress_text(number)
-
- def get_footnote_page(self, number):
- if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages):
- return ''
-
- return self.decompress_text(number)
-
- def get_sidebar_page(self, number):
- if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1):
- return ''
-
- return self.decompress_text(number)
-
- def has_footnotes(self):
- if self.header_record.num_footnote_pages > 1:
- try:
- content = self.decompress_text(self.header_record.footnote_offset)
-
- if content.contains(''):
- return True
- except:
- pass
- return False
-
- def has_sidebar(self):
- if self.header_record.num_sidebar_pages > 1:
- try:
- content = self.decompress_text(self.header_record.sidebar_offset)
-
- if content.contains(''):
- return True
- except:
- pass
- return False
def extract_content(self, output_dir):
output_dir = os.path.abspath(output_dir)
@@ -144,22 +102,25 @@ class Reader(FormatReader):
for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i)
html += pml_to_html(self.get_text_page(i))
-
- # Untested: The num_.._pages variable may not be correct!
- # Possibly use .._rec instead?
- '''
- if has_footnotes():
+
+ if self.header_record.footnote_rec > 0:
html += '
%s
' % _('Footnotes')
- for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages):
+ footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
+ for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
self.log.debug('Extracting footnote page %i' % i)
- html += footnote_to_html(self.get_footnote_page(i))
+ html += ''
+ html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
+ html += '
'
- if has_sidebar():
+
+ if self.header_record.sidebar_rec > 0:
html += '
%s
' % _('Sidebar')
- for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages):
+ sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
+ for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
self.log.debug('Extracting sidebar page %i' % i)
- html += sidebar_to_html(self.get_sidebar_page(i))
- '''
+ html += ''
+ html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
+ html += '
'
html += '