diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
index 61a91febda..347bde951c 100644
--- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py
+++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
@@ -12,10 +12,12 @@ import re
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
+from BeautifulSoup import BeautifulSoup
+
PML_HTML_RULES = [
(re.compile(r'\\p'), lambda match: '
'),
(re.compile(r'\\x(?P.+?)\\x', re.DOTALL), lambda match: '%s
' % match.group('text')),
- (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
+ (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry
(re.compile(r'\\c(?P.+?)\\c', re.DOTALL), lambda match: '%s
' % match.group('text')),
(re.compile(r'\\r(?P.+?)\\r', re.DOTALL), lambda match: '%s
' % match.group('text')),
@@ -34,10 +36,10 @@ PML_HTML_RULES = [
(re.compile(r'\\Sp(?P.+?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')),
(re.compile(r'\\Sb(?P.+?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')),
(re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')),
- (re.compile(r'\\a(?P\d\d\d)'), lambda match: '%i;' % match.group('num')),
- (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '%i;' % int(match.group('num'))),
+ (re.compile(r'\\a(?P\d\d\d)'), lambda match: '%s;' % match.group('num')),
+ (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
(re.compile(r'\\m="(?P.+?)"'), lambda match: '
' % match.group('name')),
- (re.compile(r'\\q="(?P#.+?)"(?P)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))),
+ (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))),
(re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')),
(re.compile(r'\\-'), lambda match: ''),
(re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))),
@@ -67,23 +69,23 @@ HTML_PML_RULES = [
(re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')),
(re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
(re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
- (re.compile('(?P\d\d\d\d);'), lambda match: '\\U%i' % int(match.group('num'))),
- (re.compile('(?P\d\d\d);'), lambda match: '\\a%i' % match.group('num')),
+ #(re.compile('(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
+ (re.compile('(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
- (re.compile('\d+)%%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
- (re.compile('\d+)%%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%i%%"%s$' % (match.group('val'), match.group('text'))),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
+ (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
+ (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
+ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
(re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
- (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%i%s\\X%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
+ (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
(re.compile(''), lambda match: '\\p'),
(re.compile('<.*?>'), lambda match: ''),
@@ -102,13 +104,19 @@ def pml_to_html(pml):
return html
def footnote_sidebar_to_html(id, pml):
- html = '' % (id, pml_to_html(pml))
+ html = '%s' % (id, id, pml_to_html(pml))
return html
def html_to_pml(html):
- pml = html
- for rule in HTML_PML_RULES:
- pml = rule[0].sub(rule[1], pml)
+ pml = ''
+
+ for dom_tree in BeautifulSoup(html).findAll('body'):
+ body = unicode(dom_tree.pretty_print())
+
+ for rule in HTML_PML_RULES:
+ body = rule[0].sub(rule[1], pml)
+
+ pml += body
# Replace symbols outside of cp1252 with \Uxxxx
diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py
index b47dac1af0..e0e42e40fd 100644
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@@ -108,14 +108,19 @@ class Reader(FormatReader):
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
self.log.debug('Extracting footnote page %i' % i)
+ html += ''
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
+ html += '
'
+
if self.header_record.sidebar_rec > 0:
html += '
%s
' % _('Sidebar')
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
self.log.debug('Extracting sidebar page %i' % i)
+ html += ''
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
+ html += '
'
html += '