diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 61a91febda..347bde951c 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -12,10 +12,12 @@ import re from calibre.ebooks.htmlsymbols import HTML_SYMBOLS +from BeautifulSoup import BeautifulSoup + PML_HTML_RULES = [ (re.compile(r'\\p'), lambda match: '

'), (re.compile(r'\\x(?P.+?)\\x', re.DOTALL), lambda match: '

%s

' % match.group('text')), - (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry (re.compile(r'\\c(?P.+?)\\c', re.DOTALL), lambda match: '
%s
' % match.group('text')), (re.compile(r'\\r(?P.+?)\\r', re.DOTALL), lambda match: '
%s
' % match.group('text')), @@ -34,10 +36,10 @@ PML_HTML_RULES = [ (re.compile(r'\\Sp(?P.+?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\Sb(?P.+?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%i;' % match.group('num')), - (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))), + (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), + (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % match.group('name')), - (re.compile(r'\\q="(?P#.+?)"(?P)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile(r'\\Q="(?P.+?)"'), lambda match: '
' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), @@ -67,23 +69,23 @@ HTML_PML_RULES = [ (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%i' % int(match.group('num'))), - (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%i' % match.group('num')), + #(re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), + (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), - (re.compile('\d+)%%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), - (re.compile('\d+)%%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%i%%"%s$' % (match.group('val'), match.group('text'))), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), + (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), + (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), - (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%i%s\\X%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), (re.compile(''), lambda match: '\\p'), (re.compile('<.*?>'), lambda match: ''), @@ -102,13 +104,19 @@ def pml_to_html(pml): return html def footnote_sidebar_to_html(id, pml): - html = '' % (id, pml_to_html(pml)) + html = '
%s
' % (id, id, pml_to_html(pml)) return html def html_to_pml(html): - pml = html - for rule in HTML_PML_RULES: - pml = rule[0].sub(rule[1], pml) + pml = '' + + for dom_tree in BeautifulSoup(html).findAll('body'): + body = unicode(dom_tree.pretty_print()) + + for rule in HTML_PML_RULES: + body = rule[0].sub(rule[1], pml) + + pml += body # Replace symbols outside of cp1512 wtih \Uxxxx diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index b47dac1af0..e0e42e40fd 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -108,14 +108,19 @@ class Reader(FormatReader): footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): self.log.debug('Extracting footnote page %i' % i) + html += '
' html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) + html += '
' + if self.header_record.sidebar_rec > 0: html += '

%s

' % _('Sidebar') sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): self.log.debug('Extracting sidebar page %i' % i) + html += '
' html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) + html += '
' html += ''