Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
eReader PDB Output: proper length of indexes and do not try to add them if they are not available. PML Output: cleanup. PML Input: read unicode and entity PML tags correctly.
Commit: b3ad9f0160
Parent: 4865f55e40
@@ -75,7 +75,6 @@ class FB2MLizer(object):
         output.append(self.fb2mlize_images())
         output.append(self.fb2_footer())
         output = ''.join(output).replace(u'ghji87yhjko0Caliblre-toc-placeholder-for-insertion-later8ujko0987yjk', self.get_toc())
-        return output
         return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
 
     def fb2_header(self):
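The FB2 hunk above removes a stray early 'return output', so the pretty-printed XML return below it is actually reached. The surrounding code relies on a placeholder-substitution pattern for the TOC; a minimal sketch of that pattern, with illustrative names rather than the calibre API:

from lxml import etree

# A marker unlikely to occur in real book text stands in for the TOC until the
# whole body has been generated, then gets swapped for the real TOC markup.
PLACEHOLDER = 'toc-placeholder-8ujko0987yjk'

def assemble(parts, toc_xml):
    output = ''.join(parts).replace(PLACEHOLDER, toc_xml)
    # Re-parse and pretty-print so the final document is well-formed XML.
    return '<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(
        etree.fromstring(output), encoding=str, pretty_print=True)

print(assemble(['<body>', PLACEHOLDER, '<p>text</p>', '</body>'], '<p>TOC</p>'))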
@@ -34,7 +34,6 @@ class HeaderRecord(object):
         self.has_metadata, = struct.unpack('>H', raw[24:26])
         self.footnote_rec, = struct.unpack('>H', raw[28:30])
         self.sidebar_rec, = struct.unpack('>H', raw[30:32])
-        self.bookmark_offset, = struct.unpack('>H', raw[32:34])
         self.image_data_offset, = struct.unpack('>H', raw[40:42])
         self.metadata_offset, = struct.unpack('>H', raw[44:46])
         self.footnote_offset, = struct.unpack('>H', raw[48:50])
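The reader pulls each header field out of the raw record with big-endian '>H' unpacks at fixed byte offsets; bytes 32-34, previously read as a bookmark offset, are now written as the chapter-index offset by the writer changes further down. A self-contained sketch of this fixed-offset parsing, using only offsets that appear in this diff (field names follow the writer's comments, not an official spec):

import struct

def parse_ereader_header(raw):
    fields = {}
    fields['compression'], = struct.unpack('>H', raw[0:2])        # 2 = PalmDoc, 10 = zlib
    fields['non_text_offset'], = struct.unpack('>H', raw[12:14])  # first non-text record
    fields['chapter_count'], = struct.unpack('>H', raw[14:16])
    fields['link_count'], = struct.unpack('>H', raw[22:24])
    fields['image_data_offset'], = struct.unpack('>H', raw[40:42])
    fields['metadata_offset'], = struct.unpack('>H', raw[44:46])
    return fields

# Example: a zeroed 132-byte header with compression=10 and non_text_offset=5.
raw = bytearray(132)
raw[0:2] = struct.pack('>H', 10)
raw[12:14] = struct.pack('>H', 5)
print(parse_ereader_header(bytes(raw)))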
@@ -8,6 +8,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+import re
 import struct
 import zlib
 
@@ -28,7 +29,7 @@ IDENTITY = 'PNRdPPrs'
 
 # This is an arbitrary number that is small enough to work. The actual maximum
 # record size is unknown.
-MAX_RECORD_SIZE = 3560
+MAX_RECORD_SIZE = 8192
 
 class Writer(FormatWriter):
 
@@ -37,13 +38,33 @@ class Writer(FormatWriter):
         self.log = log
 
     def write_content(self, oeb_book, out_stream, metadata=None):
-        text, image_hrefs = self._text(oeb_book)
-        images = self._images(oeb_book.manifest, image_hrefs)
+        pmlmlizer = PMLMLizer(self.log)
+        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
 
+        text, text_sizes = self._text(pml)
+        chapter_index = self._chapter_index(pml)
+        link_index = self._link_index(pml)
+        images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
         metadata = [self._metadata(metadata)]
+        hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))]
 
-        hr = [self._header_record(len(text), len(images))]
-
-        sections = hr+text+images+metadata+['MeTaInFo\x00']
+        '''
+        Record order as generated by Dropbook.
+            1. eReader Header
+            2. Compressed text
+            3. Small font page index
+            4. Large font page index
+            5. Chapter index
+            6. Links index
+            7. Images
+            8. (Extrapolation: there should be one more record type here though yet uncovered what it might be).
+            9. Metadata
+            10. Sidebar records
+            11. Footnote records
+            12. Text block size record
+            13. "MeTaInFo\x00" word record
+        '''
+        sections = hr+text+chapter_index+link_index+images+metadata+[text_sizes]+['MeTaInFo\x00']
 
         lengths = [len(i) if i not in images else len(i[0]) + len(i[1]) for i in sections]
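With every index in place, the section list above maps one-to-one onto PDB record numbers, which is what the header offsets later in this diff encode. A toy illustration with made-up counts (three text records, one chapter-index entry, no links, two images); plain Python, not calibre code:

# Toy stand-ins for the real records, just to show the numbering.
text = ['text-rec-1', 'text-rec-2', 'text-rec-3']
chapter_index, link_index = ['chapter-idx-1'], []
images, metadata, text_sizes = ['img-1', 'img-2'], ['metadata'], 'sizes'

sections = ['eReader header'] + text + chapter_index + link_index + images \
           + metadata + [text_sizes] + ['MeTaInFo\x00']

for record_number, record in enumerate(sections):
    print(record_number, repr(record))
# 0 = header, 1-3 = compressed text, 4 = chapter index, 5-6 = images,
# 7 = metadata, 8 = text block size record, 9 = 'MeTaInFo\x00'

Here non_text_offset would be 4 (len(text) + 1), matching the arithmetic in the rewritten _header_record further down.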
@@ -57,17 +78,74 @@ class Writer(FormatWriter):
             else:
                 out_stream.write(item)
 
-    def _text(self, oeb_book):
-        pmlmlizer = PMLMLizer(self.log)
-        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
-
+    def _text(self, pml):
         pml_pages = []
-        for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
-            pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
+        text_sizes = ''
+        index = 0
+        while index < len(pml):
+            '''
+            Split on the space character closest to MAX_RECORD_SIZE when possible.
+            '''
+            split = pml.rfind(' ', index, MAX_RECORD_SIZE)
+            if split == -1:
+                len_end = len(pml[index:])
+                if len_end > MAX_RECORD_SIZE:
+                    split = MAX_RECORD_SIZE
+                else:
+                    split = len_end
+            if split == 0:
+                split = 1
+            pml_pages.append(zlib.compress(pml[index:index+split]))
+            text_sizes += struct.pack('>H', split)
+            index += split
 
-        return pml_pages, pmlmlizer.image_hrefs
+        return pml_pages, text_sizes
 
+    def _index_item(self, mo):
+        index = ''
+        if 'text' in mo.groupdict().keys():
+            index += struct.pack('>L', mo.start())
+            text = mo.group('text')
+            # Strip all PML tags from text
+            text = re.sub(r'\\U[0-9a-z]{4}', '', text)
+            text = re.sub(r'\\a\d{3}', '', text)
+            text = re.sub(r'\\.', '', text)
+            # Add appropriate spacing to denote the various levels of headings
+            if 'val' in mo.groupdict().keys():
+                text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
+            index += text
+            index += '\x00'
+        return index
+
+    def _chapter_index(self, pml):
+        chapter_marks = [
+            r'(?s)\\x(?P<text>.+?)\\x',
+            r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
+            r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
+        ]
+        index = []
+        for chapter_mark in chapter_marks:
+            for mo in re.finditer(chapter_mark, pml):
+                index.append(self._index_item(mo))
+        return index
+
+    def _link_index(self, pml):
+        index = []
+        for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml):
+            index.append(self._index_item(mo))
+        return index
+
     def _images(self, manifest, image_hrefs):
+        '''
+        Image format.
+
+        0-4   : 'PNG '. There must be a space after PNG.
+        4-36  : Image name. Must be exactly 32 bytes long. Pad with \x00 for names shorter than 32 bytes
+        36-58 : Unknown.
+        58-60 : Width.
+        60-62 : Height.
+        62-...: Raw image data in 8 bit PNG format.
+        '''
         images = []
 
         for item in manifest:
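Each entry produced by _index_item above is a small binary structure: a four-byte big-endian offset into the uncompressed PML, the heading or link text with PML codes stripped (indented four spaces per heading level), and a terminating NUL. A standalone Python 3 sketch of one chapter-index entry; the original operates on cp1252 byte strings under Python 2, so its offsets are byte offsets, while the character offset used here coincides only for ASCII text:

import re
import struct

pml = 'Intro text \\X1Chapter One\\X1 more text'   # PML with one level-1 heading

mo = re.search(r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', pml)
entry = struct.pack('>L', mo.start())            # offset of the mark in the PML stream
text = re.sub(r'\\.', '', mo.group('text'))      # strip any remaining PML codes
text = ' ' * 4 * int(mo.group('val')) + text     # indent to show the heading level
entry += text.encode('cp1252') + b'\x00'         # entry text, NUL terminated

print(entry)   # b'\x00\x00\x00\x0b    Chapter One\x00'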
@@ -82,6 +160,8 @@ class Writer(FormatWriter):
 
                 header = 'PNG '
                 header += image_hrefs[item.href].ljust(32, '\x00')[:32]
+                header = header.ljust(58, '\x00')
+                header += struct.pack('>HH', im.size[0], im.size[1])
                 header = header.ljust(62, '\x00')
 
                 if len(data) + len(header) < 65505:
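The two added lines pad the header out to byte 58 and then pack the image's width and height, completing the 62-byte layout documented in the _images docstring. A standalone sketch of that header, not calibre code:

import struct

def ereader_image_header(name, width, height):
    header = b'PNG '                              # bytes 0-4, note the trailing space
    header += name.encode('cp1252').ljust(32, b'\x00')[:32]   # bytes 4-36, image name
    header = header.ljust(58, b'\x00')            # bytes 36-58, unknown, zeroed here
    header += struct.pack('>HH', width, height)   # bytes 58-60 width, 60-62 height
    return header

hdr = ereader_image_header('cover.png', 600, 800)
print(len(hdr), hdr[:13])   # 62 b'PNG cover.png'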
@@ -121,52 +201,60 @@ class Writer(FormatWriter):
 
         return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
 
-    def _header_record(self, text_items, image_items):
+    def _header_record(self, text_count, chapter_count, link_count, image_count):
         '''
-        text_items = the number of text pages
-        image_items = the number of images
+        text_count = the number of text pages
+        image_count = the number of images
         '''
-        version = 10 # Zlib compression
-        non_text_offset = text_items + 1
+        compression = 10 # zlib compression.
+        non_text_offset = text_count + 1
 
-        if image_items > 0:
-            image_data_offset = text_items + 1
-            meta_data_offset = image_data_offset + image_items
+        chapter_offset = non_text_offset
+        link_offset = chapter_offset + chapter_count
+
+        if image_count > 0:
+            image_data_offset = link_offset + link_count
+            meta_data_offset = image_data_offset + image_count
             last_data_offset = meta_data_offset + 1
         else:
-            meta_data_offset = text_items + 1
+            meta_data_offset = link_offset + link_count
             last_data_offset = meta_data_offset + 1
             image_data_offset = last_data_offset
 
+        if chapter_count == 0:
+            chapter_offset = last_data_offset
+        if link_count == 0:
+            link_offset = last_data_offset
+
         record = ''
 
-        record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
+        record += struct.pack('>H', compression) # [0:2] # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
-        record += struct.pack('>H', 0) # [2:4]
+        record += struct.pack('>H', 0) # [2:4] # Unknown.
-        record += struct.pack('>H', 0) # [4:6]
+        record += struct.pack('>H', 0) # [4:6] # Unknown.
         record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
-        record += struct.pack('>H', 0) # [8:10]
+        record += struct.pack('>H', 0) # [8:10] # Number of small font pages. 0 if page index is not built.
-        record += struct.pack('>H', 0) # [10:12]
+        record += struct.pack('>H', 0) # [10:12] # Number of large font pages. 0 if page index is not built.
-        record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
+        record += struct.pack('>H', non_text_offset) # [12:14] # Non-Text record start.
-        record += struct.pack('>H', 0) # [14:16]
+        record += struct.pack('>H', chapter_count) # [14:16] # Number of chapter index records.
-        record += struct.pack('>H', 0) # [16:18]
+        record += struct.pack('>H', 0) # [16:18] # Number of small font page index records.
-        record += struct.pack('>H', 0) # [18:20]
+        record += struct.pack('>H', 0) # [18:20] # Number of large font page index records.
-        record += struct.pack('>H', image_items) # [20:22] # Number of images
+        record += struct.pack('>H', image_count) # [20:22] # Number of images.
-        record += struct.pack('>H', 0) # [22:24]
+        record += struct.pack('>H', link_count) # [22:24] # Number of links.
-        record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not
+        record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not.
-        record += struct.pack('>H', 0) # [26:28]
+        record += struct.pack('>H', 0) # [26:28] # Unknown.
-        record += struct.pack('>H', 0) # [28:30] # footnote_rec
+        record += struct.pack('>H', 0) # [28:30] # Number of Footnotes.
-        record += struct.pack('>H', 0) # [30:32] # sidebar_rec
+        record += struct.pack('>H', 0) # [30:32] # Number of Sidebars.
-        record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
+        record += struct.pack('>H', chapter_offset) # [32:34] # Chapter index offset.
-        record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
+        record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC.
-        record += struct.pack('>H', 0) # [36:38]
+        record += struct.pack('>H', last_data_offset) # [36:38] # Small font page offset. This will be the last data offset if there are none.
-        record += struct.pack('>H', 0) # [38:40]
+        record += struct.pack('>H', last_data_offset) # [38:40] # Large font page offset. This will be the last data offset if there are none.
-        record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images
+        record += struct.pack('>H', image_data_offset) # [40:42] # Image offset. This will be the last data offset if there are none.
-        record += struct.pack('>H', 0) # [42:44]
+        record += struct.pack('>H', link_offset) # [42:44] # Links offset. This will be the last data offset if there are none.
-        record += struct.pack('>H', meta_data_offset) # [44:46] # meta_data_offset. This will be the last data offset if there are no images
+        record += struct.pack('>H', meta_data_offset) # [44:46] # Metadata offset. This will be the last data offset if there are none.
-        record += struct.pack('>H', 0) # [46:48]
+        record += struct.pack('>H', 0) # [46:48] # Unknown.
-        record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset. This will be the last data offset if there are no images
+        record += struct.pack('>H', last_data_offset) # [48:50] # Footnote offset. This will be the last data offset if there are none.
-        record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset. This will be the last data offset if there are no images
+        record += struct.pack('>H', last_data_offset) # [50:52] # Sidebar offset. This will be the last data offset if there are none.
-        record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
+        record += struct.pack('>H', last_data_offset) # [52:54] # Last data offset.
 
         for i in range(54, 132, 2):
             record += struct.pack('>H', 0) # [54:132]
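A quick worked example of the offset arithmetic in _header_record, using the same illustrative counts as before (3 text records, 1 chapter-index record, 0 link records, 2 images); plain Python, not calibre code:

text_count, chapter_count, link_count, image_count = 3, 1, 0, 2

non_text_offset = text_count + 1                     # 4: first record after the compressed text
chapter_offset = non_text_offset                     # 4: chapter index records start here
link_offset = chapter_offset + chapter_count         # 5: would-be start of the link index

image_data_offset = link_offset + link_count         # 5: images follow (no link records exist)
meta_data_offset = image_data_offset + image_count   # 7: metadata record
last_data_offset = meta_data_offset + 1              # 8: text block size record

# Empty indexes are pointed at the last data offset instead of a real record.
if chapter_count == 0:
    chapter_offset = last_data_offset
if link_count == 0:
    link_offset = last_data_offset                   # 8

print(non_text_offset, chapter_offset, link_offset,
      image_data_offset, meta_data_offset, last_data_offset)   # 4 4 8 5 7 8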
@@ -18,10 +18,10 @@ PML_HTML_RULES = [
     (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s style="page-break-before: always;">%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
     (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
-    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<span style="text-align: center; display: block; margin: auto;">%s</span>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<span style="text-align: right; display: block;">%s</span>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<span style="text-decoration: underline;">%s</span>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''),
@@ -35,8 +35,8 @@ PML_HTML_RULES = [
     (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''),
     (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 50%%">%s</span>' % match.group('text').upper() if match.group('text') else ''),
-    (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
-    (re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
+    (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
+    (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
     (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
     (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
     (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
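The two tightened patterns above are what the commit message calls reading unicode and entity PML tags correctly: \Uxxxx now accepts four hexadecimal digits (the old \d\d\d\d pattern rejected codes such as 00e9), while \a### stays three decimal digits. A standalone sketch of just these two conversions, outside the PML_HTML_RULES machinery and using plain chr() instead of calibre's my_unichr:

import re

def convert_codes(pml):
    # \a### carries a three-digit character code: emit an HTML numeric entity.
    pml = re.sub(r'\\a(?P<num>\d{3})', lambda m: '&#%s;' % m.group('num'), pml)
    # \Uxxxx carries a four-digit hex code point: emit the character itself.
    pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda m: chr(int(m.group('num'), 16)), pml)
    return pml

print(convert_codes(r'Caf\a233 \U00e9clair'))   # 'Caf&#233; éclair'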
@@ -64,7 +64,7 @@ PML_HTML_RULES = [
     (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
     (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),
     # Remove invalid single item pml codes.
-    (re.compile(r'(?<=[^\\])\\.'), lambda match: ''),
+    (re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''),
 
     # Replace \\ with \.
     (re.compile(r'\\\\'), lambda match: '\\'),
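The tightened cleanup rule above matters because \\ is itself valid PML (an escaped backslash): the old pattern, a backslash followed by any character, could swallow the escape before the later rule that rewrites \\ to \ ever ran. A small illustration outside calibre:

import re

pml = r'Save 10\\ now \zoops'   # '\\' is PML for a literal backslash; '\z' is a stray invalid code

old_rule = re.compile(r'(?<=[^\\])\\.')       # backslash followed by ANY character
new_rule = re.compile(r'(?<=[^\\])\\[^\\]')   # backslash followed by a non-backslash only

print(old_rule.sub('', pml))   # 'Save 10 now oops'   -- the escaped backslash is lost
print(new_rule.sub('', pml))   # 'Save 10\\ now oops' -- '\\' survives for the later rule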
@@ -78,6 +78,7 @@ def pml_to_html(pml):
     return html
 
 def footnote_sidebar_to_html(id, pml):
+    if id.startswith('\x01'):
+        id = id[2:]
     html = '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml))
     return html
 
@@ -154,10 +154,15 @@ class PMLMLizer(object):
         for unused in anchors.difference(links):
             text = text.replace('\\Q="%s"' % unused, '')
 
+        # Turn all html entities into unicode. This should not be necessary as
+        # lxml should have already done this but we want to be sure it happens.
         for entity in set(re.findall('&.+?;', text)):
             mo = re.search('(%s)' % entity[1:-1], text)
             text = text.replace(entity, entity_to_unicode(mo))
 
+        # Turn all unicode characters into their PML hex equivelent
+        text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
+
         return text
 
     def dump_text(self, elem, stylizer, page, tag_stack=[]):
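Together with the \Uxxxx input rule earlier in this diff, the added re.sub gives a round trip for non-ASCII text: on output, every character above 0x7f becomes a lowercase-hex \U code, which the updated [0-9a-f]{4} reader pattern accepts. A standalone sketch of the round trip, plain Python rather than the calibre classes:

import re

def to_pml(text):
    # Output side: replace every non-ASCII character with its \Uxxxx PML code.
    return re.sub('[^\x00-\x7f]', lambda m: '\\U%04x' % ord(m.group()), text)

def from_pml(pml):
    # Input side: turn \Uxxxx codes back into characters (mirrors the reader rule).
    return re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda m: chr(int(m.group('num'), 16)), pml)

original = 'Crème brûlée'
encoded = to_pml(original)
print(encoded)                          # 'Cr\U00e8me br\U00fbl\U00e9e'
print(from_pml(encoded) == original)    # True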
|
Loading…
x
Reference in New Issue
Block a user