PML: convert HTML entities into characters and produce internal links properly.

This commit is contained in:
John Schember 2009-05-19 18:57:07 -04:00
parent 2a155e22be
commit 91b7cbc580
2 changed files with 9 additions and 5 deletions

View File

@ -41,7 +41,7 @@ class Writer(FormatWriter):
lengths = [len(i) for i in sections] lengths = [len(i) for i in sections]
pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0]) pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0])
pdbHeaderBuilder.build_header(lengths, out_stream) pdbHeaderBuilder.build_header(lengths, out_stream)
for item in sections: for item in sections:
@ -49,7 +49,7 @@ class Writer(FormatWriter):
def _text(self, oeb_book): def _text(self, oeb_book):
pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables) pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252') pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
pml_pages = [] pml_pages = []
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):

View File

@ -13,6 +13,7 @@ import os, re
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pdb.ereader import image_name
from calibre import entity_to_unicode
TAG_MAP = { TAG_MAP = {
'b' : 'B', 'b' : 'B',
@ -78,9 +79,12 @@ class PMLMLizer(object):
# Remove anchors that do not have links # Remove anchors that do not have links
anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text)) anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
links = set(re.findall(r'(?<=\\q=").+?(?=")', text)) links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
for unused in anchors.difference(links): for unused in anchors.difference(links):
text = text.replace('\\Q="%s"' % unused, '') text = text.replace('\\Q="%s"' % unused, '')
for entity in set(re.findall('&.+?;', text)):
text = text.replace(entity, entity_to_unicode(entity[1:-1]))
return text return text
@ -136,10 +140,10 @@ class PMLMLizer(object):
href = elem.get('href') href = elem.get('href')
if href and '://' not in href: if href and '://' not in href:
if '#' in href: if '#' in href:
href = href.partition('#')[2][1:] href = href.partition('#')[2]
href = os.path.splitext(os.path.basename(href))[0] href = os.path.splitext(os.path.basename(href))[0]
tag_count += 1 tag_count += 1
text += '\\q="%s"' % href text += '\\q="#%s"' % href
tag_stack.append('q') tag_stack.append('q')
# Anchor ids # Anchor ids
id_name = elem.get('id') id_name = elem.get('id')