mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
PML: turn HTML entities into characters and produce internal links properly.
This commit is contained in:
parent
2a155e22be
commit
91b7cbc580
@ -41,7 +41,7 @@ class Writer(FormatWriter):
|
|||||||
|
|
||||||
lengths = [len(i) for i in sections]
|
lengths = [len(i) for i in sections]
|
||||||
|
|
||||||
pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0])
|
pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0])
|
||||||
pdbHeaderBuilder.build_header(lengths, out_stream)
|
pdbHeaderBuilder.build_header(lengths, out_stream)
|
||||||
|
|
||||||
for item in sections:
|
for item in sections:
|
||||||
@ -49,7 +49,7 @@ class Writer(FormatWriter):
|
|||||||
|
|
||||||
def _text(self, oeb_book):
|
def _text(self, oeb_book):
|
||||||
pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
|
pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
|
||||||
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252')
|
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
|
||||||
|
|
||||||
pml_pages = []
|
pml_pages = []
|
||||||
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
|
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
|
||||||
|
@ -13,6 +13,7 @@ import os, re
|
|||||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
from calibre.ebooks.pdb.ereader import image_name
|
from calibre.ebooks.pdb.ereader import image_name
|
||||||
|
from calibre import entity_to_unicode
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
'b' : 'B',
|
'b' : 'B',
|
||||||
@ -78,9 +79,12 @@ class PMLMLizer(object):
|
|||||||
|
|
||||||
# Remove anchors that do not have links
|
# Remove anchors that do not have links
|
||||||
anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
|
anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
|
||||||
links = set(re.findall(r'(?<=\\q=").+?(?=")', text))
|
links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
|
||||||
for unused in anchors.difference(links):
|
for unused in anchors.difference(links):
|
||||||
text = text.replace('\\Q="%s"' % unused, '')
|
text = text.replace('\\Q="%s"' % unused, '')
|
||||||
|
|
||||||
|
for entity in set(re.findall('&.+?;', text)):
|
||||||
|
text = text.replace(entity, entity_to_unicode(entity[1:-1]))
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@ -136,10 +140,10 @@ class PMLMLizer(object):
|
|||||||
href = elem.get('href')
|
href = elem.get('href')
|
||||||
if href and '://' not in href:
|
if href and '://' not in href:
|
||||||
if '#' in href:
|
if '#' in href:
|
||||||
href = href.partition('#')[2][1:]
|
href = href.partition('#')[2]
|
||||||
href = os.path.splitext(os.path.basename(href))[0]
|
href = os.path.splitext(os.path.basename(href))[0]
|
||||||
tag_count += 1
|
tag_count += 1
|
||||||
text += '\\q="%s"' % href
|
text += '\\q="#%s"' % href
|
||||||
tag_stack.append('q')
|
tag_stack.append('q')
|
||||||
# Anchor ids
|
# Anchor ids
|
||||||
id_name = elem.get('id')
|
id_name = elem.get('id')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user