mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
PML: remove unused anchors, clean up anchors and links.
This commit is contained in:
parent
19b04056d4
commit
2a155e22be
@ -53,21 +53,35 @@ class PMLMLizer(object):
|
|||||||
output = u''
|
output = u''
|
||||||
for item in self.oeb_book.spine:
|
for item in self.oeb_book.spine:
|
||||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
|
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
|
||||||
|
output += self.add_page_anchor(item.href)
|
||||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
||||||
output = self.clean_text(output)
|
output = self.clean_text(output)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
def add_page_anchor(self, href):
|
||||||
|
href = os.path.splitext(os.path.basename(href))[0]
|
||||||
|
return '\\Q="%s"' % href
|
||||||
|
|
||||||
def clean_text(self, text):
|
def clean_text(self, text):
|
||||||
|
# Remove excess spaces at beginning and end of lines
|
||||||
text = re.sub('(?m)^[ ]+', '', text)
|
text = re.sub('(?m)^[ ]+', '', text)
|
||||||
text = re.sub('(?m)[ ]+$', '', text)
|
text = re.sub('(?m)[ ]+$', '', text)
|
||||||
|
|
||||||
|
# Remove excessive newlines
|
||||||
text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
|
text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
|
||||||
text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
|
text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
|
||||||
text = re.sub('[ ]{2,}', ' ', text)
|
text = re.sub('[ ]{2,}', ' ', text)
|
||||||
|
|
||||||
|
# Remove excessive \p tags
|
||||||
text = re.sub(r'\\p\s*\\p', '', text)
|
text = re.sub(r'\\p\s*\\p', '', text)
|
||||||
|
|
||||||
|
# Remove anchors that do not have links
|
||||||
|
anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
|
||||||
|
links = set(re.findall(r'(?<=\\q=").+?(?=")', text))
|
||||||
|
for unused in anchors.difference(links):
|
||||||
|
text = text.replace('\\Q="%s"' % unused, '')
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def dump_text(self, elem, stylizer, tag_stack=[]):
|
def dump_text(self, elem, stylizer, tag_stack=[]):
|
||||||
@ -120,14 +134,17 @@ class PMLMLizer(object):
|
|||||||
# Anchors links
|
# Anchors links
|
||||||
if tag == 'a' and 'q' not in tag_stack:
|
if tag == 'a' and 'q' not in tag_stack:
|
||||||
href = elem.get('href')
|
href = elem.get('href')
|
||||||
if href and href.startswith('#'):
|
if href and '://' not in href:
|
||||||
|
if '#' in href:
|
||||||
|
href = href.partition('#')[2][1:]
|
||||||
|
href = os.path.splitext(os.path.basename(href))[0]
|
||||||
tag_count += 1
|
tag_count += 1
|
||||||
text += '\\q="%s"' % href
|
text += '\\q="%s"' % href
|
||||||
tag_stack.append('q')
|
tag_stack.append('q')
|
||||||
# Anchor ids
|
# Anchor ids
|
||||||
id_name = elem.get('id')
|
id_name = elem.get('id')
|
||||||
if id_name:
|
if id_name:
|
||||||
text += '\\Q="%s"' % id_name
|
text += '\\Q="%s"' % os.path.splitext(id_name)[0]
|
||||||
|
|
||||||
# Processes style information
|
# Processes style information
|
||||||
for s in STYLES:
|
for s in STYLES:
|
||||||
@ -147,7 +164,7 @@ class PMLMLizer(object):
|
|||||||
for i in range(0, tag_count):
|
for i in range(0, tag_count):
|
||||||
close_tag_list.insert(0, tag_stack.pop())
|
close_tag_list.insert(0, tag_stack.pop())
|
||||||
text += self.close_tags(close_tag_list)
|
text += self.close_tags(close_tag_list)
|
||||||
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'):
|
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'):
|
||||||
text += os.linesep + os.linesep
|
text += os.linesep + os.linesep
|
||||||
|
|
||||||
if 'block' not in tag_stack:
|
if 'block' not in tag_stack:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user