PML: remove unused anchors, clean up anchors and links.

This commit is contained in:
John Schember 2009-05-19 07:44:01 -04:00
parent 19b04056d4
commit 2a155e22be

View File

@ -53,21 +53,35 @@ class PMLMLizer(object):
output = u'' output = u''
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
output += self.add_page_anchor(item.href)
output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output = self.clean_text(output) output = self.clean_text(output)
return output return output
def add_page_anchor(self, href):
    r"""Build the PML anchor tag that marks the start of a spine item.

    The anchor name is the file name of ``href`` with its directory part
    and its final extension removed, so ``text/chapter1.html`` yields the
    tag ``\Q="chapter1"``.

    :param href: href of the spine item the anchor is generated for.
    :return: a ``\Q="<name>"`` anchor tag as a string.
    """
    file_name = os.path.basename(href)
    anchor_name, _extension = os.path.splitext(file_name)
    return '\\Q="%s"' % anchor_name
def clean_text(self, text):
    r"""Tidy up the PML markup generated for the whole book.

    The clean-up steps run in this order:

    1. Strip runs of spaces at the start and at the end of every line.
    2. Turn every run of one or more line separators into exactly two
       separators (i.e. one blank line between blocks of text).
    3. Collapse runs of two or more spaces into a single space.
    4. Drop ``\p`` tags that are separated only by whitespace.
    5. Drop ``\Q="name"`` anchors that no ``\q="name"`` link refers to.

    :param text: the PML markup to clean.
    :return: the cleaned PML markup.
    """
    # Remove excess spaces at beginning and end of lines
    text = re.sub(r'(?m)^[ ]+', '', text)
    text = re.sub(r'(?m)[ ]+$', '', text)

    # Normalise every run of line separators to exactly two separators.
    # The separator is escaped and wrapped in a group so the quantifier
    # applies to the whole separator; os.linesep may be two characters
    # long ('\r\n'), in which case an ungrouped quantifier would only
    # apply to its last character.
    blank_line = os.linesep + os.linesep
    text = re.sub('(?:%s)+' % re.escape(os.linesep), blank_line, text)

    # Collapse repeated spaces
    text = re.sub(r'[ ]{2,}', ' ', text)

    # Remove excessive \p tags
    text = re.sub(r'\\p\s*\\p', '', text)

    # Remove anchors that do not have links
    anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
    links = set(re.findall(r'(?<=\\q=").+?(?=")', text))
    for unused in anchors - links:
        text = text.replace('\\Q="%s"' % unused, '')

    return text
def dump_text(self, elem, stylizer, tag_stack=[]): def dump_text(self, elem, stylizer, tag_stack=[]):
@ -120,14 +134,17 @@ class PMLMLizer(object):
# Anchors links # Anchors links
if tag == 'a' and 'q' not in tag_stack: if tag == 'a' and 'q' not in tag_stack:
href = elem.get('href') href = elem.get('href')
if href and href.startswith('#'): if href and '://' not in href:
if '#' in href:
href = href.partition('#')[2][1:]
href = os.path.splitext(os.path.basename(href))[0]
tag_count += 1 tag_count += 1
text += '\\q="%s"' % href text += '\\q="%s"' % href
tag_stack.append('q') tag_stack.append('q')
# Anchor ids # Anchor ids
id_name = elem.get('id') id_name = elem.get('id')
if id_name: if id_name:
text += '\\Q="%s"' % id_name text += '\\Q="%s"' % os.path.splitext(id_name)[0]
# Processes style information # Processes style information
for s in STYLES: for s in STYLES:
@ -147,7 +164,7 @@ class PMLMLizer(object):
for i in range(0, tag_count): for i in range(0, tag_count):
close_tag_list.insert(0, tag_stack.pop()) close_tag_list.insert(0, tag_stack.pop())
text += self.close_tags(close_tag_list) text += self.close_tags(close_tag_list)
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'): if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'):
text += os.linesep + os.linesep text += os.linesep + os.linesep
if 'block' not in tag_stack: if 'block' not in tag_stack: