From 2a155e22bef2baaed15e3a4089b7477fc770c08f Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 19 May 2009 07:44:01 -0400
Subject: [PATCH] PML: remove unused anchors, clean up anchors and links.

---
 src/calibre/ebooks/pml/pmlml.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index a5e3b36377..d32d391004 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -53,21 +53,35 @@ class PMLMLizer(object):
         output = u''
         for item in self.oeb_book.spine:
             stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
+            output += self.add_page_anchor(item.href)
             output += self.dump_text(item.data.find(XHTML('body')), stylizer)
         output = self.clean_text(output)
 
         return output
 
+    def add_page_anchor(self, href):
+        href = os.path.splitext(os.path.basename(href))[0]
+        return '\\Q="%s"' % href
+
     def clean_text(self, text):
+        # Remove excess spaces at beginning and end of lines
         text = re.sub('(?m)^[ ]+', '', text)
         text = re.sub('(?m)[ ]+$', '', text)
     
+        # Remove excessive newlines
         text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
         text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
         text = re.sub('[ ]{2,}', ' ', text)
         
+        # Remove excessive \p tags
         text = re.sub(r'\\p\s*\\p', '', text)
         
+        # Remove anchors that do not have links
+        anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
+        links = set(re.findall(r'(?<=\\q=").+?(?=")', text))
+        for unused in anchors.difference(links):
+            text = text.replace('\\Q="%s"' % unused, '')
+        
         return text
 
     def dump_text(self, elem, stylizer, tag_stack=[]):
@@ -120,14 +134,17 @@ class PMLMLizer(object):
             # Anchors links
             if tag == 'a' and 'q' not in tag_stack:
                 href = elem.get('href')
-                if href and href.startswith('#'):
+                if href and '://' not in href:
+                    if '#' in href:
+                        href = href.partition('#')[2][1:]
+                    href = os.path.splitext(os.path.basename(href))[0]
                     tag_count += 1
                     text += '\\q="%s"' % href
                     tag_stack.append('q')
             # Anchor ids
             id_name = elem.get('id')
             if id_name:
-                text += '\\Q="%s"' % id_name
+                text += '\\Q="%s"' % os.path.splitext(id_name)[0]
 
             # Processes style information
             for s in STYLES:
@@ -147,7 +164,7 @@ class PMLMLizer(object):
         for i in range(0, tag_count):
             close_tag_list.insert(0, tag_stack.pop())
         text += self.close_tags(close_tag_list)
-        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'):
+        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'):
             text += os.linesep + os.linesep
         
         if 'block' not in tag_stack: