From 1f0932ad4047395bd5ae11b8ee350b26367f1eea Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 17 May 2009 21:35:59 -0400
Subject: [PATCH 1/6] Much better html to pml parser, now accounts for style
 information and produces output that looks more like the input.

---
 src/calibre/ebooks/pdb/ereader/writer.py |  13 +-
 src/calibre/ebooks/pml/output.py         |  18 +--
 src/calibre/ebooks/pml/pmlconverter.py   |  82 -----------
 src/calibre/ebooks/pml/pmlml.py          | 178 +++++++++++++++++++++++
 4 files changed, 190 insertions(+), 101 deletions(-)
 create mode 100644 src/calibre/ebooks/pml/pmlml.py

diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index f49aa4e125..c99c75a929 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -16,7 +16,7 @@ from calibre.ebooks.pdb.formatwriter import FormatWriter
 from calibre.ebooks.oeb.base import OEB_IMAGES
 from calibre.ebooks.pdb.header import PdbHeaderBuilder
 from calibre.ebooks.pdb.ereader import image_name
-from calibre.ebooks.pml.pmlconverter import html_to_pml
+from calibre.ebooks.pml.pmlml import PMLMLizer
 
 IDENTITY = 'PNRdPPrs'
 
@@ -31,7 +31,7 @@ class Writer(FormatWriter):
         self.log = log
         
     def write_content(self, oeb_book, out_stream, metadata=None):
-        text = self._text(oeb_book.spine)
+        text = self._text(oeb_book)
         images = self._images(oeb_book.manifest)
         metadata = [self._metadata(metadata)]
         
@@ -41,16 +41,15 @@ class Writer(FormatWriter):
         
         lengths = [len(i) for i in sections]
         
-        pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '')
+        pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0])
         pdbHeaderBuilder.build_header(lengths, out_stream)
         
         for item in sections:
             out_stream.write(item)
 
-    def _text(self, pages):
-        pml = ''
-        for page in pages:
-            pml += html_to_pml(unicode(page)).encode('cp1252')
+    def _text(self, oeb_book):
+        pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
+        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252')
     
         pml_pages = []
         for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py
index c5fbc990af..9d07718654 100644
--- a/src/calibre/ebooks/pml/output.py
+++ b/src/calibre/ebooks/pml/output.py
@@ -12,7 +12,7 @@ from calibre.customize.conversion import OutputFormatPlugin
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.zipfile import ZipFile
 from calibre.ebooks.oeb.base import OEB_IMAGES
-from calibre.ebooks.pml.pmlconverter import html_to_pml
+from calibre.ebooks.pml.pmlml import PMLMLizer
 
 class PMLOutput(OutputFormatPlugin):
 
@@ -22,22 +22,16 @@ class PMLOutput(OutputFormatPlugin):
 
     def convert(self, oeb_book, output_path, input_plugin, opts, log):
         with TemporaryDirectory('_pmlz_output') as tdir:
-            self.process_spine(oeb_book.spine, tdir)
+            pmlmlizer = PMLMLizer(ignore_tables=opts.linearize_tables)
+            content = pmlmlizer.extract_content(oeb_book, opts)
+            with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
+                out.write(content.encode('utf-8'))
+                
             self.write_images(oeb_book.manifest, tdir)
 
             pmlz = ZipFile(output_path, 'w')
             pmlz.add_dir(tdir)
         
-    def process_spine(self, spine, out_dir):
-        for item in spine:
-            html = html_to_pml(unicode(item)).encode('utf-8')
-            
-            name = os.path.splitext(os.path.basename(item.href))[0] + '.pml'
-            path = os.path.join(out_dir, name)
-            
-            with open(path, 'wb') as out:
-                out.write(html)
-        
     def write_images(self, manifest, out_dir):
         for item in manifest:
             if item.media_type in OEB_IMAGES:
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index dded21c38c..0cd7da8e72 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
 
 import re
 
-from calibre import entity_to_unicode
 from calibre.ebooks.pdb.ereader import image_name
 from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
 
@@ -67,75 +66,6 @@ PML_HTML_RULES = [
     (re.compile(r'\\\\'), lambda match: '\\'),
 ]
 
-HTML_PML_RULES = [
-
-    (re.compile(r'\\'), lambda match: '\\\\'),
-    (re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
-    (re.compile('</p>(?=^\n|^\r\n)'), lambda match: '\n'),
-    
-    
-    # Clean up HTML
-    (re.compile('@page.*?}'), lambda match: ''),
-    (re.compile('<script.*?>.*?</script>', re.DOTALL), lambda match: ''),
-    (re.compile('<style.*?>.*?</style>', re.DOTALL), lambda match: ''),
-    
-    # Reflow paragraphs
-    (re.compile('<p.*?>(?P<text>.*?)</p>', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')),
-    
-    # HTML to PML
-    (re.compile('<a.*?href="#sidebar-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))),
-    (re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
-    (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
-    (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
-    (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')),
-    (re.compile('&(?P<num>#\d+);'), lambda match: entity_to_unicode(match)),
-    (re.compile('&(?P<num>.+);'), lambda match: entity_to_unicode(match)),
-    (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
-    (re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
-    (re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
-    (re.compile('<sub>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
-    (re.compile('<sup .*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
-    (re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
-    (re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
-    (re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
-    (re.compile('<strong .*?>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
-    (re.compile('<strong>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
-    (re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
-    (re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
-    (re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
-    (re.compile('<div.*?style.*?margin-left: (?P<val>\d+)%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
-    (re.compile('<div.*?style.*?margin-left: \d{1,3}%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
-    (re.compile('<!--(?P<text>.+?)-->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
-    (re.compile('<del .*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
-    (re.compile('<del>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
-    (re.compile('<div.*?style.*?text-decoration: underline;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
-    (re.compile('<i .*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
-    (re.compile('<i>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
-    (re.compile('<div.*?style.*?text-align: right;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
-    (re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
-    (re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
-    (re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
-    (re.compile('<br .*?>'), lambda match: '\n'),
-    (re.compile('<br/*>'), lambda match: '\n'),
-    
-    # Remove remaining HTML tags
-    (re.compile('<.*?>'), lambda match: ''),
-    
-    # Remove redundant page break markers
-    (re.compile(r'(\\p){2,}'), lambda match: r'\p'),
-    
-    # Remove whitespace on empty lines
-    (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
-    # Remove excess whitespace in lines
-    (re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
-    
-    # Remove excess newlines at the beginning and end
-    (re.compile('^(\r\n){1,}'), lambda match: ''),
-    (re.compile('^\n{1,}'), lambda match: ''),
-    (re.compile('(\r\n){3,}$'), lambda match: ''),
-    (re.compile('\n{3,}$'), lambda match: ''),
-]
-
 def pml_to_html(pml):
     html = pml
     for rule in PML_HTML_RULES:
@@ -151,15 +81,3 @@ def footnote_sidebar_to_html(id, pml):
     html = '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml))
     return html 
 
-def html_to_pml(html):
-    pml = ''
-    
-    for dom_tree in BeautifulSoup(html).findAll('body'):
-        body = unicode(dom_tree.prettify())
-
-        for rule in HTML_PML_RULES:
-            body = rule[0].sub(rule[1], body)
-            
-        pml += body
-
-    return pml
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
new file mode 100644
index 0000000000..a6febdc53f
--- /dev/null
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into PML markup
+'''
+
+import os, re
+
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
+from calibre.ebooks.oeb.stylizer import Stylizer
+from calibre.ebooks.pdb.ereader import image_name
+
+TAG_MAP = {
+    'b'       : 'B',
+    'strong'  : 'B',
+    'i'       : 'I',
+    'small'   : 'k',
+    'sub'     : 'Sb',
+    'sup'     : 'Sp',
+    'big'     : 'l',
+    'del'     : 'o',
+    'h1'      : 'x',
+    'h2'      : 'x0',
+    'h3'      : 'x1',
+    'h4'      : 'x2',
+    'h5'      : 'x3',
+    'h6'      : 'x4',
+    '!--'     : 'v',
+}
+
+STYLES = [
+    ('font-weight', {'bold' : 'B', 'bolder' : 'B'}),
+    ('font-style', {'italic' : 'I'}),
+    ('text-decoration', {'underline' : 'u'}),
+    ('text-align', {'right' : 'r', 'center' : 'c'}),
+]
+
+class PMLMLizer(object):
+    def __init__(self, ignore_tables=False):
+        self.ignore_tables = ignore_tables
+        
+    def extract_content(self, oeb_book, opts):
+        oeb_book.logger.info('Converting XHTML to PML markup...')
+        self.oeb_book = oeb_book
+        self.opts = opts
+        return self.pmlmlize_spine()
+        
+    def pmlmlize_spine(self):
+        output = u''
+        for item in self.oeb_book.spine:
+            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
+            output += self.dump_text(item.data.find(XHTML('body')), stylizer)
+        output = self.clean_text(output)
+        
+        output = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), output)
+        output = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), output)
+        output = re.sub('[ ]{2,}', ' ', output)
+
+        return output
+
+    def clean_text(self, text):
+        return text
+
+    def dump_text(self, elem, stylizer, tag_stack=[]):
+        if not isinstance(elem.tag, basestring) \
+           or namespace(elem.tag) != XHTML_NS:
+            return u''
+
+        text = u''
+        style = stylizer.style(elem)
+        
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            return u''
+
+        tag = barename(elem.tag)
+        tag_count = 0
+
+        # Are we in a paragraph block? (compare exactly; ('block') is a plain str)
+        if tag == 'p' or style['display'] == 'block':
+            if 'block' not in tag_stack:
+                tag_count += 1
+                tag_stack.append('block')
+        
+        # Process tags that need special processing and that do not have inner
+        # text. Usually these require an argument (an <img> without src is skipped)
+        if tag == 'img' and elem.get('src'):
+            text += '\\m="%s"' % image_name(os.path.basename(elem.get('src'))).strip('\x00')
+        if tag == 'hr':
+            text += '\\w'
+            width = elem.get('width')
+            if width:
+                text += '="%s%%"' % width.rstrip('%')
+            else:
+                text += '="50%"'
+        
+        # Process style information that needs holds a single tag
+        if style['page-break-before'] == 'always':
+            text += '\\p'
+        if style['page-break-after'] == 'always':
+            text += '\\p'
+        
+        # Proccess tags that contain text.
+        if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
+            pml_tag = TAG_MAP.get(tag, None)
+            if pml_tag and pml_tag not in tag_stack:
+                tag_count += 1
+                text += '\\%s' % pml_tag
+                tag_stack.append(pml_tag)
+                
+            # Special processing of tags that require an argument.
+            # Anchors links
+            if tag == 'a' and 'q' not in tag_stack:
+                href = elem.get('href')
+                if href and href.startswith('#'):
+                    tag_count += 1
+                    text += '\\q="%s"' % href
+                    tag_stack.append('q')
+            # Anchor ids
+            id_name = elem.get('id')
+            if id_name:
+                text += '\\Q="%s"' % id_name
+
+            # Processes style information
+            for s in STYLES:
+                style_tag = s[1].get(style[s[0]], None)
+                if style_tag and style_tag not in tag_stack:
+                    tag_count += 1
+                    text += '\\%s' % style_tag
+                    tag_stack.append(style_tag)
+            # margin
+
+            text += self.elem_text(elem, tag_stack)
+            
+        for item in elem:
+            text += self.dump_text(item, stylizer, tag_stack)
+        
+        close_tag_list = []
+        for i in range(0, tag_count):
+            close_tag_list.insert(0, tag_stack.pop())
+        text += self.close_tags(close_tag_list)
+        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'):
+            text += os.linesep + os.linesep
+
+        
+        if 'block' not in tag_stack:
+            text += os.linesep + os.linesep
+        
+        if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '':
+            text += self.elem_tail(elem, tag_stack)
+
+        return text
+
+    def elem_text(self, elem, tag_stack):
+        return self.block_text(elem.text, 'block' in tag_stack)
+
+    def elem_tail(self, elem, tag_stack):
+        return self.block_text(elem.tail, 'block' in tag_stack)
+
+    def block_text(self, text, in_block):
+        if in_block:
+            text = text.replace('\n\r', ' ')
+            text = text.replace('\n', ' ')
+            text = text.replace('\r', ' ')
+        return text
+
+    def close_tags(self, tags):
+        text = u''
+        for i in range(0, len(tags)):
+            tag = tags.pop()
+            if tag != 'block':
+                text += '\\%s' % tag
+        return text
+

From 71eb5ab8fa695cb6ec7c57185703b983909d695f Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 17 May 2009 21:38:35 -0400
Subject: [PATCH 2/6] Fix chapter pml tag.

---
 src/calibre/ebooks/pml/pmlml.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index a6febdc53f..7c10784867 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -24,11 +24,11 @@ TAG_MAP = {
     'big'     : 'l',
     'del'     : 'o',
     'h1'      : 'x',
-    'h2'      : 'x0',
+    'h2'      : 'X0',
     'h3'      : 'x1',
-    'h4'      : 'x2',
-    'h5'      : 'x3',
-    'h6'      : 'x4',
+    'h4'      : 'X2',
+    'h5'      : 'X3',
+    'h6'      : 'X4',
     '!--'     : 'v',
 }
 

From 19b04056d4149d3467a3b623a09802dcd77682e3 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 18 May 2009 20:17:40 -0400
Subject: [PATCH 3/6] disable page breaks

---
 src/calibre/ebooks/pml/pmlml.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index 7c10784867..a5e3b36377 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -55,14 +55,19 @@ class PMLMLizer(object):
             stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
             output += self.dump_text(item.data.find(XHTML('body')), stylizer)
         output = self.clean_text(output)
-        
-        output = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), output)
-        output = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), output)
-        output = re.sub('[ ]{2,}', ' ', output)
 
         return output
 
     def clean_text(self, text):
+        text = re.sub('(?m)^[ ]+', '', text)
+        text = re.sub('(?m)[ ]+$', '', text)
+    
+        text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
+        text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
+        text = re.sub('[ ]{2,}', ' ', text)
+        
+        text = re.sub(r'\\p\s*\\p', '', text)
+        
         return text
 
     def dump_text(self, elem, stylizer, tag_stack=[]):
@@ -99,10 +104,9 @@ class PMLMLizer(object):
                 text += '="50%"'
         
         # Process style information that needs holds a single tag
-        if style['page-break-before'] == 'always':
-            text += '\\p'
-        if style['page-break-after'] == 'always':
-            text += '\\p'
+        # Commented out because every page in an OEB book starts with this style
+        #if style['page-break-before'] == 'always':
+        #    text += '\\p'
         
         # Proccess tags that contain text.
         if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
@@ -145,10 +149,12 @@ class PMLMLizer(object):
         text += self.close_tags(close_tag_list)
         if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'):
             text += os.linesep + os.linesep
-
         
         if 'block' not in tag_stack:
             text += os.linesep + os.linesep
+
+        #if style['page-break-after'] == 'always':
+        #    text += '\\p'
         
         if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '':
             text += self.elem_tail(elem, tag_stack)

From 2a155e22bef2baaed15e3a4089b7477fc770c08f Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 19 May 2009 07:44:01 -0400
Subject: [PATCH 4/6] PML: remove unused anchors, clean up anchors and links.

---
 src/calibre/ebooks/pml/pmlml.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index a5e3b36377..d32d391004 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -53,21 +53,35 @@ class PMLMLizer(object):
         output = u''
         for item in self.oeb_book.spine:
             stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
+            output += self.add_page_anchor(item.href)
             output += self.dump_text(item.data.find(XHTML('body')), stylizer)
         output = self.clean_text(output)
 
         return output
 
+    def add_page_anchor(self, href):
+        href = os.path.splitext(os.path.basename(href))[0]
+        return '\\Q="%s"' % href
+
     def clean_text(self, text):
+        # Remove excess spaces at beginning and end of lines
         text = re.sub('(?m)^[ ]+', '', text)
         text = re.sub('(?m)[ ]+$', '', text)
     
+        # Remove excessive newlines
         text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
         text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text)
         text = re.sub('[ ]{2,}', ' ', text)
         
+        # Remove excessive \p tags
         text = re.sub(r'\\p\s*\\p', '', text)
         
+        # Remove anchors that do not have links
+        anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
+        links = set(re.findall(r'(?<=\\q=").+?(?=")', text))
+        for unused in anchors.difference(links):
+            text = text.replace('\\Q="%s"' % unused, '')
+        
         return text
 
     def dump_text(self, elem, stylizer, tag_stack=[]):
@@ -120,14 +134,17 @@ class PMLMLizer(object):
             # Anchors links
             if tag == 'a' and 'q' not in tag_stack:
                 href = elem.get('href')
-                if href and href.startswith('#'):
+                if href and '://' not in href:
+                    if '#' in href:
+                        href = href.partition('#')[2][1:]
+                    href = os.path.splitext(os.path.basename(href))[0]
                     tag_count += 1
                     text += '\\q="%s"' % href
                     tag_stack.append('q')
             # Anchor ids
             id_name = elem.get('id')
             if id_name:
-                text += '\\Q="%s"' % id_name
+                text += '\\Q="%s"' % os.path.splitext(id_name)[0]
 
             # Processes style information
             for s in STYLES:
@@ -147,7 +164,7 @@ class PMLMLizer(object):
         for i in range(0, tag_count):
             close_tag_list.insert(0, tag_stack.pop())
         text += self.close_tags(close_tag_list)
-        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'):
+        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'):
             text += os.linesep + os.linesep
         
         if 'block' not in tag_stack:

From 91b7cbc5808cb65aa69f51a59ccb0b5cbb604291 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 19 May 2009 18:57:07 -0400
Subject: [PATCH 5/6] PML: turn html entities into characters, internal links
 produced properly.

---
 src/calibre/ebooks/pdb/ereader/writer.py |  4 ++--
 src/calibre/ebooks/pml/pmlml.py          | 10 +++++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index c99c75a929..875aae764a 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -41,7 +41,7 @@ class Writer(FormatWriter):
         
         lengths = [len(i) for i in sections]
         
-        pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0])
+        pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0])
         pdbHeaderBuilder.build_header(lengths, out_stream)
         
         for item in sections:
@@ -49,7 +49,7 @@ class Writer(FormatWriter):
 
     def _text(self, oeb_book):
         pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
-        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252')
+        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
     
         pml_pages = []
         for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index d32d391004..cdf3bf69e8 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -13,6 +13,7 @@ import os, re
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.pdb.ereader import image_name
+from calibre import entity_to_unicode
 
 TAG_MAP = {
     'b'       : 'B',
@@ -78,9 +79,12 @@ class PMLMLizer(object):
         
         # Remove anchors that do not have links
         anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
-        links = set(re.findall(r'(?<=\\q=").+?(?=")', text))
+        links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
         for unused in anchors.difference(links):
             text = text.replace('\\Q="%s"' % unused, '')
+            
+        # entity_to_unicode takes a regex match (entity name in group 1), not a str
+        text = re.sub('&(.+?);', entity_to_unicode, text)
         
         return text
 
@@ -136,10 +140,10 @@ class PMLMLizer(object):
                 href = elem.get('href')
                 if href and '://' not in href:
                     if '#' in href:
-                        href = href.partition('#')[2][1:]
+                        href = href.partition('#')[2]
                     href = os.path.splitext(os.path.basename(href))[0]
                     tag_count += 1
-                    text += '\\q="%s"' % href
+                    text += '\\q="#%s"' % href
                     tag_stack.append('q')
             # Anchor ids
             id_name = elem.get('id')

From 8398d4a7d706e6958d8bf112e60c12eecbd0fa4c Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 19 May 2009 20:23:40 -0400
Subject: [PATCH 6/6] Auto convert when syncing news.

---
 src/calibre/gui2/device.py | 35 ++++++++++++++++++++++++++------
 src/calibre/gui2/main.py   | 41 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index b176c25062..caed0358cc 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -640,12 +640,33 @@ class DeviceGUI(object):
                     ', '.join(sent_mails),  3000)
 
 
-    def sync_news(self):
+    def sync_news(self, send_ids=None, do_auto=True):
         if self.device_connected:
-            ids = list(dynamic.get('news_to_be_synced', set([])))
+            ids = list(dynamic.get('news_to_be_synced', set([]))) if send_ids is None else send_ids
             ids = [id for id in ids if self.library_view.model().db.has_id(id)]
-            files, auto = self.library_view.model().get_preferred_formats_from_ids(
-                                ids, self.device_manager.device_class.settings().format_map)
+            files, _auto_ids = self.library_view.model().get_preferred_formats_from_ids(
+                                ids, self.device_manager.device_class.settings().format_map,
+                                exclude_auto=do_auto)
+            auto = []
+            if _auto_ids:
+                for id in _auto_ids:
+                    formats = self.library_view.model().db.formats(id, index_is_id=True)  # guard None before .split()
+                    formats = [f.lower() for f in formats.split(',')] if formats else []
+                    if list(set(formats).intersection(available_input_formats())) != [] and list(set(self.device_manager.device_class.settings().format_map).intersection(available_output_formats())) != []:
+                        auto.append(id)
+            if auto != []:
+                format = None
+                for fmt in self.device_manager.device_class.settings().format_map:
+                    if fmt in list(set(self.device_manager.device_class.settings().format_map).intersection(set(available_output_formats()))):
+                        format = fmt
+                        break
+                if format is not None:
+                    autos = [self.library_view.model().db.title(id, index_is_id=True) for id in auto]
+                    autos = '\n'.join('%s'%i for i in autos)
+                    info_dialog(self, _('No suitable formats'),
+                        _('Auto converting the following books before uploading to '
+                            'the device:'), det_msg=autos, show=True)
+                    self.auto_convert_news(auto, format)
             files = [f for f in files if f is not None]
             if not files:
                 dynamic.set('news_to_be_synced', set([]))
@@ -667,8 +688,10 @@ class DeviceGUI(object):
             if config['upload_news_to_device'] and files:
                 remove = ids if \
                     config['delete_news_from_library_on_upload'] else []
-                on_card = self.location_view.model().free[0] < \
-                          self.location_view.model().free[1]
+                space = { self.location_view.model().free[0] : 'main',
+                    self.location_view.model().free[1] : 'carda',
+                    self.location_view.model().free[2] : 'cardb' }
+                on_card = space.get(sorted(space.keys(), reverse=True)[0], 'main')
                 self.upload_books(files, names, metadata,
                         on_card=on_card,
                         memory=[[f.name for f in files], remove])
diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py
index f50bffbb76..7f36a9560c 100644
--- a/src/calibre/gui2/main.py
+++ b/src/calibre/gui2/main.py
@@ -1080,6 +1080,24 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
             current = self.library_view.currentIndex()
             self.library_view.model().current_changed(current, previous)
 
+    def auto_convert_news(self, book_ids, format):
+        previous = self.library_view.currentIndex()
+        rows = [x.row() for x in \
+                self.library_view.selectionModel().selectedRows()]
+        jobs, changed, bad = convert_single_ebook(self, self.library_view.model().db, book_ids, True, format)
+        if jobs == []: return
+        for func, args, desc, fmt, id, temp_files in jobs:
+            if id not in bad:
+                job = self.job_manager.run_job(Dispatcher(self.book_auto_converted_news),
+                                        func, args=args, description=desc)
+                self.conversion_jobs[job] = (temp_files, fmt, id)
+
+        if changed:
+            self.library_view.model().refresh_rows(rows)
+            current = self.library_view.currentIndex()
+            self.library_view.model().current_changed(current, previous)
+
+
     def get_books_for_conversion(self):
         rows = [r.row() for r in \
                 self.library_view.selectionModel().selectedRows()]
@@ -1175,6 +1193,29 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
 
         self.send_by_mail(to, fmts, delete_from_library, specific_format=fmt, send_ids=[book_id], do_auto_convert=False)
 
+    def book_auto_converted_news(self, job):
+        temp_files, fmt, book_id = self.conversion_jobs.pop(job)
+        try:
+            if job.failed:
+                return self.job_exception(job)
+            data = open(temp_files[0].name, 'rb')
+            self.library_view.model().db.add_format(book_id, fmt, data, index_is_id=True)
+            data.close()
+            self.status_bar.showMessage(job.description + (' completed'), 2000)
+        finally:
+            for f in temp_files:
+                try:
+                    if os.path.exists(f.name):
+                        os.remove(f.name)
+                except:
+                    pass
+        self.tags_view.recount()
+        if self.current_view() is self.library_view:
+            current = self.library_view.currentIndex()
+            self.library_view.model().current_changed(current, QModelIndex())
+        # Keyword must match the signature sync_news(self, send_ids=None, do_auto=True)
+        self.sync_news(send_ids=[book_id], do_auto=False)
+
     def book_converted(self, job):
         temp_files, fmt, book_id = self.conversion_jobs.pop(job)
         try: