ereader output work

2025-07-09 03:04:10 -04:00 · 2009-04-25 15:50:52 -04:00 · 2009-04-25 15:50:52 -04:00 · 6ee829ff79
commit 6ee829ff79
parent 19c8343d6a
4 changed files with 15 additions and 10 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -27,9 +27,9 @@ def chap_head(match):
    chap = match.group('chap')
    title = match.group('title')
    if not title: 
-               return '<h1>'+chap+'</h1><br/>'
+               return '<h1>'+chap+'</h1><br/>\n'
    else: 
-               return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
+               return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'

 def wrap_lines(match):
    ital = match.group('ital')
@@ -121,7 +121,7 @@ class HTMLPreProcessor(object):
                  # Clean up spaces
                  (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
                  # Add space before and after italics
-                  (re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
+                  (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
                 ]

@@ -161,7 +161,7 @@ class HTMLPreProcessor(object):
        elif self.is_pdftohtml(html):
            line_length_rules = [
                # Un wrap using punctuation
-                (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines),
+                (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
            ]
            
            rules = self.PDFTOHTML + line_length_rules
--- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py
+++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py
@@ -50,6 +50,7 @@ PML_HTML_RULES = [
    # eReader files are one paragraph per line.
    # This forces the lines to wrap properly.
    (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
+    (re.compile('<p>[ ]*</p>'), lambda match: ''),
    
    # Remove unmatched plm codes.
    (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
@@ -82,7 +83,7 @@ HTML_PML_RULES = [
    (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
    (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
    #(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
-    (re.compile('<img.*?src="(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))),
+    (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))),
    #(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
    (re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
    (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
@@ -93,6 +94,8 @@ HTML_PML_RULES = [
    (re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
    (re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
    (re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
+    (re.compile('<strong .*?>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
+    (re.compile('<strong>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
    (re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
    (re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
    (re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
@@ -108,8 +111,8 @@ HTML_PML_RULES = [
    (re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
    (re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
    (re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
-    (re.compile('<br .*?>'), lambda match: '\\p'),
-    (re.compile('<br/*>'), lambda match: '\\p'),
+    (re.compile('<br .*?>'), lambda match: '\n'),
+    (re.compile('<br/*>'), lambda match: '\n'),
    
    # Remove remaining HTML tags
    (re.compile('<.*?>'), lambda match: ''),
@@ -119,6 +122,8 @@ HTML_PML_RULES = [
    
    # Remove whitespace on empty lines
    (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
+    # Remove excess whitespace in lines
+    (re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
    
    # Remove excess newlines at the beginning and end
    (re.compile('^(\r\n){1,}'), lambda match: ''),
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@@ -154,7 +154,7 @@ class Reader(FormatReader):
        
            for i in images:
                manifest.append((os.path.join('images/', i), None))
-        
+
            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -39,7 +39,7 @@ class Writer(object):
        pml_pages = []
        
        for page in pages:
-            pml_pages.append(zlib.compress(html_to_pml(unicode(page))))
+            pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8')))

        return pml_pages            
        
@@ -67,7 +67,7 @@ class Writer(object):
        image_items = the number of images
        '''
        version = 10
-        non_text_offset = text_items
+        non_text_offset = text_items + 1
        
        if image_items > 0:
            image_data_offset = text_items + 1