diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 0421534f65..43f1f619d0 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -27,9 +27,9 @@ def chap_head(match): chap = match.group('chap') title = match.group('title') if not title: - return '

'+chap+'


' + return '

'+chap+'


\n' else: - return '

'+chap+'
'+title+'


' + return '

'+chap+'
\n'+title+'


\n' def wrap_lines(match): ital = match.group('ital') @@ -121,7 +121,7 @@ class HTMLPreProcessor(object): # Clean up spaces (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics - (re.compile(r'(?'), lambda match: ' '), + (re.compile(u'(?'), lambda match: ' '), (re.compile(r'(?=\w)'), lambda match: ' '), ] @@ -161,7 +161,7 @@ class HTMLPreProcessor(object): elif self.is_pdftohtml(html): line_length_rules = [ # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), ] rules = self.PDFTOHTML + line_length_rules diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 88c841b81f..a9c9d2f7a7 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -50,6 +50,7 @@ PML_HTML_RULES = [ # eReader files are one paragraph per line. # This forces the lines to wrap properly. (re.compile('^(?P.+)$', re.MULTILINE), lambda match: '

%s

' % match.group('text')), + (re.compile('

[ ]*

'), lambda match: ''), # Remove unmatched plm codes. (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''), @@ -82,7 +83,7 @@ HTML_PML_RULES = [ (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), #(re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))), + (re.compile('.+?)".*?>(.*?)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))), #(re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), @@ -93,6 +94,8 @@ HTML_PML_RULES = [ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), @@ -108,8 +111,8 @@ HTML_PML_RULES = [ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), - (re.compile('
'), lambda match: '\\p'), - (re.compile('
'), lambda match: '\\p'), + (re.compile('
'), lambda match: '\n'), + (re.compile('
'), lambda match: '\n'), # Remove remaining HTML tags (re.compile('<.*?>'), lambda match: ''), @@ -119,6 +122,8 @@ HTML_PML_RULES = [ # Remove whitespace on empty lines (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''), + # Remove excess whitespace in lines + (re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '), # Remove excess newlines at the beginning and end (re.compile('^(\r\n){1,}'), lambda match: ''), diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index c6f520ecb2..e0953753f4 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -154,7 +154,7 @@ class Reader(FormatReader): for i in images: manifest.append((os.path.join('images/', i), None)) - + opf.create_manifest(manifest) opf.create_spine(['index.html']) with open('metadata.opf', 'wb') as opffile: diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 1605e15f32..cc90b41fb6 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -39,7 +39,7 @@ class Writer(object): pml_pages = [] for page in pages: - pml_pages.append(zlib.compress(html_to_pml(unicode(page)))) + pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8'))) return pml_pages @@ -67,7 +67,7 @@ class Writer(object): image_items = the number of images ''' version = 10 - non_text_offset = text_items + non_text_offset = text_items + 1 if image_items > 0: image_data_offset = text_items + 1