TXT input convert_basic fixes: handle HTML reserved characters in the text and detect paragraphs correctly.

2025-07-09 03:04:10 -04:00 · 2009-09-01 06:21:40 -04:00 · 2009-09-01 06:21:40 -04:00 · 26a0327943
commit 26a0327943
parent 9a30868d83
1 changed files with 19 additions and 2 deletions
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -7,6 +7,7 @@ Read content from txt file.
 import os
 import re

+from calibre import prepare_string_for_xml
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator

@ -14,12 +15,28 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>%s</body></html>'
+HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'

 def convert_basic(txt, title=''):
    lines = []
+    # Strip whitespace from the beginning and end of the line. Also replace
+    # all line breaks with \n.
    for line in txt.splitlines():
-        lines.append('<p>%s</p>' % line)
+        lines.append(line.strip())
+    txt = '\n'.join(lines)
+
+    # Remove blank lines from the beginning and end of the document.
+    txt = re.sub('^\s+(?=.)', '', txt)
+    txt = re.sub('(?<=.)\s+$', '', txt)
+    # Remove excessive line breaks.
+    txt = re.sub('\n{3,}', '\n\n', txt)
+
+    lines = []
+    # Split into paragraphs based on having a blank line between text.
+    for line in txt.split('\n\n'):
+        if line.strip():
+            lines.append('<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
+
    return HTML_TEMPLATE % (title, '\n'.join(lines))

 def convert_markdown(txt, title=''):