TXT Input: When converting a txt file with a Byte Order Mark, remove the Byte Order Mark before further processing, as it can cause the first line of the text to be misinterpreted.

2025-07-09 03:04:10 -04:00 · 2013-04-07 21:15:42 +05:30 · 2013-04-07 21:15:42 +05:30 · 8523ad9103
commit 8523ad9103
parent a1fd361f81
1 changed files with 6 additions and 0 deletions
--- a/src/calibre/ebooks/conversion/plugins/txt_input.py
+++ b/src/calibre/ebooks/conversion/plugins/txt_input.py
@@ -97,6 +97,12 @@ class TXTInput(InputFormatPlugin):
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
+        # Remove BOM from start of txt as its presence can confuse markdown
+        import codecs
+        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
+            if txt.startswith(bom):
+                txt = txt[len(bom):]
+                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities