From 8523ad9103ae538031fa089d539067909cd5083e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 7 Apr 2013 21:15:42 +0530
Subject: [PATCH] TXT Input: When converting a txt file with a Byte Order Mark, remove the Byte Order Mark before further processing as it can cause the first line of the text to be mis-interpreted.

---
 src/calibre/ebooks/conversion/plugins/txt_input.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/calibre/ebooks/conversion/plugins/txt_input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py
index e916b30c29..50f1409ea6 100644
--- a/src/calibre/ebooks/conversion/plugins/txt_input.py
+++ b/src/calibre/ebooks/conversion/plugins/txt_input.py
@@ -97,6 +97,16 @@ class TXTInput(InputFormatPlugin):
         if not ienc:
             ienc = 'utf-8'
             log.debug('No input encoding specified and could not auto detect using %s' % ienc)
+        # Remove BOM from start of txt as its presence can confuse markdown.
+        # The UTF-32 BOMs must be tested before the UTF-16 ones: BOM_UTF32_LE
+        # (FF FE 00 00) begins with BOM_UTF16_LE (FF FE), so testing UTF-16
+        # first would strip only half of a UTF-32 LE BOM.
+        import codecs
+        for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF8,
+                    codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+            if txt.startswith(bom):
+                txt = txt[len(bom):]
+                break
         txt = txt.decode(ienc, 'replace')
 
         # Replace entities