From 78e28cbe9e8fa5d67acf0126cc38ddf9ddcb5d3a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 4 Sep 2014 15:45:24 +0530
Subject: [PATCH] When detecting the encoding of HTML documents, if the document contains multiple charset declarations, prefer the HTML 5 syntax to the HTML 4 syntax. Fixes #1364961 [Unicode Conversion on Amazon after Release 2.x](https://bugs.launchpad.net/calibre/+bug/1364961)

---
 src/calibre/ebooks/chardet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index 61aefafdac..a550b4265d 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -12,10 +12,10 @@ import re, codecs
 ENCODING_PATS = [
     # XML declaration
     re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
-    # HTML 4 Pragma directive
-    re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
     # HTML 5 charset
     re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
+    # HTML 4 Pragma directive
+    re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
 ]
 
 ENTITY_PATTERN = re.compile(r'&(\S+?);')