From a3eee6a22a90b25b773cef27448bf85e5717b771 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 26 Sep 2016 23:19:06 +0530
Subject: [PATCH] More strenuous cleaning for title/summary of feeds

---
 src/calibre/web/feeds/__init__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index cc66a7d67f..1b39202cf8 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -8,9 +8,9 @@ Contains the logic for parsing feeds.
 import time, traceback, copy, re
 
 from calibre.utils.logging import default_log
-from calibre import entity_to_unicode, strftime
+from calibre import entity_to_unicode, strftime, force_unicode
 from calibre.utils.date import dt_factory, utcnow, local_tz
-from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 
 class Article(object):
 
@@ -18,23 +18,23 @@ class Article(object):
         from lxml import html
         self.downloaded = False
         self.id = id
-        self._title = (title or _('Unknown')).strip()
+        title = force_unicode(title or _('Unknown'), 'utf-8')
+        self._title = clean_xml_chars(title).strip()
         try:
             self._title = re.sub(r'&(\S+?);',
                 entity_to_unicode, self._title)
         except:
             pass
-        if not isinstance(self._title, unicode):
-            self._title = self._title.decode('utf-8', 'replace')
         self._title = clean_ascii_chars(self._title)
         self.url = url
         self.author = author
         self.toc_thumbnail = None
         if author and not isinstance(author, unicode):
             author = author.decode('utf-8', 'replace')
-        self.summary = summary
         if summary and not isinstance(summary, unicode):
             summary = summary.decode('utf-8', 'replace')
+        summary = clean_xml_chars(summary) if summary else summary
+        self.summary = summary
         if summary and '<' in summary:
             try:
                 s = html.fragment_fromstring(summary, create_parent=True)