mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More strenuous cleaning for title/summary of feeds
This commit is contained in:
parent
68df59c633
commit
a3eee6a22a
@ -8,9 +8,9 @@ Contains the logic for parsing feeds.
|
|||||||
import time, traceback, copy, re
|
import time, traceback, copy, re
|
||||||
|
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
from calibre import entity_to_unicode, strftime
|
from calibre import entity_to_unicode, strftime, force_unicode
|
||||||
from calibre.utils.date import dt_factory, utcnow, local_tz
|
from calibre.utils.date import dt_factory, utcnow, local_tz
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
||||||
|
|
||||||
class Article(object):
|
class Article(object):
|
||||||
|
|
||||||
@ -18,23 +18,23 @@ class Article(object):
|
|||||||
from lxml import html
|
from lxml import html
|
||||||
self.downloaded = False
|
self.downloaded = False
|
||||||
self.id = id
|
self.id = id
|
||||||
self._title = (title or _('Unknown')).strip()
|
title = force_unicode(title or _('Unknown'), 'utf-8')
|
||||||
|
self._title = clean_xml_chars(title).strip()
|
||||||
try:
|
try:
|
||||||
self._title = re.sub(r'&(\S+?);',
|
self._title = re.sub(r'&(\S+?);',
|
||||||
entity_to_unicode, self._title)
|
entity_to_unicode, self._title)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
if not isinstance(self._title, unicode):
|
|
||||||
self._title = self._title.decode('utf-8', 'replace')
|
|
||||||
self._title = clean_ascii_chars(self._title)
|
self._title = clean_ascii_chars(self._title)
|
||||||
self.url = url
|
self.url = url
|
||||||
self.author = author
|
self.author = author
|
||||||
self.toc_thumbnail = None
|
self.toc_thumbnail = None
|
||||||
if author and not isinstance(author, unicode):
|
if author and not isinstance(author, unicode):
|
||||||
author = author.decode('utf-8', 'replace')
|
author = author.decode('utf-8', 'replace')
|
||||||
self.summary = summary
|
|
||||||
if summary and not isinstance(summary, unicode):
|
if summary and not isinstance(summary, unicode):
|
||||||
summary = summary.decode('utf-8', 'replace')
|
summary = summary.decode('utf-8', 'replace')
|
||||||
|
summary = clean_xml_chars(summary) if summary else summary
|
||||||
|
self.summary = summary
|
||||||
if summary and '<' in summary:
|
if summary and '<' in summary:
|
||||||
try:
|
try:
|
||||||
s = html.fragment_fromstring(summary, create_parent=True)
|
s = html.fragment_fromstring(summary, create_parent=True)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user