News download: Handle HTML entities in article titles

This commit is contained in:
Kovid Goyal 2009-02-20 10:58:45 -08:00
parent 8e248482aa
commit af8f3b56ce

View File

@ -5,10 +5,11 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''' '''
Contains the logic for parsing feeds. Contains the logic for parsing feeds.
''' '''
import time, logging, traceback, copy import time, logging, traceback, copy, re
from datetime import datetime from datetime import datetime
from calibre.web.feeds.feedparser import parse from calibre.web.feeds.feedparser import parse
from calibre import entity_to_unicode
from lxml import html from lxml import html
class Article(object): class Article(object):
@ -19,6 +20,12 @@ class Article(object):
self.downloaded = False self.downloaded = False
self.id = id self.id = id
self.title = title.strip() if title else title self.title = title.strip() if title else title
try:
self.title = re.sub(r'&(\S+);',
entity_to_unicode, self.title)
print 11111, repr(self.title)
except:
pass
self.url = url self.url = url
self.summary = summary self.summary = summary
if summary and not isinstance(summary, unicode): if summary and not isinstance(summary, unicode):
@ -37,6 +44,7 @@ class Article(object):
self.date = published self.date = published
self.utctime = datetime(*self.date[:6]) self.utctime = datetime(*self.date[:6])
self.localtime = self.utctime + self.time_offset self.localtime = self.utctime + self.time_offset
def __repr__(self): def __repr__(self):
return \ return \
@ -91,7 +99,8 @@ class Feed(object):
if len(self.articles) >= max_articles_per_feed: if len(self.articles) >= max_articles_per_feed:
break break
self.parse_article(item) self.parse_article(item)
def populate_from_preparsed_feed(self, title, articles, oldest_article=7, def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
max_articles_per_feed=100): max_articles_per_feed=100):
self.title = title if title else _('Unknown feed') self.title = title if title else _('Unknown feed')