Fix encoding bug when using full content feeds

2026-01-01 17:50:30 -05:00 · 2009-06-07 11:22:01 -07:00 · 2009-06-07 11:22:01 -07:00 · 3b6b27c3c4
commit 3b6b27c3c4
parent 8b17b20066
1 changed files with 46 additions and 43 deletions
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -13,7 +13,7 @@ from calibre import entity_to_unicode
 from lxml import html

 class Article(object):
-    
+
    time_offset = datetime.now() - datetime.utcnow()

    def __init__(self, id, title, url, summary, published, content):
@@ -21,7 +21,7 @@ class Article(object):
        self.id = id
        self.title = title.strip() if title else title
        try:
-            self.title = re.sub(r'&(\S+);', 
+            self.title = re.sub(r'&(\S+);',
                entity_to_unicode, self.title)
        except:
            pass
@@ -44,7 +44,7 @@ class Article(object):
        self.utctime = datetime(*self.date[:6])
        self.localtime = self.utctime + self.time_offset

-                
+
    def __repr__(self):
        return \
 (u'''\
@@ -58,14 +58,14 @@ Has content : %s

    def __str__(self):
        return repr(self)
-    
+
    def is_same_as(self, other_article):
        #if self.title != getattr(other_article, 'title', False):
        #    return False
        if self.url:
            return self.url == getattr(other_article, 'url', False)
        return self.content == getattr(other_article, 'content', False)
-    
+

 class Feed(object):

@@ -75,8 +75,8 @@ class Feed(object):
        '''
        self.logger = logging.getLogger('feeds2disk')
        self.get_article_url = get_article_url
-        
-    def populate_from_feed(self, feed, title=None, oldest_article=7, 
+
+    def populate_from_feed(self, feed, title=None, oldest_article=7,
                           max_articles_per_feed=100):
        entries = feed.entries
        feed = feed.feed
@@ -87,30 +87,30 @@ class Feed(object):
        self.image_width  = image.get('width', 88)
        self.image_height = image.get('height', 31)
        self.image_alt    = image.get('title', '')
-        
+
        self.articles = []
        self.id_counter = 0
        self.added_articles = []
-        
+
        self.oldest_article = oldest_article
-        
+
        for item in entries:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.parse_article(item)
-        
-        
-    def populate_from_preparsed_feed(self, title, articles, oldest_article=7, 
+
+
+    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
                           max_articles_per_feed=100):
        self.title      = title if title else _('Unknown feed')
        self.descrition = ''
        self.image_url  = None
        self.articles   = []
        self.added_articles = []
-         
+
        self.oldest_article = oldest_article
        self.id_counter = 0
-        
+
        for item in articles:
            if len(self.articles) >= max_articles_per_feed:
                break
@@ -130,8 +130,8 @@ class Feed(object):
                self.articles.append(article)
            else:
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
-         
-    
+
+
    def parse_article(self, item):
        id = item.get('id', 'internal id#'+str(self.id_counter))
        if id in self.added_articles:
@@ -141,7 +141,7 @@ class Feed(object):
            published = time.gmtime()
        self.id_counter += 1
        self.added_articles.append(id)
-        
+
        title = item.get('title', _('Untitled article'))
        try:
            link  = self.get_article_url(item)
@@ -150,8 +150,11 @@ class Feed(object):
            self.logger.debug(traceback.format_exc())
            link = None
        description = item.get('summary', None)
-        
-        content = '\n'.join(i.value for i in item.get('content', []))
+
+        content = [i.value for i in item.get('content', []) if i.value]
+        content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
+                for i in content]
+        content = u'\n'.join(content)
        if not content.strip():
            content = None
        if not link and not content:
@@ -167,66 +170,66 @@ class Feed(object):
                if not isinstance(title, unicode):
                    title = title.decode('utf-8', 'replace')
                self.logger.debug('Skipping article %s as it is too old'%title)
-        
+
    def __iter__(self):
        return iter(self.articles)
-    
+
    def __len__(self):
        return len(self.articles)
-    
+
    def __repr__(self):
        res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]
-        
+
        return '\n'+'\n'.join(res)+'\n'
-    
+
    def __str__(self):
        return repr(self)
-    
+
    def __bool__(self):
        for article in self:
            if getattr(article, 'downloaded', False):
                return True
        return False
-    
+
    def has_embedded_content(self):
        length = 0
        for a in self:
            if a.content or a.summary:
-                length += max(len(a.content if a.content else ''), 
+                length += max(len(a.content if a.content else ''),
                              len(a.summary if a.summary else ''))
-                
+
        return length > 2000 * len(self)
-    
+
    def has_article(self, article):
        for a in self:
            if a.is_same_as(article):
                return True
        return False
-    
+
    def find(self, article):
        for i, a in enumerate(self):
            if a.is_same_as(article):
                return i
        return -1
-    
+
    def remove(self, article):
        i = self.index(article)
        if i > -1:
            self.articles[i:i+1] = []

 class FeedCollection(list):
-    
+
    def __init__(self, feeds):
        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
        found_articles = set([])
        duplicates = set([])
-        
+
        def in_set(s, a):
            for x in s:
                if a.is_same_as(x):
                    return x
            return None
-        
+
        print '#feeds', len(self)
        print map(len, self)
        for f in self:
@@ -240,18 +243,18 @@ class FeedCollection(list):
                    found_articles.add(a)
            for x in dups:
                f.articles.remove(x)
-                
+
        self.duplicates = duplicates
        print len(duplicates)
        print map(len, self)
        #raise
-                
+
    def find_article(self, article):
        for j, f in enumerate(self):
            for i, a in enumerate(f):
                if a is article:
                    return (j, i)
-    
+
    def restore_duplicates(self):
        temp = []
        for article, feed in self.duplicates:
@@ -261,13 +264,13 @@ class FeedCollection(list):
            temp.append((feed, art))
        for feed, art in temp:
            feed.articles.append(art)
-        

-def feed_from_xml(raw_xml, title=None, oldest_article=7, 
+
+def feed_from_xml(raw_xml, title=None, oldest_article=7,
                  max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
    feed = parse(raw_xml)
    pfeed = Feed(get_article_url=get_article_url)
-    pfeed.populate_from_feed(feed, title=title, 
+    pfeed.populate_from_feed(feed, title=title,
                            oldest_article=oldest_article,
                            max_articles_per_feed=max_articles_per_feed)
    return pfeed
@@ -281,7 +284,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
    feeds = []
    for title, articles in index:
        pfeed = Feed()
-        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article, 
+        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
                                       max_articles_per_feed=max_articles_per_feed)
        feeds.append(pfeed)
-    return feeds
+    return feeds