diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index 4a0f6b47f7..f1e0e11d63 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -13,7 +13,7 @@ from calibre import entity_to_unicode
 from lxml import html
 
 class Article(object):
-    
+
     time_offset = datetime.now() - datetime.utcnow()
 
     def __init__(self, id, title, url, summary, published, content):
@@ -21,7 +21,7 @@ class Article(object):
         self.id = id
         self.title = title.strip() if title else title
         try:
-            self.title = re.sub(r'&(\S+);', 
+            self.title = re.sub(r'&(\S+);',
                     entity_to_unicode, self.title)
         except:
             pass
@@ -44,7 +44,7 @@ class Article(object):
 
         self.utctime = datetime(*self.date[:6])
         self.localtime = self.utctime + self.time_offset
-        
+
     def __repr__(self):
         return \
 (u'''\
@@ -58,14 +58,14 @@ Has content : %s
 
     def __str__(self):
         return repr(self)
-    
+
     def is_same_as(self, other_article):
         #if self.title != getattr(other_article, 'title', False):
         #    return False
         if self.url:
             return self.url == getattr(other_article, 'url', False)
         return self.content == getattr(other_article, 'content', False)
-    
+
 
 
 class Feed(object):
@@ -75,8 +75,8 @@ class Feed(object):
         '''
         self.logger = logging.getLogger('feeds2disk')
         self.get_article_url = get_article_url
-        
-    def populate_from_feed(self, feed, title=None, oldest_article=7, 
+
+    def populate_from_feed(self, feed, title=None, oldest_article=7,
                            max_articles_per_feed=100):
         entries = feed.entries
         feed = feed.feed
@@ -87,30 +87,30 @@ class Feed(object):
             self.image_width = image.get('width', 88)
             self.image_height = image.get('height', 31)
             self.image_alt = image.get('title', '')
-            
+
         self.articles = []
         self.id_counter = 0
         self.added_articles = []
-        
+
         self.oldest_article = oldest_article
-        
+
         for item in entries:
             if len(self.articles) >= max_articles_per_feed:
                 break
             self.parse_article(item)
-            
-        
-    def populate_from_preparsed_feed(self, title, articles, oldest_article=7, 
+
+
+    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
                                      max_articles_per_feed=100):
         self.title = title if title else _('Unknown feed')
         self.descrition = ''
         self.image_url = None
         self.articles = []
         self.added_articles = []
-        
+
         self.oldest_article = oldest_article
         self.id_counter = 0
-        
+
         for item in articles:
             if len(self.articles) >= max_articles_per_feed:
                 break
@@ -130,8 +130,8 @@ class Feed(object):
             self.articles.append(article)
         else:
             self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
-        
-        
+
+
     def parse_article(self, item):
         id = item.get('id', 'internal id#'+str(self.id_counter))
         if id in self.added_articles:
@@ -141,7 +141,7 @@ class Feed(object):
             published = time.gmtime()
         self.id_counter += 1
         self.added_articles.append(id)
-        
+
         title = item.get('title', _('Untitled article'))
         try:
             link = self.get_article_url(item)
@@ -150,8 +150,11 @@ class Feed(object):
             self.logger.debug(traceback.format_exc())
             link = None
         description = item.get('summary', None)
-        
-        content = '\n'.join(i.value for i in item.get('content', []))
+
+        content = [i.value for i in item.get('content', []) if i.value]
+        content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
+                   for i in content]
+        content = u'\n'.join(content)
         if not content.strip():
             content = None
         if not link and not content:
@@ -167,66 +170,66 @@ class Feed(object):
             if not isinstance(title, unicode):
                 title = title.decode('utf-8', 'replace')
             self.logger.debug('Skipping article %s as it is too old'%title)
-        
+
     def __iter__(self):
         return iter(self.articles)
-    
+
     def __len__(self):
         return len(self.articles)
-    
+
     def __repr__(self):
         res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]
-        
+
         return '\n'+'\n'.join(res)+'\n'
-    
+
     def __str__(self):
         return repr(self)
-    
+
     def __bool__(self):
         for article in self:
             if getattr(article, 'downloaded', False):
                 return True
         return False
-    
+
     def has_embedded_content(self):
         length = 0
         for a in self:
             if a.content or a.summary:
-                length += max(len(a.content if a.content else ''), 
+                length += max(len(a.content if a.content else ''),
                               len(a.summary if a.summary else ''))
-        
+
         return length > 2000 * len(self)
-    
+
     def has_article(self, article):
         for a in self:
             if a.is_same_as(article):
                 return True
         return False
-    
+
     def find(self, article):
         for i, a in enumerate(self):
             if a.is_same_as(article):
                 return i
         return -1
-    
+
     def remove(self, article):
         i = self.index(article)
         if i > -1:
             self.articles[i:i+1] = []
 
 class FeedCollection(list):
-    
+
     def __init__(self, feeds):
         list.__init__(self, [f for f in feeds if len(f.articles) > 0])
         found_articles = set([])
         duplicates = set([])
-        
+
         def in_set(s, a):
             for x in s:
                 if a.is_same_as(x):
                     return x
             return None
-        
+
         print '#feeds', len(self)
         print map(len, self)
         for f in self:
@@ -240,18 +243,18 @@ class FeedCollection(list):
                     found_articles.add(a)
             for x in dups:
                 f.articles.remove(x)
-        
+
         self.duplicates = duplicates
         print len(duplicates)
         print map(len, self)
         #raise
-        
+
     def find_article(self, article):
         for j, f in enumerate(self):
             for i, a in enumerate(f):
                 if a is article:
                     return (j, i)
-        
+
     def restore_duplicates(self):
         temp = []
         for article, feed in self.duplicates:
@@ -261,13 +264,13 @@ class FeedCollection(list):
             temp.append((feed, art))
         for feed, art in temp:
             feed.articles.append(art)
-    
-def feed_from_xml(raw_xml, title=None, oldest_article=7, 
+
+def feed_from_xml(raw_xml, title=None, oldest_article=7,
                   max_articles_per_feed=100,
                   get_article_url=lambda item: item.get('link', None)):
     feed = parse(raw_xml)
     pfeed = Feed(get_article_url=get_article_url)
-    pfeed.populate_from_feed(feed, title=title, 
+    pfeed.populate_from_feed(feed, title=title,
                              oldest_article=oldest_article,
                              max_articles_per_feed=max_articles_per_feed)
     return pfeed
@@ -281,7 +284,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
     feeds = []
     for title, articles in index:
         pfeed = Feed()
-        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article, 
+        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
                                            max_articles_per_feed=max_articles_per_feed)
         feeds.append(pfeed)
-    return feeds
\ No newline at end of file
+    return feeds
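
A note on the one behavioral change above (everything else in the patch is trailing-whitespace cleanup and a missing final newline): parse_article used to join feedparser content values with a plain str separator, which in Python 2 coerces any bytestring operand to unicode via ASCII and raises UnicodeDecodeError on non-ASCII feeds. The sketch below walks through the new normalization in isolation; FakeContent and the sample item are hypothetical stand-ins for feedparser content entries (only the .value attribute is used), not part of the patch.

    # Minimal sketch of the content normalization added in parse_article
    # (Python 2, matching the codebase). FakeContent is a hypothetical
    # stand-in for a feedparser content entry.
    class FakeContent(object):
        def __init__(self, value):
            self.value = value

    item = {'content': [
        FakeContent(u'already unicode'),
        FakeContent('caf\xc3\xa9'),   # UTF-8 bytestring with non-ASCII bytes
        FakeContent(''),              # empty value, filtered out below
    ]}

    # Step 1: keep only non-empty values.
    content = [i.value for i in item.get('content', []) if i.value]
    # Step 2: decode bytestrings explicitly, replacing malformed bytes
    # instead of letting an implicit ASCII coercion raise.
    content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
               for i in content]
    # Step 3: join on a unicode separator so the result is always unicode.
    content = u'\n'.join(content)

    print repr(content)   # u'already unicode\ncaf\xe9'

With the old '\n'.join(...), the second entry would have raised UnicodeDecodeError during the str-to-unicode coercion; decoding with 'replace' keeps partially malformed feeds usable, and the `if i.value` filter also guards against None values that the old generator would have passed straight to join.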