mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Fix encoding bug when using full content feeds

parent 8b17b20066
commit 3b6b27c3c4
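
For context (an editorial sketch, not part of the commit): under Python 2, joining a mix of str and unicode values with '\n'.join() implicitly decodes the byte strings as ASCII, so a full-content feed entry containing non-ASCII UTF-8 bytes raises UnicodeDecodeError. The values below are made up; the real ones come from the parsed feed's content entries.

    # -*- coding: utf-8 -*-
    # Python 2 sketch of the failure mode behind this commit. 'mixed' stands
    # in for the content values a full-content feed yields; strings are made up.
    mixed = [u'unicode text', 'caf\xc3\xa9']   # one unicode, one UTF-8 byte string

    try:
        '\n'.join(mixed)   # promotes to unicode, implicitly ASCII-decoding the str
    except UnicodeDecodeError:
        print 'joining mixed str/unicode fails on non-ASCII bytes'

    # The commit's remedy: normalise everything to unicode first, then join.
    safe = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
            for i in mixed]
    print u'\n'.join(safe)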
@@ -13,7 +13,7 @@ from calibre import entity_to_unicode
from lxml import html

class Article(object):

    time_offset = datetime.now() - datetime.utcnow()

    def __init__(self, id, title, url, summary, published, content):
@@ -21,7 +21,7 @@ class Article(object):
        self.id = id
        self.title = title.strip() if title else title
        try:
            self.title = re.sub(r'&(\S+);',
                    entity_to_unicode, self.title)
        except:
            pass
@@ -44,7 +44,7 @@ class Article(object):
        self.utctime = datetime(*self.date[:6])
        self.localtime = self.utctime + self.time_offset


    def __repr__(self):
        return \
(u'''\
@@ -58,14 +58,14 @@ Has content : %s

    def __str__(self):
        return repr(self)

    def is_same_as(self, other_article):
        #if self.title != getattr(other_article, 'title', False):
        #    return False
        if self.url:
            return self.url == getattr(other_article, 'url', False)
        return self.content == getattr(other_article, 'content', False)


class Feed(object):

@@ -75,8 +75,8 @@ class Feed(object):
        '''
        self.logger = logging.getLogger('feeds2disk')
        self.get_article_url = get_article_url

    def populate_from_feed(self, feed, title=None, oldest_article=7,
                           max_articles_per_feed=100):
        entries = feed.entries
        feed = feed.feed
@@ -87,30 +87,30 @@ class Feed(object):
            self.image_width = image.get('width', 88)
            self.image_height = image.get('height', 31)
            self.image_alt = image.get('title', '')

        self.articles = []
        self.id_counter = 0
        self.added_articles = []

        self.oldest_article = oldest_article

        for item in entries:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.parse_article(item)


    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
                           max_articles_per_feed=100):
        self.title = title if title else _('Unknown feed')
        self.descrition = ''
        self.image_url = None
        self.articles = []
        self.added_articles = []

        self.oldest_article = oldest_article
        self.id_counter = 0

        for item in articles:
            if len(self.articles) >= max_articles_per_feed:
                break
@@ -130,8 +130,8 @@ class Feed(object):
                self.articles.append(article)
            else:
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))


    def parse_article(self, item):
        id = item.get('id', 'internal id#'+str(self.id_counter))
        if id in self.added_articles:
@@ -141,7 +141,7 @@ class Feed(object):
            published = time.gmtime()
        self.id_counter += 1
        self.added_articles.append(id)

        title = item.get('title', _('Untitled article'))
        try:
            link = self.get_article_url(item)
@@ -150,8 +150,11 @@ class Feed(object):
            self.logger.debug(traceback.format_exc())
            link = None
        description = item.get('summary', None)

-        content = '\n'.join(i.value for i in item.get('content', []))
+        content = [i.value for i in item.get('content', []) if i.value]
+        content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
+                   for i in content]
+        content = u'\n'.join(content)
        if not content.strip():
            content = None
        if not link and not content:
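
Editorial note on the three added lines above (not part of the commit text): filtering out empty values, decoding any byte strings as UTF-8 with errors='replace', and only then joining guarantees the join never mixes str and unicode. A tiny sketch of the 'replace' behaviour, with made-up bytes:

    # Illustrative only: errors='replace' maps undecodable bytes to U+FFFD
    # instead of raising, so one malformed entry cannot abort the whole feed.
    bad = 'caf\xe9'                        # latin-1 bytes, invalid as UTF-8
    print bad.decode('utf-8', 'replace')   # prints u'caf\ufffd', no exception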
@@ -167,66 +170,66 @@ class Feed(object):
                if not isinstance(title, unicode):
                    title = title.decode('utf-8', 'replace')
                self.logger.debug('Skipping article %s as it is too old'%title)

    def __iter__(self):
        return iter(self.articles)

    def __len__(self):
        return len(self.articles)

    def __repr__(self):
        res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]

        return '\n'+'\n'.join(res)+'\n'

    def __str__(self):
        return repr(self)

    def __bool__(self):
        for article in self:
            if getattr(article, 'downloaded', False):
                return True
        return False

    def has_embedded_content(self):
        length = 0
        for a in self:
            if a.content or a.summary:
                length += max(len(a.content if a.content else ''),
                              len(a.summary if a.summary else ''))

        return length > 2000 * len(self)

    def has_article(self, article):
        for a in self:
            if a.is_same_as(article):
                return True
        return False

    def find(self, article):
        for i, a in enumerate(self):
            if a.is_same_as(article):
                return i
        return -1

    def remove(self, article):
        i = self.index(article)
        if i > -1:
            self.articles[i:i+1] = []

class FeedCollection(list):

    def __init__(self, feeds):
        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
        found_articles = set([])
        duplicates = set([])

        def in_set(s, a):
            for x in s:
                if a.is_same_as(x):
                    return x
            return None

        print '#feeds', len(self)
        print map(len, self)
        for f in self:
@@ -240,18 +243,18 @@ class FeedCollection(list):
                    found_articles.add(a)
            for x in dups:
                f.articles.remove(x)

        self.duplicates = duplicates
        print len(duplicates)
        print map(len, self)
        #raise

    def find_article(self, article):
        for j, f in enumerate(self):
            for i, a in enumerate(f):
                if a is article:
                    return (j, i)

    def restore_duplicates(self):
        temp = []
        for article, feed in self.duplicates:
@@ -261,13 +264,13 @@ class FeedCollection(list):
            temp.append((feed, art))
        for feed, art in temp:
            feed.articles.append(art)


def feed_from_xml(raw_xml, title=None, oldest_article=7,
                  max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
    feed = parse(raw_xml)
    pfeed = Feed(get_article_url=get_article_url)
    pfeed.populate_from_feed(feed, title=title,
                             oldest_article=oldest_article,
                             max_articles_per_feed=max_articles_per_feed)
    return pfeed
@@ -281,7 +284,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
    feeds = []
    for title, articles in index:
        pfeed = Feed()
        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
                            max_articles_per_feed=max_articles_per_feed)
        feeds.append(pfeed)
    return feeds
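
For completeness, a hypothetical usage sketch of the module-level entry points in the last hunks; the import path and the feed file are assumptions, since the diff does not show where this module lives:

    # Hypothetical usage (import path assumed; raw_xml is illustrative).
    from calibre.web.feeds import feed_from_xml

    raw_xml = open('example_feed.xml', 'rb').read()
    feed = feed_from_xml(raw_xml, title='Example', oldest_article=2,
                         max_articles_per_feed=10)
    for article in feed:
        print article.title, article.url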