mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Implement support for full-content feeds via a FullContentProfile
This commit is contained in:
parent
2080d9e52d
commit
f79fd7a15c
@ -153,9 +153,10 @@ class DefaultProfile(object):
|
|||||||
return index
|
return index
|
||||||
|
|
||||||
|
|
||||||
def parse_feeds(self):
|
def parse_feeds(self, require_url=True):
|
||||||
'''
|
'''
|
||||||
Create list of articles from a list of feeds.
|
Create list of articles from a list of feeds.
|
||||||
|
@param require_url: If True skip articles that don't have a link to a HTML page with the full article contents.
|
||||||
@return: A dictionary whose keys are feed titles and whose values are each
|
@return: A dictionary whose keys are feed titles and whose values are each
|
||||||
a list of dictionaries. Each list contains dictionaries of the form:
|
a list of dictionaries. Each list contains dictionaries of the form:
|
||||||
{
|
{
|
||||||
@ -163,6 +164,7 @@ class DefaultProfile(object):
|
|||||||
'url' : URL of print version,
|
'url' : URL of print version,
|
||||||
'date' : The publication date of the article as a string,
|
'date' : The publication date of the article as a string,
|
||||||
'description' : A summary of the article
|
'description' : A summary of the article
|
||||||
|
'content' : The full article (can be an empty string). This is unused in DefaultProfile
|
||||||
}
|
}
|
||||||
'''
|
'''
|
||||||
added_articles = {}
|
added_articles = {}
|
||||||
@ -196,14 +198,25 @@ class DefaultProfile(object):
|
|||||||
if url:
|
if url:
|
||||||
break
|
break
|
||||||
|
|
||||||
if not url or not url.string:
|
if require_url and (not url or not url.string):
|
||||||
self.logger.debug('Skipping article as it does not have a link url')
|
self.logger.debug('Skipping article as it does not have a link url')
|
||||||
continue
|
continue
|
||||||
|
url = url.string if (url and url.string) else ''
|
||||||
|
|
||||||
|
content = item.find('content:encoded')
|
||||||
|
if not content:
|
||||||
|
content = item.find('description')
|
||||||
|
if content:
|
||||||
|
content = self.process_html_description(content, strip_links=False)
|
||||||
|
else:
|
||||||
|
content = ''
|
||||||
|
|
||||||
d = {
|
d = {
|
||||||
'title' : item.find('title').string,
|
'title' : item.find('title').string,
|
||||||
'url' : self.print_version(url.string),
|
'url' : self.print_version(url),
|
||||||
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
||||||
'date' : pubdate if self.use_pubdate else time.ctime()
|
'date' : pubdate if self.use_pubdate else time.ctime(),
|
||||||
|
'content' : content,
|
||||||
}
|
}
|
||||||
delta = time.time() - d['timestamp']
|
delta = time.time() - d['timestamp']
|
||||||
if not self.allow_duplicates:
|
if not self.allow_duplicates:
|
||||||
@ -240,13 +253,19 @@ class DefaultProfile(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def process_html_description(cls, tag):
|
def process_html_description(cls, tag, strip_links=True):
|
||||||
src = '\n'.join(tag.contents)
|
src = '\n'.join(tag.contents)
|
||||||
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
|
match = re.match(r'<\!\[CDATA\[(.*)\]\]>', src.lstrip())
|
||||||
for e in replaced_entities:
|
if match:
|
||||||
ent = '&'+e+';'
|
src = match.group(1)
|
||||||
src = src.replace(ent, unichr(name2codepoint[e]))
|
else:
|
||||||
return re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
|
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
|
||||||
|
for e in replaced_entities:
|
||||||
|
ent = '&'+e+';'
|
||||||
|
src = src.replace(ent, unichr(name2codepoint[e]))
|
||||||
|
if strip_links:
|
||||||
|
src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
|
||||||
|
return src
|
||||||
|
|
||||||
|
|
||||||
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
|
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
|
||||||
@ -301,3 +320,81 @@ class DefaultProfile(object):
|
|||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
class FullContentProfile(DefaultProfile):
|
||||||
|
|
||||||
|
max_recursions = 3
|
||||||
|
summary_length = 500 # Max number of characters in the short description
|
||||||
|
|
||||||
|
article_counter = 0
|
||||||
|
|
||||||
|
def build_index(self):
|
||||||
|
'''Build an RSS based index.html'''
|
||||||
|
import os
|
||||||
|
articles = self.parse_feeds(require_url=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def build_sub_index(title, items):
|
||||||
|
ilist = ''
|
||||||
|
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
|
||||||
|
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
|
||||||
|
for item in items:
|
||||||
|
content = item['content']
|
||||||
|
if not content:
|
||||||
|
self.logger.debug('Skipping article as it has no content:%s'%item['title'])
|
||||||
|
continue
|
||||||
|
item['description'] = item['description'][:self.summary_length]+'…'
|
||||||
|
self.article_counter = self.article_counter + 1
|
||||||
|
url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
|
||||||
|
item['url'] = url
|
||||||
|
open(url, 'wb').write((u'''\
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h2>%s</h2>
|
||||||
|
<div>
|
||||||
|
%s
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>'''%(item['title'], content)).encode('utf-8')
|
||||||
|
)
|
||||||
|
ilist += li%item
|
||||||
|
return u'''\
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h2>%(title)s</h2>
|
||||||
|
<ul>
|
||||||
|
%(items)s
|
||||||
|
</ul>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''%dict(title=title, items=ilist.rstrip())
|
||||||
|
|
||||||
|
cnum = 0
|
||||||
|
clist = ''
|
||||||
|
categories = articles.keys()
|
||||||
|
categories.sort()
|
||||||
|
for category in categories:
|
||||||
|
cnum += 1
|
||||||
|
cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
|
||||||
|
prefix = 'file:' if iswindows else ''
|
||||||
|
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
|
||||||
|
src = build_sub_index(category, articles[category])
|
||||||
|
open(cfile, 'wb').write(src.encode('utf-8'))
|
||||||
|
|
||||||
|
src = '''\
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>%(title)s</h1>
|
||||||
|
<div style='text-align: right; font-weight: bold'>%(date)s</div>
|
||||||
|
<ul>
|
||||||
|
%(categories)s
|
||||||
|
</ul>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
|
||||||
|
categories=clist, title=self.title)
|
||||||
|
index = os.path.join(self.temp_dir, 'index.html')
|
||||||
|
open(index, 'wb').write(src.encode('utf-8'))
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
@ -4,12 +4,11 @@
|
|||||||
'''
|
'''
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
from libprs500.ebooks.lrf.web.profiles import FullContentProfile
|
||||||
|
|
||||||
class Portfolio(DefaultProfile):
|
class Portfolio(FullContentProfile):
|
||||||
|
|
||||||
title = 'Portfolio'
|
title = 'Portfolio'
|
||||||
max_recursions = 0
|
|
||||||
max_articles_per_feed = 50
|
max_articles_per_feed = 50
|
||||||
timefmt = ' [%a, %b %d, %Y]'
|
timefmt = ' [%a, %b %d, %Y]'
|
||||||
html_description = True
|
html_description = True
|
||||||
@ -17,7 +16,6 @@ class Portfolio(DefaultProfile):
|
|||||||
html2lrf_options = ['--ignore-tables']
|
html2lrf_options = ['--ignore-tables']
|
||||||
##delay = 1
|
##delay = 1
|
||||||
|
|
||||||
## Don't grab articles more than 7 days old
|
|
||||||
oldest_article = 30
|
oldest_article = 30
|
||||||
|
|
||||||
## Comment out the feeds you don't want retrieved.
|
## Comment out the feeds you don't want retrieved.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user