Migrate Dilbert feed to new infrastructure
parent 0983dbeafd
commit d834a8facc
@@ -52,6 +52,7 @@ def xml_to_unicode(raw, verbose=False):
     print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
 CHARSET_ALIASES = { "macintosh" : "mac-roman",
                     "x-sjis" : "shift-jis" }
+if encoding:
     encoding = encoding.lower()
     if CHARSET_ALIASES.has_key(encoding):
         encoding = CHARSET_ALIASES[encoding]
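The aliasing step maps charset names that chardet reports but that Python's
codec machinery does not know under those spellings. A minimal sketch of the
lookup, using the table from the hunk above (the helper name
normalize_encoding is illustrative, not part of the commit):

    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    def normalize_encoding(encoding):
        # Lower-case the detected name, then substitute any known alias
        # with a name that Python's codecs module accepts.
        if encoding:
            encoding = encoding.lower()
            encoding = CHARSET_ALIASES.get(encoding, encoding)
        return encoding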
@@ -24,13 +24,14 @@
     <reference py:if="mi.cover" type="cover" href="${mi.cover}" />
 </guide>

-<manifest>
+<manifest py:if="getattr(mi, 'manifest', None)">
     <py:for each="i, m in enumerate(mi.manifest)">
         <item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
     </py:for>
 </manifest>

-<spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
+<spine py:if="getattr(mi, 'manifest', None)"
+       py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
     <py:for each="idref in mi.spine">
         <itemref idref="${str(idref)}" />
     </py:for>
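The template previously rendered <manifest> and <spine> unconditionally, which
breaks for metadata objects that carry no manifest. The py:if guard is the
Genshi-template form of the check below (a plain-Python restatement for
clarity, not code from the commit):

    def has_manifest(mi):
        # True only when the metadata object defines a non-empty
        # 'manifest' attribute; mirrors py:if="getattr(mi, 'manifest', None)".
        return bool(getattr(mi, 'manifest', None))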
@@ -17,7 +17,7 @@
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging
+import time, logging, traceback
 from datetime import datetime

 from libprs500.web.feeds.feedparser import parse
@@ -54,11 +54,12 @@ Has content : %s

 class Feed(object):

-    def __init__(self):
+    def __init__(self, get_article_url=lambda item: item.get('link', None)):
         '''
         Parse a feed into articles.
         '''
         self.logger = logging.getLogger('feeds2disk')
+        self.get_article_url = get_article_url

     def populate_from_feed(self, feed, title=None, oldest_article=7,
                            max_articles_per_feed=100):
@@ -124,7 +125,12 @@ class Feed(object):
             self.added_articles.append(id)

             title = item.get('title', _('Untitled article'))
-            link = item.get('link', None)
+            try:
+                link = self.get_article_url(item)
+            except:
+                self.logger.warning('Failed to get link for %s'%title)
+                self.logger.debug(traceback.format_exc())
+                link = None
             description = item.get('summary', None)

             content = '\n'.join(i.value for i in item.get('content', []))
@@ -159,9 +165,10 @@ class Feed(object):
         return False


-def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
+def feed_from_xml(raw_xml, title=None, oldest_article=7,
+                  max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
     feed = parse(raw_xml)
-    pfeed = Feed()
+    pfeed = Feed(get_article_url=get_article_url)
     pfeed.populate_from_feed(feed, title=title,
                              oldest_article=oldest_article,
                              max_articles_per_feed=max_articles_per_feed)
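With the new keyword argument, a caller can redirect article URLs away from
each item's <link> element, and any exception the callback raises is logged
and degraded to link = None rather than aborting the parse. A sketch of a
caller that reads URLs from feed enclosures instead (the import path and the
feed.xml filename are assumptions, not taken from the commit):

    from libprs500.web.feeds import feed_from_xml

    def url_from_enclosure(item):
        # Take the article URL from the first enclosure rather than
        # the item's <link>; return None when there is no enclosure.
        enclosures = item.get('enclosures', [])
        return enclosures[0].get('url') if enclosures else None

    raw_xml = open('feed.xml', 'rb').read()
    feed = feed_from_xml(raw_xml, title='Dilbert',
                         get_article_url=url_from_enclosure)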
@@ -20,6 +20,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
 import sys, os, logging
 from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from libprs500.web.fetch.simple import option_parser as _option_parser
+from libprs500.web.feeds.news import Profile2Recipe


 def option_parser(usage='''\
@@ -110,7 +111,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
         else:
             raise Exception('not file')
     except:
-        recipe = get_builtin_recipe(recipe_arg)
+        recipe, is_profile = get_builtin_recipe(recipe_arg)
         if recipe is None:
             recipe = compile_recipe(recipe_arg)

@@ -125,6 +126,9 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
     logging.getLogger('feeds2disk').addHandler(handler)

+    if is_profile:
+        recipe = Profile2Recipe(recipe, opts, parser, notification)
+    else:
         recipe = recipe(opts, parser, notification)
     if not os.path.exists(recipe.output_dir):
         os.makedirs(recipe.output_dir)
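Together with the Profile2Recipe adapter added to news.py below, this dispatch
lets feeds2disk keep running old-style Profiles unchanged through the new
recipe pipeline.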
@@ -20,7 +20,7 @@ to an ebook.
 import logging, os, cStringIO, time, traceback, re
 import urlparse

-from libprs500 import browser, __appname__
+from libprs500 import browser, __appname__, iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC
@@ -41,7 +41,7 @@ class BasicNewsRecipe(object):
     title = _('Unknown News Source')

     #: The author of this recipe
-    __author__ = _('Unknown')
+    __author__ = _(__appname__)

     #: Maximum number of articles to download from each feed
     #: @type: integer
@@ -198,6 +198,14 @@ class BasicNewsRecipe(object):
         '''
         return browser()

+    def get_article_url(self, item):
+        '''
+        Override to perform extraction of URL for each article.
+
+        @param item: An article instance from L{feedparser}.
+        @type item: L{FeedParserDict}
+        '''
+        return item.get('link', None)
+
     def preprocess_html(self, soup):
         '''
         This function is called with the source of each downloaded HTML file, before
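A recipe that needs something other than the plain <link> can override this
hook. For instance, feeds served through FeedBurner often carry the real
article URL in a separate field; a sketch of such an override (the recipe
class and the field fallback are illustrative, not part of the commit):

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'

        def get_article_url(self, item):
            # Prefer FeedBurner's original-link field when feedparser
            # exposes it, otherwise fall back to the item's <link>.
            return item.get('feedburner_origlink', item.get('link', None))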
@@ -335,7 +343,7 @@ class BasicNewsRecipe(object):
         if head:
             style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
             head.insert(len(head.contents), style)
-        if first_fetch:
+        if first_fetch and job_info:
             url, f, a, feed_len = job_info
             body = soup.find('body')
             if body is not None:
@@ -615,7 +623,8 @@ class BasicNewsRecipe(object):
             parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                   title=title,
                                   oldest_article=self.oldest_article,
-                                  max_articles_per_feed=self.max_articles_per_feed))
+                                  max_articles_per_feed=self.max_articles_per_feed,
+                                  get_article_url=self.get_article_url))

         return parsed_feeds

@@ -644,3 +653,55 @@ class BasicNewsRecipe(object):
         elif use_alt and item.has_key('alt'):
             strings.append(item['alt'])
         return u''.join(strings)
+
+
+class Profile2Recipe(BasicNewsRecipe):
+    '''
+    Used to migrate the old news Profiles to the new Recipes. Uses the settings
+    from the old Profile to populate the settings in the Recipe. Also uses the
+    Profile's get_browser and parse_feeds.
+    '''
+    def __init__(self, profile_class, options, parser, progress_reporter):
+        self.old_profile = profile_class(logging.getLogger('feeds2disk'),
+                                         username=options.username,
+                                         password=options.password,
+                                         lrf=options.lrf)
+        for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
+                     'match_regexps', 'filter_regexps', 'html2lrf_options',
+                     'timefmt', 'needs_subscription', 'summary_length',
+                     'max_articles_per_feed', 'title', 'no_stylesheets', 'encoding'):
+            setattr(self, attr, getattr(self.old_profile, attr))
+
+        self.simultaneous_downloads = 1
+        BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
+        self.browser = self.old_profile.browser
+
+    def parse_index(self):
+        return self.old_profile.parse_feeds()
+
+
+class CustomIndexRecipe(BasicNewsRecipe):
+
+    def custom_index(self):
+        '''
+        Return the path to a custom HTML document that will serve as the index for
+        this recipe.
+        @rtype: string
+        '''
+        raise NotImplementedError
+
+    def create_opf(self):
+        mi = MetaInformation(self.title, [__appname__])
+        mi = OPFCreator(self.output_dir, mi)
+        mi.create_manifest_from_files_in([self.output_dir])
+        mi.create_spine(['index.html'])
+        mi.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))
+
+    def download(self):
+        index = os.path.abspath(self.custom_index())
+        url = 'file:'+index if iswindows else 'file://'+index
+        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+        fetcher.base_dir = self.output_dir
+        fetcher.current_dir = self.output_dir
+        fetcher.show_progress = False
+        res = fetcher.start_fetch(url)
+        self.create_opf()
+        return res
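The Dilbert recipe added below is the first concrete CustomIndexRecipe: its
custom_index() builds the index page, download() fetches that page with
RecursiveFetcher, and create_opf() wraps the result for conversion.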
@@ -17,14 +17,14 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek', 'atlantic', 'economist']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']

 import re
-from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
-from libprs500.ebooks.lrf.web import available_profiles
+from libprs500.ebooks.lrf.web import builtin_profiles

-basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
+basic_recipes = (BasicNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
 basic_recipe_names = (i.__name__ for i in basic_recipes)

@@ -51,6 +51,8 @@ recipes = [load_recipe(i) for i in recipes]
 def compile_recipe(src):
     '''
     Compile the code in src and return the first object that is a recipe or profile.
+    @param src: Python source code
+    @type src: string
     @return: Recipe/Profile class or None, if no such class was found in C{src}
     '''
     locals = {}
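The completed docstring makes the contract explicit: src is a string of Python
source, and the first recipe or profile class defined in it is returned. A
sketch of a caller (the MyFeed class and URL are illustrative):

    src = ("from libprs500.web.feeds.news import BasicNewsRecipe\n"
           "class MyFeed(BasicNewsRecipe):\n"
           "    title = 'My Feed'\n"
           "    feeds = [('Main', 'http://example.com/rss')]\n")
    recipe = compile_recipe(src)   # the MyFeed class, or None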
@@ -67,13 +69,20 @@ def compile_recipe(src):
 def get_builtin_recipe(title):
     '''
     Return a builtin recipe/profile class whoose title == C{title} or None if no such
-    recipe exists.
+    recipe exists. Also returns a flag that is True iff the found recipe is really
+    an old-style Profile.
+
     @type title: string
-    @rtype: class or None
+    @rtype: class or None, boolean
     '''
     for r in recipes:
         if r.title == title:
-            return r
+            return r, False
+    for p in builtin_profiles:
+        if p.title == title:
+            return p, True
+    return None, False

-titles = set([r.title for r in recipes])
+_titles = list(frozenset([r.title for r in recipes] + [p.title for p in builtin_profiles]))
+_titles.sort()
+titles = _titles
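Callers must now unpack a (class, is_profile) pair instead of a bare class.
A sketch of the new contract (Python 2 print statements, matching the
codebase; the title value is illustrative):

    recipe, is_profile = get_builtin_recipe('Dilbert')
    if recipe is None:
        print 'no builtin recipe or profile with that title'
    elif is_profile:
        print 'old-style Profile; wrap it in Profile2Recipe'
    else:
        print 'new-style recipe class'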
src/libprs500/web/feeds/recipes/dilbert.py (new file, 45 lines)
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Daily Dilbert
+'''
+import os
+
+from libprs500.web.feeds.news import CustomIndexRecipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+
+class Dilbert(CustomIndexRecipe):
+
+    title = 'Dilbert'
+    timefmt = ' [%d %b %Y]'
+
+    feeds = [('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert')]
+
+    def get_article_url(self, item):
+        return item.get('enclosures')[0].get('url')
+
+    def custom_index(self):
+        tdir = PersistentTemporaryDirectory('feeds2disk_dilbert')
+        index = os.path.join(tdir, 'index.html')
+        feed = self.parse_feeds()[0]
+
+        res = ''
+        for item in feed:
+            res += '<h3>%s</h3><img style="page-break-after:always" src="%s" />\n'%(item.title, item.url)
+        res = '<html><body><h1>Dilbert</h1>%s</body></html>'%res
+        open(index, 'wb').write(res)
+        return index