mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-11 09:13:57 -04:00
Migrate Dilbert feed to new infrastructure
This commit is contained in:
parent
0983dbeafd
commit
d834a8facc
@ -52,7 +52,8 @@ def xml_to_unicode(raw, verbose=False):
|
||||
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
|
||||
CHARSET_ALIASES = { "macintosh" : "mac-roman",
|
||||
"x-sjis" : "shift-jis" }
|
||||
encoding = encoding.lower()
|
||||
if encoding:
|
||||
encoding = encoding.lower()
|
||||
if CHARSET_ALIASES.has_key(encoding):
|
||||
encoding = CHARSET_ALIASES[encoding]
|
||||
return raw.decode(encoding, 'ignore'), encoding
|
||||
|
@ -24,13 +24,14 @@
|
||||
<reference py:if="mi.cover" type="cover" href="${mi.cover}" />
|
||||
</guide>
|
||||
|
||||
<manifest>
|
||||
<manifest py:if="getattr(mi, 'manifest', None)">
|
||||
<py:for each="i, m in enumerate(mi.manifest)">
|
||||
<item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
|
||||
</py:for>
|
||||
</manifest>
|
||||
|
||||
<spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
|
||||
<spine py:if="getattr(mi, 'manifest', None)"
|
||||
py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
|
||||
<py:for each="idref in mi.spine">
|
||||
<itemref idref="${str(idref)}" />
|
||||
</py:for>
|
||||
|
@ -17,7 +17,7 @@
|
||||
'''
|
||||
Contains the logic for parsing feeds.
|
||||
'''
|
||||
import time, logging
|
||||
import time, logging, traceback
|
||||
from datetime import datetime
|
||||
|
||||
from libprs500.web.feeds.feedparser import parse
|
||||
@ -54,11 +54,12 @@ Has content : %s
|
||||
|
||||
class Feed(object):
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, get_article_url=lambda item: item.get('link', None)):
|
||||
'''
|
||||
Parse a feed into articles.
|
||||
'''
|
||||
self.logger = logging.getLogger('feeds2disk')
|
||||
self.get_article_url = get_article_url
|
||||
|
||||
def populate_from_feed(self, feed, title=None, oldest_article=7,
|
||||
max_articles_per_feed=100):
|
||||
@ -124,7 +125,12 @@ class Feed(object):
|
||||
self.added_articles.append(id)
|
||||
|
||||
title = item.get('title', _('Untitled article'))
|
||||
link = item.get('link', None)
|
||||
try:
|
||||
link = self.get_article_url(item)
|
||||
except:
|
||||
self.logger.warning('Failed to get link for %s'%title)
|
||||
self.logger.debug(traceback.format_exc())
|
||||
link = None
|
||||
description = item.get('summary', None)
|
||||
|
||||
content = '\n'.join(i.value for i in item.get('content', []))
|
||||
@ -159,9 +165,10 @@ class Feed(object):
|
||||
return False
|
||||
|
||||
|
||||
def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
|
||||
def feed_from_xml(raw_xml, title=None, oldest_article=7,
|
||||
max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
|
||||
feed = parse(raw_xml)
|
||||
pfeed = Feed()
|
||||
pfeed = Feed(get_article_url=get_article_url)
|
||||
pfeed.populate_from_feed(feed, title=title,
|
||||
oldest_article=oldest_article,
|
||||
max_articles_per_feed=max_articles_per_feed)
|
||||
|
@ -20,6 +20,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
|
||||
import sys, os, logging
|
||||
from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
|
||||
from libprs500.web.fetch.simple import option_parser as _option_parser
|
||||
from libprs500.web.feeds.news import Profile2Recipe
|
||||
|
||||
|
||||
def option_parser(usage='''\
|
||||
@ -110,7 +111,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
|
||||
else:
|
||||
raise Exception('not file')
|
||||
except:
|
||||
recipe = get_builtin_recipe(recipe_arg)
|
||||
recipe, is_profile = get_builtin_recipe(recipe_arg)
|
||||
if recipe is None:
|
||||
recipe = compile_recipe(recipe_arg)
|
||||
|
||||
@ -125,7 +126,10 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
|
||||
handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
|
||||
logging.getLogger('feeds2disk').addHandler(handler)
|
||||
|
||||
recipe = recipe(opts, parser, notification)
|
||||
if is_profile:
|
||||
recipe = Profile2Recipe(recipe, opts, parser, notification)
|
||||
else:
|
||||
recipe = recipe(opts, parser, notification)
|
||||
if not os.path.exists(recipe.output_dir):
|
||||
os.makedirs(recipe.output_dir)
|
||||
recipe.download()
|
||||
|
@ -20,7 +20,7 @@ to an ebook.
|
||||
import logging, os, cStringIO, time, traceback, re
|
||||
import urlparse
|
||||
|
||||
from libprs500 import browser, __appname__
|
||||
from libprs500 import browser, __appname__, iswindows
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
|
||||
from libprs500.ebooks.metadata.opf import OPFCreator
|
||||
from libprs500.ebooks.metadata.toc import TOC
|
||||
@ -41,7 +41,7 @@ class BasicNewsRecipe(object):
|
||||
title = _('Unknown News Source')
|
||||
|
||||
#: The author of this recipe
|
||||
__author__ = _('Unknown')
|
||||
__author__ = _(__appname__)
|
||||
|
||||
#: Maximum number of articles to download from each feed
|
||||
#: @type: integer
|
||||
@ -198,6 +198,14 @@ class BasicNewsRecipe(object):
|
||||
'''
|
||||
return browser()
|
||||
|
||||
def get_article_url(self, item):
|
||||
'''
|
||||
Override to perform extraction of URL for each article.
|
||||
@param item: An article instance from L{feedparser}.
|
||||
@type item: L{FeedParserDict}
|
||||
'''
|
||||
return item.get('link', None)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
'''
|
||||
This function is called with the source of each downloaded HTML file, before
|
||||
@ -335,7 +343,7 @@ class BasicNewsRecipe(object):
|
||||
if head:
|
||||
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
|
||||
head.insert(len(head.contents), style)
|
||||
if first_fetch:
|
||||
if first_fetch and job_info:
|
||||
url, f, a, feed_len = job_info
|
||||
body = soup.find('body')
|
||||
if body is not None:
|
||||
@ -615,7 +623,8 @@ class BasicNewsRecipe(object):
|
||||
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
|
||||
title=title,
|
||||
oldest_article=self.oldest_article,
|
||||
max_articles_per_feed=self.max_articles_per_feed))
|
||||
max_articles_per_feed=self.max_articles_per_feed,
|
||||
get_article_url=self.get_article_url))
|
||||
|
||||
return parsed_feeds
|
||||
|
||||
@ -644,3 +653,55 @@ class BasicNewsRecipe(object):
|
||||
elif use_alt and item.has_key('alt'):
|
||||
strings.append(item['alt'])
|
||||
return u''.join(strings)
|
||||
|
||||
class Profile2Recipe(BasicNewsRecipe):
    '''
    Used to migrate the old news Profiles to the new Recipes. Uses the settings
    from the old Profile to populate the settings in the Recipe. Also uses, the
    Profile's get_browser and parse_feeds.
    '''
    def __init__(self, profile_class, options, parser, progress_reporter):
        # Instantiate the legacy Profile. It shares the 'feeds2disk' logger
        # and receives the credentials and LRF flag from the command line.
        self.old_profile = profile_class(logging.getLogger('feeds2disk'),
                                         username=options.username,
                                         password=options.password,
                                         lrf=options.lrf)
        # Copy the legacy Profile's configuration onto this Recipe so the new
        # download machinery sees the same settings. NOTE(review): this runs
        # before BasicNewsRecipe.__init__ — presumably the base constructor
        # reads these attributes, so the ordering matters; confirm before
        # reordering.
        for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
                     'match_regexps', 'filter_regexps', 'html2lrf_options',
                     'timefmt', 'needs_subscription', 'summary_length',
                     'max_articles_per_feed', 'title','no_stylesheets', 'encoding'):
            setattr(self, attr, getattr(self.old_profile, attr))

        # Old-style profiles were not written with concurrent downloads in
        # mind, so force serial fetching.
        self.simultaneous_downloads = 1
        BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
        # Reuse the profile's browser (which may already be configured or
        # logged in) instead of the one set up by BasicNewsRecipe.__init__.
        self.browser = self.old_profile.browser

    def parse_index(self):
        # Delegate feed discovery/parsing entirely to the legacy profile.
        return self.old_profile.parse_feeds()
|
||||
|
||||
class CustomIndexRecipe(BasicNewsRecipe):
    '''
    A recipe that generates its own local HTML index document and downloads
    from that, instead of fetching a remote feed index directly.
    '''

    def custom_index(self):
        '''
        Return the path to a custom HTML document that will serve as the index for
        this recipe.
        @rtype: string
        '''
        raise NotImplementedError

    def create_opf(self):
        # Build an OPF whose manifest covers every file fetched into the
        # output directory, with index.html as the sole spine entry.
        metadata = MetaInformation(self.title, [__appname__])
        opf = OPFCreator(self.output_dir, metadata)
        opf.create_manifest_from_files_in([self.output_dir])
        opf.create_spine(['index.html'])
        opf.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))

    def download(self):
        # Generate the custom index and turn its absolute path into a
        # file:// URL (Windows uses the single-slash 'file:' form).
        index = os.path.abspath(self.custom_index())
        if iswindows:
            url = 'file:' + index
        else:
            url = 'file://' + index
        # Fetch the index and everything it references into output_dir,
        # silently (no per-file progress output).
        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
        fetcher.show_progress = False
        fetcher.base_dir = self.output_dir
        fetcher.current_dir = self.output_dir
        result = fetcher.start_fetch(url)
        self.create_opf()
        return result
|
@ -17,14 +17,14 @@
|
||||
'''
|
||||
Builtin recipes.
|
||||
'''
|
||||
recipes = ['newsweek', 'atlantic', 'economist']
|
||||
recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']
|
||||
|
||||
import re
|
||||
from libprs500.web.feeds.news import BasicNewsRecipe
|
||||
from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
|
||||
from libprs500.ebooks.lrf.web import available_profiles
|
||||
from libprs500.ebooks.lrf.web import builtin_profiles
|
||||
|
||||
basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
|
||||
basic_recipes = (BasicNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
|
||||
basic_recipe_names = (i.__name__ for i in basic_recipes)
|
||||
|
||||
|
||||
@ -51,6 +51,8 @@ recipes = [load_recipe(i) for i in recipes]
|
||||
def compile_recipe(src):
|
||||
'''
|
||||
Compile the code in src and return the first object that is a recipe or profile.
|
||||
@param src: Python source code
|
||||
@type src: string
|
||||
@return: Recipe/Profile class or None, if no such class was found in C{src}
|
||||
'''
|
||||
locals = {}
|
||||
@ -67,13 +69,20 @@ def compile_recipe(src):
|
||||
def get_builtin_recipe(title):
|
||||
'''
|
||||
Return a builtin recipe/profile class whoose title == C{title} or None if no such
|
||||
recipe exists.
|
||||
recipe exists. Also returns a flag that is True iff the found recipe is really
|
||||
an old-style Profile.
|
||||
|
||||
@type title: string
|
||||
@rtype: class or None
|
||||
@rtype: class or None, boolean
|
||||
'''
|
||||
for r in recipes:
|
||||
if r.title == title:
|
||||
return r
|
||||
return r, False
|
||||
for p in builtin_profiles:
|
||||
if p.title == title:
|
||||
return p, True
|
||||
return None, False
|
||||
|
||||
titles = set([r.title for r in recipes])
|
||||
_titles = list(frozenset([r.title for r in recipes] + [p.title for p in builtin_profiles]))
|
||||
_titles.sort()
|
||||
titles = _titles
|
45
src/libprs500/web/feeds/recipes/dilbert.py
Normal file
45
src/libprs500/web/feeds/recipes/dilbert.py
Normal file
@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
'''
|
||||
Daily Dilbert
|
||||
'''
|
||||
import os
|
||||
from libprs500.web.feeds.news import CustomIndexRecipe
|
||||
from libprs500.ptempfile import PersistentTemporaryDirectory
|
||||
|
||||
class Dilbert(CustomIndexRecipe):
    '''
    Fetch the daily Dilbert strips from the feedburner feed and assemble them
    into a single locally generated HTML index page, one strip per page.
    '''

    title   = 'Dilbert'
    timefmt = ' [%d %b %Y]'

    feeds = [('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert')]

    def get_article_url(self, item):
        '''
        The Dilbert feed carries the strip image as an enclosure rather than
        as the entry link, so extract the URL of the first enclosure.

        @param item: An article instance from the feed parser.
        @return: URL of the strip image, or None when the item has no
            enclosures (the caller treats None as "no article URL").
        '''
        enclosures = item.get('enclosures')
        if not enclosures:
            return None
        return enclosures[0].get('url')

    def custom_index(self):
        '''
        Build an HTML document listing every strip in the feed, forcing a page
        break after each image, and return the path to it.
        @rtype: string
        '''
        tdir  = PersistentTemporaryDirectory('feeds2disk_dilbert')
        index = os.path.join(tdir, 'index.html')
        feed  = self.parse_feeds()[0]

        res = ''
        for item in feed:
            res += '<h3>%s</h3><img style="page-break-after:always" src="%s" />\n'%(item.title, item.url)
        # Bug fix: the closing tag was '</html' (missing '>'), producing
        # malformed HTML.
        res = '<html><body><h1>Dilbert</h1>%s</body></html>'%res
        # Close the file explicitly instead of relying on GC to flush it.
        f = open(index, 'wb')
        try:
            f.write(res)
        finally:
            f.close()
        return index
||||
|
Loading…
x
Reference in New Issue
Block a user