Implement scheduled news download. Add recipes for De Standaard and DeMorgen.be (thanks to Darko Miletic)

This commit is contained in:
Kovid Goyal 2008-11-20 16:57:59 -08:00
parent 145eab8acf
commit f1db17049c
12 changed files with 1542 additions and 86 deletions

View File

@ -8,30 +8,39 @@ Scheduler for automated recipe downloads
'''
import sys, copy
from threading import RLock
from datetime import datetime, timedelta
from PyQt4.Qt import QDialog, QApplication, QLineEdit, QPalette, SIGNAL, QBrush, \
QColor, QAbstractListModel, Qt, QVariant, QFont, QIcon, \
QFile, QObject, QTimer
QFile, QObject, QTimer, QMutex
from calibre import english_sort
from calibre.gui2.dialogs.scheduler_ui import Ui_Dialog
from calibre.web.feeds.recipes import recipes, recipe_modules, compile_recipe
from calibre.utils.search_query_parser import SearchQueryParser
from calibre.utils.pyparsing import ParseException
from calibre.gui2 import dynamic, NONE, error_dialog
from calibre.gui2 import NONE, error_dialog
from calibre.utils.config import DynamicConfig
config = DynamicConfig('scheduler')
class Recipe(object):
def __init__(self, id, recipe_class, builtin):
self.id = id
self.title = recipe_class.title
self.description = recipe_class.description
self.last_downloaded = datetime.fromordinal(1)
self.downloading = False
self.builtin = builtin
self.schedule = None
self.needs_subscription = recipe_class.needs_subscription
def __init__(self, id=None, recipe_class=None, builtin=True):
self.id = id
self.title = getattr(recipe_class, 'title', None)
self.description = getattr(recipe_class, 'description', None)
self.last_downloaded = datetime.fromordinal(1)
self.downloading = False
self.builtin = builtin
self.schedule = None
self.needs_subscription = getattr(recipe_class, 'needs_subscription', False)
def pickle(self):
    '''
    Return a shallow copy of this object's attribute dictionary.

    The copy is a new dict, so adding or replacing keys in it does not
    affect the instance.
    '''
    snapshot = dict(vars(self))
    return snapshot
def unpickle(self, dict):
    '''
    Copy every key/value pair of ``dict`` into this object's attributes,
    overwriting attributes of the same name, and return ``self`` so the
    call can be chained onto a constructor.
    '''
    attributes = vars(self)
    attributes.update(dict)
    return self
def __cmp__(self, other):
if self.id == getattr(other, 'id', None):
@ -53,10 +62,17 @@ class Recipe(object):
return self.id == getattr(other, 'id', None)
def __repr__(self):
return u'%s:%s'%(self.id, self.title)
return u'%s|%s|%s|%s'%(self.id, self.title, self.last_downloaded.ctime(), self.schedule)
builtin_recipes = [Recipe(m, r, True) for r, m in zip(recipes, recipe_modules)]
def save_recipes(recipes):
    '''
    Store the given recipes under the ``scheduled_recipes`` key of the
    scheduler config, each one converted to a plain dict via its
    ``pickle()`` method.
    '''
    pickled = []
    for recipe in recipes:
        pickled.append(recipe.pickle())
    config['scheduled_recipes'] = pickled
def load_recipes():
    '''
    Refresh the scheduler config and return the scheduled recipes as a
    list of ``Recipe`` objects rebuilt from their stored dicts.

    Returns an empty list when nothing is stored under
    ``scheduled_recipes``.
    '''
    config.refresh()
    stored = config.get('scheduled_recipes', [])
    loaded = []
    for state in stored:
        loaded.append(Recipe().unpickle(state))
    return loaded
class RecipeModel(QAbstractListModel, SearchQueryParser):
LOCATIONS = ['all']
@ -70,16 +86,18 @@ class RecipeModel(QAbstractListModel, SearchQueryParser):
for x in db.get_recipes():
recipe = compile_recipe(x[1])
self.recipes.append(Recipe(x[0], recipe, False))
sr = dynamic['scheduled_recipes']
if not sr:
sr = []
self.refresh()
self._map = list(range(len(self.recipes)))
def refresh(self):
sr = load_recipes()
for recipe in self.recipes:
if recipe in sr:
recipe.schedule = sr[sr.index(recipe)].schedule
recipe.last_downloaded = sr[sr.index(recipe)].last_downloaded
self.recipes.sort()
self._map = list(range(len(self.recipes)))
def universal_set(self):
return set(self.recipes)
@ -203,7 +221,7 @@ class SchedulerDialog(QDialog, Ui_Dialog):
lambda state: self.interval.setEnabled(state == Qt.Checked))
self.connect(self.show_password, SIGNAL('stateChanged(int)'),
lambda state: self.password.setEchoMode(self.password.Normal if state == Qt.Checked else self.password.Password))
self.connect(self.interval, SIGNAL('valueChanged(int)'), self.do_schedule)
self.connect(self.interval, SIGNAL('valueChanged(double)'), self.do_schedule)
self.connect(self.search, SIGNAL('search(PyQt_PyObject)'), self._model.search)
self.connect(self._model, SIGNAL('modelReset()'), lambda : self.detail_box.setVisible(False))
self.connect(self.download, SIGNAL('clicked()'), self.download_now)
@ -218,32 +236,32 @@ class SchedulerDialog(QDialog, Ui_Dialog):
username, password = username.strip(), password.strip()
recipe = self._model.data(self.recipes.currentIndex(), Qt.UserRole)
key = 'recipe_account_info_%s'%recipe.id
dynamic[key] = (username, password) if username and password else None
config[key] = (username, password) if username and password else None
def do_schedule(self, *args):
recipe = self.recipes.currentIndex()
if not recipe.isValid():
return
recipe = self._model.data(recipe, Qt.UserRole)
recipes = dynamic['scheduled_recipes']
recipes = load_recipes()
if self.schedule.checkState() == Qt.Checked:
if recipe in recipes:
recipe = recipes[recipes.index(recipe)]
else:
recipe.last_downloaded = datetime.fromordinal(1)
recipes.append(recipe)
recipes.schedule = self.interval.value()
if recipes.schedule == 0.0:
recipes.schedule = 1/24.
if recipe.need_subscription and not dynamic['recipe_account_info_%s'%recipe.id]:
recipe.schedule = self.interval.value()
if recipe.schedule < 0.1:
recipe.schedule = 1/24.
if recipe.needs_subscription and not config['recipe_account_info_%s'%recipe.id]:
error_dialog(self, _('Must set account information'), _('This recipe requires a username and password')).exec_()
self.schedule.setCheckState(Qt.Unchecked)
return
else:
if recipe in recipes:
recipes.remove(recipe)
dynamic['scheduled_recipes'] = recipes
save_recipes(recipes)
self.emit(SIGNAL('new_schedule(PyQt_PyObject)'), recipes)
self._model.resort()
def show_recipe(self, index):
recipe = self._model.data(index, Qt.UserRole)
@ -254,9 +272,9 @@ class SchedulerDialog(QDialog, Ui_Dialog):
self.interval.setValue(recipe.schedule if recipe.schedule is not None else 1)
self.detail_box.setVisible(True)
self.account.setVisible(recipe.needs_subscription)
self.interval.setEnabled(self.schedule.checkState == Qt.Checked)
self.interval.setEnabled(self.schedule.checkState() == Qt.Checked)
key = 'recipe_account_info_%s'%recipe.id
account_info = dynamic[key]
account_info = config[key]
self.show_password.setChecked(False)
if account_info:
self.username.blockSignals(True)
@ -265,73 +283,120 @@ class SchedulerDialog(QDialog, Ui_Dialog):
self.password.setText(account_info[1])
self.username.blockSignals(False)
self.password.blockSignals(False)
d = datetime.utcnow() - recipe.last_downloaded
ld = '%.1f'%(d.days + d.seconds/(24*3600))
if d < timedelta(days=366):
self.last_downloaded.setText(_('Last downloaded: %s days ago')%ld)
else:
self.last_downloaded.setText(_('Last downloaded: never'))
class Scheduler(QObject):
INTERVAL = 5 # minutes
INTERVAL = 1 # minutes
def __init__(self, main):
self.main = main
self.verbose = main.verbose
QObject.__init__(self)
self.lock = RLock()
self.lock = QMutex(QMutex.Recursive)
self.queue = set([])
recipes = dynamic['scheduled_recipes']
if not recipes:
recipes = []
recipes = load_recipes()
self.refresh_schedule(recipes)
self.timer = QTimer()
self.dirtied = False
self.connect(self.timer, SIGNAL('timeout()'), self.check)
self.timer.start(self.INTERVAL * 60000)
self.timer.start(int(self.INTERVAL * 60000))
def debug(self, *args):
    '''
    Write ``args``, converted to unicode and joined by single spaces, as
    one line on stdout. Does nothing unless ``self.verbose`` is true.
    '''
    if not self.verbose:
        return
    line = ' '.join(unicode(arg) for arg in args) + '\n'
    out = sys.stdout
    out.write(line)
    out.flush()  # flush immediately so the message is not held in the buffer
def check(self):
db = self.main.library_view.model().db
now = datetime.utcnow()
needs_downloading = set([])
for recipe in self.recipes:
delta = now - recipe.last_downloaded
if delta > timedelta(days=recipe.schedule):
needs_downloading.add(recipe)
with self.lock:
if not self.lock.tryLock():
return
try:
if self.dirtied:
self.refresh_schedule(load_recipes())
self.dirtied = False
needs_downloading = set([])
self.debug('Checking...')
now = datetime.utcnow()
for recipe in self.recipes:
if recipe.schedule is None:
continue
delta = now - recipe.last_downloaded
if delta > timedelta(days=recipe.schedule):
needs_downloading.add(recipe)
self.debug('Needs downloading:', needs_downloading)
needs_downloading = [r for r in needs_downloading if r not in self.queue]
for recipe in needs_downloading:
try:
id = int(recipe.id)
script = db.get_recipe(id)
if script is None:
self.recipes.remove(recipe)
dynamic['scheduled_recipes'] = self.recipes
continue
except ValueError:
script = recipe.title
self.main.download_scheduled_recipe(recipe, script, self.recipe_downloaded)
self.queue.add(recipe)
self.do_download(recipe)
finally:
self.lock.unlock()
def do_download(self, recipe):
    '''
    Hand ``recipe`` to the main window for download and add it to the
    queue of in-progress downloads.

    A recipe whose ``id`` converts to an integer is looked up in the
    library database to obtain its script. If the database has no script
    for that id, the recipe is dropped from the schedule (when it is
    scheduled) and nothing is queued. A recipe whose ``id`` is not an
    integer is downloaded using its title in place of the script.
    '''
    try:
        # Only the conversion is guarded: a ValueError here means the id
        # is not numeric. Keeping the rest outside the try stops the
        # ValueError raised by list.remove() from being mistaken for a
        # non-numeric id.
        recipe_id = int(recipe.id)
    except ValueError:
        script = recipe.title
    else:
        script = self.main.library_view.model().db.get_recipe(recipe_id)
        if script is None:
            # No script stored for this id: unschedule it if it is
            # scheduled, and never queue it.
            if recipe in self.recipes:
                self.recipes.remove(recipe)
                save_recipes(self.recipes)
            return
    self.debug('\tQueueing:', recipe)
    self.main.download_scheduled_recipe(recipe, script, self.recipe_downloaded)
    self.queue.add(recipe)
def recipe_downloaded(self, recipe):
with self.lock:
self.lock.lock()
try:
if recipe in self.recipes:
recipe = self.recipes[self.recipes.index(recipe)]
now = datetime.utcnow()
d = now - recipe.last_downloaded
if recipe.schedule is not None:
interval = timedelta(days=recipe.schedule)
if abs(d - interval) < timedelta(hours=1):
recipe.last_downloaded += interval
else:
recipe.last_downloaded = now
else:
recipe.last_downloaded = now
save_recipes(self.recipes)
self.queue.remove(recipe)
recipe = self.recipes[self.recipes.index(recipe)]
now = datetime.utcnow()
d = now - recipe.last_downloaded
interval = timedelta(days=recipe.schedule)
if abs(d - interval) < timedelta(hours=1):
recipe.last_downloaded += interval
else:
recipe.last_downloaded = now
dynamic['scheduled_recipes'] = self.recipes
self.dirtied = True
finally:
self.lock.unlock()
self.debug('Downloaded:', recipe)
def download(self, recipe):
if recipe in self.recipes:
recipe = self.recipes[self.recipes.index(recipe)]
raise NotImplementedError
self.lock.lock()
try:
if recipe in self.recipes:
recipe = self.recipes[self.recipes.index(recipe)]
if recipe not in self.queue:
self.do_download(recipe)
finally:
self.lock.unlock()
def refresh_schedule(self, recipes):
self.recipes = recipes
def show_dialog(self):
d = SchedulerDialog(self.main.library_view.model().db)
self.connect(d, SIGNAL('new_schedule(PyQt_PyObject)'), self.refresh_schedule)
self.connect(d, SIGNAL('download_now(PyQt_PyObject)'), self.download)
d.exec_()
self.lock.lock()
try:
d = SchedulerDialog(self.main.library_view.model().db)
self.connect(d, SIGNAL('new_schedule(PyQt_PyObject)'), self.refresh_schedule)
self.connect(d, SIGNAL('download_now(PyQt_PyObject)'), self.download)
d.exec_()
self.recipes = load_recipes()
finally:
self.lock.unlock()
def main(args=sys.argv):
app = QApplication([])

View File

@ -10,11 +10,11 @@
</rect>
</property>
<property name="windowTitle" >
<string>Schedule recipes for download</string>
<string>Schedule news download</string>
</property>
<property name="windowIcon" >
<iconset resource="../images.qrc" >
<normaloff>:/images/news.svg</normaloff>:/images/news.svg</iconset>
<normaloff>:/images/scheduler.svg</normaloff>:/images/scheduler.svg</iconset>
</property>
<layout class="QGridLayout" name="gridLayout" >
<item rowspan="2" row="0" column="0" >
@ -161,6 +161,13 @@
</item>
</layout>
</item>
<item>
<widget class="QLabel" name="last_downloaded" >
<property name="text" >
<string> </string>
</property>
</widget>
</item>
<item>
<widget class="QGroupBox" name="account" >
<property name="title" >

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 48 KiB

View File

@ -23,6 +23,7 @@ from calibre.gui2 import APP_UID, warning_dialog, choose_files, error_dialog, \
max_available_height, config
from calibre.gui2.cover_flow import CoverFlow, DatabaseImages, pictureflowerror
from calibre.library.database import LibraryDatabase
from calibre.gui2.dialogs.scheduler import Scheduler
from calibre.gui2.update import CheckForUpdates
from calibre.gui2.main_window import MainWindow, option_parser
from calibre.gui2.main_ui import Ui_MainWindow
@ -74,6 +75,7 @@ class Main(MainWindow, Ui_MainWindow):
Ui_MainWindow.__init__(self)
self.setupUi(self)
self.setWindowTitle(__appname__)
self.verbose = opts.verbose
self.read_settings()
self.job_manager = JobManager()
self.jobs_dialog = JobsDialog(self, self.job_manager)
@ -290,7 +292,10 @@ class Main(MainWindow, Ui_MainWindow):
from calibre.library import server_config
self.content_server = start_threaded_server(db, server_config().parse())
self.test_server_timer = QTimer.singleShot(10000, self.test_server)
self.scheduler = Scheduler(self)
self.connect(self.news_menu.scheduler, SIGNAL('triggered(bool)'), lambda x :self.scheduler.show_dialog())
def test_server(self, *args):
if self.content_server.exception is not None:
error_dialog(self, _('Failed to start content server'),
@ -1294,6 +1299,8 @@ path_to_ebook to the database.
''')
parser.add_option('--with-library', default=None, action='store',
help=_('Use the library located at the specified path.'))
parser.add_option('-v', '--verbose', default=0, action='count',
help=_('Log debugging information to console'))
opts, args = parser.parse_args(args)
if opts.with_library is not None and os.path.isdir(opts.with_library):
prefs.set('library_path', opts.with_library)

View File

@ -29,18 +29,25 @@ class NewsMenu(QMenu):
def __init__(self, customize_feeds_func):
QMenu.__init__(self)
self.scheduler = QAction(QIcon(':/images/scheduler.svg'), _('Schedule news download'), self)
self.addAction(self.scheduler)
self.cac = QAction(QIcon(':/images/user_profile.svg'), _('Add a custom news source'), self)
self.connect(self.cac, SIGNAL('triggered(bool)'), customize_feeds_func)
self.addAction(self.cac)
self.addSeparator()
self.custom_menu = CustomNewsMenu()
self.addMenu(self.custom_menu)
self.connect(self.custom_menu, SIGNAL('start_news_fetch(PyQt_PyObject, PyQt_PyObject)'),
self.fetch_news)
self.addSeparator()
self.dmenu = QMenu(self)
self.dmenu.setTitle(_('Download news'))
self.dmenu.setIcon(QIcon(':/images/news.svg'))
self.addMenu(self.dmenu)
for title in titles:
recipe = get_builtin_recipe(title)[0]
self.addAction(NewsAction(recipe, self))
self.dmenu.addAction(NewsAction(recipe, self))
def fetch_news(self, recipe, module):
@ -76,7 +83,7 @@ class CustomNewsMenu(QMenu):
def __init__(self):
QMenu.__init__(self)
self.setTitle(_('Custom news sources'))
self.setTitle(_('Download custom news'))
self.connect(self, SIGNAL('triggered(QAction*)'), self.launch)
def launch(self, action):

View File

@ -361,12 +361,13 @@ def _fetch_news(data, fmt):
def fetch_scheduled_recipe(recipe, script):
from calibre.gui2.dialogs.scheduler import config
fmt = prefs['output_format'].lower()
pt = PersistentTemporaryFile(suffix='_feeds2%s.%s'%(fmt.lower(), fmt.lower()))
pt.close()
args = ['feeds2%s'%fmt.lower(), '--output', pt.name, '--debug']
if recipe.needs_subscription:
x = dynamic['recipe_account_info_%s'%recipe.id]
x = config.get('recipe_account_info_%s'%recipe.id, False)
if not x:
raise ValueError(_('You must set a username and password for %s')%recipe.title)
args.extend(['--username', x[0], '--password', x[1]])

View File

@ -716,7 +716,7 @@ class LibraryDatabase2(LibraryDatabase):
self.conn.commit()
def get_recipes(self):
return self.conn.get('SELECT id, title FROM feeds')
return self.conn.get('SELECT id, script FROM feeds')
def get_recipe(self, id):
return self.conn.get('SELECT script FROM feeds WHERE id=?', (id,), all=False)

View File

@ -473,8 +473,12 @@ class DynamicConfig(dict):
class for preferences that you don't intend to have the users edit directly.
'''
def __init__(self, name='dynamic'):
dict.__init__(self, {})
self.name = name
self.file_path = os.path.join(config_dir, name+'.pickle')
self.refresh()
def refresh(self):
d = {}
if os.path.exists(self.file_path):
with ExclusiveFile(self.file_path) as f:
@ -482,8 +486,11 @@ class DynamicConfig(dict):
try:
d = cPickle.loads(raw) if raw.strip() else {}
except:
import traceback
traceback.print_exc()
d = {}
dict.__init__(self, d)
self.clear()
self.update(d)
def __getitem__(self, key):
try:

View File

@ -12,6 +12,7 @@ recipe_modules = [
'discover_magazine', 'scientific_american', 'new_york_review_of_books',
'daily_telegraph', 'guardian', 'el_pais', 'new_scientist', 'b92',
'politika', 'moscow_times', 'latimes', 'japan_times', 'san_fran_chronicle',
'demorgen_be', 'de_standaard'
]
import re, imp, inspect, time, os

View File

@ -0,0 +1,32 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
standaard.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
# Recipe for standaard.be: a single FeedBurner feed, with each article
# reduced to the page's main container element.
class DeStandaard(BasicNewsRecipe):

    title = u'De Standaard'
    __author__ = u'Darko Miletic'
    description = u'News from Belgium'

    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    # Tag specifications, each a dict with a tag name and attribute filter.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'id': '_parts_midContainer_div'}},
    ]
    remove_tags_after = {'name': 'h3', 'attrs': {'title': 'Binnenland'}}
    remove_tags = [
        {'name': 'h3', 'attrs': {'title': 'Binnenland'}},
        {'name': 'p', 'attrs': {'class': 'by'}},
        {'name': 'div', 'attrs': {'class': 'articlesright'}},
        {'name': 'a', 'attrs': {'class': 'help'}},
        {'name': 'a', 'attrs': {'class': 'archive'}},
        {'name': 'a', 'attrs': {'class': 'print'}},
        {'name': 'a', 'attrs': {'class': 'email'}},
    ]

    # (feed title, feed URL) pairs
    feeds = [
        (u'De Standaard Online', u'http://feeds.feedburner.com/dso-front'),
    ]

View File

@ -0,0 +1,31 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
demorgen.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
# Recipe for demorgen.be: one RSS feed per site section, with each article
# reduced to its 'art_box2' container.
class DeMorganBe(BasicNewsRecipe):
    title = u'DeMorgen.be'
    __author__ = u'Darko Miletic'
    description = u'News from Belgium'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    keep_only_tags = [dict(name='div', attrs={'class': 'art_box2'})]

    # (section title, feed URL) pairs. Each section is listed exactly once;
    # listing a feed twice makes its articles appear twice in the output.
    feeds = [
        (u'Nieuws', u'http://www.demorgen.be/nieuws/rss.xml'),
        (u'De Gedachte', u'http://www.demorgen.be/degedachte/rss.xml'),
        (u'Financiele morgen', u'http://www.demorgen.be/financielemorgen/rss.xml'),
        (u'Sport', u'http://www.demorgen.be/sport/rss.xml'),
        (u'Bis', u'http://www.demorgen.be/bis/rss.xml'),
        (u'Magazine', u'http://www.demorgen.be/magazine/rss.xml'),
        (u'De stand der dingen', u'http://www.demorgen.be/standderdingen/rss.xml'),
    ]

View File

@ -127,10 +127,13 @@ class RecursiveFetcher(object, LoggingInterface):
if self.keep_only_tags:
body = Tag(soup, 'body')
for spec in self.keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
try:
for spec in self.keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
pass
def remove_beyond(tag, next):
while tag is not None and tag.name != 'body':