Merge branch 'kovidgoyal/master'

Charles Haley 2013-07-13 09:20:39 +02:00
commit b3dbda5492
9 changed files with 369 additions and 13 deletions

New image file added (446 B; binary content not shown).


@@ -0,0 +1,111 @@
# vim:fileencoding=utf-8
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2013'

'''
monde-diplomatique.fr
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index


class LeMondeDiplomatiqueSiteWeb(BasicNewsRecipe):

    title = u'Le Monde diplomatique.fr'
    __author__ = 'Gaëtan Lehmann'
    description = "Le Monde diplomatique est un mensuel français d'information et d'opinion à la ligne éditoriale nettement engagée en faveur d'une gauche de rupture avec le capitalisme. Il aborde de nombreux sujets — géopolitique, relations internationales, économie, questions sociales, écologie, culture, médias, …"  # noqa
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    publisher = 'monde-diplomatique.fr'
    category = 'news, France, world'
    language = 'fr'
    masthead_url = 'http://www.monde-diplomatique.fr/squelettes/images/logotyfa.png'
    timefmt = ' [%d %b %Y]'
    no_stylesheets = True

    feeds = [(u'Blogs', u'http://blog.mondediplo.net/spip.php?page=backend'),
             (u'Archives', u'http://www.monde-diplomatique.fr/rss/')]

    preprocess_regexps = [
        (re.compile(r'<title>(.*) - Les blogs du Diplo</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) - Les blogs du Diplo</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>'),
        (re.compile(r'<title>(.*) \(Le Monde diplomatique\)</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) \(Le Monde diplomatique\)</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>'),
        (re.compile(r'<h3>Grand format</h3>'), lambda m: '')]

    remove_tags = [dict(name='div', attrs={'class':'voiraussi liste'}),
                   dict(name='ul', attrs={'class':'hermetique carto hombre_demi_inverse'}),
                   dict(name='a', attrs={'class':'tousles'}),
                   dict(name='h3', attrs={'class':'cat'}),
                   dict(name='div', attrs={'class':'logodiplo'}),
                   dict(name='img', attrs={'class':'spip_logos'}),
                   dict(name='p', attrs={'id':'hierarchie'}),
                   dict(name='div', attrs={'class':'espace'})]

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
    }

    remove_empty_feeds = True
    filterDuplicates = True

    # don't use parse_index - we need it to raise an exception so we can mix
    # feed and parse_index results in parse_feeds
    def parse_index_valise(self):
        articles = []
        soup = self.index_to_soup('http://www.monde-diplomatique.fr/carnet/')
        cnt = soup.find('ul', attrs={'class':'hermetique liste'})
        for item in cnt.findAll('li'):
            description = ''
            feed_link = item.find('a')
            desc = item.find('div', attrs={'class':'intro'})
            date = item.find('div', attrs={'class':'dates_auteurs'})
            if desc:
                description = desc.string
            if feed_link and feed_link.has_key('href'):
                url = 'http://www.monde-diplomatique.fr' + feed_link['href']
                title = self.tag_to_string(feed_link)
                articles.append({
                    'title': title,
                    'date': date.string.strip(),
                    'url': url,
                    'description': description,
                })
        return [("La valise diplomatique", articles)]

    def parse_index_cartes(self):
        articles = []
        soup = self.index_to_soup('http://www.monde-diplomatique.fr/cartes/')
        cnt = soup.find('div', attrs={'class':'decale hermetique'})
        for item in cnt.findAll('div', attrs={'class':re.compile('grid_3 filet hombre_demi')}):
            feed_link = item.find('a', attrs={'class':'couve'})
            h3 = item.find('h3')
            authorAndDate = item.find('div', attrs={'class':'dates_auteurs'})
            author, date = authorAndDate.string.strip().split(', ')
            if feed_link and feed_link.has_key('href'):
                url = 'http://www.monde-diplomatique.fr' + feed_link['href']
                title = self.tag_to_string(h3)
                articles.append({
                    'title': title,
                    'date': date,
                    'url': url,
                    'description': author,
                })
        return [("Cartes", articles)]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        valise = feeds_from_index(self.parse_index_valise(), oldest_article=self.oldest_article,
                                  max_articles_per_feed=self.max_articles_per_feed,
                                  log=self.log)
        cartes = feeds_from_index(self.parse_index_cartes(), oldest_article=self.oldest_article,
                                  max_articles_per_feed=self.max_articles_per_feed,
                                  log=self.log)
        feeds = valise + feeds + cartes
        return feeds
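The trick worth noting here: BasicNewsRecipe normally uses either feeds or parse_index, not both. By keeping the index scrapers under non-standard names (parse_index_valise, parse_index_cartes) and converting their results with feeds_from_index inside an overridden parse_feeds, the recipe merges RSS-driven sections with screen-scraped ones. Assuming the file is saved as le_monde_diplomatique_fr.recipe, it can be spot-checked with calibre's ebook-convert tool (--test fetches only a couple of articles per feed):

    ebook-convert le_monde_diplomatique_fr.recipe out.epub --test -vv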


@@ -8,7 +8,7 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

# Imports {{{
-import os, shutil, uuid, json, glob, time
+import os, shutil, uuid, json, glob, time, cPickle
from functools import partial

import apsw

@@ -1216,5 +1216,27 @@ class DB(object):

    def get_ids_for_custom_book_data(self, name):
        return frozenset(r[0] for r in self.conn.execute('SELECT book FROM books_plugin_data WHERE name=?', (name,)))

    def conversion_options(self, book_id, fmt):
        for (data,) in self.conn.get('SELECT data FROM conversion_options WHERE book=? AND format=?', (book_id, fmt.upper())):
            if data:
                return cPickle.loads(bytes(data))

    def has_conversion_options(self, ids, fmt='PIPE'):
        ids = frozenset(ids)
        self.conn.execute('DROP TABLE IF EXISTS conversion_options_temp; CREATE TEMP TABLE conversion_options_temp (id INTEGER PRIMARY KEY);')
        self.conn.executemany('INSERT INTO conversion_options_temp VALUES (?)', [(x,) for x in ids])
        for (book_id,) in self.conn.get(
                'SELECT book FROM conversion_options WHERE format=? AND book IN (SELECT id FROM conversion_options_temp)', (fmt.upper(),)):
            return True
        return False

    def delete_conversion_options(self, book_ids, fmt):
        self.conn.executemany('DELETE FROM conversion_options WHERE book=? AND format=?',
                [(book_id, fmt.upper()) for book_id in book_ids])

    def set_conversion_options(self, options, fmt):
        options = [(book_id, fmt.upper(), buffer(cPickle.dumps(data, -1))) for book_id, data in options.iteritems()]
        self.conn.executemany('INSERT OR REPLACE INTO conversion_options(book,format,data) VALUES (?,?,?)', options)

    # }}}
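A note on the storage format used above: each options dict is pickled with protocol -1 (highest available) and stored as an SQLite BLOB via buffer; reading reverses this with bytes() plus cPickle.loads(). A minimal standalone sketch of that round-trip (Python 2; the option name is illustrative):

    import cPickle
    blob = buffer(cPickle.dumps({'margin_top': 10.0}, -1))  # value as written by set_conversion_options()
    opts = cPickle.loads(bytes(blob))                       # value as read back by conversion_options()
    assert opts == {'margin_top': 10.0}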


@@ -265,8 +265,10 @@ class Cache(object):
        for name, field in self.fields.iteritems():
            if name[0] == '#' and name.endswith('_index'):
                field.series_field = self.fields[name[:-len('_index')]]
                self.fields[name[:-len('_index')]].index_field = field
            elif name == 'series_index':
                field.series_field = self.fields['series']
                self.fields['series'].index_field = field
            elif name == 'authors':
                field.author_sort_field = self.fields['author_sort']
            elif name == 'title':

@@ -1179,6 +1181,18 @@ class Cache(object):
            else:
                table.remove_books(book_ids, self.backend)

    @write_api
    def remove_items(self, field, item_ids):
        ''' Delete all items in the specified field with the specified ids. Returns the set of affected book ids. '''
        field = self.fields[field]
        affected_books = field.table.remove_items(item_ids, self.backend)
        if affected_books:
            if hasattr(field, 'index_field'):
                self._set_field(field.index_field.name, {bid:1.0 for bid in affected_books})
            else:
                self._mark_as_dirty(affected_books)
        return affected_books

    @write_api
    def add_custom_book_data(self, name, val_map, delete_first=False):
        ''' Add data for name where val_map is a map of book_ids to values. If

@@ -1208,6 +1222,22 @@ class Cache(object):
        ''' Return the set of book ids for which name has data. '''
        return self.backend.get_ids_for_custom_book_data(name)

    @read_api
    def conversion_options(self, book_id, fmt='PIPE'):
        return self.backend.conversion_options(book_id, fmt)

    @read_api
    def has_conversion_options(self, ids, fmt='PIPE'):
        return self.backend.has_conversion_options(ids, fmt)

    @write_api
    def delete_conversion_options(self, book_ids, fmt='PIPE'):
        return self.backend.delete_conversion_options(book_ids, fmt)

    @write_api
    def set_conversion_options(self, options, fmt='PIPE'):
        ''' options must be a map of the form {book_id:conversion_options} '''
        return self.backend.set_conversion_options(options, fmt)

    # }}}
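Together these four methods form the public conversion-options API on Cache, mirrored by the tests later in this commit. A hedged usage sketch (cache is an open Cache instance; the book id and option values are illustrative):

    cache.set_conversion_options({1: {'base_font_size': 12.0}})  # fmt defaults to 'PIPE'
    if cache.has_conversion_options([1]):
        opts = cache.conversion_options(1)                       # -> {'base_font_size': 12.0}
    cache.delete_conversion_options([1])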


@@ -98,6 +98,13 @@ class LibraryDatabase(object):
                    return self.new_api.get_item_name(field, item_id)
                return func
            setattr(self, '%s_name' % field, MT(getter(field)))

        for field in ('publisher', 'series', 'tag'):
            def getter(field):
                fname = 'tags' if field == 'tag' else field
                def func(self, item_id):
                    self.new_api.remove_items(fname, (item_id,))
                return func
            setattr(self, 'delete_%s_using_id' % field, MT(getter(field)))

        # Legacy field API
        for func in (

@@ -383,6 +390,18 @@ class LibraryDatabase(object):
                break
        return ans

    def set_conversion_options(self, book_id, fmt, options):
        self.new_api.set_conversion_options({book_id:options}, fmt=fmt)

    def conversion_options(self, book_id, fmt):
        return self.new_api.conversion_options(book_id, fmt=fmt)

    def has_conversion_options(self, ids, format='PIPE'):
        return self.new_api.has_conversion_options(ids, fmt=format)

    def delete_conversion_options(self, book_id, fmt, commit=True):
        self.new_api.delete_conversion_options((book_id,), fmt=fmt)

    # Private interface {{{
    def __iter__(self):
        for row in self.data.iterall():
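The loop in the first hunk stamps delete_publisher_using_id, delete_series_using_id and delete_tag_using_id onto LibraryDatabase at construction time, each forwarding to the new remove_items() API (note the tag/tags field-name mapping). A sketch of the calls this enables, assuming db is a LibraryDatabase and the ids are illustrative:

    db.delete_tag_using_id(tag_id)        # forwards to db.new_api.remove_items('tags', (tag_id,))
    db.delete_series_using_id(series_id)  # forwards to db.new_api.remove_items('series', (series_id,))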


@@ -204,6 +204,21 @@ class ManyToOneTable(Table):
                [(x,) for x in clean])
        return clean

    def remove_items(self, item_ids, db):
        affected_books = set()
        for item_id in item_ids:
            val = self.id_map.pop(item_id, null)
            if val is null:
                continue
            book_ids = self.col_book_map.pop(item_id, set())
            for book_id in book_ids:
                self.book_col_map.pop(book_id, None)
            affected_books.update(book_ids)
        item_ids = tuple((x,) for x in item_ids)
        db.conn.executemany('DELETE FROM {0} WHERE {1}=?'.format(self.link_table, self.metadata['link_column']), item_ids)
        db.conn.executemany('DELETE FROM {0} WHERE id=?'.format(self.metadata['table']), item_ids)
        return affected_books

class ManyToManyTable(ManyToOneTable):
    '''

@@ -250,6 +265,21 @@ class ManyToManyTable(ManyToOneTable):
                [(x,) for x in clean])
        return clean

    def remove_items(self, item_ids, db):
        affected_books = set()
        for item_id in item_ids:
            val = self.id_map.pop(item_id, null)
            if val is null:
                continue
            book_ids = self.col_book_map.pop(item_id, set())
            for book_id in book_ids:
                self.book_col_map[book_id] = tuple(x for x in self.book_col_map.get(book_id, ()) if x != item_id)
            affected_books.update(book_ids)
        item_ids = tuple((x,) for x in item_ids)
        db.conn.executemany('DELETE FROM {0} WHERE {1}=?'.format(self.link_table, self.metadata['link_column']), item_ids)
        db.conn.executemany('DELETE FROM {0} WHERE id=?'.format(self.metadata['table']), item_ids)
        return affected_books

class AuthorsTable(ManyToManyTable):

    def read_id_maps(self, db):

@@ -274,6 +304,9 @@ class AuthorsTable(ManyToManyTable):
            self.asort_map.pop(item_id, None)
        return clean

    def remove_items(self, item_ids, db):
        raise ValueError('Direct removal of authors is not allowed')

class FormatsTable(ManyToManyTable):

    do_clean_on_remove = False

@@ -331,6 +364,9 @@ class FormatsTable(ManyToManyTable):
        return {book_id:zero_max(book_id) for book_id in formats_map}

    def remove_items(self, item_ids, db):
        raise NotImplementedError('Cannot delete a format directly')

    def update_fmt(self, book_id, fmt, fname, size, db):
        fmts = list(self.book_col_map.get(book_id, []))
        try:

@@ -381,4 +417,6 @@ class IdentifiersTable(ManyToManyTable):
            clean.add(item_id)
        return clean

    def remove_items(self, item_ids, db):
        raise NotImplementedError('Direct deletion of identifiers is not implemented')
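The two remove_items() implementations differ in how they patch book_col_map: a many-to-one field drops the book's entry outright, while a many-to-many field only filters the deleted item out of each book's tuple. A toy sketch of the contrast (plain dicts standing in for the in-memory table maps):

    many_to_one = {1: 7}                       # book 1 -> series item 7
    many_to_one.pop(1, None)                   # removing item 7 leaves book 1 with no series at all

    many_to_many = {1: (7, 9)}                 # book 1 -> tag items 7 and 9
    many_to_many[1] = tuple(x for x in many_to_many[1] if x != 7)  # -> (9,): other tags survive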


@@ -191,6 +191,52 @@ class LegacyTest(BaseTest):
        db.close()
    # }}}

    def test_legacy_conversion_options(self):  # {{{
        'Test conversion options API'
        ndb = self.init_legacy()
        db = self.init_old()
        all_ids = ndb.new_api.all_book_ids()
        op1, op2 = {'xx':'yy'}, {'yy':'zz'}
        for x in (
            ('has_conversion_options', all_ids),
            ('conversion_options', 1, 'PIPE'),
            ('set_conversion_options', 1, 'PIPE', op1),
            ('has_conversion_options', all_ids),
            ('conversion_options', 1, 'PIPE'),
            ('delete_conversion_options', 1, 'PIPE'),
            ('has_conversion_options', all_ids),
        ):
            meth, args = x[0], x[1:]
            self.assertEqual((getattr(db, meth)(*args)), (getattr(ndb, meth)(*args)),
                'The method: %s() returned different results for argument %s' % (meth, args))
        db.close()
    # }}}

    def test_legacy_delete_using(self):  # {{{
        'Test delete_using() API'
        ndb = self.init_legacy()
        db = self.init_old()
        cache = ndb.new_api
        tmap = cache.get_id_map('tags')
        t = next(tmap.iterkeys())
        pmap = cache.get_id_map('publisher')
        p = next(pmap.iterkeys())
        for x in (
            ('delete_tag_using_id', t),
            ('delete_publisher_using_id', p),
            (db.refresh,),
            ('all_tag_names',), ('tags', 0), ('tags', 1), ('tags', 2),
            ('all_publisher_names',), ('publisher', 0), ('publisher', 1), ('publisher', 2),
        ):
            meth, args = x[0], x[1:]
            if callable(meth):
                meth(*args)
            else:
                self.assertEqual((getattr(db, meth)(*args)), (getattr(ndb, meth)(*args)),
                    'The method: %s() returned different results for argument %s' % (meth, args))
        db.close()
    # }}}

    def test_legacy_adding_books(self):  # {{{
        'Test various adding books methods'
        from calibre.ebooks.metadata.book.base import Metadata

@@ -269,7 +315,10 @@ class LegacyTest(BaseTest):
            'books_in_old_database',  # unused

            # Internal API
-            'clean_user_categories', 'cleanup_tags', 'books_list_filter',
+            'clean_user_categories', 'cleanup_tags', 'books_list_filter', 'conn', 'connect', 'construct_file_name',
+            'construct_path_name', 'clear_dirtied', 'commit_dirty_cache', 'initialize_database', 'initialize_dynamic',
+            'run_import_plugins', 'vacuum', 'set_path', 'row', 'row_factory', 'rows', 'rmtree', 'series_index_pat',
+            'import_old_database', 'dirtied_lock', 'dirtied_cache', 'dirty_queue_length', 'dirty_books_referencing',
        }
        SKIP_ARGSPEC = {
            '__init__', 'get_next_series_num_for', 'has_book', 'author_sort_from_authors',

@@ -280,7 +329,7 @@ class LegacyTest(BaseTest):
        try:
            total = 0
            for attr in dir(db):
-                if attr in SKIP_ATTRS:
+                if attr in SKIP_ATTRS or attr.startswith('upgrade_version'):
                    continue
                total += 1
                if not hasattr(ndb, attr):

@@ -302,7 +351,7 @@ class LegacyTest(BaseTest):
        if missing:
            pc = len(missing)/total
-            raise AssertionError('{0:.1%} of API ({2} attrs) are missing. For example: {1}'.format(pc, ', '.join(missing[:5]), len(missing)))
+            raise AssertionError('{0:.1%} of API ({2} attrs) are missing: {1}'.format(pc, ', '.join(missing), len(missing)))
    # }}}


@@ -419,3 +419,58 @@ class WritingTest(BaseTest):
    # }}}

    def test_conversion_options(self):  # {{{
        ' Test saving of conversion options '
        cache = self.init_cache()
        all_ids = cache.all_book_ids()
        self.assertFalse(cache.has_conversion_options(all_ids))
        self.assertIsNone(cache.conversion_options(1))
        op1, op2 = {'xx':'yy'}, {'yy':'zz'}
        cache.set_conversion_options({1:op1, 2:op2})
        self.assertTrue(cache.has_conversion_options(all_ids))
        self.assertEqual(cache.conversion_options(1), op1)
        self.assertEqual(cache.conversion_options(2), op2)
        cache.set_conversion_options({1:op2})
        self.assertEqual(cache.conversion_options(1), op2)
        cache.delete_conversion_options(all_ids)
        self.assertFalse(cache.has_conversion_options(all_ids))
    # }}}

    def test_remove_items(self):  # {{{
        ' Test removal of many-(many,one) items '
        cache = self.init_cache()
        tmap = cache.get_id_map('tags')
        self.assertEqual(cache.remove_items('tags', tmap), {1, 2})
        tmap = cache.get_id_map('#tags')
        t = {v:k for k, v in tmap.iteritems()}['My Tag Two']
        self.assertEqual(cache.remove_items('#tags', (t,)), {1, 2})
        smap = cache.get_id_map('series')
        self.assertEqual(cache.remove_items('series', smap), {1, 2})
        smap = cache.get_id_map('#series')
        s = {v:k for k, v in smap.iteritems()}['My Series Two']
        self.assertEqual(cache.remove_items('#series', (s,)), {1})

        for c in (cache, self.init_cache()):
            self.assertFalse(c.get_id_map('tags'))
            self.assertFalse(c.all_field_names('tags'))
            for bid in c.all_book_ids():
                self.assertFalse(c.field_for('tags', bid))

            self.assertEqual(len(c.get_id_map('#tags')), 1)
            self.assertEqual(c.all_field_names('#tags'), {'My Tag One'})
            for bid in c.all_book_ids():
                self.assertIn(c.field_for('#tags', bid), ((), ('My Tag One',)))

            for bid in (1, 2):
                self.assertEqual(c.field_for('series_index', bid), 1.0)
            self.assertFalse(c.get_id_map('series'))
            self.assertFalse(c.all_field_names('series'))
            for bid in c.all_book_ids():
                self.assertFalse(c.field_for('series', bid))

            self.assertEqual(c.field_for('series_index', 1), 1.0)
            self.assertEqual(c.all_field_names('#series'), {'My Series One'})
            for bid in c.all_book_ids():
                self.assertIn(c.field_for('#series', bid), (None, 'My Series One'))
    # }}}


@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)

-store_version = 3  # Needed for dynamic plugin loading
+store_version = 4  # Needed for dynamic plugin loading

__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'

@@ -126,15 +126,47 @@ class AmazonKindleStore(StorePlugin):
        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read().decode('latin-1', 'replace'))
+            doc = html.fromstring(f.read())

-            data_xpath = '//div[contains(@class, "prod")]'
-            format_xpath = './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
-            asin_xpath = '@name'
-            cover_xpath = './/img[@class="productImage"]/@src'
-            title_xpath = './/h3[@class="newaps"]/a//text()'
-            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
-            price_xpath = './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
+            if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
+                data_xpath = '//div[contains(@class, "prod")]'
+                format_xpath = (
+                    './/ul[contains(@class, "rsltGridList")]'
+                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
+                asin_xpath = '@name'
+                cover_xpath = './/img[@class="productImage"]/@src'
+                title_xpath = './/h3[@class="newaps"]/a//text()'
+                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
+                price_xpath = (
+                    './/ul[contains(@class, "rsltGridList")]'
+                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
+            elif doc.xpath('//div[@id = "atfResults" and contains(@class, "ilresults")]'):
+                data_xpath = '//li[(@class="ilo")]'
+                format_xpath = (
+                    './/ul[contains(@class, "rsltGridList")]'
+                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
+                asin_xpath = '@name'
+                cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
+                title_xpath = './/h3[@class="newaps"]/a//text()'
+                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
+                # Results can be in a grid (table) or a column
+                price_xpath = (
+                    './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
+                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
+            elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
+                data_xpath = '//div[contains(@class, "prod")]'
+                format_xpath = (
+                    './/ul[contains(@class, "rsltL")]'
+                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
+                asin_xpath = '@name'
+                cover_xpath = './/img[@class="productImage"]/@src'
+                title_xpath = './/h3[@class="newaps"]/a//text()'
+                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
+                price_xpath = (
+                    './/ul[contains(@class, "rsltL")]'
+                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
+            else:
+                return

            for data in doc.xpath(data_xpath):
                if counter <= 0:
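The rewritten search() now detects which results layout Amazon served (grid, image list or plain list) by probing the atfResults container, then picks a matching set of XPaths; unknown layouts bail out instead of scraping garbage. A minimal standalone sketch of that dispatch pattern (uses lxml; the markup here is illustrative, not Amazon's real page):

    from lxml import html

    page = html.fromstring('<div id="atfResults" class="grid"><div class="prod">hit</div></div>')
    if page.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
        data_xpath = '//div[contains(@class, "prod")]'   # grid layout
    elif page.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
        data_xpath = '//div[contains(@class, "prod")]'   # list layout
    else:
        data_xpath = None                                # unknown layout: give up quietly
    for node in (page.xpath(data_xpath) if data_xpath else ()):
        print(node.text_content())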