F-Secure by louhike and more work on the new metadata download system

Kovid Goyal 2011-04-04 08:02:28 -06:00
parent 492d16e5c9
commit 6be7471d2e
4 changed files with 186 additions and 15 deletions

recipes/f_secure.recipe (new file)

@@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1301860159(BasicNewsRecipe):
+    title = u'F-Secure Weblog'
+    language = 'en'
+    __author__ = 'louhike'
+    description = u'All the news from the weblog of F-Secure'
+    publisher = u'F-Secure'
+    timefmt = ' [%a, %d %b, %Y]'
+    encoding = 'ISO-8859-1'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    language = 'en_EN'
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
+    remove_tags = [dict(name='a'),dict(name='hr')]
+
+    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]
+    def get_cover_url(self):
+        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
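
Editor's note: in calibre recipes, keep_only_tags selects the article container and remove_tags strips unwanted elements from it. A minimal standalone sketch of what those two settings amount to, assuming BeautifulSoup 4 rather than calibre's actual download pipeline (clean_article is an invented helper name):

from bs4 import BeautifulSoup

def clean_article(html):
    # keep_only_tags: keep just the article container div
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('div', attrs={'class': 'modSectionTd2'})
    if body is None:
        return html
    # remove_tags: drop links and horizontal rules inside it
    for tag in body.find_all(['a', 'hr']):
        tag.decompose()
    return str(body)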

src/calibre/ebooks/metadata/sources/amazon.py

@@ -282,6 +282,7 @@ class Amazon(Source):
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
         'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
+    has_html_comments = True
 
     AMAZON_DOMAINS = {
         'com': _('US'),

src/calibre/ebooks/metadata/sources/base.py

@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
 from calibre.ebooks.metadata import check_isbn
 
 msprefs = JSONConfig('metadata_sources.json')
+msprefs.defaults['txt_comments'] = False
+msprefs.defaults['ignore_fields'] = []
+msprefs.defaults['max_tags'] = 10
 
 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -104,6 +107,9 @@ class Source(Plugin):
     #: during the identify phase
     touched_fields = frozenset()
 
+    #: Set this to True if your plugin returns HTML formatted comments
+    has_html_comments = False
+
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
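
Editor's note: these two additions define the contract between plugins and the new merge code: a plugin declares whether its comments are HTML, and users control conversion and field filtering through msprefs. A hedged sketch of how a third-party source might use this; MySource and its attribute values are hypothetical, while Source, msprefs, and the preference keys come from the diff above:

from calibre.ebooks.metadata.sources.base import Source, msprefs

class MySource(Source):  # hypothetical plugin, not part of calibre
    name = 'My Source'
    capabilities = frozenset(['identify'])
    touched_fields = frozenset(['title', 'authors', 'comments'])
    has_html_comments = True  # this source returns HTML comment fragments

# msprefs is a JSONConfig, which behaves like a persistent dict:
msprefs['txt_comments'] = True         # convert HTML comments to plain text
msprefs['ignore_fields'] = ['rating']  # blank out rating in merged results
msprefs['max_tags'] = 5                # keep at most five tags per result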

src/calibre/ebooks/metadata/sources/identify.py

@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 import time
+from datetime import datetime
 from Queue import Queue, Empty
 from threading import Thread
 from io import BytesIO
+from operator import attrgetter
 
 from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
 from calibre.ebooks.metadata.xisbn import xisbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import utc_tz
+from calibre.utils.html2text import html2text
 
 # How long to wait for more results after first result is found
 WAIT_AFTER_FIRST_RESULT = 30 # seconds
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
     log('Merging results from different sources and finding earliest',
             'publication dates')
     start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
     log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    max_tags = msprefs['max_tags']
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+            r.tags = r.tags[:max_tags]
+
+    return results
+
 
 class ISBNMerge(object):
 
     def __init__(self):
         self.pools = {}
+        self.isbnless_results = []
 
     def isbn_in_pool(self, isbn):
         if isbn:
@@ -140,7 +161,9 @@
                 return True
         return False
 
-    def add_result(self, result, isbn):
+    def add_result(self, result):
+        isbn = result.isbn
+        if isbn:
             pool = self.isbn_in_pool(isbn)
             if pool is None:
                 isbns, min_year = xisbn.get_isbn_pool(isbn)
@@ -150,12 +173,131 @@
             if not self.pool_has_result_from_same_source(pool, result):
                 pool[1].append(result)
+        else:
+            self.isbnless_results.append(result)
+
+    def finalize(self):
+        has_isbn_result = False
+        for results in self.pools.itervalues():
+            if results:
+                has_isbn_result = True
+                break
+        self.has_isbn_result = has_isbn_result
+
+        if has_isbn_result:
+            self.merge_isbn_results()
+        else:
+            self.results = sorted(self.isbnless_results,
+                key=attrgetter('relevance_in_source'))
+
+        return self.results
+
+    def merge_isbn_results(self):
+        self.results = []
+        for min_year, results in self.pools.itervalues():
+            if results:
+                self.results.append(self.merge(results, min_year))
+
+        self.results.sort(key=attrgetter('average_source_relevance'))
+
+    def length_merge(self, attr, results, null_value=None, shortest=True):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        values = [x for x in values if len(x) > 0]
+        if not values:
+            return null_value
+        values.sort(key=len, reverse=not shortest)
+        return values[0]
+
+    def random_merge(self, attr, results, null_value=None):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        return values[0] if values else null_value
+
+    def merge(self, results, min_year):
+        ans = Metadata(_('Unknown'))
+
+        # We assume the shortest title has the least cruft in it
+        ans.title = self.length_merge('title', results, null_value=ans.title)
+
+        # No harm in having extra authors, maybe something useful like an
+        # editor or translator
+        ans.authors = self.length_merge('authors', results,
+                null_value=ans.authors, shortest=False)
+
+        # We assume the shortest publisher has the least cruft in it
+        ans.publisher = self.length_merge('publisher', results,
+                null_value=ans.publisher)
+
+        # We assume the smallest set of tags has the least cruft in it
+        ans.tags = self.length_merge('tags', results,
+                null_value=ans.tags)
+
+        # We assume the longest series has the most info in it
+        ans.series = self.length_merge('series', results,
+                null_value=ans.series, shortest=False)
+        for r in results:
+            if r.series and r.series == ans.series:
+                ans.series_index = r.series_index
+                break
+
+        # Average the rating over all sources
+        ratings = []
+        for r in results:
+            rating = r.rating
+            if rating and rating > 0 and rating <= 5:
+                ratings.append(rating)
+        if ratings:
+            ans.rating = sum(ratings)/len(ratings)
+
+        # Smallest language is likely to be valid
+        ans.language = self.length_merge('language', results,
+                null_value=ans.language)
+
+        # Choose longest comments
+        ans.comments = self.length_merge('comments', results,
+                null_value=ans.comments, shortest=False)
+
+        # Published date (datetime caps at year 9999, so use that as the
+        # "no date found" sentinel)
+        if min_year:
+            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
+            ans.pubdate = min_date
+        else:
+            min_date = datetime(9999, 1, 1, tzinfo=utc_tz)
+            for r in results:
+                if r.pubdate is not None and r.pubdate < min_date:
+                    min_date = r.pubdate
+            if min_date.year < 9999:
+                ans.pubdate = min_date
+
+        # Identifiers
+        for r in results:
+            ans.identifiers.update(r.identifiers)
+
+        # Merge any other fields with no special handling (random merge)
+        touched_fields = set()
+        for r in results:
+            touched_fields |= r.plugin.touched_fields
+        for f in touched_fields:
+            if f.startswith('identifier:') or not ans.is_null(f):
+                continue
+            setattr(ans, f, self.random_merge(f, results,
+                null_value=getattr(ans, f)))
+
+        avg = [x.relevance_in_source for x in results]
+        avg = sum(avg)/len(avg)
+        ans.average_source_relevance = avg
+
+        return ans
+
 
 def merge_identify_results(result_map, log):
+    isbn_merge = ISBNMerge()
     for plugin, results in result_map.iteritems():
         for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
+            isbn_merge.add_result(result)
+
+    return isbn_merge.finalize()
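
Editor's note: the merge policy above is driven almost entirely by length_merge, which picks the shortest or longest non-null value for a field across all sources. A tiny self-contained illustration of that rule, using invented stand-in records rather than calibre's Metadata class:

class Rec(object):  # stand-in for a plugin result, not calibre's Metadata
    def __init__(self, title, authors):
        self.title = title
        self.authors = authors
    def is_null(self, attr):
        return getattr(self, attr) in (None, '', [])

def length_merge(attr, results, null_value=None, shortest=True):
    # Same logic as ISBNMerge.length_merge above, as a free function
    values = [getattr(x, attr) for x in results if not x.is_null(attr)]
    values = [x for x in values if len(x) > 0]
    if not values:
        return null_value
    values.sort(key=len, reverse=not shortest)
    return values[0]

recs = [Rec('War and Peace (Penguin Classics)', ['Leo Tolstoy']),
        Rec('War and Peace', ['Leo Tolstoy', 'Louise Maude (trans.)'])]
print(length_merge('title', recs))                   # shortest title wins
print(length_merge('authors', recs, shortest=False)) # longest author list wins

This is why the comments in merge() talk about "cruft": a shorter title is assumed to lack edition/marketing noise, while a longer author list may carry a useful editor or translator.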