F-Secure by louhike and more work on the new metadata download system

Kovid Goyal 2011-04-04 08:02:28 -06:00
parent 492d16e5c9
commit 6be7471d2e
4 changed files with 186 additions and 15 deletions

recipes/f_secure.recipe (new file)

@@ -0,0 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1301860159(BasicNewsRecipe):
    title                 = u'F-Secure Weblog'
    __author__            = 'louhike'
    description           = u'All the news from the weblog of F-Secure'
    publisher             = u'F-Secure'
    language              = 'en'
    timefmt               = ' [%a, %d %b, %Y]'
    encoding              = 'ISO-8859-1'
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    remove_javascript     = True

    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
    remove_tags    = [dict(name='a'), dict(name='hr')]

    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]

    def get_cover_url(self):
        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
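A contributed recipe like this can be sanity-checked from the command line before committing: calibre's converter accepts .recipe files directly, e.g. ebook-convert f_secure.recipe out.epub --test, where --test limits the fetch to a couple of articles from at most two feeds so a broken selector shows up quickly.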

src/calibre/ebooks/metadata/sources/amazon.py

@@ -282,6 +282,7 @@ class Amazon(Source):
    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
        'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
    has_html_comments = True

    AMAZON_DOMAINS = {
            'com': _('US'),
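Setting has_html_comments here matters because of the identify() change in the last hunk below: when the txt_comments preference is on, comments from plugins that declare this flag are downgraded with calibre's html2text. A minimal sketch of that conversion (the sample markup is invented, not from this commit):

    from calibre.utils.html2text import html2text

    # Invented sample of the HTML-formatted review text a plugin with
    # has_html_comments = True might return.
    comments = '<p>A <b>gripping</b> read.<br/>Highly recommended.</p>'

    # identify() applies exactly this downgrade when msprefs['txt_comments']
    # is True; the result is plain text suitable for text-only output formats.
    print(html2text(comments))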

src/calibre/ebooks/metadata/sources/base.py

@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
from calibre.ebooks.metadata import check_isbn

msprefs = JSONConfig('metadata_sources.json')
msprefs.defaults['txt_comments'] = False
msprefs.defaults['ignore_fields'] = []
msprefs.defaults['max_tags'] = 10

def create_log(ostream=None):
    log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -104,6 +107,9 @@ class Source(Plugin):
    #: during the identify phase
    touched_fields = frozenset()

    #: Set this to True if your plugin returns HTML formatted comments
    has_html_comments = False

    def __init__(self, *args, **kwargs):
        Plugin.__init__(self, *args, **kwargs)
        self._isbn_to_identifier_cache = {}
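msprefs is a JSONConfig, calibre's dict-like store persisted to metadata_sources.json in the config directory, so the new preferences can be tweaked by plain item assignment; absent keys fall back to the defaults registered above. A small sketch (the chosen values are illustrative only):

    from calibre.ebooks.metadata.sources.base import msprefs

    # JSONConfig acts like a dict; assignments are written back to
    # metadata_sources.json and survive across runs.
    msprefs['txt_comments'] = True       # downgrade HTML comments to text
    msprefs['ignore_fields'] = ['tags']  # blank these fields on all results
    msprefs['max_tags'] = 5              # cap merged tag lists

    print(msprefs['max_tags'])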

src/calibre/ebooks/metadata/sources/identify.py

@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import time
from datetime import datetime
from Queue import Queue, Empty
from threading import Thread
from io import BytesIO
from operator import attrgetter

from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
from calibre.ebooks.metadata.xisbn import xisbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import utc_tz
from calibre.utils.html2text import html2text

# How long to wait for more results after first result is found
WAIT_AFTER_FIRST_RESULT = 30 # seconds
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
    log('Merging results from different sources and finding earliest',
        'publication dates')
    start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
    log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))

    if msprefs['txt_comments']:
        for r in results:
            if r.plugin.has_html_comments and r.comments:
                r.comments = html2text(r.comments)

    dummy = Metadata(_('Unknown'))
    max_tags = msprefs['max_tags']
    for f in msprefs['ignore_fields']:
        for r in results:
            setattr(r, f, getattr(dummy, f))
    for r in results:
        r.tags = r.tags[:max_tags]

    return results


class ISBNMerge(object):

    def __init__(self):
        self.pools = {}
        self.isbnless_results = []

    def isbn_in_pool(self, isbn):
        if isbn:
@@ -140,7 +161,9 @@ class ISBNMerge(object):
                return True
        return False

-    def add_result(self, result, isbn):
+    def add_result(self, result):
+        isbn = result.isbn
        if isbn:
            pool = self.isbn_in_pool(isbn)
            if pool is None:
                isbns, min_year = xisbn.get_isbn_pool(isbn)

@@ -150,12 +173,131 @@ class ISBNMerge(object):
            if not self.pool_has_result_from_same_source(pool, result):
                pool[1].append(result)
        else:
            self.isbnless_results.append(result)
    def finalize(self):
        has_isbn_result = False
        for results in self.pools.itervalues():
            if results:
                has_isbn_result = True
                break
        self.has_isbn_result = has_isbn_result
        if has_isbn_result:
            self.merge_isbn_results()
        else:
            self.results = sorted(self.isbnless_results,
                key=attrgetter('relevance_in_source'))
        return self.results

    def merge_isbn_results(self):
        self.results = []
        for min_year, results in self.pools.itervalues():
            if results:
                self.results.append(self.merge(results, min_year))
        self.results.sort(key=attrgetter('average_source_relevance'))

    def length_merge(self, attr, results, null_value=None, shortest=True):
        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
        values = [x for x in values if len(x) > 0]
        if not values:
            return null_value
        values.sort(key=len, reverse=not shortest)
        return values[0]

    def random_merge(self, attr, results, null_value=None):
        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
        return values[0] if values else null_value
    def merge(self, results, min_year):
        ans = Metadata(_('Unknown'))

        # We assume the shortest title has the least cruft in it
        ans.title = self.length_merge('title', results, null_value=ans.title)

        # No harm in having extra authors, maybe something useful like an
        # editor or translator
        ans.authors = self.length_merge('authors', results,
                null_value=ans.authors, shortest=False)

        # We assume the shortest publisher has the least cruft in it
        ans.publisher = self.length_merge('publisher', results,
                null_value=ans.publisher)

        # We assume the smallest set of tags has the least cruft in it
        ans.tags = self.length_merge('tags', results,
                null_value=ans.tags)

        # We assume the longest series has the most info in it
        ans.series = self.length_merge('series', results,
                null_value=ans.series, shortest=False)
        for r in results:
            if r.series and r.series == ans.series:
                ans.series_index = r.series_index
                break

        # Average the rating over all sources
        ratings = []
        for r in results:
            rating = r.rating
            if rating and rating > 0 and rating <= 5:
                ratings.append(rating)
        if ratings:
            ans.rating = sum(ratings)/len(ratings)

        # Smallest language is likely to be valid
        ans.language = self.length_merge('language', results,
                null_value=ans.language)

        # Choose longest comments
        ans.comments = self.length_merge('comments', results,
                null_value=ans.comments, shortest=False)

        # Published date
        if min_year:
            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
            ans.pubdate = min_date
        else:
            min_date = datetime(9999, 1, 1, tzinfo=utc_tz)
            for r in results:
                if r.pubdate is not None and r.pubdate < min_date:
                    min_date = r.pubdate
            if min_date.year < 9999:
                ans.pubdate = min_date

        # Identifiers
        for r in results:
            ans.identifiers.update(r.identifiers)

        # Merge any other fields with no special handling (random merge)
        touched_fields = set()
        for r in results:
            touched_fields |= r.plugin.touched_fields
        for f in touched_fields:
            if f.startswith('identifier:') or not ans.is_null(f):
                continue
            setattr(ans, f, self.random_merge(f, results,
                null_value=getattr(ans, f)))

        avg = [x.relevance_in_source for x in results]
        avg = sum(avg)/len(avg)
        ans.average_source_relevance = avg

        return ans
def merge_identify_results(result_map, log):
    isbn_merge = ISBNMerge()
    for plugin, results in result_map.iteritems():
        for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
-                isbn_merge.add_result(result, isbn)
+            isbn_merge.add_result(result)

    return isbn_merge.finalize()
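To make the merge policy concrete, here is a self-contained sketch of the length-based heuristics, using a hypothetical SimpleResult class in place of calibre's Metadata so it runs without calibre installed:

    # SimpleResult is a hypothetical stand-in for one plugin's result.
    class SimpleResult(object):
        def __init__(self, title, authors, tags):
            self.title, self.authors, self.tags = title, authors, tags

    def length_merge(attr, results, shortest=True):
        # Same idea as ISBNMerge.length_merge: keep the shortest (or
        # longest) non-empty value seen across sources.
        values = [getattr(r, attr) for r in results if getattr(r, attr)]
        values.sort(key=len, reverse=not shortest)
        return values[0] if values else None

    a = SimpleResult('The Hobbit: 75th Anniversary Edition',
            ['J. R. R. Tolkien'], ['Fantasy', 'Classics', 'Adventure'])
    b = SimpleResult('The Hobbit',
            ['J. R. R. Tolkien', 'Christopher Tolkien (editor)'],
            ['Fantasy'])

    print(length_merge('title', [a, b]))     # shortest title: 'The Hobbit'
    print(length_merge('authors', [a, b], shortest=False))  # longer author list
    print(length_merge('tags', [a, b]))      # smallest tag set: ['Fantasy']

The same shortest/longest switch drives the publisher, series, language and comments choices in merge() above.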