Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-06-23 15:30:45 -04:00

F-Secure by louhike and more work on the new metadata download system

commit 6be7471d2e
parent 492d16e5c9

recipes/f_secure.recipe (new file, 22 lines added)
--- /dev/null
+++ b/recipes/f_secure.recipe
@@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1301860159(BasicNewsRecipe):
+    title = u'F-Secure Weblog'
+    language = 'en'
+    __author__ = 'louhike'
+    description = u'All the news from the weblog of F-Secure'
+    publisher = u'F-Secure'
+    timefmt = ' [%a, %d %b, %Y]'
+    encoding = 'ISO-8859-1'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    language = 'en_EN'
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
+    remove_tags = [dict(name='a'),dict(name='hr')]
+
+    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]
+    def get_cover_url(self):
+        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
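The recipe relies on calibre's BasicNewsRecipe machinery; its keep_only_tags/remove_tags entries amount to the kind of HTML filtering sketched below. This is only an illustration of the intent, assuming BeautifulSoup (bs4) is available; it is not the code calibre itself runs.

# Illustrative sketch of what keep_only_tags/remove_tags request for each
# article page. Assumes bs4 is installed; calibre's own pipeline differs.
from bs4 import BeautifulSoup

def extract_article_body(html):
    soup = BeautifulSoup(html, 'html.parser')
    # keep_only_tags: keep just the div with class 'modSectionTd2'
    body = soup.find('div', attrs={'class': 'modSectionTd2'})
    if body is None:
        return ''
    # remove_tags: drop links and horizontal rules inside the kept block
    for tag in body.find_all(['a', 'hr']):
        tag.decompose()
    return body.get_text('\n').strip()

print(extract_article_body(
    '<div class="modSectionTd2">Post text <a href="#">link</a><hr/></div>'))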
@@ -282,6 +282,7 @@ class Amazon(Source):
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
         'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
+    has_html_comments = True

     AMAZON_DOMAINS = {
             'com': _('US'),
@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
 from calibre.ebooks.metadata import check_isbn

 msprefs = JSONConfig('metadata_sources.json')
+msprefs.defaults['txt_comments'] = False
+msprefs.defaults['ignore_fields'] = []
+msprefs.defaults['max_tags'] = 10

 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
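These msprefs entries are consumed later in identify(). The behaviour relied on is just "defaults apply until the user stores an override"; a minimal stand-in sketch of that pattern (not calibre's actual JSONConfig, which also persists the values to metadata_sources.json):

# Minimal stand-in for the defaults-then-override behaviour assumed above.
class PrefsSketch(dict):
    def __init__(self):
        dict.__init__(self)
        self.defaults = {}

    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            return self.defaults[key]

msprefs = PrefsSketch()
msprefs.defaults['txt_comments'] = False
msprefs.defaults['ignore_fields'] = []
msprefs.defaults['max_tags'] = 10

assert msprefs['max_tags'] == 10   # default until the user overrides it
msprefs['max_tags'] = 5
assert msprefs['max_tags'] == 5    # stored value wins over the default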
@@ -104,6 +107,9 @@ class Source(Plugin):
     #: during the identify phase
     touched_fields = frozenset()

+    #: Set this to True if your plugin return HTML formatted comments
+    has_html_comments = False
+
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import time
+from datetime import datetime
 from Queue import Queue, Empty
 from threading import Thread
 from io import BytesIO
+from operator import attrgetter

 from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
 from calibre.ebooks.metadata.xisbn import xisbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import utc_tz
+from calibre.utils.html2text import html2text

 # How long to wait for more results after first result is found
 WAIT_AFTER_FIRST_RESULT = 30 # seconds
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
     log('Merging results from different sources and finding earliest',
             'publication dates')
     start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
     log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    max_tags = msprefs['max_tags']
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+            r.tags = r.tags[:max_tags]
+
+    return results


 class ISBNMerge(object):

     def __init__(self):
         self.pools = {}
         self.isbnless_results = []

     def isbn_in_pool(self, isbn):
         if isbn:
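The new tail of identify() post-processes the merged results: optionally flattening HTML comments to plain text, blanking user-ignored fields, and capping the tag list. A toy sketch of that cleanup, with an assumed SimpleResult class and a crude html2text stand-in (neither is a calibre API):

# Toy sketch of the post-merge cleanup performed by identify() above.
import re

def html2text_sketch(html):
    # crude stand-in for calibre.utils.html2text: drop tags, keep the text
    return re.sub(r'<[^>]+>', '', html)

class SimpleResult(object):
    # assumed toy result object; real results are calibre Metadata instances
    def __init__(self):
        self.comments = '<p>Great <b>book</b></p>'
        self.series = 'Some Series'
        self.tags = ['a', 'b', 'c', 'd']

r = SimpleResult()
txt_comments, ignore_fields, max_tags = True, ['series'], 2

if txt_comments and r.comments:
    r.comments = html2text_sketch(r.comments)  # flatten HTML comments to text
for f in ignore_fields:
    setattr(r, f, None)                        # blank out user-ignored fields
r.tags = r.tags[:max_tags]                     # keep at most max_tags tags

print(r.comments, r.series, r.tags)            # e.g. Great book None ['a', 'b']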
@@ -140,22 +161,143 @@ class ISBNMerge(object):
             return True
         return False

-    def add_result(self, result, isbn):
-        pool = self.isbn_in_pool(isbn)
-        if pool is None:
-            isbns, min_year = xisbn.get_isbn_pool(isbn)
-            if not isbns:
-                isbns = frozenset([isbn])
-            self.pools[isbns] = pool = (min_year, [])
-
-        if not self.pool_has_result_from_same_source(pool, result):
-            pool[1].append(result)
+    def add_result(self, result):
+        isbn = result.isbn
+        if isbn:
+            pool = self.isbn_in_pool(isbn)
+            if pool is None:
+                isbns, min_year = xisbn.get_isbn_pool(isbn)
+                if not isbns:
+                    isbns = frozenset([isbn])
+                self.pools[isbns] = pool = (min_year, [])
+
+            if not self.pool_has_result_from_same_source(pool, result):
+                pool[1].append(result)
+        else:
+            self.isbnless_results.append(result)
+
+    def finalize(self):
+        has_isbn_result = False
+        for results in self.pools.itervalues():
+            if results:
+                has_isbn_result = True
+                break
+        self.has_isbn_result = has_isbn_result
+
+        if has_isbn_result:
+            self.merge_isbn_results()
+        else:
+            self.results = sorted(self.isbnless_results,
+                key=attrgetter('relevance_in_source'))
+
+        return self.results
+
+    def merge_isbn_results(self):
+        self.results = []
+        for min_year, results in self.pools.itervalues():
+            if results:
+                self.results.append(self.merge(results, min_year))
+
+        self.results.sort(key=attrgetter('average_source_relevance'))
+
+    def length_merge(self, attr, results, null_value=None, shortest=True):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        values = [x for x in values if len(x) > 0]
+        if not values:
+            return null_value
+        values.sort(key=len, reverse=not shortest)
+        return values[0]
+
+    def random_merge(self, attr, results, null_value=None):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        return values[0] if values else null_value
+
+    def merge(self, results, min_year):
+        ans = Metadata(_('Unknown'))
+
+        # We assume the shortest title has the least cruft in it
+        ans.title = self.length_merge('title', results, null_value=ans.title)
+
+        # No harm in having extra authors, maybe something useful like an
+        # editor or translator
+        ans.authors = self.length_merge('authors', results,
+                null_value=ans.authors, shortest=False)
+
+        # We assume the shortest publisher has the least cruft in it
+        ans.publisher = self.length_merge('publisher', results,
+                null_value=ans.publisher)
+
+        # We assume the smallest set of tags has the least cruft in it
+        ans.tags = self.length_merge('tags', results,
+                null_value=ans.tags)
+
+        # We assume the longest series has the most info in it
+        ans.series = self.length_merge('series', results,
+                null_value=ans.series, shortest=False)
+        for r in results:
+            if r.series and r.series == ans.series:
+                ans.series_index = r.series_index
+                break
+
+        # Average the rating over all sources
+        ratings = []
+        for r in results:
+            rating = r.rating
+            if rating and rating > 0 and rating <= 5:
+                ratings.append(rating)
+        if ratings:
+            ans.rating = sum(ratings)/len(ratings)
+
+        # Smallest language is likely to be valid
+        ans.language = self.length_merge('language', results,
+                null_value=ans.language)
+
+        # Choose longest comments
+        ans.comments = self.length_merge('comments', results,
+                null_value=ans.comments, shortest=False)
+
+        # Published date
+        if min_year:
+            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
+            ans.pubdate = min_date
+        else:
+            min_date = datetime(10000, 1, 1, tzinfo=utc_tz)
+            for r in results:
+                if r.pubdate is not None and r.pubdate < min_date:
+                    min_date = r.pubdate
+            if min_date.year < 10000:
+                ans.pubdate = min_date
+
+        # Identifiers
+        for r in results:
+            ans.identifiers.update(r.identifiers)
+
+        # Merge any other fields with no special handling (random merge)
+        touched_fields = set()
+        for r in results:
+            touched_fields |= r.plugin.touched_fields
+
+        for f in touched_fields:
+            if f.startswith('identifier:') or not ans.is_null(f):
+                continue
+            setattr(ans, f, self.random_merge(f, results,
+                null_value=getattr(ans, f)))
+
+        avg = [x.relevance_in_source for x in results]
+        avg = sum(avg)/len(avg)
+        ans.average_source_relevance = avg
+
+        return ans


 def merge_identify_results(result_map, log):
     isbn_merge = ISBNMerge()
     for plugin, results in result_map.iteritems():
         for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
             isbn_merge.add_result(result)

     return isbn_merge.finalize()
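Most of the merging above leans on the length_merge() heuristic: take the shortest non-empty value where extra text is probably cruft (title, publisher, tags) and the longest where more is likely useful (authors, series, comments). A standalone sketch of that heuristic on plain values, illustrative only; the real method works on calibre Metadata objects and their is_null() checks.

# Standalone sketch of the length_merge heuristic used by ISBNMerge.merge().
def length_merge(values, null_value=None, shortest=True):
    candidates = [v for v in values if v]        # skip null/empty values
    if not candidates:
        return null_value
    candidates.sort(key=len, reverse=not shortest)
    return candidates[0]                         # shortest (or longest) value

titles = ['A Study in Scarlet (Illustrated Edition)', 'A Study in Scarlet', '']
comments = ['Short blurb.', 'A much longer, more complete description.']

print(length_merge(titles))                      # 'A Study in Scarlet'
print(length_merge(comments, shortest=False))    # the longer description wins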