mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-07 10:14:46 -04:00)

commit 6be7471d2e (parent 492d16e5c9)
F-Secure by louhike and more work on the new metadata download system

recipes/f_secure.recipe (new file, 22 lines)
@@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1301860159(BasicNewsRecipe):
+    title = u'F-Secure Weblog'
+    language = 'en'
+    __author__ = 'louhike'
+    description = u'All the news from the weblog of F-Secure'
+    publisher = u'F-Secure'
+    timefmt = ' [%a, %d %b, %Y]'
+    encoding = 'ISO-8859-1'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    language = 'en_EN'
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
+    remove_tags = [dict(name='a'), dict(name='hr')]
+
+    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]
+    def get_cover_url(self):
+        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
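The recipe above is a standard BasicNewsRecipe subclass: keep_only_tags narrows each page to the weblog body, remove_tags strips links and rules, and get_cover_url supplies the F-Secure logo as the cover. As a usage note (assuming a local calibre install; --test and -vv are calibre's documented recipe-testing options), a recipe file like this can be built standalone before being bundled:

    ebook-convert f_secure.recipe output.epub --test -vv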
src/calibre/ebooks/metadata/sources/amazon.py

@@ -282,6 +282,7 @@ class Amazon(Source):
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
         'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
+    has_html_comments = True
 
     AMAZON_DOMAINS = {
         'com': _('US'),
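For context on the class attributes in play here: a metadata source plugin advertises what it handles via capabilities and touched_fields, and, with this commit, whether its comments are HTML via has_html_comments. A minimal hypothetical plugin (names invented for illustration) would look like:

    from calibre.ebooks.metadata.sources.base import Source

    class ExampleSource(Source):          # hypothetical, for illustration
        name = 'Example'
        capabilities = frozenset(['identify'])
        touched_fields = frozenset(['title', 'authors', 'identifier:isbn'])
        has_html_comments = False         # default; Amazon sets True above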
src/calibre/ebooks/metadata/sources/base.py

@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
 from calibre.ebooks.metadata import check_isbn
 
 msprefs = JSONConfig('metadata_sources.json')
+msprefs.defaults['txt_comments'] = False
+msprefs.defaults['ignore_fields'] = []
+msprefs.defaults['max_tags'] = 10
 
 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
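The three new preferences live in the JSONConfig-backed msprefs and fall back to these registered defaults until the user changes them. A quick sketch of reading them:

    from calibre.ebooks.metadata.sources.base import msprefs

    msprefs['txt_comments']   # False: leave comments in whatever format the source used
    msprefs['ignore_fields']  # []: no fields are discarded from results
    msprefs['max_tags']       # 10: cap on tags kept per merged result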
@@ -104,6 +107,9 @@ class Source(Plugin):
     #: during the identify phase
     touched_fields = frozenset()
 
+    #: Set this to True if your plugin returns HTML formatted comments
+    has_html_comments = False
+
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
src/calibre/ebooks/metadata/sources/identify.py

@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 import time
+from datetime import datetime
 from Queue import Queue, Empty
 from threading import Thread
 from io import BytesIO
+from operator import attrgetter
 
 from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
 from calibre.ebooks.metadata.xisbn import xisbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import utc_tz
+from calibre.utils.html2text import html2text
 
 # How long to wait for more results after first result is found
 WAIT_AFTER_FIRST_RESULT = 30 # seconds
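html2text is used below to flatten HTML comments to plain text when the txt_comments preference is set. Roughly (the exact output formatting is markdown-flavoured text; the rendering shown is an assumption, not a guarantee):

    from calibre.utils.html2text import html2text

    html2text('<p>A <b>great</b> read</p>')   # -> something like 'A **great** read'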
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
     log('Merging results from different sources and finding earliest',
         'publication dates')
     start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
     log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))
 
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    max_tags = msprefs['max_tags']
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+            r.tags = r.tags[:max_tags]
+
+    return results
+
+
 class ISBNMerge(object):
 
     def __init__(self):
         self.pools = {}
+        self.isbnless_results = []
 
     def isbn_in_pool(self, isbn):
         if isbn:
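The ignore_fields handling works by reading null values off a freshly constructed Metadata object. The same reset-via-dummy pattern in plain Python (illustration only, types invented):

    class Meta(object):
        def __init__(self, title):
            self.title = title
            self.publisher = None   # null value
            self.tags = []

    dummy = Meta('Unknown')
    r = Meta('Some Book')
    r.publisher = 'Acme'
    r.tags = ['a', 'b', 'c', 'd']

    for f in ['publisher']:                  # user-configured ignore_fields
        setattr(r, f, getattr(dummy, f))     # reset ignored field to its null value
    r.tags = r.tags[:2]                      # clip to max_tags (2 here)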
@@ -140,7 +161,9 @@ class ISBNMerge(object):
                 return True
         return False
 
-    def add_result(self, result, isbn):
+    def add_result(self, result):
+        isbn = result.isbn
+        if isbn:
             pool = self.isbn_in_pool(isbn)
             if pool is None:
                 isbns, min_year = xisbn.get_isbn_pool(isbn)
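A pool keys every known representation of a work to one entry, so results that report the ISBN-10 and the ISBN-13 of the same book still get merged together. Conceptually (the ISBN pair is just an example of two forms of one identifier):

    # self.pools maps frozenset-of-related-ISBNs -> (min_year, [results])
    # self.pools[frozenset(['0441172717', '9780441172719'])] = (min_year, [r1, r2])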
@@ -150,12 +173,131 @@ class ISBNMerge(object):
 
             if not self.pool_has_result_from_same_source(pool, result):
                 pool[1].append(result)
+        else:
+            self.isbnless_results.append(result)
 
+    def finalize(self):
+        has_isbn_result = False
+        for results in self.pools.itervalues():
+            if results:
+                has_isbn_result = True
+                break
+        self.has_isbn_result = has_isbn_result
+
+        if has_isbn_result:
+            self.merge_isbn_results()
+        else:
+            self.results = sorted(self.isbnless_results,
+                key=attrgetter('relevance_in_source'))
+
+        return self.results
+
+    def merge_isbn_results(self):
+        self.results = []
+        for min_year, results in self.pools.itervalues():
+            if results:
+                self.results.append(self.merge(results, min_year))
+
+        self.results.sort(key=attrgetter('average_source_relevance'))
+
+    def length_merge(self, attr, results, null_value=None, shortest=True):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        values = [x for x in values if len(x) > 0]
+        if not values:
+            return null_value
+        values.sort(key=len, reverse=not shortest)
+        return values[0]
+
+    def random_merge(self, attr, results, null_value=None):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        return values[0] if values else null_value
+
+    def merge(self, results, min_year):
+        ans = Metadata(_('Unknown'))
+
+        # We assume the shortest title has the least cruft in it
+        ans.title = self.length_merge('title', results, null_value=ans.title)
+
+        # No harm in having extra authors, maybe something useful like an
+        # editor or translator
+        ans.authors = self.length_merge('authors', results,
+                null_value=ans.authors, shortest=False)
+
+        # We assume the shortest publisher has the least cruft in it
+        ans.publisher = self.length_merge('publisher', results,
+                null_value=ans.publisher)
+
+        # We assume the smallest set of tags has the least cruft in it
+        ans.tags = self.length_merge('tags', results,
+                null_value=ans.tags)
+
+        # We assume the longest series has the most info in it
+        ans.series = self.length_merge('series', results,
+                null_value=ans.series, shortest=False)
+        for r in results:
+            if r.series and r.series == ans.series:
+                ans.series_index = r.series_index
+                break
+
+        # Average the rating over all sources
+        ratings = []
+        for r in results:
+            rating = r.rating
+            if rating and rating > 0 and rating <= 5:
+                ratings.append(rating)
+        if ratings:
+            ans.rating = sum(ratings)/len(ratings)
+
+        # Smallest language is likely to be valid
+        ans.language = self.length_merge('language', results,
+                null_value=ans.language)
+
+        # Choose longest comments
+        ans.comments = self.length_merge('comments', results,
+                null_value=ans.comments, shortest=False)
+
+        # Published date
+        if min_year:
+            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
+            ans.pubdate = min_date
+        else:
+            min_date = datetime(10000, 1, 1, tzinfo=utc_tz)
+            for r in results:
+                if r.pubdate is not None and r.pubdate < min_date:
+                    min_date = r.pubdate
+            if min_date.year < 10000:
+                ans.pubdate = min_date
+
+        # Identifiers
+        for r in results:
+            ans.identifiers.update(r.identifiers)
+
+        # Merge any other fields with no special handling (random merge)
+        touched_fields = set()
+        for r in results:
+            touched_fields |= r.plugin.touched_fields
+
+        for f in touched_fields:
+            if f.startswith('identifier:') or not ans.is_null(f):
+                continue
+            setattr(ans, f, self.random_merge(f, results,
+                null_value=getattr(ans, f)))
+
+        avg = [x.relevance_in_source for x in results]
+        avg = sum(avg)/len(avg)
+        ans.average_source_relevance = avg
+
+        return ans
+
+
 def merge_identify_results(result_map, log):
+    isbn_merge = ISBNMerge()
     for plugin, results in result_map.iteritems():
         for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
+            isbn_merge.add_result(result)
 
+    return isbn_merge.finalize()
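To make the merge heuristics above concrete, a small sketch with hypothetical values:

    # length_merge with shortest=True keeps the least-cruft value:
    #   ['Dune', 'Dune (40th Anniversary Edition)']  -> 'Dune'
    # with shortest=False it keeps the most informative (longest) one:
    #   ['A classic.', 'A sweeping epic of politics and ecology.']
    #   -> 'A sweeping epic of politics and ecology.'
    # random_merge simply takes the first non-null value any source supplied.

With the new entry point, merging a result map reduces to one ISBNMerge().add_result() call per result followed by finalize(), which returns merged Metadata objects ordered by average source relevance.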