identify(): Merge results with identical title and authors that aren't matched by xISBN

This commit is contained in:
Kovid Goyal 2011-04-06 00:04:27 -06:00
parent 6773cf71af
commit 62b1ae9176
2 changed files with 47 additions and 8 deletions

View File

@ -89,7 +89,7 @@ def main(args=sys.argv):
print (log, file=sys.stderr)
print (result)
if not opts.opf:
if not opts.opf and opts.cover:
prints('Cover :', cf)
return 0

View File

@ -20,6 +20,7 @@ from calibre.ebooks.metadata.xisbn import xisbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import utc_tz
from calibre.utils.html2text import html2text
from calibre.utils.icu import lower
# Download worker {{{
class Worker(Thread):
@ -97,11 +98,45 @@ class ISBNMerge(object):
if has_isbn_result:
self.merge_isbn_results()
else:
self.results = sorted(self.isbnless_results,
results = sorted(self.isbnless_results,
key=attrgetter('relevance_in_source'))
# Pick only the most relevant result from each source
self.results = []
seen = set()
for result in results:
if result.identify_plugin not in seen:
seen.add(result.identify_plugin)
self.results.append(result)
result.average_source_relevance = \
result.relevance_in_source
self.merge_metadata_results()
return self.results
def merge_metadata_results(self):
' Merge results with identical title and authors '
# Operates in place on self.results and returns nothing.
# NOTE(review): this view of the file has lost its indentation, so the
# nesting described in the comments below is inferred from the statements
# themselves -- confirm against the real source before relying on it.
#
# Step 1: bucket the current results by a (lower-cased title, tuple of
# lower-cased authors) key, so results that differ only in letter case
# share a bucket.
groups = {}
for result in self.results:
# A missing or empty title is keyed as the empty string.
title = lower(result.title if result.title else '')
# The author list is turned into a tuple so it can be part of a dict key.
# Author order is preserved, so the same names in a different order
# produce a different key.
key = (title, tuple([lower(x) for x in result.authors]))
if key not in groups:
groups[key] = []
groups[key].append(result)
# Step 2: fewer buckets than results means at least one bucket holds more
# than one result, i.e. there is something to merge.
if len(groups) != len(self.results):
self.results = []
# NOTE(review): iteration order of a plain dict is arbitrary here; the
# final sort below is what determines the order of self.results.
for rgroup in groups.itervalues():
rel = [r.average_source_relevance for r in rgroup]
if len(rgroup) > 1:
# do_asr=False is passed so that merge() does not compute
# average_source_relevance itself; it is assigned on the next line
# from the group's existing average_source_relevance values instead.
result = self.merge(rgroup, None, do_asr=False)
# NOTE(review): if the relevance values are ints and this module does
# not enable true division, this is floor division under Python 2 --
# confirm.
result.average_source_relevance = sum(rel)/len(rel)
else:
# A bucket with a single result needs no merging.
result = rgroup[0]
self.results.append(result)
# Order the results by ascending average_source_relevance.
# NOTE(review): cannot tell from this view whether the sort is inside the
# `if` above or runs unconditionally -- verify.
self.results.sort(key=attrgetter('average_source_relevance'))
def merge_isbn_results(self):
self.results = []
for min_year, results in self.pools.itervalues():
@ -122,7 +157,7 @@ class ISBNMerge(object):
values = [getattr(x, attr) for x in results if not x.is_null(attr)]
return values[0] if values else null_value
def merge(self, results, min_year):
def merge(self, results, min_year, do_asr=True):
ans = Metadata(_('Unknown'))
# We assume the shortest title has the least cruft in it
@ -185,6 +220,7 @@ class ISBNMerge(object):
# Merge any other fields with no special handling (random merge)
touched_fields = set()
for r in results:
if hasattr(r, 'identify_plugin'):
touched_fields |= r.identify_plugin.touched_fields
for f in touched_fields:
@ -193,6 +229,7 @@ class ISBNMerge(object):
setattr(ans, f, self.random_merge(f, results,
null_value=getattr(ans, f)))
if do_asr:
avg = [x.relevance_in_source for x in results]
avg = sum(avg)/len(avg)
ans.average_source_relevance = avg
@ -210,7 +247,8 @@ def merge_identify_results(result_map, log):
# }}}
def identify(log, abort, title=None, authors=None, identifiers={}, timeout=30):
def identify(log, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
start_time = time.time()
plugins = list(metadata_plugins(['identify']))
@ -322,6 +360,7 @@ def identify(log, abort, title=None, authors=None, identifiers={}, timeout=30):
r.tags = r.tags[:max_tags]
return results
# }}}
if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e