Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

Commit 342b1b9985: Merge from main branch
recipes/f_secure.recipe (new file, 22 lines added)
@@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1301860159(BasicNewsRecipe):
+    title = u'F-Secure Weblog'
+    language = 'en'
+    __author__ = 'louhike'
+    description = u'All the news from the weblog of F-Secure'
+    publisher = u'F-Secure'
+    timefmt = ' [%a, %d %b, %Y]'
+    encoding = 'ISO-8859-1'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    language = 'en_EN'
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
+    remove_tags = [dict(name='a'),dict(name='hr')]
+
+    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]
+    def get_cover_url(self):
+        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
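A quick way to sanity-check what a recipe like this will pull in, before running it through calibre, is to parse the same RSS feed directly. A minimal standalone sketch using only the Python 2 stdlib (the feed URL is the one from the recipe; network access is assumed, and calibre itself does none of this by hand):

    import urllib2
    from xml.etree import cElementTree as ET

    # Fetch and parse the same feed the recipe subscribes to.
    feed = urllib2.urlopen('http://www.f-secure.com/weblog/weblog.rss').read()
    root = ET.fromstring(feed)
    for item in root.iter('item'):   # one <item> per weblog post
        print item.findtext('title'), '->', item.findtext('link')

Inside calibre, BasicNewsRecipe handles the fetching itself and applies keep_only_tags/remove_tags to each linked article page.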
@@ -1,3 +1,4 @@
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class TimesOfIndia(BasicNewsRecipe):
@@ -8,10 +9,10 @@ class TimesOfIndia(BasicNewsRecipe):
     max_articles_per_feed = 25

     no_stylesheets = True
-    keep_only_tags = [dict(attrs={'class':'maintable12'})]
+    keep_only_tags = [{'class':['maintable12', 'prttabl']}]
     remove_tags = [
         dict(style=lambda x: x and 'float' in x),
-        dict(attrs={'class':'prvnxtbg'}),
+        {'class':['prvnxtbg', 'footbdrin', 'bcclftr']},
         ]

     feeds = [
@@ -38,8 +39,28 @@ class TimesOfIndia(BasicNewsRecipe):
       ('Most Read',
        'http://timesofindia.indiatimes.com/rssfeedmostread.cms')
       ]
-    def print_version(self, url):
-        return url + '?prtpage=1'
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if '/0Ltimesofindia' in url:
+            url = url.partition('/0L')[-1]
+            url = url.replace('0B', '.').replace('0N', '.com').replace('0C',
+                    '/').replace('0E', '-')
+            url = 'http://' + url.rpartition('/')[0]
+            match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url)
+            if match is not None:
+                num = match.group(1)
+                num = re.sub(r'[^0-9]', '', num)
+                return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
+                        num)
+        else:
+            cms = re.search(r'/(\d+)\.cms', url)
+            if cms is not None:
+                return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
+                        cms.group(1))
+
+        return url
+

     def preprocess_html(self, soup):
         return soup
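The get_article_url() rewrite exists because some feed entries arrive with the real link mangled into a proxy-style encoding where 0B, 0N, 0C and 0E stand in for '.', '.com', '/' and '-'. A worked example of the decoding path (the encoded URL below is made up for illustration, not captured from a real feed):

    import re

    url = 'http://feeds.example.com/~r/toi/0Ltimesofindia0Bindiatimes0N0Carticleshow0C1234560Bcms/story01.htm'
    url = url.partition('/0L')[-1]
    url = url.replace('0B', '.').replace('0N', '.com').replace('0C', '/').replace('0E', '-')
    url = 'http://' + url.rpartition('/')[0]
    # url is now 'http://timesofindia.indiatimes.com/articleshow/123456.cms'
    match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url)
    num = re.sub(r'[^0-9]', '', match.group(1))
    print 'http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' % num

Either branch funnels the article to the same print-friendly articleshow URL that print_version() used to produce by appending ?prtpage=1.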
@@ -244,7 +244,8 @@ class POCKETBOOK602(USBMS):
     BCD          = [0x0324]

     VENDOR_NAME  = ''
-    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['PB602', 'PB603', 'PB902', 'PB903']
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['PB602', 'PB603', 'PB902',
+            'PB903', 'PB']

 class POCKETBOOK701(USBMS):
@@ -282,6 +282,7 @@ class Amazon(Source):
    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
        'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
+   has_html_comments = True

    AMAZON_DOMAINS = {
        'com': _('US'),
@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
 from calibre.ebooks.metadata import check_isbn

 msprefs = JSONConfig('metadata_sources.json')
+msprefs.defaults['txt_comments'] = False
+msprefs.defaults['ignore_fields'] = []
+msprefs.defaults['max_tags'] = 10

 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -104,6 +107,9 @@ class Source(Plugin):
     #: during the identify phase
     touched_fields = frozenset()

+    #: Set this to True if your plugin returns HTML formatted comments
+    has_html_comments = False
+
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
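For context, JSONConfig (from calibre.utils.config) behaves, roughly, like a dict persisted to a JSON file in calibre's config directory, with lookups falling back to the .defaults dict until a user-set value exists. A small sketch of how the defaults registered above get consumed (the same pattern identify() uses below):

    from calibre.utils.config import JSONConfig

    prefs = JSONConfig('metadata_sources.json')   # same file as msprefs above
    prefs.defaults['max_tags'] = 10               # used until overridden on disk
    print prefs['max_tags']                       # -> 10 on a fresh install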
@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import time
+from datetime import datetime
 from Queue import Queue, Empty
 from threading import Thread
 from io import BytesIO
+from operator import attrgetter

 from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
 from calibre.ebooks.metadata.xisbn import xisbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import utc_tz
+from calibre.utils.html2text import html2text

 # How long to wait for more results after first result is found
 WAIT_AFTER_FIRST_RESULT = 30 # seconds
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
     log('Merging results from different sources and finding earliest',
             'publication dates')
     start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
     log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    max_tags = msprefs['max_tags']
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+            r.tags = r.tags[:max_tags]
+
+    return results
+

 class ISBNMerge(object):

     def __init__(self):
         self.pools = {}
+        self.isbnless_results = []

     def isbn_in_pool(self, isbn):
         if isbn:
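To make the new post-processing concrete, here is a minimal sketch of the ignore/clip step on stand-in objects rather than calibre's Metadata (all names below are illustrative only):

    class FakeResult(object):
        def __init__(self, publisher, tags):
            self.publisher, self.tags = publisher, tags

    results = [FakeResult('Acme Books', ['a', 'b', 'c', 'd'])]
    dummy = FakeResult(None, [])
    for f in ['publisher']:            # stands in for msprefs['ignore_fields']
        for r in results:
            setattr(r, f, getattr(dummy, f))   # blank out ignored fields
            r.tags = r.tags[:3]                # clip to msprefs['max_tags']
    print results[0].publisher, results[0].tags    # None ['a', 'b', 'c']

Note that the clipping happens inside the ignore_fields loop as committed, so with an empty ignore_fields list the tags are left unclipped.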
@@ -140,22 +161,143 @@ class ISBNMerge(object):
                 return True
         return False

-    def add_result(self, result, isbn):
-        pool = self.isbn_in_pool(isbn)
-        if pool is None:
-            isbns, min_year = xisbn.get_isbn_pool(isbn)
-            if not isbns:
-                isbns = frozenset([isbn])
-            self.pool[isbns] = pool = (min_year, [])
-
-        if not self.pool_has_result_from_same_source(pool, result):
-            pool[1].append(result)
+    def add_result(self, result):
+        isbn = result.isbn
+        if isbn:
+            pool = self.isbn_in_pool(isbn)
+            if pool is None:
+                isbns, min_year = xisbn.get_isbn_pool(isbn)
+                if not isbns:
+                    isbns = frozenset([isbn])
+                self.pool[isbns] = pool = (min_year, [])
+
+            if not self.pool_has_result_from_same_source(pool, result):
+                pool[1].append(result)
+        else:
+            self.isbnless_results.append(result)
+
+    def finalize(self):
+        has_isbn_result = False
+        for results in self.pools.itervalues():
+            if results:
+                has_isbn_result = True
+                break
+        self.has_isbn_result = has_isbn_result
+
+        if has_isbn_result:
+            self.merge_isbn_results()
+        else:
+            self.results = sorted(self.isbnless_results,
+                key=attrgetter('relevance_in_source'))
+
+        return self.results
+
+    def merge_isbn_results(self):
+        self.results = []
+        for min_year, results in self.pool.itervalues():
+            if results:
+                self.results.append(self.merge(results, min_year))
+
+        self.results.sort(key=attrgetter('average_source_relevance'))
+
+    def length_merge(self, attr, results, null_value=None, shortest=True):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        values = [x for x in values if len(x) > 0]
+        if not values:
+            return null_value
+        values.sort(key=len, reverse=not shortest)
+        return values[0]
+
+    def random_merge(self, attr, results, null_value=None):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        return values[0] if values else null_value
+
+    def merge(self, results, min_year):
+        ans = Metadata(_('Unknown'))
+
+        # We assume the shortest title has the least cruft in it
+        ans.title = self.length_merge('title', results, null_value=ans.title)
+
+        # No harm in having extra authors, maybe something useful like an
+        # editor or translator
+        ans.authors = self.length_merge('authors', results,
+                null_value=ans.authors, shortest=False)
+
+        # We assume the shortest publisher has the least cruft in it
+        ans.publisher = self.length_merge('publisher', results,
+                null_value=ans.publisher)
+
+        # We assume the smallest set of tags has the least cruft in it
+        ans.tags = self.length_merge('tags', results,
+                null_value=ans.tags)
+
+        # We assume the longest series has the most info in it
+        ans.series = self.length_merge('series', results,
+                null_value=ans.series, shortest=False)
+        for r in results:
+            if r.series and r.series == ans.series:
+                ans.series_index = r.series_index
+                break
+
+        # Average the rating over all sources
+        ratings = []
+        for r in results:
+            rating = r.rating
+            if rating and rating > 0 and rating <= 5:
+                ratings.append(rating)
+        if ratings:
+            ans.rating = sum(ratings)/len(ratings)
+
+        # Smallest language is likely to be valid
+        ans.language = self.length_merge('language', results,
+                null_value=ans.language)
+
+        # Choose longest comments
+        ans.comments = self.length_merge('comments', results,
+                null_value=ans.comments, shortest=False)
+
+        # Published date
+        if min_year:
+            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
+            ans.pubdate = min_date
+        else:
+            min_date = datetime(10000, 1, 1, tzinfo=utc_tz)
+            for r in results:
+                if r.pubdate is not None and r.pubdate < min_date:
+                    min_date = r.pubdate
+            if min_date.year < 10000:
+                ans.pubdate = min_date
+
+        # Identifiers
+        for r in results:
+            ans.identifiers.update(r.identifiers)
+
+        # Merge any other fields with no special handling (random merge)
+        touched_fields = set()
+        for r in results:
+            touched_fields |= r.plugin.touched_fields
+
+        for f in touched_fields:
+            if f.startswith('identifier:') or not ans.is_null(f):
+                continue
+            setattr(ans, f, self.random_merge(f, results,
+                null_value=getattr(ans, f)))
+
+        avg = [x.relevance_in_source for x in results]
+        avg = sum(avg)/len(avg)
+        ans.average_source_relevance = avg
+
+        return ans
+

 def merge_identify_results(result_map, log):
+    isbn_merge = ISBNMerge()
     for plugin, results in result_map.iteritems():
         for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
+            isbn_merge.add_result(result)
+
+    return isbn_merge.finalize()
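The heart of the merge policy is length_merge(): every commented heuristic above ("shortest title has the least cruft", "longest series has the most info") is just a choice of the shortest or longest non-empty candidate. A standalone sketch of the selection logic, outside the class:

    def length_merge(values, shortest=True):
        # Drop empty/None candidates, then pick by length.
        values = [x for x in values if x]
        if not values:
            return None
        values.sort(key=len, reverse=not shortest)
        return values[0]

    print length_merge(['The Title: Deluxe Annotated Edition', 'The Title'])
    # -> 'The Title' (shortest wins for titles)
    print length_merge([['Author'], ['Author', 'Translator']], shortest=False)
    # -> ['Author', 'Translator'] (longest wins for authors)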