Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

commit ff912773cf
Merge from trunk
@@ -1,7 +1,9 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Eddie Lau'
__copyright__ = '2010-2011, Eddie Lau'
'''
Change Log:
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
            clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested


from calibre import __appname__
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation

class MPHKRecipe(BasicNewsRecipe):
    IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view

    IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
    title = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__ = 'Eddie Lau'
    description = 'Hong Kong Chinese Newspaper'
    publisher = 'news.mingpao.com'
    description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
    publisher = 'MingPao'
    category = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content = False
@@ -46,9 +46,10 @@ class MPHKRecipe(BasicNewsRecipe):
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                      dict(attrs={'class':['photo']}),
                      dict(attrs={'id':['newscontent']}), # entertainment page content
                      dict(attrs={'id':['newscontent01','newscontent02']})]
                      dict(attrs={'id':['newscontent01','newscontent02']}),
                      dict(attrs={'class':['photo']})
                      ]
    remove_tags = [dict(name='style'),
                   dict(attrs={'id':['newscontent135']})] # for the finance page
    remove_attributes = ['width']
@@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchday(self):
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
        return self.get_dtlocal().strftime("%d")
@@ -124,13 +128,13 @@ class MPHKRecipe(BasicNewsRecipe):
        feeds = []
        dateStr = self.get_fetchdate()
        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@@ -141,14 +145,10 @@ class MPHKRecipe(BasicNewsRecipe):
        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
        if fin_articles:
            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
        # special - eco-friendly
        # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
        # if eco_articles:
        #     feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
        # special - entertainment
        ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
        if ent_articles:
            feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
            feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
        return feeds

    def parse_section(self, url):
@@ -174,31 +174,17 @@ class MPHKRecipe(BasicNewsRecipe):
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href= True)
        current_articles = []
        for i in a:
            url = i.get('href', False)
            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
                title = self.tag_to_string(i)
                url = 'http://www.mpfinance.com/cfm/' +url
                current_articles.append({'title': title, 'url': url, 'description':''})
        return current_articles

    def parse_eco_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet']})
        current_articles = []
        included_urls = []
        for i in divs:
            a = i.find('a', href = True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
            if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
        for i in a:
            url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
            if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
                title = self.tag_to_string(i)
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

    def parse_ent_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
@@ -223,18 +209,22 @@ class MPHKRecipe(BasicNewsRecipe):
        return soup

    def create_opf(self, feeds, dir=None):
        if self.IsKindleUsed == False:
            super(MPHKRecipe,self).create_opf(feeds, dir)
            return
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        title += ' ' + self.get_fetchdate()
        #if self.output_profile.periodical_date_in_title:
        if self.IsCJKWellSupported == True:
            # use Chinese title
            title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
        else:
            # use English title
            title = self.short_title() + ' ' + self.get_fetchformatteddate()
        if True: # force date in title
            # title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        mi = MetaInformation(title, [self.publisher])
        mi.publisher = self.publisher
        mi.author_sort = self.publisher
        if self.IsCJKWellSupported == True:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        else:
            mi.publication_type = self.publication_type+':'+self.short_title()
        #mi.timestamp = nowf()
        mi.timestamp = self.get_dtlocal()
@@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
            templ = self.navbar.generate(True, num, j, len(f),
                                         not self.has_single_feed,
                                         a.orig_url, __appname__, prefix=prefix,
                                         a.orig_url, self.publisher, prefix=prefix,
                                         center=self.center_navbar)
            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
            body.insert(len(body.contents), elem)
@@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
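The get_fetchdate()/get_fetchformatteddate() helpers above both go through get_dtlocal(), whose body lies outside the hunks shown. A minimal sketch of the UTC-to-HKT shift described by the comment in get_fetchday() (the offset arithmetic here is an assumption, not the committed code):

import datetime

def get_dtlocal():
    # HKT is UTC+8, but the day's news is only complete at around 6.00am HKT;
    # shifting by 8 - 6 = 2 hours makes the date roll over at that moment rather
    # than at midnight UTC. This body is assumed; the real method is not shown.
    return datetime.datetime.utcnow() + datetime.timedelta(hours=2)

print get_dtlocal().strftime("%Y%m%d")  # the format get_fetchdate() returns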
resources/recipes/osnews_pl.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
'''
OSNews.pl
'''

from calibre.web.feeds.news import BasicNewsRecipe
import re

class OSNewsRecipe(BasicNewsRecipe):
    __author__ = u'Mori & Tomasz D\u0142ugosz'
    language = 'pl'

    title = u'OSnews.pl'
    publisher = u'OSnews.pl'
    description = u'OSnews.pl jest spo\u0142eczno\u015bciowym serwisem informacyjnym po\u015bwi\u0119conym oprogramowaniu, systemom operacyjnym i \u015bwiatowi IT'

    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
    use_embedded_content = False;

    oldest_article = 7
    max_articles_per_feed = 100

    extra_css = '''
        .news-heading {font-size:150%}
        .newsinformations li {display:inline;}
        blockquote {border:2px solid #000; padding:5px;}
    '''

    feeds = [
        (u'OSNews.pl', u'http://feeds.feedburner.com/OSnewspl')
    ]

    keep_only_tags = [
        dict(name = 'a', attrs = {'class' : 'news-heading'}),
        dict(name = 'div', attrs = {'class' : 'newsinformations'}),
        dict(name = 'div', attrs = {'id' : 'news-content'})
    ]

    remove_tags = [
        dict(name = 'div', attrs = {'class' : 'sociable'}),
        dict(name = 'div', attrs = {'class' : 'post_prev'}),
        dict(name = 'div', attrs = {'class' : 'post_next'}),
        dict(name = 'div', attrs = {'class' : 'clr'})
    ]

    preprocess_regexps = [(re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span'), lambda match: '</span><span')]
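The preprocess_regexps entry above is a (compiled pattern, replacement function) pair that calibre applies to the raw page source before parsing; this one collapses the comment counter sitting between two spans. A standalone illustration against assumed sample HTML:

import re

pattern = re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span')
html = u'<span>title</span>Komentarze: (12) <span>body</span>'
print pattern.sub(lambda match: '</span><span', html)
# -> <span>title</span><span>body</span>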
resources/recipes/swiatkindle.recipe (new file, 24 lines)
@@ -0,0 +1,24 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = u'2011, Tomasz Dlugosz <tomek3d@gmail.com>'
'''
swiatkindle.pl
'''

import re

class swiatkindle(BasicNewsRecipe):
    title = u'Swiat Kindle'
    description = u'Blog o czytniku Amazon Kindle. Wersje, ksi\u0105\u017cki, kupowanie i korzystanie w Polsce'
    language = 'pl'
    __author__ = u'Tomasz D\u0142ugosz'
    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [(u'\u015awiat Kindle - wpisy', u'http://swiatkindle.pl/feed')]

    remove_tags = [dict(name = 'ul', attrs = {'class' : 'similar-posts'})]

    preprocess_regexps = [(re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')]
@@ -573,8 +573,8 @@ from calibre.devices.edge.driver import EDGE
from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
        SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O, STASH
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
        GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD, ALURATEK_COLOR, \
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, \
        GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR, \
        TREKSTOR, EEEREADER, NEXTBOOK
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
@@ -691,8 +691,6 @@ plugins += [
    AVANT,
    MENTOR,
    SWEEX,
    Q600,
    KOGAN,
    PDNOVEL,
    SPECTRA,
    GEMEI,
@@ -121,7 +121,8 @@ def enable_plugin(plugin_or_name):
    config['enabled_plugins'] = ep

default_disabled_plugins = set([
    'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers'
    'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers',
    'Kent District Library'
])

def is_disabled(plugin):
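Note the trailing comma added to the existing set entry: adjacent string literals concatenate in Python, so appending 'Kent District Library' on the next line without that comma would silently fuse two plugin names into one. A tiny demonstration:

broken = set(['Nicebooks covers'
              'Kent District Library'])
print broken  # set(['Nicebooks coversKent District Library']) -- one bogus name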
@@ -54,41 +54,24 @@ class AVANT(USBMS):
class SWEEX(USBMS):
    # Identical to the Promedia
    name = 'Sweex Device Interface'
    gui_name = 'Sweex'
    description = _('Communicate with the Sweex MM300')
    gui_name = 'Sweex/Kogan/Q600/Wink'
    description = _('Communicate with the Sweex/Kogan/Q600/Wink')
    author = 'Kovid Goyal'
    supported_platforms = ['windows', 'osx', 'linux']

    # Ordered list of supported formats
    FORMATS = ['epub', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']
    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']

    VENDOR_ID = [0x0525, 0x177f]
    PRODUCT_ID = [0xa4a5, 0x300]
    BCD = [0x0319, 0x110]
    BCD = [0x0319, 0x110, 0x325]

    VENDOR_NAME = 'SWEEX'
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOKREADER'
    VENDOR_NAME = ['SWEEX', 'LINUX']
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOKREADER', 'FILE-STOR_GADGET']

    EBOOK_DIR_MAIN = ''
    SUPPORTS_SUB_DIRS = True

class Q600(SWEEX):

    name = 'Digma Q600 Device interface'
    gui_name = 'Q600'
    description = _('Communicate with the Digma Q600')

    BCD = [0x325]
    FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt']

class KOGAN(SWEEX):

    name = 'Kogan Device Interface'
    gui_name = 'Kogan'
    description = _('Communicate with the Kogan')
    VENDOR_NAME = 'LINUX'
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'
    EBOOK_DIR_MAIN = 'Kogan eBooks'

class PDNOVEL(USBMS):
    name = 'Pandigital Novel device interface'
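The hunk above folds the separate Q600 and KOGAN subclasses into the one SWEEX driver by widening its USB id lists. Roughly how those lists are consulted when a device appears (illustrative only; the real matching lives in calibre's device scanner):

class SWEEXIds(object):  # values copied from the diff above
    VENDOR_ID  = [0x0525, 0x177f]
    PRODUCT_ID = [0xa4a5, 0x300]
    BCD        = [0x0319, 0x110, 0x325]

def claims_device(dev, vendor_id, product_id, bcd):
    return (vendor_id in dev.VENDOR_ID and product_id in dev.PRODUCT_ID
            and bcd in dev.BCD)

# The extra BCD entry 0x325 is what lets the single driver now claim the
# Q600/Kogan/Wink variants:
print claims_device(SWEEXIds, 0x177f, 0x300, 0x325)  # True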
@@ -350,6 +350,8 @@ class FB2MLizer(object):
        # Number of blank lines above tag
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems < 0:
                ems = 0
        except:
            ems = 0

@@ -397,7 +399,7 @@ class FB2MLizer(object):
            fb2_out += p_txt
            tags += p_tag
            fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
        if tag in ('br', 'hr') or ems:
        if tag in ('br', 'hr') or ems >= 1:
            if ems < 1:
                multiplier = 1
            else:
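A worked instance of the blank-line arithmetic above (margin and font-size values assumed): a 24px top margin on 12px text yields one blank line, while a zero margin would compute -1, which the newly added clamp resets to 0:

print int(round((24.0 / 12) - 1))  # -> 1 blank line
ems = int(round((0.0 / 12) - 1))   # -> -1 before the clamp
if ems < 0:
    ems = 0
print ems                          # -> 0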
@@ -205,7 +205,10 @@ def main(args=sys.argv):
        open(cpath, 'wb').write(br.open_novisit(curl).read())
        print 'Cover for', title, 'saved to', cpath

    #import time
    #st = time.time()
    print get_social_metadata(title, None, None, isbn)
    #print '\n\n', time.time() - st, '\n\n'

    return 0
@@ -106,6 +106,9 @@ class MetadataSource(Plugin): # {{{
    def join(self):
        return self.worker.join()

    def is_alive(self):
        return self.worker.is_alive()

    def is_customizable(self):
        return True

@@ -251,7 +254,9 @@ class KentDistrictLibrary(MetadataSource): # {{{

    name = 'Kent District Library'
    metadata_type = 'social'
    description = _('Downloads series information from ww2.kdl.org')
    description = _('Downloads series information from ww2.kdl.org. '
                    'This website cannot handle large numbers of queries, '
                    'so the plugin is disabled by default.')

    def fetch(self):
        if not self.title or not self.book_author:
@@ -5,7 +5,9 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re, urllib, urlparse
import re, urllib, urlparse, socket

from mechanize import URLError

from calibre.ebooks.metadata.book.base import Metadata
from calibre import browser
@@ -17,7 +19,7 @@ URL = \

_ignore_starts = u'\'"'+u''.join(unichr(x) for x in range(0x2018, 0x201e)+[0x2032, 0x2033])

def get_series(title, authors):
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
@@ -39,7 +41,12 @@ def get_series(title, authors):

    url = URL.format(author, title)
    br = browser()
    raw = br.open(url).read()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError, e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
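Assumed usage of the patched helper (module path assumed to be calibre.ebooks.metadata.kdl; requires network access and a responsive KDL server). The new timeout flows into open_novisit(), and a socket-level timeout now surfaces as the friendlier "busy" error:

from calibre.ebooks.metadata.kdl import get_series

try:
    mi = get_series('A Game of Thrones', ['George R. R. Martin'], timeout=30)
    print mi.series, mi.series_index
except Exception, e:
    print e  # 'KDL Server busy, try again later' on a socket timeout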
@@ -85,7 +85,8 @@ class Source(Plugin):

    # Metadata API {{{

    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=5):
        '''
        Identify a book by its title/author/isbn/etc.

@@ -98,6 +99,8 @@ class Source(Plugin):
        :param authors: A list of authors of the book, can be None
        :param identifiers: A dictionary of other identifiers, most commonly
                            {'isbn':'1234...'}
        :param timeout: Timeout in seconds, no network request should hang for
                        longer than timeout.
        :return: None if no errors occurred, otherwise a unicode representation
                 of the error suitable for showing to the user
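A minimal sketch (class name and URL hypothetical, not part of the diff) of a Source subclass honouring the new timeout contract by passing it straight through to its network call:

from calibre import browser, as_unicode
from calibre.ebooks.metadata.sources.base import Source

class ExampleSource(Source):  # hypothetical plugin for illustration
    name = 'Example'

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=5):
        br = browser()
        try:
            # every network request honours the caller's timeout
            raw = br.open_novisit('http://example.com/q', timeout=timeout).read()
        except Exception, e:
            log.exception('Failed to make identify query')
            return as_unicode(e)
        # ...parse raw and put Metadata objects on result_queue...
        return None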
@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import time
from urllib import urlencode
from functools import partial
from threading import Thread

from lxml import etree

@@ -18,6 +17,7 @@ from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars
from calibre import browser, as_unicode

NAMESPACES = {
@@ -41,20 +41,20 @@ subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')

def get_details(browser, url):
def get_details(browser, url, timeout):
    try:
        raw = browser.open_novisit(url).read()
        raw = browser.open_novisit(url, timeout=timeout).read()
    except Exception as e:
        gc = getattr(e, 'getcode', lambda : -1)
        if gc() != 403:
            raise
        # Google is throttling us, wait a little
        time.sleep(2)
        raw = browser.open_novisit(url).read()
        time.sleep(1)
        raw = browser.open_novisit(url, timeout=timeout).read()

    return raw

def to_metadata(browser, log, entry_):
def to_metadata(browser, log, entry_, timeout):

    def get_text(extra, x):
        try:
@@ -79,8 +79,9 @@ def to_metadata(browser, log, entry_):

    mi = Metadata(title_, authors)
    try:
        raw = get_details(browser, id_url)
        feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0])
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
@@ -131,25 +132,18 @@ def to_metadata(browser, log, entry_):

    return mi

class Worker(Thread):

    def __init__(self, log, entries, abort, result_queue):
        self.browser, self.log, self.entries = browser(), log, entries
        self.abort, self.result_queue = abort, result_queue
        Thread.__init__(self)
        self.daemon = True

    def run(self):
        for i in self.entries:
def get_all_details(br, log, entries, abort, result_queue, timeout):
    for i in entries:
        try:
            ans = to_metadata(self.browser, self.log, i)
            ans = to_metadata(br, log, i, timeout)
            if isinstance(ans, Metadata):
                self.result_queue.put(ans)
                result_queue.put(ans)
        except:
            self.log.exception(
            log.exception(
                'Failed to get metadata for identify entry:',
                etree.tostring(i))
        if self.abort.is_set():
        if abort.is_set():
            break


@@ -192,54 +186,40 @@ class GoogleBooks(Source):
        })


    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=5):
        query = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        br = browser()
        try:
            raw = browser().open_novisit(query).read()
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception, e:
            log.exception('Failed to make identify query: %r'%query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(raw,
            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                strip_encoding_pats=True)[0], parser=parser)
            entries = entry(feed)
        except Exception, e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)


        groups = self.split_jobs(entries, 5) # At most 5 threads
        if not groups:
            return None
        workers = [Worker(log, entries, abort, result_queue) for entries in
                groups]

        if abort.is_set():
            return None

        for worker in workers: worker.start()

        has_alive_worker = True
        while has_alive_worker and not abort.is_set():
            time.sleep(0.1)
            has_alive_worker = False
            for worker in workers:
                if worker.is_alive():
                    has_alive_worker = True
        # There is no point running these queries in threads as google
        # throttles requests returning Forbidden errors
        get_all_details(br, log, entries, abort, result_queue, timeout)

        return None

if __name__ == '__main__':
    # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
        isbn_test)
        title_test)
    test_identify_plugin(GoogleBooks.name,
        [
            (
                {'title': 'Great Expectations', 'authors':['Charles Dickens']},
                [isbn_test('9781607541592')]
                [title_test('Great Expectations', exact=True)]
            ),
        ])
@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, tempfile
import os, tempfile, time
from Queue import Queue, Empty
from threading import Event

@@ -26,6 +26,17 @@ def isbn_test(isbn):

    return test

def title_test(title, exact=False):

    title = title.lower()

    def test(mi):
        mt = mi.title.lower()
        return (exact and mt == title) or \
               (not exact and title in mt)

    return test

def test_identify_plugin(name, tests):
    '''
    :param name: Plugin name
@@ -48,11 +59,15 @@ def test_identify_plugin(name, tests):
    abort = Event()
    prints('Log saved to', lf)

    times = []
    for kwargs, test_funcs in tests:
        prints('Running test with:', kwargs)
        rq = Queue()
        args = (log, rq, abort)
        start_time = time.time()
        err = plugin.identify(*args, **kwargs)
        total_time = time.time() - start_time
        times.append(total_time)
        if err is not None:
            prints('identify returned an error for args', args)
            prints(err)
@@ -87,6 +102,8 @@ def test_identify_plugin(name, tests):
            prints('Log saved to', lf)
            raise SystemExit(1)

    prints('Average time per query', sum(times)/len(times))

    if os.stat(lf).st_size > 10:
        prints('There were some errors, see log', lf)
|
||||
|
||||
class xISBN(object):
|
||||
|
||||
'''
|
||||
This class is used to find the ISBN numbers of "related" editions of a
|
||||
book, given its ISBN. Useful when querying services for metadata by ISBN,
|
||||
in case they do not have the ISBN for the particular edition.
|
||||
'''
|
||||
|
||||
QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'
|
||||
|
||||
def __init__(self):
|
||||
|
@ -259,6 +259,7 @@ class AddAction(InterfaceAction):
|
||||
if hasattr(self.gui, 'db_images'):
|
||||
self.gui.db_images.reset()
|
||||
self.gui.tags_view.recount()
|
||||
|
||||
if getattr(self._adder, 'merged_books', False):
|
||||
books = u'\n'.join([x if isinstance(x, unicode) else
|
||||
x.decode(preferred_encoding, 'replace') for x in
|
||||
@ -266,6 +267,17 @@ class AddAction(InterfaceAction):
|
||||
info_dialog(self.gui, _('Merged some books'),
|
||||
_('The following duplicate books were found and incoming book formats were '
|
||||
'processed and merged into your Calibre database according to your automerge settings:'), det_msg=books, show=True)
|
||||
|
||||
if getattr(self._adder, 'number_of_books_added', 0) > 0 or \
|
||||
getattr(self._adder, 'merged_books', False):
|
||||
# The formats of the current book could have changed if
|
||||
# automerge is enabled
|
||||
current_idx = self.gui.library_view.currentIndex()
|
||||
if current_idx.isValid():
|
||||
self.gui.library_view.model().current_changed(current_idx,
|
||||
current_idx)
|
||||
|
||||
|
||||
if getattr(self._adder, 'critical', None):
|
||||
det_msg = []
|
||||
for name, log in self._adder.critical.items():
|
||||
|