Merge from trunk

Charles Haley 2011-02-21 09:37:08 +00:00
commit ff912773cf
15 changed files with 302 additions and 222 deletions

View File

@@ -1,7 +1,9 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Eddie Lau'
+__copyright__ = '2010-2011, Eddie Lau'
 '''
 Change Log:
+2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
+            clean up the indentation
 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
             (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
 2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
-from calibre import __appname__
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation

 class MPHKRecipe(BasicNewsRecipe):
-    IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view
+    IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view

     title = 'Ming Pao - Hong Kong'
     oldest_article = 1
     max_articles_per_feed = 100
     __author__ = 'Eddie Lau'
-    description = 'Hong Kong Chinese Newspaper'
-    publisher = 'news.mingpao.com'
+    description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+    publisher = 'MingPao'
     category = 'Chinese, News, Hong Kong'
     remove_javascript = True
     use_embedded_content = False
@@ -46,9 +46,10 @@ class MPHKRecipe(BasicNewsRecipe):
     masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
     keep_only_tags = [dict(name='h1'),
                       dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
-                      dict(attrs={'class':['photo']}),
                       dict(attrs={'id':['newscontent']}), # entertainment page content
-                      dict(attrs={'id':['newscontent01','newscontent02']})]
+                      dict(attrs={'id':['newscontent01','newscontent02']}),
+                      dict(attrs={'class':['photo']})
+                      ]
     remove_tags = [dict(name='style'),
                    dict(attrs={'id':['newscontent135']})] # for the finance page
     remove_attributes = ['width']
@@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
     def get_fetchdate(self):
         return self.get_dtlocal().strftime("%Y%m%d")

+    def get_fetchformatteddate(self):
+        return self.get_dtlocal().strftime("%Y-%m-%d")
+
     def get_fetchday(self):
         # convert UTC to local hk time - at around HKT 6.00am, all news are available
         return self.get_dtlocal().strftime("%d")
@@ -124,13 +128,13 @@ class MPHKRecipe(BasicNewsRecipe):
         feeds = []
         dateStr = self.get_fetchdate()
         for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
-                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                            (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
                            (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                            (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                            (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
                            ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                            (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
                            (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                            (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@@ -141,14 +145,10 @@ class MPHKRecipe(BasicNewsRecipe):
         fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
         if fin_articles:
             feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
-        # special - eco-friendly
-        # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
-        # if eco_articles:
-        #     feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
         # special - entertainment
         ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
         if ent_articles:
-            feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
+            feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
         return feeds

     def parse_section(self, url):
@@ -174,31 +174,17 @@ class MPHKRecipe(BasicNewsRecipe):
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href= True)
         current_articles = []
-        for i in a:
-            url = i.get('href', False)
-            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
-                title = self.tag_to_string(i)
-                url = 'http://www.mpfinance.com/cfm/' +url
-                current_articles.append({'title': title, 'url': url, 'description':''})
-        return current_articles
-
-    def parse_eco_section(self, url):
-        dateStr = self.get_fetchdate()
-        soup = self.index_to_soup(url)
-        divs = soup.findAll(attrs={'class': ['bullet']})
-        current_articles = []
         included_urls = []
-        for i in divs:
-            a = i.find('a', href = True)
-            title = self.tag_to_string(a)
-            url = a.get('href', False)
-            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
-            if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
+        for i in a:
+            url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+            if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+                title = self.tag_to_string(i)
                 current_articles.append({'title': title, 'url': url, 'description':''})
                 included_urls.append(url)
         return current_articles

     def parse_ent_section(self, url):
-        self.get_fetchdate()
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href=True)
         a.reverse()
@@ -223,18 +209,22 @@ class MPHKRecipe(BasicNewsRecipe):
         return soup

     def create_opf(self, feeds, dir=None):
-        if self.IsKindleUsed == False:
-            super(MPHKRecipe,self).create_opf(feeds, dir)
-            return
         if dir is None:
             dir = self.output_dir
-        title = self.short_title()
-        title += ' ' + self.get_fetchdate()
-        #if self.output_profile.periodical_date_in_title:
+        if self.IsCJKWellSupported == True:
+            # use Chinese title
+            title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
+        else:
+            # use English title
+            title = self.short_title() + ' ' + self.get_fetchformatteddate()
+        if True: # force date in title
         #    title += strftime(self.timefmt)
-        mi = MetaInformation(title, [__appname__])
-        mi.publisher = __appname__
-        mi.author_sort = __appname__
-        mi.publication_type = self.publication_type+':'+self.short_title()
+        mi = MetaInformation(title, [self.publisher])
+        mi.publisher = self.publisher
+        mi.author_sort = self.publisher
+        if self.IsCJKWellSupported == True:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        else:
+            mi.publication_type = self.publication_type+':'+self.short_title()
         #mi.timestamp = nowf()
         mi.timestamp = self.get_dtlocal()
@@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
                     prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                 templ = self.navbar.generate(True, num, j, len(f),
                                 not self.has_single_feed,
-                                a.orig_url, __appname__, prefix=prefix,
+                                a.orig_url, self.publisher, prefix=prefix,
                                 center=self.center_navbar)
                 elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                 body.insert(len(body.contents), elem)
@@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
         with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
             opf.render(opf_file, ncx_file)
-
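
Note on the finance-section change above: the changelog's "skip duplicated links" fix amounts to remembering every URL already accepted and rejecting repeats. A standalone sketch of the same pattern (the sample links are invented; the recipe's rfind() tests are equivalent to the in/not in checks used here):

    def dedupe_articles(links, date_str):
        current_articles, included_urls = [], []
        for title, url in links:
            # keep only today's articles, and only the first copy of each URL
            if url not in included_urls and date_str in url and 'index' not in url:
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        return current_articles

    links = [('A', 'http://www.mpfinance.com/cfm/20110221/ea1.htm'),
             ('A again', 'http://www.mpfinance.com/cfm/20110221/ea1.htm')]
    print dedupe_articles(links, '20110221') # the duplicate link is dropped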

View File

@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+OSNews.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class OSNewsRecipe(BasicNewsRecipe):
+    __author__ = u'Mori & Tomasz D\u0142ugosz'
+    language = 'pl'
+    title = u'OSnews.pl'
+    publisher = u'OSnews.pl'
+    description = u'OSnews.pl jest spo\u0142eczno\u015bciowym serwisem informacyjnym po\u015bwi\u0119conym oprogramowaniu, systemom operacyjnym i \u015bwiatowi IT'
+
+    no_stylesheets = True
+    remove_javascript = True
+    encoding = 'utf-8'
+    use_embedded_content = False
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    extra_css = '''
+        .news-heading {font-size:150%}
+        .newsinformations li {display:inline;}
+        blockquote {border:2px solid #000; padding:5px;}
+    '''
+
+    feeds = [
+        (u'OSNews.pl', u'http://feeds.feedburner.com/OSnewspl')
+    ]
+
+    keep_only_tags = [
+        dict(name = 'a', attrs = {'class' : 'news-heading'}),
+        dict(name = 'div', attrs = {'class' : 'newsinformations'}),
+        dict(name = 'div', attrs = {'id' : 'news-content'})
+    ]
+
+    remove_tags = [
+        dict(name = 'div', attrs = {'class' : 'sociable'}),
+        dict(name = 'div', attrs = {'class' : 'post_prev'}),
+        dict(name = 'div', attrs = {'class' : 'post_next'}),
+        dict(name = 'div', attrs = {'class' : 'clr'})
+    ]
+
+    preprocess_regexps = [(re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span'), lambda match: '</span><span')]
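
For reference, each preprocess_regexps entry is a (compiled regex, replacement callable) pair that calibre applies to the raw HTML of every article before parsing. A minimal standalone sketch of what the entry above does (the sample HTML is invented):

    # -*- coding: utf-8 -*-
    import re

    pattern = re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span')
    raw = u'<span>tekst</span>Komentarze: (17) <span>dalej</span>'
    # strips the comment counter while keeping the surrounding spans intact
    print pattern.sub('</span><span', raw) # -> <span>tekst</span><span>dalej</span>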

View File

@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = u'2011, Tomasz Dlugosz <tomek3d@gmail.com>'
+
+'''
+swiatkindle.pl
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class swiatkindle(BasicNewsRecipe):
+    title = u'Swiat Kindle'
+    description = u'Blog o czytniku Amazon Kindle. Wersje, ksi\u0105\u017cki, kupowanie i korzystanie w Polsce'
+    language = 'pl'
+    __author__ = u'Tomasz D\u0142ugosz'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    feeds = [(u'\u015awiat Kindle - wpisy', u'http://swiatkindle.pl/feed')]
+
+    remove_tags = [dict(name = 'ul', attrs = {'class' : 'similar-posts'})]
+
+    preprocess_regexps = [(re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')]

View File

@@ -573,8 +573,8 @@ from calibre.devices.edge.driver import EDGE
 from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
         SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O, STASH
 from calibre.devices.sne.driver import SNE
-from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
-        GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD, ALURATEK_COLOR, \
+from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, \
+        GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR, \
         TREKSTOR, EEEREADER, NEXTBOOK
 from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
 from calibre.devices.kobo.driver import KOBO
@@ -691,8 +691,6 @@ plugins += [
     AVANT,
     MENTOR,
     SWEEX,
-    Q600,
-    KOGAN,
     PDNOVEL,
     SPECTRA,
     GEMEI,

View File

@@ -121,7 +121,8 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep

 default_disabled_plugins = set([
-    'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers'
+    'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers',
+    'Kent District Library'
 ])

 def is_disabled(plugin):

View File

@@ -54,41 +54,24 @@ class AVANT(USBMS):
 class SWEEX(USBMS):
     # Identical to the Promedia
     name = 'Sweex Device Interface'
-    gui_name = 'Sweex'
-    description = _('Communicate with the Sweex MM300')
+    gui_name = 'Sweex/Kogan/Q600/Wink'
+    description = _('Communicate with the Sweex/Kogan/Q600/Wink')
     author = 'Kovid Goyal'
     supported_platforms = ['windows', 'osx', 'linux']

     # Ordered list of supported formats
-    FORMATS = ['epub', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']
+    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']

     VENDOR_ID = [0x0525, 0x177f]
     PRODUCT_ID = [0xa4a5, 0x300]
-    BCD = [0x0319, 0x110]
+    BCD = [0x0319, 0x110, 0x325]

-    VENDOR_NAME = 'SWEEX'
-    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOKREADER'
+    VENDOR_NAME = ['SWEEX', 'LINUX']
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOKREADER', 'FILE-STOR_GADGET']

     EBOOK_DIR_MAIN = ''
     SUPPORTS_SUB_DIRS = True

-class Q600(SWEEX):
-    name = 'Digma Q600 Device interface'
-    gui_name = 'Q600'
-    description = _('Communicate with the Digma Q600')
-    BCD = [0x325]
-    FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt']
-
-class KOGAN(SWEEX):
-    name = 'Kogan Device Interface'
-    gui_name = 'Kogan'
-    description = _('Communicate with the Kogan')
-    VENDOR_NAME = 'LINUX'
-    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'
-    EBOOK_DIR_MAIN = 'Kogan eBooks'
-
 class PDNOVEL(USBMS):
     name = 'Pandigital Novel device interface'

View File

@@ -350,6 +350,8 @@ class FB2MLizer(object):
             # Number of blank lines above tag
             try:
                 ems = int(round((float(style.marginTop) / style.fontSize) - 1))
+                if ems < 0:
+                    ems = 0
             except:
                 ems = 0
@@ -397,7 +399,7 @@ class FB2MLizer(object):
                         fb2_out += p_txt
                         tags += p_tag
                 fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
-            if tag in ('br', 'hr') or ems:
+            if tag in ('br', 'hr') or ems >= 1:
                 if ems < 1:
                     multiplier = 1
                 else:
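
The two hunks above cooperate: the first clamps the computed blank-line count at zero, the second emits spacing only for a genuinely positive count. A worked check of the arithmetic (the margin and font-size values are invented):

    def blank_lines_above(margin_top, font_size):
        ems = int(round((float(margin_top) / font_size) - 1))
        if ems < 0:
            ems = 0
        return ems

    print blank_lines_above(0, 16)  # 0; the old formula gave -1, which the
                                    # previous "or ems" truth test treated as true
    print blank_lines_above(32, 16) # 1 -> one blank line is emitted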

View File

@@ -205,7 +205,10 @@ def main(args=sys.argv):
             open(cpath, 'wb').write(br.open_novisit(curl).read())
             print 'Cover for', title, 'saved to', cpath

+    #import time
+    #st = time.time()
     print get_social_metadata(title, None, None, isbn)
+    #print '\n\n', time.time() - st, '\n\n'

     return 0

View File

@@ -106,6 +106,9 @@ class MetadataSource(Plugin): # {{{
     def join(self):
         return self.worker.join()

+    def is_alive(self):
+        return self.worker.is_alive()
+
     def is_customizable(self):
         return True
@@ -251,7 +254,9 @@ class KentDistrictLibrary(MetadataSource): # {{{
     name = 'Kent District Library'
     metadata_type = 'social'
-    description = _('Downloads series information from ww2.kdl.org')
+    description = _('Downloads series information from ww2.kdl.org. '
+                    'This website cannot handle large numbers of queries, '
+                    'so the plugin is disabled by default.')

     def fetch(self):
         if not self.title or not self.book_author:

View File

@@ -5,7 +5,9 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import re, urllib, urlparse
+import re, urllib, urlparse, socket
+
+from mechanize import URLError

 from calibre.ebooks.metadata.book.base import Metadata
 from calibre import browser
@@ -17,7 +19,7 @@ URL = \
 _ignore_starts = u'\'"'+u''.join(unichr(x) for x in range(0x2018, 0x201e)+[0x2032, 0x2033])

-def get_series(title, authors):
+def get_series(title, authors, timeout=60):
     mi = Metadata(title, authors)
     if title and title[0] in _ignore_starts:
         title = title[1:]
@@ -39,7 +41,12 @@ def get_series(title, authors, timeout=60):
     url = URL.format(author, title)
     br = browser()
-    raw = br.open(url).read()
+    try:
+        raw = br.open_novisit(url, timeout=timeout).read()
+    except URLError, e:
+        if isinstance(e.reason, socket.timeout):
+            raise Exception('KDL Server busy, try again later')
+        raise
     if 'see the full results' not in raw:
         return mi
     raw = xml_to_unicode(raw)[0]
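
The pattern above distinguishes a timed-out connection from other network failures: mechanize, like urllib2, wraps the underlying socket.timeout in a URLError whose reason attribute carries the original exception. A minimal standalone sketch of the same idea with plain urllib2 (the URL is a placeholder):

    import socket, urllib2

    def fetch(url, timeout=60):
        try:
            return urllib2.urlopen(url, timeout=timeout).read()
        except urllib2.URLError, e:
            # a connect timeout surfaces as URLError(socket.timeout)
            if isinstance(e.reason, socket.timeout):
                raise Exception('Server busy, try again later')
            raise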

View File

@@ -85,7 +85,8 @@ class Source(Plugin):
     # Metadata API {{{

-    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+    def identify(self, log, result_queue, abort, title=None, authors=None,
+            identifiers={}, timeout=5):
         '''
         Identify a book by its title/author/isbn/etc.
@@ -98,6 +99,8 @@ class Source(Plugin):
         :param authors: A list of authors of the book, can be None
         :param identifiers: A dictionary of other identifiers, most commonly
                             {'isbn':'1234...'}
+        :param timeout: Timeout in seconds, no network request should hang for
+                        longer than timeout.
         :return: None if no errors occurred, otherwise a unicode representation
                  of the error suitable for showing to the user
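
A hedged sketch of what a concrete implementation of this contract can look like (fetch_candidates is an invented stand-in for a real network lookup, and a real plugin needs the usual registration around it):

    from calibre import as_unicode
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.ebooks.metadata.sources.base import Source

    def fetch_candidates(title, authors, timeout=5):
        # stand-in for a network query; every real call should honour timeout
        return [{'title': title or 'Unknown', 'authors': authors or []}]

    class ExampleSource(Source):

        name = 'Example'

        def identify(self, log, result_queue, abort, title=None, authors=None,
                identifiers={}, timeout=5):
            if abort.is_set():
                return None
            try:
                records = fetch_candidates(title, authors, timeout=timeout)
            except Exception, e:
                log.exception('Failed to make identify query')
                return as_unicode(e)
            for rec in records:
                if abort.is_set():
                    break
                result_queue.put(Metadata(rec['title'], rec['authors']))
            return None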

View File

@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
 import time
 from urllib import urlencode
 from functools import partial
-from threading import Thread

 from lxml import etree
@@ -18,6 +17,7 @@ from calibre.ebooks.metadata.sources.base import Source
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.date import parse_date, utcnow
+from calibre.utils.cleantext import clean_ascii_chars
 from calibre import browser, as_unicode

 NAMESPACES = {
@@ -41,20 +41,20 @@ subject = XPath('descendant::dc:subject')
 description = XPath('descendant::dc:description')
 language = XPath('descendant::dc:language')

-def get_details(browser, url):
+def get_details(browser, url, timeout):
     try:
-        raw = browser.open_novisit(url).read()
+        raw = browser.open_novisit(url, timeout=timeout).read()
     except Exception as e:
         gc = getattr(e, 'getcode', lambda : -1)
         if gc() != 403:
             raise
         # Google is throttling us, wait a little
-        time.sleep(2)
-        raw = browser.open_novisit(url).read()
+        time.sleep(1)
+        raw = browser.open_novisit(url, timeout=timeout).read()
     return raw

-def to_metadata(browser, log, entry_):
+def to_metadata(browser, log, entry_, timeout):

     def get_text(extra, x):
         try:
@@ -79,8 +79,9 @@ def to_metadata(browser, log, entry_, timeout):
     mi = Metadata(title_, authors)
     try:
-        raw = get_details(browser, id_url)
-        feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0])
+        raw = get_details(browser, id_url, timeout)
+        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
+            strip_encoding_pats=True)[0])
         extra = entry(feed)[0]
     except:
         log.exception('Failed to get additional details for', mi.title)
@@ -131,25 +132,18 @@ def to_metadata(browser, log, entry_, timeout):
     return mi

-class Worker(Thread):
-
-    def __init__(self, log, entries, abort, result_queue):
-        self.browser, self.log, self.entries = browser(), log, entries
-        self.abort, self.result_queue = abort, result_queue
-        Thread.__init__(self)
-        self.daemon = True
-
-    def run(self):
-        for i in self.entries:
-            try:
-                ans = to_metadata(self.browser, self.log, i)
-                if isinstance(ans, Metadata):
-                    self.result_queue.put(ans)
-            except:
-                self.log.exception(
-                    'Failed to get metadata for identify entry:',
-                    etree.tostring(i))
-            if self.abort.is_set():
-                break
+def get_all_details(br, log, entries, abort, result_queue, timeout):
+    for i in entries:
+        try:
+            ans = to_metadata(br, log, i, timeout)
+            if isinstance(ans, Metadata):
+                result_queue.put(ans)
+        except:
+            log.exception(
+                'Failed to get metadata for identify entry:',
+                etree.tostring(i))
+        if abort.is_set():
+            break
@@ -192,54 +186,40 @@ class GoogleBooks(Source):
         })

-    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+    def identify(self, log, result_queue, abort, title=None, authors=None,
+            identifiers={}, timeout=5):
         query = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
+        br = browser()
         try:
-            raw = browser().open_novisit(query).read()
+            raw = br.open_novisit(query, timeout=timeout).read()
         except Exception, e:
             log.exception('Failed to make identify query: %r'%query)
             return as_unicode(e)

         try:
             parser = etree.XMLParser(recover=True, no_network=True)
-            feed = etree.fromstring(xml_to_unicode(raw,
+            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                 strip_encoding_pats=True)[0], parser=parser)
             entries = entry(feed)
         except Exception, e:
             log.exception('Failed to parse identify results')
             return as_unicode(e)

-        groups = self.split_jobs(entries, 5) # At most 5 threads
-        if not groups:
-            return None
-        workers = [Worker(log, entries, abort, result_queue) for entries in
-                groups]
-
-        if abort.is_set():
-            return None
-
-        for worker in workers: worker.start()
-
-        has_alive_worker = True
-        while has_alive_worker and not abort.is_set():
-            time.sleep(0.1)
-            has_alive_worker = False
-            for worker in workers:
-                if worker.is_alive():
-                    has_alive_worker = True
+        # There is no point running these queries in threads as google
+        # throttles requests returning Forbidden errors
+        get_all_details(br, log, entries, abort, result_queue, timeout)

         return None

 if __name__ == '__main__':
     # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            isbn_test)
+            title_test)
     test_identify_plugin(GoogleBooks.name,
         [
             (
                 {'title': 'Great Expectations', 'authors':['Charles Dickens']},
-                [isbn_test('9781607541592')]
+                [title_test('Great Expectations', exact=True)]
             ),
     ])
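
The serialized fetching above still tolerates throttling because get_details retries after a short sleep on HTTP 403. The same idea as a reusable sketch (the retry count and delay are invented parameters; br is a calibre browser object as used above):

    import time

    def fetch_with_retry(br, url, timeout, retries=2, delay=1):
        for attempt in range(retries + 1):
            try:
                return br.open_novisit(url, timeout=timeout).read()
            except Exception, e:
                gc = getattr(e, 'getcode', lambda : -1)
                # re-raise anything that is not throttling (HTTP 403),
                # or if we are out of attempts
                if gc() != 403 or attempt == retries:
                    raise
                time.sleep(delay)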

View File

@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, tempfile
+import os, tempfile, time
 from Queue import Queue, Empty
 from threading import Event
@@ -26,6 +26,17 @@ def isbn_test(isbn):
     return test

+def title_test(title, exact=False):
+
+    title = title.lower()
+
+    def test(mi):
+        mt = mi.title.lower()
+        return (exact and mt == title) or \
+                (not exact and title in mt)
+
+    return test
+
 def test_identify_plugin(name, tests):
     '''
     :param name: Plugin name
@@ -48,11 +59,15 @@ def test_identify_plugin(name, tests):
     abort = Event()
     prints('Log saved to', lf)

+    times = []
     for kwargs, test_funcs in tests:
         prints('Running test with:', kwargs)
         rq = Queue()
         args = (log, rq, abort)
+        start_time = time.time()
         err = plugin.identify(*args, **kwargs)
+        total_time = time.time() - start_time
+        times.append(total_time)
         if err is not None:
             prints('identify returned an error for args', args)
             prints(err)
@@ -87,6 +102,8 @@ def test_identify_plugin(name, tests):
             prints('Log saved to', lf)
             raise SystemExit(1)

+    prints('Average time per query', sum(times)/len(times))
+
     if os.stat(lf).st_size > 10:
         prints('There were some errors, see log', lf)
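
Usage of the new helper mirrors the __main__ block of google.py above ('MySource' is a placeholder plugin name):

    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test)

    test_identify_plugin('MySource',
        [
            (
                {'title': 'Great Expectations', 'authors':['Charles Dickens']},
                # passes when the returned title contains (or, with
                # exact=True, equals) the expected string, case-insensitively
                [title_test('Great Expectations', exact=True)]
            ),
        ])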

View File

@@ -11,6 +11,12 @@ from calibre import browser

 class xISBN(object):

+    '''
+    This class is used to find the ISBN numbers of "related" editions of a
+    book, given its ISBN. Useful when querying services for metadata by ISBN,
+    in case they do not have the ISBN for the particular edition.
+    '''
+
     QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'

     def __init__(self):
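
What a raw query against the QUERY endpoint above returns, sketched standalone (the ISBN is an arbitrary example; WorldCat's xISBN service may reject heavy use or be unavailable):

    import json, urllib2

    QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'

    def related_isbns(isbn, timeout=30):
        raw = urllib2.urlopen(QUERY % isbn, timeout=timeout).read()
        data = json.loads(raw)
        # each entry in 'list' describes one edition of the book
        return [i for entry in data.get('list', []) for i in entry.get('isbn', [])]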

View File

@@ -259,6 +259,7 @@ class AddAction(InterfaceAction):
         if hasattr(self.gui, 'db_images'):
             self.gui.db_images.reset()
         self.gui.tags_view.recount()
+
         if getattr(self._adder, 'merged_books', False):
             books = u'\n'.join([x if isinstance(x, unicode) else
                 x.decode(preferred_encoding, 'replace') for x in
@@ -266,6 +267,17 @@ class AddAction(InterfaceAction):
             info_dialog(self.gui, _('Merged some books'),
                 _('The following duplicate books were found and incoming book formats were '
                     'processed and merged into your Calibre database according to your automerge settings:'), det_msg=books, show=True)
+
+        if getattr(self._adder, 'number_of_books_added', 0) > 0 or \
+                getattr(self._adder, 'merged_books', False):
+            # The formats of the current book could have changed if
+            # automerge is enabled
+            current_idx = self.gui.library_view.currentIndex()
+            if current_idx.isValid():
+                self.gui.library_view.model().current_changed(current_idx,
+                        current_idx)
+
         if getattr(self._adder, 'critical', None):
             det_msg = []
             for name, log in self._adder.critical.items():