diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe
index 726181f57b..9febcec0e5 100644
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -1,7 +1,9 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Eddie Lau'
+__copyright__ = '2010-2011, Eddie Lau'
 '''
 Change Log:
+2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
+            clean up the indentation
 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
             (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
 2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@@ -18,21 +20,19 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
-from calibre import __appname__
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
 
 class MPHKRecipe(BasicNewsRecipe):
-    IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view
-
+    IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
 
     title = 'Ming Pao - Hong Kong'
     oldest_article = 1
     max_articles_per_feed = 100
     __author__ = 'Eddie Lau'
-    description = 'Hong Kong Chinese Newspaper'
-    publisher = 'news.mingpao.com'
+    description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+    publisher = 'MingPao'
     category = 'Chinese, News, Hong Kong'
     remove_javascript = True
     use_embedded_content = False
@@ -46,19 +46,20 @@ class MPHKRecipe(BasicNewsRecipe):
     masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
     keep_only_tags = [dict(name='h1'),
                       dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
-                      dict(attrs={'class':['photo']}),
                       dict(attrs={'id':['newscontent']}), # entertainment page content
-                      dict(attrs={'id':['newscontent01','newscontent02']})]
+                      dict(attrs={'id':['newscontent01','newscontent02']}),
+                      dict(attrs={'class':['photo']})
+                      ]
     remove_tags = [dict(name='style'),
                    dict(attrs={'id':['newscontent135']})] # for the finance page
     remove_attributes = ['width']
     preprocess_regexps = [
-                (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
-                lambda match: '<h1>'),
-                (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
-                lambda match: '</h1>'),
-                (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
-                lambda match: '')
+                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                          lambda match: '<h1>'),
+                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                          lambda match: '</h1>'),
+                          (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
+                          lambda match: '')
                           ]
 
     def image_url_processor(cls, baseurl, url):
@@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
     def get_fetchdate(self):
         return self.get_dtlocal().strftime("%Y%m%d")
 
+    def get_fetchformatteddate(self):
+        return self.get_dtlocal().strftime("%Y-%m-%d")
+
     def get_fetchday(self):
         # convert UTC to local hk time - at around HKT 6.00am, all news are available
         return self.get_dtlocal().strftime("%d")
@@ -121,84 +125,66 @@ class MPHKRecipe(BasicNewsRecipe):
         return cover
 
     def parse_index(self):
-          feeds = []
-          dateStr = self.get_fetchdate()
-          for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
-                             (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
-                             (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                             (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
-                             (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
-                             (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
-                             (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
-                             ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
-                             (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
-                             (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
-                             (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
-              articles = self.parse_section(url)
-              if articles:
-                  feeds.append((title, articles))
-          # special - finance
-          fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
-          if fin_articles:
-              feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
-          # special - eco-friendly
-          # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
-          # if eco_articles:
-          #     feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
-          # special - entertainment
-          ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-          if ent_articles:
-              feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
-          return feeds
+        feeds = []
+        dateStr = self.get_fetchdate()
+        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        # special - finance
+        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+        if fin_articles:
+            feeds.append((u'\u7d93\u6fdf Finance',
+                          fin_articles))
+        # special - entertainment
+        ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+        if ent_articles:
+            feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+        return feeds
 
     def parse_section(self, url):
-          dateStr = self.get_fetchdate()
-          soup = self.index_to_soup(url)
-          divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
-          current_articles = []
-          included_urls = []
-          divs.reverse()
-          for i in divs:
-              a = i.find('a', href = True)
-              title = self.tag_to_string(a)
-              url = a.get('href', False)
-              url = 'http://news.mingpao.com/' + dateStr + '/' +url
-              if url not in included_urls and url.rfind('Redirect') == -1:
-                  current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
-                  included_urls.append(url)
-          current_articles.reverse()
-          return current_articles
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            a = i.find('a', href = True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' +url
+            if url not in included_urls and url.rfind('Redirect') == -1:
+                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
 
     def parse_fin_section(self, url):
         dateStr = self.get_fetchdate()
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href= True)
         current_articles = []
-        for i in a:
-            url = i.get('href', False)
-            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
-                title = self.tag_to_string(i)
-                url = 'http://www.mpfinance.com/cfm/' +url
-                current_articles.append({'title': title, 'url': url, 'description':''})
-        return current_articles
-
-    def parse_eco_section(self, url):
-        dateStr = self.get_fetchdate()
-        soup = self.index_to_soup(url)
-        divs = soup.findAll(attrs={'class': ['bullet']})
-        current_articles = []
         included_urls = []
-        for i in divs:
-            a = i.find('a', href = True)
-            title = self.tag_to_string(a)
-            url = a.get('href', False)
-            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
-            if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
+        for i in a:
+            url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+            if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+                title = self.tag_to_string(i)
                 current_articles.append({'title': title, 'url': url, 'description':''})
                 included_urls.append(url)
         return current_articles
 
     def parse_ent_section(self, url):
+        self.get_fetchdate()
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href=True)
         a.reverse()
@@ -223,67 +209,71 @@ class MPHKRecipe(BasicNewsRecipe):
         return soup
 
     def create_opf(self, feeds, dir=None):
-        if self.IsKindleUsed == False:
-            super(MPHKRecipe,self).create_opf(feeds, dir)
-            return
         if dir is None:
             dir = self.output_dir
-        title = self.short_title()
-        title += ' ' + self.get_fetchdate()
-        #if self.output_profile.periodical_date_in_title:
-        #    title += strftime(self.timefmt)
-        mi = MetaInformation(title, [__appname__])
-        mi.publisher = __appname__
-        mi.author_sort = __appname__
-        mi.publication_type = self.publication_type+':'+self.short_title()
-        #mi.timestamp = nowf()
-        mi.timestamp = self.get_dtlocal()
-        mi.comments = self.description
-        if not isinstance(mi.comments, unicode):
-            mi.comments = mi.comments.decode('utf-8', 'replace')
-        #mi.pubdate = nowf()
-        mi.pubdate = self.get_dtlocal()
-        opf_path = os.path.join(dir, 'index.opf')
-        ncx_path = os.path.join(dir, 'index.ncx')
-        opf = OPFCreator(dir, mi)
-        # Add mastheadImage entry to <guide> section
-        mp = getattr(self, 'masthead_path', None)
-        if mp is not None and os.access(mp, os.R_OK):
-            from calibre.ebooks.metadata.opf2 import Guide
-            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
-            ref.type = 'masthead'
-            ref.title = 'Masthead Image'
-            opf.guide.append(ref)
+        if self.IsCJKWellSupported == True:
+            # use Chinese title
+            title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
+        else:
+            # use English title
+            title = self.short_title() + ' ' + self.get_fetchformatteddate()
+        if True: # force date in title
+            # title += strftime(self.timefmt)
+            mi = MetaInformation(title, [self.publisher])
+            mi.publisher = self.publisher
+            mi.author_sort = self.publisher
+            if self.IsCJKWellSupported == True:
+                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+            else:
+                mi.publication_type = self.publication_type+':'+self.short_title()
+            #mi.timestamp = nowf()
+            mi.timestamp = self.get_dtlocal()
+            mi.comments = self.description
+            if not isinstance(mi.comments, unicode):
+                mi.comments = mi.comments.decode('utf-8', 'replace')
+            #mi.pubdate = nowf()
+            mi.pubdate = self.get_dtlocal()
+            opf_path = os.path.join(dir, 'index.opf')
+            ncx_path = os.path.join(dir, 'index.ncx')
+            opf = OPFCreator(dir, mi)
+            # Add mastheadImage entry to <guide> section
+            mp = getattr(self, 'masthead_path', None)
+            if mp is not None and os.access(mp, os.R_OK):
+                from calibre.ebooks.metadata.opf2 import Guide
+                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+                ref.type = 'masthead'
+                ref.title = 'Masthead Image'
+                opf.guide.append(ref)
 
-        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
-        manifest.append(os.path.join(dir, 'index.html'))
-        manifest.append(os.path.join(dir, 'index.ncx'))
+            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+            manifest.append(os.path.join(dir, 'index.html'))
+            manifest.append(os.path.join(dir, 'index.ncx'))
 
-        # Get cover
-        cpath = getattr(self, 'cover_path', None)
-        if cpath is None:
-            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
-            if self.default_cover(pf):
-                cpath = pf.name
-        if cpath is not None and os.access(cpath, os.R_OK):
-            opf.cover = cpath
-            manifest.append(cpath)
+            # Get cover
+            cpath = getattr(self, 'cover_path', None)
+            if cpath is None:
+                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+                if self.default_cover(pf):
+                    cpath = pf.name
+            if cpath is not None and os.access(cpath, os.R_OK):
+                opf.cover = cpath
+                manifest.append(cpath)
 
-        # Get masthead
-        mpath = getattr(self, 'masthead_path', None)
-        if mpath is not None and os.access(mpath, os.R_OK):
-            manifest.append(mpath)
+            # Get masthead
+            mpath = getattr(self, 'masthead_path', None)
+            if mpath is not None and os.access(mpath, os.R_OK):
+                manifest.append(mpath)
 
-        opf.create_manifest_from_files_in(manifest)
-        for mani in opf.manifest:
-            if mani.path.endswith('.ncx'):
-                mani.id = 'ncx'
-            if mani.path.endswith('mastheadImage.jpg'):
-                mani.id = 'masthead-image'
-        entries = ['index.html']
-        toc = TOC(base_path=dir)
-        self.play_order_counter = 0
-        self.play_order_map = {}
+            opf.create_manifest_from_files_in(manifest)
+            for mani in opf.manifest:
+                if mani.path.endswith('.ncx'):
+                    mani.id = 'ncx'
+                if mani.path.endswith('mastheadImage.jpg'):
+                    mani.id = 'masthead-image'
+            entries = ['index.html']
+            toc = TOC(base_path=dir)
+            self.play_order_counter = 0
+            self.play_order_map = {}
 
         def feed_index(num, parent):
             f = feeds[num]
@@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
                         prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                         templ = self.navbar.generate(True, num, j, len(f), not self.has_single_feed,
-                                        a.orig_url, __appname__, prefix=prefix,
+                                        a.orig_url, self.publisher, prefix=prefix,
                                         center=self.center_navbar)
                         elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                         body.insert(len(body.contents), elem)
@@ -344,7 +334,7 @@ class MPHKRecipe(BasicNewsRecipe):
                 if not desc:
                     desc = None
                 feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
-                        f.title, play_order=po, description=desc, author=auth))
+                    f.title, play_order=po, description=desc, author=auth))
         else:
             entries.append('feed_%d/index.html'%0)
@@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
 
         with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
             opf.render(opf_file, ncx_file)
-
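Reviewer note on the date handling: the new get_fetchformatteddate() drives the dated ebook title. A minimal sketch of the idea, assuming get_dtlocal() applies a fixed UTC+8 offset (Hong Kong observes no DST; the recipe's actual implementation may be more involved):

```python
# Sketch: the Hong Kong local date used in the periodical title.
# Assumption: a plain UTC+8 shift stands in for the recipe's get_dtlocal().
from datetime import datetime, timedelta

def get_dtlocal():
    # Convert the machine's UTC time to Hong Kong time
    return datetime.utcnow() + timedelta(hours=8)

def get_fetchformatteddate():
    return get_dtlocal().strftime("%Y-%m-%d")  # e.g. '2011-02-20'

print(get_fetchformatteddate())
```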
diff --git a/resources/recipes/osnews_pl.recipe b/resources/recipes/osnews_pl.recipe
new file mode 100644
index 0000000000..5d851ab179
--- /dev/null
+++ b/resources/recipes/osnews_pl.recipe
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+'''
+OSNews.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class OSNewsRecipe(BasicNewsRecipe):
+    __author__ = u'Mori & Tomasz D\u0142ugosz'
+    language = 'pl'
+
+    title = u'OSnews.pl'
+    publisher = u'OSnews.pl'
+    description = u'OSnews.pl jest spo\u0142eczno\u015bciowym serwisem informacyjnym po\u015bwi\u0119conym oprogramowaniu, systemom operacyjnym i \u015bwiatowi IT'
+
+    no_stylesheets = True
+    remove_javascript = True
+    encoding = 'utf-8'
+    use_embedded_content = False
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    extra_css = '''
+        .news-heading {font-size:150%}
+        .newsinformations li {display:inline;}
+        blockquote {border:2px solid #000; padding:5px;}
+    '''
+
+    feeds = [
+        (u'OSNews.pl', u'http://feeds.feedburner.com/OSnewspl')
+    ]
+
+    keep_only_tags = [
+        dict(name = 'a', attrs = {'class' : 'news-heading'}),
+        dict(name = 'div', attrs = {'class' : 'newsinformations'}),
+        dict(name = 'div', attrs = {'id' : 'news-content'})
+    ]
+
+    remove_tags = [
+        dict(name = 'div', attrs = {'class' : 'sociable'}),
+        dict(name = 'div', attrs = {'class' : 'post_prev'}),
+        dict(name = 'div', attrs = {'class' : 'post_next'}),
+        dict(name = 'div', attrs = {'class' : 'clr'})
+    ]
+
+    preprocess_regexps = [(re.compile(u'Komentarze: \(?[0-9]+\)? ?'), lambda match: '')]
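For reference, preprocess_regexps entries are (compiled pattern, substitution callable) pairs that BasicNewsRecipe applies to the raw page HTML before parsing. A standalone sketch of how the comment-counter rule above behaves (the sample HTML is invented for illustration):

```python
import re

# (pattern, substitution callable) pair, as the recipe declares it
pattern, repl = re.compile(u'Komentarze: \(?[0-9]+\)? ?'), lambda match: ''

sample = u'<div class="newsinformations">Komentarze: (42) autor: Mori</div>'
print(pattern.sub(repl, sample))
# -> u'<div class="newsinformations">autor: Mori</div>'
```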
diff --git a/resources/recipes/swiatkindle.recipe b/resources/recipes/swiatkindle.recipe
new file mode 100644
--- /dev/null
+++ b/resources/recipes/swiatkindle.recipe
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL v3'
+
+'''
+swiatkindle.pl
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class swiatkindle(BasicNewsRecipe):
+    title = u'Swiat Kindle'
+    description = u'Blog o czytniku Amazon Kindle. Wersje, ksi\u0105\u017cki, kupowanie i korzystanie w Polsce'
+    language = 'pl'
+    __author__ = u'Tomasz D\u0142ugosz'
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    feeds = [(u'\u015awiat Kindle - wpisy', u'http://swiatkindle.pl/feed')]
+
+    remove_tags = [dict(name = 'ul', attrs = {'class' : 'similar-posts'})]
+
+    preprocess_regexps = [(re.compile(u'<br/><br/>Czytaj dalej:<br/><br/>'), lambda match: '')]
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 6cfe915036..4f3574559e 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -573,8 +573,8 @@ from calibre.devices.edge.driver import EDGE
 from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
         SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O, STASH
 from calibre.devices.sne.driver import SNE
-from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
-    GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD, ALURATEK_COLOR, \
+from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, \
+    GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR, \
     TREKSTOR, EEEREADER, NEXTBOOK
 from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
 from calibre.devices.kobo.driver import KOBO
@@ -691,8 +691,6 @@ plugins += [
     AVANT,
     MENTOR,
     SWEEX,
-    Q600,
-    KOGAN,
     PDNOVEL,
     SPECTRA,
     GEMEI,
diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py
index 5f67e23d92..e9feacc67e 100644
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@@ -121,7 +121,8 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep
 
 default_disabled_plugins = set([
-    'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers'
+    'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers',
+    'Kent District Library'
 ])
 
 def is_disabled(plugin):
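Adding 'Kent District Library' to default_disabled_plugins only changes a set-membership test. A rough sketch of the lookup logic, simplified from customize/ui.py (the real function takes a plugin object and consults calibre's stored prefs):

```python
# Rough sketch of disable-by-default; 'config' stands in for calibre's prefs.
default_disabled_plugins = set([
    'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers',
    'Kent District Library',
])

config = {'enabled_plugins': set(), 'disabled_plugins': set()}

def is_disabled(name):
    if name in config['enabled_plugins']:
        return False  # the user explicitly re-enabled it
    return name in config['disabled_plugins'] or name in default_disabled_plugins

print(is_disabled('Kent District Library'))  # True until explicitly enabled
```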
diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py
index e549a4a9fd..d74f727a0e 100644
--- a/src/calibre/devices/misc.py
+++ b/src/calibre/devices/misc.py
@@ -54,41 +54,24 @@ class AVANT(USBMS):
 class SWEEX(USBMS):
     # Identical to the Promedia
     name = 'Sweex Device Interface'
-    gui_name = 'Sweex'
-    description = _('Communicate with the Sweex MM300')
+    gui_name = 'Sweex/Kogan/Q600/Wink'
+    description = _('Communicate with the Sweex/Kogan/Q600/Wink')
     author = 'Kovid Goyal'
     supported_platforms = ['windows', 'osx', 'linux']
 
     # Ordered list of supported formats
-    FORMATS = ['epub', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']
+    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']
 
     VENDOR_ID = [0x0525, 0x177f]
     PRODUCT_ID = [0xa4a5, 0x300]
-    BCD = [0x0319, 0x110]
+    BCD = [0x0319, 0x110, 0x325]
 
-    VENDOR_NAME = 'SWEEX'
-    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOKREADER'
+    VENDOR_NAME = ['SWEEX', 'LINUX']
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOKREADER', 'FILE-STOR_GADGET']
 
     EBOOK_DIR_MAIN = ''
     SUPPORTS_SUB_DIRS = True
 
-class Q600(SWEEX):
-
-    name = 'Digma Q600 Device interface'
-    gui_name = 'Q600'
-    description = _('Communicate with the Digma Q600')
-
-    BCD = [0x325]
-    FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt']
-
-class KOGAN(SWEEX):
-
-    name = 'Kogan Device Interface'
-    gui_name = 'Kogan'
-    description = _('Communicate with the Kogan')
-    VENDOR_NAME = 'LINUX'
-    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'
-    EBOOK_DIR_MAIN = 'Kogan eBooks'
 
 class PDNOVEL(USBMS):
     name = 'Pandigital Novel device interface'
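Folding Q600 and KOGAN into SWEEX works because the USBMS ID fields accept lists: a device is claimed when its vendor/product/BCD triple matches any listed combination. A simplified sketch of that rule (not the actual detection code):

```python
# Simplified sketch of list-based USB ID matching (not calibre's detector).
VENDOR_ID = [0x0525, 0x177f]
PRODUCT_ID = [0xa4a5, 0x300]
BCD = [0x0319, 0x110, 0x325]

def matches(vid, pid, bcd):
    # The merged SWEEX driver now also claims the Digma Q600 (BCD 0x325)
    # and Kogan devices that report the LINUX/FILE-STOR_GADGET strings.
    return vid in VENDOR_ID and pid in PRODUCT_ID and bcd in BCD

print(matches(0x0525, 0xa4a5, 0x325))  # True: previously handled by Q600
```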
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index 6af058da7b..43f93807a1 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -350,6 +350,8 @@ class FB2MLizer(object):
             # Number of blank lines above tag
             try:
                 ems = int(round((float(style.marginTop) / style.fontSize) - 1))
+                if ems < 0:
+                    ems = 0
             except:
                 ems = 0
 
@@ -397,7 +399,7 @@ class FB2MLizer(object):
                     fb2_out += p_txt
                     tags += p_tag
             fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
-        if tag in ('br', 'hr') or ems:
+        if tag in ('br', 'hr') or ems >= 1:
             if ems < 1:
                 multiplier = 1
             else:
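On the ems clamp: a zero top margin makes the expression int(round(0/fontSize - 1)) = -1, and that negative count then leaked into the truthy `or ems` test. A quick check of the fixed computation:

```python
def blank_lines_above(margin_top, font_size):
    # Blank lines above a tag: top margin expressed in em, minus the line itself.
    try:
        ems = int(round((float(margin_top) / font_size) - 1))
        if ems < 0:  # the fix: never report a negative count
            ems = 0
    except Exception:
        ems = 0
    return ems

print(blank_lines_above(0, 12))   # 0 (was -1 before the fix)
print(blank_lines_above(36, 12))  # 2 blank lines for a 3em margin
```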
diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py
index 58dd3f1d22..c87249ed39 100644
--- a/src/calibre/ebooks/metadata/amazon.py
+++ b/src/calibre/ebooks/metadata/amazon.py
@@ -205,7 +205,10 @@ def main(args=sys.argv):
             open(cpath, 'wb').write(br.open_novisit(curl).read())
             print 'Cover for', title, 'saved to', cpath
 
+    #import time
+    #st = time.time()
     print get_social_metadata(title, None, None, isbn)
+    #print '\n\n', time.time() - st, '\n\n'
 
     return 0
diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index 4f246b2b9a..667b4f4d7c 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -106,6 +106,9 @@ class MetadataSource(Plugin): # {{{
     def join(self):
         return self.worker.join()
 
+    def is_alive(self):
+        return self.worker.is_alive()
+
     def is_customizable(self):
         return True
 
@@ -251,7 +254,9 @@ class KentDistrictLibrary(MetadataSource): # {{{
 
     name = 'Kent District Library'
     metadata_type = 'social'
-    description = _('Downloads series information from ww2.kdl.org')
+    description = _('Downloads series information from ww2.kdl.org. '
+            'This website cannot handle large numbers of queries, '
+            'so the plugin is disabled by default.')
 
     def fetch(self):
         if not self.title or not self.book_author:
diff --git a/src/calibre/ebooks/metadata/kdl.py b/src/calibre/ebooks/metadata/kdl.py
index 4eca49ad45..b0b961b603 100644
--- a/src/calibre/ebooks/metadata/kdl.py
+++ b/src/calibre/ebooks/metadata/kdl.py
@@ -5,7 +5,9 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re, urllib, urlparse
+import re, urllib, urlparse, socket
+
+from mechanize import URLError
 
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre import browser
@@ -17,7 +19,7 @@ URL = \
 
 _ignore_starts = u'\'"'+u''.join(unichr(x) for x in range(0x2018, 0x201e)+[0x2032, 0x2033])
 
-def get_series(title, authors):
+def get_series(title, authors, timeout=60):
     mi = Metadata(title, authors)
     if title and title[0] in _ignore_starts:
         title = title[1:]
@@ -39,7 +41,12 @@ def get_series(title, authors, timeout=60):
 
     url = URL.format(author, title)
     br = browser()
-    raw = br.open(url).read()
+    try:
+        raw = br.open_novisit(url, timeout=timeout).read()
+    except URLError, e:
+        if isinstance(e.reason, socket.timeout):
+            raise Exception('KDL Server busy, try again later')
+        raise
     if 'see the full results' not in raw:
         return mi
     raw = xml_to_unicode(raw)[0]
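The kdl.py change distinguishes timeouts from other network failures through URLError.reason. The same pattern against plain urllib2 (mechanize re-exports urllib2's URLError), as a self-contained sketch:

```python
import socket
import urllib2
from urllib2 import URLError

def fetch(url, timeout=60):
    # Same shape as get_series(): a timed-out request surfaces as a
    # URLError whose .reason is a socket.timeout instance.
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except URLError, e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('Server busy, try again later')
        raise
```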
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index e5490ef56e..74e184cc66 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -85,7 +85,8 @@ class Source(Plugin):
 
     # Metadata API {{{
 
-    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+    def identify(self, log, result_queue, abort, title=None, authors=None,
+            identifiers={}, timeout=5):
         '''
         Identify a book by its title/author/isbn/etc.
 
@@ -98,6 +99,8 @@ class Source(Plugin):
         :param authors: A list of authors of the book, can be None
         :param identifiers: A dictionary of other identifiers, most commonly
                             {'isbn':'1234...'}
+        :param timeout: Timeout in seconds, no network request should hang for
+                        longer than timeout.
 
         :return: None if no errors occurred, otherwise a unicode representation
                  of the error suitable for showing to the user
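For callers, identify() is driven with a result queue and an abort event. A hedged usage sketch; 'plugin' and 'log' are assumed stand-ins for a real Source instance (e.g. GoogleBooks()) and a calibre log object:

```python
# Usage sketch for the new identify() signature (illustrative only).
from Queue import Queue, Empty
from threading import Event

def run_identify(plugin, log):
    rq = Queue()
    abort = Event()
    err = plugin.identify(log, rq, abort, title='Great Expectations',
            authors=['Charles Dickens'], timeout=5)
    results = []
    while True:
        try:
            results.append(rq.get_nowait())  # Metadata objects
        except Empty:
            break
    return err, results
```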
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index c59bbe6dc5..498c7574ea 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
 import time
 from urllib import urlencode
 from functools import partial
-from threading import Thread
 
 from lxml import etree
 
@@ -18,6 +17,7 @@ from calibre.ebooks.metadata.sources.base import Source
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.date import parse_date, utcnow
+from calibre.utils.cleantext import clean_ascii_chars
 from calibre import browser, as_unicode
 
 NAMESPACES = {
@@ -41,20 +41,20 @@ subject = XPath('descendant::dc:subject')
 description = XPath('descendant::dc:description')
 language = XPath('descendant::dc:language')
 
-def get_details(browser, url):
+def get_details(browser, url, timeout):
     try:
-        raw = browser.open_novisit(url).read()
+        raw = browser.open_novisit(url, timeout=timeout).read()
     except Exception as e:
         gc = getattr(e, 'getcode', lambda : -1)
         if gc() != 403:
             raise
         # Google is throttling us, wait a little
-        time.sleep(2)
-        raw = browser.open_novisit(url).read()
+        time.sleep(1)
+        raw = browser.open_novisit(url, timeout=timeout).read()
 
     return raw
 
-def to_metadata(browser, log, entry_):
+def to_metadata(browser, log, entry_, timeout):
 
     def get_text(extra, x):
         try:
@@ -79,8 +79,9 @@ def to_metadata(browser, log, entry_, timeout):
     mi = Metadata(title_, authors)
     try:
-        raw = get_details(browser, id_url)
-        feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0])
+        raw = get_details(browser, id_url, timeout)
+        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
+            strip_encoding_pats=True)[0])
         extra = entry(feed)[0]
     except:
         log.exception('Failed to get additional details for', mi.title)
@@ -131,26 +132,19 @@ def to_metadata(browser, log, entry_, timeout):
 
     return mi
 
-class Worker(Thread):
-
-    def __init__(self, log, entries, abort, result_queue):
-        self.browser, self.log, self.entries = browser(), log, entries
-        self.abort, self.result_queue = abort, result_queue
-        Thread.__init__(self)
-        self.daemon = True
-
-    def run(self):
-        for i in self.entries:
-            try:
-                ans = to_metadata(self.browser, self.log, i)
-                if isinstance(ans, Metadata):
-                    self.result_queue.put(ans)
-            except:
-                self.log.exception(
-                    'Failed to get metadata for identify entry:',
-                    etree.tostring(i))
-            if self.abort.is_set():
-                break
+def get_all_details(br, log, entries, abort, result_queue, timeout):
+    for i in entries:
+        try:
+            ans = to_metadata(br, log, i, timeout)
+            if isinstance(ans, Metadata):
+                result_queue.put(ans)
+        except:
+            log.exception(
+                'Failed to get metadata for identify entry:',
+                etree.tostring(i))
+        if abort.is_set():
+            break
 
 class GoogleBooks(Source):
 
@@ -192,54 +186,40 @@ class GoogleBooks(Source):
         })
 
-    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+    def identify(self, log, result_queue, abort, title=None, authors=None,
+            identifiers={}, timeout=5):
         query = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
+        br = browser()
         try:
-            raw = browser().open_novisit(query).read()
+            raw = br.open_novisit(query, timeout=timeout).read()
         except Exception, e:
             log.exception('Failed to make identify query: %r'%query)
             return as_unicode(e)
 
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
-            feed = etree.fromstring(xml_to_unicode(raw,
+            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                 strip_encoding_pats=True)[0], parser=parser)
             entries = entry(feed)
         except Exception, e:
             log.exception('Failed to parse identify results')
             return as_unicode(e)
-
-        groups = self.split_jobs(entries, 5) # At most 5 threads
-        if not groups:
-            return None
-        workers = [Worker(log, entries, abort, result_queue) for entries in
-                groups]
-
-        if abort.is_set():
-            return None
-
-        for worker in workers: worker.start()
-
-        has_alive_worker = True
-        while has_alive_worker and not abort.is_set():
-            time.sleep(0.1)
-            has_alive_worker = False
-            for worker in workers:
-                if worker.is_alive():
-                    has_alive_worker = True
+        # There is no point running these queries in threads as google
+        # throttles requests returning Forbidden errors
+        get_all_details(br, log, entries, abort, result_queue, timeout)
 
         return None
 
 if __name__ == '__main__':
     # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            isbn_test)
+            title_test)
     test_identify_plugin(GoogleBooks.name,
         [
             (
                 {'title': 'Great Expectations', 'authors':['Charles Dickens']},
-                [isbn_test('9781607541592')]
+                [title_test('Great Expectations', exact=True)]
             ),
     ])
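clean_ascii_chars() is applied before lxml parsing because the Google feed can contain control characters that are illegal in XML 1.0 and make etree.fromstring() raise. A simplified stand-in showing what the call does (the real helper lives in calibre.utils.cleantext and may differ in detail):

```python
import re

def clean_ascii_chars(raw):
    # Strip ASCII control characters other than tab, newline and
    # carriage return; XML 1.0 forbids them.
    pat = re.compile(u'[\x00-\x08\x0b\x0c\x0e-\x1f]')
    return pat.sub(u'', raw)
```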
diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py
index cd7e7ab6e8..3b41e69d40 100644
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, tempfile
+import os, tempfile, time
 from Queue import Queue, Empty
 from threading import Event
 
@@ -26,6 +26,17 @@ def isbn_test(isbn):
 
     return test
 
+def title_test(title, exact=False):
+
+    title = title.lower()
+
+    def test(mi):
+        mt = mi.title.lower()
+        return (exact and mt == title) or \
+                (not exact and title in mt)
+
+    return test
+
 def test_identify_plugin(name, tests):
     '''
     :param name: Plugin name
@@ -48,11 +59,15 @@ def test_identify_plugin(name, tests):
     abort = Event()
     prints('Log saved to', lf)
 
+    times = []
     for kwargs, test_funcs in tests:
         prints('Running test with:', kwargs)
         rq = Queue()
         args = (log, rq, abort)
+        start_time = time.time()
         err = plugin.identify(*args, **kwargs)
+        total_time = time.time() - start_time
+        times.append(total_time)
         if err is not None:
             prints('identify returned an error for args', args)
             prints(err)
@@ -87,6 +102,8 @@ def test_identify_plugin(name, tests):
             prints('Log saved to', lf)
             raise SystemExit(1)
 
+    prints('Average time per query', sum(times)/len(times))
+
     if os.stat(lf).st_size > 10:
         prints('There were some errors, see log', lf)
diff --git a/src/calibre/ebooks/metadata/xisbn.py b/src/calibre/ebooks/metadata/xisbn.py
index 2ee74396c7..aaeb1c6b98 100644
--- a/src/calibre/ebooks/metadata/xisbn.py
+++ b/src/calibre/ebooks/metadata/xisbn.py
@@ -11,6 +11,12 @@ from calibre import browser
 
 class xISBN(object):
 
+    '''
+    This class is used to find the ISBN numbers of "related" editions of a
+    book, given its ISBN. Useful when querying services for metadata by ISBN,
+    in case they do not have the ISBN for the particular edition.
+    '''
+
     QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'
 
     def __init__(self):
diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py
index 49a7a4677a..f99e48eb2b 100644
--- a/src/calibre/gui2/actions/add.py
+++ b/src/calibre/gui2/actions/add.py
@@ -259,6 +259,7 @@ class AddAction(InterfaceAction):
         if hasattr(self.gui, 'db_images'):
             self.gui.db_images.reset()
         self.gui.tags_view.recount()
+
         if getattr(self._adder, 'merged_books', False):
             books = u'\n'.join([x if isinstance(x, unicode) else
                 x.decode(preferred_encoding, 'replace') for x in
@@ -266,6 +267,17 @@ class AddAction(InterfaceAction):
             info_dialog(self.gui, _('Merged some books'),
                 _('The following duplicate books were found and incoming book formats were '
                   'processed and merged into your Calibre database according to your automerge settings:'),
                 det_msg=books, show=True)
+
+        if getattr(self._adder, 'number_of_books_added', 0) > 0 or \
+                getattr(self._adder, 'merged_books', False):
+            # The formats of the current book could have changed if
+            # automerge is enabled
+            current_idx = self.gui.library_view.currentIndex()
+            if current_idx.isValid():
+                self.gui.library_view.model().current_changed(current_idx,
+                        current_idx)
+
         if getattr(self._adder, 'critical', None):
             det_msg = []
             for name, log in self._adder.critical.items():
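Finally, on the xISBN docstring added above: the getEditions method returns JSON carrying the fields requested via the fl= parameter. A hedged sketch of a raw query against the documented QUERY endpoint (the 'list' key is assumed from the service's usual JSON shape):

```python
import json
import urllib2

QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'

def get_editions(isbn, timeout=10):
    # Each record should carry the requested fields (form, year, lang, ed).
    raw = urllib2.urlopen(QUERY % isbn, timeout=timeout).read()
    data = json.loads(raw)
    return data.get('list', [])  # 'list' assumed; verify against a live response

print(get_editions('0743273567'))
```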