diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe
index 726181f57b..9febcec0e5 100644
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -1,7 +1,9 @@
__license__ = 'GPL v3'
-__copyright__ = '2010, Eddie Lau'
+__copyright__ = '2010-2011, Eddie Lau'
'''
Change Log:
+2011/02/20: skip duplicated links in the finance section, move photos that may span a whole page to the end of the articles,
+            clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
-from calibre import __appname__
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
class MPHKRecipe(BasicNewsRecipe):
- IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view
-
+ IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
title = 'Ming Pao - Hong Kong'
oldest_article = 1
max_articles_per_feed = 100
__author__ = 'Eddie Lau'
- description = 'Hong Kong Chinese Newspaper'
- publisher = 'news.mingpao.com'
+ description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+ publisher = 'MingPao'
category = 'Chinese, News, Hong Kong'
remove_javascript = True
use_embedded_content = False
@@ -46,19 +46,20 @@ class MPHKRecipe(BasicNewsRecipe):
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
- dict(attrs={'class':['photo']}),
dict(attrs={'id':['newscontent']}), # entertainment page content
- dict(attrs={'id':['newscontent01','newscontent02']})]
+ dict(attrs={'id':['newscontent01','newscontent02']}),
+ dict(attrs={'class':['photo']})
+ ]
remove_tags = [dict(name='style'),
dict(attrs={'id':['newscontent135']})] # for the finance page
remove_attributes = ['width']
preprocess_regexps = [
-                           (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
-                            lambda match: '<h1>'),
-                           (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
-                            lambda match: '</h1>'),
-                           (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
-                            lambda match: '')
+                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                           lambda match: '<h1>'),
+                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                           lambda match: '</h1>'),
+                          (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
+                           lambda match: '')
]
def image_url_processor(cls, baseurl, url):
@@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d")
+ def get_fetchformatteddate(self):
+ return self.get_dtlocal().strftime("%Y-%m-%d")
+
def get_fetchday(self):
# convert UTC to local hk time - at around HKT 6.00am, all news are available
return self.get_dtlocal().strftime("%d")
@@ -121,84 +125,66 @@ class MPHKRecipe(BasicNewsRecipe):
return cover
def parse_index(self):
- feeds = []
- dateStr = self.get_fetchdate()
- for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
- (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
- (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
- (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
- (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
- (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
- (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
- ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
- (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
- (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
- (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
- articles = self.parse_section(url)
- if articles:
- feeds.append((title, articles))
- # special - finance
- fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
- if fin_articles:
- feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
- # special - eco-friendly
- # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
- # if eco_articles:
- # feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
- # special - entertainment
- ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
- if ent_articles:
- feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
- return feeds
+ feeds = []
+ dateStr = self.get_fetchdate()
+ for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+ (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+ (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+ (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+ (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+ (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+ ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+ (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+ (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+ (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+ (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+ articles = self.parse_section(url)
+ if articles:
+ feeds.append((title, articles))
+ # special - finance
+ fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+ if fin_articles:
+ feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+ # special - entertainment
+ ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+ if ent_articles:
+ feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+ return feeds
def parse_section(self, url):
- dateStr = self.get_fetchdate()
- soup = self.index_to_soup(url)
- divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
- current_articles = []
- included_urls = []
- divs.reverse()
- for i in divs:
- a = i.find('a', href = True)
- title = self.tag_to_string(a)
- url = a.get('href', False)
- url = 'http://news.mingpao.com/' + dateStr + '/' +url
- if url not in included_urls and url.rfind('Redirect') == -1:
- current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
- included_urls.append(url)
- current_articles.reverse()
- return current_articles
+ dateStr = self.get_fetchdate()
+ soup = self.index_to_soup(url)
+ divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+ current_articles = []
+ included_urls = []
+ divs.reverse()
+ for i in divs:
+ a = i.find('a', href = True)
+ title = self.tag_to_string(a)
+ url = a.get('href', False)
+ url = 'http://news.mingpao.com/' + dateStr + '/' +url
+ if url not in included_urls and url.rfind('Redirect') == -1:
+ current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+ included_urls.append(url)
+ current_articles.reverse()
+ return current_articles
def parse_fin_section(self, url):
dateStr = self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href= True)
current_articles = []
- for i in a:
- url = i.get('href', False)
- if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
- title = self.tag_to_string(i)
- url = 'http://www.mpfinance.com/cfm/' +url
- current_articles.append({'title': title, 'url': url, 'description':''})
- return current_articles
-
- def parse_eco_section(self, url):
- dateStr = self.get_fetchdate()
- soup = self.index_to_soup(url)
- divs = soup.findAll(attrs={'class': ['bullet']})
- current_articles = []
included_urls = []
- for i in divs:
- a = i.find('a', href = True)
- title = self.tag_to_string(a)
- url = a.get('href', False)
- url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
- if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
+ for i in a:
+ url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+ if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+ title = self.tag_to_string(i)
current_articles.append({'title': title, 'url': url, 'description':''})
included_urls.append(url)
return current_articles
def parse_ent_section(self, url):
+ self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
@@ -223,67 +209,71 @@ class MPHKRecipe(BasicNewsRecipe):
return soup
def create_opf(self, feeds, dir=None):
- if self.IsKindleUsed == False:
- super(MPHKRecipe,self).create_opf(feeds, dir)
- return
if dir is None:
dir = self.output_dir
- title = self.short_title()
- title += ' ' + self.get_fetchdate()
- #if self.output_profile.periodical_date_in_title:
- # title += strftime(self.timefmt)
- mi = MetaInformation(title, [__appname__])
- mi.publisher = __appname__
- mi.author_sort = __appname__
- mi.publication_type = self.publication_type+':'+self.short_title()
- #mi.timestamp = nowf()
- mi.timestamp = self.get_dtlocal()
- mi.comments = self.description
- if not isinstance(mi.comments, unicode):
- mi.comments = mi.comments.decode('utf-8', 'replace')
- #mi.pubdate = nowf()
- mi.pubdate = self.get_dtlocal()
- opf_path = os.path.join(dir, 'index.opf')
- ncx_path = os.path.join(dir, 'index.ncx')
- opf = OPFCreator(dir, mi)
- # Add mastheadImage entry to section
- mp = getattr(self, 'masthead_path', None)
- if mp is not None and os.access(mp, os.R_OK):
- from calibre.ebooks.metadata.opf2 import Guide
- ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
- ref.type = 'masthead'
- ref.title = 'Masthead Image'
- opf.guide.append(ref)
+ if self.IsCJKWellSupported == True:
+ # use Chinese title
+ title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
+ else:
+ # use English title
+ title = self.short_title() + ' ' + self.get_fetchformatteddate()
+        # force date in title (the formatted fetch date is already appended above)
+        # title += strftime(self.timefmt)
+ mi = MetaInformation(title, [self.publisher])
+ mi.publisher = self.publisher
+ mi.author_sort = self.publisher
+ if self.IsCJKWellSupported == True:
+ mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+ else:
+ mi.publication_type = self.publication_type+':'+self.short_title()
+ #mi.timestamp = nowf()
+ mi.timestamp = self.get_dtlocal()
+ mi.comments = self.description
+ if not isinstance(mi.comments, unicode):
+ mi.comments = mi.comments.decode('utf-8', 'replace')
+ #mi.pubdate = nowf()
+ mi.pubdate = self.get_dtlocal()
+ opf_path = os.path.join(dir, 'index.opf')
+ ncx_path = os.path.join(dir, 'index.ncx')
+ opf = OPFCreator(dir, mi)
+ # Add mastheadImage entry to section
+ mp = getattr(self, 'masthead_path', None)
+ if mp is not None and os.access(mp, os.R_OK):
+ from calibre.ebooks.metadata.opf2 import Guide
+ ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+ ref.type = 'masthead'
+ ref.title = 'Masthead Image'
+ opf.guide.append(ref)
- manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
- manifest.append(os.path.join(dir, 'index.html'))
- manifest.append(os.path.join(dir, 'index.ncx'))
+ manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+ manifest.append(os.path.join(dir, 'index.html'))
+ manifest.append(os.path.join(dir, 'index.ncx'))
- # Get cover
- cpath = getattr(self, 'cover_path', None)
- if cpath is None:
- pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
- if self.default_cover(pf):
- cpath = pf.name
- if cpath is not None and os.access(cpath, os.R_OK):
- opf.cover = cpath
- manifest.append(cpath)
+ # Get cover
+ cpath = getattr(self, 'cover_path', None)
+ if cpath is None:
+ pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+ if self.default_cover(pf):
+ cpath = pf.name
+ if cpath is not None and os.access(cpath, os.R_OK):
+ opf.cover = cpath
+ manifest.append(cpath)
- # Get masthead
- mpath = getattr(self, 'masthead_path', None)
- if mpath is not None and os.access(mpath, os.R_OK):
- manifest.append(mpath)
+ # Get masthead
+ mpath = getattr(self, 'masthead_path', None)
+ if mpath is not None and os.access(mpath, os.R_OK):
+ manifest.append(mpath)
- opf.create_manifest_from_files_in(manifest)
- for mani in opf.manifest:
- if mani.path.endswith('.ncx'):
- mani.id = 'ncx'
- if mani.path.endswith('mastheadImage.jpg'):
- mani.id = 'masthead-image'
- entries = ['index.html']
- toc = TOC(base_path=dir)
- self.play_order_counter = 0
- self.play_order_map = {}
+ opf.create_manifest_from_files_in(manifest)
+ for mani in opf.manifest:
+ if mani.path.endswith('.ncx'):
+ mani.id = 'ncx'
+ if mani.path.endswith('mastheadImage.jpg'):
+ mani.id = 'masthead-image'
+ entries = ['index.html']
+ toc = TOC(base_path=dir)
+ self.play_order_counter = 0
+ self.play_order_map = {}
def feed_index(num, parent):
f = feeds[num]
@@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
- a.orig_url, __appname__, prefix=prefix,
+ a.orig_url, self.publisher, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@@ -344,7 +334,7 @@ class MPHKRecipe(BasicNewsRecipe):
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
- f.title, play_order=po, description=desc, author=auth))
+ f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
@@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)
-
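
Note on the finance-section change above: parse_fin_section now records each article URL in included_urls and skips any link it has already seen, which is what the 2011/02/20 changelog entry refers to. A minimal sketch of the idiom, with illustrative names (not calibre API):

    # Sketch: skip duplicated links while preserving order; names are illustrative.
    def unique_articles(links):
        seen = set()
        articles = []
        for title, url in links:
            if url in seen:
                continue  # duplicated link, skip it
            seen.add(url)
            articles.append({'title': title, 'url': url, 'description': ''})
        return articles
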
diff --git a/resources/recipes/osnews_pl.recipe b/resources/recipes/osnews_pl.recipe
new file mode 100644
index 0000000000..5d851ab179
--- /dev/null
+++ b/resources/recipes/osnews_pl.recipe
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+'''
+OSNews.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class OSNewsRecipe(BasicNewsRecipe):
+ __author__ = u'Mori & Tomasz D\u0142ugosz'
+ language = 'pl'
+
+ title = u'OSnews.pl'
+ publisher = u'OSnews.pl'
+ description = u'OSnews.pl jest spo\u0142eczno\u015bciowym serwisem informacyjnym po\u015bwi\u0119conym oprogramowaniu, systemom operacyjnym i \u015bwiatowi IT'
+
+ no_stylesheets = True
+ remove_javascript = True
+ encoding = 'utf-8'
+ use_embedded_content = False;
+
+ oldest_article = 7
+ max_articles_per_feed = 100
+
+ extra_css = '''
+ .news-heading {font-size:150%}
+ .newsinformations li {display:inline;}
+ blockquote {border:2px solid #000; padding:5px;}
+ '''
+
+ feeds = [
+ (u'OSNews.pl', u'http://feeds.feedburner.com/OSnewspl')
+ ]
+
+ keep_only_tags = [
+ dict(name = 'a', attrs = {'class' : 'news-heading'}),
+ dict(name = 'div', attrs = {'class' : 'newsinformations'}),
+ dict(name = 'div', attrs = {'id' : 'news-content'})
+ ]
+
+ remove_tags = [
+ dict(name = 'div', attrs = {'class' : 'sociable'}),
+ dict(name = 'div', attrs = {'class' : 'post_prev'}),
+ dict(name = 'div', attrs = {'class' : 'post_next'}),
+ dict(name = 'div', attrs = {'class' : 'clr'})
+ ]
+
+    preprocess_regexps = [(re.compile(u'Komentarze: \(?[0-9]+\)? ?'), lambda match: '')]
diff --git a/resources/recipes/swiatkindle.recipe b/resources/recipes/swiatkindle.recipe
new file mode 100644
--- /dev/null
+++ b/resources/recipes/swiatkindle.recipe
+__license__ = 'GPL v3'
+'''
+swiatkindle.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class swiatkindle(BasicNewsRecipe):
+ title = u'Swiat Kindle'
+ description = u'Blog o czytniku Amazon Kindle. Wersje, ksi\u0105\u017cki, kupowanie i korzystanie w Polsce'
+ language = 'pl'
+ __author__ = u'Tomasz D\u0142ugosz'
+ oldest_article = 7
+ max_articles_per_feed = 100
+
+ feeds = [(u'\u015awiat Kindle - wpisy', u'http://swiatkindle.pl/feed')]
+
+ remove_tags = [dict(name = 'ul', attrs = {'class' : 'similar-posts'})]
+
+    preprocess_regexps = [(re.compile(u'<h3>Czytaj dalej:</h3>'), lambda match: '')]
+
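Both Polish recipes rely on preprocess_regexps, a list of (compiled pattern, replacement callable) pairs that BasicNewsRecipe applies to the raw page HTML before parsing. A rough standalone equivalent of that mechanism (a sketch, not calibre's actual implementation):

    import re

    rules = [(re.compile(u'Komentarze: \(?[0-9]+\)? ?'), lambda match: '')]

    def apply_rules(html, rules):
        # Apply each (pattern, replacement) pair in order, as the recipe framework does.
        for pattern, repl in rules:
            html = pattern.sub(repl, html)
        return html
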
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 6cfe915036..4f3574559e 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -573,8 +573,8 @@ from calibre.devices.edge.driver import EDGE
from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O, STASH
from calibre.devices.sne.driver import SNE
-from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
- GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD, ALURATEK_COLOR, \
+from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, \
+ GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR, \
TREKSTOR, EEEREADER, NEXTBOOK
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
@@ -691,8 +691,6 @@ plugins += [
AVANT,
MENTOR,
SWEEX,
- Q600,
- KOGAN,
PDNOVEL,
SPECTRA,
GEMEI,
diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py
index 5f67e23d92..e9feacc67e 100644
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@@ -121,7 +121,8 @@ def enable_plugin(plugin_or_name):
config['enabled_plugins'] = ep
default_disabled_plugins = set([
- 'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers'
+ 'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers',
+ 'Kent District Library'
])
def is_disabled(plugin):
diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py
index e549a4a9fd..d74f727a0e 100644
--- a/src/calibre/devices/misc.py
+++ b/src/calibre/devices/misc.py
@@ -54,41 +54,24 @@ class AVANT(USBMS):
class SWEEX(USBMS):
# Identical to the Promedia
name = 'Sweex Device Interface'
- gui_name = 'Sweex'
- description = _('Communicate with the Sweex MM300')
+ gui_name = 'Sweex/Kogan/Q600/Wink'
+ description = _('Communicate with the Sweex/Kogan/Q600/Wink')
author = 'Kovid Goyal'
supported_platforms = ['windows', 'osx', 'linux']
# Ordered list of supported formats
- FORMATS = ['epub', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']
+ FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']
VENDOR_ID = [0x0525, 0x177f]
PRODUCT_ID = [0xa4a5, 0x300]
- BCD = [0x0319, 0x110]
+ BCD = [0x0319, 0x110, 0x325]
- VENDOR_NAME = 'SWEEX'
- WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOKREADER'
+ VENDOR_NAME = ['SWEEX', 'LINUX']
+ WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOKREADER', 'FILE-STOR_GADGET']
EBOOK_DIR_MAIN = ''
SUPPORTS_SUB_DIRS = True
-class Q600(SWEEX):
-
- name = 'Digma Q600 Device interface'
- gui_name = 'Q600'
- description = _('Communicate with the Digma Q600')
-
- BCD = [0x325]
- FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt']
-
-class KOGAN(SWEEX):
-
- name = 'Kogan Device Interface'
- gui_name = 'Kogan'
- description = _('Communicate with the Kogan')
- VENDOR_NAME = 'LINUX'
- WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'
- EBOOK_DIR_MAIN = 'Kogan eBooks'
class PDNOVEL(USBMS):
name = 'Pandigital Novel device interface'
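
The Q600 and KOGAN classes could be folded into SWEEX because USBMS drivers match devices against lists: any combination of VENDOR_ID/PRODUCT_ID/BCD, and any entry in VENDOR_NAME or WINDOWS_MAIN_MEM, may match. A hedged sketch of one driver covering several near-identical devices, following the field conventions visible above (the device itself is hypothetical):

    from calibre.devices.usbms.driver import USBMS

    class MYREADER(USBMS):
        name        = 'Example reader interface'  # hypothetical device
        gui_name    = 'Example'
        FORMATS     = ['epub', 'pdf', 'txt']
        VENDOR_ID   = [0x0525, 0x177f]
        PRODUCT_ID  = [0xa4a5, 0x300]
        BCD         = [0x0319, 0x110, 0x325]      # several firmware revisions
        VENDOR_NAME = ['SWEEX', 'LINUX']          # any entry may match
        WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOKREADER', 'FILE-STOR_GADGET']
        EBOOK_DIR_MAIN = ''
        SUPPORTS_SUB_DIRS = True
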
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index 6af058da7b..43f93807a1 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -350,6 +350,8 @@ class FB2MLizer(object):
# Number of blank lines above tag
try:
ems = int(round((float(style.marginTop) / style.fontSize) - 1))
+ if ems < 0:
+ ems = 0
except:
ems = 0
@@ -397,7 +399,7 @@ class FB2MLizer(object):
fb2_out += p_txt
tags += p_tag
fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
- if tag in ('br', 'hr') or ems:
+ if tag in ('br', 'hr') or ems >= 1:
if ems < 1:
multiplier = 1
else:
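
The fb2ml change clamps the blank-line count at zero: previously a zero or negative top margin produced a negative ems, and the old truthiness test treated -1 as true. Worked through with concrete numbers (Python 2 rounding):

    # marginTop = 32.0, fontSize = 16.0 -> int(round(32/16 - 1)) =  1 blank line
    # marginTop =  0.0, fontSize = 16.0 -> int(round(0/16  - 1)) = -1, now clamped to 0
    # With the clamp in place, `ems >= 1` and the old truthy `ems` agree again,
    # but the explicit comparison documents the intent.
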
diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py
index 58dd3f1d22..c87249ed39 100644
--- a/src/calibre/ebooks/metadata/amazon.py
+++ b/src/calibre/ebooks/metadata/amazon.py
@@ -205,7 +205,10 @@ def main(args=sys.argv):
open(cpath, 'wb').write(br.open_novisit(curl).read())
print 'Cover for', title, 'saved to', cpath
+ #import time
+ #st = time.time()
print get_social_metadata(title, None, None, isbn)
+ #print '\n\n', time.time() - st, '\n\n'
return 0
diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index 4f246b2b9a..667b4f4d7c 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -106,6 +106,9 @@ class MetadataSource(Plugin): # {{{
def join(self):
return self.worker.join()
+ def is_alive(self):
+ return self.worker.is_alive()
+
def is_customizable(self):
return True
@@ -251,7 +254,9 @@ class KentDistrictLibrary(MetadataSource): # {{{
name = 'Kent District Library'
metadata_type = 'social'
- description = _('Downloads series information from ww2.kdl.org')
+ description = _('Downloads series information from ww2.kdl.org. '
+ 'This website cannot handle large numbers of queries, '
+ 'so the plugin is disabled by default.')
def fetch(self):
if not self.title or not self.book_author:
diff --git a/src/calibre/ebooks/metadata/kdl.py b/src/calibre/ebooks/metadata/kdl.py
index 4eca49ad45..b0b961b603 100644
--- a/src/calibre/ebooks/metadata/kdl.py
+++ b/src/calibre/ebooks/metadata/kdl.py
@@ -5,7 +5,9 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import re, urllib, urlparse
+import re, urllib, urlparse, socket
+
+from mechanize import URLError
from calibre.ebooks.metadata.book.base import Metadata
from calibre import browser
@@ -17,7 +19,7 @@ URL = \
_ignore_starts = u'\'"'+u''.join(unichr(x) for x in range(0x2018, 0x201e)+[0x2032, 0x2033])
-def get_series(title, authors):
+def get_series(title, authors, timeout=60):
mi = Metadata(title, authors)
if title and title[0] in _ignore_starts:
title = title[1:]
@@ -39,7 +41,12 @@ def get_series(title, authors):
url = URL.format(author, title)
br = browser()
- raw = br.open(url).read()
+ try:
+ raw = br.open_novisit(url, timeout=timeout).read()
+ except URLError, e:
+ if isinstance(e.reason, socket.timeout):
+ raise Exception('KDL Server busy, try again later')
+ raise
if 'see the full results' not in raw:
return mi
raw = xml_to_unicode(raw)[0]
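
The kdl.py change bounds the request with a timeout and converts a socket-level timeout into a readable error. A hedged usage sketch of the revised function (Python 2, matching the module):

    from calibre.ebooks.metadata.kdl import get_series

    try:
        mi = get_series('Great Expectations', ['Charles Dickens'], timeout=10)
    except Exception, e:
        print e  # e.g. 'KDL Server busy, try again later'
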
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index e5490ef56e..74e184cc66 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -85,7 +85,8 @@ class Source(Plugin):
# Metadata API {{{
- def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+ def identify(self, log, result_queue, abort, title=None, authors=None,
+ identifiers={}, timeout=5):
'''
Identify a book by its title/author/isbn/etc.
@@ -98,6 +99,8 @@ class Source(Plugin):
:param authors: A list of authors of the book, can be None
:param identifiers: A dictionary of other identifiers, most commonly
{'isbn':'1234...'}
+ :param timeout: Timeout in seconds, no network request should hang for
+ longer than timeout.
:return: None if no errors occurred, otherwise a unicode representation
of the error suitable for showing to the user
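
For plugin authors, the new signature means identify() implementations should thread the timeout down to every network call. A minimal sketch of a conforming source; the URL and parsing are placeholders, not a real metadata service:

    from calibre import browser, as_unicode
    from calibre.ebooks.metadata.sources.base import Source

    class ExampleSource(Source):
        name = 'Example'  # hypothetical plugin

        def identify(self, log, result_queue, abort, title=None, authors=None,
                identifiers={}, timeout=5):
            if abort.is_set():
                return None
            br = browser()
            try:
                raw = br.open_novisit('http://example.com/q', timeout=timeout).read()
            except Exception, e:
                return as_unicode(e)
            # ... parse raw and result_queue.put() Metadata objects here ...
            return None
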
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index c59bbe6dc5..498c7574ea 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import time
from urllib import urlencode
from functools import partial
-from threading import Thread
from lxml import etree
@@ -18,6 +17,7 @@ from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.date import parse_date, utcnow
+from calibre.utils.cleantext import clean_ascii_chars
from calibre import browser, as_unicode
NAMESPACES = {
@@ -41,20 +41,20 @@ subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
-def get_details(browser, url):
+def get_details(browser, url, timeout):
try:
- raw = browser.open_novisit(url).read()
+ raw = browser.open_novisit(url, timeout=timeout).read()
except Exception as e:
gc = getattr(e, 'getcode', lambda : -1)
if gc() != 403:
raise
# Google is throttling us, wait a little
- time.sleep(2)
- raw = browser.open_novisit(url).read()
+ time.sleep(1)
+ raw = browser.open_novisit(url, timeout=timeout).read()
return raw
-def to_metadata(browser, log, entry_):
+def to_metadata(browser, log, entry_, timeout):
def get_text(extra, x):
try:
@@ -79,8 +79,9 @@ def to_metadata(browser, log, entry_):
mi = Metadata(title_, authors)
try:
- raw = get_details(browser, id_url)
- feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0])
+ raw = get_details(browser, id_url, timeout)
+ feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
+ strip_encoding_pats=True)[0])
extra = entry(feed)[0]
except:
log.exception('Failed to get additional details for', mi.title)
@@ -131,26 +132,19 @@ def to_metadata(browser, log, entry_):
return mi
-class Worker(Thread):
- def __init__(self, log, entries, abort, result_queue):
- self.browser, self.log, self.entries = browser(), log, entries
- self.abort, self.result_queue = abort, result_queue
- Thread.__init__(self)
- self.daemon = True
-
- def run(self):
- for i in self.entries:
- try:
- ans = to_metadata(self.browser, self.log, i)
- if isinstance(ans, Metadata):
- self.result_queue.put(ans)
- except:
- self.log.exception(
- 'Failed to get metadata for identify entry:',
- etree.tostring(i))
- if self.abort.is_set():
- break
+def get_all_details(br, log, entries, abort, result_queue, timeout):
+ for i in entries:
+ try:
+ ans = to_metadata(br, log, i, timeout)
+ if isinstance(ans, Metadata):
+ result_queue.put(ans)
+ except:
+ log.exception(
+ 'Failed to get metadata for identify entry:',
+ etree.tostring(i))
+ if abort.is_set():
+ break
class GoogleBooks(Source):
@@ -192,54 +186,40 @@ class GoogleBooks(Source):
})
- def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+ def identify(self, log, result_queue, abort, title=None, authors=None,
+ identifiers={}, timeout=5):
query = self.create_query(log, title=title, authors=authors,
identifiers=identifiers)
+ br = browser()
try:
- raw = browser().open_novisit(query).read()
+ raw = br.open_novisit(query, timeout=timeout).read()
except Exception, e:
log.exception('Failed to make identify query: %r'%query)
return as_unicode(e)
try:
parser = etree.XMLParser(recover=True, no_network=True)
- feed = etree.fromstring(xml_to_unicode(raw,
+ feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
strip_encoding_pats=True)[0], parser=parser)
entries = entry(feed)
except Exception, e:
log.exception('Failed to parse identify results')
return as_unicode(e)
-
- groups = self.split_jobs(entries, 5) # At most 5 threads
- if not groups:
- return None
- workers = [Worker(log, entries, abort, result_queue) for entries in
- groups]
-
- if abort.is_set():
- return None
-
- for worker in workers: worker.start()
-
- has_alive_worker = True
- while has_alive_worker and not abort.is_set():
- time.sleep(0.1)
- has_alive_worker = False
- for worker in workers:
- if worker.is_alive():
- has_alive_worker = True
+ # There is no point running these queries in threads as google
+ # throttles requests returning Forbidden errors
+ get_all_details(br, log, entries, abort, result_queue, timeout)
return None
if __name__ == '__main__':
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
- isbn_test)
+ title_test)
test_identify_plugin(GoogleBooks.name,
[
(
{'title': 'Great Expectations', 'authors':['Charles Dickens']},
- [isbn_test('9781607541592')]
+ [title_test('Great Expectations', exact=True)]
),
])
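
Since the Google queries now run serially, the only throttling defence left is the single sleep-and-retry in get_details. If that proves too optimistic, a small backoff helper is one possible extension (a sketch, not calibre code):

    import time

    def fetch_with_backoff(br, url, timeout, tries=3):
        delay = 1
        for attempt in range(tries):
            try:
                return br.open_novisit(url, timeout=timeout).read()
            except Exception as e:
                gc = getattr(e, 'getcode', lambda: -1)
                if gc() != 403 or attempt == tries - 1:
                    raise
                time.sleep(delay)  # Google is throttling us; wait and retry
                delay *= 2
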
diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py
index cd7e7ab6e8..3b41e69d40 100644
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import os, tempfile
+import os, tempfile, time
from Queue import Queue, Empty
from threading import Event
@@ -26,6 +26,17 @@ def isbn_test(isbn):
return test
+def title_test(title, exact=False):
+
+ title = title.lower()
+
+ def test(mi):
+ mt = mi.title.lower()
+ return (exact and mt == title) or \
+ (not exact and title in mt)
+
+ return test
+
def test_identify_plugin(name, tests):
'''
:param name: Plugin name
@@ -48,11 +59,15 @@ def test_identify_plugin(name, tests):
abort = Event()
prints('Log saved to', lf)
+ times = []
for kwargs, test_funcs in tests:
prints('Running test with:', kwargs)
rq = Queue()
args = (log, rq, abort)
+ start_time = time.time()
err = plugin.identify(*args, **kwargs)
+ total_time = time.time() - start_time
+ times.append(total_time)
if err is not None:
prints('identify returned an error for args', args)
prints(err)
@@ -87,6 +102,8 @@ def test_identify_plugin(name, tests):
prints('Log saved to', lf)
raise SystemExit(1)
+ prints('Average time per query', sum(times)/len(times))
+
if os.stat(lf).st_size > 10:
prints('There were some errors, see log', lf)
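
The new title_test predicate composes with the existing isbn_test, so a single plugin run can mix both kinds of checks. An illustrative test table (the ISBN entry is an example, not taken from this patch):

    test_identify_plugin(GoogleBooks.name, [
        ({'title': 'Great Expectations', 'authors': ['Charles Dickens']},
         [title_test('Great Expectations', exact=True)]),
        ({'identifiers': {'isbn': '9781607541592'}},
         [isbn_test('9781607541592')]),
    ])
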
diff --git a/src/calibre/ebooks/metadata/xisbn.py b/src/calibre/ebooks/metadata/xisbn.py
index 2ee74396c7..aaeb1c6b98 100644
--- a/src/calibre/ebooks/metadata/xisbn.py
+++ b/src/calibre/ebooks/metadata/xisbn.py
@@ -11,6 +11,12 @@ from calibre import browser
class xISBN(object):
+ '''
+ This class is used to find the ISBN numbers of "related" editions of a
+ book, given its ISBN. Useful when querying services for metadata by ISBN,
+ in case they do not have the ISBN for the particular edition.
+ '''
+
QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'
def __init__(self):
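
A hedged usage sketch for the newly documented class; this assumes the module exposes a singleton instance named xisbn and a method returning related ISBNs (the method name below is an assumption, check the module before relying on it):

    from calibre.ebooks.metadata.xisbn import xisbn

    # ISBN is illustrative; result is ISBNs of other editions of the same book
    related = xisbn.get_associated_isbns('9780441017836')
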
diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py
index 49a7a4677a..f99e48eb2b 100644
--- a/src/calibre/gui2/actions/add.py
+++ b/src/calibre/gui2/actions/add.py
@@ -259,6 +259,7 @@ class AddAction(InterfaceAction):
if hasattr(self.gui, 'db_images'):
self.gui.db_images.reset()
self.gui.tags_view.recount()
+
if getattr(self._adder, 'merged_books', False):
books = u'\n'.join([x if isinstance(x, unicode) else
x.decode(preferred_encoding, 'replace') for x in
@@ -266,6 +267,17 @@ class AddAction(InterfaceAction):
info_dialog(self.gui, _('Merged some books'),
_('The following duplicate books were found and incoming book formats were '
'processed and merged into your Calibre database according to your automerge settings:'), det_msg=books, show=True)
+
+ if getattr(self._adder, 'number_of_books_added', 0) > 0 or \
+ getattr(self._adder, 'merged_books', False):
+ # The formats of the current book could have changed if
+ # automerge is enabled
+ current_idx = self.gui.library_view.currentIndex()
+ if current_idx.isValid():
+ self.gui.library_view.model().current_changed(current_idx,
+ current_idx)
+
+
if getattr(self._adder, 'critical', None):
det_msg = []
for name, log in self._adder.critical.items():