diff --git a/recipes/daily_mirror.recipe b/recipes/daily_mirror.recipe index 8bac57951c..ebced64283 100644 --- a/recipes/daily_mirror.recipe +++ b/recipes/daily_mirror.recipe @@ -1,13 +1,13 @@ - from calibre.web.feeds.news import BasicNewsRecipe +from calibre import browser import re -import mechanize + class AdvancedUserRecipe1306061239(BasicNewsRecipe): title = u'The Daily Mirror' - description = 'News as provide by The Daily Mirror -UK' + description = 'News as provided by The Daily Mirror -UK' __author__ = 'Dave Asbury' - # last updated 7/4/12 + # last updated 28/4/12 language = 'en_GB' #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg' @@ -15,89 +15,80 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe): oldest_article = 1 - max_articles_per_feed = 10 + max_articles_per_feed = 12 remove_empty_feeds = True remove_javascript = True no_stylesheets = True - auto_cleanup = True + # auto_cleanup = True #conversion_options = { 'linearize_tables' : True } - #keep_only_tags = [ - # dict(name='h1'), - # dict(name='div',attrs={'id' : 'body-content'}), - #dict(name='div',atts={'class' : 'article-body'}), + + keep_only_tags = [ dict(name='h1'), + dict(name='div',attrs={'class' : 'lead-text'}), + dict(name='div',attrs={'class' : 'styleGroup clearfix'}), + dict(name='div',attrs={'class' : 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}), + dict(name='figure',attrs={'class' : 'clearfix'}), + dict(name='div',attrs={'class' :'body '}), + #dict(attrs={'class' : ['article-attr','byline append-1','published']}), #dict(name='p'), - # ] + ] - #remove_tags_after = [dict (name='div',attrs={'class' : 'related'})] remove_tags = [ + dict(attrs={'class' : 'comment'}), dict(name='title'), - dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}), - # dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}), - #dict(name='div',attrs={'class' :['inline-ad span-16 
last','article-resize','related','list teasers']}), - #dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}), - ] - - # preprocess_regexps = [ - #(re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match: '')] - preprocess_regexps = [ - (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')] + dict(name='ul',attrs={'class' : 'clearfix breadcrumbs '}), + dict(name='ul',attrs={'id' : 'login-201109171215'}), + dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),#'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit' + ] preprocess_regexps = [ - (re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')] + (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')] - #preprocess_regexps = [ - #(re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')] feeds = [ + (u'News',u'http://www.mirror.co.uk/news/rss.xml'), + (u'Sports',u'http://www.mirror.co.uk/sport/rss.xml'), + (u'3AM',u'http://www.mirror.co.uk/3am/rss.xml'), + (u'Lifestyle',u'http://www.mirror.co.uk/lifestyle/rss.xml') - (u'UK News', u'http://feed43.com/0287771688643868.xml') - ,(u'Tech News', u'http://feed43.com/2455520588350501.xml') - ,(u'Weird World','http://feed43.com/0863800333634654.xml') - ,(u'Sport','http://feed43.com/7713243036546130.xml') - ,(u'Sport : Boxing ','http://feed43.com/0414732220804255.xml') - ,(u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml') - ,(u'Sport : Other','http://feed43.com/4501416886323415.xml') - ,(u'TV and Film','http://feed43.com/5238302853765104.xml') - ,(u'Celebs','http://feed43.com/8770061048844683.xml') - ,(u'Life Style : Family','http://feed43.com/4356170742410338.xml') - ,(u'Travel','http://feed43.com/1436576006476607.xml') # example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml') ] extra_css = ''' - body{ text-align: justify; 
font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} - h1{ font-size:18px;} - img { display:block} - ''' + h1{ font-size:medium;} + body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} + img { display:block} + '''# def get_cover_url(self): soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html') - # look for the block containing the mirror button and url +# look for the block containing the mirror button and url cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'}) cov2 = str(cov) cov2='http://www.politicshome.com'+cov2[9:-142] - #cov2 now contains url of the page containing pic +#cov2 now contains url of the page containing pic soup = self.index_to_soup(cov2) cov = soup.find(attrs={'id' : 'large'}) cov2 = str(cov) cov2=cov2[27:-18] #cov2 now is pic url, now go back to original function - br = mechanize.Browser() + br = browser() br.set_handle_redirect(False) try: - br.open_novisit(cov2) - cover_url = cov2 + br.open_novisit(cov2) + cover_url = cov2 except: - cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg' + cover_url ='http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg' + # print '******** string is ', cov2,' ***' #cover_url = cov2 #cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' return cover_url + diff --git a/recipes/el_mundo_today.recipe b/recipes/el_mundo_today.recipe new file mode 100644 index 0000000000..77a9f331a0 --- /dev/null +++ b/recipes/el_mundo_today.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ElMundoTodayRecipe(BasicNewsRecipe): + title = 'El Mundo Today' + __author__ = 'atordo' + description = u'La 
actualidad del mañana' + category = 'Noticias, humor' + cover_url = 'http://www.elmundotoday.com/wp-content/themes/EarthlyTouch/images/logo.png' + oldest_article = 30 + max_articles_per_feed = 30 + auto_cleanup = True + no_stylesheets = True + language = 'es' + use_embedded_content = True + + feeds = [('El Mundo Today', 'http://www.elmundotoday.com/feed/')] + + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + br.set_handle_gzip(True) + return br diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe index ea1823e1cc..7866c89861 100644 --- a/recipes/ming_pao.recipe +++ b/recipes/ming_pao.recipe @@ -18,16 +18,21 @@ __IncludeThumbnails__ = True __UseLife__ = True # (HK only) It is to disable premium content (Default: False) __InclPremium__ = False -# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True) -__ParsePFF__ = True +# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: False) +__ParsePF__ = False +# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with text formats (Default: True) -- override __ParsePF__ +__ParseTxt__ = True +# (HK only) Use mobile text version for some articles (Default: False) +__ParseSelectedMobile__ = False # (HK only) Turn below to True if you wish hi-res images (Default: False) __HiResImg__ = False -# Override the date returned by the program if specifying a YYYYMMDD below +# Override the date returned by the program if specifying a YYYYMMDD below (not work if __ParseSelectedMobile__ is True and __UseLife__ is False) __Date__ = '' ''' Change Log: +2012/04/24: improved parsing of news.mingpao.com content 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. 
Start new day download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. @@ -81,6 +86,7 @@ class MPRecipe(BasicNewsRecipe): category = 'Chinese, News, Hong Kong' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' + remove_tags_before = dict(name='font', attrs={'color':['navy']}) keep_only_tags = [dict(name='h1'), dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(name='font', attrs={'color':['AA0000']}), # for column articles title @@ -91,13 +97,17 @@ class MPRecipe(BasicNewsRecipe): dict(attrs={'class':['photo']}), dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com - dict(attrs={'class':['images']}) # for images from txt + dict(attrs={'class':['images']}), # for images from txt + dict(name='table', attrs={'width':['100%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']}) # content table in pda site ] if __KeepImages__: remove_tags = [dict(name='style'), dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article + dict(name='img', attrs={'alt':["明報網站", "按此列印", "關閉本視窗"]}), # non-article images in life.mingpao.com article + dict(name='img', attrs={'src':["../image/top_2.gif"]}) #dict(name='table') # for content fetched from life.mingpao.com + #dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']}) ] else: 
remove_tags = [dict(name='style'), @@ -105,6 +115,7 @@ class MPRecipe(BasicNewsRecipe): dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article dict(name='img'), #dict(name='table') # for content fetched from life.mingpao.com + #dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']}) ] remove_attributes = ['width'] preprocess_regexps = [ @@ -118,7 +129,15 @@ class MPRecipe(BasicNewsRecipe): (re.compile(r"

", re.DOTALL|re.IGNORECASE), lambda match: "
"), (re.compile(r"

", re.DOTALL|re.IGNORECASE), - lambda match: "") + lambda match: ""), + (re.compile(r'


', re.DOTALL|re.IGNORECASE), + lambda match: ''), + (re.compile(r'', re.DOTALL|re.IGNORECASE), + lambda match: ''), + (re.compile(r'', re.DOTALL|re.IGNORECASE), + lambda match: ''), + #(re.compile(r'[.+?]', re.DOTALL|re.IGNORECASE), + #lambda match: '') ] elif __Region__ == 'Vancouver': if __UseChineseTitle__ == True: @@ -221,6 +240,10 @@ class MPRecipe(BasicNewsRecipe): return __Date__[6:8] else: return self.get_dtlocal().strftime("%d") + + # Note: does not work with custom date given by __Date__ + def get_weekday(self): + return self.get_dtlocal().weekday() def get_cover_url(self): if __Region__ == 'Hong Kong': @@ -260,7 +283,23 @@ class MPRecipe(BasicNewsRecipe): if articles: feeds.append((title, articles)) - if __InclPremium__ == True: +# if __InclPremium__ == True: +# # parse column section articles directly from .txt files +# for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') +# ]: +# articles = self.parse_section2_txt(url, keystr) +# if articles: +# feeds.append((title, articles)) +# +# for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), +# (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: +# articles = self.parse_section(url) +# if articles: +# feeds.append((title, articles)) + + # new + if __InclPremium__ == True and (self.get_weekday() <> 6 or __ParseSelectedMobile__ == False): + # if both not on Sunday and not __ParseSelectedMobile__, go ahead # parse column section articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') ]: @@ -268,17 +307,45 @@ class MPRecipe(BasicNewsRecipe): if articles: feeds.append((title, articles)) - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 
'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) + if __InclPremium__ == False or self.get_weekday() <> 6: + for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: + if __ParseTxt__ == False: + articles = self.parse_section(url) + else: + articles = self.parse_section_txt(url, seckey) + if articles: + feeds.append((title, articles)) + else: + if __InclPremium__ == True and __ParseSelectedMobile__ == True: + articles = self.parse_section_mobile('http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1') + if articles: + feeds.append((u'\u526f\u520a Supplement', articles)) + else: + for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: + if __ParseTxt__ == False: + articles = self.parse_section(url) + else: + articles = self.parse_section_txt(url, seckey) + if articles: + feeds.append((title, articles)) + + for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]: + if __ParseTxt__ == False: + articles = self.parse_section(url) + else: + articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) + # end of new else: - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), - (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), - (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]: - articles = self.parse_section(url) + for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'), + (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm', 'gb'), + (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' 
+ dateStr + '/gfindex.htm', 'gf'), + (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]: + if __ParseTxt__ == False: + articles = self.parse_section(url) + else: + articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) @@ -287,10 +354,13 @@ class MPRecipe(BasicNewsRecipe): #if ed_articles: # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), - (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), - (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: - articles = self.parse_section(url) + for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm', 'fa'), + (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm', 'ca'), + (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]: + if __ParseTxt__ == False: + articles = self.parse_section(url) + else: + articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) @@ -322,7 +392,9 @@ class MPRecipe(BasicNewsRecipe): if articles: feeds.append((title, articles)) - if __InclPremium__ == True: + + if __InclPremium__ == True and (self.get_weekday() <> 6 or __ParseSelectedMobile__ == False): + # if both not on Sunday or not __ParseSelectedMobile__, go ahead # parse column section articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') ]: @@ -330,12 +402,36 @@ class MPRecipe(BasicNewsRecipe): if articles: feeds.append((title, articles)) - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: 
- articles = self.parse_section(url) + if __InclPremium__ == False or self.get_weekday() <> 6: + for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: + if __ParseTxt__ == False: + articles = self.parse_section(url) + else: + articles = self.parse_section_txt(url, seckey) + if articles: + feeds.append((title, articles)) + else: + if __InclPremium__ == True and __ParseSelectedMobile__ == True: + articles = self.parse_section_mobile('http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1') + if articles: + feeds.append((u'\u526f\u520a Supplement', articles)) + else: + for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: + if __ParseTxt__ == False: + articles = self.parse_section(url) + else: + articles = self.parse_section_txt(url, seckey) + if articles: + feeds.append((title, articles)) + + for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]: + if __ParseTxt__ == False: + articles = self.parse_section(url) + else: + articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) - + elif __Region__ == 'Vancouver': for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'), @@ -366,7 +462,7 @@ class MPRecipe(BasicNewsRecipe): feeds.append((title, articles)) return feeds - # parse from news.mingpao.com + # parse from news.mingpao.com (web html) def parse_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) @@ -379,17 +475,57 @@ class MPRecipe(BasicNewsRecipe): title = self.tag_to_string(a) url = a.get('href', False) url = 'http://news.mingpao.com/' + dateStr + '/' +url - # replace the url to the print-friendly version - if __ParsePFF__ == True: + 
# replace the url to the alternative version + if __ParsePF__ == True: + # printer-friendly option if url.rfind('Redirect') <> -1 and __InclPremium__ == True: url = re.sub(dateStr + '.*' + dateStr, dateStr, url) url = re.sub('%2F.*%2F', '/', url) - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') + if __InclPremium__ == True: + title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') url = url.replace('%2Etxt', '_print.htm') url = url.replace('%5F', '_') else: url = url.replace('.htm', '_print.htm') - if url not in included_urls and url.rfind('Redirect') == -1: + #if url not in included_urls and url.rfind('Redirect') == -1 and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): + if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): + current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + + # parse from news.mingpao.com (txt) + def parse_section_txt(self, url, ch): + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']}) + current_articles = [] + included_urls = [] + divs.reverse() + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + #print 'Base url: ', url + # replace the url to the alternative version + # text version + if url.rfind('Redirect') <> -1: + url = 'http://news.mingpao.com/' + dateStr + '/' +url + #print 'original url: ', url + url = re.sub(dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url) + url = re.sub('%2F', '/', url) + if __InclPremium__ == True: + title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') + url = url.replace('%2Etxt', '.txt') + url = url.replace('%5F', '_') + else: + # get the first two char in url as ch + seckey = url[0:2] + url = url.replace('.htm', '.txt') + url = 
'http://news.mingpao.com/ftp/WebNews2/' + dateStr + '/' + ch + '/' + seckey + '/' + url + #print 'updated url: ', url + if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): + #if url not in included_urls and (url.rfind('Redirect') == -1) and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) included_urls.append(url) current_articles.reverse() @@ -415,7 +551,7 @@ class MPRecipe(BasicNewsRecipe): current_articles.append({'title': title, 'url': url, 'description': ''}) included_urls.append(url) except: - print 'skipping a premium article' + print 'skipping a premium article' current_articles.reverse() return current_articles @@ -437,6 +573,20 @@ class MPRecipe(BasicNewsRecipe): current_articles.reverse() return current_articles + # parse from mobile version + def parse_section_mobile(self, base, page): + soup = self.index_to_soup(base + '/' + page) + a = soup.findAll('a', href=True) + current_articles = [] + included_urls = [] + for i in a: + title = self.tag_to_string(i) + url = i.get('href', False) + if url not in included_urls and url.rfind('HotNews2.cfm') <> -1: + current_articles.append({'title': title, 'url': base + '/' + url, 'description': ''}) + included_urls.append(url) + return current_articles + # parse from www.mingpaovan.com def parse_section3(self, url, baseUrl): self.get_fetchdate() @@ -631,15 +781,22 @@ class MPRecipe(BasicNewsRecipe): br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) new_html = new_html.replace(img, gifimg) except: - # find the location of the first _ - pos = img.find('_') - if pos > -1: - # if found, insert _ after the first _ - newimg = img[0:pos] + '_' + img[pos:] - new_html = new_html.replace(img, newimg) + if __ParseTxt__ == False: + # find the location of the first _ + pos = img.find('_') + if pos > -1: + # if found, insert _ after the first _ + newimg = 
img[0:pos] + '_' + img[pos:] + new_html = new_html.replace(img, newimg) + else: + # if not found, insert _ after " + new_html = new_html.replace(img[1:], '"_' + img[1:]) else: - # if not found, insert _ after " - new_html = new_html.replace(img[1:], '"_' + img[1:]) + # insert to front + #print 'imgstr: ', img + pos = img.find('_') + new_html = new_html.replace(img[5:], '_' + img[5:]) + elif url.rfind('life.mingpao.com') > -1: imglist = re.findall('src=\'?.*?jpg\'', new_html) br = mechanize.Browser() @@ -673,9 +830,13 @@ class MPRecipe(BasicNewsRecipe): newimg = img[0:pos+1] + '_' + img[pos+1:] #print 'Use hi-res img', newimg new_html = new_html.replace(img, newimg) + # test + #print new_html return new_html def preprocess_html(self, soup): + for mobiletitle in soup.findAll('font', attrs={'color': ['navy']}): + mobiletitle.name = 'h1' for item in soup.findAll(style=True): del item['style'] for item in soup.findAll(style=True): @@ -909,3 +1070,4 @@ class MPRecipe(BasicNewsRecipe): opf.render(opf_file, ncx_file) + diff --git a/recipes/monbiot.recipe b/recipes/monbiot.recipe new file mode 100644 index 0000000000..5cc50c24d1 --- /dev/null +++ b/recipes/monbiot.recipe @@ -0,0 +1,43 @@ +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +www.monbiot.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class GeorgeMonbiot(BasicNewsRecipe): + title = 'George Monbiot - blog' + __author__ = 'Darko Miletic' + description = 'Tell people something they know already and they will thank you for it. Tell people something new and they will hate you for it.' 
+ publisher = 'George Monbiot' + category = 'news, politics, UK, World' + oldest_article = 15 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en_GB' + remove_empty_feeds = True + publication_type = 'blog' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [ + dict(name=['meta','link']), + dict(attrs={'class':'shareinpost'}), + dict(attrs={'id':'paging'}) + ] + remove_attributes=['lang'] + keep_only_tags=[dict(attrs={'id':'content'})] + + feeds = [(u'Articles', u'http://www.monbiot.com/feed/atom/')] diff --git a/recipes/newsweek_polska.recipe b/recipes/newsweek_polska.recipe index c8c53af655..4625eb89e6 100644 --- a/recipes/newsweek_polska.recipe +++ b/recipes/newsweek_polska.recipe @@ -2,20 +2,25 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2010, matek09, matek09@gmail.com' +__copyright__ = '2010, matek09, matek09@gmail.com; 2012, admroz, a.rozewicki@gmail.com' from calibre.web.feeds.news import BasicNewsRecipe from calibre.ptempfile import PersistentTemporaryFile +from string import capwords import datetime class Newsweek(BasicNewsRecipe): + + # how many issues to go back, 0 means get the most current one + BACK_ISSUES = 1 + EDITION = '0' DATE = None YEAR = datetime.datetime.now().year title = u'Newsweek Polska' - __author__ = 'matek09' + __author__ = 'matek09, admroz' description = 'Weekly magazine' encoding = 'utf-8' language = 'pl' @@ -25,6 +30,9 @@ class Newsweek(BasicNewsRecipe): articles_are_obfuscated = True + # + # Parses each article + # def get_obfuscated_article(self, url): br = self.get_browser() br.open(url) @@ -37,7 +45,28 @@ class Newsweek(BasicNewsRecipe): info = main_section.find('ul', attrs={'class' : 'articleInfo'}) authors = 
info.find('li').find('h4') article = main_section.find('div', attrs={'id' : 'article'}) - html = unicode(title) + unicode(authors) + unicode(article) + + # remove related articles box + related = article.find('div', attrs={'class' : 'relatedBox'}) + if related is not None: + related.extract() + + # remove div with social networking links and links to + # other articles in web version + for div in article.findAll('div'): + if div.find('span', attrs={'class' : 'google-plus'}): + div.extract() + + for p in div.findAll('p'): + if p.find('span', attrs={'style' : 'color: rgb(255, 0, 0);'}): + p.extract() + continue + for a in p.findAll('a'): + if a.find('span', attrs={'style' : 'font-size: larger;'}): + a.extract() + + + html = unicode(title) + unicode(authors) + unicode(article) next = main_section.find('li', attrs={'class' : 'next'}) while next: @@ -58,33 +87,35 @@ class Newsweek(BasicNewsRecipe): self.temp_files[-1].write(html) self.temp_files[-1].close() return self.temp_files[-1].name - - def is_full(self, issue_soup): - while True: - main_section = issue_soup.find(id='mainSection') - next = main_section.find('li', attrs={'class' : 'next'}) - if len(main_section.findAll(attrs={'class' : 'locked'})) > 1: - return False - elif next is None: - return True - else: - issue_soup = self.index_to_soup(next.find('a')['href']) - def find_last_full_issue(self, archive_url): + + # + # Goes back given number of issues. 
It also knows how to go back + # to the previous year if there are not enough issues in the current one + # + def find_last_issue(self, archive_url): archive_soup = self.index_to_soup(archive_url) select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'}) - for option in select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value')): + options = select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value')) + + # check if need to go back to previous year + if len(options) > self.BACK_ISSUES: + option = options[self.BACK_ISSUES]; self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/','') issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION) - if self.is_full(issue_soup): - return - - self.YEAR = self.YEAR - 1 - self.find_last_full_issue(archive_url + ',' + str(self.YEAR)) - + else: + self.BACK_ISSUES = self.BACK_ISSUES - len(options) + self.YEAR = self.YEAR - 1 + self.find_last_issue(archive_url + ',' + str(self.YEAR)) + + + # + # Looks for the last issue which we want to download. 
Then goes on each + # section and article and stores them (assigning to sections) + # def parse_index(self): archive_url = 'http://www.newsweek.pl/wydania/archiwum' - self.find_last_full_issue(archive_url) + self.find_last_issue(archive_url) soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION) self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'})) main_section = soup.find(id='mainSection') @@ -93,32 +124,44 @@ class Newsweek(BasicNewsRecipe): feeds = [] articles = {} sections = [] - while True: - news_list = main_section.find('ul', attrs={'class' : 'newsList'}) - for h2 in news_list.findAll('h2'): + + news_list = main_section.find('ul', attrs={'class' : 'newsList'}) + section = 'Inne' + + for li in news_list.findAll('li'): + h3 = li.find('h3') + if h3 is not None: + section = capwords(self.tag_to_string(h3)) + continue + else: + h2 = li.find('h2') + if h2 is not None: + article = self.create_article(h2) + if article is None : + continue - article = self.create_article(h2) - category_div = h2.findNext('div', attrs={'class' : 'kategorie'}) - section = self.tag_to_string(category_div) - if articles.has_key(section): - articles[section].append(article) - else: - articles[section] = [article] - sections.append(section) + if articles.has_key(section): + articles[section].append(article) + else: + articles[section] = [article] + sections.append(section) - next = main_section.find('li', attrs={'class' : 'next'}) - if next is None: - break - soup = self.index_to_soup(next.find('a')['href']) - main_section = soup.find(id='mainSection') for section in sections: feeds.append((section, articles[section])) return feeds + + # + # Creates each article metadata (skips locked ones). The content will + # be extracted later by other method (get_obfuscated_article). 
+ # def create_article(self, h2): article = {} a = h2.find('a') + if a is None: + return None + article['title'] = self.tag_to_string(a) article['url'] = a['href'] article['date'] = self.DATE @@ -129,7 +172,3 @@ class Newsweek(BasicNewsRecipe): else: article['description'] = '' return article - - - - diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe index db74e003a0..11500430ff 100644 --- a/recipes/the_sun.recipe +++ b/recipes/the_sun.recipe @@ -1,12 +1,14 @@ -import re, mechanize +import re, random + +from calibre import browser from calibre.web.feeds.recipes import BasicNewsRecipe + class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'The Sun UK' - description = 'A Recipe for The Sun tabloid UK' __author__ = 'Dave Asbury' - # last updated 7/4/12 + # last updated 29/4/12 language = 'en_GB' oldest_article = 1 max_articles_per_feed = 15 @@ -48,12 +50,10 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): feeds = [ - (u'News','http://feed43.com/2517447382644748.xml'), - (u'Sport', u'http://feed43.com/4283846255668687.xml'), - (u'Bizarre', u'http://feed43.com/0233840304242011.xml'), - (u'Film',u'http://feed43.com/1307545221226200.xml'), - (u'Music',u'http://feed43.com/1701513435064132.xml'), - (u'Sun Woman',u'http://feed43.com/0022626854226453.xml'), + (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'), + (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'), + (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'), + (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'), ] def get_cover_url(self): @@ -61,14 +61,11 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): # look for the block containing the sun button and url cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'}) - - #cov = soup.find(attrs={'id' : 'large'}) cov2 = str(cov) cov2='http://www.politicshome.com'+cov2[9:-133] #cov2 now contains url of the page containing 
pic - #cov2 now contains url of the page containing pic soup = self.index_to_soup(cov2) cov = soup.find(attrs={'id' : 'large'}) @@ -76,16 +73,21 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): cov2=cov2[27:-18] #cov2 now is pic url, now go back to original function - br = mechanize.Browser() + br = browser() br.set_handle_redirect(False) try: br.open_novisit(cov2) cover_url = cov2 except: - cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' + cover_url = random.choice(( + 'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg' + ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg' + ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg' + ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg' + ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg' + )) - #cover_url = cov2 - #cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' return cover_url + diff --git a/recipes/vice_magazine.recipe b/recipes/vice_magazine.recipe new file mode 100644 index 0000000000..262c09269c --- /dev/null +++ b/recipes/vice_magazine.recipe @@ -0,0 +1,17 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from calibre.web.feeds.news import BasicNewsRecipe + +class ViceESRecipe(BasicNewsRecipe): + title = u'Vice Magazine España' + __author__ = 'atordo' + description = u'La página web oficial de la revista Vice España' + category = u'noticias, fotografía, blogs, moda, arte, cine, música, literatura, tecnología' + cover_url = 'http://www.seeklogo.com/images/V/Vice-logo-668578AC94-seeklogo.com.gif' + oldest_article = 20 + max_articles_per_feed = 30 + auto_cleanup = True + no_stylesheets = True + language = 'es' + + feeds = [('Vice', 'http://www.vice.com/es/rss')] + diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e8d5c5fc91..f6f8c83666 100644 --- 
a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -445,7 +445,7 @@ class LRFMetadataWriter(MetadataWriterPlugin): class MOBIMetadataWriter(MetadataWriterPlugin): name = 'Set MOBI metadata' - file_types = set(['mobi', 'prc', 'azw', 'azw4']) + file_types = set(['mobi', 'prc', 'azw', 'azw3', 'azw4']) description = _('Set metadata in %s files')%'MOBI' author = 'Marshall T. Vandegrift' @@ -539,7 +539,8 @@ from calibre.ebooks.conversion.plugins.epub_output import EPUBOutput from calibre.ebooks.conversion.plugins.fb2_output import FB2Output from calibre.ebooks.conversion.plugins.lit_output import LITOutput from calibre.ebooks.conversion.plugins.lrf_output import LRFOutput -from calibre.ebooks.conversion.plugins.mobi_output import MOBIOutput +from calibre.ebooks.conversion.plugins.mobi_output import (MOBIOutput, + AZW3Output) from calibre.ebooks.conversion.plugins.oeb_output import OEBOutput from calibre.ebooks.conversion.plugins.pdb_output import PDBOutput from calibre.ebooks.conversion.plugins.pdf_output import PDFOutput @@ -580,7 +581,7 @@ plugins += [ FB2Output, LITOutput, LRFOutput, - MOBIOutput, + MOBIOutput, AZW3Output, OEBOutput, PDBOutput, PDFOutput, @@ -1253,6 +1254,15 @@ class StoreBeWriteStore(StoreBase): headquarters = 'US' formats = ['EPUB', 'MOBI', 'PDF'] +class StoreBiblioStore(StoreBase): + name = u'Библио.бг' + author = 'Alex Stanev' + description = u'Електронна книжарница за книги и списания във формати ePUB и PDF. Част от заглавията са с активна DRM защита.' 
+ actual_plugin = 'calibre.gui2.store.stores.biblio_plugin:BiblioStore' + + headquarters = 'BG' + formats = ['EPUB, PDF'] + class StoreBookotekaStore(StoreBase): name = 'Bookoteka' author = u'Tomasz Długosz' @@ -1596,6 +1606,7 @@ plugins += [ StoreBNStore, StoreBeamEBooksDEStore, StoreBeWriteStore, + StoreBiblioStore, StoreBookotekaStore, StoreChitankaStore, StoreDieselEbooksStore, diff --git a/src/calibre/debug.py b/src/calibre/debug.py index f5f803ec84..f2ae5d8eaf 100644 --- a/src/calibre/debug.py +++ b/src/calibre/debug.py @@ -54,6 +54,15 @@ Run an embedded python interpreter. parser.add_option('-m', '--inspect-mobi', action='store_true', default=False, help='Inspect the MOBI file(s) at the specified path(s)') + parser.add_option('--tweak-book', default=None, + help='Tweak the book (exports the book as a collection of HTML ' + 'files and metadata, which you can edit using standard HTML ' + 'editing tools, and then rebuilds the file from the edited HTML. ' + 'Makes no additional changes to the HTML, unlike a full calibre ' + 'conversion). 
Note that this tool will try to open the ' + 'folder containing the HTML files in the editor pointed to by the' + ' EDITOR environment variable.') + parser.add_option('--test-build', help='Test binary modules in build', action='store_true', default=False) @@ -239,7 +248,9 @@ def main(args=sys.argv): prints('Inspecting:', path) inspect_mobi(path) print - + elif opts.tweak_book: + from calibre.ebooks.tweak import tweak + tweak(opts.tweak_book) elif opts.test_build: from calibre.test_build import test test() diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py index e759df5b78..eee2d480a3 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_output.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py @@ -6,8 +6,32 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -from calibre.customize.conversion import OutputFormatPlugin -from calibre.customize.conversion import OptionRecommendation +from calibre.customize.conversion import (OutputFormatPlugin, + OptionRecommendation) + +def remove_html_cover(oeb, log): + from calibre.ebooks.oeb.base import OEB_DOCS + + if not oeb.metadata.cover \ + or 'cover' not in oeb.guide: + return + href = oeb.guide['cover'].href + del oeb.guide['cover'] + item = oeb.manifest.hrefs[href] + if item.spine_position is not None: + log.warn('Found an HTML cover: ', item.href, 'removing it.', + 'If you find some content missing from the output MOBI, it ' + 'is because you misidentified the HTML cover in the input ' + 'document') + oeb.spine.remove(item) + if item.media_type in OEB_DOCS: + oeb.manifest.remove(item) + +def extract_mobi(output_path, opts): + if opts.extract_to is not None: + from calibre.ebooks.mobi.debug.main import inspect_mobi + ddir = opts.extract_to + inspect_mobi(output_path, ddir=ddir) class MOBIOutput(OutputFormatPlugin): @@ -140,25 +164,6 @@ class MOBIOutput(OutputFormatPlugin): # Fix up the 
periodical href to point to first section href toc.nodes[0].href = toc.nodes[0].nodes[0].href - def remove_html_cover(self): - from calibre.ebooks.oeb.base import OEB_DOCS - - oeb = self.oeb - if not oeb.metadata.cover \ - or 'cover' not in oeb.guide: - return - href = oeb.guide['cover'].href - del oeb.guide['cover'] - item = oeb.manifest.hrefs[href] - if item.spine_position is not None: - self.log.warn('Found an HTML cover: ', item.href, 'removing it.', - 'If you find some content missing from the output MOBI, it ' - 'is because you misidentified the HTML cover in the input ' - 'document') - oeb.spine.remove(item) - if item.media_type in OEB_DOCS: - self.oeb.manifest.remove(item) - def convert(self, oeb, output_path, input_plugin, opts, log): from calibre.utils.config import tweaks from calibre.ebooks.mobi.writer2.resources import Resources @@ -169,7 +174,7 @@ class MOBIOutput(OutputFormatPlugin): mobi_type = 'old' # Amazon does not support KF8 periodicals create_kf8 = mobi_type in ('new', 'both') - self.remove_html_cover() + remove_html_cover(self.oeb, self.log) resources = Resources(oeb, opts, self.is_periodical, add_fonts=create_kf8) self.check_for_periodical() @@ -185,7 +190,7 @@ class MOBIOutput(OutputFormatPlugin): ) if create_kf8 else None if mobi_type == 'new': kf8.write(output_path) - self.extract_mobi(output_path, opts) + extract_mobi(output_path, opts) return self.log('Creating MOBI 6 output') @@ -225,11 +230,72 @@ class MOBIOutput(OutputFormatPlugin): writer = MobiWriter(opts, resources, kf8, write_page_breaks_after_item=write_page_breaks_after_item) writer(oeb, output_path) - self.extract_mobi(output_path, opts) + extract_mobi(output_path, opts) + +class AZW3Output(OutputFormatPlugin): + + name = 'AZW3 Output' + author = 'Kovid Goyal' + file_type = 'azw3' + + options = set([ + OptionRecommendation(name='prefer_author_sort', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('When present, use author sort field as author.') + ), + 
OptionRecommendation(name='no_inline_toc', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Don\'t add Table of Contents to the book. Useful if ' + 'the book has its own table of contents.')), + OptionRecommendation(name='toc_title', recommended_value=None, + help=_('Title for any generated in-line table of contents.') + ), + OptionRecommendation(name='dont_compress', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Disable compression of the file contents.') + ), + OptionRecommendation(name='personal_doc', recommended_value='[PDOC]', + help=_('Tag marking book to be filed with Personal Docs') + ), + OptionRecommendation(name='mobi_toc_at_start', + recommended_value=False, + help=_('When adding the Table of Contents to the book, add it at the start of the ' + 'book instead of the end. Not recommended.') + ), + OptionRecommendation(name='extract_to', recommended_value=None, + help=_('Extract the contents of the MOBI file to the' + ' specified directory. If the directory already ' + 'exists, it will be deleted.') + ), + OptionRecommendation(name='share_not_sync', recommended_value=False, + help=_('Enable sharing of book content via Facebook etc. ' + ' on the Kindle. WARNING: Using this feature means that ' + ' the book will not auto sync its last read position ' + ' on multiple devices. 
Complain to Amazon.') + ), + ]) + + def convert(self, oeb, output_path, input_plugin, opts, log): + from calibre.ebooks.mobi.writer2.resources import Resources + from calibre.ebooks.mobi.writer8.main import create_kf8_book + + self.oeb, self.opts, self.log = oeb, opts, log + opts.mobi_periodical = self.is_periodical + passthrough = getattr(opts, 'mobi_passthrough', False) + + resources = Resources(self.oeb, self.opts, self.is_periodical, + add_fonts=True, process_images=False) + if not passthrough: + remove_html_cover(self.oeb, self.log) + + # Split on pagebreaks so that the resulting KF8 works better with + # calibre's viewer, which does not support CSS page breaks + from calibre.ebooks.oeb.transforms.split import Split + Split()(self.oeb, self.opts) + + kf8 = create_kf8_book(self.oeb, self.opts, resources, for_joint=False) + + kf8.write(output_path) + extract_mobi(output_path, opts) - def extract_mobi(self, output_path, opts): - if opts.extract_to is not None: - from calibre.ebooks.mobi.debug.main import inspect_mobi - ddir = opts.extract_to - inspect_mobi(output_path, ddir=ddir) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2c1a5cd4d3..86a4668b9b 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -179,8 +179,12 @@ class HeuristicProcessor(object): for match in re.finditer(pat, search_text): ital_string = str(match.group('words')) #self.log.debug("italicising "+str(match.group(0))+" with "+ital_string+"") - html = re.sub(re.escape(str(match.group(0))), '%s' % ital_string, html) - + try: + html = re.sub(re.escape(str(match.group(0))), '%s' % ital_string, html) + except OverflowError: + # match.group(0) was too large to be compiled into a regex + continue + return html def markup_chapters(self, html, wordcount, blanks_between_paragraphs): @@ -319,13 +323,13 @@ class HeuristicProcessor(object): ''' Unwraps lines based on line length and punctuation supports a range 
of html markup and text files - + the lookahead regex below is meant look for any non-full stop characters - punctuation characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc the reason for this is to prevent false positive wrapping. False positives are more difficult to detect than false negatives during a manual review of the doc - - This function intentionally leaves hyphenated content alone as that is handled by the + + This function intentionally leaves hyphenated content alone as that is handled by the dehyphenate routine in a separate step ''' diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py index a03205edd7..788ca3ed0a 100644 --- a/src/calibre/ebooks/mobi/debug/mobi8.py +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -141,9 +141,10 @@ class MOBIFile(object): self.files.append(File(skel, skeleton, ftext, first_aid, sections)) def dump_flows(self, ddir): - if self.fdst is None: - raise ValueError('This MOBI file has no FDST record') - for i, x in enumerate(self.fdst.sections): + boundaries = [(0, len(self.raw_text))] + if self.fdst is not None: + boundaries = self.fdst.sections + for i, x in enumerate(boundaries): start, end = x raw = self.raw_text[start:end] with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f: diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index 0162fddda7..a5ca4a7132 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -234,6 +234,22 @@ class MetadataHeader(BookHeader): else: self.exth = None + @property + def kf8_type(self): + if (self.mobi_version == 8 and getattr(self, 'skelidx', NULL_INDEX) != + NULL_INDEX): + return u'standalone' + + kf8_header_index = getattr(self.exth, 'kf8_header', None) + if kf8_header_index is None: + return None + try: + if self.section_data(kf8_header_index-1) == b'BOUNDARY': + return u'joint' + except: + pass + return None 
+ def identity(self): self.stream.seek(60) ident = self.stream.read(8).upper() diff --git a/src/calibre/ebooks/mobi/tweak.py b/src/calibre/ebooks/mobi/tweak.py new file mode 100644 index 0000000000..248ed97261 --- /dev/null +++ b/src/calibre/ebooks/mobi/tweak.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, glob + +from calibre import CurrentDir +from calibre.ebooks.mobi import MobiError +from calibre.ebooks.mobi.reader.mobi6 import MobiReader +from calibre.ebooks.mobi.reader.headers import MetadataHeader +from calibre.utils.logging import default_log +from calibre.ebooks import DRMError +from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader +from calibre.ebooks.conversion.plumber import Plumber, create_oebbook +from calibre.customize.ui import (plugin_for_input_format, + plugin_for_output_format) +from calibre.utils.ipc.simple_worker import fork_job + +class BadFormat(ValueError): + pass + +def do_explode(path, dest): + with open(path, 'rb') as stream: + mr = MobiReader(stream, default_log, None, None) + + with CurrentDir(dest): + mr = Mobi8Reader(mr, default_log) + opf = os.path.abspath(mr()) + + return opf + +def explode(path, dest, question=lambda x:True): + with open(path, 'rb') as stream: + raw = stream.read(3) + stream.seek(0) + if raw == b'TPZ': + raise BadFormat(_('This is not a MOBI file. It is a Topaz file.')) + + try: + header = MetadataHeader(stream, default_log) + except MobiError: + raise BadFormat(_('This is not a MOBI file.')) + + if header.encryption_type != 0: + raise DRMError(_('This file is locked with DRM. 
It cannot be tweaked.')) + + kf8_type = header.kf8_type + + if kf8_type is None: + raise BadFormat('This MOBI file does not contain a KF8 format book') + + if kf8_type == 'joint': + if not question(_('This MOBI file contains both KF8 and ' + 'older Mobi6 data. Tweaking it will remove the Mobi6 data, which ' + 'means the file will not be usable on older Kindles. Are you ' + 'sure?')): + return None + + return fork_job('calibre.ebooks.mobi.tweak', 'do_explode', args=(path, + dest), no_output=True)['result'] + +def do_rebuild(opf, dest_path): + plumber = Plumber(opf, dest_path, default_log) + plumber.setup_options() + inp = plugin_for_input_format('azw3') + outp = plugin_for_output_format('azw3') + + plumber.opts.mobi_passthrough = True + oeb = create_oebbook(default_log, opf, plumber.opts) + outp.convert(oeb, dest_path, inp, plumber.opts, default_log) + +def rebuild(src_dir, dest_path): + opf = glob.glob(os.path.join(src_dir, '*.opf')) + if not opf: + raise ValueError('No OPF file found in %s'%src_dir) + opf = opf[0] + fork_job('calibre.ebooks.mobi.tweak', 'do_rebuild', args=(opf, dest_path), + no_output=True) + diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 27c4838a4b..9afd39a211 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -25,6 +25,15 @@ from calibre.ebooks.mobi.writer2.indexer import Indexer WRITE_UNCROSSABLE_BREAKS = False NULL_INDEX = 0xffffffff +FLIS = (b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+ + b'\xff'*4) + +def fcis(text_length): + fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + fcis += pack(b'>I', text_length) + fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + return fcis + class MobiWriter(object): def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True): @@ -208,14 +217,9 @@ class MobiWriter(object): # 
FCIS/FLIS (Seems to serve no purpose) flis_number = len(self.records) - self.records.append( - b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+ - b'\xff'*4) - fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' - fcis += pack(b'>I', self.text_length) - fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + self.records.append(FLIS) fcis_number = len(self.records) - self.records.append(fcis) + self.records.append(fcis(self.text_length)) # EOF record self.records.append(b'\xE9\x8E\x0D\x0A') @@ -379,6 +383,12 @@ class MobiWriter(object): self.resources.serialize(self.records, used_images) resource_record_count = len(self.records) - old + # FCIS/FLIS (Seems to serve no purpose) + flis_number = len(self.records) + self.records.append(FLIS) + fcis_number = len(self.records) + self.records.append(fcis(self.text_length)) + # Insert KF8 records self.records.append(b'BOUNDARY') kf8_header_index = len(self.records) @@ -398,6 +408,8 @@ class MobiWriter(object): header_fields['exth_flags'] = 0b100001010000 # Kinglegen uses this header_fields['fdst_record'] = NULL_INDEX header_fields['fdst_count'] = 1 # Why not 0? 
Kindlegen uses 1 + header_fields['flis_record'] = flis_number + header_fields['fcis_record'] = fcis_number extra_data_flags = 0b1 # Has multibyte overlap bytes if self.primary_index_record_idx is not None: extra_data_flags |= 0b10 diff --git a/src/calibre/ebooks/mobi/writer2/resources.py b/src/calibre/ebooks/mobi/writer2/resources.py index 2fcb93790c..2f12793b03 100644 --- a/src/calibre/ebooks/mobi/writer2/resources.py +++ b/src/calibre/ebooks/mobi/writer2/resources.py @@ -19,9 +19,11 @@ PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\ class Resources(object): - def __init__(self, oeb, opts, is_periodical, add_fonts=False): + def __init__(self, oeb, opts, is_periodical, add_fonts=False, + process_images=True): self.oeb, self.log, self.opts = oeb, oeb.log, opts self.is_periodical = is_periodical + self.process_images = process_images self.item_map = {} self.records = [] @@ -34,6 +36,8 @@ class Resources(object): self.add_resources(add_fonts) def process_image(self, data): + if not self.process_images: + return data return (mobify_image(data) if self.opts.mobi_keep_original_images else rescale_image(data)) diff --git a/src/calibre/ebooks/mobi/writer8/mobi.py b/src/calibre/ebooks/mobi/writer8/mobi.py index 18f19a4084..eabcf97047 100644 --- a/src/calibre/ebooks/mobi/writer8/mobi.py +++ b/src/calibre/ebooks/mobi/writer8/mobi.py @@ -18,6 +18,14 @@ from calibre.ebooks.mobi.writer8.exth import build_exth from calibre.utils.filenames import ascii_filename NULL_INDEX = 0xffffffff +FLIS = b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+ b'\xff'*4 + +def fcis(text_length): + fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x02\x00\x00\x00\x00' + fcis += pack(b'>L', text_length) + fcis += b'\x00\x00\x00\x00\x00\x00\x00\x28\x00\x00\x00\x00\x00\x00\x00' + fcis += b'\x28\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + return fcis class MOBIHeader(Header): # {{{ ''' @@ -115,7 +123,10 @@ class 
MOBIHeader(Header): # {{{ exth_flags = DYN # 132: Unknown - unknown = zeroes(36) + unknown = zeroes(32) + + # 164: Unknown + unknown_index = NULL # 168: DRM drm_offset = NULL @@ -130,13 +141,13 @@ class MOBIHeader(Header): # {{{ fdst_record = DYN fdst_count = DYN - # 200: FCI - fcis_record = NULL - fcis_count + # 200: FCIS + fcis_record = DYN + fcis_count = 1 # 208: FLIS - flis_record = NULL - flis_count + flis_record = DYN + flis_count = 1 # 216: Unknown unknown3 = zeroes(8) @@ -193,7 +204,7 @@ HEADER_FIELDS = {'compression', 'text_length', 'last_text_record', 'book_type', 'first_resource_record', 'exth_flags', 'fdst_record', 'fdst_count', 'ncx_index', 'chunk_index', 'skel_index', 'guide_index', 'exth', 'full_title', 'extra_data_flags', - 'uid'} + 'flis_record', 'fcis_record', 'uid'} class KF8Book(object): @@ -241,6 +252,12 @@ class KF8Book(object): self.fdst_record = len(self.records) self.records.extend(writer.fdst_records) + # FLIS/FCIS + self.flis_record = len(self.records) + self.records.append(FLIS) + self.fcis_record = len(self.records) + self.records.append(fcis(self.text_length)) + # EOF self.records.append(b'\xe9\x8e\r\n') # EOF record diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index c2cd9b4283..8fd4714e1c 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -13,7 +13,7 @@ from functools import partial from lxml import etree -from calibre.ebooks.oeb.base import XHTML_NS +from calibre.ebooks.oeb.base import XHTML_NS, extract from calibre.constants import ispy3 from calibre.ebooks.mobi.utils import to_base @@ -224,14 +224,24 @@ class Chunker(object): nroot.text = root.text nroot.tail = '\n' - for tag in root.iterdescendants(etree.Element): - # We are ignoring all non tag entities in the tree - # like comments and processing instructions, as they make the - # chunking code even harder, for minimal gain. 
- elem = nroot.makeelement(tag.tag.rpartition('}')[-1], - attrib={k.rpartition('}')[-1]:v for k, v in - tag.attrib.iteritems()}) - elem.text, elem.tail = tag.text, tag.tail + # Remove Comments and ProcessingInstructions as kindlegen seems to + # remove them as well + for tag in root.iterdescendants(): + if tag.tag in {etree.Comment, etree.ProcessingInstruction}: + extract(tag) + + for tag in root.iterdescendants(): + if tag.tag == etree.Entity: + elem = etree.Entity(tag.name) + else: + tn = tag.tag + if tn is not None: + tn = tn.rpartition('}')[-1] + elem = nroot.makeelement(tn, + attrib={k.rpartition('}')[-1]:v for k, v in + tag.attrib.iteritems()}) + elem.text = tag.text + elem.tail = tag.tail parent = node_from_path(nroot, path_to_node(tag.getparent())) parent.append(elem) @@ -251,6 +261,11 @@ class Chunker(object): # Now loop over children for child in list(tag): raw = tostring(child, with_tail=False) + if child.tag == etree.Entity: + chunks.append(raw) + if child.tail: + chunks.extend(self.chunk_up_text(child.tail, aid)) + continue raw = close_self_closing_tags(raw) if len(raw) > CHUNK_SIZE and child.get('aid', None): self.step_into_tag(child, chunks) diff --git a/src/calibre/ebooks/tweak.py b/src/calibre/ebooks/tweak.py new file mode 100644 index 0000000000..72e4c0a56c --- /dev/null +++ b/src/calibre/ebooks/tweak.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os, shlex, subprocess + +from calibre import prints, as_unicode, walk +from calibre.constants import iswindows, __appname__ +from calibre.ptempfile import TemporaryDirectory +from calibre.libunzip import extract as zipextract +from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED, ZIP_STORED +from calibre.utils.ipc.simple_worker import WorkerError + 
+class Error(ValueError): + pass + +def ask_cli_question(msg): + prints(msg, end=' [y/N]: ') + sys.stdout.flush() + + if iswindows: + import msvcrt + ans = msvcrt.getch() + else: + import tty, termios + old_settings = termios.tcgetattr(sys.stdin.fileno()) + try: + tty.setraw(sys.stdin.fileno()) + try: + ans = sys.stdin.read(1) + except KeyboardInterrupt: + ans = b'' + finally: + termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN, old_settings) + print() + return ans == b'y' + +def mobi_exploder(path, tdir, question=lambda x:True): + from calibre.ebooks.mobi.tweak import explode, BadFormat + try: + return explode(path, tdir, question=question) + except BadFormat as e: + raise Error(as_unicode(e)) + +def zip_exploder(path, tdir, question=lambda x:True): + zipextract(path, tdir) + for f in walk(tdir): + if f.lower().endswith('.opf'): + return f + raise Error('Invalid book: Could not find .opf') + +def zip_rebuilder(tdir, path): + with ZipFile(path, 'w', compression=ZIP_DEFLATED) as zf: + # Write mimetype + mt = os.path.join(tdir, 'mimetype') + if os.path.exists(mt): + zf.write(mt, 'mimetype', compress_type=ZIP_STORED) + # Write everything else + exclude_files = {'.DS_Store', 'mimetype', 'iTunesMetadata.plist'} + for root, dirs, files in os.walk(tdir): + for fn in files: + if fn in exclude_files: + continue + absfn = os.path.join(root, fn) + zfn = os.path.relpath(absfn, tdir).replace(os.sep, '/') + zf.write(absfn, zfn) + +def get_tools(fmt): + fmt = fmt.lower() + + if fmt in {'mobi', 'azw', 'azw3'}: + from calibre.ebooks.mobi.tweak import rebuild + ans = mobi_exploder, rebuild + elif fmt in {'epub', 'htmlz'}: + ans = zip_exploder, zip_rebuilder + else: + ans = None, None + + return ans + +def tweak(ebook_file): + ''' Command line interface to the Tweak Book tool ''' + fmt = ebook_file.rpartition('.')[-1].lower() + exploder, rebuilder = get_tools(fmt) + if exploder is None: + prints('Cannot tweak %s files. 
Supported formats are: EPUB, HTMLZ, AZW3, MOBI' + , file=sys.stderr) + raise SystemExit(1) + + with TemporaryDirectory('_tweak_'+ + os.path.basename(ebook_file).rpartition('.')[0]) as tdir: + try: + opf = exploder(ebook_file, tdir, question=ask_cli_question) + except WorkerError as e: + prints('Failed to unpack', ebook_file) + prints(e.orig_tb) + raise SystemExit(1) + except Error as e: + prints(as_unicode(e), file=sys.stderr) + raise SystemExit(1) + + if opf is None: + # The question was answered with No + return + + ed = os.environ.get('EDITOR', None) + proceed = False + if ed is None: + prints('Book extracted to', tdir) + prints('Make your tweaks and once you are done,', __appname__, + 'will rebuild', ebook_file, 'from', tdir) + print() + proceed = ask_cli_question('Rebuild ' + ebook_file + '?') + else: + cmd = shlex.split(ed) + try: + subprocess.check_call(cmd + [tdir]) + except: + prints(ed, 'failed, aborting...') + raise SystemExit(1) + proceed = True + + if proceed: + prints('Rebuilding', ebook_file, 'please wait ...') + try: + rebuilder(tdir, ebook_file) + except WorkerError as e: + prints('Failed to rebuild', ebook_file) + prints(e.orig_tb) + raise SystemExit(1) + prints(ebook_file, 'successfully tweaked') + diff --git a/src/calibre/gui2/convert/azw3_output.py b/src/calibre/gui2/convert/azw3_output.py new file mode 100644 index 0000000000..8b1ef25aac --- /dev/null +++ b/src/calibre/gui2/convert/azw3_output.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from calibre.gui2.convert.azw3_output_ui import Ui_Form +from calibre.gui2.convert import Widget + +font_family_model = None + +class PluginWidget(Widget, Ui_Form): + + TITLE = _('AZW3 Output') + HELP = _('Options specific to')+' AZW3 '+_('output') + COMMIT_NAME = 'azw3_output' + ICON = I('mimetypes/mobi.png') + + 
class PluginWidget(Widget, Ui_Form):

    """Conversion-options page for AZW3 output in the GUI convert dialog."""

    TITLE = _('AZW3 Output')
    HELP = _('Options specific to')+' AZW3 '+_('output')
    COMMIT_NAME = 'azw3_output'
    ICON = I('mimetypes/mobi.png')

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        # Conversion options surfaced on this page.
        option_names = [
            'prefer_author_sort', 'toc_title',
            'mobi_ignore_margins', 'mobi_toc_at_start',
            'dont_compress', 'no_inline_toc', 'share_not_sync',
            'personal_doc',  # 'mobi_navpoints_only_deepest' intentionally omitted
        ]
        Widget.__init__(self, parent, option_names)
        self.db = db
        self.book_id = book_id
        self.initialize_options(get_option, get_help, db, book_id)
class BiblioStore(BasicStoreConfig, OpenSearchOPDSStore):

    """Store plugin for biblio.bg (Bulgarian ebook store)."""

    open_search_url = 'http://biblio.bg/feed.opds.php'
    web_url = 'http://biblio.bg/'

    def search(self, query, max_results=10, timeout=60):
        # biblio.bg only indexes Cyrillic titles; skip queries that are not
        # at least 3 Cyrillic letters/digits/spaces before hitting the network.
        uquery = unicode(query.strip(), 'utf-8')
        if not re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery):
            return

        for s in OpenSearchOPDSStore.search(self, query, max_results, timeout):
            yield s

    def get_details(self, search_result, timeout):
        """Fill in format and DRM status by scraping the detail page.

        Mutates *search_result* in place and returns True.
        """
        from calibre import browser
        from contextlib import closing
        from lxml import html

        br = browser()
        with closing(br.open(search_result.detail_item, timeout=timeout)) as nf:
            idata = html.fromstring(nf.read())

            # BUG FIX: the original used search_result.formats.join(', PDF'),
            # which misuses str.join and discards its result, so PDF was never
            # recorded when EPUB was also present.
            formats = []
            if idata.xpath('.//span[@class="format epub"]'):
                formats.append('EPUB')
            if idata.xpath('.//span[@class="format pdf"]'):
                formats.append('PDF')
            search_result.formats = ', '.join(formats)

            if idata.xpath('.//span[@class="format nodrm-icon"]'):
                search_result.drm = SearchResult.DRM_UNLOCKED
            else:
                search_result.drm = SearchResult.DRM_LOCKED

        return True
Throws an exception if either x or y are not numbers. - * ``eval(string)`` -- evaluates the string as a program, passing the local variables (those ``assign`` ed to). This permits using the template processor to construct complex results from local variables. + * ``eval(string)`` -- evaluates the string as a program, passing the local variables (those ``assign`` ed to). This permits using the template processor to construct complex results from local variables. Because the `{` and `}` characters are special, you must use `[[` for the `{` character and `]]` for the '}' character; they are converted automatically. Note also that prefixes and suffixes (the "|prefix|suffix" syntax) cannot be used in the argument to this function when using template program mode. * ``field(name)`` -- returns the metadata field named by ``name``. * ``first_non_empty(value, value, ...)`` -- returns the first value that is not empty. If all values are empty, then the empty value is returned. You can have as many values as you want. * ``format_date(x, date_format)`` -- format_date(val, format_string) -- format the value, which must be a date field, using the format_string, returning a string. The formatting codes are:: @@ -306,7 +306,7 @@ The following functions are available in addition to those described in single-f * ``substr(str, start, end)`` -- returns the ``start``'th through the ``end``'th characters of ``str``. The first character in ``str`` is the zero'th character. If end is negative, then it indicates that many characters counting from the right. If end is zero, then it indicates the last character. For example, ``substr('12345', 1, 0)`` returns ``'2345'``, and ``substr('12345', 1, -1)`` returns ``'234'``. * ``subtract(x, y)`` -- returns x - y. Throws an exception if either x or y are not numbers. * ``today()`` -- return a date string for today. This value is designed for use in format_date or days_between, but can be manipulated like any other string. The date is in ISO format. 
   * ``template(x)`` -- evaluates x as a template. The evaluation is done in its own context, meaning that variables are not shared between the caller and the template evaluation. Because the `{` and `}` characters are special, you must use `[[` for the `{` character and `]]` for the `}` character; they are converted automatically. For example, ``template('[[title_sort]]')`` will evaluate the template ``{title_sort}`` and return its value. Note also that prefixes and suffixes (the "|prefix|suffix" syntax) cannot be used in the argument to this function when using template program mode.
_template_functions_reference: diff --git a/src/calibre/utils/config_base.py b/src/calibre/utils/config_base.py index 7fb120d028..ab22c6b30b 100644 --- a/src/calibre/utils/config_base.py +++ b/src/calibre/utils/config_base.py @@ -387,7 +387,7 @@ def _prefs(): help=_('The language in which to display the user interface')) c.add_opt('output_format', default='EPUB', help=_('The default output format for ebook conversions.')) - c.add_opt('input_format_order', default=['EPUB', 'MOBI', 'LIT', 'PRC', + c.add_opt('input_format_order', default=['EPUB', 'AZW3', 'MOBI', 'LIT', 'PRC', 'FB2', 'HTML', 'HTM', 'XHTM', 'SHTML', 'XHTML', 'ZIP', 'ODT', 'RTF', 'PDF', 'TXT'], help=_('Ordered list of formats to prefer for input.')) diff --git a/src/calibre/utils/formatter_functions.py b/src/calibre/utils/formatter_functions.py index bfb2f036c0..5b620e54e3 100644 --- a/src/calibre/utils/formatter_functions.py +++ b/src/calibre/utils/formatter_functions.py @@ -217,7 +217,9 @@ class BuiltinTemplate(BuiltinFormatterFunction): 'characters are special, you must use [[ for the { character and ' ']] for the } character; they are converted automatically. ' 'For example, template(\'[[title_sort]]\') will evaluate the ' - 'template {title_sort} and return its value.') + 'template {title_sort} and return its value. Note also that ' + 'prefixes and suffixes (the "|prefix|suffix" syntax) cannot be ' + 'used in the argument to this function when using template program mode.') def evaluate(self, formatter, kwargs, mi, locals, template): template = template.replace('[[', '{').replace(']]', '}') @@ -230,7 +232,12 @@ class BuiltinEval(BuiltinFormatterFunction): __doc__ = doc = _('eval(template) -- evaluates the template, passing the local ' 'variables (those \'assign\'ed to) instead of the book metadata. ' ' This permits using the template processor to construct complex ' - 'results from local variables.') + 'results from local variables. 
Because the { and } ' + 'characters are special, you must use [[ for the { character and ' + ']] for the } character; they are converted automatically. ' + 'Note also that prefixes and suffixes (the "|prefix|suffix" syntax) ' + 'cannot be used in the argument to this function when using ' + 'template program mode.') def evaluate(self, formatter, kwargs, mi, locals, template): from formatter import EvalFormatter