merge from trunk

Lee, 2012-05-03 01:54:58 +08:00
commit 74c85a9749
26 changed files with 1086 additions and 214 deletions


@@ -1,13 +1,13 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre import browser
 import re
-import mechanize
 class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     title = u'The Daily Mirror'
-    description = 'News as provide by The Daily Mirror -UK'
+    description = 'News as provided by The Daily Mirror -UK'
     __author__ = 'Dave Asbury'
-    # last updated 7/4/12
+    # last updated 28/4/12
     language = 'en_GB'
     #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
@@ -15,66 +15,55 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     oldest_article = 1
-    max_articles_per_feed = 10
+    max_articles_per_feed = 12
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
-    auto_cleanup = True
+    # auto_cleanup = True
     #conversion_options = { 'linearize_tables' : True }
-    #keep_only_tags = [
-    #    dict(name='h1'),
-    #    dict(name='div',attrs={'id' : 'body-content'}),
-    #dict(name='div',atts={'class' : 'article-body'}),
+    keep_only_tags = [ dict(name='h1'),
+        dict(name='div',attrs={'class' : 'lead-text'}),
+        dict(name='div',attrs={'class' : 'styleGroup clearfix'}),
+        dict(name='div',attrs={'class' : 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
+        dict(name='figure',attrs={'class' : 'clearfix'}),
+        dict(name='div',attrs={'class' :'body '}),
         #dict(attrs={'class' : ['article-attr','byline append-1','published']}),
         #dict(name='p'),
-    #    ]
-    #remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
-    remove_tags = [
-        dict(name='title'),
-        dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
-        # dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
-        #dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
-        #dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
     ]
-    # preprocess_regexps = [
-    #    (re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
+    remove_tags = [
+        dict(attrs={'class' : 'comment'}),
+        dict(name='title'),
+        dict(name='ul',attrs={'class' : 'clearfix breadcrumbs '}),
+        dict(name='ul',attrs={'id' : 'login-201109171215'}),
+        dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}), #'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit'
+    ]
     preprocess_regexps = [
         (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]
-    preprocess_regexps = [
-        (re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
-    #preprocess_regexps = [
-    #    (re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')]
     feeds = [
-        (u'News',u'http://www.mirror.co.uk/news/rss.xml'),
-        (u'Sports',u'http://www.mirror.co.uk/sport/rss.xml'),
-        (u'3AM',u'http://www.mirror.co.uk/3am/rss.xml'),
-        (u'Lifestyle',u'http://www.mirror.co.uk/lifestyle/rss.xml')
+        (u'UK News', u'http://feed43.com/0287771688643868.xml')
+        ,(u'Tech News', u'http://feed43.com/2455520588350501.xml')
+        ,(u'Weird World','http://feed43.com/0863800333634654.xml')
+        ,(u'Sport','http://feed43.com/7713243036546130.xml')
+        ,(u'Sport : Boxing ','http://feed43.com/0414732220804255.xml')
+        ,(u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml')
+        ,(u'Sport : Other','http://feed43.com/4501416886323415.xml')
+        ,(u'TV and Film','http://feed43.com/5238302853765104.xml')
+        ,(u'Celebs','http://feed43.com/8770061048844683.xml')
+        ,(u'Life Style : Family','http://feed43.com/4356170742410338.xml')
+        ,(u'Travel','http://feed43.com/1436576006476607.xml')
         # example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
     ]

     extra_css = '''
-        h1{ font-size:medium;}
         body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+        h1{ font-size:18px;}
         img { display:block}
-    '''
+    '''#
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
@@ -88,16 +77,18 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
         cov2 = str(cov)
         cov2=cov2[27:-18]
         #cov2 now is pic url, now go back to original function
-        br = mechanize.Browser()
+        br = browser()
         br.set_handle_redirect(False)
         try:
             br.open_novisit(cov2)
             cover_url = cov2
         except:
-            cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
+            cover_url ='http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg'
+        # print '******** string is ', cov2,' ***'
         #cover_url = cov2
         #cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
         return cover_url
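Both tabloid recipes in this commit swap a bare mechanize.Browser() for calibre's browser() helper when probing the scraped cover URL. A minimal sketch of that probe pattern, assuming only that calibre.browser() returns a mechanize-compatible browser object (probe_cover and the URLs are illustrative, not part of the recipe):

    # Sketch: try a scraped image URL, fall back to a known-good one.
    from calibre import browser

    def probe_cover(candidate_url, fallback_url):
        br = browser()                 # pre-configured mechanize browser
        br.set_handle_redirect(False)  # a redirect here usually means 'no image'
        try:
            br.open_novisit(candidate_url)
            return candidate_url
        except Exception:
            return fallback_url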


@@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe

class ElMundoTodayRecipe(BasicNewsRecipe):
    title = 'El Mundo Today'
    __author__ = 'atordo'
    description = u'La actualidad del mañana'
    category = 'Noticias, humor'
    cover_url = 'http://www.elmundotoday.com/wp-content/themes/EarthlyTouch/images/logo.png'
    oldest_article = 30
    max_articles_per_feed = 30
    auto_cleanup = True
    no_stylesheets = True
    language = 'es'
    use_embedded_content = True

    feeds = [('El Mundo Today', 'http://www.elmundotoday.com/feed/')]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.set_handle_gzip(True)
        return br
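calibre only calls a recipe's browser hook if it is named get_browser exactly, which is why the override above matters. A minimal self-contained sketch of the same pattern, with the class name hypothetical:

    from calibre.web.feeds.news import BasicNewsRecipe

    class GzipAwareRecipe(BasicNewsRecipe):
        title = 'Example'

        def get_browser(self):
            # start from the stock recipe browser, then enable gzip
            br = BasicNewsRecipe.get_browser(self)
            br.set_handle_gzip(True)
            return br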


@@ -18,16 +18,21 @@ __IncludeThumbnails__ = True
 __UseLife__ = True
 # (HK only) It is to disable premium content (Default: False)
 __InclPremium__ = False
-# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
-__ParsePFF__ = True
+# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: False)
+__ParsePF__ = False
+# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with text formats (Default: True) -- overrides __ParsePF__
+__ParseTxt__ = True
+# (HK only) Use mobile text version for some articles (Default: False)
+__ParseSelectedMobile__ = False
 # (HK only) Turn below to True if you wish hi-res images (Default: False)
 __HiResImg__ = False
-# Override the date returned by the program if specifying a YYYYMMDD below
+# Override the date returned by the program if specifying a YYYYMMDD below (does not work if __ParseSelectedMobile__ is True and __UseLife__ is False)
 __Date__ = ''
 '''
 Change Log:
+2012/04/24: improved parsing of news.mingpao.com content
 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
             from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
             download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
@@ -81,6 +86,7 @@ class MPRecipe(BasicNewsRecipe):
     category = 'Chinese, News, Hong Kong'
     extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
     masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
+    remove_tags_before = dict(name='font', attrs={'color':['navy']})
     keep_only_tags = [dict(name='h1'),
                       dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                       dict(name='font', attrs={'color':['AA0000']}), # for column articles title
@@ -91,13 +97,17 @@ class MPRecipe(BasicNewsRecipe):
                       dict(attrs={'class':['photo']}),
                       dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
                       dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com
-                      dict(attrs={'class':['images']}) # for images from txt
+                      dict(attrs={'class':['images']}), # for images from txt
+                      dict(name='table', attrs={'width':['100%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']}) # content table in pda site
                       ]
     if __KeepImages__:
         remove_tags = [dict(name='style'),
                        dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
                        dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
+                       dict(name='img', attrs={'alt':["明報網站", "按此列印", "關閉本視窗"]}), # non-article images in life.mingpao.com article
+                       dict(name='img', attrs={'src':["../image/top_2.gif"]})
                        #dict(name='table') # for content fetched from life.mingpao.com
+                       #dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']})
                        ]
     else:
         remove_tags = [dict(name='style'),
@@ -105,6 +115,7 @@ class MPRecipe(BasicNewsRecipe):
                        dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
                        dict(name='img'),
                        #dict(name='table') # for content fetched from life.mingpao.com
+                       #dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']})
                        ]
     remove_attributes = ['width']
     preprocess_regexps = [
@@ -118,7 +129,15 @@ class MPRecipe(BasicNewsRecipe):
                           (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
                            lambda match: "<div id='newscontent'>"),
                           (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
-                           lambda match: "</b>")
+                           lambda match: "</b>"),
+                          (re.compile(r'<br><br><img src="http://pda.mingpao.com/image/shim.gif" width=11><br>', re.DOTALL|re.IGNORECASE),
+                           lambda match: ''),
+                          (re.compile(r'<img src="http://pda.mingpao.com/image/mbup.gif" border=0>', re.DOTALL|re.IGNORECASE),
+                           lambda match: ''),
+                          (re.compile(r'<img src="http://pda.mingpao.com/image/mbun.gif" border=0>', re.DOTALL|re.IGNORECASE),
+                           lambda match: ''),
+                          #(re.compile(r'[<a href="HotNews1.cfm.+?">.+?</a>]', re.DOTALL|re.IGNORECASE),
+                          #lambda match: '')
                           ]
     elif __Region__ == 'Vancouver':
         if __UseChineseTitle__ == True:
@@ -222,6 +241,10 @@ class MPRecipe(BasicNewsRecipe):
         else:
             return self.get_dtlocal().strftime("%d")

+    # Note: does not work with custom date given by __Date__
+    def get_weekday(self):
+        return self.get_dtlocal().weekday()
+
     def get_cover_url(self):
         if __Region__ == 'Hong Kong':
             cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
@@ -260,7 +283,23 @@ class MPRecipe(BasicNewsRecipe):
                 if articles:
                     feeds.append((title, articles))

-            if __InclPremium__ == True:
+#            if __InclPremium__ == True:
+#                # parse column section articles directly from .txt files
+#                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+#                                           ]:
+#                    articles = self.parse_section2_txt(url, keystr)
+#                    if articles:
+#                        feeds.append((title, articles))
+#
+#                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+#                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+#                    articles = self.parse_section(url)
+#                    if articles:
+#                        feeds.append((title, articles))
+            # new
+            if __InclPremium__ == True and (self.get_weekday() <> 6 or __ParseSelectedMobile__ == False):
+                # if both not on Sunday and not __ParseSelectedMobile__, go ahead
                 # parse column section articles directly from .txt files
                 for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
                                            ]:
@@ -268,17 +307,45 @@ class MPRecipe(BasicNewsRecipe):
                     if articles:
                         feeds.append((title, articles))

-            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
-                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
-                articles = self.parse_section(url)
-                if articles:
-                    feeds.append((title, articles))
+            if __InclPremium__ == False or self.get_weekday() <> 6:
+                for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+                    if __ParseTxt__ == False:
+                        articles = self.parse_section(url)
+                    else:
+                        articles = self.parse_section_txt(url, seckey)
+                    if articles:
+                        feeds.append((title, articles))
+            else:
+                if __InclPremium__ == True and __ParseSelectedMobile__ == True:
+                    articles = self.parse_section_mobile('http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1')
+                    if articles:
+                        feeds.append((u'\u526f\u520a Supplement', articles))
+                else:
+                    for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+                        if __ParseTxt__ == False:
+                            articles = self.parse_section(url)
+                        else:
+                            articles = self.parse_section_txt(url, seckey)
+                        if articles:
+                            feeds.append((title, articles))
+
+            for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
+                if __ParseTxt__ == False:
+                    articles = self.parse_section(url)
+                else:
+                    articles = self.parse_section_txt(url, seckey)
+                if articles:
+                    feeds.append((title, articles))
+            # end of new
         else:
-            for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
-                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
-                               (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
-                articles = self.parse_section(url)
+            for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'),
+                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm', 'gb'),
+                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm', 'gf'),
+                               (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]:
+                if __ParseTxt__ == False:
+                    articles = self.parse_section(url)
+                else:
+                    articles = self.parse_section_txt(url, seckey)
                 if articles:
                     feeds.append((title, articles))
@@ -287,10 +354,13 @@ class MPRecipe(BasicNewsRecipe):
             #if ed_articles:
             #    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

-            for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
-                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
-                               (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
+            for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm', 'fa'),
+                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm', 'ca'),
+                               (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]:
+                if __ParseTxt__ == False:
                     articles = self.parse_section(url)
+                else:
+                    articles = self.parse_section_txt(url, seckey)
                 if articles:
                     feeds.append((title, articles))
@@ -322,7 +392,9 @@ class MPRecipe(BasicNewsRecipe):
                 if articles:
                     feeds.append((title, articles))

-            if __InclPremium__ == True:
+            if __InclPremium__ == True and (self.get_weekday() <> 6 or __ParseSelectedMobile__ == False):
+                # if both not on Sunday or not __ParseSelectedMobile__, go ahead
                 # parse column section articles directly from .txt files
                 for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
                                            ]:
@@ -330,9 +402,33 @@ class MPRecipe(BasicNewsRecipe):
                     if articles:
                         feeds.append((title, articles))

-            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
-                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
-                articles = self.parse_section(url)
+            if __InclPremium__ == False or self.get_weekday() <> 6:
+                for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+                    if __ParseTxt__ == False:
+                        articles = self.parse_section(url)
+                    else:
+                        articles = self.parse_section_txt(url, seckey)
+                    if articles:
+                        feeds.append((title, articles))
+            else:
+                if __InclPremium__ == True and __ParseSelectedMobile__ == True:
+                    articles = self.parse_section_mobile('http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1')
+                    if articles:
+                        feeds.append((u'\u526f\u520a Supplement', articles))
+                else:
+                    for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+                        if __ParseTxt__ == False:
+                            articles = self.parse_section(url)
+                        else:
+                            articles = self.parse_section_txt(url, seckey)
+                        if articles:
+                            feeds.append((title, articles))
+
+            for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
+                if __ParseTxt__ == False:
+                    articles = self.parse_section(url)
+                else:
+                    articles = self.parse_section_txt(url, seckey)
                 if articles:
                     feeds.append((title, articles))
@@ -366,7 +462,7 @@ class MPRecipe(BasicNewsRecipe):
                 feeds.append((title, articles))
         return feeds

-    # parse from news.mingpao.com
+    # parse from news.mingpao.com (web html)
     def parse_section(self, url):
         dateStr = self.get_fetchdate()
         soup = self.index_to_soup(url)
@ -379,17 +475,57 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a) title = self.tag_to_string(a)
url = a.get('href', False) url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url to the print-friendly version # replace the url to the alternative version
if __ParsePFF__ == True: if __ParsePF__ == True:
# printer-friendly option
if url.rfind('Redirect') <> -1 and __InclPremium__ == True: if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url) url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url) url = re.sub('%2F.*%2F', '/', url)
if __InclPremium__ == True:
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm') url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_') url = url.replace('%5F', '_')
else: else:
url = url.replace('.htm', '_print.htm') url = url.replace('.htm', '_print.htm')
if url not in included_urls and url.rfind('Redirect') == -1: #if url not in included_urls and url.rfind('Redirect') == -1 and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url)
current_articles.reverse()
return current_articles
# parse from news.mingpao.com (txt)
def parse_section_txt(self, url, ch):
dateStr = self.get_fetchdate()
soup = self.index_to_soup(url)
divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
current_articles = []
included_urls = []
divs.reverse()
for i in divs:
a = i.find('a', href = True)
title = self.tag_to_string(a)
url = a.get('href', False)
#print 'Base url: ', url
# replace the url to the alternative version
# text version
if url.rfind('Redirect') <> -1:
url = 'http://news.mingpao.com/' + dateStr + '/' +url
#print 'original url: ', url
url = re.sub(dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url)
url = re.sub('%2F', '/', url)
if __InclPremium__ == True:
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '.txt')
url = url.replace('%5F', '_')
else:
# get the first two char in url as ch
seckey = url[0:2]
url = url.replace('.htm', '.txt')
url = 'http://news.mingpao.com/ftp/WebNews2/' + dateStr + '/' + ch + '/' + seckey + '/' + url
#print 'updated url: ', url
if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
#if url not in included_urls and (url.rfind('Redirect') == -1) and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url) included_urls.append(url)
current_articles.reverse() current_articles.reverse()
@@ -437,6 +573,20 @@ class MPRecipe(BasicNewsRecipe):
         current_articles.reverse()
         return current_articles

+    # parse from mobile version
+    def parse_section_mobile(self, base, page):
+        soup = self.index_to_soup(base + '/' + page)
+        a = soup.findAll('a', href=True)
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = i.get('href', False)
+            if url not in included_urls and url.rfind('HotNews2.cfm') <> -1:
+                current_articles.append({'title': title, 'url': base + '/' + url, 'description': ''})
+                included_urls.append(url)
+        return current_articles
+
     # parse from www.mingpaovan.com
     def parse_section3(self, url, baseUrl):
         self.get_fetchdate()
@@ -631,6 +781,7 @@ class MPRecipe(BasicNewsRecipe):
                             br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
                             new_html = new_html.replace(img, gifimg)
                         except:
+                            if __ParseTxt__ == False:
                                 # find the location of the first _
                                 pos = img.find('_')
                                 if pos > -1:
@@ -640,6 +791,12 @@ class MPRecipe(BasicNewsRecipe):
                                 else:
                                     # if not found, insert _ after "
                                     new_html = new_html.replace(img[1:], '"_' + img[1:])
+                            else:
+                                # insert to front
+                                #print 'imgstr: ', img
+                                pos = img.find('_')
+                                new_html = new_html.replace(img[5:], '_' + img[5:])
             elif url.rfind('life.mingpao.com') > -1:
                 imglist = re.findall('src=\'?.*?jpg\'', new_html)
                 br = mechanize.Browser()
@@ -673,9 +830,13 @@ class MPRecipe(BasicNewsRecipe):
                     newimg = img[0:pos+1] + '_' + img[pos+1:]
                     #print 'Use hi-res img', newimg
                     new_html = new_html.replace(img, newimg)
+        # test
+        #print new_html
         return new_html

     def preprocess_html(self, soup):
+        for mobiletitle in soup.findAll('font', attrs={'color': ['navy']}):
+            mobiletitle.name = 'h1'
         for item in soup.findAll(style=True):
             del item['style']
         for item in soup.findAll(style=True):
@@ -909,3 +1070,4 @@ class MPRecipe(BasicNewsRecipe):
         opf.render(opf_file, ncx_file)
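All of the parse_section* variants above return the same shape, which parse_index then assembles into feeds. A sketch of that structure, with placeholder article titles and URLs (the txt and mobile parsers may leave description and date empty, as the code above does):

    # (section title, list of article dicts) pairs, as returned by parse_index
    feeds = [
        (u'\u8981\u805e Headline', [
            {'title': u'An article', 'url': 'http://news.mingpao.com/20120503/gaa1.txt',
             'description': '', 'date': ''},
        ]),
    ]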

recipes/monbiot.recipe (new file, 43 lines)

@@ -0,0 +1,43 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.monbiot.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class GeorgeMonbiot(BasicNewsRecipe):
    title = 'George Monbiot - blog'
    __author__ = 'Darko Miletic'
    description = 'Tell people something they know already and they will thank you for it. Tell people something new and they will hate you for it.'
    publisher = 'George Monbiot'
    category = 'news, politics, UK, World'
    oldest_article = 15
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en_GB'
    remove_empty_feeds = True
    publication_type = 'blog'
    extra_css = """
        body{font-family: Arial,Helvetica,sans-serif }
        img{margin-bottom: 0.4em; display:block}
    """

    conversion_options = {
        'comment'   : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['meta','link']),
        dict(attrs={'class':'shareinpost'}),
        dict(attrs={'id':'paging'})
    ]
    remove_attributes = ['lang']
    keep_only_tags = [dict(attrs={'id':'content'})]

    feeds = [(u'Articles', u'http://www.monbiot.com/feed/atom/')]


@@ -2,20 +2,25 @@
 #!/usr/bin/env python

 __license__ = 'GPL v3'
-__copyright__ = '2010, matek09, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com; 2012, admroz, a.rozewicki@gmail.com'

 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
+from string import capwords
 import datetime

 class Newsweek(BasicNewsRecipe):
+
+    # how many issues to go back, 0 means get the most current one
+    BACK_ISSUES = 1
+
     EDITION = '0'
     DATE = None
     YEAR = datetime.datetime.now().year

     title = u'Newsweek Polska'
-    __author__ = 'matek09'
+    __author__ = 'matek09, admroz'
     description = 'Weekly magazine'
     encoding = 'utf-8'
     language = 'pl'
@@ -25,6 +30,9 @@ class Newsweek(BasicNewsRecipe):
     articles_are_obfuscated = True

+    #
+    # Parses each article
+    #
     def get_obfuscated_article(self, url):
         br = self.get_browser()
         br.open(url)
@@ -37,6 +45,27 @@ class Newsweek(BasicNewsRecipe):
         info = main_section.find('ul', attrs={'class' : 'articleInfo'})
         authors = info.find('li').find('h4')
         article = main_section.find('div', attrs={'id' : 'article'})
+
+        # remove related articles box
+        related = article.find('div', attrs={'class' : 'relatedBox'})
+        if related is not None:
+            related.extract()
+
+        # remove div with social networking links and links to
+        # other articles in web version
+        for div in article.findAll('div'):
+            if div.find('span', attrs={'class' : 'google-plus'}):
+                div.extract()
+
+            for p in div.findAll('p'):
+                if p.find('span', attrs={'style' : 'color: rgb(255, 0, 0);'}):
+                    p.extract()
+                    continue
+                for a in p.findAll('a'):
+                    if a.find('span', attrs={'style' : 'font-size: larger;'}):
+                        a.extract()
+
         html = unicode(title) + unicode(authors) + unicode(article)
         next = main_section.find('li', attrs={'class' : 'next'})
@@ -59,32 +88,34 @@ class Newsweek(BasicNewsRecipe):
             self.temp_files[-1].close()
         return self.temp_files[-1].name

-    def is_full(self, issue_soup):
-        while True:
-            main_section = issue_soup.find(id='mainSection')
-            next = main_section.find('li', attrs={'class' : 'next'})
-            if len(main_section.findAll(attrs={'class' : 'locked'})) > 1:
-                return False
-            elif next is None:
-                return True
-            else:
-                issue_soup = self.index_to_soup(next.find('a')['href'])
-
-    def find_last_full_issue(self, archive_url):
+    #
+    # Goes back given number of issues. It also knows how to go back
+    # to the previous year if there are not enough issues in the current one
+    #
+    def find_last_issue(self, archive_url):
         archive_soup = self.index_to_soup(archive_url)
         select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
-        for option in select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value')):
+        options = select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value'))
+
+        # check if need to go back to previous year
+        if len(options) > self.BACK_ISSUES:
+            option = options[self.BACK_ISSUES];
             self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/','')
             issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
-            if self.is_full(issue_soup):
-                return
-
-        self.YEAR = self.YEAR - 1
-        self.find_last_full_issue(archive_url + ',' + str(self.YEAR))
+        else:
+            self.BACK_ISSUES = self.BACK_ISSUES - len(options)
+            self.YEAR = self.YEAR - 1
+            self.find_last_issue(archive_url + ',' + str(self.YEAR))

+    #
+    # Looks for the last issue which we want to download. Then goes on each
+    # section and article and stores them (assigning to sections)
+    #
     def parse_index(self):
         archive_url = 'http://www.newsweek.pl/wydania/archiwum'
-        self.find_last_full_issue(archive_url)
+        self.find_last_issue(archive_url)
         soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
         self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
         main_section = soup.find(id='mainSection')
@@ -93,32 +124,44 @@ class Newsweek(BasicNewsRecipe):
         feeds = []
         articles = {}
         sections = []

-        while True:
-            news_list = main_section.find('ul', attrs={'class' : 'newsList'})
-            for h2 in news_list.findAll('h2'):
-                article = self.create_article(h2)
-                category_div = h2.findNext('div', attrs={'class' : 'kategorie'})
-                section = self.tag_to_string(category_div)
+        news_list = main_section.find('ul', attrs={'class' : 'newsList'})
+        section = 'Inne'
+
+        for li in news_list.findAll('li'):
+            h3 = li.find('h3')
+            if h3 is not None:
+                section = capwords(self.tag_to_string(h3))
+                continue
+            else:
+                h2 = li.find('h2')
+                if h2 is not None:
+                    article = self.create_article(h2)
+                    if article is None :
+                        continue

                 if articles.has_key(section):
                     articles[section].append(article)
                 else:
                     articles[section] = [article]
                     sections.append(section)

-            next = main_section.find('li', attrs={'class' : 'next'})
-            if next is None:
-                break
-            soup = self.index_to_soup(next.find('a')['href'])
-            main_section = soup.find(id='mainSection')
-
         for section in sections:
             feeds.append((section, articles[section]))
         return feeds
+    #
+    # Creates each article metadata (skips locked ones). The content will
+    # be extracted later by another method (get_obfuscated_article).
+    #
     def create_article(self, h2):
         article = {}
         a = h2.find('a')
+        if a is None:
+            return None
+
         article['title'] = self.tag_to_string(a)
         article['url'] = a['href']
         article['date'] = self.DATE
@@ -129,7 +172,3 @@ class Newsweek(BasicNewsRecipe):
         else:
             article['description'] = ''
         return article
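find_last_issue walks the archive's issue <select>; when BACK_ISSUES is larger than the number of issues listed for the current year, it subtracts that count, steps the year back, and recurses. A stripped-down sketch of just that arithmetic (options_by_year is a hypothetical stand-in for the scraped <select>):

    def pick_issue(options_by_year, year, back_issues):
        # options_by_year: dict year -> issue ids, newest first
        options = options_by_year[year]
        if len(options) > back_issues:
            return options[back_issues]   # index 0 == most recent issue
        return pick_issue(options_by_year, year - 1,
                          back_issues - len(options))

    # e.g. pick_issue({2012: ['a', 'b'], 2011: ['c', 'd', 'e']}, 2012, 3) -> 'd'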


@@ -1,12 +1,14 @@
-import re, mechanize
+import re, random
+from calibre import browser
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'The Sun UK'
     description = 'A Recipe for The Sun tabloid UK'
     __author__ = 'Dave Asbury'
-    # last updated 7/4/12
+    # last updated 29/4/12
     language = 'en_GB'
     oldest_article = 1
     max_articles_per_feed = 15
@@ -48,12 +50,10 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     feeds = [
-        (u'News','http://feed43.com/2517447382644748.xml'),
-        (u'Sport', u'http://feed43.com/4283846255668687.xml'),
-        (u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
-        (u'Film',u'http://feed43.com/1307545221226200.xml'),
-        (u'Music',u'http://feed43.com/1701513435064132.xml'),
-        (u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
+        (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
+        (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
+        (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
+        (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
     ]

     def get_cover_url(self):
@@ -61,14 +61,11 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
         # look for the block containing the sun button and url
         cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
         #cov = soup.find(attrs={'id' : 'large'})
         cov2 = str(cov)
         cov2='http://www.politicshome.com'+cov2[9:-133]
         #cov2 now contains url of the page containing pic
         soup = self.index_to_soup(cov2)
         cov = soup.find(attrs={'id' : 'large'})
@@ -76,16 +73,21 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
         cov2=cov2[27:-18]
         #cov2 now is pic url, now go back to original function
-        br = mechanize.Browser()
+        br = browser()
         br.set_handle_redirect(False)
         try:
             br.open_novisit(cov2)
             cover_url = cov2
         except:
-            cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
+            cover_url = random.choice((
+                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg'
+                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg'
+                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg'
+                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg'
+                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg'
+                ))
+        #cover_url = cov2
+        #cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'

         return cover_url


@@ -0,0 +1,17 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe

class ViceESRecipe(BasicNewsRecipe):
    title = u'Vice Magazine España'
    __author__ = 'atordo'
    description = u'La página web oficial de la revista Vice España'
    category = u'noticias, fotografía, blogs, moda, arte, cine, música, literatura, tecnología'
    cover_url = 'http://www.seeklogo.com/images/V/Vice-logo-668578AC94-seeklogo.com.gif'
    oldest_article = 20
    max_articles_per_feed = 30
    auto_cleanup = True
    no_stylesheets = True
    language = 'es'

    feeds = [('Vice', 'http://www.vice.com/es/rss')]


@@ -445,7 +445,7 @@ class LRFMetadataWriter(MetadataWriterPlugin):
 class MOBIMetadataWriter(MetadataWriterPlugin):
     name = 'Set MOBI metadata'
-    file_types = set(['mobi', 'prc', 'azw', 'azw4'])
+    file_types = set(['mobi', 'prc', 'azw', 'azw3', 'azw4'])
     description = _('Set metadata in %s files')%'MOBI'
     author = 'Marshall T. Vandegrift'
@@ -539,7 +539,8 @@ from calibre.ebooks.conversion.plugins.epub_output import EPUBOutput
 from calibre.ebooks.conversion.plugins.fb2_output import FB2Output
 from calibre.ebooks.conversion.plugins.lit_output import LITOutput
 from calibre.ebooks.conversion.plugins.lrf_output import LRFOutput
-from calibre.ebooks.conversion.plugins.mobi_output import MOBIOutput
+from calibre.ebooks.conversion.plugins.mobi_output import (MOBIOutput,
+        AZW3Output)
 from calibre.ebooks.conversion.plugins.oeb_output import OEBOutput
 from calibre.ebooks.conversion.plugins.pdb_output import PDBOutput
 from calibre.ebooks.conversion.plugins.pdf_output import PDFOutput
@@ -580,7 +581,7 @@ plugins += [
     FB2Output,
     LITOutput,
     LRFOutput,
-    MOBIOutput,
+    MOBIOutput, AZW3Output,
     OEBOutput,
     PDBOutput,
     PDFOutput,
@@ -1253,6 +1254,15 @@ class StoreBeWriteStore(StoreBase):
     headquarters = 'US'
     formats = ['EPUB', 'MOBI', 'PDF']

+class StoreBiblioStore(StoreBase):
+    name = u'Библио.бг'
+    author = 'Alex Stanev'
+    description = u'Електронна книжарница за книги и списания във формати ePUB и PDF. Част от заглавията са с активна DRM защита.'
+    actual_plugin = 'calibre.gui2.store.stores.biblio_plugin:BiblioStore'
+
+    headquarters = 'BG'
+    formats = ['EPUB', 'PDF']
+
 class StoreBookotekaStore(StoreBase):
     name = 'Bookoteka'
     author = u'Tomasz Długosz'
@@ -1596,6 +1606,7 @@ plugins += [
     StoreBNStore,
     StoreBeamEBooksDEStore,
     StoreBeWriteStore,
+    StoreBiblioStore,
     StoreBookotekaStore,
     StoreChitankaStore,
     StoreDieselEbooksStore,


@@ -54,6 +54,15 @@ Run an embedded python interpreter.
     parser.add_option('-m', '--inspect-mobi', action='store_true',
                       default=False,
                       help='Inspect the MOBI file(s) at the specified path(s)')
+    parser.add_option('--tweak-book', default=None,
+            help='Tweak the book (exports the book as a collection of HTML '
+                'files and metadata, which you can edit using standard HTML '
+                'editing tools, and then rebuilds the file from the edited HTML. '
+                'Makes no additional changes to the HTML, unlike a full calibre '
+                'conversion). Note that this tool will try to open the '
+                'folder containing the HTML files in the editor pointed to by the'
+                ' EDITOR environment variable.')
     parser.add_option('--test-build', help='Test binary modules in build',
                       action='store_true', default=False)
@@ -239,7 +248,9 @@ def main(args=sys.argv):
             prints('Inspecting:', path)
             inspect_mobi(path)
             print
+    elif opts.tweak_book:
+        from calibre.ebooks.tweak import tweak
+        tweak(opts.tweak_book)
     elif opts.test_build:
         from calibre.test_build import test
         test()
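Per the help text above, the new flag simply hands the path to calibre.ebooks.tweak.tweak. A sketch of the two equivalent entry points (the file name is hypothetical):

    # From the shell:
    #   calibre-debug --tweak-book book.azw3
    # Or directly from Python:
    from calibre.ebooks.tweak import tweak
    tweak('book.azw3')  # exports to HTML and opens $EDITOR, per the help text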


@@ -6,8 +6,32 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-from calibre.customize.conversion import OutputFormatPlugin
-from calibre.customize.conversion import OptionRecommendation
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+
+def remove_html_cover(oeb, log):
+    from calibre.ebooks.oeb.base import OEB_DOCS
+
+    if not oeb.metadata.cover \
+        or 'cover' not in oeb.guide:
+        return
+    href = oeb.guide['cover'].href
+    del oeb.guide['cover']
+    item = oeb.manifest.hrefs[href]
+    if item.spine_position is not None:
+        log.warn('Found an HTML cover: ', item.href, 'removing it.',
+                'If you find some content missing from the output MOBI, it '
+                'is because you misidentified the HTML cover in the input '
+                'document')
+        oeb.spine.remove(item)
+        if item.media_type in OEB_DOCS:
+            oeb.manifest.remove(item)
+
+def extract_mobi(output_path, opts):
+    if opts.extract_to is not None:
+        from calibre.ebooks.mobi.debug.main import inspect_mobi
+        ddir = opts.extract_to
+        inspect_mobi(output_path, ddir=ddir)

 class MOBIOutput(OutputFormatPlugin):
@@ -140,25 +164,6 @@ class MOBIOutput(OutputFormatPlugin):
             # Fix up the periodical href to point to first section href
             toc.nodes[0].href = toc.nodes[0].nodes[0].href

-    def remove_html_cover(self):
-        from calibre.ebooks.oeb.base import OEB_DOCS
-
-        oeb = self.oeb
-        if not oeb.metadata.cover \
-            or 'cover' not in oeb.guide:
-            return
-        href = oeb.guide['cover'].href
-        del oeb.guide['cover']
-        item = oeb.manifest.hrefs[href]
-        if item.spine_position is not None:
-            self.log.warn('Found an HTML cover: ', item.href, 'removing it.',
-                    'If you find some content missing from the output MOBI, it '
-                    'is because you misidentified the HTML cover in the input '
-                    'document')
-            oeb.spine.remove(item)
-            if item.media_type in OEB_DOCS:
-                self.oeb.manifest.remove(item)
-
     def convert(self, oeb, output_path, input_plugin, opts, log):
         from calibre.utils.config import tweaks
         from calibre.ebooks.mobi.writer2.resources import Resources
@@ -169,7 +174,7 @@ class MOBIOutput(OutputFormatPlugin):
             mobi_type = 'old' # Amazon does not support KF8 periodicals
         create_kf8 = mobi_type in ('new', 'both')

-        self.remove_html_cover()
+        remove_html_cover(self.oeb, self.log)
         resources = Resources(oeb, opts, self.is_periodical,
                 add_fonts=create_kf8)
         self.check_for_periodical()
@@ -185,7 +190,7 @@ class MOBIOutput(OutputFormatPlugin):
                 ) if create_kf8 else None
         if mobi_type == 'new':
             kf8.write(output_path)
-            self.extract_mobi(output_path, opts)
+            extract_mobi(output_path, opts)
             return

         self.log('Creating MOBI 6 output')
@@ -225,11 +230,72 @@ class MOBIOutput(OutputFormatPlugin):
         writer = MobiWriter(opts, resources, kf8,
                         write_page_breaks_after_item=write_page_breaks_after_item)
         writer(oeb, output_path)
-        self.extract_mobi(output_path, opts)
+        extract_mobi(output_path, opts)

-    def extract_mobi(self, output_path, opts):
-        if opts.extract_to is not None:
-            from calibre.ebooks.mobi.debug.main import inspect_mobi
-            ddir = opts.extract_to
-            inspect_mobi(output_path, ddir=ddir)
+class AZW3Output(OutputFormatPlugin):
+
+    name = 'AZW3 Output'
+    author = 'Kovid Goyal'
+    file_type = 'azw3'
+
+    options = set([
+        OptionRecommendation(name='prefer_author_sort',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('When present, use author sort field as author.')
+        ),
+        OptionRecommendation(name='no_inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Don\'t add Table of Contents to the book. Useful if '
+                'the book has its own table of contents.')),
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for any generated in-line table of contents.')
+        ),
+        OptionRecommendation(name='dont_compress',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Disable compression of the file contents.')
+        ),
+        OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
+            help=_('Tag marking book to be filed with Personal Docs')
+        ),
+        OptionRecommendation(name='mobi_toc_at_start',
+            recommended_value=False,
+            help=_('When adding the Table of Contents to the book, add it at the start of the '
+                'book instead of the end. Not recommended.')
+        ),
+        OptionRecommendation(name='extract_to', recommended_value=None,
+            help=_('Extract the contents of the MOBI file to the'
+                ' specified directory. If the directory already '
+                'exists, it will be deleted.')
+        ),
+        OptionRecommendation(name='share_not_sync', recommended_value=False,
+            help=_('Enable sharing of book content via Facebook etc. '
+                ' on the Kindle. WARNING: Using this feature means that '
+                ' the book will not auto sync its last read position '
+                ' on multiple devices. Complain to Amazon.')
+        ),
+    ])
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        from calibre.ebooks.mobi.writer2.resources import Resources
+        from calibre.ebooks.mobi.writer8.main import create_kf8_book
+
+        self.oeb, self.opts, self.log = oeb, opts, log
+        opts.mobi_periodical = self.is_periodical
+        passthrough = getattr(opts, 'mobi_passthrough', False)
+
+        resources = Resources(self.oeb, self.opts, self.is_periodical,
+                add_fonts=True, process_images=False)
+        if not passthrough:
+            remove_html_cover(self.oeb, self.log)
+
+            # Split on pagebreaks so that the resulting KF8 works better with
+            # calibre's viewer, which does not support CSS page breaks
+            from calibre.ebooks.oeb.transforms.split import Split
+            Split()(self.oeb, self.opts)
+
+        kf8 = create_kf8_book(self.oeb, self.opts, resources, for_joint=False)
+
+        kf8.write(output_path)
+        extract_mobi(output_path, opts)
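Hoisting remove_html_cover and extract_mobi to module level lets MOBIOutput and the new AZW3Output share them without inheriting from each other. With AZW3Output registered in builtins.py, the normal plugin lookup picks it up; a minimal sketch using a helper this commit itself uses in tweak.py:

    from calibre.customize.ui import plugin_for_output_format

    outp = plugin_for_output_format('azw3')   # -> the AZW3Output plugin
    # outp.convert(oeb, 'out.azw3', input_plugin, opts, log) then writes KF8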


@@ -179,7 +179,11 @@ class HeuristicProcessor(object):
         for match in re.finditer(pat, search_text):
             ital_string = str(match.group('words'))
             #self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
-            html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
+            try:
+                html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
+            except OverflowError:
+                # match.group(0) was too large to be compiled into a regex
+                continue
         return html
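re.sub compiles its pattern argument, and re.escape of a very large match can exceed the regex engine's code-size limits on Python 2, raising OverflowError; the new guard just skips such matches. A sketch of the failure mode the except clause covers (the size is hypothetical and build-dependent):

    import re

    huge = 'x' * (10 ** 6)      # pathological 'italic phrase'
    try:
        html = re.sub(re.escape(huge), '<i>%s</i>' % huge, huge)
    except OverflowError:
        pass                    # pattern too large to compile; leave text as-is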


@@ -141,9 +141,10 @@ class MOBIFile(object):
             self.files.append(File(skel, skeleton, ftext, first_aid, sections))

     def dump_flows(self, ddir):
-        if self.fdst is None:
-            raise ValueError('This MOBI file has no FDST record')
-        for i, x in enumerate(self.fdst.sections):
+        boundaries = [(0, len(self.raw_text))]
+        if self.fdst is not None:
+            boundaries = self.fdst.sections
+        for i, x in enumerate(boundaries):
             start, end = x
             raw = self.raw_text[start:end]
             with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:


@@ -234,6 +234,22 @@ class MetadataHeader(BookHeader):
         else:
             self.exth = None

+    @property
+    def kf8_type(self):
+        if (self.mobi_version == 8 and getattr(self, 'skelidx', NULL_INDEX) !=
+                NULL_INDEX):
+            return u'standalone'
+
+        kf8_header_index = getattr(self.exth, 'kf8_header', None)
+        if kf8_header_index is None:
+            return None
+        try:
+            if self.section_data(kf8_header_index-1) == b'BOUNDARY':
+                return u'joint'
+        except:
+            pass
+        return None
+
     def identity(self):
         self.stream.seek(60)
         ident = self.stream.read(8).upper()
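kf8_type is what distinguishes a standalone KF8 (azw3) file from a joint MOBI6+KF8 one; the new tweak module below relies on it. Minimal usage, assuming a seekable binary stream (the path is hypothetical):

    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.utils.logging import default_log

    with open('book.azw3', 'rb') as stream:
        header = MetadataHeader(stream, default_log)
        kind = header.kf8_type   # None, u'standalone', or u'joint'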


@@ -0,0 +1,84 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, glob

from calibre import CurrentDir
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.utils.logging import default_log
from calibre.ebooks import DRMError
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
from calibre.customize.ui import (plugin_for_input_format,
        plugin_for_output_format)
from calibre.utils.ipc.simple_worker import fork_job

class BadFormat(ValueError):
    pass

def do_explode(path, dest):
    with open(path, 'rb') as stream:
        mr = MobiReader(stream, default_log, None, None)

        with CurrentDir(dest):
            mr = Mobi8Reader(mr, default_log)
            opf = os.path.abspath(mr())

    return opf

def explode(path, dest, question=lambda x:True):
    with open(path, 'rb') as stream:
        raw = stream.read(3)
        stream.seek(0)
        if raw == b'TPZ':
            raise BadFormat(_('This is not a MOBI file. It is a Topaz file.'))

        try:
            header = MetadataHeader(stream, default_log)
        except MobiError:
            raise BadFormat(_('This is not a MOBI file.'))

        if header.encryption_type != 0:
            raise DRMError(_('This file is locked with DRM. It cannot be tweaked.'))

        kf8_type = header.kf8_type

        if kf8_type is None:
            raise BadFormat('This MOBI file does not contain a KF8 format book')

        if kf8_type == 'joint':
            if not question(_('This MOBI file contains both KF8 and '
                'older Mobi6 data. Tweaking it will remove the Mobi6 data, which '
                'means the file will not be usable on older Kindles. Are you '
                'sure?')):
                return None

    return fork_job('calibre.ebooks.mobi.tweak', 'do_explode', args=(path,
        dest), no_output=True)['result']

def do_rebuild(opf, dest_path):
    plumber = Plumber(opf, dest_path, default_log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')

    plumber.opts.mobi_passthrough = True
    oeb = create_oebbook(default_log, opf, plumber.opts)
    outp.convert(oeb, dest_path, inp, plumber.opts, default_log)

def rebuild(src_dir, dest_path):
    opf = glob.glob(os.path.join(src_dir, '*.opf'))
    if not opf:
        raise ValueError('No OPF file found in %s'%src_dir)
    opf = opf[0]
    fork_job('calibre.ebooks.mobi.tweak', 'do_rebuild', args=(opf, dest_path),
            no_output=True)
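explode and rebuild are designed as a round trip: explode unpacks the KF8 book to an OPF plus HTML in a directory (returning None if the user declines the joint-file warning), and rebuild converts that directory back through the azw3 pipeline with mobi_passthrough set. A sketch with hypothetical paths:

    from calibre.ebooks.mobi.tweak import explode, rebuild

    opf = explode('book.azw3', '/tmp/exploded')
    if opf is not None:
        # ... edit the HTML/OPF under /tmp/exploded ...
        rebuild('/tmp/exploded', 'book-tweaked.azw3')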


@@ -25,6 +25,15 @@ from calibre.ebooks.mobi.writer2.indexer import Indexer
 WRITE_UNCROSSABLE_BREAKS = False
 NULL_INDEX = 0xffffffff

+FLIS = (b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
+            b'\xff'*4)
+
+def fcis(text_length):
+    fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
+    fcis += pack(b'>I', text_length)
+    fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+    return fcis
+
 class MobiWriter(object):
     def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
@@ -208,14 +217,9 @@ class MobiWriter(object):

         # FCIS/FLIS (Seems to serve no purpose)
         flis_number = len(self.records)
-        self.records.append(
-            b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
-            b'\xff'*4)
-        fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
-        fcis += pack(b'>I', self.text_length)
-        fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+        self.records.append(FLIS)
         fcis_number = len(self.records)
-        self.records.append(fcis)
+        self.records.append(fcis(self.text_length))

         # EOF record
         self.records.append(b'\xE9\x8E\x0D\x0A')
@@ -379,6 +383,12 @@ class MobiWriter(object):
             self.resources.serialize(self.records, used_images)
         resource_record_count = len(self.records) - old

+        # FCIS/FLIS (Seems to serve no purpose)
+        flis_number = len(self.records)
+        self.records.append(FLIS)
+        fcis_number = len(self.records)
+        self.records.append(fcis(self.text_length))
+
         # Insert KF8 records
         self.records.append(b'BOUNDARY')
         kf8_header_index = len(self.records)
@ -398,6 +408,8 @@ class MobiWriter(object):
header_fields['exth_flags'] = 0b100001010000 # Kinglegen uses this header_fields['exth_flags'] = 0b100001010000 # Kinglegen uses this
header_fields['fdst_record'] = NULL_INDEX header_fields['fdst_record'] = NULL_INDEX
header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1 header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
header_fields['flis_record'] = flis_number
header_fields['fcis_record'] = fcis_number
extra_data_flags = 0b1 # Has multibyte overlap bytes extra_data_flags = 0b1 # Has multibyte overlap bytes
if self.primary_index_record_idx is not None: if self.primary_index_record_idx is not None:
extra_data_flags |= 0b10 extra_data_flags |= 0b10
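
As a quick illustration of the record layout that fcis() builds (a standalone check, not part of the commit): the Mobi6 FCIS record is a fixed 44 bytes, and its only variable field is the big-endian text length at byte offset 20.

# Standalone sketch: rebuild the FCIS record as defined above and verify
# that the text-length field round-trips. Only struct is needed.
from struct import pack, unpack

def fcis(text_length):
    rec = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
    rec += pack(b'>I', text_length)
    rec += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
    return rec

assert len(fcis(0)) == 44
assert unpack(b'>I', fcis(12345)[20:24])[0] == 12345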


@@ -19,9 +19,11 @@ PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\
 class Resources(object):

-    def __init__(self, oeb, opts, is_periodical, add_fonts=False):
+    def __init__(self, oeb, opts, is_periodical, add_fonts=False,
+            process_images=True):
         self.oeb, self.log, self.opts = oeb, oeb.log, opts
         self.is_periodical = is_periodical
+        self.process_images = process_images
         self.item_map = {}
         self.records = []
@@ -34,6 +36,8 @@ class Resources(object):
         self.add_resources(add_fonts)

     def process_image(self, data):
+        if not self.process_images:
+            return data
         return (mobify_image(data) if self.opts.mobi_keep_original_images else
                 rescale_image(data))
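
A hypothetical call site for the new flag (oeb, opts and image_data are assumed to be in scope; this is not code from the commit):

# Sketch: constructing Resources with process_images=False turns
# process_image() into a pass-through, leaving image data untouched.
res = Resources(oeb, opts, is_periodical=False, add_fonts=True,
                process_images=False)
raw = res.process_image(image_data)  # returned unchanged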


@@ -18,6 +18,14 @@ from calibre.ebooks.mobi.writer8.exth import build_exth
 from calibre.utils.filenames import ascii_filename

 NULL_INDEX = 0xffffffff
+FLIS = b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+ b'\xff'*4
+
+def fcis(text_length):
+    fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x02\x00\x00\x00\x00'
+    fcis += pack(b'>L', text_length)
+    fcis += b'\x00\x00\x00\x00\x00\x00\x00\x28\x00\x00\x00\x00\x00\x00\x00'
+    fcis += b'\x28\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+    return fcis

 class MOBIHeader(Header): # {{{
     '''
@@ -115,7 +123,10 @@ class MOBIHeader(Header): # {{{
     exth_flags = DYN

     # 132: Unknown
-    unknown = zeroes(36)
+    unknown = zeroes(32)
+
+    # 164: Unknown
+    unknown_index = NULL

     # 168: DRM
     drm_offset = NULL
@@ -130,13 +141,13 @@ class MOBIHeader(Header): # {{{
     fdst_record = DYN
     fdst_count = DYN

-    # 200: FCI
-    fcis_record = NULL
-    fcis_count
+    # 200: FCIS
+    fcis_record = DYN
+    fcis_count = 1

     # 208: FLIS
-    flis_record = NULL
-    flis_count
+    flis_record = DYN
+    flis_count = 1

     # 216: Unknown
     unknown3 = zeroes(8)
@@ -193,7 +204,7 @@ HEADER_FIELDS = {'compression', 'text_length', 'last_text_record', 'book_type',
     'first_resource_record', 'exth_flags', 'fdst_record',
     'fdst_count', 'ncx_index', 'chunk_index', 'skel_index',
     'guide_index', 'exth', 'full_title', 'extra_data_flags',
-    'uid'}
+    'flis_record', 'fcis_record', 'uid'}

 class KF8Book(object):
@@ -241,6 +252,12 @@ class KF8Book(object):
         self.fdst_record = len(self.records)
         self.records.extend(writer.fdst_records)

+        # FLIS/FCIS
+        self.flis_record = len(self.records)
+        self.records.append(FLIS)
+        self.fcis_record = len(self.records)
+        self.records.append(fcis(self.text_length))
+
         # EOF
         self.records.append(b'\xe9\x8e\r\n') # EOF record


@@ -13,7 +13,7 @@ from functools import partial
 from lxml import etree

-from calibre.ebooks.oeb.base import XHTML_NS
+from calibre.ebooks.oeb.base import XHTML_NS, extract
 from calibre.constants import ispy3
 from calibre.ebooks.mobi.utils import to_base
@@ -224,14 +224,24 @@ class Chunker(object):
         nroot.text = root.text
         nroot.tail = '\n'

-        for tag in root.iterdescendants(etree.Element):
-            # We are ignoring all non tag entities in the tree
-            # like comments and processing instructions, as they make the
-            # chunking code even harder, for minimal gain.
-            elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
+        # Remove Comments and ProcessingInstructions as kindlegen seems to
+        # remove them as well
+        for tag in root.iterdescendants():
+            if tag.tag in {etree.Comment, etree.ProcessingInstruction}:
+                extract(tag)
+
+        for tag in root.iterdescendants():
+            if tag.tag == etree.Entity:
+                elem = etree.Entity(tag.name)
+            else:
+                tn = tag.tag
+                if tn is not None:
+                    tn = tn.rpartition('}')[-1]
+                elem = nroot.makeelement(tn,
                     attrib={k.rpartition('}')[-1]:v for k, v in
                         tag.attrib.iteritems()})
-            elem.text, elem.tail = tag.text, tag.tail
+            elem.text = tag.text
+            elem.tail = tag.tail
             parent = node_from_path(nroot, path_to_node(tag.getparent()))
             parent.append(elem)
@@ -251,6 +261,11 @@ class Chunker(object):
         # Now loop over children
         for child in list(tag):
             raw = tostring(child, with_tail=False)
+            if child.tag == etree.Entity:
+                chunks.append(raw)
+                if child.tail:
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
+                continue
             raw = close_self_closing_tags(raw)
             if len(raw) > CHUNK_SIZE and child.get('aid', None):
                 self.step_into_tag(child, chunks)
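
The comment and processing-instruction removal above can be illustrated with a self-contained lxml snippet (an approximation of what the extract() helper is used for here; not code from the commit):

# Sketch: delete comments and processing instructions while splicing their
# tail text back into the tree, so no document text is lost.
from lxml import etree

root = etree.fromstring('<html><!-- note --><p>one<?pi data?>two</p></html>')
for node in list(root.iterdescendants()):
    if node.tag in (etree.Comment, etree.ProcessingInstruction):
        parent = node.getparent()
        if node.tail:  # keep the text that followed the stripped node
            prev = node.getprevious()
            if prev is not None:
                prev.tail = (prev.tail or '') + node.tail
            else:
                parent.text = (parent.text or '') + node.tail
        parent.remove(node)

print(etree.tostring(root))  # b'<html><p>onetwo</p></html>'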

src/calibre/ebooks/tweak.py (new file, 137 lines)

@@ -0,0 +1,137 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, shlex, subprocess
from calibre import prints, as_unicode, walk
from calibre.constants import iswindows, __appname__
from calibre.ptempfile import TemporaryDirectory
from calibre.libunzip import extract as zipextract
from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED, ZIP_STORED
from calibre.utils.ipc.simple_worker import WorkerError
class Error(ValueError):
pass
def ask_cli_question(msg):
prints(msg, end=' [y/N]: ')
sys.stdout.flush()
if iswindows:
import msvcrt
ans = msvcrt.getch()
else:
import tty, termios
old_settings = termios.tcgetattr(sys.stdin.fileno())
try:
tty.setraw(sys.stdin.fileno())
try:
ans = sys.stdin.read(1)
except KeyboardInterrupt:
ans = b''
finally:
termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN, old_settings)
print()
return ans == b'y'
def mobi_exploder(path, tdir, question=lambda x:True):
from calibre.ebooks.mobi.tweak import explode, BadFormat
try:
return explode(path, tdir, question=question)
except BadFormat as e:
raise Error(as_unicode(e))
def zip_exploder(path, tdir, question=lambda x:True):
zipextract(path, tdir)
for f in walk(tdir):
if f.lower().endswith('.opf'):
return f
raise Error('Invalid book: Could not find .opf')
def zip_rebuilder(tdir, path):
with ZipFile(path, 'w', compression=ZIP_DEFLATED) as zf:
# Write mimetype
mt = os.path.join(tdir, 'mimetype')
if os.path.exists(mt):
zf.write(mt, 'mimetype', compress_type=ZIP_STORED)
# Write everything else
exclude_files = {'.DS_Store', 'mimetype', 'iTunesMetadata.plist'}
for root, dirs, files in os.walk(tdir):
for fn in files:
if fn in exclude_files:
continue
absfn = os.path.join(root, fn)
zfn = os.path.relpath(absfn, tdir).replace(os.sep, '/')
zf.write(absfn, zfn)
def get_tools(fmt):
fmt = fmt.lower()
if fmt in {'mobi', 'azw', 'azw3'}:
from calibre.ebooks.mobi.tweak import rebuild
ans = mobi_exploder, rebuild
elif fmt in {'epub', 'htmlz'}:
ans = zip_exploder, zip_rebuilder
else:
ans = None, None
return ans
def tweak(ebook_file):
''' Command line interface to the Tweak Book tool '''
fmt = ebook_file.rpartition('.')[-1].lower()
exploder, rebuilder = get_tools(fmt)
if exploder is None:
        prints('Cannot tweak %s files. Supported formats are: EPUB, HTMLZ, AZW3, MOBI'
                % fmt.upper(), file=sys.stderr)
raise SystemExit(1)
with TemporaryDirectory('_tweak_'+
os.path.basename(ebook_file).rpartition('.')[0]) as tdir:
try:
opf = exploder(ebook_file, tdir, question=ask_cli_question)
except WorkerError as e:
prints('Failed to unpack', ebook_file)
prints(e.orig_tb)
raise SystemExit(1)
except Error as e:
prints(as_unicode(e), file=sys.stderr)
raise SystemExit(1)
if opf is None:
# The question was answered with No
return
ed = os.environ.get('EDITOR', None)
proceed = False
if ed is None:
prints('Book extracted to', tdir)
prints('Make your tweaks and once you are done,', __appname__,
'will rebuild', ebook_file, 'from', tdir)
print()
proceed = ask_cli_question('Rebuild ' + ebook_file + '?')
else:
cmd = shlex.split(ed)
try:
subprocess.check_call(cmd + [tdir])
except:
prints(ed, 'failed, aborting...')
raise SystemExit(1)
proceed = True
if proceed:
prints('Rebuilding', ebook_file, 'please wait ...')
try:
rebuilder(tdir, ebook_file)
except WorkerError as e:
prints('Failed to rebuild', ebook_file)
prints(e.orig_tb)
raise SystemExit(1)
prints(ebook_file, 'successfully tweaked')
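
A hypothetical way to drive tweak() programmatically (the calibre command-line wrapper that calls it is not shown in this diff; 'mybook.epub' is a placeholder):

# Sketch: with EDITOR set, tweak() explodes the book, opens the exploded
# directory in the editor, and rebuilds the book when the editor exits.
import os
from calibre.ebooks.tweak import tweak

os.environ['EDITOR'] = 'gvim -f'  # any blocking editor command works
tweak('mybook.epub')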


@@ -0,0 +1,33 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.gui2.convert.azw3_output_ui import Ui_Form
from calibre.gui2.convert import Widget
font_family_model = None
class PluginWidget(Widget, Ui_Form):
TITLE = _('AZW3 Output')
HELP = _('Options specific to')+' AZW3 '+_('output')
COMMIT_NAME = 'azw3_output'
ICON = I('mimetypes/mobi.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['prefer_author_sort', 'toc_title',
'mobi_ignore_margins', 'mobi_toc_at_start',
'dont_compress', 'no_inline_toc', 'share_not_sync',
'personal_doc']#, 'mobi_navpoints_only_deepest']
)
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)


@@ -0,0 +1,125 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>588</width>
<height>342</height>
</rect>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="4" column="0" colspan="2">
<widget class="QCheckBox" name="opt_prefer_author_sort">
<property name="text">
<string>Use author &amp;sort for author</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>&amp;Title for Table of Contents:</string>
</property>
<property name="buddy">
<cstring>opt_toc_title</cstring>
</property>
</widget>
</item>
<item row="1" column="1">
<widget class="QLineEdit" name="opt_toc_title"/>
</item>
<item row="5" column="0">
<widget class="QCheckBox" name="opt_dont_compress">
<property name="text">
<string>Disable compression of the file contents</string>
</property>
</widget>
</item>
<item row="0" column="0">
<widget class="QCheckBox" name="opt_no_inline_toc">
<property name="text">
<string>Do not add Table of Contents to book</string>
</property>
</widget>
</item>
<item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_mobi_toc_at_start">
<property name="text">
<string>Put generated Table of Contents at &amp;start of book instead of end</string>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QCheckBox" name="opt_mobi_ignore_margins">
<property name="text">
<string>Ignore &amp;margins</string>
</property>
</widget>
</item>
<item row="7" column="0" colspan="2">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Kindle options</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QLabel" name="label_3">
<property name="text">
<string>Personal Doc tag:</string>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="opt_personal_doc"/>
</item>
</layout>
</item>
<item>
<widget class="QCheckBox" name="opt_share_not_sync">
<property name="text">
<string>Enable sharing of book content via Facebook, etc. WARNING: Disables last read syncing</string>
</property>
</widget>
</item>
<item>
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
</layout>
</widget>
</item>
<item row="8" column="0">
<spacer name="verticalSpacer_2">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
</layout>
</widget>
<resources/>
<connections/>
</ui>


@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2012, Alex Stanev <alex@stanev.org>'
__docformat__ = 'restructuredtext en'
import re
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult
class BiblioStore(BasicStoreConfig, OpenSearchOPDSStore):
open_search_url = 'http://biblio.bg/feed.opds.php'
web_url = 'http://biblio.bg/'
def search(self, query, max_results=10, timeout=60):
# check for cyrillic symbols before performing search
uquery = unicode(query.strip(), 'utf-8')
reObj = re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery)
if not reObj:
return
for s in OpenSearchOPDSStore.search(self, query, max_results, timeout):
yield s
def get_details(self, search_result, timeout):
# get format and DRM status
from calibre import browser
from contextlib import closing
from lxml import html
br = browser()
with closing(br.open(search_result.detail_item, timeout=timeout)) as nf:
idata = html.fromstring(nf.read())
search_result.formats = ''
if idata.xpath('.//span[@class="format epub"]'):
search_result.formats = 'EPUB'
if idata.xpath('.//span[@class="format pdf"]'):
if search_result.formats == '':
search_result.formats = 'PDF'
else:
                        search_result.formats += ', PDF'
if idata.xpath('.//span[@class="format nodrm-icon"]'):
search_result.drm = SearchResult.DRM_UNLOCKED
else:
search_result.drm = SearchResult.DRM_LOCKED
return True
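
The Cyrillic pre-check in search() can be seen in isolation (an illustrative reading of its intent: the store only indexes Bulgarian titles, so purely Latin queries are rejected without a network round trip):

# Sketch: the same regex accepts queries of three or more Cyrillic letters,
# digits or spaces, and rejects everything else.
import re

pattern = u'^[а-яА-Я\\d\\s]{3,}$'
print(bool(re.search(pattern, u'приказки')))     # True  -> searched
print(bool(re.search(pattern, u'fairy tales')))  # False -> skipped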


@@ -22,7 +22,7 @@ It can convert every input format in the following list, to every output format.
 *Input Formats:* CBZ, CBR, CBC, CHM, DJVU, EPUB, FB2, HTML, HTMLZ, LIT, LRF, MOBI, ODT, PDF, PRC, PDB, PML, RB, RTF, SNB, TCR, TXT, TXTZ

-*Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, HTMLZ, PDB, PML, RB, PDF, RTF, SNB, TCR, TXT, TXTZ
+*Output Formats:* AZW3, EPUB, FB2, OEB, LIT, LRF, MOBI, HTMLZ, PDB, PML, RB, PDF, RTF, SNB, TCR, TXT, TXTZ

 .. note ::
@@ -35,7 +35,7 @@ It can convert every input format in the following list, to every output format.
 What are the best source formats to convert?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-In order of decreasing preference: LIT, MOBI, EPUB, FB2, HTML, PRC, RTF, PDB, TXT, PDF
+In order of decreasing preference: LIT, MOBI, AZW, EPUB, AZW3, FB2, HTML, PRC, RTF, PDB, TXT, PDF

 I converted a PDF file, but the result has various problems?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


@@ -245,7 +245,7 @@ The following functions are available in addition to those described in single-f
 * ``current_library_name() -- `` return the last name on the path to the current calibre library. This function can be called in template program mode using the template ``{:'current_library_name()'}``.
 * ``days_between(date1, date2)`` -- return the number of days between ``date1`` and ``date2``. The number is positive if ``date1`` is greater than ``date2``, otherwise negative. If either ``date1`` or ``date2`` are not dates, the function returns the empty string.
 * ``divide(x, y)`` -- returns x / y. Throws an exception if either x or y are not numbers.
-* ``eval(string)`` -- evaluates the string as a program, passing the local variables (those ``assign`` ed to). This permits using the template processor to construct complex results from local variables.
+* ``eval(string)`` -- evaluates the string as a program, passing the local variables (those ``assign`` ed to). This permits using the template processor to construct complex results from local variables. Because the `{` and `}` characters are special, you must use `[[` for the `{` character and `]]` for the `}` character; they are converted automatically. Note also that prefixes and suffixes (the "|prefix|suffix" syntax) cannot be used in the argument to this function when using template program mode.
 * ``field(name)`` -- returns the metadata field named by ``name``.
 * ``first_non_empty(value, value, ...)`` -- returns the first value that is not empty. If all values are empty, then the empty value is returned. You can have as many values as you want.
 * ``format_date(x, date_format)`` -- format_date(val, format_string) -- format the value, which must be a date field, using the format_string, returning a string. The formatting codes are::
@@ -306,7 +306,7 @@ The following functions are available in addition to those described in single-f
 * ``substr(str, start, end)`` -- returns the ``start``'th through the ``end``'th characters of ``str``. The first character in ``str`` is the zero'th character. If end is negative, then it indicates that many characters counting from the right. If end is zero, then it indicates the last character. For example, ``substr('12345', 1, 0)`` returns ``'2345'``, and ``substr('12345', 1, -1)`` returns ``'234'``.
 * ``subtract(x, y)`` -- returns x - y. Throws an exception if either x or y are not numbers.
 * ``today()`` -- return a date string for today. This value is designed for use in format_date or days_between, but can be manipulated like any other string. The date is in ISO format.
-* ``template(x)`` -- evaluates x as a template. The evaluation is done in its own context, meaning that variables are not shared between the caller and the template evaluation. Because the `{` and `}` characters are special, you must use `[[` for the `{` character and `]]` for the `}` character; they are converted automatically. For example, ``template('[[title_sort]]')`` will evaluate the template ``{title_sort}`` and return its value.
+* ``template(x)`` -- evaluates x as a template. The evaluation is done in its own context, meaning that variables are not shared between the caller and the template evaluation. Because the `{` and `}` characters are special, you must use `[[` for the `{` character and `]]` for the `}` character; they are converted automatically. For example, ``template('[[title_sort]]')`` will evaluate the template ``{title_sort}`` and return its value. Note also that prefixes and suffixes (the "|prefix|suffix" syntax) cannot be used in the argument to this function when using template program mode.

 .. _template_functions_reference:
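
A hedged illustration of the ``eval`` note above (an example of our own, not taken from the manual): in template program mode,

    program: assign(a, 'Hello'); eval('[[a]], world')

assigns ``'Hello'`` to ``a``; ``eval`` then converts ``[[a]], world`` into the template ``{a}, world`` and returns ``Hello, world``.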


@@ -387,7 +387,7 @@ def _prefs():
             help=_('The language in which to display the user interface'))
     c.add_opt('output_format', default='EPUB',
               help=_('The default output format for ebook conversions.'))
-    c.add_opt('input_format_order', default=['EPUB', 'MOBI', 'LIT', 'PRC',
+    c.add_opt('input_format_order', default=['EPUB', 'AZW3', 'MOBI', 'LIT', 'PRC',
         'FB2', 'HTML', 'HTM', 'XHTM', 'SHTML', 'XHTML', 'ZIP', 'ODT', 'RTF', 'PDF',
         'TXT'],
               help=_('Ordered list of formats to prefer for input.'))


@@ -217,7 +217,9 @@ class BuiltinTemplate(BuiltinFormatterFunction):
             'characters are special, you must use [[ for the { character and '
             ']] for the } character; they are converted automatically. '
             'For example, template(\'[[title_sort]]\') will evaluate the '
-            'template {title_sort} and return its value.')
+            'template {title_sort} and return its value. Note also that '
+            'prefixes and suffixes (the "|prefix|suffix" syntax) cannot be '
+            'used in the argument to this function when using template program mode.')

     def evaluate(self, formatter, kwargs, mi, locals, template):
         template = template.replace('[[', '{').replace(']]', '}')
@@ -230,7 +232,12 @@ class BuiltinEval(BuiltinFormatterFunction):
     __doc__ = doc = _('eval(template) -- evaluates the template, passing the local '
             'variables (those \'assign\'ed to) instead of the book metadata. '
             ' This permits using the template processor to construct complex '
-            'results from local variables.')
+            'results from local variables. Because the { and } '
+            'characters are special, you must use [[ for the { character and '
+            ']] for the } character; they are converted automatically. '
+            'Note also that prefixes and suffixes (the "|prefix|suffix" syntax) '
+            'cannot be used in the argument to this function when using '
+            'template program mode.')

     def evaluate(self, formatter, kwargs, mi, locals, template):
         from formatter import EvalFormatter