"),
(re.compile(r"
", re.DOTALL|re.IGNORECASE),
- lambda match: "")
+ lambda match: ""),
+ (re.compile(r'

', re.DOTALL|re.IGNORECASE),
+ lambda match: ''),
+ (re.compile(r'

', re.DOTALL|re.IGNORECASE),
+ lambda match: ''),
+ (re.compile(r'

', re.DOTALL|re.IGNORECASE),
+ lambda match: ''),
+ #(re.compile(r'[
.+?]', re.DOTALL|re.IGNORECASE),
+ #lambda match: '')
]
elif __Region__ == 'Vancouver':
if __UseChineseTitle__ == True:
@@ -221,6 +240,10 @@ class MPRecipe(BasicNewsRecipe):
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")
+
+ # Note: does not work with custom date given by __Date__
+ def get_weekday(self):
+ return self.get_dtlocal().weekday()
def get_cover_url(self):
if __Region__ == 'Hong Kong':
@@ -260,7 +283,23 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
- if __InclPremium__ == True:
+# if __InclPremium__ == True:
+# # parse column section articles directly from .txt files
+# for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+# ]:
+# articles = self.parse_section2_txt(url, keystr)
+# if articles:
+# feeds.append((title, articles))
+#
+# for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+# (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+# articles = self.parse_section(url)
+# if articles:
+# feeds.append((title, articles))
+
+ # new
+ if __InclPremium__ == True and (self.get_weekday() <> 6 or __ParseSelectedMobile__ == False):
+ # go ahead unless it is Sunday and __ParseSelectedMobile__ is enabled
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
@@ -268,17 +307,45 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
- for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
- (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
- articles = self.parse_section(url)
+ if __InclPremium__ == False or self.get_weekday() <> 6:
+ for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
+ if articles:
+ feeds.append((title, articles))
+ else:
+ if __InclPremium__ == True and __ParseSelectedMobile__ == True:
+ articles = self.parse_section_mobile('http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1')
+ if articles:
+ feeds.append((u'\u526f\u520a Supplement', articles))
+ else:
+ for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
+ if articles:
+ feeds.append((title, articles))
+
+ for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
if articles:
feeds.append((title, articles))
+ # end of new
else:
- for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
- (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
- (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
- (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
- articles = self.parse_section(url)
+ for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'),
+ (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm', 'gb'),
+ (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm', 'gf'),
+ (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
if articles:
feeds.append((title, articles))
@@ -287,10 +354,13 @@ class MPRecipe(BasicNewsRecipe):
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
- for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
- (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
- (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
- articles = self.parse_section(url)
+ for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm', 'fa'),
+ (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm', 'ca'),
+ (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
if articles:
feeds.append((title, articles))
@@ -322,7 +392,9 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
- if __InclPremium__ == True:
+
+ if __InclPremium__ == True and (self.get_weekday() <> 6 or __ParseSelectedMobile__ == False):
+ # go ahead unless it is Sunday and __ParseSelectedMobile__ is enabled
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
@@ -330,12 +402,36 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
- for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
- (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
- articles = self.parse_section(url)
+ if __InclPremium__ == False or self.get_weekday() <> 6:
+ for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
+ if articles:
+ feeds.append((title, articles))
+ else:
+ if __InclPremium__ == True and __ParseSelectedMobile__ == True:
+ articles = self.parse_section_mobile('http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1')
+ if articles:
+ feeds.append((u'\u526f\u520a Supplement', articles))
+ else:
+ for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
+ if articles:
+ feeds.append((title, articles))
+
+ for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
if articles:
feeds.append((title, articles))
-
+
elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@@ -366,7 +462,7 @@ class MPRecipe(BasicNewsRecipe):
feeds.append((title, articles))
return feeds
- # parse from news.mingpao.com
+ # parse from news.mingpao.com (web html)
def parse_section(self, url):
dateStr = self.get_fetchdate()
soup = self.index_to_soup(url)
@@ -379,17 +475,57 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a)
url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url
- # replace the url to the print-friendly version
- if __ParsePFF__ == True:
+ # replace the url to the alternative version
+ if __ParsePF__ == True:
+ # printer-friendly option
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
- title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
+ if __InclPremium__ == True:
+ title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
- if url not in included_urls and url.rfind('Redirect') == -1:
+ #if url not in included_urls and url.rfind('Redirect') == -1 and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
+ if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
+ current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+ included_urls.append(url)
+ current_articles.reverse()
+ return current_articles
+
+ # parse from news.mingpao.com (txt)
+ def parse_section_txt(self, url, ch):
+ dateStr = self.get_fetchdate()
+ soup = self.index_to_soup(url)
+ divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+ current_articles = []
+ included_urls = []
+ divs.reverse()
+ for i in divs:
+ a = i.find('a', href = True)
+ title = self.tag_to_string(a)
+ url = a.get('href', False)
+ #print 'Base url: ', url
+ # replace the url to the alternative version
+ # text version
+ if url.rfind('Redirect') <> -1:
+ url = 'http://news.mingpao.com/' + dateStr + '/' +url
+ #print 'original url: ', url
+ url = re.sub(dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url)
+ url = re.sub('%2F', '/', url)
+ if __InclPremium__ == True:
+ title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
+ url = url.replace('%2Etxt', '.txt')
+ url = url.replace('%5F', '_')
+ else:
+ # use the first two characters of the url as seckey
+ seckey = url[0:2]
+ url = url.replace('.htm', '.txt')
+ url = 'http://news.mingpao.com/ftp/WebNews2/' + dateStr + '/' + ch + '/' + seckey + '/' + url
+ #print 'updated url: ', url
+ if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
+ #if url not in included_urls and (url.rfind('Redirect') == -1) and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url)
current_articles.reverse()
@@ -415,7 +551,7 @@ class MPRecipe(BasicNewsRecipe):
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
- print 'skipping a premium article'
+ print 'skipping a premium article'
current_articles.reverse()
return current_articles
@@ -437,6 +573,20 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles
+ # parse from mobile version
+ def parse_section_mobile(self, base, page):
+ soup = self.index_to_soup(base + '/' + page)
+ a = soup.findAll('a', href=True)
+ current_articles = []
+ included_urls = []
+ for i in a:
+ title = self.tag_to_string(i)
+ url = i.get('href', False)
+ if url not in included_urls and url.rfind('HotNews2.cfm') <> -1:
+ current_articles.append({'title': title, 'url': base + '/' + url, 'description': ''})
+ included_urls.append(url)
+ return current_articles
+
# parse from www.mingpaovan.com
def parse_section3(self, url, baseUrl):
self.get_fetchdate()
@@ -631,15 +781,22 @@ class MPRecipe(BasicNewsRecipe):
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
- # find the location of the first _
- pos = img.find('_')
- if pos > -1:
- # if found, insert _ after the first _
- newimg = img[0:pos] + '_' + img[pos:]
- new_html = new_html.replace(img, newimg)
+ if __ParseTxt__ == False:
+ # find the location of the first _
+ pos = img.find('_')
+ if pos > -1:
+ # if found, insert _ after the first _
+ newimg = img[0:pos] + '_' + img[pos:]
+ new_html = new_html.replace(img, newimg)
+ else:
+ # if not found, insert _ after "
+ new_html = new_html.replace(img[1:], '"_' + img[1:])
else:
- # if not found, insert _ after "
- new_html = new_html.replace(img[1:], '"_' + img[1:])
+ # prepend _ to the image reference (the part of img after its first 5 characters)
+ #print 'imgstr: ', img
+ pos = img.find('_')
+ new_html = new_html.replace(img[5:], '_' + img[5:])
+
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
@@ -673,9 +830,13 @@ class MPRecipe(BasicNewsRecipe):
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
+ # test
+ #print new_html
return new_html
def preprocess_html(self, soup):
+ for mobiletitle in soup.findAll('font', attrs={'color': ['navy']}):
+ mobiletitle.name = 'h1'
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(style=True):
@@ -909,3 +1070,4 @@ class MPRecipe(BasicNewsRecipe):
opf.render(opf_file, ncx_file)
+
diff --git a/recipes/monbiot.recipe b/recipes/monbiot.recipe
new file mode 100644
index 0000000000..5cc50c24d1
--- /dev/null
+++ b/recipes/monbiot.recipe
@@ -0,0 +1,43 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic
'
+'''
+www.monbiot.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class GeorgeMonbiot(BasicNewsRecipe):
+ title = 'George Monbiot - blog'
+ __author__ = 'Darko Miletic'
+ description = 'Tell people something they know already and they will thank you for it. Tell people something new and they will hate you for it.'
+ publisher = 'George Monbiot'
+ category = 'news, politics, UK, World'
+ oldest_article = 15
+ max_articles_per_feed = 200
+ no_stylesheets = True
+ encoding = 'utf8'
+ use_embedded_content = False
+ language = 'en_GB'
+ remove_empty_feeds = True
+ publication_type = 'blog'
+ extra_css = """
+ body{font-family: Arial,Helvetica,sans-serif }
+ img{margin-bottom: 0.4em; display:block}
+ """
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ remove_tags = [
+ dict(name=['meta','link']),
+ dict(attrs={'class':'shareinpost'}),
+ dict(attrs={'id':'paging'})
+ ]
+ remove_attributes=['lang']
+ keep_only_tags=[dict(attrs={'id':'content'})]
+
+ feeds = [(u'Articles', u'http://www.monbiot.com/feed/atom/')]
diff --git a/recipes/newsweek_polska.recipe b/recipes/newsweek_polska.recipe
index c8c53af655..4625eb89e6 100644
--- a/recipes/newsweek_polska.recipe
+++ b/recipes/newsweek_polska.recipe
@@ -2,20 +2,25 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2010, matek09, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com; 2012, admroz, a.rozewicki@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
+from string import capwords
import datetime
class Newsweek(BasicNewsRecipe):
+
+ # how many issues to go back, 0 means get the most current one
+ BACK_ISSUES = 1
+
EDITION = '0'
DATE = None
YEAR = datetime.datetime.now().year
title = u'Newsweek Polska'
- __author__ = 'matek09'
+ __author__ = 'matek09, admroz'
description = 'Weekly magazine'
encoding = 'utf-8'
language = 'pl'
@@ -25,6 +30,9 @@ class Newsweek(BasicNewsRecipe):
articles_are_obfuscated = True
+ #
+ # Parses each article
+ #
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
@@ -37,7 +45,28 @@ class Newsweek(BasicNewsRecipe):
info = main_section.find('ul', attrs={'class' : 'articleInfo'})
authors = info.find('li').find('h4')
article = main_section.find('div', attrs={'id' : 'article'})
- html = unicode(title) + unicode(authors) + unicode(article)
+
+ # remove related articles box
+ related = article.find('div', attrs={'class' : 'relatedBox'})
+ if related is not None:
+ related.extract()
+
+ # remove div with social networking links and links to
+ # other articles in web version
+ for div in article.findAll('div'):
+ if div.find('span', attrs={'class' : 'google-plus'}):
+ div.extract()
+
+ for p in div.findAll('p'):
+ if p.find('span', attrs={'style' : 'color: rgb(255, 0, 0);'}):
+ p.extract()
+ continue
+ for a in p.findAll('a'):
+ if a.find('span', attrs={'style' : 'font-size: larger;'}):
+ a.extract()
+
+
+ html = unicode(title) + unicode(authors) + unicode(article)
next = main_section.find('li', attrs={'class' : 'next'})
while next:
@@ -58,33 +87,35 @@ class Newsweek(BasicNewsRecipe):
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
-
- def is_full(self, issue_soup):
- while True:
- main_section = issue_soup.find(id='mainSection')
- next = main_section.find('li', attrs={'class' : 'next'})
- if len(main_section.findAll(attrs={'class' : 'locked'})) > 1:
- return False
- elif next is None:
- return True
- else:
- issue_soup = self.index_to_soup(next.find('a')['href'])
- def find_last_full_issue(self, archive_url):
+
+ #
+ # Goes back the given number of issues (BACK_ISSUES). It also knows how to go back
+ # to the previous year if there are not enough issues in the current one
+ #
+ def find_last_issue(self, archive_url):
archive_soup = self.index_to_soup(archive_url)
select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
- for option in select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value')):
+ options = select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value'))
+
+ # check if need to go back to previous year
+ if len(options) > self.BACK_ISSUES:
+ option = options[self.BACK_ISSUES];
self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/','')
issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
- if self.is_full(issue_soup):
- return
-
- self.YEAR = self.YEAR - 1
- self.find_last_full_issue(archive_url + ',' + str(self.YEAR))
-
+ else:
+ self.BACK_ISSUES = self.BACK_ISSUES - len(options)
+ self.YEAR = self.YEAR - 1
+ self.find_last_issue(archive_url + ',' + str(self.YEAR))
+
+
+ #
+ # Looks for the last issue which we want to download. Then goes through each
+ # section and article and stores them (grouped by section)
+ #
def parse_index(self):
archive_url = 'http://www.newsweek.pl/wydania/archiwum'
- self.find_last_full_issue(archive_url)
+ self.find_last_issue(archive_url)
soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
main_section = soup.find(id='mainSection')
@@ -93,32 +124,44 @@ class Newsweek(BasicNewsRecipe):
feeds = []
articles = {}
sections = []
- while True:
- news_list = main_section.find('ul', attrs={'class' : 'newsList'})
- for h2 in news_list.findAll('h2'):
+
+ news_list = main_section.find('ul', attrs={'class' : 'newsList'})
+ section = 'Inne'
+
+ for li in news_list.findAll('li'):
+ h3 = li.find('h3')
+ if h3 is not None:
+ section = capwords(self.tag_to_string(h3))
+ continue
+ else:
+ h2 = li.find('h2')
+ if h2 is not None:
+ article = self.create_article(h2)
+ if article is None :
+ continue
- article = self.create_article(h2)
- category_div = h2.findNext('div', attrs={'class' : 'kategorie'})
- section = self.tag_to_string(category_div)
- if articles.has_key(section):
- articles[section].append(article)
- else:
- articles[section] = [article]
- sections.append(section)
+ if articles.has_key(section):
+ articles[section].append(article)
+ else:
+ articles[section] = [article]
+ sections.append(section)
- next = main_section.find('li', attrs={'class' : 'next'})
- if next is None:
- break
- soup = self.index_to_soup(next.find('a')['href'])
- main_section = soup.find(id='mainSection')
for section in sections:
feeds.append((section, articles[section]))
return feeds
+
+ #
+ # Creates the metadata for each article (skips locked ones). The content will
+ # be extracted later by another method (get_obfuscated_article).
+ #
def create_article(self, h2):
article = {}
a = h2.find('a')
+ if a is None:
+ return None
+
article['title'] = self.tag_to_string(a)
article['url'] = a['href']
article['date'] = self.DATE
@@ -129,7 +172,3 @@ class Newsweek(BasicNewsRecipe):
else:
article['description'] = ''
return article
-
-
-
-
diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe
index db74e003a0..11500430ff 100644
--- a/recipes/the_sun.recipe
+++ b/recipes/the_sun.recipe
@@ -1,12 +1,14 @@
-import re, mechanize
+import re, random
+
+from calibre import browser
from calibre.web.feeds.recipes import BasicNewsRecipe
+
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'The Sun UK'
-
description = 'A Recipe for The Sun tabloid UK'
__author__ = 'Dave Asbury'
- # last updated 7/4/12
+ # last updated 29/4/12
language = 'en_GB'
oldest_article = 1
max_articles_per_feed = 15
@@ -48,12 +50,10 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
feeds = [
- (u'News','http://feed43.com/2517447382644748.xml'),
- (u'Sport', u'http://feed43.com/4283846255668687.xml'),
- (u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
- (u'Film',u'http://feed43.com/1307545221226200.xml'),
- (u'Music',u'http://feed43.com/1701513435064132.xml'),
- (u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
+ (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
+ (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
+ (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
+ (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
]
def get_cover_url(self):
@@ -61,14 +61,11 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
# look for the block containing the sun button and url
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
-
-
#cov = soup.find(attrs={'id' : 'large'})
cov2 = str(cov)
cov2='http://www.politicshome.com'+cov2[9:-133]
#cov2 now contains url of the page containing pic
-
#cov2 now contains url of the page containing pic
soup = self.index_to_soup(cov2)
cov = soup.find(attrs={'id' : 'large'})
@@ -76,16 +73,21 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
cov2=cov2[27:-18]
#cov2 now is pic url, now go back to original function
- br = mechanize.Browser()
+ br = browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = cov2
except:
- cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
+ cover_url = random.choice((
+ 'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg'
+ ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg'
+ ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg'
+ ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg'
+ ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg'
+ ))
- #cover_url = cov2
- #cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
return cover_url
+
diff --git a/recipes/vice_magazine.recipe b/recipes/vice_magazine.recipe
new file mode 100644
index 0000000000..262c09269c
--- /dev/null
+++ b/recipes/vice_magazine.recipe
@@ -0,0 +1,17 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ViceESRecipe(BasicNewsRecipe):
+ title = u'Vice Magazine España'
+ __author__ = 'atordo'
+ description = u'La página web oficial de la revista Vice España'
+ category = u'noticias, fotografía, blogs, moda, arte, cine, música, literatura, tecnología'
+ cover_url = 'http://www.seeklogo.com/images/V/Vice-logo-668578AC94-seeklogo.com.gif'
+ oldest_article = 20
+ max_articles_per_feed = 30
+ auto_cleanup = True
+ no_stylesheets = True
+ language = 'es'
+
+ feeds = [('Vice', 'http://www.vice.com/es/rss')]
+
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index e8d5c5fc91..f77b1d3528 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -445,7 +445,7 @@ class LRFMetadataWriter(MetadataWriterPlugin):
class MOBIMetadataWriter(MetadataWriterPlugin):
name = 'Set MOBI metadata'
- file_types = set(['mobi', 'prc', 'azw', 'azw4'])
+ file_types = set(['mobi', 'prc', 'azw', 'azw3', 'azw4'])
description = _('Set metadata in %s files')%'MOBI'
author = 'Marshall T. Vandegrift'
@@ -539,7 +539,8 @@ from calibre.ebooks.conversion.plugins.epub_output import EPUBOutput
from calibre.ebooks.conversion.plugins.fb2_output import FB2Output
from calibre.ebooks.conversion.plugins.lit_output import LITOutput
from calibre.ebooks.conversion.plugins.lrf_output import LRFOutput
-from calibre.ebooks.conversion.plugins.mobi_output import MOBIOutput
+from calibre.ebooks.conversion.plugins.mobi_output import (MOBIOutput,
+ AZW3Output)
from calibre.ebooks.conversion.plugins.oeb_output import OEBOutput
from calibre.ebooks.conversion.plugins.pdb_output import PDBOutput
from calibre.ebooks.conversion.plugins.pdf_output import PDFOutput
@@ -580,7 +581,7 @@ plugins += [
FB2Output,
LITOutput,
LRFOutput,
- MOBIOutput,
+ MOBIOutput, AZW3Output,
OEBOutput,
PDBOutput,
PDFOutput,
diff --git a/src/calibre/debug.py b/src/calibre/debug.py
index f5f803ec84..f2ae5d8eaf 100644
--- a/src/calibre/debug.py
+++ b/src/calibre/debug.py
@@ -54,6 +54,15 @@ Run an embedded python interpreter.
parser.add_option('-m', '--inspect-mobi', action='store_true',
default=False,
help='Inspect the MOBI file(s) at the specified path(s)')
+ parser.add_option('--tweak-book', default=None,
+ help='Tweak the book (exports the book as a collection of HTML '
+ 'files and metadata, which you can edit using standard HTML '
+ 'editing tools, and then rebuilds the file from the edited HTML. '
+ 'Makes no additional changes to the HTML, unlike a full calibre '
+ 'conversion). Note that this tool will try to open the '
+ 'folder containing the HTML files in the editor pointed to by the'
+ ' EDITOR environment variable.')
+
parser.add_option('--test-build', help='Test binary modules in build',
action='store_true', default=False)
@@ -239,7 +248,9 @@ def main(args=sys.argv):
prints('Inspecting:', path)
inspect_mobi(path)
print
-
+ elif opts.tweak_book:
+ from calibre.ebooks.tweak import tweak
+ tweak(opts.tweak_book)
elif opts.test_build:
from calibre.test_build import test
test()
diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py
index e759df5b78..eee2d480a3 100644
--- a/src/calibre/ebooks/conversion/plugins/mobi_output.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py
@@ -6,8 +6,32 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-from calibre.customize.conversion import OutputFormatPlugin
-from calibre.customize.conversion import OptionRecommendation
+from calibre.customize.conversion import (OutputFormatPlugin,
+ OptionRecommendation)
+
+def remove_html_cover(oeb, log):
+ from calibre.ebooks.oeb.base import OEB_DOCS
+
+ if not oeb.metadata.cover \
+ or 'cover' not in oeb.guide:
+ return
+ href = oeb.guide['cover'].href
+ del oeb.guide['cover']
+ item = oeb.manifest.hrefs[href]
+ if item.spine_position is not None:
+ log.warn('Found an HTML cover: ', item.href, 'removing it.',
+ 'If you find some content missing from the output MOBI, it '
+ 'is because you misidentified the HTML cover in the input '
+ 'document')
+ oeb.spine.remove(item)
+ if item.media_type in OEB_DOCS:
+ oeb.manifest.remove(item)
+
+def extract_mobi(output_path, opts):
+ if opts.extract_to is not None:
+ from calibre.ebooks.mobi.debug.main import inspect_mobi
+ ddir = opts.extract_to
+ inspect_mobi(output_path, ddir=ddir)
class MOBIOutput(OutputFormatPlugin):
@@ -140,25 +164,6 @@ class MOBIOutput(OutputFormatPlugin):
# Fix up the periodical href to point to first section href
toc.nodes[0].href = toc.nodes[0].nodes[0].href
- def remove_html_cover(self):
- from calibre.ebooks.oeb.base import OEB_DOCS
-
- oeb = self.oeb
- if not oeb.metadata.cover \
- or 'cover' not in oeb.guide:
- return
- href = oeb.guide['cover'].href
- del oeb.guide['cover']
- item = oeb.manifest.hrefs[href]
- if item.spine_position is not None:
- self.log.warn('Found an HTML cover: ', item.href, 'removing it.',
- 'If you find some content missing from the output MOBI, it '
- 'is because you misidentified the HTML cover in the input '
- 'document')
- oeb.spine.remove(item)
- if item.media_type in OEB_DOCS:
- self.oeb.manifest.remove(item)
-
def convert(self, oeb, output_path, input_plugin, opts, log):
from calibre.utils.config import tweaks
from calibre.ebooks.mobi.writer2.resources import Resources
@@ -169,7 +174,7 @@ class MOBIOutput(OutputFormatPlugin):
mobi_type = 'old' # Amazon does not support KF8 periodicals
create_kf8 = mobi_type in ('new', 'both')
- self.remove_html_cover()
+ remove_html_cover(self.oeb, self.log)
resources = Resources(oeb, opts, self.is_periodical,
add_fonts=create_kf8)
self.check_for_periodical()
@@ -185,7 +190,7 @@ class MOBIOutput(OutputFormatPlugin):
) if create_kf8 else None
if mobi_type == 'new':
kf8.write(output_path)
- self.extract_mobi(output_path, opts)
+ extract_mobi(output_path, opts)
return
self.log('Creating MOBI 6 output')
@@ -225,11 +230,72 @@ class MOBIOutput(OutputFormatPlugin):
writer = MobiWriter(opts, resources, kf8,
write_page_breaks_after_item=write_page_breaks_after_item)
writer(oeb, output_path)
- self.extract_mobi(output_path, opts)
+ extract_mobi(output_path, opts)
+
+class AZW3Output(OutputFormatPlugin):
+
+ name = 'AZW3 Output'
+ author = 'Kovid Goyal'
+ file_type = 'azw3'
+
+ options = set([
+ OptionRecommendation(name='prefer_author_sort',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('When present, use author sort field as author.')
+ ),
+ OptionRecommendation(name='no_inline_toc',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Don\'t add Table of Contents to the book. Useful if '
+ 'the book has its own table of contents.')),
+ OptionRecommendation(name='toc_title', recommended_value=None,
+ help=_('Title for any generated in-line table of contents.')
+ ),
+ OptionRecommendation(name='dont_compress',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Disable compression of the file contents.')
+ ),
+ OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
+ help=_('Tag marking book to be filed with Personal Docs')
+ ),
+ OptionRecommendation(name='mobi_toc_at_start',
+ recommended_value=False,
+ help=_('When adding the Table of Contents to the book, add it at the start of the '
+ 'book instead of the end. Not recommended.')
+ ),
+ OptionRecommendation(name='extract_to', recommended_value=None,
+ help=_('Extract the contents of the MOBI file to the'
+ ' specified directory. If the directory already '
+ 'exists, it will be deleted.')
+ ),
+ OptionRecommendation(name='share_not_sync', recommended_value=False,
+ help=_('Enable sharing of book content via Facebook etc. '
+ ' on the Kindle. WARNING: Using this feature means that '
+ ' the book will not auto sync its last read position '
+ ' on multiple devices. Complain to Amazon.')
+ ),
+ ])
+
+ def convert(self, oeb, output_path, input_plugin, opts, log):
+ from calibre.ebooks.mobi.writer2.resources import Resources
+ from calibre.ebooks.mobi.writer8.main import create_kf8_book
+
+ self.oeb, self.opts, self.log = oeb, opts, log
+ opts.mobi_periodical = self.is_periodical
+ passthrough = getattr(opts, 'mobi_passthrough', False)
+
+ resources = Resources(self.oeb, self.opts, self.is_periodical,
+ add_fonts=True, process_images=False)
+ if not passthrough:
+ remove_html_cover(self.oeb, self.log)
+
+ # Split on pagebreaks so that the resulting KF8 works better with
+ # calibre's viewer, which does not support CSS page breaks
+ from calibre.ebooks.oeb.transforms.split import Split
+ Split()(self.oeb, self.opts)
+
+ kf8 = create_kf8_book(self.oeb, self.opts, resources, for_joint=False)
+
+ kf8.write(output_path)
+ extract_mobi(output_path, opts)
- def extract_mobi(self, output_path, opts):
- if opts.extract_to is not None:
- from calibre.ebooks.mobi.debug.main import inspect_mobi
- ddir = opts.extract_to
- inspect_mobi(output_path, ddir=ddir)
diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py
index a03205edd7..788ca3ed0a 100644
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@@ -141,9 +141,10 @@ class MOBIFile(object):
self.files.append(File(skel, skeleton, ftext, first_aid, sections))
def dump_flows(self, ddir):
- if self.fdst is None:
- raise ValueError('This MOBI file has no FDST record')
- for i, x in enumerate(self.fdst.sections):
+ boundaries = [(0, len(self.raw_text))]
+ if self.fdst is not None:
+ boundaries = self.fdst.sections
+ for i, x in enumerate(boundaries):
start, end = x
raw = self.raw_text[start:end]
with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py
index 0162fddda7..a5ca4a7132 100644
--- a/src/calibre/ebooks/mobi/reader/headers.py
+++ b/src/calibre/ebooks/mobi/reader/headers.py
@@ -234,6 +234,22 @@ class MetadataHeader(BookHeader):
else:
self.exth = None
+ @property
+ def kf8_type(self):
+ if (self.mobi_version == 8 and getattr(self, 'skelidx', NULL_INDEX) !=
+ NULL_INDEX):
+ return u'standalone'
+
+ kf8_header_index = getattr(self.exth, 'kf8_header', None)
+ if kf8_header_index is None:
+ return None
+ try:
+ if self.section_data(kf8_header_index-1) == b'BOUNDARY':
+ return u'joint'
+ except Exception: # best-effort probe: any read failure falls through to None, but Ctrl-C/SystemExit still propagate
+ pass
+ return None
+
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8).upper()
diff --git a/src/calibre/ebooks/mobi/tweak.py b/src/calibre/ebooks/mobi/tweak.py
new file mode 100644
index 0000000000..248ed97261
--- /dev/null
+++ b/src/calibre/ebooks/mobi/tweak.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import os, glob
+
+from calibre import CurrentDir
+from calibre.ebooks.mobi import MobiError
+from calibre.ebooks.mobi.reader.mobi6 import MobiReader
+from calibre.ebooks.mobi.reader.headers import MetadataHeader
+from calibre.utils.logging import default_log
+from calibre.ebooks import DRMError
+from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
+from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
+from calibre.customize.ui import (plugin_for_input_format,
+ plugin_for_output_format)
+from calibre.utils.ipc.simple_worker import fork_job
+
+class BadFormat(ValueError):
+ pass
+
+def do_explode(path, dest):
+ with open(path, 'rb') as stream:
+ mr = MobiReader(stream, default_log, None, None)
+
+ with CurrentDir(dest):
+ mr = Mobi8Reader(mr, default_log)
+ opf = os.path.abspath(mr())
+
+ return opf
+
+def explode(path, dest, question=lambda x:True):
+ with open(path, 'rb') as stream:
+ raw = stream.read(3)
+ stream.seek(0)
+ if raw == b'TPZ':
+ raise BadFormat(_('This is not a MOBI file. It is a Topaz file.'))
+
+ try:
+ header = MetadataHeader(stream, default_log)
+ except MobiError:
+ raise BadFormat(_('This is not a MOBI file.'))
+
+ if header.encryption_type != 0:
+ raise DRMError(_('This file is locked with DRM. It cannot be tweaked.'))
+
+ kf8_type = header.kf8_type
+
+ if kf8_type is None:
+ raise BadFormat(_('This MOBI file does not contain a KF8 format book')) # translatable, like the other user-facing errors here
+
+ if kf8_type == 'joint':
+ if not question(_('This MOBI file contains both KF8 and '
+ 'older Mobi6 data. Tweaking it will remove the Mobi6 data, which '
+ 'means the file will not be usable on older Kindles. Are you '
+ 'sure?')):
+ return None
+
+ return fork_job('calibre.ebooks.mobi.tweak', 'do_explode', args=(path,
+ dest), no_output=True)['result']
+
+def do_rebuild(opf, dest_path):
+ plumber = Plumber(opf, dest_path, default_log)
+ plumber.setup_options()
+ inp = plugin_for_input_format('azw3')
+ outp = plugin_for_output_format('azw3')
+
+ plumber.opts.mobi_passthrough = True
+ oeb = create_oebbook(default_log, opf, plumber.opts)
+ outp.convert(oeb, dest_path, inp, plumber.opts, default_log)
+
+def rebuild(src_dir, dest_path):
+ opf = glob.glob(os.path.join(src_dir, '*.opf'))
+ if not opf:
+ raise ValueError('No OPF file found in %s'%src_dir)
+ opf = opf[0]
+ fork_job('calibre.ebooks.mobi.tweak', 'do_rebuild', args=(opf, dest_path),
+ no_output=True)
+
diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py
index 27c4838a4b..9afd39a211 100644
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@@ -25,6 +25,15 @@ from calibre.ebooks.mobi.writer2.indexer import Indexer
WRITE_UNCROSSABLE_BREAKS = False
NULL_INDEX = 0xffffffff
+FLIS = (b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
+ b'\xff'*4)
+
+def fcis(text_length):
+ fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
+ fcis += pack(b'>I', text_length)
+ fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+ return fcis
+
class MobiWriter(object):
def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
@@ -208,14 +217,9 @@ class MobiWriter(object):
# FCIS/FLIS (Seems to serve no purpose)
flis_number = len(self.records)
- self.records.append(
- b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
- b'\xff'*4)
- fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
- fcis += pack(b'>I', self.text_length)
- fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+ self.records.append(FLIS)
fcis_number = len(self.records)
- self.records.append(fcis)
+ self.records.append(fcis(self.text_length))
# EOF record
self.records.append(b'\xE9\x8E\x0D\x0A')
@@ -379,6 +383,12 @@ class MobiWriter(object):
self.resources.serialize(self.records, used_images)
resource_record_count = len(self.records) - old
+ # FCIS/FLIS (Seems to serve no purpose)
+ flis_number = len(self.records)
+ self.records.append(FLIS)
+ fcis_number = len(self.records)
+ self.records.append(fcis(self.text_length))
+
# Insert KF8 records
self.records.append(b'BOUNDARY')
kf8_header_index = len(self.records)
@@ -398,6 +408,8 @@ class MobiWriter(object):
header_fields['exth_flags'] = 0b100001010000 # Kinglegen uses this
header_fields['fdst_record'] = NULL_INDEX
header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
+ header_fields['flis_record'] = flis_number
+ header_fields['fcis_record'] = fcis_number
extra_data_flags = 0b1 # Has multibyte overlap bytes
if self.primary_index_record_idx is not None:
extra_data_flags |= 0b10
diff --git a/src/calibre/ebooks/mobi/writer2/resources.py b/src/calibre/ebooks/mobi/writer2/resources.py
index 2fcb93790c..2f12793b03 100644
--- a/src/calibre/ebooks/mobi/writer2/resources.py
+++ b/src/calibre/ebooks/mobi/writer2/resources.py
@@ -19,9 +19,11 @@ PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\
class Resources(object):
- def __init__(self, oeb, opts, is_periodical, add_fonts=False):
+ def __init__(self, oeb, opts, is_periodical, add_fonts=False,
+ process_images=True):
self.oeb, self.log, self.opts = oeb, oeb.log, opts
self.is_periodical = is_periodical
+ self.process_images = process_images
self.item_map = {}
self.records = []
@@ -34,6 +36,8 @@ class Resources(object):
self.add_resources(add_fonts)
def process_image(self, data):
+ if not self.process_images:
+ return data
return (mobify_image(data) if self.opts.mobi_keep_original_images else
rescale_image(data))
diff --git a/src/calibre/ebooks/mobi/writer8/mobi.py b/src/calibre/ebooks/mobi/writer8/mobi.py
index 18f19a4084..eabcf97047 100644
--- a/src/calibre/ebooks/mobi/writer8/mobi.py
+++ b/src/calibre/ebooks/mobi/writer8/mobi.py
@@ -18,6 +18,14 @@ from calibre.ebooks.mobi.writer8.exth import build_exth
from calibre.utils.filenames import ascii_filename
NULL_INDEX = 0xffffffff
+FLIS = b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+ b'\xff'*4
+
+def fcis(text_length):
+ fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x02\x00\x00\x00\x00'
+ fcis += pack(b'>L', text_length)
+ fcis += b'\x00\x00\x00\x00\x00\x00\x00\x28\x00\x00\x00\x00\x00\x00\x00'
+ fcis += b'\x28\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+ return fcis
class MOBIHeader(Header): # {{{
'''
@@ -115,7 +123,10 @@ class MOBIHeader(Header): # {{{
exth_flags = DYN
# 132: Unknown
- unknown = zeroes(36)
+ unknown = zeroes(32)
+
+ # 164: Unknown
+ unknown_index = NULL
# 168: DRM
drm_offset = NULL
@@ -130,13 +141,13 @@ class MOBIHeader(Header): # {{{
fdst_record = DYN
fdst_count = DYN
- # 200: FCI
- fcis_record = NULL
- fcis_count
+ # 200: FCIS
+ fcis_record = DYN
+ fcis_count = 1
# 208: FLIS
- flis_record = NULL
- flis_count
+ flis_record = DYN
+ flis_count = 1
# 216: Unknown
unknown3 = zeroes(8)
@@ -193,7 +204,7 @@ HEADER_FIELDS = {'compression', 'text_length', 'last_text_record', 'book_type',
'first_resource_record', 'exth_flags', 'fdst_record',
'fdst_count', 'ncx_index', 'chunk_index', 'skel_index',
'guide_index', 'exth', 'full_title', 'extra_data_flags',
- 'uid'}
+ 'flis_record', 'fcis_record', 'uid'}
class KF8Book(object):
@@ -241,6 +252,12 @@ class KF8Book(object):
self.fdst_record = len(self.records)
self.records.extend(writer.fdst_records)
+ # FLIS/FCIS
+ self.flis_record = len(self.records)
+ self.records.append(FLIS)
+ self.fcis_record = len(self.records)
+ self.records.append(fcis(self.text_length))
+
# EOF
self.records.append(b'\xe9\x8e\r\n') # EOF record
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index c2cd9b4283..8fd4714e1c 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -13,7 +13,7 @@ from functools import partial
from lxml import etree
-from calibre.ebooks.oeb.base import XHTML_NS
+from calibre.ebooks.oeb.base import XHTML_NS, extract
from calibre.constants import ispy3
from calibre.ebooks.mobi.utils import to_base
@@ -224,14 +224,24 @@ class Chunker(object):
nroot.text = root.text
nroot.tail = '\n'
- for tag in root.iterdescendants(etree.Element):
- # We are ignoring all non tag entities in the tree
- # like comments and processing instructions, as they make the
- # chunking code even harder, for minimal gain.
- elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
- attrib={k.rpartition('}')[-1]:v for k, v in
- tag.attrib.iteritems()})
- elem.text, elem.tail = tag.text, tag.tail
+ # Remove Comments and ProcessingInstructions as kindlegen seems to
+ # remove them as well
+ for tag in root.iterdescendants():
+ if tag.tag in {etree.Comment, etree.ProcessingInstruction}:
+ extract(tag)
+
+ for tag in root.iterdescendants():
+ if tag.tag == etree.Entity:
+ elem = etree.Entity(tag.name)
+ else:
+ tn = tag.tag
+ if tn is not None:
+ tn = tn.rpartition('}')[-1]
+ elem = nroot.makeelement(tn,
+ attrib={k.rpartition('}')[-1]:v for k, v in
+ tag.attrib.iteritems()})
+ elem.text = tag.text
+ elem.tail = tag.tail
parent = node_from_path(nroot, path_to_node(tag.getparent()))
parent.append(elem)
@@ -251,6 +261,11 @@ class Chunker(object):
# Now loop over children
for child in list(tag):
raw = tostring(child, with_tail=False)
+ if child.tag == etree.Entity:
+ chunks.append(raw)
+ if child.tail:
+ chunks.extend(self.chunk_up_text(child.tail, aid))
+ continue
raw = close_self_closing_tags(raw)
if len(raw) > CHUNK_SIZE and child.get('aid', None):
self.step_into_tag(child, chunks)
diff --git a/src/calibre/ebooks/tweak.py b/src/calibre/ebooks/tweak.py
new file mode 100644
index 0000000000..72e4c0a56c
--- /dev/null
+++ b/src/calibre/ebooks/tweak.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import sys, os, shlex, subprocess
+
+from calibre import prints, as_unicode, walk
+from calibre.constants import iswindows, __appname__
+from calibre.ptempfile import TemporaryDirectory
+from calibre.libunzip import extract as zipextract
+from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED, ZIP_STORED
+from calibre.utils.ipc.simple_worker import WorkerError
+
+class Error(ValueError):
+ pass
+
+def ask_cli_question(msg):
+ prints(msg, end=' [y/N]: ')
+ sys.stdout.flush()
+
+ if iswindows:
+ import msvcrt
+ ans = msvcrt.getch()
+ else:
+ import tty, termios
+ old_settings = termios.tcgetattr(sys.stdin.fileno())
+ try:
+ tty.setraw(sys.stdin.fileno())
+ try:
+ ans = sys.stdin.read(1)
+ except KeyboardInterrupt:
+ ans = b''
+ finally:
+ termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN, old_settings)
+ print()
+ return ans == b'y'
+
+def mobi_exploder(path, tdir, question=lambda x:True):
+ from calibre.ebooks.mobi.tweak import explode, BadFormat
+ try:
+ return explode(path, tdir, question=question)
+ except BadFormat as e:
+ raise Error(as_unicode(e))
+
+def zip_exploder(path, tdir, question=lambda x:True):
+ zipextract(path, tdir)
+ for f in walk(tdir):
+ if f.lower().endswith('.opf'):
+ return f
+ raise Error('Invalid book: Could not find .opf')
+
+def zip_rebuilder(tdir, path):
+ with ZipFile(path, 'w', compression=ZIP_DEFLATED) as zf:
+ # Write mimetype
+ mt = os.path.join(tdir, 'mimetype')
+ if os.path.exists(mt):
+ zf.write(mt, 'mimetype', compress_type=ZIP_STORED)
+ # Write everything else
+ exclude_files = {'.DS_Store', 'mimetype', 'iTunesMetadata.plist'}
+ for root, dirs, files in os.walk(tdir):
+ for fn in files:
+ if fn in exclude_files:
+ continue
+ absfn = os.path.join(root, fn)
+ zfn = os.path.relpath(absfn, tdir).replace(os.sep, '/')
+ zf.write(absfn, zfn)
+
+def get_tools(fmt):
+ fmt = fmt.lower()
+
+ if fmt in {'mobi', 'azw', 'azw3'}:
+ from calibre.ebooks.mobi.tweak import rebuild
+ ans = mobi_exploder, rebuild
+ elif fmt in {'epub', 'htmlz'}:
+ ans = zip_exploder, zip_rebuilder
+ else:
+ ans = None, None
+
+ return ans
+
+def tweak(ebook_file):
+ ''' Command line interface to the Tweak Book tool '''
+ fmt = ebook_file.rpartition('.')[-1].lower()
+ exploder, rebuilder = get_tools(fmt)
+ if exploder is None:
+ prints('Cannot tweak %s files. Supported formats are: EPUB, HTMLZ, AZW3, MOBI'
+ % fmt.upper(), file=sys.stderr)
+ raise SystemExit(1)
+
+ with TemporaryDirectory('_tweak_'+
+ os.path.basename(ebook_file).rpartition('.')[0]) as tdir:
+ try:
+ opf = exploder(ebook_file, tdir, question=ask_cli_question)
+ except WorkerError as e:
+ prints('Failed to unpack', ebook_file)
+ prints(e.orig_tb)
+ raise SystemExit(1)
+ except Error as e:
+ prints(as_unicode(e), file=sys.stderr)
+ raise SystemExit(1)
+
+ if opf is None:
+ # The question was answered with No
+ return
+
+ ed = os.environ.get('EDITOR', None)
+ proceed = False
+ if ed is None:
+ prints('Book extracted to', tdir)
+ prints('Make your tweaks and once you are done,', __appname__,
+ 'will rebuild', ebook_file, 'from', tdir)
+ print()
+ proceed = ask_cli_question('Rebuild ' + ebook_file + '?')
+ else:
+ cmd = shlex.split(ed)
+ try:
+ subprocess.check_call(cmd + [tdir])
+ except:
+ prints(ed, 'failed, aborting...')
+ raise SystemExit(1)
+ proceed = True
+
+ if proceed:
+ prints('Rebuilding', ebook_file, 'please wait ...')
+ try:
+ rebuilder(tdir, ebook_file)
+ except WorkerError as e:
+ prints('Failed to rebuild', ebook_file)
+ prints(e.orig_tb)
+ raise SystemExit(1)
+ prints(ebook_file, 'successfully tweaked')
+
diff --git a/src/calibre/gui2/convert/azw3_output.py b/src/calibre/gui2/convert/azw3_output.py
new file mode 100644
index 0000000000..8b1ef25aac
--- /dev/null
+++ b/src/calibre/gui2/convert/azw3_output.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__ = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+
+from calibre.gui2.convert.azw3_output_ui import Ui_Form
+from calibre.gui2.convert import Widget
+
+font_family_model = None
+
+class PluginWidget(Widget, Ui_Form):
+
+ TITLE = _('AZW3 Output')
+ HELP = _('Options specific to')+' AZW3 '+_('output')
+ COMMIT_NAME = 'azw3_output'
+ ICON = I('mimetypes/mobi.png')
+
+ def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+ Widget.__init__(self, parent,
+ ['prefer_author_sort', 'toc_title',
+ 'mobi_ignore_margins', 'mobi_toc_at_start',
+ 'dont_compress', 'no_inline_toc', 'share_not_sync',
+ 'personal_doc']#, 'mobi_navpoints_only_deepest']
+ )
+ self.db, self.book_id = db, book_id
+
+ self.initialize_options(get_option, get_help, db, book_id)
+
+
diff --git a/src/calibre/gui2/convert/azw3_output.ui b/src/calibre/gui2/convert/azw3_output.ui
new file mode 100644
index 0000000000..657a38861d
--- /dev/null
+++ b/src/calibre/gui2/convert/azw3_output.ui
@@ -0,0 +1,125 @@
+
+
+ Form
+
+
+
+ 0
+ 0
+ 588
+ 342
+
+
+
+ Form
+
+
+ -
+
+
+ Use author &sort for author
+
+
+
+ -
+
+
+ &Title for Table of Contents:
+
+
+ opt_toc_title
+
+
+
+ -
+
+
+ -
+
+
+ Disable compression of the file contents
+
+
+
+ -
+
+
+ Do not add Table of Contents to book
+
+
+
+ -
+
+
+ Put generated Table of Contents at &start of book instead of end
+
+
+
+ -
+
+
+ Ignore &margins
+
+
+
+ -
+
+
+ Kindle options
+
+
+
-
+
+
-
+
+
+ Personal Doc tag:
+
+
+
+ -
+
+
+
+
+ -
+
+
+ Enable sharing of book content via Facebook, etc. WARNING: Disables last read syncing
+
+
+
+ -
+
+
+ Qt::Vertical
+
+
+
+ 20
+ 40
+
+
+
+
+
+
+
+ -
+
+
+ Qt::Vertical
+
+
+
+ 20
+ 40
+
+
+
+
+
+
+
+
+
diff --git a/src/calibre/gui2/store/stores/bn_plugin.py b/src/calibre/gui2/store/stores/bn_plugin.py
index 38461cac40..ded20e8823 100644
--- a/src/calibre/gui2/store/stores/bn_plugin.py
+++ b/src/calibre/gui2/store/stores/bn_plugin.py
@@ -7,7 +7,6 @@ __copyright__ = '2011, John Schember '
__docformat__ = 'restructuredtext en'
import random
-import re
import urllib
from contextlib import closing
@@ -39,7 +38,7 @@ class BNStore(BasicStoreConfig, StorePlugin):
purl = None
url = murl
- print(url)
+ #print(url)
if external or self.config.get('open_external', False):
open_url(QUrl(url_slash_cleaner(url)))
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index f0d9aa8bd3..34e54592e4 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -22,7 +22,7 @@ It can convert every input format in the following list, to every output format.
*Input Formats:* CBZ, CBR, CBC, CHM, DJVU, EPUB, FB2, HTML, HTMLZ, LIT, LRF, MOBI, ODT, PDF, PRC, PDB, PML, RB, RTF, SNB, TCR, TXT, TXTZ
-*Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, HTMLZ, PDB, PML, RB, PDF, RTF, SNB, TCR, TXT, TXTZ
+*Output Formats:* AZW3, EPUB, FB2, OEB, LIT, LRF, MOBI, HTMLZ, PDB, PML, RB, PDF, RTF, SNB, TCR, TXT, TXTZ
.. note ::
@@ -35,7 +35,7 @@ It can convert every input format in the following list, to every output format.
What are the best source formats to convert?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-In order of decreasing preference: LIT, MOBI, EPUB, FB2, HTML, PRC, RTF, PDB, TXT, PDF
+In order of decreasing preference: LIT, MOBI, AZW, EPUB, AZW3, FB2, HTML, PRC, RTF, PDB, TXT, PDF
I converted a PDF file, but the result has various problems?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/calibre/manual/template_lang.rst b/src/calibre/manual/template_lang.rst
index 782673ce16..fb035a8cb3 100644
--- a/src/calibre/manual/template_lang.rst
+++ b/src/calibre/manual/template_lang.rst
@@ -245,7 +245,7 @@ The following functions are available in addition to those described in single-f
* ``current_library_name() -- `` return the last name on the path to the current calibre library. This function can be called in template program mode using the template ``{:'current_library_name()'}``.
* ``days_between(date1, date2)`` -- return the number of days between ``date1`` and ``date2``. The number is positive if ``date1`` is greater than ``date2``, otherwise negative. If either ``date1`` or ``date2`` are not dates, the function returns the empty string.
* ``divide(x, y)`` -- returns x / y. Throws an exception if either x or y are not numbers.
- * ``eval(string)`` -- evaluates the string as a program, passing the local variables (those ``assign`` ed to). This permits using the template processor to construct complex results from local variables.
+ * ``eval(string)`` -- evaluates the string as a program, passing the local variables (those ``assign`` ed to). This permits using the template processor to construct complex results from local variables. Because the `{` and `}` characters are special, you must use `[[` for the `{` character and `]]` for the `}` character; they are converted automatically. Note also that prefixes and suffixes (the "|prefix|suffix" syntax) cannot be used in the argument to this function when using template program mode.
* ``field(name)`` -- returns the metadata field named by ``name``.
* ``first_non_empty(value, value, ...)`` -- returns the first value that is not empty. If all values are empty, then the empty value is returned. You can have as many values as you want.
* ``format_date(x, date_format)`` -- format_date(val, format_string) -- format the value, which must be a date field, using the format_string, returning a string. The formatting codes are::
@@ -306,7 +306,7 @@ The following functions are available in addition to those described in single-f
* ``substr(str, start, end)`` -- returns the ``start``'th through the ``end``'th characters of ``str``. The first character in ``str`` is the zero'th character. If end is negative, then it indicates that many characters counting from the right. If end is zero, then it indicates the last character. For example, ``substr('12345', 1, 0)`` returns ``'2345'``, and ``substr('12345', 1, -1)`` returns ``'234'``.
* ``subtract(x, y)`` -- returns x - y. Throws an exception if either x or y are not numbers.
* ``today()`` -- return a date string for today. This value is designed for use in format_date or days_between, but can be manipulated like any other string. The date is in ISO format.
- * ``template(x)`` -- evaluates x as a template. The evaluation is done in its own context, meaning that variables are not shared between the caller and the template evaluation. Because the `{` and `}` characters are special, you must use `[[` for the `{` character and `]]` for the '}' character; they are converted automatically. For example, ``template('[[title_sort]]') will evaluate the template ``{title_sort}`` and return its value.
+ * ``template(x)`` -- evaluates x as a template. The evaluation is done in its own context, meaning that variables are not shared between the caller and the template evaluation. Because the `{` and `}` characters are special, you must use `[[` for the `{` character and `]]` for the `}` character; they are converted automatically. For example, ``template('[[title_sort]]')`` will evaluate the template ``{title_sort}`` and return its value. Note also that prefixes and suffixes (the "|prefix|suffix" syntax) cannot be used in the argument to this function when using template program mode.
.. _template_functions_reference:
diff --git a/src/calibre/utils/config_base.py b/src/calibre/utils/config_base.py
index 7fb120d028..ab22c6b30b 100644
--- a/src/calibre/utils/config_base.py
+++ b/src/calibre/utils/config_base.py
@@ -387,7 +387,7 @@ def _prefs():
help=_('The language in which to display the user interface'))
c.add_opt('output_format', default='EPUB',
help=_('The default output format for ebook conversions.'))
- c.add_opt('input_format_order', default=['EPUB', 'MOBI', 'LIT', 'PRC',
+ c.add_opt('input_format_order', default=['EPUB', 'AZW3', 'MOBI', 'LIT', 'PRC',
'FB2', 'HTML', 'HTM', 'XHTM', 'SHTML', 'XHTML', 'ZIP', 'ODT', 'RTF', 'PDF',
'TXT'],
help=_('Ordered list of formats to prefer for input.'))
diff --git a/src/calibre/utils/formatter_functions.py b/src/calibre/utils/formatter_functions.py
index bfb2f036c0..5b620e54e3 100644
--- a/src/calibre/utils/formatter_functions.py
+++ b/src/calibre/utils/formatter_functions.py
@@ -217,7 +217,9 @@ class BuiltinTemplate(BuiltinFormatterFunction):
'characters are special, you must use [[ for the { character and '
']] for the } character; they are converted automatically. '
'For example, template(\'[[title_sort]]\') will evaluate the '
- 'template {title_sort} and return its value.')
+ 'template {title_sort} and return its value. Note also that '
+ 'prefixes and suffixes (the "|prefix|suffix" syntax) cannot be '
+ 'used in the argument to this function when using template program mode.')
def evaluate(self, formatter, kwargs, mi, locals, template):
template = template.replace('[[', '{').replace(']]', '}')
@@ -230,7 +232,12 @@ class BuiltinEval(BuiltinFormatterFunction):
__doc__ = doc = _('eval(template) -- evaluates the template, passing the local '
'variables (those \'assign\'ed to) instead of the book metadata. '
' This permits using the template processor to construct complex '
- 'results from local variables.')
+ 'results from local variables. Because the { and } '
+ 'characters are special, you must use [[ for the { character and '
+ ']] for the } character; they are converted automatically. '
+ 'Note also that prefixes and suffixes (the "|prefix|suffix" syntax) '
+ 'cannot be used in the argument to this function when using '
+ 'template program mode.')
def evaluate(self, formatter, kwargs, mi, locals, template):
from formatter import EvalFormatter