« '),
@@ -86,9 +91,10 @@ class LeMonde(BasicNewsRecipe):
(re.compile(r'\s»'), lambda match: ' »'),
(re.compile(r'«\s'), lambda match: '« '),
(re.compile(r' %'), lambda match: ' %'),
- (re.compile(r'\.jpg » border='), lambda match: '.jpg'),
- (re.compile(r'\.png » border='), lambda match: '.png'),
+ (re.compile(r'\.jpg » width='), lambda match: '.jpg'),
+ (re.compile(r'\.png » width='), lambda match: '.png'),
(re.compile(r' – '), lambda match: ' – '),
+ (re.compile(r'figcaption style="display:none"'), lambda match: 'figcaption'),
(re.compile(r' – '), lambda match: ' – '),
(re.compile(r' - '), lambda match: ' – '),
(re.compile(r' -,'), lambda match: ' –,'),
@@ -97,10 +103,15 @@ class LeMonde(BasicNewsRecipe):
keep_only_tags = [
- dict(name='div', attrs={'class':['contenu']})
+ dict(name='div', attrs={'class':['global']})
]
- remove_tags = [dict(name='div', attrs={'class':['LM_atome']})]
- remove_tags_after = [dict(id='appel_temoignage')]
+
+ remove_tags = [
+ dict(name='div', attrs={'class':['bloc_base meme_sujet']}),
+ dict(name='p', attrs={'class':['lire']})
+ ]
+
+ remove_tags_after = [dict(id='fb-like')]
def get_article_url(self, article):
url = article.get('guid', None)
@@ -136,4 +147,3 @@ class LeMonde(BasicNewsRecipe):
cover_url = link_item.img['src']
return cover_url
-
diff --git a/recipes/liberatorio_politico.recipe b/recipes/liberatorio_politico.recipe
new file mode 100644
index 0000000000..bbffcd89b1
--- /dev/null
+++ b/recipes/liberatorio_politico.recipe
@@ -0,0 +1,12 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1334649829(BasicNewsRecipe):
+ title = u'Liberatorio Politico'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ auto_cleanup = True
+ masthead_url = 'http://liberatorio.altervista.org/wp-content/uploads/2012/01/Testata-LIBERATORIO-Altervista1.jpg'
+ feeds = [(u'Liberatorio Politico', u'http://liberatorio.altervista.org/feed/')]
+ __author__ = 'faber1971'
+ description = 'Inquiry journalism - a blog on Molfetta, Land of Bari, Apulia and Italy - v1.00 (07, April 2012)'
+ language = 'it'
diff --git a/recipes/limes.recipe b/recipes/limes.recipe
new file mode 100644
index 0000000000..2290b7099e
--- /dev/null
+++ b/recipes/limes.recipe
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2012, faber1971'
+__version__ = 'v1.00'
+__date__ = '16, April 2012'
+__description__ = 'Geopolitical Italian magazine'
+
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Limes(BasicNewsRecipe):
+ description = 'Italian weekly magazine'
+ __author__ = 'faber1971'
+
+ cover_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
+ title = 'Limes'
+ category = 'Geopolitical news'
+
+ language = 'it'
+# encoding = 'cp1252'
+ timefmt = '[%a, %d %b, %Y]'
+
+ oldest_article = 16
+ max_articles_per_feed = 100
+ use_embedded_content = False
+ recursion = 10
+
+ remove_javascript = True
+ no_stylesheets = True
+ masthead_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
+
+ feeds = [
+ (u'Limes', u'http://temi.repubblica.it/limes/feed/')
+ ]
+
+
+
+ keep_only_tags = [
+ dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
+ dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
+ dict(name='div', attrs={'id':['content-second-right','content2']})
+ ]
+
+ remove_tags = [
+ dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
+ dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
+ dict(name='ul',attrs={'id':'user-utility'}),
+ dict(name=['script','noscript','iframe'])
+ ]
+
diff --git a/recipes/marketing_magazine.recipe b/recipes/marketing_magazine.recipe
index 0c14939cd8..d004f274af 100644
--- a/recipes/marketing_magazine.recipe
+++ b/recipes/marketing_magazine.recipe
@@ -1,20 +1,21 @@
__license__ = 'GPL v3'
+__author__ = 'faber1971'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
title = u'Marketing Magazine'
+ description = 'Collection of Italian marketing websites'
+ language = 'it'
+ __author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
no_stylesheets = True
+ conversion_options = {'linearize_tables': True}
remove_tags = [
dict(name='ul', attrs={'id':'ads0'})
]
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
- __author__ = 'faber1971'
- description = 'Collection of Italian marketing websites - v1.03 (20, February 2012)'
- language = 'it'
-
- feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
+ feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'Marketing Journal', u'http://feeds.feedburner.com/marketingjournal/jPwA'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'[4]marketing.biz', u'http://feeds.feedburner.com/4marketing'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Bloguerrilla', u'http://feeds.feedburner.com/Bloguerrilla'), (u'Nonconvenzionale', u'http://feeds.feedburner.com/nonconvenzionale'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
diff --git a/recipes/melbourne_herald_sun.recipe b/recipes/melbourne_herald_sun.recipe
new file mode 100644
index 0000000000..c24a4563af
--- /dev/null
+++ b/recipes/melbourne_herald_sun.recipe
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2009, Matthew Briggs'
+__docformat__ = 'restructuredtext en'
+
+'''
+http://www.heraldsun.com.au/
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class DailyTelegraph(BasicNewsRecipe):
+ title = u'Melbourne Herald Sun'
+ __author__ = u'Ray Hartley'
+ description = (u'Victorian and National News'
+ '. You will need to have a subscription to '
+ 'http://www.heraldsun.com.au to get full articles.')
+ language = 'en_AU'
+
+ oldest_article = 2
+ needs_subscription = 'optional'
+ max_articles_per_feed = 30
+ remove_javascript = True
+ no_stylesheets = True
+ encoding = 'utf8'
+ use_embedded_content = False
+ language = 'en_AU'
+ remove_empty_feeds = True
+ publication_type = 'newspaper'
+ masthead_url = 'http://resources2.news.com.au/cs/heraldsun/images/header-and-footer/logo.gif'
+ extra_css = """
+ body{font-family: Arial,Helvetica,sans-serif }
+ img{margin-bottom: 0.4em; display:block}
+ .caption{display: inline; font-size: x-small}
+ """
+
+ conversion_options = {
+ 'comment' : description
+ , 'language' : language
+ }
+
+ keep_only_tags = [dict(attrs={'id':'story'})]
+ remove_tags_before=dict(attrs={'class':'story-header'})
+ remove_tags_after=dict(attrs={'class':'story-footer'})
+ remove_tags = [
+ dict(name=['meta','link','base','iframe','embed','object','media-metadata','media-reference','media-producer'])
+ ,dict(attrs={'class':['story-header-tools','story-sidebar','story-footer','story-summary-list']})
+ ]
+ remove_attributes=['lang']
+
+
+ feeds = [(u'Breaking News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_breakingnews_206.xml' )
+ ,(u'Business' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_business_207.xml' )
+ ,(u'Entertainment' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_entertainment_208.xml' )
+ ,(u'Health Science' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_health_212.xml' )
+ ,(u'Music' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_music_449.xml' )
+ ,(u'National News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_national_209.xml' )
+ ,(u'Sport News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_sport_213.xml' )
+ ,(u'AFL News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml' )
+ ,(u'State News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_vic_214.xml' )
+ ,(u'Technology' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_tech_215.xml' )
+ ,(u'World News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_world_216.xml' )
+ ,(u'Opinion', u'http://feeds.news.com.au/public/rss/2.0/heraldsun_opinion_210.xml' )
+ ,(u'Andrew Bolt' , u'http://blogs.news.com.au/heraldsun/andrewbolt/index.php/xml/rss_2.0/heraldsun/hs_andrewbolt/')
+ ,(u'Afl - St Kilda' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_stkilda_565.xml')
+ ,(u'Terry McCrann' ,u'http://feeds.news.com.au/public/rss/2.0/heraldsun_tmccrann_224.xml' )
+ ,(u'The Other side' ,u'http://feeds.news.com.au/public/rss/2.0/heraldsun_otherside_211.xml')]
+
+ def get_browser(self):
+ br = BasicNewsRecipe.get_browser(self)
+ if self.username and self.password:
+ br.open('http://www.heraldsun.com.au')
+ br.select_form(nr=0)
+ br['username'] = self.username
+ br['password'] = self.password
+ raw = br.submit().read()
+ if '>log out' not in raw.lower():
+                raise ValueError('Failed to log in to www.heraldsun.com.au,'
+ ' are your username and password correct?')
+ return br
+
+ def get_article_url(self, article):
+ return article.id
+
+
diff --git a/recipes/metro_news_nl.recipe b/recipes/metro_news_nl.recipe
index ac3e23869b..9191f7caec 100644
--- a/recipes/metro_news_nl.recipe
+++ b/recipes/metro_news_nl.recipe
@@ -3,25 +3,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
-try:
- from calibre_plugins.drMerry.debug import debuglogger as mlog
- print 'drMerry debuglogger found, debug options can be used'
- from calibre_plugins.drMerry.stats import statslogger as mstat
- print 'drMerry stats tracker found, stat can be tracked'
- mlog.setLoglevel(1) #-1 == no log; 0 for normal output
- mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0
- KEEPSTATS = mstat.keepmystats()
- SHOWDEBUG0 = mlog.showdebuglevel(0)
- SHOWDEBUG1 = mlog.showdebuglevel(1)
- SHOWDEBUG2 = mlog.showdebuglevel(2)
-except:
- #print 'drMerry debuglogger not found, skipping debug options'
- SHOWDEBUG0 = False
- SHOWDEBUG1 = False
- SHOWDEBUG2 = False
- KEEPSTATS = False
-
-#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))
''' Version 1.2, updated cover image to match the changed website.
added info date on title
@@ -43,80 +24,75 @@ except:
extended timeout from 2 to 10
changed oldest article from 10 to 1.2
changed max articles from 15 to 25
+ Version 1.9.1 18-04-2012
+ removed some debug settings
+ updated code to match new metro-layout
+ Version 1.9.2 24-04-2012
+ updated code to match new metro-layout
+ Version 1.9.3 25-04-2012
+    Changed a lot of custom code into calibre code as the default code of calibre has become much faster since the first version of this recipe
+ Added new feeds
+ Updated css
+    Changed order of regexps to speed up processing
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 1.2
max_articles_per_feed = 25
- __author__ = u'DrMerry'
- description = u'Metro Nederland'
- language = u'nl'
- simultaneous_downloads = 3
+ __author__ = u'DrMerry'
+ description = u'Metro Nederland'
+ language = u'nl'
+ simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 10
- center_navbar = True
- timefmt = ' [%A, %d %b %Y]'
+ center_navbar = True
+ timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
- cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
+ cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper'
- encoding = 'utf-8'
- remove_attributes = ['style', 'font', 'width', 'height']
+ encoding = 'utf-8'
+ remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
use_embedded_content = False
- conversion_options = {
- 'authors' : 'Metro Nederland & calibre & DrMerry',
- 'author_sort' : 'Metro Nederland & calibre & DrMerry',
- 'publisher' : 'DrMerry/Metro Nederland'
- }
- extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
- #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
- .article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
- h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
- .article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
- div.column-1-2 {display: inline;padding-right: 7px;}\
- p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
- p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
- div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
- div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
- img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'
+ extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact.module-title{margin:8px 0}.article-box-fact.module-title,h2{font-size:1.1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2{border:0;padding:0}.column1,h1,h2{margin:0}'
+
     preprocess_regexps = [
-        (re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
-         lambda match: '<hr class="merryhr" />'),
-        (re.compile(r'(<img[^>]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
-         lambda match: ''),
+        (re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
+        #(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
+        #(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
+        #(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
     ]
+
+ remove_tags_before= dict(id='date')
+ remove_tags_after = [dict(name='div', attrs={'class':['column-1-3','gallery-text']})]#id='share-and-byline')]
+ remove_tags = [
+ dict(name=['iframe','script','noscript','style']),
+ dict(name='div', attrs={'class':[re.compile('column-[14]-5'),'col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)')]}),
+ dict(id=['column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'sidebar',re.compile('^article-\d'),'comments','gallery-1']),
+ dict(name='a', attrs={'name':'comments'}),
+ #dict(name='div', attrs={'data-href'}),
+ dict(name='img', attrs={'class':'top-line'}),
+ dict(attrs={'style':re.compile('^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'),'title':'volledig scherm'})]
+
+ '''removed by before/after:
+ id:
+ column-1-5-top,'hidden_div','footer',
+ class:
+ 'header',re.compile('^footer-[a-zA-Z0-9]+$),'header-links',
+ '''
def preprocess_html(self, soup):
- if SHOWDEBUG0 == True:
- mlog.setdefaults()
- mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
- if KEEPSTATS == True:
- mlog.addDebug('Stats will be calculated')
- else:
- mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
- mlog.showDebug()
myProcess = MerryProcess()
+ myProcess.moveTitleAndAuthor(soup)
myProcess.removeUnwantedTags(soup)
return soup
def postprocess_html(self, soup, first):
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
- if SHOWDEBUG0 == True:
- if KEEPSTATS == True:
- statinfo = 'generated stats:'
- statinfo += str(mstat.stats(mstat.statslist))
- print statinfo
- statinfo = 'generated stats (for removed tags):'
- statinfo += str(mstat.stats(mstat.removedtagslist))
- print statinfo
- #show all Debug info we forgot to report
- #Using print to be sure that this text will not be added at the end of the log.
- print '\n!!!!!unreported messages:\n(should be empty)\n'
- mlog.showDebug()
return soup
feeds = [
@@ -128,295 +104,109 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
(u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
(u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
- (u'Dot', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
+ (u'Strips',u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
+ (u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
- (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
+ (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
+ (u'Wetenschap',u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
+ (u'Planeet',u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
+ (u'Gezondheid',u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
]
class MerryPreProcess():
- def replacePictures(self, soup):
- #to be implemented
- return soup
-
def optimizePicture(self,soup):
- if SHOWDEBUG0 == True:
- mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
- iurl = tag['src']
- img = Image()
- img.open(iurl)
- img.trim(0)
- img.save(iurl)
- if SHOWDEBUG0 == True:
- mlog.addDebug('Images optimized')
- mlog.showDebug()
+ try:
+ iurl = tag['src']
+ img = Image()
+ img.open(iurl)
+ img.trim(0)
+ img.save(iurl)
+ except:
+ print '\n!!image optimize failed!!\n'
+ continue
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
- if SHOWDEBUG2 == True:
- mlog.addTextAndTag(['items to remove'],[killingSoup])
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
- if SHOWDEBUG1 == True:
- mlog.addDebug('tag extracted')
- mlog.showDebug()
- if KEEPSTATS == True:
- try:
- mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
- except:
- mstat.addstat(mstat.removedtagslist,'unknown')
except:
- if SHOWDEBUG1 == True:
- mlog.addDebug('tag extraction failed')
- mlog.showDebug()
- if KEEPSTATS == True:
- mstat.addstat(mstat.removedtagslist,'exception')
return False
else:
return False
return killingSoup
-class MerryReplace():
- myKiller = MerryExtract()
- def replaceATag(self, soup):
- anchors = []
- anchors = soup.findAll('a')
- if anchors and not (anchors == None or anchors == []):
- try:
- for link in anchors:
- # print str(link)
- if link and not link == None:
- # print ('type: %s'%(str(type(link))))
- # print ('link: %s' % (link))
- myParent = link.parent
- # print str('parent: %s'%(myParent))
- try:
- myIndex = link.parent.index(link)
- hasIndex = True
- except:
- myIndex = 0
- hasIndex = False
- # print str('index %s'%(myIndex))
- if not link.string == None:
- # print 'link=notnone'
- if hasIndex == True:
- myParent.insert(myIndex, link.string)
- else:
- myParent.append(link.string)
- else:
- # print 'link=none'
- myParent.insert(myIndex, link.contents)
- self.myKiller.safeRemovePart(link, False)
- else:
- notshown = 'tag received is empty' # print
- except:
- notshown = 'tag received is empty' # print
- notshown
- return soup
-
class MerryProcess(BeautifulSoup):
myKiller = MerryExtract()
- myReplacer = MerryReplace()
myPrepare = MerryPreProcess()
def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
- if SHOWDEBUG0 == True:
- mlog.addDebug('End of Optimize Layout')
- mlog.showDebug()
return soup
def insertFacts(self, soup):
- allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['allfacts'],[allfacts])
- mlog.showDebug()
+ thefactpart = re.compile('^article-box-fact.*$')
+ allfacts = soup.findAll('div', {'class':thefactpart})
if allfacts and not allfacts == None:
- allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
- mlog.showDebug()
+ allfactsparent = soup.find('div', {'class':thefactpart}).parent
for part in allfactsparent:
if not part in allfacts:
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['FOUND A non-fact'],[part])
- mlog.showDebug()
self.myKiller.safeRemovePart(part, True)
- if SHOWDEBUG1 == True:
- mlog.addTextAndTag(['New All Facts'],[allfacts])
- mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'})
- errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['curcontag'],[contenttag])
- mlog.showDebug()
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
- if SHOWDEBUG0 == True:
- if errorOccured == False:
- mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
- else:
- mlog.addDebug('Could not find right parent tag. Error Occured')
- mlog.showDebug()
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
- if SHOWDEBUG2 == True:
- mlog.addTextAndTag(['added parent'],[soup.prettify()])
- mlog.showDebug()
except:
- errorOccured=True
- mlog.addTrace()
- else:
- errorOccured=True
- if SHOWDEBUG0 == True and errorOccured == True:
- mlog.addTextAndTag(['no articlefacts'],[articlefacts])
- mlog.showDebug()
+ pass
+ return soup
+
+ def moveTitleAndAuthor(self, soup):
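+        # move the headline (h1) up next to the publication date so both end up at the top of the article body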
+ moveitem = soup.h1
+ pubdate = soup.find(id="date")
+ if moveitem and not moveitem == None and pubdate and not pubdate == None:
+ try:
+ pubdate.parent.insert(0, moveitem)
+ except:
+ print '\n!!error in moving title!!\n'
+ pass
+ moveitem = None
+ moveitem = soup.find('div', {'class':'byline'})
+ if moveitem and not moveitem == None:
+ try:
+ moveitem.parent.parent.insert(-1, moveitem)
+ except:
+ print '\n!!error in moving byline!!\n'
+ pass
return soup
-
- def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
- findsibsof = soup
- firstpart = previous
- if findsibsof and not findsibsof == None:
- if soupIsArray == True:
- for foundsib in findsibsof:
- self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
- else:
- if firstpart == True and soupIsArray == False:
- sibs = findsibsof.previousSiblingGenerator()
- else:
- sibs = findsibsof.nextSiblingGenerator()
- for sib in sibs:
- self.myKiller.safeRemovePart(sib, True)
- else:
- if SHOWDEBUG1 == True:
- mlog.addDebug('Not any sib found')
- return
def removeUnwantedTags(self,soup):
- if SHOWDEBUG1 == True:
- mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
- mlog.showDebug()
- self.removeTagsByName(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
- mlog.showDebug()
self.insertFacts(soup)
- self.removeFirstAndLastPart(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
- mlog.showDebug()
- self.removeUnwantedParts(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
- mlog.showDebug()
self.removeEmptyTags(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
- mlog.showDebug()
- self.myReplacer.replaceATag(soup)
- return soup
-
- def removeUnwantedParts(self, soup):
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
- mlog.showDebug()
- self.removeUnwantedTagsByID(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
- mlog.showDebug()
- self.removeUnwantedTagsByClass(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
- mlog.showDebug()
- self.removeUnwantedTagsByStyle(soup)
- return soup
-
- def removeUnwantedTagsByStyle(self,soup):
- self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
- if SHOWDEBUG0 == True:
- mlog.addDebug('end remove by style')
+ self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'})) # at end to keep author
return soup
def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)
- def removeUnwantedTagsByClass(self,soup):
- if SHOWDEBUG0 == True:
- mlog.addDebug('start remove by class')
- self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
- return soup
-
- def removeUnwantedTagsByID(self,soup):
- defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
- for removeid in defaultids:
- if SHOWDEBUG1 == True:
- mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
- mlog.showDebug()
- self.removeArrayOfTags(soup.findAll(id=removeid))
- return soup
-
- # def safeRemoveTag(self, subtree):
- # return self.myKiller.safeRemovePart(subtree, True)
-
-
- def removeTagsByName(self, soup):
- self.myKiller.safeRemovePart(soup.script, True)
- self.myKiller.safeRemovePart(soup.iframe, True)
- self.myKiller.safeRemovePart(soup.style, True)
- self.myKiller.safeRemovePart(soup.noscript, True)
- return soup
-
def removeEmptyTags(self,soup,run=0):
- if SHOWDEBUG0 == True:
- mlog.addDebug('starting removeEmptyTags')
- if SHOWDEBUG1 == True:
- run += 1
- mlog.addDebug(run)
- if SHOWDEBUG2 == True:
- mlog.addDebug(str(soup.prettify()))
- mlog.showDebug()
-        emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$')
+        emptymatches = re.compile('^[&nbsp;\s\n\r\t ]*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
- if SHOWDEBUG1 == True:
- mlog.addDebug('tags found')
- mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
- else:
- if SHOWDEBUG1 == True:
- mlog.addDebug('no empty tags found')
- mlog.showDebug()
- if SHOWDEBUG0 == True:
- if SHOWDEBUG2 == True:
- mlog.addDebug('new soup:')
- mlog.addDebug(str(soup.prettify()))
- mlog.addDebug('RemoveEmptyTags Completed')
- mlog.showDebug()
- return soup
-
- def removeFirstAndLastPart(self,soup):
- def findparenttag(lookuptag):
- if lookuptag and not lookuptag == None:
- return lookuptag.findParents()
- findtag = soup.find(id="date")
- self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
- self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
- for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
- self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
- self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
- return soup
+ return soup
\ No newline at end of file
diff --git a/recipes/metro_uk.recipe b/recipes/metro_uk.recipe
index 8dc7008a68..c30f81c019 100644
--- a/recipes/metro_uk.recipe
+++ b/recipes/metro_uk.recipe
@@ -1,52 +1,30 @@
-import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro UK'
description = 'News as provide by The Metro -UK'
-
+ #timefmt = ''
__author__ = 'Dave Asbury'
- #last update 3/12/11
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
- no_stylesheets = True
+ #no_stylesheets = True
oldest_article = 1
- max_articles_per_feed = 20
+ max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
+ auto_cleanup = True
- #preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
- preprocess_regexps = [
- (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match: ' ')]
- preprocess_regexps = [
- (re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
-
-
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
-
-
keep_only_tags = [
- dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
- dict(attrs={'class':['img-cnt figure']}),
- dict(attrs={'class':['art-img']}),
- dict(name='div', attrs={'class':'art-lft'}),
- dict(name='p')
+
]
remove_tags = [
- dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
- dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
- 'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
- dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
- ,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
+
]
+
+
feeds = [
(u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
-
extra_css = '''
- body {font: sans-serif medium;}'
- h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
- h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
- span{ font-size:9.5px; font-weight:bold;font-style:italic}
- p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
-
- '''
+ body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+ '''
diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe
index 2ae6bc391e..3d1a8b6095 100644
--- a/recipes/naczytniki.recipe
+++ b/recipes/naczytniki.recipe
@@ -7,12 +7,12 @@ class naczytniki(BasicNewsRecipe):
cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
language = 'pl'
description ='everything about e-readers'
- category='readers'
+ category='e-readers'
no_stylesheets=True
+ use_embedded_content=False
oldest_article = 7
max_articles_per_feed = 100
    preprocess_regexps = [(re.compile(ur'<p><b>Zobacz także:</b></p>.*?', re.DOTALL), lambda match: '') ]
- remove_tags_after= dict(name='div', attrs={'class':'sociable'})
keep_only_tags=[dict(name='div', attrs={'class':'post'})]
remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]
- feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
+ feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
\ No newline at end of file
diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe
index a2f759e878..07fc0da666 100644
--- a/recipes/national_geographic_pl.recipe
+++ b/recipes/national_geographic_pl.recipe
@@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class recipeMagic(BasicNewsRecipe):
title = 'National Geographic PL'
__author__ = 'Marcin Urban 2011'
+ __modified_by__ = 'fenuks'
description = 'legenda wśród magazynów z historią sięgającą 120 lat'
- cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
+ #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
@@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe):
]
remove_attributes = ['width','height']
+ feeds=[]
- feeds = [
- ('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
- ]
+ def find_articles(self, url):
+ articles = []
+ soup=self.index_to_soup(url)
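+        # each section page lists its articles inside the element with class 'arl'; every <li> holds one entry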
+ tag=soup.find(attrs={'class':'arl'})
+ art=tag.ul.findAll('li')
+ for i in art:
+ title=i.a['title']
+ url=i.a['href']
+ #date=soup.find(id='footer').ul.li.string[41:-1]
+ desc=i.div.p.string
+ articles.append({'title' : title,
+ 'url' : url,
+ 'date' : '',
+ 'description' : desc
+ })
+ return articles
+
+ def parse_index(self):
+ feeds = []
+ feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/')))
+ feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/')))
+
+ return feeds
def print_version(self, url):
- return url.replace('artykuly0Cpokaz', 'drukuj-artykul')
+ if 'artykuly' in url:
+ return url.replace('artykuly/pokaz', 'drukuj-artykul')
+ elif 'aktualnosci' in url:
+ return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
+ else:
+ return url
+
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/')
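+        # the current-issues page carries the latest cover image inside the 'txt jus' block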
+ tag=soup.find(attrs={'class':'txt jus'})
+ self.cover_url=tag.img['src']
+ return getattr(self, 'cover_url', self.cover_url)
diff --git a/recipes/non_leggerlo.recipe b/recipes/non_leggerlo.recipe
new file mode 100644
index 0000000000..90bb76c0ef
--- /dev/null
+++ b/recipes/non_leggerlo.recipe
@@ -0,0 +1,16 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1335362999(BasicNewsRecipe):
+ title = u'Non leggerlo'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ auto_cleanup = False
+ keep_only_tags = [
+ dict(name='div', attrs={'class':'post hentry'})
+ ]
+ feeds = [(u'Non leggerlo', u'http://nonleggerlo.blogspot.com/feeds/posts/default')]
+ description = 'An Italian satirical blog'
+ language = 'it'
+ __author__ = 'faber1971'
+__version__ = 'v1.0'
+__date__ = '24, April 2012'
diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe
index ec556da5fa..0371cb1f58 100644
--- a/recipes/nowa_fantastyka.recipe
+++ b/recipes/nowa_fantastyka.recipe
@@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
title=soup.find(attrs={'class':'tytul'})
if title:
title['style']='font-size: 20px; font-weight: bold;'
- self.log.warn(soup)
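+        # rewrite site-relative hrefs as absolute URLs so links keep working after conversion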
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.INDEX + a['href']
return soup
diff --git a/recipes/nrc_handelsblad.recipe b/recipes/nrc_handelsblad.recipe
new file mode 100644
index 0000000000..2f149161c2
--- /dev/null
+++ b/recipes/nrc_handelsblad.recipe
@@ -0,0 +1,76 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012'
+'''
+nrc.nl
+'''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class NRC(BasicNewsRecipe):
+ title = 'NRC Handelsblad'
+ __author__ = 'veezh'
+ description = 'Nieuws (no subscription needed)'
+ oldest_article = 1
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ #delay = 1
+ use_embedded_content = False
+ encoding = 'utf-8'
+ publisher = 'nrc.nl'
+ category = 'news, Netherlands, world'
+ language = 'nl'
+ timefmt = ''
+ #publication_type = 'newsportal'
+ extra_css = '''
+ h1{font-size:130%;}
+ #h2{font-size:100%;font-weight:normal;}
+ #.href{font-size:xx-small;}
+ .bijschrift{color:#666666; font-size:x-small;}
+ #.main-article-info{font-family:Arial,Helvetica,sans-serif;}
+ #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
+ #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
+ '''
+ #preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')]
+ conversion_options = {
+ 'comments' : description
+ ,'tags' : category
+ ,'language' : language
+ ,'publisher' : publisher
+ ,'linearize_tables': True
+ }
+
+ remove_empty_feeds = True
+
+ filterDuplicates = True
+
+ def preprocess_html(self, soup):
+ for alink in soup.findAll('a'):
+ if alink.string is not None:
+ tstr = alink.string
+ alink.replaceWith(tstr)
+ return soup
+
+ keep_only_tags = [dict(name='div', attrs={'class':'article'})]
+ remove_tags_after = [dict(id='broodtekst')]
+
+# keep_only_tags = [
+# dict(name='div', attrs={'class':['label']})
+# ]
+
+# remove_tags_after = [dict(name='dl', attrs={'class':['tags']})]
+
+# def get_article_url(self, article):
+# link = article.get('link')
+# if 'blog' not in link and ('chat' not in link):
+# return link
+
+ feeds = [
+# ('Nieuws', 'http://www.nrc.nl/rss.php'),
+ ('Binnenland', 'http://www.nrc.nl/nieuws/categorie/binnenland/rss.php'),
+ ('Buitenland', 'http://www.nrc.nl/nieuws/categorie/buitenland/rss.php'),
+ ('Economie', 'http://www.nrc.nl/nieuws/categorie/economie/rss.php'),
+ ('Wetenschap', 'http://www.nrc.nl/nieuws/categorie/wetenschap/rss.php'),
+ ('Cultuur', 'http://www.nrc.nl/nieuws/categorie/cultuur/rss.php'),
+ ('Boeken', 'http://www.nrc.nl/boeken/rss.php'),
+ ('Tech', 'http://www.nrc.nl/tech/rss.php/'),
+ ('Klimaat', 'http://www.nrc.nl/klimaat/rss.php/'),
+ ]
diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe
index 9dc11059c4..17b8f241ff 100644
--- a/recipes/oreilly_premium.recipe
+++ b/recipes/oreilly_premium.recipe
@@ -1,45 +1,69 @@
-# Talking Points is not grabbing everything.
-# The look is right, but only the last one added?
-import re
import time
+import traceback
+# above for debugging via stack
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
-# strip ads and graphics
-# Current Column lacks a title.
-# Talking Points Memo - shorten title - Remove year and Bill's name
+
+import os
+
+
+from calibre.web.feeds import feeds_from_index
+from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+
+
+# To Do: strip ads and graphics, Current Column lacks a title.
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
+# ./ebook-convert --username xxx --password xxx
+# this is derived from BasicNewsRecipe, so it can only overload those.
+# Some of what we need is otherwise in article, so we have more copying to do than otherwise.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
- language = 'en'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
+ custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
+ title = 'Bill O\'Reilly Premium'
auto_cleanup = True
+ conversion_options = {'linearize_tables': True}
encoding = 'utf8'
- needs_subscription = True
+ language = 'en'
no_stylesheets = True
- oldest_article = 20
+ needs_subscription = True
+ oldest_article = 31
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down
recursions = 0
- max_articles_per_feed = 2000
+ max_articles_per_feed = 20
debugMessages = True
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
- ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
- ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
- ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
- ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
+ # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
+ # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
+ # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
+ # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
+ feeds = [
+ (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
+ (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
+ (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
+ (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
+ (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
+ ]
+ # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
+
+ # Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
+ # Now using RSS
+
def get_browser(self):
+ print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -66,6 +90,7 @@ class OReillyPremium(BasicNewsRecipe):
def stripBadChars(self, inString) :
return inString.replace("\'", "")
+
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
@@ -73,6 +98,7 @@ class OReillyPremium(BasicNewsRecipe):
fullReturn = []
for i in range(len(self.catList)) :
articleList = []
+ print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@@ -81,14 +107,12 @@ class OReillyPremium(BasicNewsRecipe):
# 3-5 create one.
# So no for-div for 3-5
- if i < 3 :
+ if i == 0 :
+ print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
+ print("Next DIV:")
print(div)
- if i == 1:
- a = div.find('a', href=True)
- else :
- a = div
- print(a)
+ a = div
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +120,63 @@ class OReillyPremium(BasicNewsRecipe):
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
- if i < 2 :
- url = self.extractPrintURL(baseURL, url, "Print this entry")
- title = self.tag_to_string(a, use_alt=True).strip()
- elif i == 2 :
- # Daily Briefs
- url = self.extractPrintURL(baseURL, url, "Print this entry")
- title = div.contents[0]
- if self.debugMessages :
- print(title+" @ "+url)
+ url = self.extractPrintURL(baseURL, url, "Print this entry")
+ title = self.tag_to_string(a, use_alt=True).strip()
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
- elif i == 3 : # Stratfor
- a = soup.find('a', self.catList[i][3])
- if a is None :
- continue
- url = baseURL+a['href']
- title = self.tag_to_string(a, use_alt=True).strip()
- # Get Stratfor contents so we can get the real title.
- stratSoup = self.index_to_soup(url)
- title = stratSoup.html.head.title.string
- stratIndex = title.find('Stratfor.com:', 0)
- if (stratIndex > -1) :
- title = title[stratIndex+14:-1]
- # Look for first blogBody 2K, it is used as the article.
+
+
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@@ -182,12 +187,19 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
+ # it is called by download
def parse_index(self):
# Parse the page into Python Soup
+ print("Entering recipe print_index from:")
+ traceback.print_stack()
+ print("web")
baseURL = "https://www.billoreilly.com"
- return self.parseGeneric(baseURL)
+ masterList = self.parseGeneric(baseURL)
+ #print(masterList)
+ return masterList
def preprocess_html(self, soup):
+ print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
@@ -195,3 +207,128 @@ class OReillyPremium(BasicNewsRecipe):
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
+ def build_index(self):
+ print("In OReilly build_index()\n\n")
+ feedsRSS = []
+ self.report_progress(0, ('Fetching feeds...'))
+ #try:
+ feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+ max_articles_per_feed=self.max_articles_per_feed,
+ log=self.log)
+ self.report_progress(0, ('Got feeds from index page'))
+ #except NotImplementedError:
+ # feeds = self.parse_feeds()
+ # Now add regular feeds.
+ feedsRSS = self.parse_feeds()
+ print ("feedsRSS is type "+feedsRSS.__class__.__name__)
+
+ for articles in feedsRSS:
+ print("articles is type "+articles.__class__.__name__)
+ print("Title:" + articles.title)
+ feeds.append(articles)
+ if not feeds:
+ raise ValueError('No articles found, aborting')
+
+ #feeds = FeedCollection(feeds)
+
+ self.report_progress(0, ('Trying to download cover...'))
+ self.download_cover()
+ self.report_progress(0, ('Generating masthead...'))
+ self.masthead_path = None
+
+ try:
+ murl = self.get_masthead_url()
+ except:
+ self.log.exception('Failed to get masthead url')
+ murl = None
+
+ if murl is not None:
+ # Try downloading the user-supplied masthead_url
+ # Failure sets self.masthead_path to None
+ self.download_masthead(murl)
+ if self.masthead_path is None:
+ self.log.info("Synthesizing mastheadImage")
+ self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
+ try:
+ self.default_masthead_image(self.masthead_path)
+ except:
+ self.log.exception('Failed to generate default masthead image')
+ self.masthead_path = None
+
+ if self.test:
+ feeds = feeds[:2]
+ self.has_single_feed = len(feeds) == 1
+
+ index = os.path.join(self.output_dir, 'index.html')
+
+ html = self.feeds2index(feeds)
+ with open(index, 'wb') as fi:
+ fi.write(html)
+
+ self.jobs = []
+
+ if self.reverse_article_order:
+ for feed in feeds:
+ if hasattr(feed, 'reverse'):
+ feed.reverse()
+
+ self.feed_objects = feeds
+ for f, feed in enumerate(feeds):
+ feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+ if not os.path.isdir(feed_dir):
+ os.makedirs(feed_dir)
+
+ for a, article in enumerate(feed):
+ if a >= self.max_articles_per_feed:
+ break
+ art_dir = os.path.join(feed_dir, 'article_%d'%a)
+ if not os.path.isdir(art_dir):
+ os.makedirs(art_dir)
+ try:
+ url = self.print_version(article.url)
+ except NotImplementedError:
+ url = article.url
+ except:
+ self.log.exception('Failed to find print version for: '+article.url)
+ url = None
+ if not url:
+ continue
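+                # pick the right fetcher (embedded, obfuscated or plain article) and queue it as a threaded download job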
+ func, arg = (self.fetch_embedded_article, article) \
+ if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
+ else \
+ ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
+ else self.fetch_article), url)
+ req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
+ {}, (f, a), self.article_downloaded,
+ self.error_in_article_download)
+ req.feed = feed
+ req.article = article
+ req.feed_dir = feed_dir
+ self.jobs.append(req)
+
+
+ self.jobs_done = 0
+ tp = ThreadPool(self.simultaneous_downloads)
+ for req in self.jobs:
+ tp.putRequest(req, block=True, timeout=0)
+
+
+ self.report_progress(0, ('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+ while True:
+ try:
+ tp.poll()
+ time.sleep(0.1)
+ except NoResultsPending:
+ break
+ for f, feed in enumerate(feeds):
+ print("Writing feeds for "+feed.title)
+ html = self.feed2index(f,feeds)
+ feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+ with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
+ fi.write(html)
+ self.create_opf(feeds)
+ self.report_progress(1, ('Feeds downloaded to %s')%index)
+
+ return index
+
+
diff --git a/recipes/orlando_sentinel.recipe b/recipes/orlando_sentinel.recipe
index 7a59f6f6ba..b327bc2b74 100644
--- a/recipes/orlando_sentinel.recipe
+++ b/recipes/orlando_sentinel.recipe
@@ -1,3 +1,4 @@
+import urllib, re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1279258912(BasicNewsRecipe):
@@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif'
- keep_only_tags = [
- dict(name='div', attrs={'class':'story'})
- ]
- remove_tags = [
- dict(name='div', attrs={'class':['articlerail','tools','comment-group','clearfix']}),
- ]
- remove_tags_after = [
- dict(name='p', attrs={'class':'copyright'}),
- ]
+
+ auto_cleanup = True
+
+ def get_article_url(self, article):
+ ans = None
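+        # first try to recover the real article URL from the bookmark link embedded in the feed summary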
+ try:
+ s = article.summary
+ ans = urllib.unquote(
+ re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
+ except:
+ pass
+ if ans is None:
+ link = article.get('feedburner_origlink', None)
+ if link and link.split('/')[-1]=="story01.htm":
+ link=link.split('/')[-2]
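+                # feedburner escapes URL characters as two-character codes; map them back to rebuild the original address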
+ encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+ '0D': '?', '0E': '-', '0N': '.com', '0L': 'http:',
+ '0S':'//'}
+ for k, v in encoding.iteritems():
+ link = link.replace(k, v)
+ ans = link
+ elif link:
+ ans = link
+ if ans is not None:
+ return ans.replace('?track=rss', '')
+
+
diff --git a/recipes/ourdailybread.recipe b/recipes/ourdailybread.recipe
index e0d38db821..1b1b7393b3 100644
--- a/recipes/ourdailybread.recipe
+++ b/recipes/ourdailybread.recipe
@@ -14,6 +14,7 @@ class OurDailyBread(BasicNewsRecipe):
language = 'en'
max_articles_per_feed = 100
no_stylesheets = True
+ auto_cleanup = True
use_embedded_content = False
category = 'ODB, Daily Devotional, Bible, Christian Devotional, Devotional, RBC Ministries, Our Daily Bread, Devotionals, Daily Devotionals, Christian Devotionals, Faith, Bible Study, Bible Studies, Scripture, RBC, religion'
encoding = 'utf-8'
@@ -25,12 +26,12 @@ class OurDailyBread(BasicNewsRecipe):
,'linearize_tables' : True
}
- keep_only_tags = [dict(attrs={'class':'module-content'})]
- remove_tags = [
- dict(attrs={'id':'article-zoom'})
- ,dict(attrs={'class':'listen-now-box'})
- ]
- remove_tags_after = dict(attrs={'class':'readable-area'})
+ #keep_only_tags = [dict(attrs={'class':'module-content'})]
+ #remove_tags = [
+ #dict(attrs={'id':'article-zoom'})
+ #,dict(attrs={'class':'listen-now-box'})
+ #]
+ #remove_tags_after = dict(attrs={'class':'readable-area'})
extra_css = '''
.text{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe
index d7f4c8093d..953dee67eb 100644
--- a/recipes/overclock_pl.recipe
+++ b/recipes/overclock_pl.recipe
@@ -17,21 +17,8 @@ class Overclock_pl(BasicNewsRecipe):
remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})]
feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')]
-
- def append_page(self, soup, appendtag):
- tag=soup.find(id='navigation')
- if tag:
- nexturl=tag.findAll('option')
- tag.extract()
- for nextpage in nexturl[2:]:
- soup2 = self.index_to_soup(nextpage['value'])
- pagetext = soup2.find(id='content')
- pos = len(appendtag.contents)
- appendtag.insert(pos, pagetext)
- rem=appendtag.find(attrs={'alt':'Pierwsza'})
- if rem:
- rem.parent.extract()
-
- def preprocess_html(self, soup):
- self.append_page(soup, soup.body)
- return soup
\ No newline at end of file
+ def print_version(self, url):
+ if 'articles/show' in url:
+ return url.replace('show', 'showall')
+ else:
+ return url
\ No newline at end of file
diff --git a/recipes/palmtop_pl.recipe b/recipes/palmtop_pl.recipe
index ace772e7e7..87da5d0d1c 100644
--- a/recipes/palmtop_pl.recipe
+++ b/recipes/palmtop_pl.recipe
@@ -10,5 +10,7 @@ class palmtop_pl(BasicNewsRecipe):
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
-
+ use_embedded_content=True
+ #remove_tags_before=dict(name='h2')
+ #remove_tags_after=dict(attrs={'class':'entry clearfix'})
feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')]
diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe
index faefeb25c0..56bb601f70 100644
--- a/recipes/pc_arena.recipe
+++ b/recipes/pc_arena.recipe
@@ -1,31 +1,32 @@
from calibre.web.feeds.news import BasicNewsRecipe
class PC_Arena(BasicNewsRecipe):
title = u'PCArena'
- oldest_article = 18300
+ oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
category = 'IT'
language = 'pl'
- masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif'
- cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif'
+ index='http://pcarena.pl'
+ masthead_url='http://pcarena.pl/pcarena/img/logo.png'
+ cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
no_stylesheets = True
- keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
- remove_tags=[dict(attrs={'class':'pages'})]
- feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')]
+ remove_empty_feeds=True
+ #keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
+ #remove_tags=[dict(attrs={'class':'pages'})]
+ feeds = [(u'Aktualności', u'http://pcarena.pl/aktualnosci/feeds.rss'), (u'Testy', u'http://pcarena.pl/testy/feeds.rss'), (u'Software', u'http://pcarena.pl/oprogramowanie/feeds.rss'), (u'Poradniki', u'http://pcarena.pl/poradniki/feeds.rss'), (u'Mobile', u'http://pcarena.pl/mobile/feeds.rss')]
+
+ def print_version(self, url):
+ return url.replace('show', 'print')
- def append_page(self, soup, appendtag):
- tag=soup.find(name='div', attrs={'class':'pagNum'})
- if tag:
- nexturl=tag.findAll('a')
- tag.extract()
- for nextpage in nexturl[1:]:
- nextpage= 'http://pcarena.pl' + nextpage['href']
- soup2 = self.index_to_soup(nextpage)
- pagetext = soup2.find(attrs={'class':'artBody'})
- pos = len(appendtag.contents)
- appendtag.insert(pos, pagetext)
+ def image_url_processor(self, baseurl, url):
+ if 'http' not in url:
+ return 'http://pcarena.pl' + url
+ else:
+ return url
def preprocess_html(self, soup):
- self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
return soup
\ No newline at end of file
diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe
index 68a17888ce..f4eccd70a0 100644
--- a/recipes/pc_centre_pl.recipe
+++ b/recipes/pc_centre_pl.recipe
@@ -10,32 +10,11 @@ class PC_Centre(BasicNewsRecipe):
masthead_url= 'http://pccentre.pl/views/images/logo.gif'
cover_url= 'http://pccentre.pl/views/images/logo.gif'
no_stylesheets = True
- keep_only_tags= [dict(id='content')]
- remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
- feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
+ remove_empty_feeds = True
+ #keep_only_tags= [dict(id='content')]
+ #remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
+ remove_tags=[dict(attrs={'class':'logo_print'})]
+ feeds = [(u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
-
- def append_page(self, soup, appendtag):
- tag=soup.find(name='div', attrs={'class':'pages'})
- if tag:
- nexturl=tag.findAll('a')
- tag.extract()
- for nextpage in nexturl[:-1]:
- nextpage= 'http://pccentre.pl' + nextpage['href']
- soup2 = self.index_to_soup(nextpage)
- pagetext = soup2.find(id='content')
- rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']})
- for r in rem:
- r.extract()
- rem=pagetext.findAll(id='comments')
- for r in rem:
- r.extract()
- rem=pagetext.findAll('h1')
- for r in rem:
- r.extract()
- pos = len(appendtag.contents)
- appendtag.insert(pos, pagetext)
-
- def preprocess_html(self, soup):
- self.append_page(soup, soup.body)
- return soup
\ No newline at end of file
+ def print_version(self, url):
+ return url.replace('show', 'print')
\ No newline at end of file
diff --git a/recipes/readitlater.recipe b/recipes/readitlater.recipe
index 38f7ec1a9a..92c9aaf9d6 100644
--- a/recipes/readitlater.recipe
+++ b/recipes/readitlater.recipe
@@ -1,5 +1,5 @@
"""
-readitlaterlist.com
+Pocket Calibre Recipe v1.0
"""
__license__ = 'GPL v3'
__copyright__ = '''
@@ -12,22 +12,23 @@ from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
-class Readitlater(BasicNewsRecipe):
- title = 'ReadItLater'
+class Pocket(BasicNewsRecipe):
+ title = 'Pocket'
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
- description = '''Personalized news feeds. Go to readitlaterlist.com to setup \
- up your news. This version displays pages of articles from \
+ description = '''Personalized news feeds. Go to getpocket.com to set up \
+ your news. This version displays pages of articles from \
oldest to newest, with max & minimum counts, and marks articles \
read after downloading.'''
- publisher = 'readitlaterlist.com'
+ publisher = 'getpocket.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 50
- minimum_articles = 1
+ minimum_articles = 10
+ mark_as_read_after_dl = True
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
- INDEX = u'http://readitlaterlist.com'
+ INDEX = u'http://getpocket.com'
LOGIN = INDEX + u'/l'
readList = []
@@ -100,9 +101,31 @@ class Readitlater(BasicNewsRecipe):
br = self.get_browser()
for link in markList:
url = self.INDEX + link
+ print 'Marking read: ', url
response = br.open(url)
- response
+ print response.info()
def cleanup(self):
- self.mark_as_read(self.readList)
+ if self.mark_as_read_after_dl:
+ self.mark_as_read(self.readList)
+ else:
+ pass
+ def default_cover(self, cover_file):
+ '''
+ Create a generic cover for recipes that don't have a cover
+ This override adds time to the cover
+ '''
+ try:
+ from calibre.ebooks import calibre_cover
+ title = self.title if isinstance(self.title, unicode) else \
+ self.title.decode('utf-8', 'replace')
+ date = strftime(self.timefmt)
+ time = strftime('[%I:%M %p]')
+ img_data = calibre_cover(title, date, time)
+ cover_file.write(img_data)
+ cover_file.flush()
+ except:
+ self.log.exception('Failed to generate default cover')
+ return False
+ return True
diff --git a/recipes/real_clear.recipe b/recipes/real_clear.recipe
index 19add74fcd..cbf5a2f8e4 100644
--- a/recipes/real_clear.recipe
+++ b/recipes/real_clear.recipe
@@ -1,5 +1,7 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
+import re
import time
+from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down
recursions = 0
max_articles_per_feed = 400
- debugMessages = False
+ debugMessages = True
# Numeric parameter is type, controls whether we look for
feedsets = [
- ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
- ["Science", "http://www.realclearscience.com/index.xml", 0],
+ ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
+ ["Policy", "http://www.realclearpolicy.com/index.xml", 0],
+ ["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
- printhints = [
+ phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
+
+ printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@@ -48,11 +53,24 @@ class RealClear(BasicNewsRecipe):
# usatoday - just prints with all current crap anyhow
]
+ # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
+ # The print link isn't obvious, and only the end is needed (the -full append.) So maybe try that first?
+ # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
+ # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
+ # Use the FULL PRINTPAGE URL; it formats it better too!
+ #
+ # NYT - try single page...
+ # Need special code - is it one page or several? Which URL?
+ # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
+ # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
+ # which is at link rel="canonical" and at 0 and len(self.printhints[x][1]) == 0:
+ if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
+ # e.g. RealClear
if self.debugMessages == True :
- print("search1")
+ print("Search by href: "+self.printhints[x][self.phHrefSearch])
+ printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
+ elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+ if self.debugMessages == True :
+ print("Search 1: "+self.printhints[x][2]+" Attributes: ")
+ print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
+ if self.debugMessages == True:
+ print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
+ # print(soup)
+ print("end soup\n\n");
continue
+
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
+ print("Check "+printFind['href']+" for base of "+baseURL)
+ if printFind['href'].find("http")!=0 :
+ return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
# Parse the page into Python Soup
+ #articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@@ -168,3 +201,4 @@ class RealClear(BasicNewsRecipe):
print(ans)
return ans
+
diff --git a/recipes/richmond_times_dispatch.recipe b/recipes/richmond_times_dispatch.recipe
new file mode 100644
index 0000000000..163a6317ff
--- /dev/null
+++ b/recipes/richmond_times_dispatch.recipe
@@ -0,0 +1,59 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class AdvancedUserRecipe1335532466(BasicNewsRecipe):
+ title = u'Richmond Times-Dispatch'
+ description = 'News from Richmond, Virginia, USA'
+ __author__ = 'jde'
+ cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png'
+ language = 'en'
+ encoding = 'utf8'
+ oldest_article = 1 #days
+ max_articles_per_feed = 25
+ needs_subscription = False
+ remove_javascript = True
+ recursions = 0
+ use_embedded_content = False
+ no_stylesheets = True
+ auto_cleanup = True
+
+ feeds = [
+
+('News',
+'http://www2.timesdispatch.com/list/feed/rss/news-archive'),
+('Breaking News',
+'http://www2.timesdispatch.com/list/feed/rss/breaking-news'),
+('National News',
+'http://www2.timesdispatch.com/list/feed/rss/national-news'),
+('Local News',
+'http://www2.timesdispatch.com/list/feed/rss/local-news'),
+('Business',
+'http://www2.timesdispatch.com/list/feed/rss/business'),
+('Local Business',
+'http://www2.timesdispatch.com/list/feed/rss/local-business'),
+('Politics',
+'http://www2.timesdispatch.com/list/feed/rss/politics'),
+('Virginia Politics',
+'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'),
+('Editorials',
+'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'),
+('Columnists and Blogs',
+'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'),
+('Opinion Columnists',
+'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'),
+('Letters to the Editor',
+'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'),
+('Traffic',
+'http://www2.timesdispatch.com/list/feed/rss/traffic'),
+('Sports',
+'http://www2.timesdispatch.com/list/feed/rss/sports2'),
+('Entertainment/Life',
+'http://www2.timesdispatch.com/list/feed/rss/entertainment'),
+('Movies',
+'http://www2.timesdispatch.com/list/feed/rss/movies'),
+('Music',
+'http://www2.timesdispatch.com/list/feed/rss/music'),
+('Dining & Food',
+'http://www2.timesdispatch.com/list/feed/rss/dining'),
+
+ ]
+
diff --git a/recipes/rue89.recipe b/recipes/rue89.recipe
index bd3ef7ea4c..261770802e 100644
--- a/recipes/rue89.recipe
+++ b/recipes/rue89.recipe
@@ -6,6 +6,7 @@ Rue89
__author__ = '2010-2012, Louis Gesbert '
+import re
from calibre.web.feeds.news import BasicNewsRecipe
class Rue89(BasicNewsRecipe):
@@ -15,23 +16,24 @@ class Rue89(BasicNewsRecipe):
title = u'Rue89'
language = 'fr'
oldest_article = 7
- max_articles_per_feed = 12
+ max_articles_per_feed = 50
use_embedded_content = False
# From http://www.rue89.com/les-flux-rss-de-rue89
feeds = [
(u'La Une', u'http://www.rue89.com/feed'),
- (u'Rue69', u'http://www.rue89.com/rue69/feed'),
- (u'Eco', u'http://www.rue89.com/rue89-eco/feed'),
- (u'Planète', u'http://www.rue89.com/rue89-planete/feed'),
- (u'Sport', u'http://www.rue89.com/rue89-sport/feed'),
- (u'Culture', u'http://www.rue89.com/culture/feed'),
- (u'Hi-tech', u'http://www.rue89.com/hi-tech/feed'),
- (u'Media', u'http://www.rue89.com/medias/feed'),
- (u'Monde', u'http://www.rue89.com/monde/feed'),
- (u'Politique', u'http://www.rue89.com/politique/feed'),
- (u'Societe', u'http://www.rue89.com/societe/feed'),
+ # Other feeds disabled, 'La Une' seems to include them all
+ # (u'Rue69', u'http://www.rue89.com/rue69/feed'),
+ # (u'Eco', u'http://www.rue89.com/rue89-eco/feed'),
+ # (u'Planète', u'http://www.rue89.com/rue89-planete/feed'),
+ # (u'Sport', u'http://www.rue89.com/rue89-sport/feed'),
+ # (u'Culture', u'http://www.rue89.com/culture/feed'),
+ # (u'Hi-tech', u'http://www.rue89.com/hi-tech/feed'),
+ # (u'Media', u'http://www.rue89.com/medias/feed'),
+ # (u'Monde', u'http://www.rue89.com/monde/feed'),
+ # (u'Politique', u'http://www.rue89.com/politique/feed'),
+ # (u'Societe', u'http://www.rue89.com/societe/feed'),
]
# Follow redirection from feedsportal.com
@@ -41,19 +43,36 @@ class Rue89(BasicNewsRecipe):
def print_version(self, url):
return url + '?imprimer=1'
- no_stylesheets = True
-
conversion_options = { 'smarten_punctuation' : True }
keep_only_tags = [
- dict(name='div', attrs={'id':'article'}),
+ dict(name='div', attrs={'id':'content'}),
]
remove_tags_after = [
dict(name='div', attrs={'id':'plus_loin'}),
+ dict(name='div', attrs={'class':'stats'}),
]
remove_tags = [
dict(name='div', attrs={'id':'article_tools'}),
dict(name='div', attrs={'id':'plus_loin'}),
+ dict(name='div', attrs={'class':'stats'}),
+ dict(name='div', attrs={'class':'tools'}),
]
+
+ extra_css = "#content { padding: 0 0; }"
+
+ # Without this, parsing of video articles returns strange results
+ preprocess_regexps = [
+ (re.compile(r'', re.IGNORECASE|re.DOTALL), ''),
+ ]
+
+ def preprocess_html(self, soup):
+ # Remove whole article if it's a "zapnet" (video)
+ if soup.find('h1', {'class':'zapnet_title'}):
+ return None
+ # Reduce h2 titles to h3
+ for title in soup.findAll('h2'):
+ title.name = 'h3'
+ return soup
diff --git a/recipes/sol_haber.recipe b/recipes/sol_haber.recipe
new file mode 100644
index 0000000000..29db88019c
--- /dev/null
+++ b/recipes/sol_haber.recipe
@@ -0,0 +1,141 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Onur Gungor onurgu@gmail.com'
+__docformat__ = 'restructuredtext en'
+
+'''
+www.sol.org.tr
+'''
+
+import datetime
+
+import re
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class SolHaberRecipe(BasicNewsRecipe):
+ title = u'soL Haber'
+ oldest_article = 7
+ max_articles_per_feed = 100
+
+ language = 'tr'
+ __author__ = 'Onur Güngör'
+ description = 'Hayata soL\'dan bakın..'
+ publisher = 'soL Haber'
+ tags = 'news, haberler, siyaset, türkiye, turkey, politics'
+
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : tags
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ category_dict = { 'sonuncu-kavga':'Sonuncu Kavga',
+ 'devlet-ve-siyaset':'Devlet ve Siyaset',
+ 'ekonomi':'Ekonomi',
+ 'enternasyonal-gundem':'Enternasyonel Gündem',
+ 'kent-gundemleri':'Kent Gündemleri',
+ 'kultur-sanat':'Kültür Sanat',
+ 'dunyadan':'Dünyadan',
+ 'serbest-kursu':'Serbest Kürsü',
+ 'medya':'Medya',
+ 'liseliler':'Liseliler',
+ 'yazarlar':'Köşe Yazıları'}
+
+ end_date = datetime.date.today().isoformat()
+ start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat()
+
+
+ section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]]
+
+
+ # Disable stylesheets from site.
+ no_stylesheets = True
+
+ cover_margins = (20, 20, '#ffffff')
+
+ storybody_reg_exp = '^\s*(haber|kose)\s*$'
+
+ comments_reg_exp = '^\s*makale-elestiri\s*$'
+
+ remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})]
+
+ keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})]
+
+ def get_masthead_title(self):
+ return self.title + "(" + self.end_date + ")"
+
+ def parse_index(self):
+
+ result = []
+ articles_dict = dict()
+
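+ # These patterns pull the author slug (for /yazarlar/ pieces) and the category slug out of each article URL.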
+ author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
+ category_regexp = re.compile('^http://.*?/(.+?)/.*$')
+
+ for section_tuple in self.section_tuples:
+
+ section_title = section_tuple[0]
+ section_index_url = section_tuple[1]
+
+ self.log('Bölüm:', section_title, 'URL:', section_index_url)
+
+ soup = self.index_to_soup(section_index_url)
+
+ logo = soup.find('div', id='logo').find('img', src=True)
+ if logo is not None:
+ self.cover_url = logo['src']
+ if self.cover_url.startswith('/'):
+ self.cover_url = 'http://haber.sol.org.tr'+self.cover_url
+
+ view_content = soup.find('div', id='ana-icerik').find('div', attrs={'class':'view-content'})
+ if view_content == None:
+ break
+ rows = view_content.find('tbody').findAll('tr')
+
+ self.log('Row sayısı', len(rows))
+ for row in rows:
+ cells = row.findAll('td')
+
+ a = cells[1].find('a', href=True)
+
+ url = a['href']
+ title = self.tag_to_string(a)
+
+ if url.startswith('/'):
+ url = 'http://haber.sol.org.tr'+url
+
+ category = section_title
+ category_match_result = category_regexp.match(url)
+ if category_match_result:
+ category = category_match_result.group(1)
+
+ date = self.tag_to_string(cells[2])
+
+ author = 'soL haber'
+
+ author_match_result = author_regexp.match(url)
+ if author_match_result:
+ author = author_match_result.group(1)
+
+ self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author)
+ article = {'title':title, 'url':url, 'description':None, 'date':date, 'author':author}
+ if category in articles_dict:
+ articles_dict[category].append(article)
+ else:
+ articles_dict[category] = [article]
+
+ for category in articles_dict.keys():
+ if category in self.category_dict:
+ result.append((self.category_dict[category], articles_dict[category]))
+ else:
+ result.append((category, articles_dict[category]))
+
+ return result
diff --git a/recipes/soldiers.recipe b/recipes/soldiers.recipe
index fb96e5a2ed..a1e9e5ca23 100644
--- a/recipes/soldiers.recipe
+++ b/recipes/soldiers.recipe
@@ -15,6 +15,8 @@ class Soldiers(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
+ auto_cleanup = True
+ auto_cleanup_keep = '//div[@id="mediaWrapper"]'
simultaneous_downloads = 1
delay = 4
max_connections = 1
@@ -31,14 +33,14 @@ class Soldiers(BasicNewsRecipe):
, 'language' : language
}
- keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
+ #keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
- remove_tags = [
- dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
- ,dict(name=['object','link'])
- ]
+ #remove_tags = [
+ #dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
+ #,dict(name=['object','link'])
+ #]
- feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )]
+ feeds = [(u'Frontpage', u'http://www.army.mil/rss/2/' )]
def get_cover_url(self):
diff --git a/recipes/southernstar.recipe b/recipes/southernstar.recipe
new file mode 100644
index 0000000000..69a81e2fb6
--- /dev/null
+++ b/recipes/southernstar.recipe
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, watou'
+'''
+southernstar.ie
+'''
+import re
+import tempfile
+import os
+import codecs
+
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+
+class TheSouthernStar(BasicNewsRecipe):
+
+ title = 'The Southern Star'
+ __author__ = 'watou'
+ description = 'West Cork\'s leading news and information provider since 1889'
+ NEWS_INDEX = 'http://www.southernstar.ie/news.php'
+ LOCAL_NOTES = 'http://www.southernstar.ie/localnotes.php'
+ SPORT_INDEX = 'http://www.southernstar.ie/sport.php'
+ CLASSIFIEDS = 'http://www.southernstar.ie/classifieds.php'
+ language = 'en_IE'
+ encoding = 'cp1252'
+
+ publication_type = 'newspaper'
+ masthead_url = 'http://www.southernstar.ie/images/logo.gif'
+ remove_tags_before = dict(name='div', attrs={'class':'article'})
+ remove_tags_after = dict(name='div', attrs={'class':'article'})
+ remove_tags = [dict(name='div', attrs={'style':'width:300px; position:relative'}),
+ dict(name='form'),
+ dict(name='div', attrs={'class':'endpanel'})]
+ no_stylesheets = True
+ tempfiles = []
+ pubdate = ''
+
+ preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')]
+
+ def parse_index(self):
+ feeds = []
+ seen_titles = set([])
+
+ articles = self.fetch_ss_articles(self.NEWS_INDEX, seen_titles)
+ if articles:
+ feeds.append(('News', articles))
+
+ articles = self.fetch_ss_notes(self.LOCAL_NOTES)
+ if articles:
+ feeds.append(('Local Notes', articles))
+
+ articles = self.fetch_ss_articles(self.SPORT_INDEX, seen_titles)
+ if articles:
+ feeds.append(('Sport', articles))
+
+ articles = self.fetch_ss_notes(self.CLASSIFIEDS)
+ if articles:
+ feeds.append(('Classifieds', articles))
+
+ return feeds
+
+ def fetch_ss_articles(self, index, seen_titles):
+ articles = []
+ soup = self.index_to_soup(index)
+ ts = soup.find('div', {'class':'article'})
+ ds = self.tag_to_string(ts.find('strong'))
+ self.pubdate = ' ['+ds+']'
+ self.timefmt = ' [%s]'%ds
+
+ for post in ts.findAll('h1'):
+ a = post.find('a', href=True)
+ title = self.tag_to_string(a)
+ if title in seen_titles:
+ continue
+ seen_titles.add(title)
+ url = a['href']
+ if url.startswith('article'):
+ url = 'http://www.southernstar.ie/'+url
+ self.log('\tFound article:', title, 'at', url)
+ p = post.findNextSibling('p')
+ desc = None
+ if p is not None:
+ desc = str(p)
+ articles.append({'title':title, 'url':url, 'description':desc,
+ 'date':self.pubdate})
+
+ return articles
+
+ def fetch_ss_notes(self, page):
+ articles = []
+
+ soup = self.index_to_soup(page)
+ ts = soup.find('div', {'class':'content'})
+ for post in ts.findAll('h1'):
+ title = self.tag_to_string(post)
+ self.log('\tFound note:', title)
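+ # Notes and classifieds have no standalone article pages, so each entry is written to a temporary HTML file and fetched via a file:// URL; cleanup() deletes these files when the download finishes.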
+ f = tempfile.NamedTemporaryFile(suffix='.html',delete=False)
+ f.close()
+ f = codecs.open(f.name, 'w+b', self.encoding, 'replace')
+ url = "file://" + f.name
+ f.write(u''+title+'')
+ f.write(str(post.findNextSibling('p')))
+ f.write(u'')
+ self.log('\tWrote note to', f.name)
+ f.close()
+ self.tempfiles.append(f)
+ articles.append({'title':title, 'url':url, 'date':self.pubdate})
+
+ return articles
+
+ def postprocess_html(self, soup, first):
+ for table in soup.findAll('table', align='right'):
+ img = table.find('img')
+ if img is not None:
+ img.extract()
+ caption = self.tag_to_string(table).strip()
+ div = Tag(soup, 'div')
+ div['style'] = 'text-align:center'
+ div.insert(0, img)
+ div.insert(1, Tag(soup, 'br'))
+ if caption:
+ div.insert(2, NavigableString(caption))
+ table.replaceWith(div)
+
+ return soup
+
+ def image_url_processor(self, baseurl, url):
+ return url.replace(' ','%20')
+
+ def cleanup(self):
+ self.log('cleaning up')
+ for f in self.tempfiles:
+ os.unlink(f.name)
+ self.tempfiles = []
diff --git a/recipes/swiat_obrazu.recipe b/recipes/swiat_obrazu.recipe
new file mode 100644
index 0000000000..68740fa4dd
--- /dev/null
+++ b/recipes/swiat_obrazu.recipe
@@ -0,0 +1,25 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Swiat_Obrazu(BasicNewsRecipe):
+ title = u'Swiat Obrazu'
+ __author__ = 'fenuks'
+ description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.'
+ category = 'photography'
+ masthead_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
+ cover_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
+ language = 'pl'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ remove_javascript= True
+ use_embedded_content = False
+ feeds = [(u'Wszystko', u'http://www.swiatobrazu.pl/rss')]
+
+ def print_version(self, url):
+ return url + ',drukuj'
+
+ def image_url_processor(self, baseurl, url):
+ if 'http://' not in url and 'https://' not in url:
+ return 'http://www.swiatobrazu.pl' + url[5:]
+ else:
+ return url
diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe
index f4c1efa9b8..1c3f46f967 100644
--- a/recipes/tablety_pl.recipe
+++ b/recipes/tablety_pl.recipe
@@ -8,10 +8,11 @@ class Tablety_pl(BasicNewsRecipe):
cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
category = 'IT'
language = 'pl'
+ use_embedded_content=True
oldest_article = 8
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'Przeczytaj także.*? ', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj koniecznie.*? ', re.DOTALL), lambda match: '')]
- remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
- remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
- remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]
+ #remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
+ #remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'})
+ #remove_tags=[dict(name='footer', attrs={'class':'entry-footer clearfix'}), dict(name='div', attrs={'class':'entry-comment-counter'})]
feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]
diff --git a/recipes/tagesspiegel.recipe b/recipes/tagesspiegel.recipe
index 92d88d56ae..71191065f1 100644
--- a/recipes/tagesspiegel.recipe
+++ b/recipes/tagesspiegel.recipe
@@ -34,7 +34,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
no_javascript = True
remove_empty_feeds = True
encoding = 'utf-8'
- remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-date hcf-separate'}]
+ remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}]
def print_version(self, url):
url = url.split('/')
@@ -51,6 +51,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
return ''.join(div.findAll(text=True, recursive=False)).strip() if div is not None else None
articles = {}
+ links = set()
key = None
ans = []
maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')})
@@ -59,7 +60,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
if div['class'] == 'hcf-header':
try:
- key = string.capwords(feed_title(div.em.a))
+ key = string.capwords(feed_title(div.em))
articles[key] = []
ans.append(key)
except:
@@ -70,6 +71,12 @@ class TagesspiegelRSS(BasicNewsRecipe):
if not a:
continue
url = 'http://www.tagesspiegel.de' + a['href']
+
+ # check for duplicates
+ if url in links:
+ continue
+ links.add(url)
+
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe
index 666cb8aa77..a615763307 100644
--- a/recipes/tanuki.recipe
+++ b/recipes/tanuki.recipe
@@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ if 'tanuki-anime' in soup.title.string.lower():
+ a['href']='http://anime.tanuki.pl' + a['href']
+ elif 'tanuki-manga' in soup.title.string.lower():
+ a['href']='http://manga.tanuki.pl' + a['href']
+ elif 'tanuki-czytelnia' in soup.title.string.lower():
+ a['href']='http://czytelnia.tanuki.pl' + a['href']
return soup
\ No newline at end of file
diff --git a/recipes/telam.recipe b/recipes/telam.recipe
new file mode 100644
index 0000000000..c2dbfee1d7
--- /dev/null
+++ b/recipes/telam.recipe
@@ -0,0 +1,62 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic '
+'''
+www.telam.com.ar
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Telam(BasicNewsRecipe):
+ title = 'Telam'
+ __author__ = 'Darko Miletic'
+ description = 'AGENCIA DE NOTICIAS DE LA REPUBLICA ARGENTINA'
+ publisher = 'Telam S.E.'
+ category = 'news, politics, Argentina'
+ oldest_article = 2
+ max_articles_per_feed = 200
+ no_stylesheets = True
+ encoding = 'utf8'
+ use_embedded_content = False
+ language = 'es_AR'
+ remove_empty_feeds = True
+ publication_type = 'newsportal'
+ masthead_url = 'http://www.telam.com.ar/front/imagenes/encabezado/logotelam.jpg'
+ extra_css = """
+ body{font-family: Arial,Helvetica,sans-serif }
+ img{margin-bottom: 0.4em; display:block}
+ """
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ remove_tags = [dict(name=['meta','link'])]
+ remove_tags_before = dict(attrs={'class':'nota_fecha'})
+ remove_tags_after = dict(attrs={'class':'nota_completa'})
+ remove_attributes = ['lang']
+
+
+ feeds = [
+ (u'Ultimas noticias', u'http://www.telam.com.ar/xml/rss/' )
+ ,(u'Politica' , u'http://www.telam.com.ar/xml/rss/1')
+ ,(u'Economia' , u'http://www.telam.com.ar/xml/rss/2')
+ ,(u'Sociedad' , u'http://www.telam.com.ar/xml/rss/3')
+ ,(u'Policiales' , u'http://www.telam.com.ar/xml/rss/4')
+ ,(u'Internacionales' , u'http://www.telam.com.ar/xml/rss/6')
+ ,(u'Espectaculos' , u'http://www.telam.com.ar/xml/rss/7')
+ ,(u'Cultura' , u'http://www.telam.com.ar/xml/rss/8')
+ ,(u'Deportes' , u'http://www.telam.com.ar/xml/rss/9')
+ ,(u'Telam Investiga' , u'http://www.telam.com.ar/xml/rss/5')
+ ]
+
+ def print_version(self, url):
+ artid = url.rpartition('/')[2]
+ return 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ return soup
diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe
index 80b37f329a..db74e003a0 100644
--- a/recipes/the_sun.recipe
+++ b/recipes/the_sun.recipe
@@ -1,24 +1,23 @@
-import re
+import re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
-
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'The Sun UK'
- cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
- description = 'A Recipe for The Sun tabloid UK - uses feed43'
+ description = 'A Recipe for The Sun tabloid UK'
__author__ = 'Dave Asbury'
- # last updated 20/2/12
+ # last updated 7/4/12
language = 'en_GB'
oldest_article = 1
max_articles_per_feed = 15
remove_empty_feeds = True
no_stylesheets = True
+ #auto_cleanup = True
+ #articles_are_obfuscated = True
masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
- encoding = 'cp1251'
+ encoding = 'UTF-8'
- encoding = 'cp1252'
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
@@ -30,13 +29,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
preprocess_regexps = [
(re.compile(r''
+ try:
+ width = int(re.sub('.*?width(:|=)(?P<width>\d+).*', '\g<width>', replacement_break))
+ except:
+ scene_break = hr_open+' '
+ self.log.warn('Invalid replacement scene break'
+ ' expression, using default')
+ else:
+ replacement_break = re.sub('(?i)(width=\d+\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break)
+ divpercent = (100 - width) / 2
+ hr_open = re.sub('45', str(divpercent), hr_open)
+ scene_break = hr_open+replacement_break+'</div>'
else:
scene_break = hr_open+' '
elif re.match('^ ]*>\s* \s*', '', html)
+ html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
blanks_count = len(self.any_multi_blank.findall(html))
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
- scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
- scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+ detected_scene_break = re.compile(r']*>.*? ')
+ scene_break_count = len(detected_scene_break.findall(html))
# If the user has enabled scene break replacement, then either softbreaks
# or 'hard' scene breaks are replaced, depending on which is in use
# Otherwise separator lines are centered, use a bit larger margin in this case
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
if replacement_break:
replacement_break = self.markup_user_break(replacement_break)
- if len(scene_break.findall(html)) >= 1:
- html = scene_break.sub(replacement_break, html)
+ if scene_break_count >= 1:
+ html = detected_scene_break.sub(replacement_break, html)
+ html = re.sub(']*>\s* ', replacement_break, html)
else:
html = re.sub(']*>\s* ', replacement_break, html)
- else:
- html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index b45f8f9f9e..b846d76a95 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -18,6 +18,7 @@ from lxml import etree
from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__
from calibre.utils.magick import Image
+from calibre.utils.localization import lang_as_iso639_1
class FB2MLizer(object):
'''
@@ -103,7 +104,10 @@ class FB2MLizer(object):
metadata['version'] = __version__
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
if self.oeb_book.metadata.language:
- metadata['lang'] = self.oeb_book.metadata.language[0].value
+ lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
+ if not lc:
+ lc = self.oeb_book.metadata.language[0].value
+ metadata['lang'] = lc or 'en'
else:
metadata['lang'] = u'en'
metadata['id'] = None
diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py
index ce80486af8..32aad28022 100644
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -647,14 +647,10 @@ class Metadata(object):
return (unicode(cmeta['name']+'_index'), '', '', cmeta)
if key in self.custom_field_keys():
- res = self.get(key, None)
+ res = self.get(key, None) # get evaluates all necessary composites
cmeta = self.get_user_metadata(key, make_copy=False)
name = unicode(cmeta['name'])
- if cmeta['datatype'] != 'composite' and (res is None or res == ''):
- return (name, res, None, None)
- orig_res = res
- cmeta = self.get_user_metadata(key, make_copy=False)
- if res is None or res == '':
+ if res is None or res == '': # can't check "not res" because of numeric fields
return (name, res, None, None)
orig_res = res
datatype = cmeta['datatype']
diff --git a/src/calibre/ebooks/metadata/book/json_codec.py b/src/calibre/ebooks/metadata/book/json_codec.py
index c0c3900a5d..3b52821c1b 100644
--- a/src/calibre/ebooks/metadata/book/json_codec.py
+++ b/src/calibre/ebooks/metadata/book/json_codec.py
@@ -108,6 +108,8 @@ def decode_is_multiple(fm):
else:
im = {'cache_to_list': '|', 'ui_to_list': ',',
'list_to_ui': ', '}
+ elif im is None:
+ im = {}
fm['is_multiple'] = im
class JsonCodec(object):
diff --git a/src/calibre/ebooks/metadata/haodoo.py b/src/calibre/ebooks/metadata/haodoo.py
new file mode 100644
index 0000000000..a32f7a2268
--- /dev/null
+++ b/src/calibre/ebooks/metadata/haodoo.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+'''
+Read meta information from Haodoo.net pdb files.
+'''
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kan-Ru Chen '
+__docformat__ = 'restructuredtext en'
+
+from calibre.ebooks.pdb.header import PdbHeaderReader
+from calibre.ebooks.pdb.haodoo.reader import Reader
+
+def get_metadata(stream, extract_cover=True):
+ '''
+ Return metadata as a L{MetaInfo} object
+ '''
+ stream.seek(0)
+
+ pheader = PdbHeaderReader(stream)
+ reader = Reader(pheader, stream, None, None)
+
+ return reader.get_metadata()
diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index c30545e6e1..92aa960be6 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -535,7 +535,7 @@ class OPF(object): # {{{
series_index = MetadataField('series_index', is_dc=False,
formatter=float, none_is=1)
title_sort = TitleSortField('title_sort', is_dc=False)
- rating = MetadataField('rating', is_dc=False, formatter=int)
+ rating = MetadataField('rating', is_dc=False, formatter=float)
pubdate = MetadataField('date', formatter=parse_date,
renderer=isoformat)
publication_type = MetadataField('publication_type', is_dc=False)
@@ -883,6 +883,8 @@ class OPF(object): # {{{
val = etree.tostring(x, with_tail=False, encoding=unicode,
method='text').strip()
if val and typ not in ('calibre', 'uuid'):
+ if typ == 'isbn' and val.lower().startswith('urn:isbn:'):
+ val = val[len('urn:isbn:'):]
identifiers[typ] = val
found_scheme = True
break
diff --git a/src/calibre/ebooks/metadata/pdb.py b/src/calibre/ebooks/metadata/pdb.py
index d01bb0ecdb..70bcca132e 100644
--- a/src/calibre/ebooks/metadata/pdb.py
+++ b/src/calibre/ebooks/metadata/pdb.py
@@ -14,11 +14,14 @@ from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.metadata.ereader import get_metadata as get_eReader
from calibre.ebooks.metadata.plucker import get_metadata as get_plucker
+from calibre.ebooks.metadata.haodoo import get_metadata as get_Haodoo
MREADER = {
'PNPdPPrs' : get_eReader,
'PNRdPPrs' : get_eReader,
'DataPlkr' : get_plucker,
+ 'BOOKMTIT' : get_Haodoo,
+ 'BOOKMTIU' : get_Haodoo,
}
from calibre.ebooks.metadata.ereader import set_metadata as set_eReader
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index fb1ee4af4e..4ff4726139 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -347,7 +347,10 @@ class Worker(Thread): # Get details {{{
method='text').strip()
else:
title = self.tostring(tdiv, encoding=unicode, method='text').strip()
- return re.sub(r'[(\[].*[)\]]', '', title).strip()
+ ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
+ if not ans:
+ ans = title.rpartition('[')[0].strip()
+ return ans
def parse_authors(self, root):
x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 4408bff6c6..2206a9ff04 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -112,6 +112,18 @@ def get_cached_cover_urls(mi):
if url:
yield (p, url)
+def dump_caches():
+ from calibre.customize.ui import metadata_plugins
+ return {p.name:p.dump_caches() for p in metadata_plugins(['identify'])}
+
+def load_caches(dump):
+ from calibre.customize.ui import metadata_plugins
+ plugins = list(metadata_plugins(['identify']))
+ for p in plugins:
+ cache = dump.get(p.name, None)
+ if cache:
+ p.load_caches(cache)
+
def cap_author_token(token):
lt = lower(token)
if lt in ('von', 'de', 'el', 'van', 'le'):
@@ -293,6 +305,16 @@ class Source(Plugin):
with self.cache_lock:
return self._identifier_to_cover_url_cache.get(id_, None)
+ def dump_caches(self):
+ with self.cache_lock:
+ return {'isbn_to_identifier':self._isbn_to_identifier_cache.copy(),
+ 'identifier_to_cover':self._identifier_to_cover_url_cache.copy()}
+
+ def load_caches(self, dump):
+ with self.cache_lock:
+ self._isbn_to_identifier_cache.update(dump['isbn_to_identifier'])
+ self._identifier_to_cover_url_cache.update(dump['identifier_to_cover'])
+
# }}}
# Utility functions {{{
diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py
index bb1bbb9d42..6d6ebd3990 100755
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -197,14 +197,18 @@ class OverDrive(Source):
title_tokens = list(self.get_title_tokens(title,
strip_joiners=False, strip_subtitle=True))
- if len(title_tokens) >= len(author_tokens):
+ xref_q = ''
+ if len(author_tokens) <= 1:
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
- xref_q = '+'.join(title_tokens)
- #log.error('Initial query is %s'%initial_q)
- #log.error('Cross reference query is %s'%xref_q)
+ for token in title_tokens:
+ if len(xref_q) < len(token):
+ xref_q = token
+
+ log.error('Initial query is %s'%initial_q)
+ log.error('Cross reference query is %s'%xref_q)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
@@ -219,27 +223,30 @@ class OverDrive(Source):
# get the search results object
results = False
+ iterations = 0
while results == False:
+ iterations += 1
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
for m in re.finditer(ur'"iTotalDisplayRecords":(?P\d+).*?"iTotalRecords":(?P\d+)', raw):
- if int(m.group('displayrecords')) >= 1:
- results = True
- elif int(m.group('totalrecords')) >= 1:
- if int(m.group('totalrecords')) >= 100:
- if xref_q.find('+') != -1:
- xref_tokens = xref_q.split('+')
- xref_q = xref_tokens[0]
- #log.error('xref_q is '+xref_q)
- else:
- xref_q = ''
- xref_q = ''
- q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
- elif int(m.group('totalrecords')) == 0:
+ if int(m.group('totalrecords')) == 0:
return ''
+ elif int(m.group('displayrecords')) >= 1:
+ results = True
+ elif int(m.group('totalrecords')) >= 1 and iterations < 3:
+ if xref_q.find('+') != -1:
+ xref_tokens = xref_q.split('+')
+ xref_q = xref_tokens[0]
+ for token in xref_tokens:
+ if len(xref_q) < len(token):
+ xref_q = token
+ #log.error('rewrote xref_q, new query is '+xref_q)
+ else:
+ xref_q = ''
+ q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)
@@ -263,6 +270,7 @@ class OverDrive(Source):
else:
if creators:
creators = creators.split(', ')
+
# if an exact match in a preferred format occurs
if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
return self.format_results(reserveid, od_title, subtitle, series, publisher,
@@ -330,9 +338,9 @@ class OverDrive(Source):
def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
q = base_url
if ovrdrv_id is None:
- return self.overdrive_search(br, log, q, title, author)
+ return self.overdrive_search(br, log, q, title, author)
else:
- return self.overdrive_get_record(br, log, q, ovrdrv_id)
+ return self.overdrive_get_record(br, log, q, ovrdrv_id)
@@ -461,10 +469,10 @@ if __name__ == '__main__':
[
(
- {'title':'Foundation and Earth',
- 'authors':['Asimov']},
- [title_test('Foundation and Earth', exact=True),
- authors_test(['Isaac Asimov'])]
+ {'title':'The Sea Kings Daughter',
+ 'authors':['Elizabeth Peters']},
+ [title_test('The Sea Kings Daughter', exact=False),
+ authors_test(['Elizabeth Peters'])]
),
(
diff --git a/src/calibre/ebooks/metadata/sources/worker.py b/src/calibre/ebooks/metadata/sources/worker.py
new file mode 100644
index 0000000000..48f0f99584
--- /dev/null
+++ b/src/calibre/ebooks/metadata/sources/worker.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import os
+from threading import Event, Thread
+from Queue import Queue, Empty
+from io import BytesIO
+
+from calibre.utils.date import as_utc
+from calibre.ebooks.metadata.sources.identify import identify, msprefs
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.customize.ui import metadata_plugins
+from calibre.ebooks.metadata.sources.covers import (download_cover,
+ run_download)
+from calibre.ebooks.metadata.sources.base import dump_caches, load_caches
+from calibre.utils.logging import GUILog
+from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF
+
+def merge_result(oldmi, newmi, ensure_fields=None):
+ dummy = Metadata(_('Unknown'))
+ for f in msprefs['ignore_fields']:
+ if ':' in f or (ensure_fields and f in ensure_fields):
+ continue
+ setattr(newmi, f, getattr(dummy, f))
+ fields = set()
+ for plugin in metadata_plugins(['identify']):
+ fields |= plugin.touched_fields
+
+ def is_equal(x, y):
+ if hasattr(x, 'tzinfo'):
+ x = as_utc(x)
+ if hasattr(y, 'tzinfo'):
+ y = as_utc(y)
+ return x == y
+
+ for f in fields:
+ # Optimize so that set_metadata does not have to do extra work later
+ if not f.startswith('identifier:'):
+ if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
+ getattr(oldmi, f))):
+ setattr(newmi, f, getattr(dummy, f))
+
+ return newmi
+
+def main(do_identify, covers, metadata, ensure_fields, tdir):
+ os.chdir(tdir)
+ failed_ids = set()
+ failed_covers = set()
+ all_failed = True
+ log = GUILog()
+
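+ # For each book: rebuild its Metadata from the serialized OPF, optionally identify and merge metadata, optionally download a cover, and write the results as book_id.mi / book_id.cover files plus a per-book log in tdir.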
+ for book_id, mi in metadata.iteritems():
+ mi = OPF(BytesIO(mi), basedir=os.getcwdu(),
+ populate_spine=False).to_book_metadata()
+ title, authors, identifiers = mi.title, mi.authors, mi.identifiers
+ cdata = None
+ log.clear()
+
+ if do_identify:
+ results = []
+ try:
+ results = identify(log, Event(), title=title, authors=authors,
+ identifiers=identifiers)
+ except:
+ pass
+ if results:
+ all_failed = False
+ mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
+ identifiers = mi.identifiers
+ if not mi.is_null('rating'):
+ # set_metadata expects a rating out of 10
+ mi.rating *= 2
+ with open('%d.mi'%book_id, 'wb') as f:
+ f.write(metadata_to_opf(mi, default_lang='und'))
+ else:
+ log.error('Failed to download metadata for', title)
+ failed_ids.add(book_id)
+
+ if covers:
+ cdata = download_cover(log, title=title, authors=authors,
+ identifiers=identifiers)
+ if cdata is None:
+ failed_covers.add(book_id)
+ else:
+ with open('%d.cover'%book_id, 'wb') as f:
+ f.write(cdata[-1])
+ all_failed = False
+
+ with open('%d.log'%book_id, 'wb') as f:
+ f.write(log.plain_text.encode('utf-8'))
+
+ return failed_ids, failed_covers, all_failed
+
+def single_identify(title, authors, identifiers):
+ log = GUILog()
+ results = identify(log, Event(), title=title, authors=authors,
+ identifiers=identifiers)
+ return [metadata_to_opf(r) for r in results], [r.has_cached_cover_url for
+ r in results], dump_caches(), log.dump()
+
+def single_covers(title, authors, identifiers, caches, tdir):
+ os.chdir(tdir)
+ load_caches(caches)
+ log = GUILog()
+ results = Queue()
+ worker = Thread(target=run_download, args=(log, results, Event()),
+ kwargs=dict(title=title, authors=authors, identifiers=identifiers))
+ worker.daemon = True
+ worker.start()
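+ # Poll the queue while the download thread is alive; each received cover is written to disk, followed by a '.done' marker directory.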
+ while worker.is_alive():
+ try:
+ plugin, width, height, fmt, data = results.get(True, 1)
+ except Empty:
+ continue
+ else:
+ name = '%s,,%s,,%s,,%s.cover'%(plugin.name, width, height, fmt)
+ with open(name, 'wb') as f:
+ f.write(data)
+ os.mkdir(name+'.done')
+
+ return log.dump()
+
+
diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py
index 2cc7954559..34eeb78e9e 100644
--- a/src/calibre/ebooks/mobi/debug/headers.py
+++ b/src/calibre/ebooks/mobi/debug/headers.py
@@ -205,7 +205,10 @@ class EXTHHeader(object):
@property
def kf8_header_index(self):
- return self.get(121, None)
+ ans = self.get(121, None)
+ if ans == NULL_INDEX:
+ ans = None
+ return ans
def __str__(self):
ans = ['*'*20 + ' EXTH Header '+ '*'*20]
@@ -292,21 +295,21 @@ class MOBIHeader(object): # {{{
self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
self.has_exth = bool(self.exth_flags & 0x40)
- self.has_drm_data = self.length >= 174 and len(self.raw) >= 180
+ self.has_drm_data = self.length >= 174 and len(self.raw) >= 184
if self.has_drm_data:
- self.unknown3 = self.raw[132:164]
- self.drm_offset, = struct.unpack(b'>I', self.raw[164:168])
- self.drm_count, = struct.unpack(b'>I', self.raw[168:172])
- self.drm_size, = struct.unpack(b'>I', self.raw[172:176])
- self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0])
+ self.unknown3 = self.raw[132:168]
+ self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
+ struct.unpack(b'>4I', self.raw[168:184])
self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
self.has_fcis_flis = False
self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
self.extra_data_flags = 0
if self.has_extra_data_flags:
- self.unknown4 = self.raw[180:192]
- self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II',
+ self.unknown4 = self.raw[184:192]
+ self.fdst_idx, self.fdst_count = struct.unpack_from(b'>LL',
self.raw, 192)
+ if self.fdst_count <= 1:
+ self.fdst_idx = NULL_INDEX
(self.fcis_number, self.fcis_count, self.flis_number,
self.flis_count) = struct.unpack(b'>IIII',
self.raw[200:216])
@@ -324,22 +327,23 @@ class MOBIHeader(object): # {{{
self.primary_index_record, = struct.unpack(b'>I',
self.raw[244:248])
- if self.file_version >= 8:
+ if self.length >= 248:
(self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
) = struct.unpack_from(b'>4L', self.raw, 248)
self.unknown9 = self.raw[264:self.length]
- if self.meta_orth_indx != self.sect_idx:
+ if self.meta_orth_indx not in {NULL_INDEX, self.sect_idx}:
raise ValueError('KF8 header has different Meta orth and '
'section indices')
# The following are all relative to the position of the header record
# make them absolute for ease of debugging
- for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
+ self.relative_records = {'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
'meta_orth_indx', 'huffman_record_offset',
'first_non_book_record', 'datp_record_offset', 'fcis_number',
'flis_number', 'primary_index_record', 'fdst_idx',
- 'first_image_index'):
- if hasattr(self, x):
+ 'first_image_index'}
+ for x in self.relative_records:
+ if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
setattr(self, x, self.header_offset+getattr(self, x))
if self.has_exth:
@@ -352,70 +356,79 @@ class MOBIHeader(object): # {{{
def __str__(self):
ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]
+
a = ans.append
- i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x))
- ans.append('Compression: %s'%self.compression)
- ans.append('Unused: %r'%self.unused)
- ans.append('Number of text records: %d'%self.number_of_text_records)
- ans.append('Text record size: %d'%self.text_record_size)
- ans.append('Encryption: %s'%self.encryption_type)
- ans.append('Unknown: %r'%self.unknown)
- ans.append('Identifier: %r'%self.identifier)
- ans.append('Header length: %d'% self.length)
- ans.append('Type: %s'%self.type)
- ans.append('Encoding: %s'%self.encoding)
- ans.append('UID: %r'%self.uid)
- ans.append('File version: %d'%self.file_version)
- i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx)
- i('Meta Infl Index', self.meta_infl_indx)
- ans.append('Secondary index record: %d (null val: %d)'%(
- self.secondary_index_record, NULL_INDEX))
- ans.append('Reserved: %r'%self.reserved)
- ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX,
- self.first_non_book_record))
- ans.append('Full name offset: %d'%self.fullname_offset)
- ans.append('Full name length: %d bytes'%self.fullname_length)
- ans.append('Langcode: %r'%self.locale_raw)
- ans.append('Language: %s'%self.language)
- ans.append('Sub language: %s'%self.sublanguage)
- ans.append('Input language: %r'%self.input_language)
- ans.append('Output language: %r'%self.output_langauage)
- ans.append('Min version: %d'%self.min_version)
- ans.append('First Image index: %d'%self.first_image_index)
- ans.append('Huffman record offset: %d'%self.huffman_record_offset)
- ans.append('Huffman record count: %d'%self.huffman_record_count)
- ans.append('DATP record offset: %r'%self.datp_record_offset)
- ans.append('DATP record count: %r'%self.datp_record_count)
- ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
+
+ def i(d, x):
+ x = 'NULL' if x == NULL_INDEX else x
+ a('%s: %s'%(d, x))
+
+ def r(d, attr):
+ x = getattr(self, attr)
+ if attr in self.relative_records and x != NULL_INDEX:
+ a('%s: Absolute: %d Relative: %d'%(d, x, x-self.header_offset))
+ else:
+ i(d, x)
+
+ a('Compression: %s'%self.compression)
+ a('Unused: %r'%self.unused)
+ a('Number of text records: %d'%self.number_of_text_records)
+ a('Text record size: %d'%self.text_record_size)
+ a('Encryption: %s'%self.encryption_type)
+ a('Unknown: %r'%self.unknown)
+ a('Identifier: %r'%self.identifier)
+ a('Header length: %d'% self.length)
+ a('Type: %s'%self.type)
+ a('Encoding: %s'%self.encoding)
+ a('UID: %r'%self.uid)
+ a('File version: %d'%self.file_version)
+ r('Meta Orth Index', 'meta_orth_indx')
+ r('Meta Infl Index', 'meta_infl_indx')
+ r('Secondary index record', 'secondary_index_record')
+ a('Reserved: %r'%self.reserved)
+ r('First non-book record', 'first_non_book_record')
+ a('Full name offset: %d'%self.fullname_offset)
+ a('Full name length: %d bytes'%self.fullname_length)
+ a('Langcode: %r'%self.locale_raw)
+ a('Language: %s'%self.language)
+ a('Sub language: %s'%self.sublanguage)
+ a('Input language: %r'%self.input_language)
+ a('Output language: %r'%self.output_langauage)
+ a('Min version: %d'%self.min_version)
+ r('First Image index', 'first_image_index')
+ r('Huffman record offset', 'huffman_record_offset')
+ a('Huffman record count: %d'%self.huffman_record_count)
+ r('DATP record offset', 'datp_record_offset')
+ a('DATP record count: %r'%self.datp_record_count)
+ a('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
if self.has_drm_data:
- ans.append('Unknown3: %r'%self.unknown3)
- ans.append('DRM Offset: %s'%self.drm_offset)
- ans.append('DRM Count: %s'%self.drm_count)
- ans.append('DRM Size: %s'%self.drm_size)
- ans.append('DRM Flags: %r'%self.drm_flags)
+ a('Unknown3: %r'%self.unknown3)
+ r('DRM Offset', 'drm_offset')
+ a('DRM Count: %s'%self.drm_count)
+ a('DRM Size: %s'%self.drm_size)
+ a('DRM Flags: %r'%self.drm_flags)
if self.has_extra_data_flags:
- ans.append('Unknown4: %r'%self.unknown4)
- ans.append('FDST Index: %d'% self.fdst_idx)
- ans.append('FDST Count: %d'% self.fdst_count)
- ans.append('FCIS number: %d'% self.fcis_number)
- ans.append('FCIS count: %d'% self.fcis_count)
- ans.append('FLIS number: %d'% self.flis_number)
- ans.append('FLIS count: %d'% self.flis_count)
- ans.append('Unknown6: %r'% self.unknown6)
- ans.append('SRCS record index: %d'%self.srcs_record_index)
- ans.append('Number of SRCS records?: %d'%self.num_srcs_records)
- ans.append('Unknown7: %r'%self.unknown7)
- ans.append(('Extra data flags: %s (has multibyte: %s) '
+ a('Unknown4: %r'%self.unknown4)
+ r('FDST Index', 'fdst_idx')
+ a('FDST Count: %d'% self.fdst_count)
+ r('FCIS number', 'fcis_number')
+ a('FCIS count: %d'% self.fcis_count)
+ r('FLIS number', 'flis_number')
+ a('FLIS count: %d'% self.flis_count)
+ a('Unknown6: %r'% self.unknown6)
+ r('SRCS record index', 'srcs_record_index')
+ a('Number of SRCS records?: %d'%self.num_srcs_records)
+ a('Unknown7: %r'%self.unknown7)
+ a(('Extra data flags: %s (has multibyte: %s) '
'(has indexing: %s) (has uncrossable breaks: %s)')%(
bin(self.extra_data_flags), self.has_multibytes,
self.has_indexing_bytes, self.has_uncrossable_breaks ))
- ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX,
- self.primary_index_record))
- if self.file_version >= 8:
- i('Sections Index', self.sect_idx)
- i('SKEL Index', self.skel_idx)
- i('DATP Index', self.datp_idx)
- i('Other Index', self.oth_idx)
+ r('NCX index', 'primary_index_record')
+ if self.length >= 248:
+ r('Sections Index', 'sect_idx')
+ r('SKEL Index', 'skel_idx')
+ r('DATP Index', 'datp_idx')
+ r('Other Index', 'oth_idx')
if self.unknown9:
a('Unknown9: %r'%self.unknown9)
@@ -467,9 +480,15 @@ class MOBIFile(object):
if mh.file_version >= 8:
self.kf8_type = 'standalone'
elif mh.has_exth and mh.exth.kf8_header_index is not None:
- self.kf8_type = 'joint'
kf8i = mh.exth.kf8_header_index
- mh8 = MOBIHeader(self.records[kf8i], kf8i)
+ try:
+ rec = self.records[kf8i-1]
+ except IndexError:
+ pass
+ else:
+ if rec.raw == b'BOUNDARY':
+ self.kf8_type = 'joint'
+ mh8 = MOBIHeader(self.records[kf8i], kf8i)
self.mobi8_header = mh8
if 'huff' in self.mobi_header.compression.lower():
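
The detection above reduces to a simple rule: a joint MOBI 6 + KF8 container advertises the
KF8 header via EXTH record 121, and the record just before it must be the literal BOUNDARY
marker. A minimal sketch of that check, assuming `records` is a plain list of raw record
bytestrings (hypothetical helper, not the calibre API):

    def find_kf8_header(records, kf8_header_index):
        # kf8_header_index comes from EXTH record 121; None means no KF8 part
        if kf8_header_index is None:
            return None
        try:
            boundary = records[kf8_header_index - 1]
        except IndexError:
            return None  # EXTH points outside the file: treat as plain MOBI 6
        if boundary != b'BOUNDARY':
            return None  # marker missing, so not a joint file after all
        return records[kf8_header_index]  # raw record holding the KF8 MOBI header
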
@@ -530,6 +549,9 @@ class TextRecord(object): # {{{
raw = '%s : %r\n\n'%(k, v)
f.write(raw.encode('utf-8'))
+ def __len__(self):
+ return len(self.raw)
+
# }}}
diff --git a/src/calibre/ebooks/mobi/debug/index.py b/src/calibre/ebooks/mobi/debug/index.py
new file mode 100644
index 0000000000..488adef05d
--- /dev/null
+++ b/src/calibre/ebooks/mobi/debug/index.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'

+__docformat__ = 'restructuredtext en'
+
+from collections import OrderedDict, namedtuple
+
+from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+from calibre.ebooks.mobi.reader.index import (CNCX, parse_indx_header,
+ parse_tagx_section, parse_index_record, INDEX_HEADER_FIELDS)
+from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
+
+File = namedtuple('File',
+ 'file_number name divtbl_count start_position length')
+
+Elem = namedtuple('Chunk',
+ 'insert_pos toc_text file_number sequence_number start_pos '
+ 'length')
+
+GuideRef = namedtuple('GuideRef', 'type title pos_fid')
+
+def read_index(sections, idx, codec):
+ table, cncx = OrderedDict(), CNCX([], codec)
+
+ data = sections[idx].raw
+
+ indx_header = parse_indx_header(data)
+ indx_count = indx_header['count']
+
+ if indx_header['ncncx'] > 0:
+ off = idx + indx_count + 1
+ cncx_records = [x.raw for x in sections[off:off+indx_header['ncncx']]]
+ cncx = CNCX(cncx_records, codec)
+
+ tag_section_start = indx_header['tagx']
+ control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
+
+ for i in xrange(idx + 1, idx + 1 + indx_count):
+ # Index record
+ data = sections[i].raw
+ parse_index_record(table, data, control_byte_count, tags, codec,
+ indx_header['ordt_map'], strict=True)
+ return table, cncx, indx_header
+
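
For orientation, a hypothetical usage sketch of read_index() above (not part of the patch):
`records` are the debug record objects (each exposing .raw), `idx` and `codec` come from the
parsed MOBI header, and tag 3, where present, is an offset into the CNCX string table:

    table, cncx, header = read_index(records, idx, codec)
    print('%d entries of index type %r' % (header['count'], header['type']))
    for ident, tag_map in table.items():
        # each entry maps a tag number to a tuple of values
        label = cncx.get(tag_map[3][0], '') if 3 in tag_map else ''
        print(ident, label, tag_map)
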
+class Index(object):
+
+ def __init__(self, idx, records, codec):
+ self.table = self.cncx = self.header = self.records = None
+ if idx != NULL_INDEX:
+ self.table, self.cncx, self.header = read_index(records, idx, codec)
+
+ def render(self):
+ ans = ['*'*10 + ' Index Header ' + '*'*10]
+ a = ans.append
+ if self.header is not None:
+ for field in INDEX_HEADER_FIELDS:
+ a('%-12s: %r'%(field, self.header[field]))
+ ans.extend(['', ''])
+
+ if self.cncx:
+ a('*'*10 + ' CNCX ' + '*'*10)
+ for offset, val in self.cncx.iteritems():
+ a('%10s: %s'%(offset, val))
+ ans.extend(['', ''])
+
+ if self.table is not None:
+ a('*'*10 + ' %d Index Entries '%len(self.table) + '*'*10)
+ for k, v in self.table.iteritems():
+ a('%s: %r'%(k, v))
+
+ if self.records:
+ ans.extend(['', '', '*'*10 + ' Parsed Entries ' + '*'*10])
+ for f in self.records:
+ a(repr(f))
+
+ return ans + ['']
+
+ def __str__(self):
+ return '\n'.join(self.render())
+
+ def __iter__(self):
+ return iter(self.records)
+
+class SKELIndex(Index):
+
+ def __init__(self, skelidx, records, codec):
+ super(SKELIndex, self).__init__(skelidx, records, codec)
+ self.records = []
+
+ if self.table is not None:
+ for i, text in enumerate(self.table.iterkeys()):
+ tag_map = self.table[text]
+ if set(tag_map.iterkeys()) != {1, 6}:
+ raise ValueError('SKEL Index has unknown tags: %s'%
+ (set(tag_map.iterkeys())-{1,6}))
+ self.records.append(File(
+ i, # file_number
+ text, # name
+ tag_map[1][0], # divtbl_count
+ tag_map[6][0], # start_pos
+ tag_map[6][1]) # length
+ )
+
+class SECTIndex(Index):
+
+ def __init__(self, sectidx, records, codec):
+ super(SECTIndex, self).__init__(sectidx, records, codec)
+ self.records = []
+
+ if self.table is not None:
+ for i, text in enumerate(self.table.iterkeys()):
+ tag_map = self.table[text]
+ if set(tag_map.iterkeys()) != {2, 3, 4, 6}:
+ raise ValueError('Chunk Index has unknown tags: %s'%
+ (set(tag_map.iterkeys())-{2, 3, 4, 6}))
+
+ toc_text = self.cncx[tag_map[2][0]]
+ self.records.append(Elem(
+ int(text), # insert_pos
+ toc_text, # toc_text
+ tag_map[3][0], # file_number
+ tag_map[4][0], # sequence_number
+ tag_map[6][0], # start_pos
+ tag_map[6][1] # length
+ )
+ )
+
+class GuideIndex(Index):
+
+ def __init__(self, guideidx, records, codec):
+ super(GuideIndex, self).__init__(guideidx, records, codec)
+ self.records = []
+
+ if self.table is not None:
+ for i, text in enumerate(self.table.iterkeys()):
+ tag_map = self.table[text]
+ if set(tag_map.iterkeys()) not in ({1, 6}, {1, 2, 3}):
+ raise ValueError('Guide Index has unknown tags: %s'%
+ tag_map)
+
+ title = self.cncx[tag_map[1][0]]
+ self.records.append(GuideRef(
+ text,
+ title,
+ tag_map[6] if 6 in tag_map else (tag_map[2], tag_map[3])
+ )
+ )
+
+
+class NCXIndex(Index):
+
+ def __init__(self, ncxidx, records, codec):
+ super(NCXIndex, self).__init__(ncxidx, records, codec)
+ self.records = []
+
+ if self.table is not None:
+ NCXEntry = namedtuple('NCXEntry', 'index start length depth parent '
+ 'first_child last_child title pos_fid')
+
+ for num, x in enumerate(self.table.iteritems()):
+ text, tag_map = x
+ entry = e = default_entry.copy()
+ entry['name'] = text
+ entry['num'] = num
+
+ for tag in tag_fieldname_map.iterkeys():
+ fieldname, i = tag_fieldname_map[tag]
+ if tag in tag_map:
+ fieldvalue = tag_map[tag][i]
+ if tag == 6:
+ # Appears to be an idx into the KF8 elems table with an
+ # offset
+ fieldvalue = tuple(tag_map[tag])
+ entry[fieldname] = fieldvalue
+ for which, name in {3:'text', 5:'kind', 70:'description',
+ 71:'author', 72:'image_caption',
+ 73:'image_attribution'}.iteritems():
+ if tag == which:
+ entry[name] = self.cncx.get(fieldvalue,
+ default_entry[name])
+ def refindx(e, name):
+ ans = e[name]
+ if ans < 0:
+ ans = None
+ return ans
+
+ entry = NCXEntry(start=e['pos'], index=e['num'],
+ length=e['len'], depth=e['hlvl'], parent=refindx(e,
+ 'parent'), first_child=refindx(e, 'child1'),
+ last_child=refindx(e, 'childn'), title=e['text'],
+ pos_fid=e['pos_fid'])
+ self.records.append(entry)
+
+
diff --git a/src/calibre/ebooks/mobi/debug/mobi6.py b/src/calibre/ebooks/mobi/debug/mobi6.py
index 640f58c661..fb5674653c 100644
--- a/src/calibre/ebooks/mobi/debug/mobi6.py
+++ b/src/calibre/ebooks/mobi/debug/mobi6.py
@@ -393,7 +393,7 @@ class IndexRecord(object): # {{{
parse_index_record(table, record.raw,
index_header.tagx_control_byte_count, tags,
- index_header.index_encoding, strict=True)
+ index_header.index_encoding, {}, strict=True)
self.indices = []
diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py
index e4a92ee95c..a03205edd7 100644
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@@ -2,14 +2,68 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
+from future_builtins import map
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import sys, os
+import sys, os, imghdr, struct, textwrap
+from itertools import izip
+from calibre import CurrentDir
from calibre.ebooks.mobi.debug.headers import TextRecord
+from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
+ GuideIndex)
+from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
+from calibre.ebooks.mobi.debug import format_bytes
+from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+
+class FDST(object):
+
+ def __init__(self, raw):
+ if raw[:4] != b'FDST':
+ raise ValueError('KF8 does not have a valid FDST record')
+ self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4)
+ if self.sec_off != 12:
+ raise ValueError('FDST record has unknown extra fields')
+ secf = b'>%dL' % (self.num_sections*2)
+ secs = struct.unpack_from(secf, raw, self.sec_off)
+ rest = raw[self.sec_off+struct.calcsize(secf):]
+ if rest:
+ raise ValueError('FDST record has trailing data: '
+ '%s'%format_bytes(rest))
+ self.sections = tuple(izip(secs[::2], secs[1::2]))
+
+ def __str__(self):
+ ans = ['FDST record']
+ a = lambda k, v:ans.append('%s: %s'%(k, v))
+ a('Offset to sections', self.sec_off)
+ a('Number of section records', self.num_sections)
+ ans.append('**** %d Sections ****'% len(self.sections))
+ for sec in self.sections:
+ ans.append('Start: %20d End: %d'%sec)
+
+ return '\n'.join(ans)
+
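
A quick worked example (hypothetical bytes, not from a real book): an FDST record with two
flow sections round-trips through the class above as follows.

    import struct
    raw = (b'FDST' + struct.pack(b'>LL', 12, 2)          # sec_off=12, num_sections=2
           + struct.pack(b'>4L', 0, 4096, 4096, 5000))   # (start, end) pairs
    fdst = FDST(raw)
    assert fdst.sections == ((0, 4096), (4096, 5000))
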
+class File(object):
+
+ def __init__(self, skel, skeleton, text, first_aid, sections):
+ self.name = 'part%04d'%skel.file_number
+ self.skeleton, self.text, self.first_aid = skeleton, text, first_aid
+ self.sections = sections
+
+ def dump(self, ddir):
+ with open(os.path.join(ddir, self.name + '.html'), 'wb') as f:
+ f.write(self.text)
+ base = os.path.join(ddir, self.name + '-parts')
+ os.mkdir(base)
+ with CurrentDir(base):
+ with open('skeleton.html', 'wb') as f:
+ f.write(self.skeleton)
+ for i, text in enumerate(self.sections):
+ with open('sect-%04d.html'%i, 'wb') as f:
+ f.write(text)
class MOBIFile(object):
@@ -30,6 +84,12 @@ class MOBIFile(object):
first_text_record+offset+h8.number_of_text_records])]
self.raw_text = b''.join(r.raw for r in self.text_records)
+ self.header = self.mf.mobi8_header
+ self.extract_resources()
+ self.read_fdst()
+ self.read_indices()
+ self.build_files()
+ self.read_tbs()
def print_header(self, f=sys.stdout):
print (str(self.mf.palmdb).encode('utf-8'), file=f)
@@ -41,6 +101,148 @@ class MOBIFile(object):
print (file=f)
print (str(self.mf.mobi8_header).encode('utf-8'), file=f)
+ def read_fdst(self):
+ self.fdst = None
+
+ if self.header.fdst_idx != NULL_INDEX:
+ idx = self.header.fdst_idx
+ self.fdst = FDST(self.mf.records[idx].raw)
+ if self.fdst.num_sections != self.header.fdst_count:
+ raise ValueError('KF8 Header contains invalid FDST count')
+
+ def read_indices(self):
+ self.skel_index = SKELIndex(self.header.skel_idx, self.mf.records,
+ self.header.encoding)
+ self.sect_index = SECTIndex(self.header.sect_idx, self.mf.records,
+ self.header.encoding)
+ self.ncx_index = NCXIndex(self.header.primary_index_record,
+ self.mf.records, self.header.encoding)
+ self.guide_index = GuideIndex(self.header.oth_idx, self.mf.records,
+ self.header.encoding)
+
+ def build_files(self):
+ text = self.raw_text
+ self.files = []
+ for skel in self.skel_index.records:
+ sects = [x for x in self.sect_index.records if x.file_number
+ == skel.file_number]
+ skeleton = text[skel.start_position:skel.start_position+skel.length]
+ ftext = skeleton
+ first_aid = sects[0].toc_text
+ sections = []
+
+ for sect in sects:
+ start_pos = skel.start_position + skel.length + sect.start_pos
+ sect_text = text[start_pos:start_pos+sect.length]
+ insert_pos = sect.insert_pos - skel.start_position
+ ftext = ftext[:insert_pos] + sect_text + ftext[insert_pos:]
+ sections.append(sect_text)
+
+ self.files.append(File(skel, skeleton, ftext, first_aid, sections))
+
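
The assembly rule in build_files() is easiest to see on a toy example (hypothetical values;
field names follow the File/Elem namedtuples from debug/index.py). Chunk bytes live after the
skeleton in the raw text, and insert_pos is an absolute position in the assembled part:

    raw_text   = b'<html><body></body></html>' + b'<p>Hello</p>'
    skeleton   = raw_text[0:26]            # skel.start_position = 0, skel.length = 26
    chunk      = raw_text[26 + 0:26 + 12]  # starts at skel end + sect.start_pos, sect.length = 12
    insert_pos = 12 - 0                    # sect.insert_pos - skel.start_position
    assembled  = skeleton[:insert_pos] + chunk + skeleton[insert_pos:]
    assert assembled == b'<html><body><p>Hello</p></body></html>'
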
+ def dump_flows(self, ddir):
+ if self.fdst is None:
+ raise ValueError('This MOBI file has no FDST record')
+ for i, x in enumerate(self.fdst.sections):
+ start, end = x
+ raw = self.raw_text[start:end]
+ with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
+ f.write(raw)
+
+ def extract_resources(self):
+ self.resource_map = []
+ known_types = {b'FLIS', b'FCIS', b'SRCS',
+ b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP',
+ b'AUDI', b'VIDE'}
+
+ for i, rec in enumerate(self.resource_records):
+ sig = rec.raw[:4]
+ payload = rec.raw
+ ext = 'dat'
+ prefix = 'binary'
+ suffix = ''
+ if sig in {b'HUFF', b'CDIC', b'INDX'}: continue
+ # TODO: Ignore CNCX records as well
+ if sig == b'FONT':
+ font = read_font_record(rec.raw)
+ if font['err']:
+ raise ValueError('Failed to read font record: %s Headers: %s'%(
+ font['err'], font['headers']))
+ payload = (font['font_data'] if font['font_data'] else
+ font['raw_data'])
+ prefix, ext = 'fonts', font['ext']
+ elif sig not in known_types:
+ q = imghdr.what(None, rec.raw)
+ if q:
+ prefix, ext = 'images', q
+
+ if prefix == 'binary':
+ if sig == b'\xe9\x8e\r\n':
+ suffix = '-EOF'
+ elif sig in known_types:
+ suffix = '-' + sig.decode('ascii')
+
+ self.resource_map.append(('%s/%06d%s.%s'%(prefix, i, suffix, ext),
+ payload))
+
+ def read_tbs(self):
+ from calibre.ebooks.mobi.writer8.tbs import (Entry, DOC,
+ collect_indexing_data, encode_strands_as_sequences,
+ sequences_to_bytes)
+ entry_map = []
+ for index in self.ncx_index:
+ vals = list(index)[:-1] + [None, None, None, None]
+ entry_map.append(Entry(*vals))
+
+
+ indexing_data = collect_indexing_data(entry_map, list(map(len,
+ self.text_records)))
+ self.indexing_data = [DOC + '\n' +textwrap.dedent('''\
+ Index Entry lines are of the form:
+ depth:index_number [action] parent (index_num-parent) Geometry
+
+ Where Geometry is the start and end of the index entry w.r.t
+ the start of the text record.
+
+ ''')]
+ for i, strands in enumerate(indexing_data):
+ rec = self.text_records[i]
+ tbs_bytes = rec.trailing_data.get('indexing', b'')
+ desc = ['Record #%d'%i]
+ for s, strand in enumerate(strands):
+ desc.append('Strand %d'%s)
+ for entries in strand.itervalues():
+ for e in entries:
+ desc.append(
+ ' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'%(
+ e.depth * (' ') + '- ', e.index, e.action, e.parent,
+ e.index-(e.parent or 0), e.start-i*RECORD_SIZE,
+ e.start+e.length-i*RECORD_SIZE))
+ desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
+ flag_sz = 3
+ sequences = []
+ otbs = tbs_bytes
+ while tbs_bytes:
+ try:
+ val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz)
+ except:
+ break
+ flag_sz = 4
+ tbs_bytes = tbs_bytes[consumed:]
+ extra = {bin(k):v for k, v in extra.iteritems()}
+ sequences.append((val, extra))
+ for j, seq in enumerate(sequences):
+ desc.append('Sequence #%d: %r %r'%(j, seq[0], seq[1]))
+ if tbs_bytes:
+ desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
+ calculated_sequences = encode_strands_as_sequences(strands)
+ calculated_bytes = sequences_to_bytes(calculated_sequences)
+ if calculated_bytes != otbs:
+ print ('WARNING: TBS mismatch for record %d'%i)
+ desc.append('WARNING: TBS mismatch!')
+ desc.append('Calculated sequences: %r'%calculated_sequences)
+ desc.append('')
+ self.indexing_data.append('\n'.join(desc))
def inspect_mobi(mobi_file, ddir):
f = MOBIFile(mobi_file)
@@ -51,12 +253,38 @@ def inspect_mobi(mobi_file, ddir):
with open(alltext, 'wb') as of:
of.write(f.raw_text)
- for tdir, attr in [('text_records', 'text_records'), ('images',
- 'image_records'), ('binary', 'binary_records'), ('font',
- 'font_records')]:
- tdir = os.path.join(ddir, tdir)
- os.mkdir(tdir)
- for rec in getattr(f, attr, []):
- rec.dump(tdir)
+ for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows'):
+ os.mkdir(os.path.join(ddir, x))
+
+ for rec in f.text_records:
+ rec.dump(os.path.join(ddir, 'text_records'))
+
+ for href, payload in f.resource_map:
+ with open(os.path.join(ddir, href), 'wb') as fo:
+ fo.write(payload)
+
+ if f.fdst:
+ with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
+ fo.write(str(f.fdst).encode('utf-8'))
+
+ with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
+ fo.write(str(f.skel_index).encode('utf-8'))
+
+ with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
+ fo.write(str(f.sect_index).encode('utf-8'))
+
+ with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
+ fo.write(str(f.ncx_index).encode('utf-8'))
+
+ with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
+ fo.write(str(f.guide_index).encode('utf-8'))
+
+ with open(os.path.join(ddir, 'tbs.txt'), 'wb') as fo:
+ fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
+
+ for part in f.files:
+ part.dump(os.path.join(ddir, 'files'))
+
+ f.dump_flows(os.path.join(ddir, 'flows'))
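
With the additions above, a KF8 debug dump directory ends up looking roughly like this
(names taken from the code; actual contents depend on the book):

    ddir/
      text_records/            one file per raw text record
      images/ fonts/ binary/   resources named by record number plus signature suffix
      files/                   part0000.html ... and part0000-parts/ (skeleton.html, sect-0000.html ...)
      flows/                   flow0000.txt ... one file per FDST section
      fdst.record skel.record chunks.record ncx.record guide.record tbs.txt
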
diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py
index 7cda4b0a57..d276689224 100644
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@@ -10,7 +10,7 @@ import copy
import re
from lxml import etree
from calibre.ebooks.oeb.base import namespace, barename
-from calibre.ebooks.oeb.base import XHTML, XHTML_NS, OEB_DOCS, urlnormalize
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
from calibre.utils.magick.draw import identify_data
@@ -109,26 +109,8 @@ class MobiMLizer(object):
self.profile = profile = context.dest
self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
- self.remove_html_cover()
self.mobimlize_spine()
- def remove_html_cover(self):
- oeb = self.oeb
- if not oeb.metadata.cover \
- or 'cover' not in oeb.guide:
- return
- href = oeb.guide['cover'].href
- del oeb.guide['cover']
- item = oeb.manifest.hrefs[href]
- if item.spine_position is not None:
- self.log.warn('Found an HTML cover,', item.href, 'removing it.',
- 'If you find some content missing from the output MOBI, it '
- 'is because you misidentified the HTML cover in the input '
- 'document')
- oeb.spine.remove(item)
- if item.media_type in OEB_DOCS:
- self.oeb.manifest.remove(item)
-
def mobimlize_spine(self):
'Iterate over the spine and convert it to MOBIML'
for item in self.oeb.spine:
@@ -473,7 +455,7 @@ class MobiMLizer(object):
if tag in TABLE_TAGS and self.ignore_tables:
tag = 'span' if tag == 'td' else 'div'
- if tag == 'table':
+ if tag in ('table', 'td', 'tr'):
col = style.backgroundColor
if col:
elem.set('bgcolor', col)
diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py
index 06d349d5de..0162fddda7 100644
--- a/src/calibre/ebooks/mobi/reader/headers.py
+++ b/src/calibre/ebooks/mobi/reader/headers.py
@@ -11,7 +11,7 @@ import struct, re, os
from calibre import replace_entities
from calibre.utils.date import parse_date
from calibre.ebooks.mobi import MobiError
-from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
NULL_INDEX = 0xffffffff
@@ -46,7 +46,10 @@ class EXTHHeader(object): # {{{
self.thumbnail_offset, = struct.unpack('>L', content)
elif idx == 501:
# cdetype
- pass
+ if content == b'EBSP':
+ if not self.mi.tags:
+ self.mi.tags = []
+ self.mi.tags.append(_('Sample Book'))
elif idx == 502:
# last update time
pass
@@ -75,10 +78,14 @@ class EXTHHeader(object): # {{{
self.mi.author_sort = au.strip()
elif idx == 101:
self.mi.publisher = content.decode(codec, 'ignore').strip()
+ if self.mi.publisher in {'Unknown', _('Unknown')}:
+ self.mi.publisher = None
elif idx == 103:
self.mi.comments = content.decode(codec, 'ignore')
elif idx == 104:
- self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
+ raw = check_isbn(content.decode(codec, 'ignore').strip().replace('-', ''))
+ if raw:
+ self.mi.isbn = raw
elif idx == 105:
if not self.mi.tags:
self.mi.tags = []
@@ -92,12 +99,24 @@ class EXTHHeader(object): # {{{
pass
elif idx == 108:
self.mi.book_producer = content.decode(codec, 'ignore').strip()
+ elif idx == 112: # dc:source set in some EBSP amazon samples
+ try:
+ content = content.decode(codec).strip()
+ isig = 'urn:isbn:'
+ if content.lower().startswith(isig):
+ raw = check_isbn(content[len(isig):])
+ if raw and not self.mi.isbn:
+ self.mi.isbn = raw
+ except:
+ pass
elif idx == 113:
pass # ASIN or UUID
elif idx == 116:
self.start_offset, = struct.unpack(b'>L', content)
elif idx == 121:
self.kf8_header, = struct.unpack(b'>L', content)
+ if self.kf8_header == NULL_INDEX:
+ self.kf8_header = None
#else:
# print 'unhandled metadata record', idx, repr(content)
# }}}
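
A hypothetical illustration of the new ISBN handling: check_isbn() validates the check digit,
so a malformed EXTH 104 value no longer overwrites mi.isbn, and an urn:isbn: dc:source
(EXTH 112) only fills mi.isbn when nothing better was found.

    from calibre.ebooks.metadata import check_isbn

    print(check_isbn('9780306406157'))    # -> '9780306406157' (valid check digit)
    print(check_isbn('9780306406158'))    # -> None, so mi.isbn is left untouched

    content = 'urn:isbn:9780306406157'    # typical EXTH 112 value in EBSP samples
    isig = 'urn:isbn:'
    if content.lower().startswith(isig):
        isbn = check_isbn(content[len(isig):])   # '9780306406157'
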
diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py
index dd85b5a5cb..c732d8862e 100644
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@@ -15,6 +15,12 @@ from calibre.ebooks.mobi.utils import (decint, count_set_bits,
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
+INDEX_HEADER_FIELDS = (
+ 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
+ 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
+ ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
+ 'ordt1', 'ordt2', 'tagx')
+
class InvalidFile(ValueError):
pass
@@ -36,13 +42,40 @@ def format_bytes(byts):
def parse_indx_header(data):
check_signature(data, b'INDX')
- words = (
- 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
- 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
- )
+ words = INDEX_HEADER_FIELDS
num = len(words)
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
- return dict(zip(words, values))
+ ans = dict(zip(words, values))
+ ordt1, ordt2 = ans['ordt1'], ans['ordt2']
+ ans['ordt1_raw'], ans['ordt2_raw'] = [], []
+ ans['ordt_map'] = ''
+
+ if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
+ # I don't know what this is, but using it seems to be unnecessary, so
+ # just leave it as the raw bytestring
+ ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
+ if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
+ ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
+ if ans['code'] == 65002:
+ # This appears to be EBCDIC-UTF (65002) encoded. I can't be
+ # bothered to write a decoder for this (see
+ # http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
+ # Instead, we use a weird hack that seems to do the trick for all
+ # the books with this type of ORDT record that I have come across.
+ # Some EBSP book samples in KF8 format from Amazon have this type
+ # of encoding.
+ # Basically we try to interpret every second byte as a printable
+ # ascii character. If we cannot, we map to the ? char.
+
+ parsed = bytearray(ans['oentries'])
+ for i in xrange(0, 2*ans['oentries'], 2):
+ parsed[i//2] = raw[i+1] if 0x20 < raw[i+1] < 0x7f else ord(b'?')
+ ans['ordt_map'] = bytes(parsed).decode('ascii')
+ else:
+ ans['ordt_map'] = '?'*ans['oentries']
+
+ return ans
+
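
A tiny illustration of the ORDT2 fallback above (hypothetical bytes): every second byte is
kept when it is printable ASCII, anything else becomes '?'.

    raw = bytearray(b'\x00A\x00i\x00d\x00\x07')   # oentries = 4, so 8 raw bytes
    parsed = bytearray(4)
    for i in range(0, 8, 2):
        parsed[i//2] = raw[i+1] if 0x20 < raw[i+1] < 0x7f else ord(b'?')
    print(bytes(parsed).decode('ascii'))          # -> 'Aid?'
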
class CNCX(object): # {{{
@@ -78,6 +111,13 @@ class CNCX(object): # {{{
def get(self, offset, default=None):
return self.records.get(offset, default)
+
+ def __bool__(self):
+ return bool(self.records)
+ __nonzero__ = __bool__
+
+ def iteritems(self):
+ return self.records.iteritems()
# }}}
def parse_tagx_section(data):
@@ -163,7 +203,7 @@ def get_tag_map(control_byte_count, tagx, data, strict=False):
return ans
def parse_index_record(table, data, control_byte_count, tags, codec,
- strict=False):
+ ordt_map, strict=False):
header = parse_indx_header(data)
idxt_pos = header['start']
if data[idxt_pos:idxt_pos+4] != b'IDXT':
@@ -184,12 +224,11 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
for j in xrange(entry_count):
start, end = idx_positions[j:j+2]
rec = data[start:end]
- ident, consumed = decode_string(rec, codec=codec)
+ ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
rec = rec[consumed:]
tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
table[ident] = tag_map
-
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
@@ -203,12 +242,13 @@ def read_index(sections, idx, codec):
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
- tag_section_start = indx_header['len']
+ tag_section_start = indx_header['tagx']
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
for i in xrange(idx + 1, idx + 1 + indx_count):
# Index record
data = sections[i][0]
- parse_index_record(table, data, control_byte_count, tags, codec)
+ parse_index_record(table, data, control_byte_count, tags, codec,
+ indx_header['ordt_map'])
return table, cncx
diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py
index 8bb7f211f3..8a06bc346a 100644
--- a/src/calibre/ebooks/mobi/reader/markup.py
+++ b/src/calibre/ebooks/mobi/reader/markup.py
@@ -223,15 +223,15 @@ def insert_images_into_markup(parts, resource_map, log):
# Handle any embedded raster images links in the xhtml text
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
- img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
+ img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''')
+
+ style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''',
+ re.IGNORECASE)
+
for i in xrange(len(parts)):
part = parts[i]
- #[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
-
- # links to raster image files
- # image_pattern
srcpieces = img_pattern.split(part)
- for j in range(1, len(srcpieces), 2):
+ for j in xrange(1, len(srcpieces), 2):
tag = srcpieces[j]
+ if tag.startswith('<im'):
diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py
index ec7166ebb0..dcf2f998b2 100644
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@@ -9,14 +9,20 @@ __docformat__ = 'restructuredtext en'
import struct, re, os, imghdr
from collections import namedtuple
-from itertools import repeat
+from itertools import repeat, izip
+from urlparse import urldefrag
+
+from lxml import etree
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
+from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import read_font_record
+from calibre.ebooks.oeb.parse_utils import parse_html
+from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
Part = namedtuple('Part',
'num type filename start end aid')
@@ -65,16 +71,16 @@ class Mobi8Reader(object):
return self.write_opf(guide, ncx, spine, resource_map)
def read_indices(self):
- self.flow_table = (0, NULL_INDEX)
+ self.flow_table = ()
if self.header.fdstidx != NULL_INDEX:
header = self.kf8_sections[self.header.fdstidx][0]
if header[:4] != b'FDST':
raise ValueError('KF8 does not have a valid FDST record')
- num_sections, = struct.unpack_from(b'>L', header, 0x08)
- sections = header[0x0c:]
- self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
- sections, 0)[::2] + (NULL_INDEX,)
+ sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
+ secs = struct.unpack_from(b'>%dL' % (num_sections*2),
+ header, sec_start)
+ self.flow_table = tuple(izip(secs[::2], secs[1::2]))
self.files = []
if self.header.skelidx != NULL_INDEX:
@@ -103,7 +109,7 @@ class Mobi8Reader(object):
table, cncx = read_index(self.kf8_sections, self.header.othidx,
self.header.codec)
Item = namedtuple('Item',
- 'type title div_frag_num')
+ 'type title pos_fid')
for i, ref_type in enumerate(table.iterkeys()):
tag_map = table[ref_type]
@@ -113,7 +119,7 @@ class Mobi8Reader(object):
if 3 in tag_map.keys():
fileno = tag_map[3][0]
if 6 in tag_map.keys():
- fileno = tag_map[6][0]
+ fileno = tag_map[6]
self.guide.append(Item(ref_type.decode(self.header.codec),
title, fileno))
@@ -121,13 +127,10 @@ class Mobi8Reader(object):
raw_ml = self.mobi6_reader.mobi_html
self.flows = []
self.flowinfo = []
+ ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]
# now split the raw_ml into its flow pieces
- for j in xrange(0, len(self.flow_table)-1):
- start = self.flow_table[j]
- end = self.flow_table[j+1]
- if end == NULL_INDEX:
- end = len(raw_ml)
+ for start, end in ft:
self.flows.append(raw_ml[start:end])
# the first piece represents the xhtml text
@@ -284,19 +287,24 @@ class Mobi8Reader(object):
def create_guide(self):
guide = Guide()
- for ref_type, ref_title, fileno in self.guide:
- elem = self.elems[fileno]
- fi = self.get_file_info(elem.insert_pos)
- idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
- linktgt = fi.filename
+ has_start = False
+ for ref_type, ref_title, pos_fid in self.guide:
+ try:
+ if len(pos_fid) != 2:
+ continue
+ except TypeError:
+ continue # thumbnailstandard record, ignore it
+ linktgt, idtext = self.get_id_tag_by_pos_fid(*pos_fid)
if idtext:
linktgt += b'#' + idtext
- g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
+ g = Guide.Reference(linktgt, os.getcwdu())
g.title, g.type = ref_title, ref_type
+ if g.title == 'start' or g.type == 'text':
+ has_start = True
guide.append(g)
so = self.header.exth.start_offset
- if so not in {None, NULL_INDEX}:
+ if so not in {None, NULL_INDEX} and not has_start:
fi = self.get_file_info(so)
if fi.filename is not None:
idtext = self.get_id_tag(so).decode(self.header.codec)
@@ -379,6 +387,19 @@ class Mobi8Reader(object):
len(resource_map)):
mi.cover = resource_map[self.cover_offset]
+ if len(list(toc)) < 2:
+ self.log.warn('KF8 has no metadata Table of Contents')
+
+ for ref in guide:
+ if ref.type == 'toc':
+ href = ref.href()
+ href, frag = urldefrag(href)
+ if os.path.exists(href.replace('/', os.sep)):
+ try:
+ toc = self.read_inline_toc(href, frag)
+ except:
+ self.log.exception('Failed to read inline ToC')
+
opf = OPFCreator(os.getcwdu(), mi)
opf.guide = guide
@@ -393,4 +414,70 @@ class Mobi8Reader(object):
opf.render(of, ncx, 'toc.ncx')
return 'metadata.opf'
+ def read_inline_toc(self, href, frag):
+ ans = TOC()
+ base_href = '/'.join(href.split('/')[:-1])
+ with open(href.replace('/', os.sep), 'rb') as f:
+ raw = f.read().decode(self.header.codec)
+ root = parse_html(raw, log=self.log)
+ body = XPath('//h:body')(root)
+ reached = False
+ if body:
+ start = body[0]
+ else:
+ start = None
+ reached = True
+ if frag:
+ elems = XPath('//*[@id="%s"]'%frag)
+ if elems:
+ start = elems[0]
+
+ def node_depth(elem):
+ ans = 0
+ parent = elem.getparent()
+ while parent is not None:
+ parent = parent.getparent()
+ ans += 1
+ return ans
+
+ # Layer the ToC based on nesting order in the source HTML
+ current_depth = None
+ parent = ans
+ seen = set()
+ links = []
+ for elem in root.iterdescendants(etree.Element):
+ if reached and elem.tag == XHTML('a') and elem.get('href',
+ False):
+ href = elem.get('href')
+ href, frag = urldefrag(href)
+ href = base_href + '/' + href
+ text = xml2text(elem).strip()
+ if (text, href, frag) in seen:
+ continue
+ seen.add((text, href, frag))
+ links.append((text, href, frag, node_depth(elem)))
+ elif elem is start:
+ reached = True
+
+ depths = sorted(set(x[-1] for x in links))
+ depth_map = {x:i for i, x in enumerate(depths)}
+ for text, href, frag, depth in links:
+ depth = depth_map[depth]
+ if current_depth is None:
+ current_depth = 0
+ parent.add_item(href, frag, text)
+ elif current_depth == depth:
+ parent.add_item(href, frag, text)
+ elif current_depth < depth:
+ parent = parent[-1] if len(parent) > 0 else parent
+ parent.add_item(href, frag, text)
+ current_depth += 1
+ else:
+ delta = current_depth - depth
+ while delta > 0 and parent.parent is not None:
+ parent = parent.parent
+ delta -= 1
+ parent.add_item(href, frag, text)
+ current_depth = depth
+ return ans
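
The depth normalisation at the end of read_inline_toc() deserves a small worked example
(hypothetical depths): raw HTML nesting depths of the links are remapped to consecutive ToC
levels before items are added.

    link_depths = [5, 7, 7, 9, 7]                 # node_depth() of each link, in document order
    depths = sorted(set(link_depths))             # [5, 7, 9]
    depth_map = {x: i for i, x in enumerate(depths)}
    print([depth_map[d] for d in link_depths])    # -> [0, 1, 1, 2, 1] ToC nesting levels
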
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index 4c1e52e119..ae8e583a1b 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -7,18 +7,22 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import struct, string, imghdr, zlib
+import struct, string, imghdr, zlib, os
from collections import OrderedDict
+from io import BytesIO
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024
+RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
-def decode_string(raw, codec='utf-8'):
+def decode_string(raw, codec='utf-8', ordt_map=''):
length, = struct.unpack(b'>B', raw[0])
raw = raw[1:1+length]
consumed = length+1
+ if ordt_map:
+ return ''.join(ordt_map[ord(x)] for x in raw), consumed
return raw.decode(codec), consumed
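
Two hypothetical calls showing the new ordt_map parameter; the leading byte of an index
identifier is its length (Python 2 byte-string semantics, as in the rest of this module):

    print(decode_string(b'\x03abc'))                      # -> ('abc', 4), plain codec decode
    print(decode_string(b'\x02\x00\x01', ordt_map='XY'))  # -> ('XY', 3), bytes index into the map
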
def decode_hex_number(raw, codec='utf-8'):
@@ -362,15 +366,17 @@ def count_set_bits(num):
num >>= 1
return ans
-def to_base(num, base=32):
+def to_base(num, base=32, min_num_digits=None):
digits = string.digits + string.ascii_uppercase
sign = 1 if num >= 0 else -1
- if num == 0: return '0'
+ if num == 0: return ('0' if min_num_digits is None else '0'*min_num_digits)
num *= sign
ans = []
while num:
ans.append(digits[(num % base)])
num //= base
+ if min_num_digits is not None and len(ans) < min_num_digits:
+ ans.extend('0'*(min_num_digits - len(ans)))
if sign < 0:
ans.append('-')
ans.reverse()
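
A quick check of the new min_num_digits behaviour (digits run 0-9 then A-V for base 32):

    print(to_base(100, base=32))                   # -> '34'   (3*32 + 4)
    print(to_base(4, base=32, min_num_digits=4))   # -> '0004' zero padded on the left
    print(to_base(0, base=32, min_num_digits=2))   # -> '00'
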
@@ -386,27 +392,8 @@ def mobify_image(data):
data = im.export('gif')
return data
-def read_zlib_header(header):
- header = bytearray(header)
- # See sec 2.2 of RFC 1950 for the zlib stream format
- # http://www.ietf.org/rfc/rfc1950.txt
- if (header[0]*256 + header[1])%31 != 0:
- return None, 'Bad zlib header, FCHECK failed'
-
- cmf = header[0] & 0b1111
- cinfo = header[0] >> 4
- if cmf != 8:
- return None, 'Unknown zlib compression method: %d'%cmf
- if cinfo > 7:
- return None, 'Invalid CINFO field in zlib header: %d'%cinfo
- fdict = (header[1]&0b10000)>>5
- if fdict != 0:
- return None, 'FDICT based zlib compression not supported'
- wbits = cinfo + 8
- return wbits, None
-
-
-def read_font_record(data, extent=1040): # {{{
+# Font records {{{
+def read_font_record(data, extent=1040):
'''
Return the font encoded in the MOBI FONT record represented by data.
The return value is a dict with fields raw_data, font_data, err, ext,
@@ -464,15 +451,8 @@ def read_font_record(data, extent=1040): # {{{
if flags & 0b1:
# ZLIB compressed data
- wbits, err = read_zlib_header(font_data[:2])
- if err is not None:
- ans['err'] = err
- return ans
- adler32, = struct.unpack_from(b'>I', font_data, len(font_data) - 4)
try:
- # remove two bytes of zlib header and 4 bytes of trailing checksum
- # negative wbits indicates no standard gzip header
- font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
+ font_data = zlib.decompress(font_data)
except Exception as e:
ans['err'] = 'Failed to zlib decompress font data (%s)'%e
return ans
@@ -481,23 +461,146 @@ def read_font_record(data, extent=1040): # {{{
ans['err'] = 'Uncompressed font size mismatch'
return ans
- if False:
- # For some reason these almost never match, probably Amazon has a
- # buggy Adler32 implementation
- sig = (zlib.adler32(font_data) & 0xffffffff)
- if sig != adler32:
- ans['err'] = ('Adler checksum did not match. Stored: %d '
- 'Calculated: %d')%(adler32, sig)
- return ans
-
ans['font_data'] = font_data
sig = font_data[:4]
ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
else 'otf' if sig == b'OTTO' else 'dat')
return ans
+
+def write_font_record(data, obfuscate=True, compress=True):
+ '''
+ Write the ttf/otf font represented by data into a font record. See
+ read_font_record() for details on the format of the record.
+ '''
+
+ flags = 0
+ key_len = 20
+ usize = len(data)
+ xor_key = b''
+ if compress:
+ flags |= 0b1
+ data = zlib.compress(data, 9)
+ if obfuscate:
+ flags |= 0b10
+ xor_key = os.urandom(key_len)
+ key = bytearray(xor_key)
+ data = bytearray(data)
+ for i in xrange(1040):
+ data[i] ^= key[i%key_len]
+ data = bytes(data)
+
+ key_start = struct.calcsize(b'>5L') + 4
+ data_start = key_start + len(xor_key)
+
+ header = b'FONT' + struct.pack(b'>5L', usize, flags, data_start,
+ len(xor_key), key_start)
+
+ return header + xor_key + data
+
# }}}
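
A hypothetical round trip through the two font record functions. The payload here is random,
hence incompressible, which keeps the compressed data longer than the 1040 bytes that get
XOR-obfuscated; real TTF/OTF fonts of normal size behave the same way.

    import os
    payload = b'\x00\x01\x00\x00' + os.urandom(4096)   # fake sfnt-style blob
    rec = write_font_record(payload)                    # compressed and obfuscated
    font = read_font_record(rec)
    assert font['err'] is None
    assert font['font_data'] == payload
    assert font['ext'] == 'ttf'                         # signature b'\x00\x01\x00\x00'
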
+def create_text_record(text):
+ '''
+ Return a Palmdoc record of size RECORD_SIZE from the text file object.
+ In case the record ends in the middle of a multibyte character return
+ the overlap as well.
+ Returns data, overlap: where both are byte strings. overlap is the
+ extra bytes needed to complete the truncated multibyte character.
+ '''
+ opos = text.tell()
+ text.seek(0, 2)
+ # npos is the position of the next record
+ npos = min((opos + RECORD_SIZE, text.tell()))
+ # Number of bytes from the next record needed to complete the last
+ # character in this record
+ extra = 0
+ last = b''
+ while not last.decode('utf-8', 'ignore'):
+ # last contains no valid utf-8 characters
+ size = len(last) + 1
+ text.seek(npos - size)
+ last = text.read(size)
+
+ # last now has one valid utf-8 char and possibly some bytes that belong
+ # to a truncated char
+
+ try:
+ last.decode('utf-8', 'strict')
+ except UnicodeDecodeError:
+ # There are some truncated bytes in last
+ prev = len(last)
+ while True:
+ text.seek(npos - prev)
+ last = text.read(len(last) + 1)
+ try:
+ last.decode('utf-8')
+ except UnicodeDecodeError:
+ pass
+ else:
+ break
+ extra = len(last) - prev
+
+ text.seek(opos)
+ data = text.read(RECORD_SIZE)
+ overlap = text.read(extra)
+ text.seek(npos)
+
+ return data, overlap
+
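
A hypothetical illustration of the overlap handling in create_text_record(): a 3-byte UTF-8
character (the euro sign) straddling the RECORD_SIZE boundary is completed by the returned
overlap bytes.

    from io import BytesIO

    text = BytesIO(b'a' * (RECORD_SIZE - 1) + u'\u20ac'.encode('utf-8') + b'tail')
    data, overlap = create_text_record(text)
    assert len(data) == RECORD_SIZE and len(overlap) == 2
    assert (data + overlap).decode('utf-8').endswith(u'\u20ac')
    assert text.tell() == RECORD_SIZE   # positioned at the start of the next record
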
+class CNCX(object): # {{{
+
+ '''
+ Create the CNCX records. These are records containing all the strings from
+ an index. Each record is of the form: <vwi string size><utf-8 encoded
+ string>
+ '''
+
+ MAX_STRING_LENGTH = 500
+
+ def __init__(self, strings=()):
+ self.strings = OrderedDict((s, 0) for s in strings)
+
+ self.records = []
+ offset = 0
+ buf = BytesIO()
+ for key in tuple(self.strings.iterkeys()):
+ utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
+ l = len(utf8)
+ sz_bytes = encint(l)
+ raw = sz_bytes + utf8
+ if 0xfbf8 - buf.tell() < 6 + len(raw):
+ # Records in PDB files cannot be larger than 0x10000, so we
+ # stop well before that.
+ pad = 0xfbf8 - buf.tell()
+ buf.write(b'\0' * pad)
+ self.records.append(buf.getvalue())
+ buf.seek(0), buf.truncate(0)
+ offset = len(self.records) * 0x10000
+ buf.write(raw)
+ self.strings[key] = offset
+ offset += len(raw)
+
+ val = buf.getvalue()
+ if val:
+ self.records.append(align_block(val))
+
+ def __getitem__(self, string):
+ return self.strings[string]
+
+ def __bool__(self):
+ return bool(self.records)
+ __nonzero__ = __bool__
+
+ def __len__(self):
+ return len(self.records)
+
+# }}}
+
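
A hypothetical use of the shared CNCX builder: duplicate strings collapse, and the offsets
returned by __getitem__ are what the indexers write into their tag values.

    cncx = CNCX(['Chapter One', 'Chapter Two', 'Chapter One'])
    assert len(cncx) == 1               # everything fits in a single record
    assert cncx['Chapter One'] == 0     # offset of the first string
    assert cncx['Chapter Two'] == 12    # 1 length byte (encint) + 11 bytes of 'Chapter One'
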
+def is_guide_ref_start(ref):
+ return (ref.title.lower() == 'start' or
+ (ref.type and ref.type.lower() in {'start',
+ 'other.start', 'text'}))
diff --git a/src/calibre/ebooks/mobi/writer2/__init__.py b/src/calibre/ebooks/mobi/writer2/__init__.py
index bc8dbbf7de..df3dcefb94 100644
--- a/src/calibre/ebooks/mobi/writer2/__init__.py
+++ b/src/calibre/ebooks/mobi/writer2/__init__.py
@@ -12,5 +12,4 @@ UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480
PALM_MAX_IMAGE_SIZE = 63 * 1024
-RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index e349172d95..183697a1b4 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -12,56 +12,22 @@ from struct import pack
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
-from calibre.ebooks.mobi.writer2 import RECORD_SIZE
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
- encode_tbs, align_block, utf8_text)
+ encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
-class CNCX(object): # {{{
-
- '''
- Create the CNCX records. These are records containing all the strings from
- the NCX. Each record is of the form: <vwi string size><utf-8 encoded string>
- '''
-
- MAX_STRING_LENGTH = 500
+class CNCX(CNCX_): # {{{
def __init__(self, toc, is_periodical):
- self.strings = OrderedDict()
-
+ strings = []
for item in toc.iterdescendants(breadth_first=True):
- self.strings[item.title] = 0
+ strings.append(item.title)
if is_periodical:
- self.strings[item.klass] = 0
+ strings.append(item.klass)
if item.author:
- self.strings[item.author] = 0
+ strings.append(item.author)
if item.description:
- self.strings[item.description] = 0
-
- self.records = []
- offset = 0
- buf = StringIO()
- for key in tuple(self.strings.iterkeys()):
- utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
- l = len(utf8)
- sz_bytes = encint(l)
- raw = sz_bytes + utf8
- if 0xfbf8 - buf.tell() < 6 + len(raw):
- # Records in PDB files cannot be larger than 0x10000, so we
- # stop well before that.
- pad = 0xfbf8 - buf.tell()
- buf.write(b'\0' * pad)
- self.records.append(buf.getvalue())
- buf.truncate(0)
- offset = len(self.records) * 0x10000
- buf.write(raw)
- self.strings[key] = offset
- offset += len(raw)
-
- self.records.append(align_block(buf.getvalue()))
-
- def __getitem__(self, string):
- return self.strings[string]
+ strings.append(item.description)
+ CNCX_.__init__(self, strings)
# }}}
class TAGX(object): # {{{
@@ -534,14 +500,14 @@ class Indexer(object): # {{{
# Write offsets to index entries as an IDXT block
idxt_block = b'IDXT'
- buf.truncate(0)
+ buf.seek(0), buf.truncate(0)
for offset in offsets:
buf.write(pack(b'>H', header_length+offset))
idxt_block = align_block(idxt_block + buf.getvalue())
body = index_block + idxt_block
header = b'INDX'
- buf.truncate(0)
+ buf.seek(0), buf.truncate(0)
buf.write(pack(b'>I', header_length))
buf.write(b'\0'*4) # Unknown
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py
index 99321fab12..27c4838a4b 100644
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@@ -7,51 +7,31 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import re, random, time
+import random, time
from cStringIO import StringIO
from struct import pack
-from calibre.ebooks import normalize, generate_masthead
-from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+from calibre.ebooks import normalize
from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
-from calibre.ebooks.mobi.utils import (rescale_image, encint, mobify_image,
- encode_trailing_data, align_block, detect_periodical)
+from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
+from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
+ align_block, detect_periodical, RECORD_SIZE, create_text_record)
from calibre.ebooks.mobi.writer2.indexer import Indexer
-from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
-
-EXTH_CODES = {
- 'creator': 100,
- 'publisher': 101,
- 'description': 103,
- 'identifier': 104,
- 'subject': 105,
- 'pubdate': 106,
- 'review': 107,
- 'contributor': 108,
- 'rights': 109,
- 'type': 111,
- 'source': 112,
- 'versionnumber': 114,
- 'startreading': 116,
- 'coveroffset': 201,
- 'thumboffset': 202,
- 'hasfakecover': 203,
- 'lastupdatetime': 502,
- 'title': 503,
- }
# Disabled as I don't care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False
+NULL_INDEX = 0xffffffff
class MobiWriter(object):
- COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
- def __init__(self, opts, write_page_breaks_after_item=True):
+ def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
self.opts = opts
+ self.resources = resources
+ self.kf8 = kf8
+ self.for_joint = kf8 is not None
self.write_page_breaks_after_item = write_page_breaks_after_item
self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
self.prefer_author_sort = opts.prefer_author_sort
@@ -83,7 +63,7 @@ class MobiWriter(object):
self.stream = stream
self.records = [None]
self.generate_content()
- self.generate_record0()
+ self.generate_joint_record0() if self.for_joint else self.generate_record0()
self.write_header()
self.write_content()
@@ -151,73 +131,19 @@ class MobiWriter(object):
# Images {{{
def generate_images(self):
- oeb = self.oeb
- oeb.logger.info('Serializing images...')
- self.image_records = []
- self.image_map = {}
- self.masthead_offset = 0
- index = 1
+ resources = self.resources
+ image_records = resources.records
+ self.image_map = resources.item_map
+ self.masthead_offset = resources.masthead_offset
+ self.cover_offset = resources.cover_offset
+ self.thumbnail_offset = resources.thumbnail_offset
- mh_href = None
- if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
- mh_href = oeb.guide['masthead'].href
- self.image_records.append(None)
- index += 1
- elif self.is_periodical:
- # Generate a default masthead
- data = generate_masthead(unicode(self.oeb.metadata['title'][0]))
- self.image_records.append(data)
- index += 1
-
- cover_href = self.cover_offset = self.thumbnail_offset = None
- if (oeb.metadata.cover and
- unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
- cover_id = unicode(oeb.metadata.cover[0])
- item = oeb.manifest.ids[cover_id]
- cover_href = item.href
-
- for item in self.oeb.manifest.values():
- if item.media_type not in OEB_RASTER_IMAGES: continue
- try:
- data = item.data
- if self.opts.mobi_keep_original_images:
- data = mobify_image(data)
- else:
- data = rescale_image(data)
- except:
- oeb.logger.warn('Bad image file %r' % item.href)
- continue
- else:
- if mh_href and item.href == mh_href:
- self.image_records[0] = data
- continue
-
- self.image_records.append(data)
- self.image_map[item.href] = index
- index += 1
-
- if cover_href and item.href == cover_href:
- self.cover_offset = self.image_map[item.href] - 1
- try:
- data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
- maxsizeb=MAX_THUMB_SIZE)
- except:
- oeb.logger.warn('Failed to generate thumbnail')
- else:
- self.image_records.append(data)
- self.thumbnail_offset = index - 1
- index += 1
- finally:
- item.unload_data_from_memory()
-
- if self.image_records and self.image_records[0] is None:
+ if image_records and image_records[0] is None:
raise ValueError('Failed to find masthead image in manifest')
# }}}
- # Text {{{
-
- def generate_text(self):
+ def generate_text(self): # {{{
self.oeb.logger.info('Serializing markup content...')
self.serializer = Serializer(self.oeb, self.image_map,
self.is_periodical,
@@ -232,7 +158,7 @@ class MobiWriter(object):
self.oeb.logger.info(' Compressing markup content...')
while text.tell() < self.text_length:
- data, overlap = self.read_text_record(text)
+ data, overlap = create_text_record(text)
if self.compression == PALMDOC:
data = compress_doc(data)
@@ -249,57 +175,6 @@ class MobiWriter(object):
if records_size % 4 != 0:
self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1
-
- def read_text_record(self, text):
- '''
- Return a Palmdoc record of size RECORD_SIZE from the text file object.
- In case the record ends in the middle of a multibyte character return
- the overlap as well.
-
- Returns data, overlap: where both are byte strings. overlap is the
- extra bytes needed to complete the truncated multibyte character.
- '''
- opos = text.tell()
- text.seek(0, 2)
- # npos is the position of the next record
- npos = min((opos + RECORD_SIZE, text.tell()))
- # Number of bytes from the next record needed to complete the last
- # character in this record
- extra = 0
-
- last = b''
- while not last.decode('utf-8', 'ignore'):
- # last contains no valid utf-8 characters
- size = len(last) + 1
- text.seek(npos - size)
- last = text.read(size)
-
- # last now has one valid utf-8 char and possibly some bytes that belong
- # to a truncated char
-
- try:
- last.decode('utf-8', 'strict')
- except UnicodeDecodeError:
- # There are some truncated bytes in last
- prev = len(last)
- while True:
- text.seek(npos - prev)
- last = text.read(len(last) + 1)
- try:
- last.decode('utf-8')
- except UnicodeDecodeError:
- pass
- else:
- break
- extra = len(last) - prev
-
- text.seek(opos)
- data = text.read(RECORD_SIZE)
- overlap = text.read(extra)
- text.seek(npos)
-
- return data, overlap
-
# }}}
def generate_record0(self): # MOBI header {{{
@@ -315,11 +190,20 @@ class MobiWriter(object):
# header as well
bt = 0x103 if self.indexer.is_flat_periodical else 0x101
- exth = self.build_exth(bt)
+ from calibre.ebooks.mobi.writer8.exth import build_exth
+ exth = build_exth(metadata,
+ prefer_author_sort=self.opts.prefer_author_sort,
+ is_periodical=self.is_periodical,
+ share_not_sync=self.opts.share_not_sync,
+ cover_offset=self.cover_offset,
+ thumbnail_offset=self.thumbnail_offset,
+ start_offset=self.serializer.start_offset, mobi_doctype=bt
+ )
first_image_record = None
- if self.image_records:
+ if self.resources:
+ used_images = self.serializer.used_images
first_image_record = len(self.records)
- self.records.extend(self.image_records)
+ self.resources.serialize(self.records, used_images)
last_content_record = len(self.records) - 1
# FCIS/FLIS (Seems to serve no purpose)
@@ -481,125 +365,72 @@ class MobiWriter(object):
self.records[0] = align_block(record0)
# }}}
- def build_exth(self, mobi_doctype): # EXTH Header {{{
- oeb = self.oeb
- exth = StringIO()
- nrecs = 0
- for term in oeb.metadata:
- if term not in EXTH_CODES: continue
- code = EXTH_CODES[term]
- items = oeb.metadata[term]
- if term == 'creator':
- if self.prefer_author_sort:
- creators = [normalize(unicode(c.file_as or c)) for c in
- items][:1]
- else:
- creators = [normalize(unicode(c)) for c in items]
- items = ['; '.join(creators)]
- for item in items:
- data = normalize(unicode(item))
- if term != 'description':
- data = self.COLLAPSE_RE.sub(' ', data)
- if term == 'identifier':
- if data.lower().startswith('urn:isbn:'):
- data = data[9:]
- elif item.scheme.lower() == 'isbn':
- pass
- else:
- continue
- data = data.encode('utf-8')
- exth.write(pack(b'>II', code, len(data) + 8))
- exth.write(data)
- nrecs += 1
- if term == 'rights' :
- try:
- rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
- except:
- rights = b'Unknown'
- exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
- exth.write(rights)
- nrecs += 1
+ def generate_joint_record0(self): # {{{
+ from calibre.ebooks.mobi.writer8.mobi import (MOBIHeader,
+ HEADER_FIELDS)
+ from calibre.ebooks.mobi.writer8.exth import build_exth
- # Write UUID as ASIN
- uuid = None
- from calibre.ebooks.oeb.base import OPF
- for x in oeb.metadata['identifier']:
- if (x.get(OPF('scheme'), None).lower() == 'uuid' or
- unicode(x).startswith('urn:uuid:')):
- uuid = unicode(x).split(':')[-1]
- break
- if uuid is None:
- from uuid import uuid4
- uuid = str(uuid4())
+ # Insert resource records
+ first_image_record = None
+ old = len(self.records)
+ if self.resources:
+ used_images = self.serializer.used_images | self.kf8.used_images
+ first_image_record = len(self.records)
+ self.resources.serialize(self.records, used_images)
+ resource_record_count = len(self.records) - old
- if isinstance(uuid, unicode):
- uuid = uuid.encode('utf-8')
- if not self.opts.share_not_sync:
- exth.write(pack(b'>II', 113, len(uuid) + 8))
- exth.write(uuid)
- nrecs += 1
+ # Insert KF8 records
+ self.records.append(b'BOUNDARY')
+ kf8_header_index = len(self.records)
+ self.kf8.start_offset = (self.serializer.start_offset,
+ self.kf8.start_offset)
+ self.records.append(self.kf8.record0)
+ self.records.extend(self.kf8.records[1:])
- # Write cdetype
- if not self.is_periodical:
- if not self.opts.share_not_sync:
- exth.write(pack(b'>II', 501, 12))
- exth.write(b'EBOK')
- nrecs += 1
- else:
- ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
- if ids:
- exth.write(pack(b'>II', 501, 12))
- exth.write(ids)
- nrecs += 1
+ first_image_record = (first_image_record if first_image_record else
+ len(self.records))
- # Add a publication date entry
- if oeb.metadata['date']:
- datestr = str(oeb.metadata['date'][0])
- elif oeb.metadata['timestamp']:
- datestr = str(oeb.metadata['timestamp'][0])
+ header_fields = {k:getattr(self.kf8, k) for k in HEADER_FIELDS}
- if datestr is None:
- raise ValueError("missing date or timestamp")
+ # Now change the header fields that need to be different in the MOBI 6
+ # header
+ header_fields['first_resource_record'] = first_image_record
+ header_fields['exth_flags'] = 0b100001010000 # Kindlegen uses this
+ header_fields['fdst_record'] = NULL_INDEX
+ header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
+ extra_data_flags = 0b1 # Has multibyte overlap bytes
+ if self.primary_index_record_idx is not None:
+ extra_data_flags |= 0b10
+ header_fields['extra_data_flags'] = extra_data_flags
- datestr = bytes(datestr)
- exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
- exth.write(datestr)
- nrecs += 1
- if self.is_periodical:
- exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
- exth.write(datestr)
- nrecs += 1
+ for k, v in {'last_text_record':'last_text_record_idx',
+ 'first_non_text_record':'first_non_text_record_idx',
+ 'ncx_index':'primary_index_record_idx',
+ }.iteritems():
+ header_fields[k] = getattr(self, v)
+ if header_fields['ncx_index'] is None:
+ header_fields['ncx_index'] = NULL_INDEX
- if self.is_periodical:
- # Pretend to be amazon's super secret periodical generator
- vals = {204:201, 205:2, 206:0, 207:101}
- else:
- # Pretend to be kindlegen 1.2
- vals = {204:201, 205:1, 206:2, 207:33307}
- for code, val in vals.iteritems():
- exth.write(pack(b'>III', code, 12, val))
- nrecs += 1
+ for x in ('skel', 'chunk', 'guide'):
+ header_fields[x+'_index'] = NULL_INDEX
- if self.cover_offset is not None:
- exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
- self.cover_offset))
- exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
- nrecs += 2
- if self.thumbnail_offset is not None:
- exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
- self.thumbnail_offset))
- nrecs += 1
+ # Create the MOBI 6 EXTH
+ opts = self.opts
+ kuc = 0 if resource_record_count > 0 else None
- if self.serializer.start_offset is not None:
- exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
- self.serializer.start_offset))
- nrecs += 1
+ header_fields['exth'] = build_exth(self.oeb.metadata,
+ prefer_author_sort=opts.prefer_author_sort,
+ is_periodical=opts.mobi_periodical,
+ share_not_sync=opts.share_not_sync,
+ cover_offset=self.cover_offset,
+ thumbnail_offset=self.thumbnail_offset,
+ num_of_resources=resource_record_count,
+ kf8_unknown_count=kuc, be_kindlegen2=True,
+ kf8_header_index=kf8_header_index,
+ start_offset=self.serializer.start_offset,
+ mobi_doctype=2)
+ self.records[0] = MOBIHeader(file_version=6)(**header_fields)
- exth = exth.getvalue()
- trail = len(exth) % 4
- pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
- exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
- return b''.join(exth)
# }}}
def write_header(self): # PalmDB header {{{
diff --git a/src/calibre/ebooks/mobi/writer2/resources.py b/src/calibre/ebooks/mobi/writer2/resources.py
new file mode 100644
index 0000000000..2fcb93790c
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer2/resources.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import imghdr
+
+from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
+from calibre.ebooks.mobi.utils import (rescale_image, mobify_image,
+ write_font_record)
+from calibre.ebooks import generate_masthead
+from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+
+PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00,\x00\x00\x00\x00\x01\x00\x01\x00@\x02\x01D\x00;'
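+# A minimal 1x1 transparent GIF; serialize() below swaps it in for any image
+# record that was never referenced from the text, so record numbering stays
+# intact while unused image data is dropped.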
+
+class Resources(object):
+
+ def __init__(self, oeb, opts, is_periodical, add_fonts=False):
+ self.oeb, self.log, self.opts = oeb, oeb.log, opts
+ self.is_periodical = is_periodical
+
+ self.item_map = {}
+ self.records = []
+ self.mime_map = {}
+ self.masthead_offset = 0
+ self.used_image_indices = set()
+ self.image_indices = set()
+ self.cover_offset = self.thumbnail_offset = None
+
+ self.add_resources(add_fonts)
+
+ def process_image(self, data):
+ return (mobify_image(data) if self.opts.mobi_keep_original_images else
+ rescale_image(data))
+
+ def add_resources(self, add_fonts):
+ oeb = self.oeb
+ oeb.logger.info('Serializing resources...')
+ index = 1
+
+ mh_href = None
+ if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
+ mh_href = oeb.guide['masthead'].href
+ self.records.append(None)
+ index += 1
+ self.used_image_indices.add(0)
+ self.image_indices.add(0)
+ elif self.is_periodical:
+ # Generate a default masthead
+ data = generate_masthead(unicode(self.oeb.metadata['title'][0]))
+ self.records.append(data)
+ self.used_image_indices.add(0)
+ self.image_indices.add(0)
+ index += 1
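+        # Whichever masthead is used (from the guide, or generated above for a
+        # periodical) occupies the first image record; all other images get
+        # 1-based recindex values via item_map below.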
+
+ cover_href = self.cover_offset = self.thumbnail_offset = None
+ if (oeb.metadata.cover and
+ unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
+ cover_id = unicode(oeb.metadata.cover[0])
+ item = oeb.manifest.ids[cover_id]
+ cover_href = item.href
+
+ for item in self.oeb.manifest.values():
+ if item.media_type not in OEB_RASTER_IMAGES: continue
+ try:
+ data = self.process_image(item.data)
+ except:
+ self.log.warn('Bad image file %r' % item.href)
+ continue
+ else:
+ if mh_href and item.href == mh_href:
+ self.records[0] = data
+ continue
+
+ self.image_indices.add(len(self.records))
+ self.records.append(data)
+ self.item_map[item.href] = index
+ self.mime_map[item.href] = 'image/%s'%imghdr.what(None, data)
+ index += 1
+
+ if cover_href and item.href == cover_href:
+ self.cover_offset = self.item_map[item.href] - 1
+ self.used_image_indices.add(self.cover_offset)
+ try:
+ data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
+ maxsizeb=MAX_THUMB_SIZE)
+ except:
+ self.log.warn('Failed to generate thumbnail')
+ else:
+ self.image_indices.add(len(self.records))
+ self.records.append(data)
+ self.thumbnail_offset = index - 1
+ self.used_image_indices.add(self.thumbnail_offset)
+ index += 1
+ finally:
+ item.unload_data_from_memory()
+
+ if add_fonts:
+ for item in self.oeb.manifest.values():
+ if item.href and item.href.rpartition('.')[-1].lower() in {
+ 'ttf', 'otf'} and isinstance(item.data, bytes):
+ self.records.append(write_font_record(item.data))
+ self.item_map[item.href] = len(self.records)
+
+ def add_extra_images(self):
+ '''
+ Add any images that were created after the call to add_resources()
+ '''
+ for item in self.oeb.manifest.values():
+ if (item.media_type not in OEB_RASTER_IMAGES or item.href in
+ self.item_map): continue
+ try:
+ data = self.process_image(item.data)
+ except:
+ self.log.warn('Bad image file %r' % item.href)
+ else:
+ self.records.append(data)
+ self.item_map[item.href] = len(self.records)
+ finally:
+ item.unload_data_from_memory()
+
+ def serialize(self, records, used_images):
+ used_image_indices = self.used_image_indices | {
+ v-1 for k, v in self.item_map.iteritems() if k in used_images}
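+        # item_map holds 1-based recindex values while self.records is
+        # 0-based, hence the v-1 above; any image record never referenced from
+        # the text is replaced with the placeholder GIF so the remaining
+        # recindex values stay valid.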
+ for i in self.image_indices-used_image_indices:
+ self.records[i] = PLACEHOLDER_GIF
+ records.extend(self.records)
+
+ def __bool__(self):
+ return bool(self.records)
+ __nonzero__ = __bool__
+
diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py
index abce926152..2dda657a93 100644
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -12,6 +12,7 @@ import re
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS
+from calibre.ebooks.mobi.utils import is_guide_ref_start
from collections import defaultdict
from urlparse import urldefrag
@@ -39,6 +40,7 @@ class Serializer(object):
self.oeb = oeb
# Map of image hrefs to image index in the MOBI file
self.images = images
+ self.used_images = set()
self.logger = oeb.logger
self.is_periodical = is_periodical
self.write_page_breaks_after_item = write_page_breaks_after_item
@@ -160,9 +162,7 @@ class Serializer(object):
buf.write(b'title="')
self.serialize_text(ref.title, quot=True)
buf.write(b'" ')
- if (ref.title.lower() == 'start' or
- (ref.type and ref.type.lower() in ('start',
- 'other.start'))):
+ if is_guide_ref_start(ref):
self._start_href = ref.href
self.serialize_href(ref.href)
# Space required or won't work, I kid you not
@@ -329,6 +329,7 @@ class Serializer(object):
href = urlnormalize(item.abshref(val))
if href in self.images:
index = self.images[href]
+ self.used_images.add(href)
buf.write(b'recindex="%05d"' % index)
continue
buf.write(attr.encode('utf-8'))
diff --git a/src/calibre/ebooks/mobi/writer8/__init__.py b/src/calibre/ebooks/mobi/writer8/__init__.py
new file mode 100644
index 0000000000..dd9615356c
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/__init__.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/exth.py b/src/calibre/ebooks/mobi/writer8/exth.py
new file mode 100644
index 0000000000..361b978528
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/exth.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+from struct import pack
+from io import BytesIO
+
+from calibre.ebooks.mobi.utils import utf8_text
+
+EXTH_CODES = {
+ 'creator': 100,
+ 'publisher': 101,
+ 'description': 103,
+ 'identifier': 104,
+ 'subject': 105,
+ 'pubdate': 106,
+ 'review': 107,
+ 'contributor': 108,
+ 'rights': 109,
+ 'type': 111,
+ 'source': 112,
+ 'versionnumber': 114,
+ 'startreading': 116,
+ 'kf8_header_index': 121,
+ 'num_of_resources': 125,
+ 'kf8_unknown_count': 131,
+ 'coveroffset': 201,
+ 'thumboffset': 202,
+ 'hasfakecover': 203,
+ 'lastupdatetime': 502,
+ 'title': 503,
+}
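+
+# Each EXTH record below is laid out as a 4-byte code, a 4-byte total length
+# (payload length plus the 8 header bytes) and then the payload, e.g.
+# pack(b'>II', 106, len(data) + 8) + data for a pubdate entry.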
+
+COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
+
+def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
+ share_not_sync=True, cover_offset=None, thumbnail_offset=None,
+ start_offset=None, mobi_doctype=2, num_of_resources=None,
+ kf8_unknown_count=0, be_kindlegen2=False, kf8_header_index=None):
+ exth = BytesIO()
+ nrecs = 0
+
+ for term in metadata:
+ if term not in EXTH_CODES: continue
+ code = EXTH_CODES[term]
+ items = metadata[term]
+ if term == 'creator':
+ if prefer_author_sort:
+ creators = [unicode(c.file_as or c) for c in
+ items][:1]
+ else:
+ creators = [unicode(c) for c in items]
+ items = ['; '.join(creators)]
+ for item in items:
+ data = unicode(item)
+ if term != 'description':
+ data = COLLAPSE_RE.sub(' ', data)
+ if term == 'identifier':
+ if data.lower().startswith('urn:isbn:'):
+ data = data[9:]
+ elif item.scheme.lower() == 'isbn':
+ pass
+ else:
+ continue
+ data = utf8_text(data)
+ exth.write(pack(b'>II', code, len(data) + 8))
+ exth.write(data)
+ nrecs += 1
+ if term == 'rights' :
+ try:
+ rights = utf8_text(unicode(metadata.rights[0]))
+ except:
+ rights = b'Unknown'
+ exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
+ exth.write(rights)
+ nrecs += 1
+
+ # Write UUID as ASIN
+ uuid = None
+ from calibre.ebooks.oeb.base import OPF
+ for x in metadata['identifier']:
+ if (x.get(OPF('scheme'), None).lower() == 'uuid' or
+ unicode(x).startswith('urn:uuid:')):
+ uuid = unicode(x).split(':')[-1]
+ break
+ if uuid is None:
+ from uuid import uuid4
+ uuid = str(uuid4())
+
+ if isinstance(uuid, unicode):
+ uuid = uuid.encode('utf-8')
+ if not share_not_sync:
+ exth.write(pack(b'>II', 113, len(uuid) + 8))
+ exth.write(uuid)
+ nrecs += 1
+
+ # Write cdetype
+ if not is_periodical:
+ if not share_not_sync:
+ exth.write(pack(b'>II', 501, 12))
+ exth.write(b'EBOK')
+ nrecs += 1
+ else:
+ ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
+ if ids:
+ exth.write(pack(b'>II', 501, 12))
+ exth.write(ids)
+ nrecs += 1
+
+    # Add a publication date entry
+    datestr = None
+    if metadata['date']:
+        datestr = str(metadata['date'][0])
+    elif metadata['timestamp']:
+        datestr = str(metadata['timestamp'][0])
+
+    if datestr is None:
+        raise ValueError("missing date or timestamp")
+
+ datestr = bytes(datestr)
+ exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
+ exth.write(datestr)
+ nrecs += 1
+ if is_periodical:
+ exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
+ exth.write(datestr)
+ nrecs += 1
+
+ if be_kindlegen2:
+ vals = {204:201, 205:2, 206:2, 207:35621}
+ elif is_periodical:
+ # Pretend to be amazon's super secret periodical generator
+ vals = {204:201, 205:2, 206:0, 207:101}
+ else:
+ # Pretend to be kindlegen 1.2
+ vals = {204:201, 205:1, 206:2, 207:33307}
+ for code, val in vals.iteritems():
+ exth.write(pack(b'>III', code, 12, val))
+ nrecs += 1
+
+ if cover_offset is not None:
+ exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
+ cover_offset))
+ exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
+ nrecs += 2
+ if thumbnail_offset is not None:
+ exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
+ thumbnail_offset))
+ nrecs += 1
+
+ if start_offset is not None:
+ try:
+ len(start_offset)
+ except TypeError:
+ start_offset = [start_offset]
+ for so in start_offset:
+ if so is not None:
+ exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
+ so))
+ nrecs += 1
+
+ if kf8_header_index is not None:
+ exth.write(pack(b'>III', EXTH_CODES['kf8_header_index'], 12,
+ kf8_header_index))
+ nrecs += 1
+
+ if num_of_resources is not None:
+ exth.write(pack(b'>III', EXTH_CODES['num_of_resources'], 12,
+ num_of_resources))
+ nrecs += 1
+
+ if kf8_unknown_count is not None:
+ exth.write(pack(b'>III', EXTH_CODES['kf8_unknown_count'], 12,
+ kf8_unknown_count))
+ nrecs += 1
+
+ exth = exth.getvalue()
+ trail = len(exth) % 4
+ pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
+ exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
+ return b''.join(exth)
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/header.py b/src/calibre/ebooks/mobi/writer8/header.py
new file mode 100644
index 0000000000..94ae722f59
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/header.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import random
+from io import BytesIO
+from collections import OrderedDict
+from struct import pack
+
+from calibre.ebooks.mobi.utils import align_block
+
+NULL = 0xffffffff
+zeroes = lambda x: b'\0'*x
+nulls = lambda x: b'\xff'*x
+short = lambda x: pack(b'>H', x)
+
+class Header(OrderedDict):
+
+ HEADER_NAME = b''
+
+ DEFINITION = '''
+ '''
+
+ ALIGN_BLOCK = False
+    POSITIONS = {} # Maps an offset field to the field whose byte position
+                   # should be written into that offset slot
+ SHORT_FIELDS = set()
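+    # DEFINITION is a tiny DSL: one "name = value" per line, with the value
+    # evaluated against the helpers above (zeroes, nulls, short, NULL, random).
+    # A bare name defaults to 0 and DYN marks a field that must be supplied
+    # when the header object is called; see IndexHeader in index.py for a
+    # complete example.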
+
+ def __init__(self):
+ OrderedDict.__init__(self)
+
+ for line in self.DEFINITION.splitlines():
+ line = line.strip()
+ if not line or line.startswith('#'): continue
+ name, val = [x.strip() for x in line.partition('=')[0::2]]
+ if val:
+ val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None,
+ 'nulls':nulls, 'short':short, 'random':random})
+ else:
+ val = 0
+ if name in self:
+ raise ValueError('Duplicate field in definition: %r'%name)
+ self[name] = val
+
+ @property
+ def dynamic_fields(self):
+ return tuple(k for k, v in self.iteritems() if v is None)
+
+ def __call__(self, **kwargs):
+ positions = {}
+ for name, val in kwargs.iteritems():
+ if name not in self:
+ raise KeyError('Not a valid header field: %r'%name)
+ self[name] = val
+
+ buf = BytesIO()
+ buf.write(bytes(self.HEADER_NAME))
+ for name, val in self.iteritems():
+ val = self.format_value(name, val)
+ positions[name] = buf.tell()
+ if val is None:
+ raise ValueError('Dynamic field %r not set'%name)
+ if isinstance(val, (int, long)):
+ fmt = 'H' if name in self.SHORT_FIELDS else 'I'
+ val = pack(b'>'+fmt, val)
+ buf.write(val)
+
+ for pos_field, field in self.POSITIONS.iteritems():
+ buf.seek(positions[pos_field])
+ buf.write(pack(b'>I', positions[field]))
+
+ ans = buf.getvalue()
+ if self.ALIGN_BLOCK:
+ ans = align_block(ans)
+ return ans
+
+
+ def format_value(self, name, val):
+ return val
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
new file mode 100644
index 0000000000..c37afb81ff
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+from future_builtins import map
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from collections import namedtuple
+from struct import pack
+from io import BytesIO
+
+from calibre.ebooks.mobi.utils import CNCX, encint, align_block
+from calibre.ebooks.mobi.writer8.header import Header
+
+TagMeta_ = namedtuple('TagMeta',
+ 'name number values_per_entry bitmask end_flag')
+TagMeta = lambda x:TagMeta_(*x)
+EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
+
+# Map of bitmask to the number of shifts needed; handles one and two bit wide
+# masks and could be extended to four bit wide masks if needed.
+mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
+ 128:7, 192: 6 }
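+# e.g. a tag whose bitmask is 12 (0b1100) stores its value count as
+# (count << 2) & 12 in the control byte; see
+# calculate_control_bytes_for_each_entry() in the Index class below.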
+
+class IndexHeader(Header): # {{{
+
+ HEADER_NAME = b'INDX'
+ ALIGN_BLOCK = True
+ HEADER_LENGTH = 192
+
+ DEFINITION = '''
+ # 4 - 8: Header Length
+ header_length = {header_length}
+
+ # 8 - 16: Unknown
+ unknown1 = zeroes(8)
+
+ # 16 - 20: Index type: 0 - normal 2 - inflection
+ type = 2
+
+ # 20 - 24: IDXT offset (filled in later)
+ idxt_offset
+
+ # 24 - 28: Number of index records
+ num_of_records = 1
+
+ # 28 - 32: Index encoding (65001 = utf-8)
+ encoding = 65001
+
+ # 32 - 36: Unknown
+ unknown2 = NULL
+
+ # 36 - 40: Number of Index entries
+ num_of_entries = DYN
+
+ # 40 - 44: ORDT offset
+ ordt_offset
+
+ # 44 - 48: LIGT offset
+ ligt_offset
+
+ # 48 - 52: Number of ORDT/LIGT? entries
+ num_of_ordt_entries
+
+ # 52 - 56: Number of CNCX records
+ num_of_cncx = DYN
+
+ # 56 - 180: Unknown
+ unknown3 = zeroes(124)
+
+ # 180 - 184: TAGX offset
+ tagx_offset = {header_length}
+
+ # 184 - 192: Unknown
+ unknown4 = zeroes(8)
+
+ # TAGX
+ tagx = DYN
+
+ # Last Index entry
+ last_index = DYN
+
+ # IDXT
+ idxt = DYN
+ '''.format(header_length=HEADER_LENGTH)
+
+ POSITIONS = {'idxt_offset':'idxt'}
+# }}}
+
+class Index(object): # {{{
+
+ control_byte_count = 1
+ cncx = CNCX()
+ tag_types = (EndTagTable,)
+
+ HEADER_LENGTH = IndexHeader.HEADER_LENGTH
+
+ @classmethod
+ def generate_tagx(cls):
+ header = b'TAGX'
+ byts = bytearray()
+ for tag_meta in cls.tag_types:
+ byts.extend(tag_meta[1:])
+ # table length, control byte count
+ header += pack(b'>II', 12+len(byts), cls.control_byte_count)
+ return header + bytes(byts)
+
+ @classmethod
+ def calculate_control_bytes_for_each_entry(cls, entries):
+ control_bytes = []
+ for lead_text, tags in entries:
+ cbs = []
+ ans = 0
+ for (name, number, vpe, mask, endi) in cls.tag_types:
+ if endi == 1:
+ cbs.append(ans)
+ ans = 0
+ continue
+ try:
+ nvals = len(tags.get(name, ()))
+ except TypeError:
+ nvals = 1
+ nentries = nvals // vpe
+ shifts = mask_to_bit_shifts[mask]
+ ans |= mask & (nentries << shifts)
+ if len(cbs) != cls.control_byte_count:
+ raise ValueError('The entry %r is invalid'%[lead_text, tags])
+ control_bytes.append(cbs)
+ return control_bytes
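+
+    # Worked example: for a SkelIndex entry, chunk_count (mask 3, stored twice)
+    # contributes 3 & (2 << 0) = 2 and geometry (mask 12, four values at two
+    # per entry) contributes 12 & (2 << 2) = 8, giving a control byte of
+    # 0b1010.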
+
+ def __call__(self):
+ self.control_bytes = self.calculate_control_bytes_for_each_entry(
+ self.entries)
+
+ rendered_entries = []
+ index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
+ IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+ last_lead_text = b''
+ too_large = ValueError('Index has too many entries, calibre does not'
+ ' support generating multiple index records at this'
+ ' time.')
+
+ for i, x in enumerate(self.entries):
+ control_bytes = self.control_bytes[i]
+ leading_text, tags = x
+ buf.seek(0), buf.truncate(0)
+ leading_text = (leading_text.encode('utf-8') if
+ isinstance(leading_text, unicode) else leading_text)
+ raw = bytearray(leading_text)
+ raw.insert(0, len(leading_text))
+ buf.write(bytes(raw))
+ buf.write(bytes(bytearray(control_bytes)))
+ for tag in self.tag_types:
+ values = tags.get(tag.name, None)
+ if values is None: continue
+ try:
+ len(values)
+ except TypeError:
+ values = [values]
+ if values:
+ for val in values:
+ try:
+ buf.write(encint(val))
+ except ValueError:
+ raise ValueError('Invalid values for %r: %r'%(
+ tag, values))
+ raw = buf.getvalue()
+ offset = index.tell()
+ if offset + self.HEADER_LENGTH >= 0x10000:
+ raise too_large
+ rendered_entries.append(IndexEntry(offset, len(raw), raw))
+ idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
+ index.write(raw)
+ last_lead_text = leading_text
+
+ index_block = align_block(index.getvalue())
+ idxt_block = align_block(b'IDXT' + idxt.getvalue())
+ body = index_block + idxt_block
+ if len(body) + self.HEADER_LENGTH >= 0x10000:
+ raise too_large
+ header = b'INDX'
+ buf.seek(0), buf.truncate(0)
+ buf.write(pack(b'>I', self.HEADER_LENGTH))
+ buf.write(b'\0'*4) # Unknown
+ buf.write(pack(b'>I', 1)) # Header type? Or index record number?
+ buf.write(b'\0'*4) # Unknown
+
+ # IDXT block offset
+ buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
+
+ # Number of index entries
+ buf.write(pack(b'>I', len(rendered_entries)))
+
+ buf.write(b'\xff'*8) # Unknown
+
+ buf.write(b'\0'*156) # Unknown
+
+ header += buf.getvalue()
+ index_record = header + body
+
+ tagx = self.generate_tagx()
+ idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
+ b'\0')
+ # Last index
+ idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
+ idx += pack(b'>H', len(rendered_entries))
+
+ header = {
+ 'num_of_entries': len(rendered_entries),
+ 'num_of_cncx': len(self.cncx),
+ 'tagx':tagx,
+ 'last_index':align_block(idx),
+ 'idxt':idxt
+ }
+ header = IndexHeader()(**header)
+ self.records = [header, index_record]
+ self.records.extend(self.cncx.records)
+ return self.records
+# }}}
+
+class SkelIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('chunk_count', 1, 1, 3, 0),
+ ('geometry', 6, 2, 12, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, skel_table):
+ self.entries = [
+ (s.name, {
+                # Don't ask me why these entries have to be repeated twice
+ 'chunk_count':(s.chunk_count, s.chunk_count),
+ 'geometry':(s.start_pos, s.length, s.start_pos, s.length),
+ }) for s in skel_table
+ ]
+
+
+class ChunkIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('cncx_offset', 2, 1, 1, 0),
+ ('file_number', 3, 1, 2, 0),
+ ('sequence_number', 4, 1, 4, 0),
+ ('geometry', 6, 2, 8, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, chunk_table):
+ self.cncx = CNCX(c.selector for c in chunk_table)
+
+ self.entries = [
+ ('%010d'%c.insert_pos, {
+
+ 'cncx_offset':self.cncx[c.selector],
+ 'file_number':c.file_number,
+ 'sequence_number':c.sequence_number,
+ 'geometry':(c.start_pos, c.length),
+ }) for c in chunk_table
+ ]
+
+class GuideIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('title', 1, 1, 1, 0),
+ ('pos_fid', 6, 2, 2, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, guide_table):
+ self.cncx = CNCX(c.title for c in guide_table)
+
+ self.entries = [
+ (r.type, {
+
+ 'title':self.cncx[r.title],
+ 'pos_fid':r.pos_fid,
+ }) for r in guide_table
+ ]
+
+
+class NCXIndex(Index):
+
+ ''' The commented out parts have been seen in NCX indexes from MOBI 6
+ periodicals. Since we have no MOBI 8 periodicals to reverse engineer, leave
+ it for now. '''
+ # control_byte_count = 2
+ tag_types = tuple(map(TagMeta, (
+ ('offset', 1, 1, 1, 0),
+ ('length', 2, 1, 2, 0),
+ ('label', 3, 1, 4, 0),
+ ('depth', 4, 1, 8, 0),
+ ('parent', 21, 1, 16, 0),
+ ('first_child', 22, 1, 32, 0),
+ ('last_child', 23, 1, 64, 0),
+ ('pos_fid', 6, 2, 128, 0),
+ EndTagTable,
+ # ('image', 69, 1, 1, 0),
+ # ('description', 70, 1, 2, 0),
+ # ('author', 71, 1, 4, 0),
+ # ('caption', 72, 1, 8, 0),
+ # ('attribution', 73, 1, 16, 0),
+ # EndTagTable
+ )))
+
+ def __init__(self, toc_table):
+ strings = []
+ for entry in toc_table:
+ strings.append(entry['label'])
+ aut = entry.get('author', None)
+ if aut:
+ strings.append(aut)
+ desc = entry.get('description', None)
+ if desc:
+ strings.append(desc)
+ self.cncx = CNCX(strings)
+
+ def to_entry(x):
+ ans = {}
+ for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
+ 'first_child', 'last_child'):
+ if f in x:
+ ans[f] = x[f]
+ for f in ('label', 'description', 'author'):
+ if f in x:
+ ans[f] = self.cncx[x[f]]
+ return ('%02x'%x['index'], ans)
+
+ self.entries = list(map(to_entry, toc_table))
+
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
new file mode 100644
index 0000000000..4e6719bb90
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -0,0 +1,408 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import copy, logging
+from functools import partial
+from collections import defaultdict, namedtuple
+from io import BytesIO
+from struct import pack
+
+import cssutils
+from lxml import etree
+
+from calibre import isbytestring, force_unicode
+from calibre.ebooks.mobi.utils import (create_text_record, to_base,
+ is_guide_ref_start)
+from calibre.ebooks.compression.palmdoc import compress_doc
+from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
+ extract, XHTML, urlnormalize)
+from calibre.ebooks.oeb.parse_utils import barename
+from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
+from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
+ ChunkIndex, GuideIndex)
+from calibre.ebooks.mobi.writer8.mobi import KF8Book
+from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
+from calibre.ebooks.mobi.writer8.toc import TOCAdder
+
+XML_DOCS = OEB_DOCS | {SVG_MIME}
+
+# References to record numbers in KF8 are stored as base-32 encoded integers,
+# with 4 digits
+to_ref = partial(to_base, base=32, min_num_digits=4)
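+# e.g. record number 11 would be written as '000B', assuming to_base uses the
+# 0-9A-V digit set for base 32.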
+
+class KF8Writer(object):
+
+ def __init__(self, oeb, opts, resources):
+ self.oeb, self.opts, self.log = oeb, opts, oeb.log
+ self.compress = not self.opts.dont_compress
+ self.has_tbs = False
+ self.log.info('Creating KF8 output')
+
+ # Create an inline ToC if one does not already exist
+ self.toc_adder = TOCAdder(oeb, opts)
+ self.used_images = set()
+ self.resources = resources
+ self.flows = [None] # First flow item is reserved for the text
+ self.records = [None] # Placeholder for zeroth record
+
+ self.log('\tGenerating KF8 markup...')
+ self.dup_data()
+ self.replace_resource_links()
+ self.extract_css_into_flows()
+ self.extract_svg_into_flows()
+ self.replace_internal_links_with_placeholders()
+ self.insert_aid_attributes()
+ self.chunk_it_up()
+ # Dump the cloned data as it is no longer needed
+ del self._data_cache
+ self.create_text_records()
+ self.log('\tCreating indices...')
+ self.create_fdst_records()
+ self.create_indices()
+ self.create_guide()
+ # We do not want to use this ToC for MOBI 6, so remove it
+ self.toc_adder.remove_generated_toc()
+
+ def dup_data(self):
+ ''' Duplicate data so that any changes we make to markup/CSS only
+ affect KF8 output and not MOBI 6 output '''
+ self._data_cache = {}
+ # Suppress cssutils logging output as it is duplicated anyway earlier
+ # in the pipeline
+ cssutils.log.setLevel(logging.CRITICAL)
+ for item in self.oeb.manifest:
+ if item.media_type in XML_DOCS:
+ self._data_cache[item.href] = copy.deepcopy(item.data)
+ elif item.media_type in OEB_STYLES:
+ # I can't figure out how to make an efficient copy of the
+ # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
+ # exception)
+ self._data_cache[item.href] = cssutils.parseString(
+ item.data.cssText, validate=False)
+
+ def data(self, item):
+ return self._data_cache.get(item.href, item.data)
+
+ def replace_resource_links(self):
+ ''' Replace links to resources (raster images/fonts) with pointers to
+ the MOBI record containing the resource. The pointers are of the form:
+ kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
+ not used for fonts. '''
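+        # e.g. an image stored as resource record 3 is referenced as
+        # kindle:embed:0003?mime=image/jpeg, while a font at the same index
+        # would be just kindle:embed:0003.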
+
+ def pointer(item, oref):
+ ref = item.abshref(oref)
+ idx = self.resources.item_map.get(ref, None)
+ if idx is not None:
+ is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
+ idx = to_ref(idx)
+ if is_image:
+ self.used_images.add(ref)
+ return 'kindle:embed:%s?mime=%s'%(idx,
+ self.resources.mime_map[ref])
+ else:
+ return 'kindle:embed:%s'%idx
+ return oref
+
+ for item in self.oeb.manifest:
+
+ if item.media_type in XML_DOCS:
+ root = self.data(item)
+ for tag in XPath('//h:img|//svg:image')(root):
+ for attr, ref in tag.attrib.iteritems():
+ if attr.split('}')[-1].lower() in {'src', 'href'}:
+ tag.attrib[attr] = pointer(item, ref)
+
+ for tag in XPath('//h:style')(root):
+ if tag.text:
+ sheet = cssutils.parseString(tag.text, validate=False)
+ replacer = partial(pointer, item)
+ cssutils.replaceUrls(sheet, replacer,
+ ignoreImportRules=True)
+ repl = sheet.cssText
+ if isbytestring(repl):
+ repl = repl.decode('utf-8')
+ tag.text = '\n'+ repl + '\n'
+
+ elif item.media_type in OEB_STYLES:
+ sheet = self.data(item)
+ replacer = partial(pointer, item)
+ cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
+
+ def extract_css_into_flows(self):
+        inlines = defaultdict(list) # Ensure identical <style> blocks are not repeated
+
+
+{title}
+
+
+ |