diff --git a/recipes/corren2.recipe b/recipes/corren2.recipe
index 494be88f10..f53da20fd1 100644
--- a/recipes/corren2.recipe
+++ b/recipes/corren2.recipe
@@ -1,39 +1,34 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPLv3'
+
from calibre.web.feeds.news import BasicNewsRecipe
-class AdvancedUserRecipe1255797795(BasicNewsRecipe):
- title = u'Corren'
- language = 'sv'
- __author__ = 'Jonas Svensson'
- simultaneous_downloads = 1
- no_stylesheets = True
- oldest_article = 7
+class AdvancedUserRecipe1311446032(BasicNewsRecipe):
+ title = 'Corren'
+ __author__ = 'Jonas Svensson'
+ description = 'News from Sweden'
+ publisher = 'Corren'
+ category = 'news, politics, Sweden'
+ oldest_article = 2
+ delay = 1
max_articles_per_feed = 100
- remove_attributes = ['onload']
- timefmt = ''
+ no_stylesheets = True
+ use_embedded_content = False
+ encoding = 'iso-8859-1'
+ language = 'sv'
- feeds = [
- (u'Toppnyheter (alla kategorier)', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/'),
- (u'Bostad', u'http://www.corren.se/inc/RssHandler.ashx?id=4122174&ripurl=http://www.corren.se/bostad/'),
- (u'Ekonomi & Jobb', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/'),
- (u'Kultur & Nöje', u'http://www.corren.se/inc/RssHandler.ashx?id=4122192&ripurl=http://www.corren.se/kultur/'),
- (u'Mat & dryck', u'http://www.corren.se/inc/RssHandler.ashx?id=4122201&ripurl=http://www.corren.se/mat-dryck/'),
- (u'Motor', u'http://www.corren.se/inc/RssHandler.ashx?id=4122203&ripurl=http://www.corren.se/motor/'),
- (u'Sport', u'http://www.corren.se/inc/RssHandler.ashx?id=4122206&ripurl=http://www.corren.se/sport/'),
- (u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223&ripurl=http://www.corren.se/asikter/'),
- (u'Mjölby', u'http://www.corren.se/inc/RssHandler.ashx?id=4122235&ripurl=http://www.corren.se/ostergotland/mjolby/'),
- (u'Motala', u'http://www.corren.se/inc/RssHandler.ashx?id=4122236&ripurl=http://www.corren.se/ostergotland/motala/')
- ]
-
- def print_version(self, url):
- url = url.replace("ekonomi/artikel.aspx", "Print.aspx")
- url = url.replace("bostad/artikel.aspx", "Print.aspx")
- url = url.replace("kultur/artikel.aspx", "Print.aspx")
- url = url.replace("motor/artikel.aspx", "Print.aspx")
- url = url.replace("mat-dryck/artikel.aspx", "Print.aspx")
- url = url.replace("sport/artikel.aspx", "Print.aspx")
- url = url.replace("asikter/artikel.aspx", "Print.aspx")
- url = url.replace("mat-dryck/artikel.aspx", "Print.aspx")
- url = url.replace("ostergotland/mjolby/artikel.aspx", "Print.aspx")
- url = url.replace("ostergotland/motala/artikel.aspx", "Print.aspx")
- return url.replace("nyheter/artikel.aspx", "Print.aspx")
+ feeds = [
+ (u'Toppnyheter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/')
+ ,(u'Ekonomi', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/')
+ ,(u'Link\xf6ping', u'http://www.corren.se/inc/RssHandler.ashx?id=4122234')
+ ,(u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223,4122224,4122226,4122227,4122228,4122229,4122230')
+ ]
+ keep_only_tags = [dict(name='div', attrs={'id':'article'}),dict(name='div', attrs={'class':'body'})]
+ remove_tags = [
+ dict(name='ul',attrs={'class':'functions'})
+ ,dict(name='a',attrs={'href':'javascript*'})
+ ,dict(name='div',attrs={'class':'box'})
+ ,dict(name='div',attrs={'class':'functionsbottom'})
+ ]
diff --git a/recipes/dagens_industri.recipe b/recipes/dagens_industri.recipe
new file mode 100644
index 0000000000..c9b60c72b1
--- /dev/null
+++ b/recipes/dagens_industri.recipe
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPLv3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1311450855(BasicNewsRecipe):
+ title = u'Dagens Industri'
+ __author__ = 'Jonas Svensson'
+ description = 'Economy news from Sweden'
+ publisher = 'DI'
+ category = 'news, politics, Sweden'
+ oldest_article = 2
+ delay = 1
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ encoding = 'utf-8'
+ language = 'sv'
+
+ feeds = [(u'DI', u'http://di.se/rss')]
+
+ keep_only_tags = [dict(name='h1', attrs={'id':'ctl00_ExtraWideContentRegion_WideContentRegion_MainRegion_MainContentRegion_MainBodyRegion_headlineNormal'}),dict(name='div', attrs={'id':'articleBody'})]
+
+ remove_tags = [
+ dict(name='div',attrs={'class':'article-actions clear'})
+ ,dict(name='div',attrs={'class':'article-action-popup'})
+ ,dict(name='div',attrs={'class':'header'})
+ ,dict(name='div',attrs={'class':'content clear'})
+ ,dict(name='div',attrs={'id':'articleAdvertisementDiv'})
+ ,dict(name='ul',attrs={'class':'action-list'})
+ ]
diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index c5021cb91d..124820d0a1 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -12,7 +12,7 @@ from datetime import date
class Guardian(BasicNewsRecipe):
- title = u'The Guardian / The Observer'
+ title = u'The Guardian and The Observer'
if date.today().weekday() == 6:
base_url = "http://www.guardian.co.uk/theobserver"
else:
@@ -28,7 +28,7 @@ class Guardian(BasicNewsRecipe):
# List of section titles to ignore
# For example: ['Sport']
ignore_sections = []
-
+
timefmt = ' [%a, %d %b %Y]'
keep_only_tags = [
dict(name='div', attrs={'id':["content","article_header","main-article-info",]}),
@@ -94,7 +94,7 @@ class Guardian(BasicNewsRecipe):
prefix = section_title + ': '
for subsection in s.parent.findAll('a', attrs={'class':'book-section'}):
yield (prefix + self.tag_to_string(subsection), subsection['href'])
-
+
def find_articles(self, url):
soup = self.index_to_soup(url)
div = soup.find('div', attrs={'class':'book-index'})
@@ -115,7 +115,7 @@ class Guardian(BasicNewsRecipe):
'title': title, 'url':url, 'description':desc,
'date' : strftime('%a, %d %b'),
}
-
+
def parse_index(self):
try:
feeds = []
diff --git a/recipes/instapaper.recipe b/recipes/instapaper.recipe
index c6175a783f..d182e556a2 100644
--- a/recipes/instapaper.recipe
+++ b/recipes/instapaper.recipe
@@ -43,7 +43,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
- self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+ self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('div', attrs={'class':'cornerControls'}):
@@ -63,3 +63,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
def populate_article_metadata(self, article, soup, first):
article.title = soup.find('title').contents[0].strip()
+ def postprocess_html(self, soup, first_fetch):
+ for link_tag in soup.findAll(attrs={"id" : "story"}):
+            link_tag.insert(0,'<h1>'+soup.find('title').contents[0].strip()+'</h1>')
+
+ return soup
diff --git a/recipes/irish_times.recipe b/recipes/irish_times.recipe
index 3efcfc6d29..31ccd306e4 100644
--- a/recipes/irish_times.recipe
+++ b/recipes/irish_times.recipe
@@ -1,4 +1,4 @@
-__license__ = 'GPL v3'
+__license__ = 'GPL v3'
__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns"
'''
irishtimes.com
@@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class IrishTimes(BasicNewsRecipe):
title = u'The Irish Times'
encoding = 'ISO-8859-15'
- __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns"
+ __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns"
language = 'en_IE'
timefmt = ' (%A, %B %d, %Y)'
@@ -18,6 +18,7 @@ class IrishTimes(BasicNewsRecipe):
oldest_article = 1.0
max_articles_per_feed = 100
no_stylesheets = True
+ simultaneous_downloads= 5
    r = re.compile('.*(?P<url>http:\/\/(www.irishtimes.com)|(rss.feedsportal.com\/c)\/.*\.html?).*')
remove_tags = [dict(name='div', attrs={'class':'footer'})]
@@ -25,17 +26,17 @@ class IrishTimes(BasicNewsRecipe):
feeds = [
('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
- ('Ireland', 'http://rss.feedsportal.com/c/851/f/10845/index.rss'),
- ('World', 'http://rss.feedsportal.com/c/851/f/10846/index.rss'),
- ('Finance', 'http://rss.feedsportal.com/c/851/f/10847/index.rss'),
- ('Features', 'http://rss.feedsportal.com/c/851/f/10848/index.rss'),
- ('Sport', 'http://rss.feedsportal.com/c/851/f/10849/index.rss'),
- ('Opinion', 'http://rss.feedsportal.com/c/851/f/10850/index.rss'),
- ('Letters', 'http://rss.feedsportal.com/c/851/f/10851/index.rss'),
+ ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
+ ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
+ ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
+ ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
+ ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
+ ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
+ ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
- ('Health', 'http://rss.feedsportal.com/c/851/f/10852/index.rss'),
- ('Education & Parenting', 'http://rss.feedsportal.com/c/851/f/10853/index.rss'),
- ('Motors', 'http://rss.feedsportal.com/c/851/f/10854/index.rss'),
+ ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
+ ('Education & Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
+ ('Motors', 'http://www.irishtimes.com/feeds/rss/newspaper/motors.rss'),
('An Teanga Bheo', 'http://www.irishtimes.com/feeds/rss/newspaper/anteangabheo.rss'),
('Commercial Property', 'http://www.irishtimes.com/feeds/rss/newspaper/commercialproperty.rss'),
('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
@@ -49,10 +50,16 @@ class IrishTimes(BasicNewsRecipe):
def print_version(self, url):
if url.count('rss.feedsportal.com'):
- u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm')
+ #u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm')
+ u = url.find('irishtimes')
+ u = 'http://www.irishtimes.com' + url[u + 12:]
+ u = u.replace('0C', '/')
+ u = u.replace('A', '')
+ u = u.replace('0Bhtml/story01.htm', '_pf.html')
else:
u = url.replace('.html','_pf.html')
return u
def get_article_url(self, article):
return article.link
+
diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py
index 3e2cc4da57..ad56dbcb75 100644
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@@ -11,7 +11,7 @@ defaults.
'''
#: Auto increment series index
-# The algorithm used to assign a new book in an existing series a series number.
+# The algorithm used to assign a book added to an existing series a series number.
# New series numbers assigned using this tweak are always integer values, except
# if a constant non-integer is specified.
# Possible values are:
@@ -27,7 +27,19 @@ defaults.
# series_index_auto_increment = 'next'
# series_index_auto_increment = 'next_free'
# series_index_auto_increment = 16.5
+#
+# Set the use_series_auto_increment_tweak_when_importing tweak to True to
+# use the above values when importing/adding books. If this tweak is set to
+# False (the default) then the series number will be set to 1 if it is not
+# explicitly set during the import. If set to True, then the
+# series index will be set according to the series_index_auto_increment setting.
+# Note that the use_series_auto_increment_tweak_when_importing tweak is used
+# only when a value is not provided during import. If the importing regular
+# expression produces a value for series_index, or if you are reading metadata
+# from books and the import plugin produces a value, then that value will
+# be used irrespective of the setting of the tweak.
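+#
+# For example:
+# use_series_auto_increment_tweak_when_importing = True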
series_index_auto_increment = 'next'
+use_series_auto_increment_tweak_when_importing = False
#: Add separator after completing an author name
# Should the completion separator be append
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index a79078988a..620254b1f5 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -570,7 +570,7 @@ from calibre.devices.teclast.driver import (TECLAST_K3, NEWSMY, IPAPYRUS,
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import (PALMPRE, AVANT, SWEEX, PDNOVEL,
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR,
- TREKSTOR, EEEREADER, NEXTBOOK, ADAM, MOOVYBOOK)
+ TREKSTOR, EEEREADER, NEXTBOOK, ADAM, MOOVYBOOK, COBY)
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
from calibre.devices.bambook.driver import BAMBOOK
@@ -705,7 +705,7 @@ plugins += [
EEEREADER,
NEXTBOOK,
ADAM,
- MOOVYBOOK,
+ MOOVYBOOK, COBY,
ITUNES,
BOEYE_BEX,
BOEYE_BDX,
@@ -1228,17 +1228,6 @@ class StoreEbookscomStore(StoreBase):
formats = ['EPUB', 'LIT', 'MOBI', 'PDF']
affiliate = True
-#class StoreEPubBuyDEStore(StoreBase):
-# name = 'EPUBBuy DE'
-# author = 'Charles Haley'
-# description = u'Bei EPUBBuy.com finden Sie ausschliesslich eBooks im weitverbreiteten EPUB-Format und ohne DRM. So haben Sie die freie Wahl, wo Sie Ihr eBook lesen: Tablet, eBook-Reader, Smartphone oder einfach auf Ihrem PC. So macht eBook-Lesen Spaß!'
-# actual_plugin = 'calibre.gui2.store.stores.epubbuy_de_plugin:EPubBuyDEStore'
-#
-# drm_free_only = True
-# headquarters = 'DE'
-# formats = ['EPUB']
-# affiliate = True
-
class StoreEBookShoppeUKStore(StoreBase):
name = 'ebookShoppe UK'
author = u'Charles Haley'
@@ -1266,16 +1255,7 @@ class StoreEKnigiStore(StoreBase):
headquarters = 'BG'
formats = ['EPUB', 'PDF', 'HTML']
- #affiliate = True
-
-class StoreEpubBudStore(StoreBase):
- name = 'ePub Bud'
- description = 'Well, it\'s pretty much just "YouTube for Children\'s eBooks. A not-for-profit organization devoted to brining self published childrens books to the world.'
- actual_plugin = 'calibre.gui2.store.stores.epubbud_plugin:EpubBudStore'
-
- drm_free_only = True
- headquarters = 'US'
- formats = ['EPUB']
+ affiliate = True
class StoreFeedbooksStore(StoreBase):
name = 'Feedbooks'
@@ -1311,6 +1291,7 @@ class StoreGoogleBooksStore(StoreBase):
headquarters = 'US'
formats = ['EPUB', 'PDF', 'TXT']
+ affiliate = True
class StoreGutenbergStore(StoreBase):
name = 'Project Gutenberg'
@@ -1394,6 +1375,17 @@ class StoreOReillyStore(StoreBase):
headquarters = 'US'
formats = ['APK', 'DAISY', 'EPUB', 'MOBI', 'PDF']
+class StoreOzonRUStore(StoreBase):
+ name = 'OZON.ru'
+ description = u'ebooks from OZON.ru'
+ actual_plugin = 'calibre.gui2.store.stores.ozon_ru_plugin:OzonRUStore'
+ author = 'Roman Mukhin'
+
+ drm_free_only = True
+ headquarters = 'RU'
+ formats = ['TXT', 'PDF', 'DJVU', 'RTF', 'DOC', 'JAR', 'FB2']
+ affiliate = True
+
class StorePragmaticBookshelfStore(StoreBase):
name = 'Pragmatic Bookshelf'
description = u'The Pragmatic Bookshelf\'s collection of programming and tech books avaliable as ebooks.'
@@ -1491,10 +1483,8 @@ plugins += [
StoreEbookNLStore,
StoreEbookscomStore,
StoreEBookShoppeUKStore,
-# StoreEPubBuyDEStore,
StoreEHarlequinStore,
StoreEKnigiStore,
- StoreEpubBudStore,
StoreFeedbooksStore,
StoreFoylesUKStore,
StoreGandalfStore,
@@ -1508,6 +1498,7 @@ plugins += [
StoreNextoStore,
StoreOpenBooksStore,
StoreOReillyStore,
+ StoreOzonRUStore,
StorePragmaticBookshelfStore,
StoreSmashwordsStore,
StoreVirtualoStore,
diff --git a/src/calibre/db/tables.py b/src/calibre/db/tables.py
index b75effff4b..fa7b001851 100644
--- a/src/calibre/db/tables.py
+++ b/src/calibre/db/tables.py
@@ -12,7 +12,7 @@ from datetime import datetime
from dateutil.tz import tzoffset
from calibre.constants import plugins
-from calibre.utils.date import parse_date, local_tz
+from calibre.utils.date import parse_date, local_tz, UNDEFINED_DATE
from calibre.ebooks.metadata import author_to_author_sort
_c_speedup = plugins['speedup'][0]
@@ -29,8 +29,11 @@ def _c_convert_timestamp(val):
if ret is None:
return parse_date(val, as_utc=False)
year, month, day, hour, minutes, seconds, tzsecs = ret
- return datetime(year, month, day, hour, minutes, seconds,
+ try:
+ return datetime(year, month, day, hour, minutes, seconds,
tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
+ except OverflowError:
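+        # Out-of-range dates in the db overflow when converting to the local
+        # time zone; treat them as undefined rather than crashing.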
+ return UNDEFINED_DATE.astimezone(local_tz)
class Table(object):
diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index d26489c42f..a12f37c7eb 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -128,7 +128,7 @@ class ANDROID(USBMS):
'7', 'A956', 'A955', 'A43', 'ANDROID_PLATFORM', 'TEGRA_2',
'MB860', 'MULTI-CARD', 'MID7015A', 'INCREDIBLE', 'A7EB', 'STREAK',
'MB525', 'ANDROID2.3', 'SGH-I997', 'GT-I5800_CARD', 'MB612',
- 'GT-S5830_CARD']
+ 'GT-S5830_CARD', 'GT-S5570_CARD']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py
index 6c5706f039..92fce68f11 100644
--- a/src/calibre/devices/misc.py
+++ b/src/calibre/devices/misc.py
@@ -351,3 +351,29 @@ class MOOVYBOOK(USBMS):
def get_main_ebook_dir(self, for_upload=False):
return 'Books' if for_upload else self.EBOOK_DIR_MAIN
+class COBY(USBMS):
+
+ name = 'COBY MP977 device interface'
+ gui_name = 'COBY'
+ description = _('Communicate with the COBY')
+ author = 'Kovid Goyal'
+ supported_platforms = ['windows', 'osx', 'linux']
+
+ # Ordered list of supported formats
+ FORMATS = ['epub', 'pdf']
+
+ VENDOR_ID = [0x1e74]
+ PRODUCT_ID = [0x7121]
+ BCD = [0x02]
+ VENDOR_NAME = 'USB_2.0'
+ WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'MP977_DRIVER'
+
+ EBOOK_DIR_MAIN = ''
+
+ SUPPORTS_SUB_DIRS = False
+
+ def get_carda_ebook_dir(self, for_upload=False):
+ if for_upload:
+ return 'eBooks'
+ return self.EBOOK_DIR_CARD_A
+
diff --git a/src/calibre/ebooks/metadata/fb2.py b/src/calibre/ebooks/metadata/fb2.py
index 4c47d87717..765ac6d009 100644
--- a/src/calibre/ebooks/metadata/fb2.py
+++ b/src/calibre/ebooks/metadata/fb2.py
@@ -24,10 +24,9 @@ XPath = partial(etree.XPath, namespaces=NAMESPACES)
tostring = partial(etree.tostring, method='text', encoding=unicode)
def get_metadata(stream):
- """ Return fb2 metadata as a L{MetaInformation} object """
+ ''' Return fb2 metadata as a L{MetaInformation} object '''
root = _get_fbroot(stream)
-
book_title = _parse_book_title(root)
authors = _parse_authors(root)
@@ -166,7 +165,7 @@ def _parse_tags(root, mi):
break
def _parse_series(root, mi):
- #calibri supports only 1 series: use the 1-st one
+    # calibre supports only 1 series: use the 1-st one
# pick up sequence but only from 1 secrion in prefered order
# except
xp_ti = '//fb2:title-info/fb2:sequence[1]'
@@ -181,11 +180,12 @@ def _parse_series(root, mi):
def _parse_isbn(root, mi):
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
- # some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
- if ',' in isbn:
- isbn = isbn[:isbn.index(',')]
- if check_isbn(isbn):
- mi.isbn = isbn
+ if isbn:
+ # some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
+ if ',' in isbn:
+ isbn = isbn[:isbn.index(',')]
+ if check_isbn(isbn):
+ mi.isbn = isbn
def _parse_comments(root, mi):
# pick up annotation but only from 1 secrion ; fallback:
@@ -232,4 +232,3 @@ def _get_fbroot(stream):
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
root = etree.fromstring(raw, parser=parser)
return root
-
diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index 186821b0c3..35fd724ddd 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -22,6 +22,7 @@ from calibre.utils.date import parse_date, isoformat
from calibre.utils.localization import get_lang
from calibre import prints, guess_type
from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.config import tweaks
class Resource(object): # {{{
'''
@@ -527,7 +528,12 @@ class OPF(object): # {{{
category = MetadataField('type')
rights = MetadataField('rights')
series = MetadataField('series', is_dc=False)
- series_index = MetadataField('series_index', is_dc=False, formatter=float, none_is=1)
+ if tweaks['use_series_auto_increment_tweak_when_importing']:
+ series_index = MetadataField('series_index', is_dc=False,
+ formatter=float, none_is=None)
+ else:
+ series_index = MetadataField('series_index', is_dc=False,
+ formatter=float, none_is=1)
title_sort = TitleSortField('title_sort', is_dc=False)
rating = MetadataField('rating', is_dc=False, formatter=int)
pubdate = MetadataField('date', formatter=parse_date,
@@ -1024,8 +1030,10 @@ class OPF(object): # {{{
attrib = attrib or {}
attrib['name'] = 'calibre:' + name
name = '{%s}%s' % (self.NAMESPACES['opf'], 'meta')
+ nsmap = dict(self.NAMESPACES)
+ del nsmap['opf']
elem = etree.SubElement(self.metadata, name, attrib=attrib,
- nsmap=self.NAMESPACES)
+ nsmap=nsmap)
elem.tail = '\n'
return elem
diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py
index 97b6d15bc8..a7bcbc5a89 100644
--- a/src/calibre/ebooks/metadata/sources/identify.py
+++ b/src/calibre/ebooks/metadata/sources/identify.py
@@ -22,6 +22,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import utc_tz, as_utc
from calibre.utils.html2text import html2text
from calibre.utils.icu import lower
+from calibre.utils.date import UNDEFINED_DATE
# Download worker {{{
class Worker(Thread):
@@ -490,6 +491,8 @@ def identify(log, abort, # {{{
max_tags = msprefs['max_tags']
for r in results:
r.tags = r.tags[:max_tags]
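+        # A pubdate in or before the UNDEFINED_DATE year is a placeholder,
+        # not a real publication date, so clear it.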
+ if getattr(r.pubdate, 'year', 2000) <= UNDEFINED_DATE.year:
+ r.pubdate = None
if msprefs['swap_author_names']:
for r in results:
diff --git a/src/calibre/ebooks/metadata/sources/isbndb.py b/src/calibre/ebooks/metadata/sources/isbndb.py
index b33a625ca7..31c5e69d65 100644
--- a/src/calibre/ebooks/metadata/sources/isbndb.py
+++ b/src/calibre/ebooks/metadata/sources/isbndb.py
@@ -151,7 +151,7 @@ class ISBNDB(Source):
bl = feed.find('BookList')
if bl is None:
- err = tostring(etree.find('errormessage'))
+ err = tostring(feed.find('errormessage'))
raise ValueError('ISBNDb query failed:' + err)
total_results = int(bl.get('total_results'))
shown_results = int(bl.get('shown_results'))
diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py
index d861e69cbf..1279ba7793 100644
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@@ -12,7 +12,7 @@ from collections import OrderedDict, defaultdict
from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
- get_trailing_data)
+ get_trailing_data, decode_tbs)
from calibre.utils.magick.draw import identify_data
# PalmDB {{{
@@ -73,7 +73,7 @@ class PalmDB(object):
self.ident = self.type + self.creator
if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
raise ValueError('Unknown book ident: %r'%self.ident)
- self.uid_seed = self.raw[68:72]
+ self.uid_seed, = struct.unpack(b'>I', self.raw[68:72])
self.next_rec_list_id = self.raw[72:76]
self.number_of_records, = struct.unpack(b'>H', self.raw[76:78])
@@ -182,6 +182,7 @@ class EXTHHeader(object):
self.records = []
for i in xrange(self.count):
pos = self.read_record(pos)
+ self.records.sort(key=lambda x:x.type)
def read_record(self, pos):
type_, length = struct.unpack(b'>II', self.raw[pos:pos+8])
@@ -290,7 +291,12 @@ class MOBIHeader(object): # {{{
(self.fcis_number, self.fcis_count, self.flis_number,
self.flis_count) = struct.unpack(b'>IIII',
self.raw[200:216])
- self.unknown6 = self.raw[216:240]
+ self.unknown6 = self.raw[216:224]
+ self.srcs_record_index = struct.unpack(b'>I',
+ self.raw[224:228])[0]
+ self.num_srcs_records = struct.unpack(b'>I',
+ self.raw[228:232])[0]
+ self.unknown7 = self.raw[232:240]
self.extra_data_flags = struct.unpack(b'>I',
self.raw[240:244])[0]
self.has_multibytes = bool(self.extra_data_flags & 0b1)
@@ -339,7 +345,7 @@ class MOBIHeader(object): # {{{
ans.append('Huffman record offset: %d'%self.huffman_record_offset)
ans.append('Huffman record count: %d'%self.huffman_record_count)
ans.append('Unknown2: %r'%self.unknown2)
- ans.append('EXTH flags: %r (%s)'%(self.exth_flags, self.has_exth))
+ ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
if self.has_drm_data:
ans.append('Unknown3: %r'%self.unknown3)
ans.append('DRM Offset: %s'%self.drm_offset)
@@ -356,6 +362,9 @@ class MOBIHeader(object): # {{{
ans.append('FLIS number: %d'% self.flis_number)
ans.append('FLIS count: %d'% self.flis_count)
ans.append('Unknown6: %r'% self.unknown6)
+ ans.append('SRCS record index: %d'%self.srcs_record_index)
+ ans.append('Number of SRCS records?: %d'%self.num_srcs_records)
+ ans.append('Unknown7: %r'%self.unknown7)
ans.append(('Extra data flags: %s (has multibyte: %s) '
'(has indexing: %s) (has uncrossable breaks: %s)')%(
bin(self.extra_data_flags), self.has_multibytes,
@@ -399,6 +408,7 @@ class IndexHeader(object): # {{{
def __init__(self, record):
self.record = record
raw = self.record.raw
+ #open('/t/index_header.bin', 'wb').write(raw)
if raw[:4] != b'INDX':
raise ValueError('Invalid Primary Index Record')
@@ -406,7 +416,7 @@ class IndexHeader(object): # {{{
self.unknown1 = raw[8:16]
self.index_type, = struct.unpack('>I', raw[16:20])
self.index_type_desc = {0: 'normal', 2:
- 'inflection'}.get(self.index_type, 'unknown')
+ 'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
self.idxt_start, = struct.unpack('>I', raw[20:24])
self.index_count, = struct.unpack('>I', raw[24:28])
self.index_encoding_num, = struct.unpack('>I', raw[28:32])
@@ -415,12 +425,7 @@ class IndexHeader(object): # {{{
if self.index_encoding == 'unknown':
raise ValueError(
'Unknown index encoding: %d'%self.index_encoding_num)
- self.locale_raw, = struct.unpack(b'>I', raw[32:36])
- langcode = self.locale_raw
- langid = langcode & 0xFF
- sublangid = (langcode >> 10) & 0xFF
- self.language = main_language.get(langid, 'ENGLISH')
- self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
+ self.possibly_language = raw[32:36]
self.num_index_entries, = struct.unpack('>I', raw[36:40])
self.ordt_start, = struct.unpack('>I', raw[40:44])
self.ligt_start, = struct.unpack('>I', raw[44:48])
@@ -480,8 +485,7 @@ class IndexHeader(object): # {{{
a('Number of index records: %d'%self.index_count)
a('Index encoding: %s (%d)'%(self.index_encoding,
self.index_encoding_num))
- a('Index language: %s - %s (%s)'%(self.language, self.sublanguage,
- hex(self.locale_raw)))
+ a('Unknown (possibly language?): %r'%(self.possibly_language))
a('Number of index entries: %d'% self.num_index_entries)
a('ORDT start: %d'%self.ordt_start)
a('LIGT start: %d'%self.ligt_start)
@@ -596,10 +600,14 @@ class IndexEntry(object): # {{{
0x3f : 'article',
}
- def __init__(self, ident, entry_type, raw, cncx, tagx_entries):
+ def __init__(self, ident, entry_type, raw, cncx, tagx_entries, flags=0):
self.index = ident
self.raw = raw
self.tags = []
+ self.entry_type_raw = entry_type
+ self.byte_size = len(raw)
+
+ orig_raw = raw
try:
self.entry_type = self.TYPES[entry_type]
@@ -619,6 +627,27 @@ class IndexEntry(object): # {{{
vals.append(val)
self.tags.append(Tag(tag, vals, self.entry_type, cncx))
+ if flags & 0b10:
+ # Look for optional description and author
+ desc_tag = [t for t in tagx_entries if t.tag == 22]
+ if desc_tag and raw:
+ val, consumed = decint(raw)
+ raw = raw[consumed:]
+ if val:
+ self.tags.append(Tag(desc_tag[0], [val], self.entry_type,
+ cncx))
+ if flags & 0b100:
+ aut_tag = [t for t in tagx_entries if t.tag == 23]
+ if aut_tag and raw:
+ val, consumed = decint(raw)
+ raw = raw[consumed:]
+ if val:
+ self.tags.append(Tag(aut_tag[0], [val], self.entry_type,
+ cncx))
+
+ self.consumed = len(orig_raw) - len(raw)
+ self.trailing_bytes = raw
+
@property
def label(self):
for tag in self.tags:
@@ -669,13 +698,16 @@ class IndexEntry(object): # {{{
return -1
def __str__(self):
- ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
- self.index, self.entry_type, len(self.tags))]
+ ans = ['Index Entry(index=%s, entry_type=%s (%s), length=%d, byte_size=%d)'%(
+ self.index, self.entry_type, bin(self.entry_type_raw)[2:],
+ len(self.tags), self.byte_size)]
for tag in self.tags:
ans.append('\t'+str(tag))
if self.first_child_index != -1:
ans.append('\tNumber of children: %d'%(self.last_child_index -
self.first_child_index + 1))
+ if self.trailing_bytes:
+ ans.append('\tTrailing bytes: %r'%self.trailing_bytes)
return '\n'.join(ans)
# }}}
@@ -690,6 +722,7 @@ class IndexRecord(object): # {{{
def __init__(self, record, index_header, cncx):
self.record = record
raw = self.record.raw
+
if raw[:4] != b'INDX':
raise ValueError('Invalid Primary Index Record')
@@ -713,8 +746,12 @@ class IndexRecord(object): # {{{
for i in range(self.idxt_count):
off, = u(b'>H', indices[i*2:(i+1)*2])
self.index_offsets.append(off-192)
+ rest = indices[(i+1)*2:]
+ if rest.replace(b'\0', ''): # There can be padding null bytes
+ raise ValueError('Extra bytes after IDXT table: %r'%rest)
indxt = raw[192:self.idxt_offset]
+ self.size_of_indxt_block = len(indxt)
self.indices = []
for i, off in enumerate(self.index_offsets):
try:
@@ -723,9 +760,18 @@ class IndexRecord(object): # {{{
next_off = len(indxt)
index, consumed = decode_hex_number(indxt[off:])
entry_type = ord(indxt[off+consumed])
+ d, flags = 1, 0
+ if index_header.index_type == 6:
+ flags = ord(indxt[off+consumed+d])
+ d += 1
+ pos = off+consumed+d
self.indices.append(IndexEntry(index, entry_type,
- indxt[off+consumed+1:next_off], cncx, index_header.tagx_entries))
- index = self.indices[-1]
+ indxt[pos:next_off], cncx,
+ index_header.tagx_entries, flags=flags))
+
+ rest = indxt[pos+self.indices[-1].consumed:]
+ if rest.replace(b'\0', ''): # There can be padding null bytes
+ raise ValueError('Extra bytes after IDXT table: %r'%rest)
def get_parent(self, index):
if index.depth < 1:
@@ -744,14 +790,15 @@ class IndexRecord(object): # {{{
len(w), not bool(w.replace(b'\0', b'')) ))
a('Header length: %d'%self.header_length)
u(self.unknown1)
- a('Header Type: %d'%self.header_type)
+ a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
u(self.unknown2)
- a('IDXT Offset: %d'%self.idxt_offset)
+ a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
+ self.idxt_offset))
a('IDXT Count: %d'%self.idxt_count)
u(self.unknown3)
u(self.unknown4)
a('Index offsets: %r'%self.index_offsets)
- a('\nIndex Entries:')
+ a('\nIndex Entries (%d entries):'%len(self.indices))
for entry in self.indices:
a(str(entry)+'\n')
@@ -797,6 +844,7 @@ class TextRecord(object): # {{{
def __init__(self, idx, record, extra_data_flags, decompress):
self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags)
+ raw_trailing_bytes = record.raw[len(self.raw):]
self.raw = decompress(self.raw)
if 0 in self.trailing_data:
self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0)
@@ -804,6 +852,7 @@ class TextRecord(object): # {{{
self.trailing_data['indexing'] = self.trailing_data.pop(1)
if 2 in self.trailing_data:
self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2)
+ self.trailing_data['raw_bytes'] = raw_trailing_bytes
self.idx = idx
@@ -917,22 +966,27 @@ class TBSIndexing(object): # {{{
ans.append(('\t\tIndex Entry: %d (Parent index: %d, '
'Depth: %d, Offset: %d, Size: %d) [%s]')%(
x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
- def bin3(num):
+ def bin4(num):
ans = bin(num)[2:]
- return '0'*(3-len(ans)) + ans
+ return bytes('0'*(4-len(ans)) + ans)
+
+ def repr_extra(x):
+ return str({bin4(k):v for k, v in extra.iteritems()})
tbs_type = 0
+ is_periodical = self.doc_type in (257, 258, 259)
if len(byts):
- outer, consumed = decint(byts)
+ outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if
+ is_periodical else 3)
byts = byts[consumed:]
- tbs_type = outer & 0b111
- ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type))
- ans.append('Outer Index entry: %d'%(outer >> 3))
- arg1, consumed = decint(byts)
- byts = byts[consumed:]
- ans.append('Unknown (vwi: always 0?): %d'%arg1)
- if self.doc_type in (257, 259): # Hierarchical periodical
- byts, a = self.interpret_periodical(tbs_type, byts)
+ for k in extra:
+ tbs_type |= k
+ ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
+ ans.append('Outermost index: %d'%outermost_index)
+ ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
+ if is_periodical: # Hierarchical periodical
+ byts, a = self.interpret_periodical(tbs_type, byts,
+ dat['geom'][0])
ans += a
if byts:
sbyts = tuple(hex(b)[2:] for b in byts)
@@ -941,159 +995,88 @@ class TBSIndexing(object): # {{{
ans.append('')
return tbs_type, ans
- def interpret_periodical(self, tbs_type, byts):
+ def interpret_periodical(self, tbs_type, byts, record_offset):
ans = []
- def tbs_type_6(byts, psi=None, msg=None, fmsg='Unknown'): # {{{
+ def read_section_transitions(byts, psi=None): # {{{
if psi is None:
- # Assume parent section is 1
+ # Assume previous section is 1
psi = self.get_index(1)
- if msg is None:
- msg = ('Article index at start of record or first article'
- ' index, relative to parent section')
- if byts:
- # byts could be empty
- arg, consumed = decint(byts)
- byts = byts[consumed:]
- flags = (arg & 0b1111)
- ai = (arg >> 4)
- ans.append('%s (fvwi): %d [%d absolute]'%(msg, ai,
- ai+psi.index))
- if flags == 1:
- arg, consumed = decint(byts)
- if arg == 0:
- # EOF of record, otherwise ignore and hope someone else
- # will deal with these bytes
- byts = byts[consumed:]
- ans.append('EOF (vwi: should be 0): %d'%arg)
- elif flags in (4, 5):
- num = byts[0]
- byts = byts[1:]
- ans.append('Number of article nodes in the record (byte): %d'%num)
- if flags == 5:
- arg, consumed = decint(byts)
- byts = byts[consumed:]
- ans.append('%s (vwi)): %d'%(fmsg, arg))
- elif flags == 0:
- pass
- else:
- raise ValueError('Unknown flags: %d'%flags)
- return byts
- # }}}
-
- if tbs_type == 3: # {{{
- arg2, consumed = decint(byts)
- byts = byts[consumed:]
- ans.append('Unknown (vwi: always 0?): %d'%arg2)
-
- arg3, consumed = decint(byts)
- byts = byts[consumed:]
- fsi = arg3 >> 4
- flags = arg3 & 0b1111
- ans.append('First section index (fvwi): %d'%fsi)
- psi = self.get_index(fsi)
- ans.append('Flags (flag: always 0?): %d'%flags)
- if flags == 4:
- ans.append('Number of articles in this section: %d'%byts[0])
- byts = byts[1:]
- elif flags == 0:
- pass
- else:
- raise ValueError('Unknown flags value: %d'%flags)
-
-
- if byts:
- byts = tbs_type_6(byts, psi=psi,
- msg=('First article of ending section, relative to its'
- ' parent\'s index'),
- fmsg=('->Offset from start of record to beginning of'
- ' last starting section'))
while byts:
- # We have a transition not just an opening first section
- psi = self.get_index(psi.index+1)
- arg, consumed = decint(byts)
- off = arg >> 4
+ ai, extra, consumed = decode_tbs(byts)
byts = byts[consumed:]
- flags = arg & 0b1111
- ans.append('Last article of ending section w.r.t. starting'
- ' section offset (fvwi): %d [%d absolute]'%(off,
- psi.index+off))
- ans.append('Flags (always 8?): %d'%flags)
- byts = tbs_type_6(byts, psi=psi)
- if byts:
- # Ended with flag 1,and not EOF, which means there's
- # another section transition in this record
- arg, consumed = decint(byts)
- byts = byts[consumed:]
- ans.append('->Offset from start of record to beginning of '
- 'last starting section: %d'%(arg))
+ if extra.get(0b0010, None) is not None:
+ raise ValueError('Dont know how to interpret flag 0b0010'
+ ' while reading section transitions')
+ if extra.get(0b1000, None) is not None:
+ if len(extra) > 1:
+ raise ValueError('Dont know how to interpret flags'
+ ' %r while reading section transitions'%extra)
+ nsi = self.get_index(psi.index+1)
+ ans.append('Last article in this record of section %d'
+ ' (relative to next section index [%d]): '
+ '%d [%d absolute index]'%(psi.index, nsi.index, ai,
+ ai+nsi.index))
+ psi = nsi
+ continue
+
+ ans.append('First article in this record of section %d'
+ ' (relative to its parent section): '
+ '%d [%d absolute index]'%(psi.index, ai, ai+psi.index))
+
+ num = extra.get(0b0100, None)
+ if num is None:
+ msg = ('The section %d has at most one article'
+ ' in this record')%psi.index
else:
- break
+ msg = ('Number of articles in this record of '
+ 'section %d: %d')%(psi.index, num)
+ ans.append(msg)
- # }}}
+ offset = extra.get(0b0001, None)
+ if offset is not None:
+ if offset == 0:
+ ans.append('This record is spanned by the article:'
+ '%d'%(ai+psi.index))
+ else:
+ ans.append('->Offset to start of next section (%d) from start'
+ ' of record: %d [%d absolute offset]'%(psi.index+1,
+ offset, offset+record_offset))
+ return byts
+ # }}}
- elif tbs_type == 7: # {{{
- # This occurs for records that have no section nodes and
- # whose parent section's index == 1
- ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2]))
- byts = byts[2:]
- arg, consumed = decint(byts)
+ def read_starting_section(byts): # {{{
+ orig = byts
+ si, extra, consumed = decode_tbs(byts)
byts = byts[consumed:]
- ai = arg >> 4
- flags = arg & 0b1111
- ans.append('Article at start of record (fvwi): %d'%ai)
- if flags == 4:
- num = byts[0]
- byts = byts[1:]
- ans.append('Number of articles in record (byte): %d'%num)
- elif flags == 0:
- pass
- elif flags == 1:
- arg, consumed = decint(byts)
- byts = byts[consumed:]
- ans.append('EOF (vwi: should be 0): %d'%arg)
- else:
- raise ValueError('Unknown flags value: %d'%flags)
+ if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
+ raise ValueError('Dont know how to interpret flags %r'
+ ' when reading starting section'%extra)
+ si = self.get_index(si)
+ ans.append('The section at the start of this record is:'
+ ' %d'%si.index)
+ if 0b0100 in extra:
+ num = extra[0b0100]
+ ans.append('The number of articles from the section %d'
+ ' in this record: %d'%(si.index, num))
+ elif 0b0001 in extra:
+ eof = extra[0b0001]
+ if eof != 0:
+ raise ValueError('Unknown eof value %s when reading'
+ ' starting section. All bytes: %r'%(eof, orig))
+ ans.append('This record is spanned by an article from'
+ ' the section: %d'%si.index)
+ return si, byts
# }}}
- elif tbs_type == 6: # {{{
- # This is used for records spanned by an article whose parent
- # section's index == 1 or for the opening record if it contains the
- # periodical start, section 1 start and at least one article. The
- # two cases are distinguished by the flags on the article index
- # vwi.
- unk = byts[0]
- byts = byts[1:]
- ans.append('Unknown (byte: always 2?): %d'%unk)
- byts = tbs_type_6(byts)
- # }}}
+ if tbs_type & 0b0100:
+ # Starting section is the first section
+ ssi = self.get_index(1)
+ else:
+ ssi, byts = read_starting_section(byts)
- elif tbs_type == 2: # {{{
- # This occurs for records with no section nodes and whose parent
- # section's index != 1 (undefined (records before the first
- # section) or > 1)
- # This is also used for records that are spanned by an article
- # whose parent section index > 1. In this case the flags of the
- # vwi referring to the article at the start
- # of the record are set to 1 instead of 4.
- arg, consumed = decint(byts)
- byts = byts[consumed:]
- flags = (arg & 0b1111)
- psi = (arg >> 4)
- ans.append('Parent section index (fvwi): %d'%psi)
- psi = self.get_index(psi)
- ans.append('Flags: %d'%flags)
- if flags == 1:
- arg, consumed = decint(byts)
- byts = byts[consumed:]
- ans.append('Unknown (vwi?: always 0?): %d'%arg)
- byts = tbs_type_6(byts, psi=psi)
- elif flags == 0:
- byts = tbs_type_6(byts, psi=psi)
- else:
- raise ValueError('Unkown flags: %d'%flags)
- # }}}
+ byts = read_section_transitions(byts, ssi)
return byts, ans
diff --git a/src/calibre/ebooks/mobi/tbs_periodicals.rst b/src/calibre/ebooks/mobi/tbs_periodicals.rst
index d770133625..2fa6ec90f3 100644
--- a/src/calibre/ebooks/mobi/tbs_periodicals.rst
+++ b/src/calibre/ebooks/mobi/tbs_periodicals.rst
@@ -3,6 +3,20 @@ Reverse engineering the trailing byte sequences for hierarchical periodicals
In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. All the following information/inferences are from examining the output of kindlegen on a sample periodical. Given the general level of Amazon's incompetence, there are no guarantees that this information is the *best/most complete* way to do TBS indexing.
+Sequence encoding:
+
+0b1000 : Continuation bit
+
+First sequences:
+0b0010 : 80
+0b0011 : 80 80
+0b0110 : 80 2
+0b0111 : 80 2 80
+
+Other sequences:
+0b0101 : 4 1a
+0b0001 : c b1
+
Opening record
----------------
@@ -52,10 +66,60 @@ The text record that contains the opening node for the periodical (depth=0 node
If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record.
+ Starting record with two section transitions::
+
+ Record #1: Starts at: 0 Ends at: 4095
+ Contains: 7 index entries (0 ends, 4 complete, 3 starts)
+ TBS bytes: 86 80 2 c0 b8 c4 3
+ Complete:
+ Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica]
+ Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz]
+ Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 1014) [Max and the Magic Marker for iPad: Review]
+ Index Entry: 7 (Parent index: 2, Depth: 2, Offset: 1961, Size: 1077) [iPad 2 steers itself into home console gaming territory with Real Racing 2 HD]
+ Starts:
+ Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 35372) [j_x's Google reader]
+ Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 10368) [Neowin.net]
+ Index Entry: 8 (Parent index: 2, Depth: 2, Offset: 3038, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware]
+ TBS Type: 110 (6)
+ Outer Index entry: 0
+ Unknown (vwi: always 0?): 0
+ Unknown (byte: always 2?): 2
+ Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
+ Remaining bytes: b8 c4 3
+
+ Starting record with three section transitions::
+
+ Record #1: Starts at: 0 Ends at: 4095
+ Contains: 10 index entries (0 ends, 7 complete, 3 starts)
+ TBS bytes: 86 80 2 c0 b8 c0 b8 c4 4
+ Complete:
+ Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica]
+ Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 316) [Neowin.net]
+ Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz]
+ Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 308) [Max and the Magic Marker for iPad: Review]
+ Index Entry: 7 (Parent index: 3, Depth: 2, Offset: 1263, Size: 760) [OSnews Asks on Interrupts: The Results]
+ Index Entry: 8 (Parent index: 3, Depth: 2, Offset: 2023, Size: 693) [Apple Ditches SAMBA in Favour of Homegrown Replacement]
+ Index Entry: 9 (Parent index: 3, Depth: 2, Offset: 2716, Size: 747) [ITC: Apple's Mobile Products Do Not Violate Nokia Patents]
+ Starts:
+ Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 25320) [j_x's Google reader]
+ Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 1255, Size: 6829) [OSNews]
+ Index Entry: 10 (Parent index: 3, Depth: 2, Offset: 3463, Size: 666) [Transparent Monitor Embedded in Window Glass]
+ TBS Type: 110 (6)
+ Outer Index entry: 0
+ Unknown (vwi: always 0?): 0
+ Unknown (byte: always 2?): 2
+ Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
+ Remaining bytes: b8 c0 b8 c4 4
+
+
+
+
Records with no nodes
------------------------
+subtype = 010
+
These records are spanned by a single article. They are of two types:
1. If the parent section index is 1, TBS type of 6, like this::
@@ -247,7 +311,7 @@ In such a record there is a transition from one section to the next. As such the
Last article of ending section w.r.t. starting section offset (fvwi): 12 [15 absolute]
Flags (always 8?): 8
Article index at start of record or first article index, relative to parent section (fvwi): 13 [16 absolute]
- Number of article nodes in the record (byte): 4
+    Number of article nodes in the record belonging to the last section (byte): 4
Ending record
@@ -274,3 +338,26 @@ Logically, ending records must have at least one article ending, one section end
If the record had only a single article end, the last two bytes would be replaced with: f0
+If the last record has multiple section transitions, it is of type 6 and looks like::
+
+ Record #9: Starts at: 32768 Ends at: 34953
+ Contains: 9 index entries (3 ends, 6 complete, 0 starts)
+ TBS bytes: 86 80 2 1 d0 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
+ Ends:
+ Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 34739) [j_x's Google reader]
+ Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
+ Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe]
+ Complete:
+ Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 316) [Neowin.net]
+ Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 34353, Size: 282) [OSNews]
+ Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 34635, Size: 319) [Slashdot]
+ Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 308) [Max and the Magic Marker for iPad: Review]
+ Index Entry: 16 (Parent index: 3, Depth: 2, Offset: 34361, Size: 274) [OSnews Asks on Interrupts: The Results]
+ Index Entry: 17 (Parent index: 4, Depth: 2, Offset: 34643, Size: 311) [Leonard Nimoy Turns 80]
+ TBS Type: 110 (6)
+ Outer Index entry: 0
+ Unknown (vwi: always 0?): 0
+ Unknown (byte: always 2?): 2
+ Article index at start of record or first article index, relative to parent section (fvwi): 13 [14 absolute]
+ Remaining bytes: 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
+
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index cf03c613f4..4298276bc1 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -11,6 +11,7 @@ import struct
from collections import OrderedDict
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
+from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024
@@ -39,7 +40,10 @@ def encode_number_as_hex(num):
The bytes that follow are simply the hexadecimal representation of the
number.
'''
- num = bytes(hex(num)[2:])
+ num = bytes(hex(num)[2:].upper())
+ nlen = len(num)
+ if nlen % 2 != 0:
+ num = b'0'+num
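+    # num is now an even-length, uppercase hex string (e.g. 10 -> b'0A');
+    # the length byte inserted below completes the encoding.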
ans = bytearray(num)
ans.insert(0, len(num))
return bytes(ans)
@@ -65,11 +69,14 @@ def encint(value, forward=True):
If forward is True the bytes returned are suitable for prepending to the
output buffer, otherwise they must be append to the output buffer.
'''
+ if value < 0:
+ raise ValueError('Cannot encode negative numbers as vwi')
# Encode vwi
byts = bytearray()
while True:
b = value & 0b01111111
value >>= 7 # shift value to the right by 7 bits
+
byts.append(b)
if value == 0:
break
@@ -184,7 +191,7 @@ def encode_trailing_data(raw):
where size is a backwards encoded vwi whose value is the length of the
- entire return bytestring.
+ entire returned bytestring. data is the bytestring passed in as raw.
This is the encoding used for trailing data entries at the end of text
records. See get_trailing_data() for details.
@@ -197,3 +204,131 @@ def encode_trailing_data(raw):
lsize += 1
return raw + encoded
+def encode_fvwi(val, flags, flag_size=4):
+ '''
+ Encode the value val and the flag_size bits from flags as a fvwi. This encoding is
+ used in the trailing byte sequences for indexing. Returns encoded
+ bytestring.
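+
+    For example, encode_fvwi(5, 0b0011) encodes (5 << 4) | 0b0011 == 83 as a vwi.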
+ '''
+ ans = val << flag_size
+ for i in xrange(flag_size):
+ ans |= (flags & (1 << i))
+ return encint(ans)
+
+
+def decode_fvwi(byts, flag_size=4):
+ '''
+ Decode encoded fvwi. Returns number, flags, consumed
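+    For example, the single-byte vwi for 83 decodes to (5, 0b0011, 1).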
+ '''
+ arg, consumed = decint(bytes(byts))
+ val = arg >> flag_size
+ flags = 0
+ for i in xrange(flag_size):
+ flags |= (arg & (1 << i))
+ return val, flags, consumed
+
+
+def decode_tbs(byts, flag_size=4):
+ '''
+    Trailing byte sequences for indexing consist of a series of fvwi numbers.
+    This function reads the fvwi number and its associated flags. It then uses
+ the flags to read any more numbers that belong to the series. The flags are
+ the lowest 4 bits of the vwi (see the encode_fvwi function above).
+
+ Returns the fvwi number, a dictionary mapping flags bits to the associated
+ data and the number of bytes consumed.
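+
+    As implemented below, flag 0b1000 carries no extra data, flag 0b0100 is
+    followed by a single byte, and flags 0b0010 and 0b0001 are each followed
+    by a vwi.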
+ '''
+ byts = bytes(byts)
+ val, flags, consumed = decode_fvwi(byts, flag_size=flag_size)
+ extra = {}
+ byts = byts[consumed:]
+ if flags & 0b1000 and flag_size > 3:
+ extra[0b1000] = True
+ if flags & 0b0010:
+ x, consumed2 = decint(byts)
+ byts = byts[consumed2:]
+ extra[0b0010] = x
+ consumed += consumed2
+ if flags & 0b0100:
+ extra[0b0100] = ord(byts[0])
+ byts = byts[1:]
+ consumed += 1
+ if flags & 0b0001:
+ x, consumed2 = decint(byts)
+ byts = byts[consumed2:]
+ extra[0b0001] = x
+ consumed += consumed2
+ return val, extra, consumed
+
+def encode_tbs(val, extra, flag_size=4):
+ '''
+ Encode the number val and the extra data in the extra dict as an fvwi. See
+ decode_tbs above.
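+
+    Round trip example: decode_tbs(encode_tbs(1, {0b0100: 2})) gives back
+    (1, {0b0100: 2}, 2).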
+ '''
+ flags = 0
+ for flag in extra:
+ flags |= flag
+ ans = encode_fvwi(val, flags, flag_size=flag_size)
+
+ if 0b0010 in extra:
+ ans += encint(extra[0b0010])
+ if 0b0100 in extra:
+ ans += bytes(bytearray([extra[0b0100]]))
+ if 0b0001 in extra:
+ ans += encint(extra[0b0001])
+ return ans
+
+def utf8_text(text):
+ '''
+ Convert a possibly null string to utf-8 bytes, guaranteeing to return a non
+ empty, normalized bytestring.
+ '''
+ if text and text.strip():
+ text = text.strip()
+ if not isinstance(text, unicode):
+ text = text.decode('utf-8', 'replace')
+ text = normalize(text).encode('utf-8')
+ else:
+ text = _('Unknown').encode('utf-8')
+ return text
+
+def align_block(raw, multiple=4, pad=b'\0'):
+ '''
+    Return raw with enough pad bytes appended to ensure its length is a
+    multiple of the given multiple (4 by default).
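+    For example, align_block(b'abcde') -> b'abcde\0\0\0'.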
+ '''
+ extra = len(raw) % multiple
+ if extra == 0: return raw
+ return raw + pad*(multiple - extra)
+
+
+def detect_periodical(toc, log=None):
+ '''
+ Detect if the TOC object toc contains a periodical that conforms to the
+ structure required by kindlegen to generate a periodical.
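+
+    In such a TOC, depth 1 nodes are articles, depth 2 nodes are sections and
+    the depth 3 node is the periodical itself.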
+ '''
+ for node in toc.iterdescendants():
+ if node.depth() == 1 and node.klass != 'article':
+ if log is not None:
+ log.debug(
+ 'Not a periodical: Deepest node does not have '
+ 'class="article"')
+ return False
+ if node.depth() == 2 and node.klass != 'section':
+ if log is not None:
+ log.debug(
+ 'Not a periodical: Second deepest node does not have'
+ ' class="section"')
+ return False
+ if node.depth() == 3 and node.klass != 'periodical':
+ if log is not None:
+ log.debug('Not a periodical: Third deepest node'
+ ' does not have class="periodical"')
+ return False
+ if node.depth() > 3:
+ if log is not None:
+ log.debug('Not a periodical: Has nodes of depth > 3')
+ return False
+ return True
+
+
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index c28b91e63a..d5226f68bd 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -2,6 +2,7 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
+from future_builtins import filter
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -9,33 +10,11 @@ __docformat__ = 'restructuredtext en'
from struct import pack
from cStringIO import StringIO
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
-from calibre.ebooks import normalize
-from calibre.ebooks.mobi.utils import encint
-
-def utf8_text(text):
- '''
- Convert a possibly null string to utf-8 bytes, guaranteeing to return a non
- empty, normalized bytestring.
- '''
- if text and text.strip():
- text = text.strip()
- if not isinstance(text, unicode):
- text = text.decode('utf-8', 'replace')
- text = normalize(text).encode('utf-8')
- else:
- text = _('Unknown').encode('utf-8')
- return text
-
-def align_block(raw, multiple=4, pad=b'\0'):
- '''
- Return raw with enough pad bytes append to ensure its length is a multiple
- of 4.
- '''
- extra = len(raw) % multiple
- if extra == 0: return raw
- return raw + pad*(multiple - extra)
+from calibre.ebooks.mobi.writer2 import RECORD_SIZE
+from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
+ encode_tbs, align_block, utf8_text, detect_periodical)
class CNCX(object): # {{{
@@ -48,22 +27,15 @@ class CNCX(object): # {{{
MAX_STRING_LENGTH = 500
- def __init__(self, toc, opts):
+ def __init__(self, toc, is_periodical):
self.strings = OrderedDict()
- for item in toc:
- if item is self.toc: continue
- label = item.title
- klass = item.klass
- if opts.mobi_periodical:
- if item.description:
- self.strings[item.description] = 0
- if item.author:
- self.string[item.author] = 0
- self.strings[label] = self.strings[klass] = 0
+ for item in toc.iterdescendants(breadth_first=True):
+ self.strings[item.title] = 0
+ if is_periodical:
+ self.strings[item.klass] = 0
self.records = []
-
offset = 0
buf = StringIO()
for key in tuple(self.strings.iterkeys()):
@@ -79,38 +51,677 @@ class CNCX(object): # {{{
self.records.append(buf.getvalue())
buf.truncate(0)
offset = len(self.records) * 0x10000
-
+ buf.write(raw)
self.strings[key] = offset
offset += len(raw)
- buf.write(b'\0') # CNCX must end with zero byte
self.records.append(align_block(buf.getvalue()))
def __getitem__(self, string):
return self.strings[string]
# }}}
-class Indexer(object):
+class IndexEntry(object): # {{{
- def __init__(self, serializer, number_of_text_records, opts, oeb):
+ TAG_VALUES = {
+ 'offset': 1,
+ 'size': 2,
+ 'label_offset': 3,
+ 'depth': 4,
+ 'class_offset': 5,
+ 'parent_index': 21,
+ 'first_child_index': 22,
+ 'last_child_index': 23,
+ }
+ RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()}
+
+ BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,]
+
+ def __init__(self, offset, label_offset, depth=0, class_offset=None):
+ self.offset, self.label_offset = offset, label_offset
+ self.depth, self.class_offset = depth, class_offset
+
+ self.length = 0
+ self.index = 0
+
+ self.parent_index = None
+ self.first_child_index = None
+ self.last_child_index = None
+
+ def __repr__(self):
+ return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
+ ' parent_index=%r)')%(self.offset, self.depth, self.length,
+ self.index, self.parent_index)
+
+ @dynamic_property
+ def size(self):
+ def fget(self): return self.length
+ def fset(self, val): self.length = val
+ return property(fget=fget, fset=fset, doc='Alias for length')
+
+ @classmethod
+ def tagx_block(cls, for_periodical=True):
+ buf = bytearray()
+
+ def add_tag(tag, num_values=1):
+ buf.append(tag)
+ buf.append(num_values)
+ # bitmask
+ buf.append(1 << (cls.BITMASKS.index(tag)))
+ # eof
+ buf.append(0)
+
+ for tag in xrange(1, 5):
+ add_tag(tag)
+
+ if for_periodical:
+ for tag in (5, 21, 22, 23):
+ add_tag(tag)
+
+ # End of TAGX record
+ for i in xrange(3): buf.append(0)
+ buf.append(1)
+
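+        # The assembled block is: b'TAGX', a 4-byte total length, a 4-byte
+        # control byte count, then the 4-byte-per-tag table built above.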
+ header = b'TAGX'
+ header += pack(b'>I', 12+len(buf)) # table length
+ header += pack(b'>I', 1) # control byte count
+
+ return header + bytes(buf)
+
+ @property
+ def next_offset(self):
+ return self.offset + self.length
+
+ @property
+ def tag_nums(self):
+ for i in range(1, 5):
+ yield i
+ for attr in ('class_offset', 'parent_index', 'first_child_index',
+ 'last_child_index'):
+ if getattr(self, attr) is not None:
+ yield self.TAG_VALUES[attr]
+
+ @property
+ def entry_type(self):
+ ans = 0
+ for tag in self.tag_nums:
+ ans |= (1 << self.BITMASKS.index(tag)) # 1 << x == 2**x
+ return ans
+
+ @property
+ def bytestring(self):
+ buf = StringIO()
+ buf.write(encode_number_as_hex(self.index))
+ et = self.entry_type
+ buf.write(bytes(bytearray([et])))
+
+ for tag in self.tag_nums:
+ attr = self.RTAG_MAP[tag]
+ val = getattr(self, attr)
+ buf.write(encint(val))
+
+ ans = buf.getvalue()
+ return ans
+
+# }}}
+
+class TBS(object): # {{{
+
+ '''
+ Take the list of index nodes starting/ending on a record and calculate the
+ trailing byte sequence for the record.
+ '''
+
+ def __init__(self, data, is_periodical, first=False, section_map={},
+ after_first=False):
+ self.section_map = section_map
+ if is_periodical:
+ # The starting bytes.
+ # The value is zero, which I think indicates the periodical
+ # index entry. The values for the various flags seem to be
+ # unused. If the 0b100 flag is present, it means that the record
+ # deals with section 1 (or is the final record with section
+ # transitions).
+ self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
+ self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
+ flag_size=3)
+ self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
+ flag_size=3)
+ self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
+ 0}, flag_size=3)
+
+ if not data:
+ byts = b''
+ if after_first:
+ # This can happen if a record contains only text between
+ # the periodical start and the first section
+ byts = self.type_011
+ self.bytestring = byts
+ else:
+ depth_map = defaultdict(list)
+ for x in ('starts', 'ends', 'completes'):
+ for idx in data[x]:
+ depth_map[idx.depth].append(idx)
+ for l in depth_map.itervalues():
+ l.sort(key=lambda x:x.offset)
+ self.periodical_tbs(data, first, depth_map)
+ else:
+ if not data:
+ self.bytestring = b''
+ else:
+ self.book_tbs(data, first)
+
+ def periodical_tbs(self, data, first, depth_map):
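+ # Written in order: the type bytes, then (when the type does not
+ # already imply it) the parent section info, then one encoded group
+ # per section with articles in this record, or a single entry for a
+ # spanning article.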
+ buf = StringIO()
+
+ has_section_start = (depth_map[1] and
+ set(depth_map[1]).intersection(set(data['starts'])))
+ spanner = data['spans']
+ parent_section_index = -1
+
+ if depth_map[0]:
+ # We have a terminal record
+
+ # Find the first non-periodical node
+ first_node = None
+ for nodes in (depth_map[1], depth_map[2]):
+ for node in nodes:
+ if (first_node is None or (node.offset, node.depth) <
+ (first_node.offset, first_node.depth)):
+ first_node = node
+
+ typ = (self.type_110 if has_section_start else self.type_010)
+
+ # parent_section_index is needed for the last record
+ if first_node is not None and first_node.depth > 0:
+ parent_section_index = (first_node.index if first_node.depth
+ == 1 else first_node.parent_index)
+ else:
+ parent_section_index = max(self.section_map.iterkeys())
+
+ else:
+ # Non terminal record
+
+ if spanner is not None:
+ # record is spanned by a single article
+ parent_section_index = spanner.parent_index
+ typ = (self.type_110 if parent_section_index == 1 else
+ self.type_010)
+ elif not depth_map[1]:
+ # has only article nodes, i.e. spanned by a section
+ parent_section_index = depth_map[2][0].parent_index
+ typ = (self.type_111 if parent_section_index == 1 else
+ self.type_010)
+ else:
+ # has section transitions
+ if depth_map[2]:
+ parent_section_index = depth_map[2][0].parent_index
+ else:
+ parent_section_index = depth_map[1][0].index
+ typ = self.type_011
+
+ buf.write(typ)
+
+ if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
+ # Write starting section information
+ if spanner is None:
+ num_articles = len([a for a in depth_map[2] if a.parent_index
+ == parent_section_index])
+ extra = {}
+ if num_articles > 1:
+ extra = {0b0100: num_articles}
+ else:
+ extra = {0b0001: 0}
+ buf.write(encode_tbs(parent_section_index, extra))
+
+ if spanner is None:
+ articles = depth_map[2]
+ sections = set([self.section_map[a.parent_index] for a in
+ articles])
+ sections = sorted(sections, key=lambda x:x.offset)
+ section_map = {s:[a for a in articles if a.parent_index ==
+ s.index] for s in sections}
+ for i, section in enumerate(sections):
+ # All the articles in this record that belong to this section
+ articles = section_map[section]
+ first_article = articles[0]
+ last_article = articles[-1]
+ num = len(articles)
+
+ try:
+ next_sec = sections[i+1]
+ except IndexError:
+ next_sec = None
+
+ extra = {}
+ if num > 1:
+ extra[0b0100] = num
+ if i == 0 and next_sec is not None:
+ # Write offset to next section from start of record
+ # For some reason kindlegen only writes this offset
+ # for the first section transition. Imitate it.
+ extra[0b0001] = next_sec.offset - data['offset']
+
+ buf.write(encode_tbs(first_article.index-section.index, extra))
+
+ if next_sec is not None:
+ buf.write(encode_tbs(last_article.index-next_sec.index,
+ {0b1000: 0}))
+ else:
+ buf.write(encode_tbs(spanner.index - parent_section_index,
+ {0b0001: 0}))
+
+ self.bytestring = buf.getvalue()
+
+ def book_tbs(self, data, first):
+ self.bytestring = b''
+# }}}
+
+class Indexer(object): # {{{
+
+ def __init__(self, serializer, number_of_text_records,
+ size_of_last_text_record, opts, oeb):
self.serializer = serializer
self.number_of_text_records = number_of_text_records
+ self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
+ size_of_last_text_record)
self.oeb = oeb
self.log = oeb.log
self.opts = opts
- self.cncx = CNCX(oeb.toc, opts)
+ self.is_periodical = detect_periodical(self.oeb.toc, self.log)
+ self.log('Generating MOBI index for a %s'%('periodical' if
+ self.is_periodical else 'book'))
+ self.is_flat_periodical = False
+ if self.is_periodical:
+ periodical_node = iter(oeb.toc).next()
+ sections = tuple(periodical_node)
+ self.is_flat_periodical = len(sections) == 1
self.records = []
- def create_header(self):
- buf = StringIO()
+ self.cncx = CNCX(oeb.toc, self.is_periodical)
- # Ident
+ if self.is_periodical:
+ self.indices = self.create_periodical_index()
+ else:
+ self.indices = self.create_book_index()
+
+ self.records.append(self.create_index_record())
+ self.records.insert(0, self.create_header())
+ self.records.extend(self.cncx.records)
+
+ self.calculate_trailing_byte_sequences()
+
+ def create_index_record(self): # {{{
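+ # The index record is a 192-byte INDX header followed by the
+ # concatenated index entries and an IDXT block holding a two-byte
+ # offset to each entry; the whole record must fit in 0x10000 bytes.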
+ header_length = 192
+ buf = StringIO()
+ indices = self.indices
+
+ # Write index entries
+ offsets = []
+ for i in indices:
+ offsets.append(buf.tell())
+ buf.write(i.bytestring)
+ index_block = align_block(buf.getvalue())
+
+ # Write offsets to index entries as an IDXT block
+ idxt_block = b'IDXT'
+ buf.truncate(0)
+ for offset in offsets:
+ buf.write(pack(b'>H', header_length+offset))
+ idxt_block = align_block(idxt_block + buf.getvalue())
+ body = index_block + idxt_block
+
+ header = b'INDX'
+ buf.truncate(0)
+ buf.write(pack(b'>I', header_length))
+ buf.write(b'\0'*4) # Unknown
+ buf.write(pack(b'>I', 1)) # Header type? Or index record number?
+ buf.write(b'\0'*4) # Unknown
+ # IDXT block offset
+ buf.write(pack(b'>I', header_length + len(index_block)))
+ # Number of index entries
+ buf.write(pack(b'>I', len(offsets)))
+ # Unknown
+ buf.write(b'\xff'*8)
+ # Unknown
+ buf.write(b'\0'*156)
+
+ header += buf.getvalue()
+
+ ans = header + body
+ if len(ans) > 0x10000:
+ raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
+ return ans
+ # }}}
+
+ def create_header(self): # {{{
+ buf = StringIO()
+ tagx_block = IndexEntry.tagx_block(self.is_periodical)
+ header_length = 192
+
+ # Ident 0 - 4
buf.write(b'INDX')
- # Header length
- buf.write(pack(b'>I', 192))
+ # Header length 4 - 8
+ buf.write(pack(b'>I', header_length))
- # Index type: 0 - normal, 2 - inflection
+ # Unknown 8-16
+ buf.write(b'\0'*8)
+
+ # Index type: 0 - normal, 2 - inflection 16 - 20
buf.write(pack(b'>I', 2))
+
+ # IDXT offset 20-24
+ buf.write(pack(b'>I', 0)) # Filled in later
+
+ # Number of index records 24-28
+ buf.write(pack(b'>I', len(self.records)))
+
+ # Index Encoding 28-32
+ buf.write(pack(b'>I', 65001)) # utf-8
+
+ # Unknown 32-36
+ buf.write(b'\xff'*4)
+
+ # Number of index entries 36-40
+ buf.write(pack(b'>I', len(self.indices)))
+
+ # ORDT offset 40-44
+ buf.write(pack(b'>I', 0))
+
+ # LIGT offset 44-48
+ buf.write(pack(b'>I', 0))
+
+ # Number of LIGT entries 48-52
+ buf.write(pack(b'>I', 0))
+
+ # Number of CNCX records 52-56
+ buf.write(pack(b'>I', len(self.cncx.records)))
+
+ # Unknown 56-180
+ buf.write(b'\0'*124)
+
+ # TAGX offset 180-184
+ buf.write(pack(b'>I', header_length))
+
+ # Unknown 184-192
+ buf.write(b'\0'*8)
+
+ # TAGX block
+ buf.write(tagx_block)
+
+ num = len(self.indices)
+
+ # The index of the last entry in the NCX
+ buf.write(encode_number_as_hex(num-1))
+
+ # The number of entries in the NCX
+ buf.write(pack(b'>H', num))
+
+ # Padding
+ pad = (4 - (buf.tell()%4))%4
+ if pad:
+ buf.write(b'\0'*pad)
+
+ idxt_offset = buf.tell()
+
+ buf.write(b'IDXT')
+ buf.write(pack(b'>H', header_length + len(tagx_block)))
+ buf.write(b'\0')
+ buf.seek(20)
+ buf.write(pack(b'>I', idxt_offset))
+
+ return align_block(buf.getvalue())
+ # }}}
+
+ def create_book_index(self): # {{{
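+ # Book indices are flat: one IndexEntry per unique TOC href, sorted
+ # by offset, with zero-length entries dropped and lengths recomputed
+ # to close the resulting gaps.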
+ indices = []
+ seen = set()
+ id_offsets = self.serializer.id_offsets
+
+ for node in self.oeb.toc.iterdescendants():
+ try:
+ offset = id_offsets[node.href]
+ label = self.cncx[node.title]
+ except KeyError:
+ self.log.warn('TOC item %s not found in document'%node.href)
+ continue
+ if offset in seen:
+ continue
+ seen.add(offset)
+ index = IndexEntry(offset, label)
+ indices.append(index)
+
+ indices.sort(key=lambda x:x.offset)
+
+ # Set lengths
+ for i, index in enumerate(indices):
+ try:
+ next_offset = indices[i+1].offset
+ except IndexError:
+ next_offset = self.serializer.body_end_offset
+ index.length = next_offset - index.offset
+
+ # Remove empty nodes
+ indices = [i for i in indices if i.length > 0]
+
+ # Set index values
+ for i, index in enumerate(indices):
+ index.index = i
+
+ # Set lengths again to close up any gaps left by filtering
+ for i, index in enumerate(indices):
+ try:
+ next_offset = indices[i+1].offset
+ except IndexError:
+ next_offset = self.serializer.body_end_offset
+ index.length = next_offset - index.offset
+
+ return indices
+
+ # }}}
+
+ def create_periodical_index(self): # {{{
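+ # Periodical indices form a three-level tree: one depth-0 periodical
+ # node, depth-1 section nodes and depth-2 article nodes, flattened
+ # breadth-first at the end of this method.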
+ periodical_node = iter(self.oeb.toc).next()
+ periodical_node_offset = self.serializer.body_start_offset
+ periodical_node_size = (self.serializer.body_end_offset -
+ periodical_node_offset)
+
+ normalized_sections = []
+
+ id_offsets = self.serializer.id_offsets
+
+ periodical = IndexEntry(periodical_node_offset,
+ self.cncx[periodical_node.title],
+ class_offset=self.cncx[periodical_node.klass])
+ periodical.length = periodical_node_size
+ periodical.first_child_index = 1
+
+ seen_sec_offsets = set()
+ seen_art_offsets = set()
+
+ for sec in periodical_node:
+ normalized_articles = []
+ try:
+ offset = id_offsets[sec.href]
+ label = self.cncx[sec.title]
+ klass = self.cncx[sec.klass]
+ except KeyError:
+ continue
+ if offset in seen_sec_offsets:
+ continue
+ seen_sec_offsets.add(offset)
+ section = IndexEntry(offset, label, class_offset=klass, depth=1)
+ section.parent_index = 0
+ for art in sec:
+ try:
+ offset = id_offsets[art.href]
+ label = self.cncx[art.title]
+ klass = self.cncx[art.klass]
+ except KeyError:
+ continue
+ if offset in seen_art_offsets:
+ continue
+ seen_art_offsets.add(offset)
+ article = IndexEntry(offset, label, class_offset=klass,
+ depth=2)
+ normalized_articles.append(article)
+ if normalized_articles:
+ normalized_articles.sort(key=lambda x:x.offset)
+ normalized_sections.append((section, normalized_articles))
+
+ normalized_sections.sort(key=lambda x:x[0].offset)
+
+ # Set lengths
+ for s, x in enumerate(normalized_sections):
+ sec, normalized_articles = x
+ try:
+ sec.length = normalized_sections[s+1][0].offset - sec.offset
+ except IndexError:
+ sec.length = self.serializer.body_end_offset - sec.offset
+ for i, art in enumerate(normalized_articles):
+ try:
+ art.length = normalized_articles[i+1].offset - art.offset
+ except IndexError:
+ art.length = sec.offset + sec.length - art.offset
+
+ # Filter
+ for i, x in list(enumerate(normalized_sections)):
+ sec, normalized_articles = x
+ normalized_articles = list(filter(lambda x: x.length > 0,
+ normalized_articles))
+ normalized_sections[i] = (sec, normalized_articles)
+
+ normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
+ normalized_sections))
+
+ # Set indices
+ i = 0
+ for sec, articles in normalized_sections:
+ i += 1
+ sec.index = i
+ sec.parent_index = 0
+
+ for sec, articles in normalized_sections:
+ for art in articles:
+ i += 1
+ art.index = i
+ art.parent_index = sec.index
+
+ for sec, normalized_articles in normalized_sections:
+ sec.first_child_index = normalized_articles[0].index
+ sec.last_child_index = normalized_articles[-1].index
+
+ # Set lengths again to close up any gaps left by filtering
+ for s, x in enumerate(normalized_sections):
+ sec, articles = x
+ try:
+ next_offset = normalized_sections[s+1][0].offset
+ except IndexError:
+ next_offset = self.serializer.body_end_offset
+ sec.length = next_offset - sec.offset
+
+ for a, art in enumerate(articles):
+ try:
+ next_offset = articles[a+1].offset
+ except IndexError:
+ next_offset = sec.next_offset
+ art.length = next_offset - art.offset
+
+ # Sanity check
+ for s, x in enumerate(normalized_sections):
+ sec, articles = x
+ try:
+ next_sec = normalized_sections[s+1][0]
+ except IndexError:
+ if (sec.length == 0 or sec.next_offset !=
+ self.serializer.body_end_offset):
+ raise ValueError('Invalid section layout')
+ else:
+ if next_sec.offset != sec.next_offset or sec.length == 0:
+ raise ValueError('Invalid section layout')
+ for a, art in enumerate(articles):
+ try:
+ next_art = articles[a+1]
+ except IndexError:
+ if (art.length == 0 or art.next_offset !=
+ sec.next_offset):
+ raise ValueError('Invalid article layout')
+ else:
+ if art.length == 0 or art.next_offset != next_art.offset:
+ raise ValueError('Invalid article layout')
+
+ # Flatten
+ indices = [periodical]
+ for sec, articles in normalized_sections:
+ indices.append(sec)
+ periodical.last_child_index = sec.index
+
+ for sec, articles in normalized_sections:
+ for a in articles:
+ indices.append(a)
+
+ return indices
+ # }}}
+
+ # TBS {{{
+ def calculate_trailing_byte_sequences(self):
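+ # Classify every index node as starting, ending, completing or
+ # spanning each text record, then let TBS compute the trailing
+ # bytes appended to that record.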
+ self.tbs_map = {}
+ found_node = False
+ sections = [i for i in self.indices if i.depth == 1]
+ section_map = OrderedDict((i.index, i) for i in
+ sorted(sections, key=lambda x:x.offset))
+
+ deepest = max(i.depth for i in self.indices)
+
+ for i in xrange(self.number_of_text_records):
+ offset = i * RECORD_SIZE
+ next_offset = offset + RECORD_SIZE
+ data = {'ends':[], 'completes':[], 'starts':[],
+ 'spans':None, 'offset':offset, 'record_number':i+1}
+
+ for index in self.indices:
+ if index.offset >= next_offset:
+ # Node starts after current record
+ if index.depth == deepest:
+ break
+ else:
+ continue
+ if index.next_offset <= offset:
+ # Node ends before current record
+ continue
+ if index.offset >= offset:
+ # Node starts in current record
+ if index.next_offset <= next_offset:
+ # Node ends in current record
+ data['completes'].append(index)
+ else:
+ data['starts'].append(index)
+ else:
+ # Node starts before current record
+ if index.next_offset <= next_offset:
+ # Node ends in current record
+ data['ends'].append(index)
+ elif index.depth == deepest:
+ data['spans'] = index
+
+ if (data['ends'] or data['completes'] or data['starts'] or
+ data['spans'] is not None):
+ self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
+ found_node, section_map=section_map)
+ found_node = True
+ else:
+ self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
+ after_first=found_node, section_map=section_map)
+
+ def get_trailing_byte_sequence(self, num):
+ return self.tbs_map[num].bytestring
+ # }}}
+
+# }}}
+
diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py
index 088326a876..e3f4081670 100644
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@@ -20,6 +20,7 @@ from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
from calibre.ebooks.mobi.utils import (rescale_image, encint,
encode_trailing_data)
+from calibre.ebooks.mobi.writer2.indexer import Indexer
EXTH_CODES = {
'creator': 100,
@@ -28,7 +29,6 @@ EXTH_CODES = {
'identifier': 104,
'subject': 105,
'pubdate': 106,
- 'date': 106,
'review': 107,
'contributor': 108,
'rights': 109,
@@ -54,6 +54,7 @@ class MobiWriter(object):
self.last_text_record_idx = 1
def __call__(self, oeb, path_or_stream):
+ self.log = oeb.log
if hasattr(path_or_stream, 'write'):
return self.dump_stream(oeb, path_or_stream)
with open(path_or_stream, 'w+b') as stream:
@@ -87,6 +88,25 @@ class MobiWriter(object):
# Indexing {{{
def generate_index(self):
self.primary_index_record_idx = None
+ try:
+ self.indexer = Indexer(self.serializer, self.last_text_record_idx,
+ len(self.records[self.last_text_record_idx]),
+ self.opts, self.oeb)
+ except Exception:
+ self.log.exception('Failed to generate MOBI index:')
+ else:
+ self.primary_index_record_idx = len(self.records)
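+ # Append the computed trailing byte sequence to every text record,
+ # then add the index records themselves.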
+ for i in xrange(len(self.records)):
+ if i == 0: continue
+ tbs = self.indexer.get_trailing_byte_sequence(i)
+ self.records[i] += encode_trailing_data(tbs)
+ self.records.extend(self.indexer.records)
+
+ @property
+ def is_periodical(self):
+ return (self.primary_index_record_idx is not None and
+ self.indexer.is_periodical)
+
# }}}
def write_uncrossable_breaks(self): # {{{
@@ -178,7 +198,6 @@ class MobiWriter(object):
self.serializer = Serializer(self.oeb, self.images,
write_page_breaks_after_item=self.write_page_breaks_after_item)
text = self.serializer()
- self.content_length = len(text)
self.text_length = len(text)
text = StringIO(text)
nrecords = 0
@@ -186,22 +205,16 @@ class MobiWriter(object):
if self.compression != UNCOMPRESSED:
self.oeb.logger.info(' Compressing markup content...')
- data, overlap = self.read_text_record(text)
-
- while len(data) > 0:
+ while text.tell() < self.text_length:
+ data, overlap = self.read_text_record(text)
if self.compression == PALMDOC:
data = compress_doc(data)
- record = StringIO()
- record.write(data)
- self.records.append(record.getvalue())
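+ # Append the multibyte character overlap, followed by one byte
+ # recording how many overlap bytes were added.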
+ data += overlap
+ data += pack(b'>B', len(overlap))
+
+ self.records.append(data)
nrecords += 1
- data, overlap = self.read_text_record(text)
-
- # Write information about the mutibyte character overlap, if any
- record.write(overlap)
- record.write(pack(b'>B', len(overlap)))
-
self.last_text_record_idx = nrecords
@@ -262,10 +275,19 @@ class MobiWriter(object):
exth = self.build_exth()
last_content_record = len(self.records) - 1
- # EOF record
- self.records.append('\xE9\x8E\x0D\x0A')
+ # FCIS/FLIS (seem to serve no purpose)
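+ # (FLIS is a fixed 36-byte block; FCIS embeds the total text length
+ # at byte offset 20; both appear to be ignored by readers)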
+ flis_number = len(self.records)
+ self.records.append(
+ b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
+ b'\xff'*4)
+ fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
+ fcis += pack(b'>I', self.text_length)
+ fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+ fcis_number = len(self.records)
+ self.records.append(fcis)
- self.generate_end_records()
+ # EOF record
+ self.records.append(b'\xE9\x8E\x0D\x0A')
record0 = StringIO()
# The MOBI Header
@@ -295,8 +317,15 @@ class MobiWriter(object):
# 0x10 - 0x13 : UID
# 0x14 - 0x17 : Generator version
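+ # Book type: 0x002 is a plain book; 0x102 and 0x103 appear to be
+ # the news-feed and news-magazine periodical types, apparently
+ # matching kindlegen output (not from an official spec)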
+ bt = 0x002
+ if self.primary_index_record_idx is not None:
+ if self.indexer.is_flat_periodical:
+ bt = 0x102
+ elif self.indexer.is_periodical:
+ bt = 0x103
+
record0.write(pack(b'>IIIII',
- 0xe8, 0x002, 65001, uid, 6))
+ 0xe8, bt, 65001, uid, 6))
# 0x18 - 0x1f : Unknown
record0.write(b'\xff' * 8)
@@ -325,7 +354,8 @@ class MobiWriter(object):
# 0x58 - 0x5b : Format version
# 0x5c - 0x5f : First image record number
record0.write(pack(b'>II',
- 6, self.first_image_record if self.first_image_record else 0))
+ 6, self.first_image_record if self.first_image_record else
+ len(self.records)-1))
# 0x60 - 0x63 : First HUFF/CDIC record number
# 0x64 - 0x67 : Number of HUFF/CDIC records
@@ -334,7 +364,12 @@ class MobiWriter(object):
record0.write(b'\0' * 16)
# 0x70 - 0x73 : EXTH flags
- record0.write(pack(b'>I', 0x50))
+ # Bit 6 (0b1000000) being set indicates the presence of an EXTH header
+ # The purpose of the other bits is unknown
+ exth_flags = 0b1011000
+ if self.is_periodical:
+ exth_flags |= 0b1000
+ record0.write(pack(b'>I', exth_flags))
# 0x74 - 0x93 : Unknown
record0.write(b'\0' * 32)
@@ -359,13 +394,13 @@ class MobiWriter(object):
record0.write(b'\0\0\0\x01')
# 0xb8 - 0xbb : FCIS record number
- record0.write(pack(b'>I', 0xffffffff))
+ record0.write(pack(b'>I', fcis_number))
# 0xbc - 0xbf : Unknown (FCIS record count?)
- record0.write(pack(b'>I', 0xffffffff))
+ record0.write(pack(b'>I', 1))
# 0xc0 - 0xc3 : FLIS record number
- record0.write(pack(b'>I', 0xffffffff))
+ record0.write(pack(b'>I', flis_number))
# 0xc4 - 0xc7 : Unknown (FLIS record count?)
record0.write(pack(b'>I', 1))
@@ -457,25 +492,33 @@ class MobiWriter(object):
nrecs += 1
# Write cdetype
- if not self.opts.mobi_periodical:
+ if not self.is_periodical:
data = b'EBOK'
exth.write(pack(b'>II', 501, len(data)+8))
exth.write(data)
nrecs += 1
# Add a publication date entry
- if oeb.metadata['date'] != [] :
+ if oeb.metadata['date']:
datestr = str(oeb.metadata['date'][0])
- elif oeb.metadata['timestamp'] != [] :
+ elif oeb.metadata['timestamp']:
datestr = str(oeb.metadata['timestamp'][0])
if datestr is not None:
+ datestr = bytes(datestr)
+ datestr = datestr.replace(b'+00:00', b'Z')
exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
else:
raise NotImplementedError("missing date or timestamp needed for mobi_periodical")
+ # Write the same creator info as kindlegen 1.2
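+ # (EXTH records 204-207 appear to hold the creator software id and
+ # its major/minor/build version numbers)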
+ for code, val in [(204, 202), (205, 1), (206, 2), (207, 33307)]:
+ exth.write(pack(b'>II', code, 12))
+ exth.write(pack(b'>I', val))
+ nrecs += 1
+
if (oeb.metadata.cover and
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
id = unicode(oeb.metadata.cover[0])
diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py
index d6878bee4a..881937ce73 100644
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -143,6 +143,7 @@ class Serializer(object):
spine.extend([item for item in self.oeb.spine if not item.linear])
for item in spine:
self.serialize_item(item)
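+ # Record where the spine content ends; the Indexer uses this as
+ # body_end_offset when computing index entry lengths.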
+ self.body_end_offset = buf.tell()
buf.write(b'