From 47b9f6dcfda02b911fced64654e400a1d013dcb3 Mon Sep 17 00:00:00 2001 From: Alex Stanev Date: Wed, 20 Jul 2011 17:37:01 +0300 Subject: [PATCH 01/19] Store plugin for e-knigi.net --- src/calibre/customize/builtins.py | 11 +++ .../gui2/store/stores/eknigi_plugin.py | 83 +++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 src/calibre/gui2/store/stores/eknigi_plugin.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 91e81bd46f..49865c0f19 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -1258,6 +1258,16 @@ class StoreEHarlequinStore(StoreBase): formats = ['EPUB', 'PDF'] affiliate = True +class StoreEKnigiStore(StoreBase): + name = u'еКниги' + author = 'Alex Stanev' + description = u'Онлайн книжарница за електронни книги и аудио риалити романи' + actual_plugin = 'calibre.gui2.store.stores.eknigi_plugin:eKnigiStore' + + headquarters = 'BG' + formats = ['EPUB', 'PDF', 'HTML'] + affiliate = True + class StoreEpubBudStore(StoreBase): name = 'ePub Bud' description = 'Well, it\'s pretty much just "YouTube for Children\'s eBooks. A not-for-profit organization devoted to brining self published childrens books to the world.' @@ -1483,6 +1493,7 @@ plugins += [ StoreEBookShoppeUKStore, # StoreEPubBuyDEStore, StoreEHarlequinStore, + StoreEKnigiStore, StoreEpubBudStore, StoreFeedbooksStore, StoreFoylesUKStore, diff --git a/src/calibre/gui2/store/stores/eknigi_plugin.py b/src/calibre/gui2/store/stores/eknigi_plugin.py new file mode 100644 index 0000000000..08d9418d51 --- /dev/null +++ b/src/calibre/gui2/store/stores/eknigi_plugin.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- + +from __future__ import (unicode_literals, division, absolute_import, print_function) + +__license__ = 'GPL 3' +__copyright__ = '2011, Alex Stanev ' +__docformat__ = 'restructuredtext en' + +import urllib2 +from contextlib import closing + +from lxml import html + +from PyQt4.Qt import QUrl + +from calibre import browser, url_slash_cleaner +from calibre.gui2 import open_url +from calibre.gui2.store import StorePlugin +from calibre.gui2.store.basic_config import BasicStoreConfig +from calibre.gui2.store.search_result import SearchResult +from calibre.gui2.store.web_store_dialog import WebStoreDialog + +class eKnigiStore(BasicStoreConfig, StorePlugin): + + def open(self, parent=None, detail_item=None, external=False): + url = 'http://e-knigi.net/?amigosid=22' + aff_suffix = '&amigosid=22' + + if external or self.config.get('open_external', False): + if detail_item: + url = detail_item + aff_suffix + open_url(QUrl(url_slash_cleaner(url))) + else: + detail_url = None + if detail_item: + url = detail_item + aff_suffix + d = WebStoreDialog(self.gui, url, parent, detail_url) + d.setWindowTitle(self.name) + d.set_tags(self.config.get('tags', '')) + d.exec_() + + def search(self, query, max_results=10, timeout=60): + base_url = 'http://e-knigi.net' + url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&limitstart=0&limit=' + str(max_results) + '&keyword=' + urllib2.quote(query) + + br = browser() + + counter = max_results + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read()) + + # if the store finds only one product, it opens directly detail view + for data in doc.xpath('//div[@class="prod_details"]'): + s = SearchResult() + s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip() + s.title = 
''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip() + s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip() + s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip() + s.detail_item = url + s.drm = SearchResult.DRM_UNLOCKED + + yield s + return + + # search in store results + for data in doc.xpath('//div[@class="browseProductContainer"]'): + if counter <= 0: + break + id = ''.join(data.xpath('.//a[1]/@href')).strip() + if not id: + continue + + counter -= 1 + + s = SearchResult() + s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip() + s.title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip() + s.author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '') + s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip() + s.detail_item = base_url + id + s.drm = SearchResult.DRM_UNLOCKED + + yield s From ea0301816c27ea89c73845992897804e8fe053d1 Mon Sep 17 00:00:00 2001 From: Alex Stanev Date: Wed, 20 Jul 2011 17:45:00 +0300 Subject: [PATCH 02/19] Code cleanup --- .../gui2/store/stores/chitanka_plugin.py | 62 +++++-------------- 1 file changed, 16 insertions(+), 46 deletions(-) diff --git a/src/calibre/gui2/store/stores/chitanka_plugin.py b/src/calibre/gui2/store/stores/chitanka_plugin.py index 15b2151a4e..f296b6acbc 100644 --- a/src/calibre/gui2/store/stores/chitanka_plugin.py +++ b/src/calibre/gui2/store/stores/chitanka_plugin.py @@ -55,36 +55,21 @@ class ChitankaStore(BasicStoreConfig, StorePlugin): if counter <= 0: break - id = ''.join(data.xpath('.//a[@class="booklink"]/@href')) + id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip() if not id: continue - cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')) - title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')) - author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')) - fb2 = ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')) - epub = ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')) - txt = ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')) - - # remove .zip extensions - if fb2.find('.zip') != -1: - fb2 = fb2[:fb2.find('.zip')] - if epub.find('.zip') != -1: - epub = epub[:epub.find('.zip')] - if txt.find('.zip') != -1: - txt = txt[:txt.find('.zip')] - counter -= 1 s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.detail_item = id.strip() + s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip() + s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip() + s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip() + s.detail_item = id s.drm = SearchResult.DRM_UNLOCKED - s.downloads['FB2'] = base_url + fb2.strip() - s.downloads['EPUB'] = base_url + epub.strip() - s.downloads['TXT'] = base_url + txt.strip() + s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '') + s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '') + s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '') s.formats = 'FB2, EPUB, TXT, SFB' yield s @@ -106,35 +91,20 @@ class ChitankaStore(BasicStoreConfig, StorePlugin): if counter <= 0: break - id = 
''.join(data.xpath('.//a[@class="booklink"]/@href')) + id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip() if not id: continue - cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')) - title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')) - author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')) - fb2 = ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')) - epub = ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')) - txt = ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')) - - # remove .zip extensions - if fb2.find('.zip') != -1: - fb2 = fb2[:fb2.find('.zip')] - if epub.find('.zip') != -1: - epub = epub[:epub.find('.zip')] - if txt.find('.zip') != -1: - txt = txt[:txt.find('.zip')] - counter -= 1 s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.detail_item = id.strip() + s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip() + s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip() + s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip() + s.detail_item = id s.drm = SearchResult.DRM_UNLOCKED - s.downloads['FB2'] = base_url + fb2.strip() - s.downloads['EPUB'] = base_url + epub.strip() - s.downloads['TXT'] = base_url + txt.strip() + s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '') + s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '') + s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '') s.formats = 'FB2, EPUB, TXT, SFB' yield s From f45c317a69e7ae091aa9f4d5196fe312e988ed75 Mon Sep 17 00:00:00 2001 From: Alex Stanev Date: Fri, 22 Jul 2011 17:42:03 +0300 Subject: [PATCH 03/19] Use Kovid's affiliate id 30% of the time --- src/calibre/gui2/store/stores/eknigi_plugin.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/store/stores/eknigi_plugin.py b/src/calibre/gui2/store/stores/eknigi_plugin.py index ea29b43d5f..b2f5f170b6 100644 --- a/src/calibre/gui2/store/stores/eknigi_plugin.py +++ b/src/calibre/gui2/store/stores/eknigi_plugin.py @@ -6,6 +6,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, Alex Stanev ' __docformat__ = 'restructuredtext en' +import random import urllib2 from contextlib import closing @@ -23,10 +24,12 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog class eKnigiStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): - #url = 'http://e-knigi.net/?amigosid=22' - #aff_suffix = '&amigosid=22' - url = 'http://e-knigi.net' - aff_suffix = '' + # Use Kovid's affiliate id 30% of the time + if random.randint(1, 10) in (1, 2, 3): + aff_suffix = '&amigosid=23' + else: + aff_suffix = '&amigosid=22' + url = 'http://e-knigi.net/?' + aff_suffix[1:] if external or self.config.get('open_external', False): if detail_item: From 16ad2c9f02795042a01d7214508b20c464c95f2c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 23 Jul 2011 12:07:41 -0600 Subject: [PATCH 04/19] ... 
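A side note on the affiliate split introduced in PATCH 03 above: the test random.randint(1, 10) in (1, 2, 3) selects Kovid's id (amigosid=23) with probability 3/10, which matches the policy quoted in the StorePlugin docstring in PATCH 05 below (the author's id should be used 70% of the time). Purely as illustration, the same split could be written as:

    import random

    # 30% of the time use Kovid's affiliate id, otherwise the author's.
    # The two amigosid values are the ones used in the eknigi plugin above;
    # the variable name aff_suffix matches that code.
    aff_suffix = '&amigosid=23' if random.random() < 0.3 else '&amigosid=22'
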
--- recipes/guardian.recipe | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe index c5021cb91d..124820d0a1 100644 --- a/recipes/guardian.recipe +++ b/recipes/guardian.recipe @@ -12,7 +12,7 @@ from datetime import date class Guardian(BasicNewsRecipe): - title = u'The Guardian / The Observer' + title = u'The Guardian and The Observer' if date.today().weekday() == 6: base_url = "http://www.guardian.co.uk/theobserver" else: @@ -28,7 +28,7 @@ class Guardian(BasicNewsRecipe): # List of section titles to ignore # For example: ['Sport'] ignore_sections = [] - + timefmt = ' [%a, %d %b %Y]' keep_only_tags = [ dict(name='div', attrs={'id':["content","article_header","main-article-info",]}), @@ -94,7 +94,7 @@ class Guardian(BasicNewsRecipe): prefix = section_title + ': ' for subsection in s.parent.findAll('a', attrs={'class':'book-section'}): yield (prefix + self.tag_to_string(subsection), subsection['href']) - + def find_articles(self, url): soup = self.index_to_soup(url) div = soup.find('div', attrs={'class':'book-index'}) @@ -115,7 +115,7 @@ class Guardian(BasicNewsRecipe): 'title': title, 'url':url, 'description':desc, 'date' : strftime('%a, %d %b'), } - + def parse_index(self): try: feeds = [] From 3f226c85bbb52cd11b631c88641b13137e82436a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 23 Jul 2011 16:44:35 -0600 Subject: [PATCH 05/19] Fix #815224 (UnicodeEncodeError on start due to store StoreChitankaStore name value) --- src/calibre/gui2/store/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/store/__init__.py b/src/calibre/gui2/store/__init__.py index d58ccbda84..ae42d82032 100644 --- a/src/calibre/gui2/store/__init__.py +++ b/src/calibre/gui2/store/__init__.py @@ -6,6 +6,8 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' +from calibre.utils.filenames import ascii_filename + class StorePlugin(object): # {{{ ''' A plugin representing an online ebook repository (store). The store can @@ -43,7 +45,7 @@ class StorePlugin(object): # {{{ The easiest way to handle affiliate money payouts is to randomly select between the author's affiliate id and calibre's affiliate id so that 70% of the time the author's id is used. - + See declined.txt for a list of stores that do not want to be included. ''' @@ -53,7 +55,7 @@ class StorePlugin(object): # {{{ self.gui = gui self.name = name self.base_plugin = None - self.config = JSONConfig('store/stores/' + self.name) + self.config = JSONConfig('store/stores/' + ascii_filename(self.name)) def open(self, gui, parent=None, detail_item=None, external=False): ''' From 1443197753943df0c1e1846d97cd6df60a6cbf4b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 23 Jul 2011 18:06:18 -0600 Subject: [PATCH 06/19] ... 
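A note on PATCH 05 above: the UnicodeEncodeError arose from building the JSONConfig path directly from a store name that is not pure ASCII (the bug title points at the Chitanka store's name; the еКниги store added in PATCH 01 has the same property). Routing the name through ascii_filename() keeps the config filename ASCII-safe. A minimal sketch of the idea, assuming only the helper that patch imports:

    from calibre.utils.filenames import ascii_filename

    name = u'\u0435\u041a\u043d\u0438\u0433\u0438'   # u'еКниги'
    # The last path component is used as a filename on disk, so the
    # non-ASCII store name is sanitized before the config is created.
    config_path = 'store/stores/' + ascii_filename(name)
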
--- src/calibre/gui2/update.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/gui2/update.py b/src/calibre/gui2/update.py index f76d4b8e65..caa1d3f3dc 100644 --- a/src/calibre/gui2/update.py +++ b/src/calibre/gui2/update.py @@ -15,6 +15,7 @@ from calibre.gui2 import config, dynamic, open_url from calibre.gui2.dialogs.plugin_updater import get_plugin_updates_available URL = 'http://status.calibre-ebook.com/latest' +#URL = 'http://localhost:8000/latest' NO_CALIBRE_UPDATE = '-0.0.0' VSEP = '|' From 38ef36d69eb894f281952330dc87042392d4935a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 23 Jul 2011 23:06:48 -0600 Subject: [PATCH 07/19] Fix #814964 (error message when doing bulk edit) --- src/calibre/db/tables.py | 7 +++++-- src/calibre/library/sqlite.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/calibre/db/tables.py b/src/calibre/db/tables.py index b75effff4b..fa7b001851 100644 --- a/src/calibre/db/tables.py +++ b/src/calibre/db/tables.py @@ -12,7 +12,7 @@ from datetime import datetime from dateutil.tz import tzoffset from calibre.constants import plugins -from calibre.utils.date import parse_date, local_tz +from calibre.utils.date import parse_date, local_tz, UNDEFINED_DATE from calibre.ebooks.metadata import author_to_author_sort _c_speedup = plugins['speedup'][0] @@ -29,8 +29,11 @@ def _c_convert_timestamp(val): if ret is None: return parse_date(val, as_utc=False) year, month, day, hour, minutes, seconds, tzsecs = ret - return datetime(year, month, day, hour, minutes, seconds, + try: + return datetime(year, month, day, hour, minutes, seconds, tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz) + except OverflowError: + return UNDEFINED_DATE.astimezone(local_tz) class Table(object): diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py index a2a85806f5..b5917f1a55 100644 --- a/src/calibre/library/sqlite.py +++ b/src/calibre/library/sqlite.py @@ -17,7 +17,7 @@ from datetime import datetime from functools import partial from calibre.ebooks.metadata import title_sort, author_to_author_sort -from calibre.utils.date import parse_date, isoformat, local_tz +from calibre.utils.date import parse_date, isoformat, local_tz, UNDEFINED_DATE from calibre import isbytestring, force_unicode from calibre.constants import iswindows, DEBUG, plugins from calibre.utils.icu import strcmp @@ -39,8 +39,11 @@ def _c_convert_timestamp(val): if ret is None: return parse_date(val, as_utc=False) year, month, day, hour, minutes, seconds, tzsecs = ret - return datetime(year, month, day, hour, minutes, seconds, + try: + return datetime(year, month, day, hour, minutes, seconds, tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz) + except OverflowError: + return UNDEFINED_DATE.astimezone(local_tz) def _py_convert_timestamp(val): if val: From 49d99ff2006de14d10b3aa00a28d8365729dde90 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 24 Jul 2011 14:25:22 -0600 Subject: [PATCH 08/19] Updated Corren --- recipes/corren2.recipe | 63 +++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/recipes/corren2.recipe b/recipes/corren2.recipe index 494be88f10..f53da20fd1 100644 --- a/recipes/corren2.recipe +++ b/recipes/corren2.recipe @@ -1,39 +1,34 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPLv3' + from calibre.web.feeds.news import BasicNewsRecipe -class AdvancedUserRecipe1255797795(BasicNewsRecipe): - title = u'Corren' - language = 'sv' - __author__ = 'Jonas Svensson' - simultaneous_downloads = 1 - 
no_stylesheets = True - oldest_article = 7 +class AdvancedUserRecipe1311446032(BasicNewsRecipe): + title = 'Corren' + __author__ = 'Jonas Svensson' + description = 'News from Sweden' + publisher = 'Corren' + category = 'news, politics, Sweden' + oldest_article = 2 + delay = 1 max_articles_per_feed = 100 - remove_attributes = ['onload'] - timefmt = '' + no_stylesheets = True + use_embedded_content = False + encoding = 'iso-8859-1' + language = 'sv' - feeds = [ - (u'Toppnyheter (alla kategorier)', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/'), - (u'Bostad', u'http://www.corren.se/inc/RssHandler.ashx?id=4122174&ripurl=http://www.corren.se/bostad/'), - (u'Ekonomi & Jobb', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/'), - (u'Kultur & Nöje', u'http://www.corren.se/inc/RssHandler.ashx?id=4122192&ripurl=http://www.corren.se/kultur/'), - (u'Mat & dryck', u'http://www.corren.se/inc/RssHandler.ashx?id=4122201&ripurl=http://www.corren.se/mat-dryck/'), - (u'Motor', u'http://www.corren.se/inc/RssHandler.ashx?id=4122203&ripurl=http://www.corren.se/motor/'), - (u'Sport', u'http://www.corren.se/inc/RssHandler.ashx?id=4122206&ripurl=http://www.corren.se/sport/'), - (u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223&ripurl=http://www.corren.se/asikter/'), - (u'Mjölby', u'http://www.corren.se/inc/RssHandler.ashx?id=4122235&ripurl=http://www.corren.se/ostergotland/mjolby/'), - (u'Motala', u'http://www.corren.se/inc/RssHandler.ashx?id=4122236&ripurl=http://www.corren.se/ostergotland/motala/') - ] - - def print_version(self, url): - url = url.replace("ekonomi/artikel.aspx", "Print.aspx") - url = url.replace("bostad/artikel.aspx", "Print.aspx") - url = url.replace("kultur/artikel.aspx", "Print.aspx") - url = url.replace("motor/artikel.aspx", "Print.aspx") - url = url.replace("mat-dryck/artikel.aspx", "Print.aspx") - url = url.replace("sport/artikel.aspx", "Print.aspx") - url = url.replace("asikter/artikel.aspx", "Print.aspx") - url = url.replace("mat-dryck/artikel.aspx", "Print.aspx") - url = url.replace("ostergotland/mjolby/artikel.aspx", "Print.aspx") - url = url.replace("ostergotland/motala/artikel.aspx", "Print.aspx") - return url.replace("nyheter/artikel.aspx", "Print.aspx") + feeds = [ + (u'Toppnyheter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/') + ,(u'Ekonomi', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/') + ,(u'Link\xf6ping', u'http://www.corren.se/inc/RssHandler.ashx?id=4122234') + ,(u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223,4122224,4122226,4122227,4122228,4122229,4122230') + ] + keep_only_tags = [dict(name='div', attrs={'id':'article'}),dict(name='div', attrs={'class':'body'})] + remove_tags = [ + dict(name='ul',attrs={'class':'functions'}) + ,dict(name='a',attrs={'href':'javascript*'}) + ,dict(name='div',attrs={'class':'box'}) + ,dict(name='div',attrs={'class':'functionsbottom'}) + ] From 48b50a2e478108b26b1991477396c70e69231689 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 24 Jul 2011 14:28:13 -0600 Subject: [PATCH 09/19] Dagens Industri by Jonas Svensson --- recipes/dagens_industri.recipe | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 recipes/dagens_industri.recipe diff --git a/recipes/dagens_industri.recipe b/recipes/dagens_industri.recipe new file mode 100644 index 0000000000..c9b60c72b1 --- /dev/null +++ 
b/recipes/dagens_industri.recipe @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPLv3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1311450855(BasicNewsRecipe): + title = u'Dagens Industri' + __author__ = 'Jonas Svensson' + description = 'Economy news from Sweden' + publisher = 'DI' + category = 'news, politics, Sweden' + oldest_article = 2 + delay = 1 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + language = 'sv' + + feeds = [(u'DI', u'http://di.se/rss')] + + keep_only_tags = [dict(name='h1', attrs={'id':'ctl00_ExtraWideContentRegion_WideContentRegion_MainRegion_MainContentRegion_MainBodyRegion_headlineNormal'}),dict(name='div', attrs={'id':'articleBody'})] + + remove_tags = [ + dict(name='div',attrs={'class':'article-actions clear'}) + ,dict(name='div',attrs={'class':'article-action-popup'}) + ,dict(name='div',attrs={'class':'header'}) + ,dict(name='div',attrs={'class':'content clear'}) + ,dict(name='div',attrs={'id':'articleAdvertisementDiv'}) + ,dict(name='ul',attrs={'class':'action-list'}) + ] From 5e7c3b5b3586061f663170c3a59eccfa4b93d243 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 24 Jul 2011 15:49:08 -0600 Subject: [PATCH 10/19] New MOBI writer: Finish up creation of all index records for periodicals --- src/calibre/ebooks/mobi/debug.py | 1 + src/calibre/ebooks/mobi/writer2/indexer.py | 124 +++++++++++++++++++-- src/calibre/ebooks/mobi/writer2/main.py | 3 + 3 files changed, 119 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 514cf9c246..a848f11355 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -399,6 +399,7 @@ class IndexHeader(object): # {{{ def __init__(self, record): self.record = record raw = self.record.raw + #open('/t/index_header.bin', 'wb').write(raw) if raw[:4] != b'INDX': raise ValueError('Invalid Primary Index Record') diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 48b1d82c04..e2428a2dd2 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -15,6 +15,7 @@ from collections import OrderedDict from calibre.ebooks import normalize from calibre.ebook.mobi.writer2 import RECORD_SIZE from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex) +from calibre.ebooks.mobi.langcodes import iana2mobi def utf8_text(text): ''' @@ -85,7 +86,7 @@ class CNCX(object): # {{{ return self.strings[string] # }}} -class IndexEntry(object): +class IndexEntry(object): # {{{ TAG_VALUES = { 'offset': 1, @@ -112,6 +113,35 @@ class IndexEntry(object): self.first_child_index = None self.last_child_index = None + @classmethod + def tagx_block(cls, for_periodical=True): + buf = bytearray() + + def add_tag(tag, num_values=1): + buf.append(tag) + buf.append(num_values) + # bitmask + buf.append(1 << (cls.BITMASKS.index(tag))) + # eof + buf.append(0) + + for tag in xrange(1, 5): + add_tag(tag) + + if for_periodical: + for tag in (5, 21, 22, 23): + add_tag(tag) + + # End of TAGX record + for i in xrange(3): buf.append(0) + buf.append(1) + + header = b'TAGX' + header += pack(b'>I', len(buf)) # table length + header += pack(b'>I', 1) # control byte count + + return header + bytes(buf) + @property def next_offset(self): return self.offset + self.length @@ -147,6 +177,7 @@ class IndexEntry(object): ans = buf.get_value() return ans +# }}} class Indexer(object): @@ 
-172,15 +203,18 @@ class Indexer(object): self.cncx = CNCX(oeb.toc, opts) if self.is_periodical: - indices = self.create_periodical_index() + self.indices = self.create_periodical_index() else: raise NotImplementedError() - self.records.append(self.create_index_record(indices)) + self.records.append(self.create_index_record()) + self.records.insert(0, self.create_header()) + self.records.extend(self.cncx.records) - def create_index_record(self, indices): + def create_index_record(self): # {{{ header_length = 192 buf = StringIO() + indices = self.indices # Write index entries offsets = [] @@ -218,6 +252,7 @@ class Indexer(object): if len(ans) > 0x10000: raise ValueError('Too many entries (%d) in the TOC'%len(offsets)) return ans + # }}} def create_periodical_index(self): # {{{ periodical_node = iter(self.oeb.toc).next() @@ -361,14 +396,85 @@ class Indexer(object): return indices # }}} - def create_header(self): + def create_header(self): # {{{ buf = StringIO() + tagx_block = IndexEntry.tagx_block(self.is_periodical) + header_length = 192 - # Ident + # Ident 0 - 4 buf.write(b'INDX') - # Header length - buf.write(pack(b'>I', 192)) + # Header length 4 - 8 + buf.write(pack(b'>I', header_length)) - # Index type: 0 - normal, 2 - inflection + # Unknown 8-16 + buf.write(b'\0'*8) + + # Index type: 0 - normal, 2 - inflection 16 - 20 buf.write(pack(b'>I', 2)) + + # IDXT offset 20-24 + buf.write(pack(b'>I', 0)) # Filled in later + + # Number of index records 24-28 + buf.write(pack('b>I', len(self.records))) + + # Index Encoding 28-32 + buf.write(pack(b'>I', 65001)) # utf-8 + + # Index language 32-36 + buf.write(iana2mobi( + str(self.oeb.metadata.language[0]))) + + # Number of index entries 36-40 + buf.write(pack(b'>I', len(self.indices))) + + # ORDT offset 40-44 + buf.write(pack(b'>I', 0)) + + # LIGT offset 44-48 + buf.write(pack(b'>I', 0)) + + # Number of LIGT entries 48-52 + buf.write(pack(b'>I', 0)) + + # Number of CNCX records 52-56 + buf.write(pack(b'>I', len(self.cncx.records))) + + # Unknown 56-180 + buf.write(b'\0'*124) + + # TAGX offset 180-184 + buf.write(pack(b'>I', header_length)) + + # Unknown 184-192 + buf.write(b'\0'*8) + + # TAGX block + buf.write(tagx_block) + + num = len(self.indices) + + # The index of the last entry in the NCX + buf.write(encode_number_as_hex(num-1)) + + # The number of entries in the NCX + buf.write(pack(b'>H', num)) + + # Padding + pad = (4 - (buf.tell()%4))%4 + if pad: + buf.write(b'\0'*pad) + + idxt_offset = buf.tell() + + buf.write(b'IDXT') + buf.write(header_length + len(tagx_block)) + buf.write(b'\0') + buf.seek(20) + buf.write(pack(b'>I', idxt_offset)) + + return align_block(buf.getvalue()) + # }}} + + diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index ea67007168..a031e2e957 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -93,6 +93,9 @@ class MobiWriter(object): self.opts, self.oeb) except: self.log.exception('Failed to generate MOBI index:') + else: + self.primary_index_record_idx = len(self.records) + self.records.extend(self.indexer.records) # }}} def write_uncrossable_breaks(self): # {{{ From 8f40166b9dc52ba867611a60e565bdf0b2d242ee Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 24 Jul 2011 16:04:33 -0600 Subject: [PATCH 11/19] New MOBI writer: Create index records for books --- src/calibre/ebooks/mobi/writer2/indexer.py | 212 +++++++++++++-------- 1 file changed, 129 insertions(+), 83 deletions(-) diff --git 
a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index e2428a2dd2..501b23113f 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -205,7 +205,7 @@ class Indexer(object): if self.is_periodical: self.indices = self.create_periodical_index() else: - raise NotImplementedError() + self.indices = self.create_book_index() self.records.append(self.create_index_record()) self.records.insert(0, self.create_header()) @@ -254,6 +254,134 @@ class Indexer(object): return ans # }}} + def create_header(self): # {{{ + buf = StringIO() + tagx_block = IndexEntry.tagx_block(self.is_periodical) + header_length = 192 + + # Ident 0 - 4 + buf.write(b'INDX') + + # Header length 4 - 8 + buf.write(pack(b'>I', header_length)) + + # Unknown 8-16 + buf.write(b'\0'*8) + + # Index type: 0 - normal, 2 - inflection 16 - 20 + buf.write(pack(b'>I', 2)) + + # IDXT offset 20-24 + buf.write(pack(b'>I', 0)) # Filled in later + + # Number of index records 24-28 + buf.write(pack('b>I', len(self.records))) + + # Index Encoding 28-32 + buf.write(pack(b'>I', 65001)) # utf-8 + + # Index language 32-36 + buf.write(iana2mobi( + str(self.oeb.metadata.language[0]))) + + # Number of index entries 36-40 + buf.write(pack(b'>I', len(self.indices))) + + # ORDT offset 40-44 + buf.write(pack(b'>I', 0)) + + # LIGT offset 44-48 + buf.write(pack(b'>I', 0)) + + # Number of LIGT entries 48-52 + buf.write(pack(b'>I', 0)) + + # Number of CNCX records 52-56 + buf.write(pack(b'>I', len(self.cncx.records))) + + # Unknown 56-180 + buf.write(b'\0'*124) + + # TAGX offset 180-184 + buf.write(pack(b'>I', header_length)) + + # Unknown 184-192 + buf.write(b'\0'*8) + + # TAGX block + buf.write(tagx_block) + + num = len(self.indices) + + # The index of the last entry in the NCX + buf.write(encode_number_as_hex(num-1)) + + # The number of entries in the NCX + buf.write(pack(b'>H', num)) + + # Padding + pad = (4 - (buf.tell()%4))%4 + if pad: + buf.write(b'\0'*pad) + + idxt_offset = buf.tell() + + buf.write(b'IDXT') + buf.write(header_length + len(tagx_block)) + buf.write(b'\0') + buf.seek(20) + buf.write(pack(b'>I', idxt_offset)) + + return align_block(buf.getvalue()) + # }}} + + def create_book_index(self): # {{{ + indices = [] + seen = set() + id_offsets = self.serializer.id_offsets + + for node in self.oeb.toc.iterdescendants(): + try: + offset = id_offsets[node.href] + label = self.cncx[node.title] + except: + self.log.warn('TOC item %s not found in document'%node.href) + continue + if offset in seen: + continue + seen.add(offset) + index = IndexEntry(offset, label) + self.indices.append(index) + + indices.sort(key=lambda x:x.offset) + + # Set lengths + for i, index in indices: + try: + next_offset = indices[i+1].offset + except: + next_offset = self.serializer.body_end_offset + index.length = next_offset - index.offset + + # Remove empty nodes + indices = [i for i in indices if i.length > 0] + + # Set index values + for i, index in indices: + index.index = i + + # Set lengths again to close up any gaps left by filtering + for i, index in indices: + try: + next_offset = indices[i+1].offset + except: + next_offset = self.serializer.body_end_offset + index.length = next_offset - index.offset + + return indices + + # }}} + def create_periodical_index(self): # {{{ periodical_node = iter(self.oeb.toc).next() periodical_node_offset = self.serializer.body_start_offset @@ -396,85 +524,3 @@ class Indexer(object): return indices # }}} - def create_header(self): # {{{ - buf = 
StringIO() - tagx_block = IndexEntry.tagx_block(self.is_periodical) - header_length = 192 - - # Ident 0 - 4 - buf.write(b'INDX') - - # Header length 4 - 8 - buf.write(pack(b'>I', header_length)) - - # Unknown 8-16 - buf.write(b'\0'*8) - - # Index type: 0 - normal, 2 - inflection 16 - 20 - buf.write(pack(b'>I', 2)) - - # IDXT offset 20-24 - buf.write(pack(b'>I', 0)) # Filled in later - - # Number of index records 24-28 - buf.write(pack('b>I', len(self.records))) - - # Index Encoding 28-32 - buf.write(pack(b'>I', 65001)) # utf-8 - - # Index language 32-36 - buf.write(iana2mobi( - str(self.oeb.metadata.language[0]))) - - # Number of index entries 36-40 - buf.write(pack(b'>I', len(self.indices))) - - # ORDT offset 40-44 - buf.write(pack(b'>I', 0)) - - # LIGT offset 44-48 - buf.write(pack(b'>I', 0)) - - # Number of LIGT entries 48-52 - buf.write(pack(b'>I', 0)) - - # Number of CNCX records 52-56 - buf.write(pack(b'>I', len(self.cncx.records))) - - # Unknown 56-180 - buf.write(b'\0'*124) - - # TAGX offset 180-184 - buf.write(pack(b'>I', header_length)) - - # Unknown 184-192 - buf.write(b'\0'*8) - - # TAGX block - buf.write(tagx_block) - - num = len(self.indices) - - # The index of the last entry in the NCX - buf.write(encode_number_as_hex(num-1)) - - # The number of entries in the NCX - buf.write(pack(b'>H', num)) - - # Padding - pad = (4 - (buf.tell()%4))%4 - if pad: - buf.write(b'\0'*pad) - - idxt_offset = buf.tell() - - buf.write(b'IDXT') - buf.write(header_length + len(tagx_block)) - buf.write(b'\0') - buf.seek(20) - buf.write(pack(b'>I', idxt_offset)) - - return align_block(buf.getvalue()) - # }}} - - From 9b1f09cc9e750728df535b9bb50ff0441c22ea59 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 00:48:08 -0600 Subject: [PATCH 12/19] ... --- src/calibre/ebooks/mobi/writer2/indexer.py | 59 +++++++++++++++++++++- src/calibre/ebooks/mobi/writer2/main.py | 6 +++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 501b23113f..41c5d2ec91 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -179,7 +179,27 @@ class IndexEntry(object): # {{{ # }}} -class Indexer(object): +class TBS(object): # {{{ + + ''' + Take the list of index nodes starting/ending on a record and calculate the + trailing byte sequence for the record. 
+ ''' + + def __init__(self, data, is_periodical): + if is_periodical: + self.periodical_tbs(data) + else: + self.book_tbs(data) + + def periodical_tbs(self, data): + self.bytestring = b'' + + def book_tbs(self, data): + self.bytestring = b'' +# }}} + +class Indexer(object): # {{{ def __init__(self, serializer, number_of_text_records, size_of_last_text_record, opts, oeb): @@ -211,6 +231,8 @@ class Indexer(object): self.records.insert(0, self.create_header()) self.records.extend(self.cncx.records) + self.calculate_trailing_byte_sequences() + def create_index_record(self): # {{{ header_length = 192 buf = StringIO() @@ -524,3 +546,38 @@ class Indexer(object): return indices # }}} + def calculate_trailing_byte_sequences(self): + self.tbs_map = {} + for i in xrange(self.number_of_text_records): + offset = i * RECORD_SIZE + next_offset = offset + RECORD_SIZE + data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]), + ('spans', None)]) + for index in self.indices: + if index.offset >= next_offset: + # Node starts after current record + break + if index.next_offset <= offset: + # Node ends before current record + continue + if index.offset >= offset: + # Node starts in current record + if index.next_offset <= next_offset: + # Node ends in current record + data['completes'].append(index) + else: + data['starts'].append(index) + else: + # Node starts before current records + if index.next_offset <= next_offset: + # Node ends in current record + data['ends'].append(index) + else: + data['spans'] = index + self.tbs_map[i+1] = TBS(data, self.is_periodical) + + def get_trailing_byte_sequence(self, num): + return self.tbs_map[num].bytestring + +# }}} + diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index a031e2e957..06572f48c4 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -95,7 +95,13 @@ class MobiWriter(object): self.log.exception('Failed to generate MOBI index:') else: self.primary_index_record_idx = len(self.records) + for i in xrange(len(self.records)): + if i == 0: continue + tbs = self.indexer.get_trailing_byte_sequence(i) + self.records[i] += tbs self.records.extend(self.indexer.records) + + # }}} def write_uncrossable_breaks(self): # {{{ From 68632c18531168ed5cb169e7eccb5eacf0036232 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 03:08:03 -0600 Subject: [PATCH 13/19] ... --- src/calibre/ebooks/mobi/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index cf03c613f4..ae1241e2f1 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -39,7 +39,7 @@ def encode_number_as_hex(num): The bytes that follow are simply the hexadecimal representation of the number. 
''' - num = bytes(hex(num)[2:]) + num = bytes(hex(num)[2:].upper()) ans = bytearray(num) ans.insert(0, len(num)) return bytes(ans) From 6867bde932f59d68f1e223d4428270e1bc4fd5c2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 09:52:08 -0600 Subject: [PATCH 14/19] Fix #815971 (Garbage in Job Detail List) --- src/calibre/utils/ipc/job.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/utils/ipc/job.py b/src/calibre/utils/ipc/job.py index f4b54aee95..e75884d387 100644 --- a/src/calibre/utils/ipc/job.py +++ b/src/calibre/utils/ipc/job.py @@ -141,7 +141,8 @@ class BaseJob(object): def log_file(self): if self.log_path: return open(self.log_path, 'rb') - return cStringIO.StringIO(_('No details available.')) + return cStringIO.StringIO(_('No details available.').encode('utf-8', + 'replace')) @property def details(self): From c59e0cb9aa5526877c8e92ad646ef6b864fb4c0b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 10:47:16 -0600 Subject: [PATCH 15/19] Improved Instapaper --- recipes/instapaper.recipe | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/recipes/instapaper.recipe b/recipes/instapaper.recipe index c6175a783f..d182e556a2 100644 --- a/recipes/instapaper.recipe +++ b/recipes/instapaper.recipe @@ -43,7 +43,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): lfeeds = self.get_feeds() for feedobj in lfeeds: feedtitle, feedurl = feedobj - self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl)) articles = [] soup = self.index_to_soup(feedurl) for item in soup.findAll('div', attrs={'class':'cornerControls'}): @@ -63,3 +63,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): def populate_article_metadata(self, article, soup, first): article.title = soup.find('title').contents[0].strip() + def postprocess_html(self, soup, first_fetch): + for link_tag in soup.findAll(attrs={"id" : "story"}): + link_tag.insert(0,'

'+soup.find('title').contents[0].strip()+'

') + + return soup From 1f59369d64f4640a8e40bbae3ac55a478faa32fa Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 14:41:48 -0600 Subject: [PATCH 16/19] ... --- src/calibre/ebooks/mobi/debug.py | 85 +++++++++++++++++++------------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index a848f11355..b85d73f55c 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -12,7 +12,7 @@ from collections import OrderedDict, defaultdict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data) + get_trailing_data, decode_fvwi) from calibre.utils.magick.draw import identify_data # PalmDB {{{ @@ -964,7 +964,8 @@ class TBSIndexing(object): # {{{ byts = byts[consumed:] ans.append('Unknown (vwi: always 0?): %d'%arg1) if self.doc_type in (257, 259): # Hierarchical periodical - byts, a = self.interpret_periodical(tbs_type, byts) + byts, a = self.interpret_periodical(tbs_type, byts, + dat['geom'][0]) ans += a if byts: sbyts = tuple(hex(b)[2:] for b in byts) @@ -973,7 +974,7 @@ class TBSIndexing(object): # {{{ ans.append('') return tbs_type, ans - def interpret_periodical(self, tbs_type, byts): + def interpret_periodical(self, tbs_type, byts, record_offset): ans = [] def tbs_type_6(byts, psi=None, msg=None, fmsg='Unknown'): # {{{ @@ -1014,6 +1015,50 @@ class TBSIndexing(object): # {{{ # }}} + def read_section_transitions(byts, psi=None): # {{{ + if psi is None: + # Assume parent section is 1 + psi = self.get_index(1) + + while byts: + ai, flags, consumed = decode_fvwi(byts) + byts = byts[consumed:] + if flags & 0b1000: + nsi = self.get_index(psi.index+1) + ans.append('Last article in this record of section %d' + ' (relative to next section index [%d]): ' + '%d [%d absolute index]'%(psi.index, nsi.index, ai, + ai+nsi.index)) + psi = nsi + continue + + ans.append('First article in this record of section %d' + ' (relative to its parent section): ' + '%d [%d absolute index]'%(psi.index, ai, ai+psi.index)) + + if flags == 0: + ans.append('The section %d has only one article' + ' in this record'%psi.index) + continue + + if flags & 0b0100: + num = byts[0] + byts = byts[1:] + ans.append('Number of articles in this record of ' + 'section %d: %d'%(psi.index, num)) + + if flags & 0b0010: + raise ValueError( + 'Dont know how to interpret the 0b0010 flag') + + if flags & 0b0001: + arg, consumed = decint(byts) + byts = byts[consumed:] + ans.append('->Offset to start of next section (%d) from start' + ' of record: %d [%d absolute offset]'%(psi.index+1, + arg, arg+record_offset)) + # }}} + if tbs_type == 3: # {{{ arg2, consumed = decint(byts) byts = byts[consumed:] @@ -1025,7 +1070,7 @@ class TBSIndexing(object): # {{{ flags = arg3 & 0b1111 ans.append('First section index (fvwi): %d'%fsi) psi = self.get_index(fsi) - ans.append('Flags (flag: always 0?): %d'%flags) + ans.append('Flags: %d'%flags) if flags == 4: ans.append('Number of articles in this section: %d'%byts[0]) byts = byts[1:] @@ -1033,35 +1078,7 @@ class TBSIndexing(object): # {{{ pass else: raise ValueError('Unknown flags value: %d'%flags) - - - if byts: - byts = tbs_type_6(byts, psi=psi, - msg=('First article of ending section, relative to its' - ' parent\'s index'), - fmsg=('->Offset from start of record to beginning of' - ' last starting section')) - while byts: - # We have a transition not just an 
opening first section - psi = self.get_index(psi.index+1) - arg, consumed = decint(byts) - off = arg >> 4 - byts = byts[consumed:] - flags = arg & 0b1111 - ans.append('Last article of ending section w.r.t. starting' - ' section offset (fvwi): %d [%d absolute]'%(off, - psi.index+off)) - ans.append('Flags (always 8?): %d'%flags) - byts = tbs_type_6(byts, psi=psi) - if byts: - # Ended with flag 1,and not EOF, which means there's - # another section transition in this record - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('->Offset from start of record to beginning of ' - 'last starting section: %d'%(arg)) - else: - break + byts = read_section_transitions(byts, psi) # }}} @@ -1124,7 +1141,7 @@ class TBSIndexing(object): # {{{ elif flags == 0: byts = tbs_type_6(byts, psi=psi) else: - raise ValueError('Unkown flags: %d'%flags) + raise ValueError('Unknown flags: %d'%flags) # }}} return byts, ans From 0c5a37fbc0120cbb262700ead7f13385b713a758 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 15:45:16 -0600 Subject: [PATCH 17/19] ... --- src/calibre/ebooks/metadata/sources/isbndb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/sources/isbndb.py b/src/calibre/ebooks/metadata/sources/isbndb.py index b33a625ca7..31c5e69d65 100644 --- a/src/calibre/ebooks/metadata/sources/isbndb.py +++ b/src/calibre/ebooks/metadata/sources/isbndb.py @@ -151,7 +151,7 @@ class ISBNDB(Source): bl = feed.find('BookList') if bl is None: - err = tostring(etree.find('errormessage')) + err = tostring(feed.find('errormessage')) raise ValueError('ISBNDb query failed:' + err) total_results = int(bl.get('total_results')) shown_results = int(bl.get('shown_results')) From 427060533522e005f82e6866046abb8b3ec81dee Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 19:49:57 -0600 Subject: [PATCH 18/19] New MOBI output: Write the TBS sequences for periodicals. Also fully decoded all TBS sequences, only unknown bits left are in the opening sequence that seems to depend on the type of record being indexed. The rules are simple, so I just use them instead of spending more time looking for deeper meaning. 
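Purely as illustration of the convention this commit settles on: every TBS sequence is a series of fvwi values, i.e. a number plus a small dictionary keyed by the four flag bits, and the encode_tbs()/decode_tbs() helpers added below round-trip exactly that. A sketch, assuming the calibre tree from this patch is on the import path:

    from calibre.ebooks.mobi.utils import encode_tbs, decode_tbs

    # Flag 0b0100 carries a single byte (e.g. an article count) and flag
    # 0b0001 carries a vwi (e.g. an offset); see tbs_periodicals.rst below.
    raw = encode_tbs(2, {0b0100: 3, 0b0001: 0x30})
    val, extra, consumed = decode_tbs(raw)
    assert (val, extra, consumed) == (2, {0b0100: 3, 0b0001: 0x30}, len(raw))
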
--- src/calibre/ebooks/mobi/debug.py | 214 ++++++-------------- src/calibre/ebooks/mobi/tbs_periodicals.rst | 89 +++++++- src/calibre/ebooks/mobi/utils.py | 94 +++++++++ src/calibre/ebooks/mobi/writer2/indexer.py | 166 +++++++++++---- 4 files changed, 375 insertions(+), 188 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index b85d73f55c..67f20e691f 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -12,7 +12,7 @@ from collections import OrderedDict, defaultdict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data, decode_fvwi) + get_trailing_data, decode_tbs) from calibre.utils.magick.draw import identify_data # PalmDB {{{ @@ -949,20 +949,22 @@ class TBSIndexing(object): # {{{ ans.append(('\t\tIndex Entry: %d (Parent index: %d, ' 'Depth: %d, Offset: %d, Size: %d) [%s]')%( x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) - def bin3(num): + def bin4(num): ans = bin(num)[2:] - return '0'*(3-len(ans)) + ans + return bytes('0'*(4-len(ans)) + ans) + + def repr_extra(x): + return str({bin4(k):v for k, v in extra.iteritems()}) tbs_type = 0 if len(byts): - outer, consumed = decint(byts) + outermost_index, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - tbs_type = outer & 0b111 - ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type)) - ans.append('Outer Index entry: %d'%(outer >> 3)) - arg1, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown (vwi: always 0?): %d'%arg1) + for k in extra: + tbs_type |= k + ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type))) + ans.append('Outermost index: %d'%outermost_index) + ans.append('Unknown extra start bytes: %s'%repr_extra(extra)) if self.doc_type in (257, 259): # Hierarchical periodical byts, a = self.interpret_periodical(tbs_type, byts, dat['geom'][0]) @@ -977,53 +979,21 @@ class TBSIndexing(object): # {{{ def interpret_periodical(self, tbs_type, byts, record_offset): ans = [] - def tbs_type_6(byts, psi=None, msg=None, fmsg='Unknown'): # {{{ - if psi is None: - # Assume parent section is 1 - psi = self.get_index(1) - if msg is None: - msg = ('Article index at start of record or first article' - ' index, relative to parent section') - if byts: - # byts could be empty - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = (arg & 0b1111) - ai = (arg >> 4) - ans.append('%s (fvwi): %d [%d absolute]'%(msg, ai, - ai+psi.index)) - if flags == 1: - arg, consumed = decint(byts) - if arg == 0: - # EOF of record, otherwise ignore and hope someone else - # will deal with these bytes - byts = byts[consumed:] - ans.append('EOF (vwi: should be 0): %d'%arg) - elif flags in (4, 5): - num = byts[0] - byts = byts[1:] - ans.append('Number of article nodes in the record (byte): %d'%num) - if flags == 5: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('%s (vwi)): %d'%(fmsg, arg)) - elif flags == 0: - pass - else: - raise ValueError('Unknown flags: %d'%flags) - return byts - - # }}} - def read_section_transitions(byts, psi=None): # {{{ if psi is None: - # Assume parent section is 1 + # Assume previous section is 1 psi = self.get_index(1) while byts: - ai, flags, consumed = decode_fvwi(byts) + ai, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - if flags & 0b1000: + if extra.get(0b0010, None) is not None: + raise ValueError('Dont know how to interpret flag 0b0010' + ' 
while reading section transitions') + if extra.get(0b1000, None) is not None: + if len(extra) > 1: + raise ValueError('Dont know how to interpret flags' + ' %r while reading section transitions'%extra) nsi = self.get_index(psi.index+1) ans.append('Last article in this record of section %d' ' (relative to next section index [%d]): ' @@ -1036,113 +1006,57 @@ class TBSIndexing(object): # {{{ ' (relative to its parent section): ' '%d [%d absolute index]'%(psi.index, ai, ai+psi.index)) - if flags == 0: - ans.append('The section %d has only one article' - ' in this record'%psi.index) - continue + num = extra.get(0b0100, None) + if num is None: + msg = ('The section %d has at most one article' + ' in this record')%psi.index + else: + msg = ('Number of articles in this record of ' + 'section %d: %d')%(psi.index, num) + ans.append(msg) - if flags & 0b0100: - num = byts[0] - byts = byts[1:] - ans.append('Number of articles in this record of ' - 'section %d: %d'%(psi.index, num)) - - if flags & 0b0010: - raise ValueError( - 'Dont know how to interpret the 0b0010 flag') - - if flags & 0b0001: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('->Offset to start of next section (%d) from start' + offset = extra.get(0b0001, None) + if offset is not None: + if offset == 0: + ans.append('This record is spanned by the article:' + '%d'%(ai+psi.index)) + else: + ans.append('->Offset to start of next section (%d) from start' ' of record: %d [%d absolute offset]'%(psi.index+1, - arg, arg+record_offset)) + offset, offset+record_offset)) + return byts # }}} - if tbs_type == 3: # {{{ - arg2, consumed = decint(byts) + def read_starting_section(byts): # {{{ + si, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - ans.append('Unknown (vwi: always 0?): %d'%arg2) - - arg3, consumed = decint(byts) - byts = byts[consumed:] - fsi = arg3 >> 4 - flags = arg3 & 0b1111 - ans.append('First section index (fvwi): %d'%fsi) - psi = self.get_index(fsi) - ans.append('Flags: %d'%flags) - if flags == 4: - ans.append('Number of articles in this section: %d'%byts[0]) - byts = byts[1:] - elif flags == 0: - pass - else: - raise ValueError('Unknown flags value: %d'%flags) - byts = read_section_transitions(byts, psi) - - # }}} - - elif tbs_type == 7: # {{{ - # This occurs for records that have no section nodes and - # whose parent section's index == 1 - ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2])) - byts = byts[2:] - arg, consumed = decint(byts) - byts = byts[consumed:] - ai = arg >> 4 - flags = arg & 0b1111 - ans.append('Article at start of record (fvwi): %d'%ai) - if flags == 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of articles in record (byte): %d'%num) - elif flags == 0: - pass - elif flags == 1: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('EOF (vwi: should be 0): %d'%arg) - else: - raise ValueError('Unknown flags value: %d'%flags) + if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra: + raise ValueError('Dont know how to interpret flags %r' + ' when reading starting section'%extra) + si = self.get_index(si) + ans.append('The section at the start of this record is:' + ' %d'%si.index) + if 0b0100 in extra: + num = extra[0b0100] + ans.append('The number of articles from the section %d' + ' in this record: %d'%(si.index, num)) + elif 0b0001 in extra: + eof = extra[0b0001] + if eof != 0: + raise ValueError('Unknown eof value %s when reading' + ' starting section'%eof) + ans.append('This record is spanned by an article from' + ' the section: %d'%si.index) + 
return si, byts # }}} - elif tbs_type == 6: # {{{ - # This is used for records spanned by an article whose parent - # section's index == 1 or for the opening record if it contains the - # periodical start, section 1 start and at least one article. The - # two cases are distinguished by the flags on the article index - # vwi. - unk = byts[0] - byts = byts[1:] - ans.append('Unknown (byte: always 2?): %d'%unk) - byts = tbs_type_6(byts) - # }}} + if tbs_type & 0b0100: + # Starting section is the first section + ssi = self.get_index(1) + else: + ssi, byts = read_starting_section(byts) - elif tbs_type == 2: # {{{ - # This occurs for records with no section nodes and whose parent - # section's index != 1 (undefined (records before the first - # section) or > 1) - # This is also used for records that are spanned by an article - # whose parent section index > 1. In this case the flags of the - # vwi referring to the article at the start - # of the record are set to 1 instead of 4. - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = (arg & 0b1111) - psi = (arg >> 4) - ans.append('Parent section index (fvwi): %d'%psi) - psi = self.get_index(psi) - ans.append('Flags: %d'%flags) - if flags == 1: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown (vwi?: always 0?): %d'%arg) - byts = tbs_type_6(byts, psi=psi) - elif flags == 0: - byts = tbs_type_6(byts, psi=psi) - else: - raise ValueError('Unknown flags: %d'%flags) - # }}} + byts = read_section_transitions(byts, ssi) return byts, ans diff --git a/src/calibre/ebooks/mobi/tbs_periodicals.rst b/src/calibre/ebooks/mobi/tbs_periodicals.rst index d770133625..2fa6ec90f3 100644 --- a/src/calibre/ebooks/mobi/tbs_periodicals.rst +++ b/src/calibre/ebooks/mobi/tbs_periodicals.rst @@ -3,6 +3,20 @@ Reverse engineering the trailing byte sequences for hierarchical periodicals In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. All the following information/inferences are from examining the output of kindlegen on a sample periodical. Given the general level of Amazon's incompetence, there are no guarantees that this information is the *best/most complete* way to do TBS indexing. +Sequence encoding: + +0b1000 : Continuation bit + +First sequences: +0b0010 : 80 +0b0011 : 80 80 +0b0110 : 80 2 +0b0111 : 80 2 80 + +Other sequences: +0b0101 : 4 1a +0b0001 : c b1 + Opening record ---------------- @@ -52,10 +66,60 @@ The text record that contains the opening node for the periodical (depth=0 node If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record. 
+ Starting record with two section transitions:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 7 index entries (0 ends, 4 complete, 3 starts) + TBS bytes: 86 80 2 c0 b8 c4 3 + Complete: + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica] + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz] + Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 1014) [Max and the Magic Marker for iPad: Review] + Index Entry: 7 (Parent index: 2, Depth: 2, Offset: 1961, Size: 1077) [iPad 2 steers itself into home console gaming territory with Real Racing 2 HD] + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 35372) [j_x's Google reader] + Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 10368) [Neowin.net] + Index Entry: 8 (Parent index: 2, Depth: 2, Offset: 3038, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + Remaining bytes: b8 c4 3 + + Starting record with three section transitions:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 10 index entries (0 ends, 7 complete, 3 starts) + TBS bytes: 86 80 2 c0 b8 c0 b8 c4 4 + Complete: + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica] + Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 316) [Neowin.net] + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz] + Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 308) [Max and the Magic Marker for iPad: Review] + Index Entry: 7 (Parent index: 3, Depth: 2, Offset: 1263, Size: 760) [OSnews Asks on Interrupts: The Results] + Index Entry: 8 (Parent index: 3, Depth: 2, Offset: 2023, Size: 693) [Apple Ditches SAMBA in Favour of Homegrown Replacement] + Index Entry: 9 (Parent index: 3, Depth: 2, Offset: 2716, Size: 747) [ITC: Apple's Mobile Products Do Not Violate Nokia Patents] + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 25320) [j_x's Google reader] + Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 1255, Size: 6829) [OSNews] + Index Entry: 10 (Parent index: 3, Depth: 2, Offset: 3463, Size: 666) [Transparent Monitor Embedded in Window Glass] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + Remaining bytes: b8 c0 b8 c4 4 + + + + Records with no nodes ------------------------ +subtype = 010 + These records are spanned by a single article. They are of two types: 1. If the parent section index is 1, TBS type of 6, like this:: @@ -247,7 +311,7 @@ In such a record there is a transition from one section to the next. As such the Last article of ending section w.r.t. 
starting section offset (fvwi): 12 [15 absolute]
         Flags (always 8?): 8
         Article index at start of record or first article index, relative to parent section (fvwi): 13 [16 absolute]
-        Number of article nodes in the record (byte): 4
+        Number of article nodes in the record belonging to the last section (byte): 4
 
 
 Ending record
@@ -274,3 +338,26 @@ Logically, ending records must have at least one article ending, one section end
 
     If the record had only a single article end, the last two bytes would be replaced with: f0
 
+If the last record has multiple section transitions, it is of type 6 and looks like::
+
+    Record #9: Starts at: 32768 Ends at: 34953
+    Contains: 9 index entries (3 ends, 6 complete, 0 starts)
+    TBS bytes: 86 80 2 1 d0 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
+    Ends:
+        Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 34739) [j_x's Google reader]
+        Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
+        Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe]
+    Complete:
+        Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 316) [Neowin.net]
+        Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 34353, Size: 282) [OSNews]
+        Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 34635, Size: 319) [Slashdot]
+        Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 308) [Max and the Magic Marker for iPad: Review]
+        Index Entry: 16 (Parent index: 3, Depth: 2, Offset: 34361, Size: 274) [OSnews Asks on Interrupts: The Results]
+        Index Entry: 17 (Parent index: 4, Depth: 2, Offset: 34643, Size: 311) [Leonard Nimoy Turns 80]
+    TBS Type: 110 (6)
+    Outer Index entry: 0
+    Unknown (vwi: always 0?): 0
+    Unknown (byte: always 2?): 2
+    Article index at start of record or first article index, relative to parent section (fvwi): 13 [14 absolute]
+    Remaining bytes: 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
+
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index ae1241e2f1..37d2093066 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -11,6 +11,7 @@ import struct
 from collections import OrderedDict
 
 from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
+from calibre.ebooks import normalize
 
 IMAGE_MAX_SIZE = 10 * 1024 * 1024
 
@@ -197,3 +198,96 @@ def encode_trailing_data(raw):
         lsize += 1
     return raw + encoded
 
+def encode_fvwi(val, flags):
+    '''
+    Encode the value val and the 4 bit flags flags as a fvwi. This encoding is
+    used in the trailing byte sequences for indexing. Returns encoded
+    bytestring.
+    '''
+    ans = (val << 4) | (flags & 0b1111)
+    return encint(ans)
+
+
+def decode_fvwi(byts):
+    '''
+    Decode encoded fvwi. Returns number, flags, consumed
+    '''
+    arg, consumed = decint(bytes(byts))
+    return (arg >> 4), (arg & 0b1111), consumed
+
+def decode_tbs(byts):
+    '''
+    Trailing byte sequences for indexing consist of a series of fvwi numbers.
+    This function reads the fvwi number and its associated flags. It then uses
+    the flags to read any more numbers that belong to the series. The flags are
+    the lowest 4 bits of the vwi (see the encode_fvwi function above).
+
+    Returns the fvwi number, a dictionary mapping flag bits to the associated
+    data and the number of bytes consumed.
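+
+    Flag meanings, as handled below: 0b1000 simply marks a continuation of
+    the series, 0b0010 and 0b0001 each mean that a further vwi follows, and
+    0b0100 means that a single raw byte follows.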
+ ''' + byts = bytes(byts) + val, flags, consumed = decode_fvwi(byts) + extra = {} + byts = byts[consumed:] + if flags & 0b1000: + extra[0b1000] = True + if flags & 0b0010: + x, consumed2 = decint(byts) + byts = byts[consumed2:] + extra[0b0010] = x + consumed += consumed2 + if flags & 0b0100: + extra[0b0100] = ord(byts[0]) + byts = byts[1:] + consumed += 1 + if flags & 0b0001: + x, consumed2 = decint(byts) + byts = byts[consumed2:] + extra[0b0001] = x + consumed += consumed2 + return val, extra, consumed + +def encode_tbs(val, extra): + ''' + Encode the number val and the extra data in the extra dict as an fvwi. See + decode_tbs above. + ''' + flags = 0 + for flag in extra: + flags |= flag + ans = encode_fvwi(val, flags) + + if 0b0010 in extra: + ans += encint(extra[0b0010]) + if 0b0100 in extra: + ans += bytes(bytearray([extra[0b0100]])) + if 0b0001 in extra: + ans += encint(extra[0b0001]) + return ans + +def utf8_text(text): + ''' + Convert a possibly null string to utf-8 bytes, guaranteeing to return a non + empty, normalized bytestring. + ''' + if text and text.strip(): + text = text.strip() + if not isinstance(text, unicode): + text = text.decode('utf-8', 'replace') + text = normalize(text).encode('utf-8') + else: + text = _('Unknown').encode('utf-8') + return text + +def align_block(raw, multiple=4, pad=b'\0'): + ''' + Return raw with enough pad bytes append to ensure its length is a multiple + of 4. + ''' + extra = len(raw) % multiple + if extra == 0: return raw + return raw + pad*(multiple - extra) + + + + diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 41c5d2ec91..04387f47f7 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -10,35 +10,13 @@ __docformat__ = 'restructuredtext en' from struct import pack from cStringIO import StringIO -from collections import OrderedDict +from collections import OrderedDict, defaultdict -from calibre.ebooks import normalize -from calibre.ebook.mobi.writer2 import RECORD_SIZE -from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex) +from calibre.ebooks.mobi.writer2 import RECORD_SIZE +from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, + encode_trailing_data, encode_tbs, align_block, utf8_text) from calibre.ebooks.mobi.langcodes import iana2mobi -def utf8_text(text): - ''' - Convert a possibly null string to utf-8 bytes, guaranteeing to return a non - empty, normalized bytestring. - ''' - if text and text.strip(): - text = text.strip() - if not isinstance(text, unicode): - text = text.decode('utf-8', 'replace') - text = normalize(text).encode('utf-8') - else: - text = _('Unknown').encode('utf-8') - return text - -def align_block(raw, multiple=4, pad=b'\0'): - ''' - Return raw with enough pad bytes append to ensure its length is a multiple - of 4. - ''' - extra = len(raw) % multiple - if extra == 0: return raw - return raw + pad*(multiple - extra) class CNCX(object): # {{{ @@ -98,7 +76,7 @@ class IndexEntry(object): # {{{ 'first_child_index': 22, 'last_child_index': 23, } - RTAG_MAP = dict(TAG_VALUES.itervalues(), TAG_VALUES.iterkeys()) + RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()} BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,] @@ -186,17 +164,123 @@ class TBS(object): # {{{ trailing byte sequence for the record. 
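+
+    The four self.type_* byte strings built in __init__ correspond to the
+    "first sequences" tabulated in tbs_periodicals.rst.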
     '''

-    def __init__(self, data, is_periodical):
-        if is_periodical:
-            self.periodical_tbs(data)
+    def __init__(self, data, is_periodical, first=False, all_sections=[]):
+        if not data:
+            self.bytestring = encode_trailing_data(b'')
         else:
-            self.book_tbs(data)
+            self.section_map = OrderedDict((i.index, i) for i in
+                    sorted(all_sections, key=lambda x:x.offset))
 
-    def periodical_tbs(self, data):
-        self.bytestring = b''
+            if is_periodical:
+                # The starting bytes.
+                # The value is zero which I think indicates the periodical
+                # index entry. The values for the various flags seem to be
+                # unused. If the 0b0100 is present, it means that the record
+                # deals with section 1 (or is the final record with section
+                # transitions).
+                self.type_010 = encode_tbs(0, {0b0010: 0})
+                self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0})
+                self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0})
+                self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0})
 
-    def book_tbs(self, data):
-        self.bytestring = b''
+                depth_map = defaultdict(list)
+                for x in ('starts', 'ends', 'completes'):
+                    for idx in data[x]:
+                        depth_map[idx.depth].append(idx)
+                for l in depth_map.itervalues():
+                    l.sort(key=lambda x:x.offset)
+                self.periodical_tbs(data, first, depth_map)
+            else:
+                self.book_tbs(data, first)
+
+    def periodical_tbs(self, data, first, depth_map):
+        buf = StringIO()
+
+        has_section_start = (depth_map[1] and depth_map[1][0] in
+                data['starts'])
+        spanner = data['spans']
+        first_node = None
+        for nodes in depth_map.values():
+            for node in nodes:
+                if (first_node is None or (node.offset, node.depth) <
+                        (first_node.offset, first_node.depth)):
+                    first_node = node
+
+        parent_section_index = -1
+        if depth_map[0]:
+            # We have a terminal record
+            typ = (self.type_110 if has_section_start else self.type_010)
+            if first_node.depth > 0:
+                parent_section_index = (first_node.index if first_node.depth
+                        == 1 else first_node.parent_index)
+        else:
+            if spanner is not None:
+                # record is spanned by a single article
+                parent_section_index = spanner.parent_index
+                typ = (self.type_110 if parent_section_index == 1 else
+                        self.type_010)
+            elif not depth_map[1]:
+                # has only article nodes, i.e. spanned by a section
+                parent_section_index = depth_map[2][0].parent_index
+                typ = (self.type_111 if parent_section_index == 1 else
+                        self.type_010)
+            else:
+                # has section transitions; presumably uses the 011 sequence
+                parent_section_index = depth_map[2][0].parent_index
+                typ = (self.type_011 if parent_section_index == 1 else
+                        self.type_010)
+
+        buf.write(typ)
+
+        if parent_section_index > 1:
+            # Write starting section information
+            if spanner is None:
+                num_articles = len(depth_map[1])
+                extra = {}
+                if num_articles > 1:
+                    extra = {0b0100: num_articles}
+            else:
+                extra = {0b0001: 0}
+            buf.write(encode_tbs(parent_section_index, extra))
+
+        if spanner is None:
+            articles = depth_map[2]
+            sections = [self.section_map[a.parent_index] for a in articles]
+            sections.sort(key=lambda x:x.offset)
+            section_map = {s:[a for a in articles if a.parent_index ==
+                s.index] for s in sections}
+            for i, section in enumerate(sections):
+                # All the articles in this record that belong to section
+                articles = section_map[section]
+                first_article = articles[0]
+                last_article = articles[-1]
+                num = len(articles)
+
+                try:
+                    next_sec = sections[i+1]
+                except:
+                    next_sec = None
+
+                extra = {}
+                if num > 1:
+                    extra[0b0100] = num
+                if i == 0 and next_sec is not None:
+                    # Write offset to next section from start of record
+                    # For some reason kindlegen only writes this offset
+                    # for the first section transition. Imitate it.
+ extra[0b0001] = next_sec.offset - data['offset'] + + buf.write(encode_tbs(first_article.index-section.index, extra)) + + if next_sec is not None: + buf.write(encode_tbs(last_article.index-next_sec.index, + {0b1000: 0})) + else: + buf.write(encode_tbs(spanner.index - parent_section_index, + {0b0001: 0})) + + self.bytestring = encode_trailing_data(buf.getvalue()) + + def book_tbs(self, data, first): + self.bytestring = encode_trailing_data(b'') # }}} class Indexer(object): # {{{ @@ -548,11 +632,13 @@ class Indexer(object): # {{{ def calculate_trailing_byte_sequences(self): self.tbs_map = {} + found_node = False + sections = [i for i in self.indices if i.depth == 1] for i in xrange(self.number_of_text_records): offset = i * RECORD_SIZE next_offset = offset + RECORD_SIZE data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]), - ('spans', None)]) + ('spans', None), ('offset', offset)]) for index in self.indices: if index.offset >= next_offset: # Node starts after current record @@ -574,7 +660,13 @@ class Indexer(object): # {{{ data['ends'].append(index) else: data['spans'] = index - self.tbs_map[i+1] = TBS(data, self.is_periodical) + if (data['ends'] or data['completes'] or data['starts'] or + data['spans'] is not None): + self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not + found_node, all_sections=sections) + found_node = True + else: + self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False) def get_trailing_byte_sequence(self, num): return self.tbs_map[num].bytestring From 1297576ee20028ce7302ac180dc6e7c2520ae760 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Jul 2011 20:23:52 -0600 Subject: [PATCH 19/19] New MOBI output: Allow calibre to convert OEB documents with a toc.ncx conforming to the kindlegen periodical specification into periodicals --- src/calibre/ebooks/mobi/writer2/indexer.py | 28 +++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 04387f47f7..0f7a670cff 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -295,7 +295,9 @@ class Indexer(object): # {{{ self.log = oeb.log self.opts = opts - self.is_periodical = opts.mobi_periodical + self.is_periodical = self.detect_periodical() + self.log('Generating MOBI index for a %s'%('periodical' if + self.is_periodical else 'book')) self.is_flat_periodical = False if opts.mobi_periodical: periodical_node = iter(oeb.toc).next() @@ -317,6 +319,28 @@ class Indexer(object): # {{{ self.calculate_trailing_byte_sequences() + def detect_periodical(self): # {{{ + for node in self.oeb.toc.iterdescendants(): + if node.depth() == 1 and node.klass != 'article': + self.log.debug( + 'Not a periodical: Deepest node does not have ' + 'class="article"') + return False + if node.depth() == 2 and node.klass != 'section': + self.log.debug( + 'Not a periodical: Second deepest node does not have' + ' class="section"') + return False + if node.depth() == 3 and node.klass != 'periodical': + self.log.debug('Not a periodical: Third deepest node' + ' does not have class="periodical"') + return False + if node.depth() > 3: + self.log.debug('Not a periodical: Has nodes of depth > 3') + return False + return True + # }}} + def create_index_record(self): # {{{ header_length = 192 buf = StringIO() @@ -630,6 +654,7 @@ class Indexer(object): # {{{ return indices # }}} + # TBS {{{ def calculate_trailing_byte_sequences(self): self.tbs_map = {} found_node = False 
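
To make the TOC shape that detect_periodical() accepts concrete, here is a toy
restatement of the same rule. The Node class and looks_like_periodical() below
are invented for this sketch and are not calibre's TOC API; they only mimic the
klass attribute and the depth()/iterdescendants() methods the check relies on::

    class Node(object):
        # Toy stand-in for a TOC node: only klass and children are modelled.
        def __init__(self, klass, children=()):
            self.klass = klass
            self.children = list(children)

        def depth(self):
            # Height of the subtree: 1 for leaves, which is the convention
            # the depth() checks in detect_periodical() rely on.
            return 1 + max([c.depth() for c in self.children] or [0])

        def iterdescendants(self):
            for child in self.children:
                yield child
                for node in child.iterdescendants():
                    yield node

    def looks_like_periodical(toc_root):
        # Leaves must be articles, their parents sections, the level above
        # that the periodical, and nothing may sit deeper than three levels.
        for node in toc_root.iterdescendants():
            d = node.depth()
            if d == 1 and node.klass != 'article':
                return False
            if d == 2 and node.klass != 'section':
                return False
            if d == 3 and node.klass != 'periodical':
                return False
            if d > 3:
                return False
        return True

    toc = Node('root', [Node('periodical', [
        Node('section', [Node('article'), Node('article')]),
        Node('section', [Node('article')]),
    ])])
    assert looks_like_periodical(toc)
    assert not looks_like_periodical(
        Node('root', [Node('book', [Node('chapter')])]))
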
@@ -670,6 +695,7 @@ class Indexer(object): # {{{ def get_trailing_byte_sequence(self, num): return self.tbs_map[num].bytestring + # }}} # }}}
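
As a final sanity check on the encoding side, the 86 80 2 prefix that appears
in the kindlegen samples documented in tbs_periodicals.rst (and that the TBS
class builds as type_110) can be reproduced with the same flag handling as the
encode_tbs() added to mobi/utils.py. The encint() here is a simplified
re-implementation written only for this sketch, assuming the forward vwi form
(seven bits per byte, high bit set on the final byte); it is not imported from
calibre::

    def encint(value):
        # Simplified forward vwi: 7 bits per byte, high bit on the last byte.
        byts = bytearray()
        while True:
            byts.insert(0, value & 0x7f)
            value >>= 7
            if not value:
                break
        byts[-1] |= 0x80
        return bytes(byts)

    def encode_fvwi(val, flags):
        # Value in the upper bits, four flag bits in the lowest nibble.
        return encint((val << 4) | (flags & 0b1111))

    def encode_tbs(val, extra):
        # Mirrors the flag handling of encode_tbs() in mobi/utils.py.
        flags = 0
        for flag in extra:
            flags |= flag
        ans = encode_fvwi(val, flags)
        if 0b0010 in extra:
            ans += encint(extra[0b0010])
        if 0b0100 in extra:
            ans += bytes(bytearray([extra[0b0100]]))
        if 0b0001 in extra:
            ans += encint(extra[0b0001])
        return ans

    # The "first sequence" for TBS type 0b0110 seen in the sample records:
    # fvwi(value 0, flags 0110) + vwi(0) + raw byte 2  ==  86 80 2
    assert encode_tbs(0, {0b0100: 2, 0b0010: 0}) == b'\x86\x80\x02'

Feeding the same three bytes to decode_tbs() from mobi/utils.py gives back the
value 0 and the flag dictionary {0b0010: 0, 0b0100: 2}, so the two directions
agree.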