From 2b45d99b02e300c4bdfc06566eb979f45d93a403 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 16:46:31 -0600 Subject: [PATCH 01/12] Improved Instapaper recipe --- recipes/instapaper.recipe | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/recipes/instapaper.recipe b/recipes/instapaper.recipe index 0eb5cf0f09..c6175a783f 100644 --- a/recipes/instapaper.recipe +++ b/recipes/instapaper.recipe @@ -1,22 +1,31 @@ -from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1299694372(BasicNewsRecipe): - title = u'Instapaper' - __author__ = 'Darko Miletic' - publisher = 'Instapaper.com' - category = 'info, custom, Instapaper' - oldest_article = 365 + title = u'Instapaper' + __author__ = 'Darko Miletic' + publisher = 'Instapaper.com' + category = 'info, custom, Instapaper' + oldest_article = 365 max_articles_per_feed = 100 no_stylesheets = True + remove_javascript = True + remove_tags = [ + dict(name='div', attrs={'id':'text_controls_toggle'}) + ,dict(name='script') + ,dict(name='div', attrs={'id':'text_controls'}) + ,dict(name='div', attrs={'id':'editing_controls'}) + ,dict(name='div', attrs={'class':'bar bottom'}) + ] use_embedded_content = False needs_subscription = True INDEX = u'http://www.instapaper.com' LOGIN = INDEX + u'/user/login' - - feeds = [(u'Instapaper Unread', u'http://www.instapaper.com/u'), (u'Instapaper Starred', u'http://www.instapaper.com/starred')] + feeds = [ + (u'Instapaper Unread', u'http://www.instapaper.com/u'), + (u'Instapaper Starred', u'http://www.instapaper.com/starred') + ] def get_browser(self): br = BasicNewsRecipe.get_browser() @@ -37,18 +46,20 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) articles = [] soup = self.index_to_soup(feedurl) - for item in soup.findAll('div', attrs={'class':'titleRow'}): - description = 
self.tag_to_string(item.div) + for item in soup.findAll('div', attrs={'class':'cornerControls'}): + #description = self.tag_to_string(item.div) atag = item.a if atag and atag.has_key('href'): url = atag['href'] - title = self.tag_to_string(atag) - date = strftime(self.timefmt) articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description + 'url' :url }) totalfeeds.append((feedtitle, articles)) return totalfeeds + + def print_version(self, url): + return 'http://www.instapaper.com' + url + + def populate_article_metadata(self, article, soup, first): + article.title = soup.find('title').contents[0].strip() + From ca2c41516af57e1c036e87e1caf8bd1f0ccb0ef0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 17:53:26 -0600 Subject: [PATCH 02/12] Content server: Add a link at the bottom of the mobile interface to switch tot he full interface. Fixes #812525 ([Enhancement] Web app) --- resources/content_server/browse/browse.html | 1 + src/calibre/ebooks/mobi/debug.py | 45 +++++++++++++++++++-- src/calibre/library/server/mobile.py | 12 +++++- src/calibre/manual/faq.rst | 4 +- 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/resources/content_server/browse/browse.html b/resources/content_server/browse/browse.html index 6a9697dc06..cf17742c87 100644 --- a/resources/content_server/browse/browse.html +++ b/resources/content_server/browse/browse.html @@ -11,6 +11,7 @@ + diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index ce7d78303e..884311617d 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en' import struct, datetime, sys, os from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language -from calibre.ebooks.mobi.writer2.utils import decode_hex_number +from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint # PalmDB {{{ class 
PalmDOCAttributes(object): @@ -498,9 +498,45 @@ class IndexHeader(object): # {{{ class IndexEntry(object): # {{{ + TYPES = { + # Present in book type files + 0x0f : 'chapter', + 0x6f : 'chapter_with_subchapters', + 0x1f : 'subchapter', + # Present in periodicals + 0xdf : 'periodical', + 0xff : 'section', + 0x3f : 'article', + } + def __init__(self, ident, entry_type, raw): self.id = ident - self.entry_type = entry_type + self.fields = [] + self.sub_type = None + + try: + self.entry_type = self.TYPES[entry_type] + except KeyError: + raise ValueError('Unknown IndexEntry type: %s'%hex(entry_type)) + + if self.entry_type in (0xdf, 0xff): + self.subtype = ord(raw[0]) + raw = raw[1:] + while True: + val, consumed = decint(raw) + raw = raw[consumed:] + if val == 0: + break + else: + self.fields.append(val) + + + def __str__(self): + ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s)'%( + self.id, self.entry_type, self.sub_type)] + ans.append('\tFields: %r'%self.fields) + return '\n'.join(ans) + # }}} class IndexRecord(object): # {{{ @@ -538,7 +574,7 @@ class IndexRecord(object): # {{{ index = indxt[off:] ident, consumed = decode_hex_number(index) index = index[consumed:] - entry_type = u(b'>B', index[0]) + entry_type, = u(b'>B', index[0]) self.indices.append(IndexEntry(ident, entry_type, index[1:])) @@ -557,6 +593,9 @@ class IndexRecord(object): # {{{ u(self.unknown3) u(self.unknown4) a('Index offsets: %r'%self.index_offsets) + a('\nIndex Entries:') + for entry in self.indices: + a(str(entry)+'\n') return '\n'.join(ans) diff --git a/src/calibre/library/server/mobile.py b/src/calibre/library/server/mobile.py index ad5ee4af96..3ce96a2b49 100644 --- a/src/calibre/library/server/mobile.py +++ b/src/calibre/library/server/mobile.py @@ -153,12 +153,22 @@ def build_index(books, num, search, sort, order, start, total, url_base, CKEYS, bookt.append(TR(thumbnail, data)) # }}} + body.append(HR()) + body.append(DIV( + A(_('Switch to the full interface (non-mobile interface)'), + 
href="/browse", + style="text-decoration: none; color: blue", + title=_('The full interface gives you many more features, ' + 'but it may not work well on a small screen')), + style="text-align:center")) return HTML( HEAD( TITLE(__appname__ + ' Library'), LINK(rel='icon', href='http://calibre-ebook.com/favicon.ico', type='image/x-icon'), - LINK(rel='stylesheet', type='text/css', href=prefix+'/mobile/style.css') + LINK(rel='stylesheet', type='text/css', + href=prefix+'/mobile/style.css'), + LINK(rel='apple-touch-icon', href="/static/calibre.png") ), # End head body ) # End html diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 5601407282..556f508880 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -405,9 +405,9 @@ To those of you that claim that you need access to the filesystem to so that you If you are worried that someday |app| will cease to be developed, leaving all your books marooned in its folder structure, explore the powerful "Save to Disk" feature in |app| that lets you export all your files into a folder structure of arbitrary complexity based on their metadata. -Since I keep getting asked why there are numbers at the end of the title folder name, the reason is for *robustness*. That number is the id number of the book record in the |app| database. The presence of the number allows you to have multiple records with the same title and author names. More importantly, it is part of what allows |app| to magically regenerate the database with all metadata if the database file gets corrupted. Given that |app|'s mission is to get you to stop storing metadata in filenames and stop using the filesystem to find things, the increased robustness afforded by the id numbers is well worth the uglier folder names. +Finally, the reason there are numbers at the end of every title folder, is for *robustness*. That number is the id number of the book record in the |app| database. 
The presence of the number allows you to have multiple records with the same title and author names. It is also part of what allows |app| to magically regenerate the database with all metadata if the database file gets corrupted. Given that |app|'s mission is to get you to stop storing metadata in filenames and stop using the filesystem to find things, the increased robustness afforded by the id numbers is well worth the uglier folder names. -Finally, if you are irrevocably wedded to using the filesystem to store your metadata, feel free to patch your local copy of |app| to use whatever storage scheme you like. But, do not bother me with requests to change the directory structure, **they will be ignored**. +If you are still not convinced, then I'm afraid |app| is not for you. Look elsewhere for your book cataloguing needs. Just so we're clear, **this is not going to change**. Kindly do not contact us in an attempt to get us to change this. Why doesn't |app| have a column for foo? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 79ca569caae628806160e3372cdcd5cd2e6912bc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 19:32:46 -0600 Subject: [PATCH 03/12] Mobi debug: Decompile CTOC and fix interpretation of index entries --- src/calibre/ebooks/mobi/debug.py | 69 +++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 884311617d..9eccd508a0 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct, datetime, sys, os +from collections import OrderedDict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint @@ -509,32 +510,31 @@ class IndexEntry(object): # {{{ 0x3f : 'article', } - def 
__init__(self, ident, entry_type, raw): + def __init__(self, ident, entry_type, raw, is_last): self.id = ident self.fields = [] self.sub_type = None + self.raw = raw try: self.entry_type = self.TYPES[entry_type] except KeyError: - raise ValueError('Unknown IndexEntry type: %s'%hex(entry_type)) + raise ValueError('Unknown Index Entry type: %s'%hex(entry_type)) if self.entry_type in (0xdf, 0xff): self.subtype = ord(raw[0]) raw = raw[1:] - while True: + while raw: val, consumed = decint(raw) raw = raw[consumed:] - if val == 0: - break - else: - self.fields.append(val) - + self.fields.append(val) + if is_last and self.fields[-1] == 0: + self.fields = self.fields[:-1] def __str__(self): - ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s)'%( - self.id, self.entry_type, self.sub_type)] - ans.append('\tFields: %r'%self.fields) + ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s, length=%d)'%( + self.id, self.entry_type, self.sub_type, len(self.raw))] + ans.append('\tFields (%d): %r'%(len(self.fields), self.fields)) return '\n'.join(ans) # }}} @@ -570,16 +570,21 @@ class IndexRecord(object): # {{{ indxt = raw[192:self.idxt_offset] self.indices = [] - for off in self.index_offsets: - index = indxt[off:] - ident, consumed = decode_hex_number(index) - index = index[consumed:] - entry_type, = u(b'>B', index[0]) - self.indices.append(IndexEntry(ident, entry_type, index[1:])) + for i, off in enumerate(self.index_offsets): + try: + next_off = self.index_offsets[i+1] + is_last = False + except: + next_off = len(indxt) + is_last = True + ident, consumed = decode_hex_number(indxt[off:]) + entry_type, = u(b'>B', indxt[off+consumed]) + self.indices.append(IndexEntry(ident, entry_type, + indxt[off+consumed+1:next_off], is_last)) def __str__(self): - ans = ['*'*20 + ' Index Record (%d bytes)'%len(self.record.raw)+ '*'*20] + ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] a = ans.append def u(w): a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, @@ -601,6 
+606,29 @@ class IndexRecord(object): # {{{ # }}} +class CTOC(object) : # {{{ + + def __init__(self, records, codec): + self.records = OrderedDict() + pos = 0 + for record in records: + raw = record.raw + while pos < len(raw): + length, consumed = decint(raw[pos:]) + if length > 0: + self.records[pos] = raw[pos+consumed:pos+consumed+length].decode( + codec) + pos += consumed+length + + def __str__(self): + ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20] + for k, v in self.records.iteritems(): + ans.append('%10d : %s'%(k, v)) + return '\n'.join(ans) + + +# }}} + class MOBIFile(object): # {{{ def __init__(self, stream): @@ -633,6 +661,9 @@ class MOBIFile(object): # {{{ pir = self.mobi_header.primary_index_record if pir != 0xffffffff: self.index_header = IndexHeader(self.records[pir]) + self.ctoc = CTOC(self.records[ + pir+2:pir+2+self.index_header.num_of_ctoc_blocks], + self.index_header.index_encoding) self.index_record = IndexRecord(self.records[pir+1]) @@ -660,6 +691,8 @@ def inspect_mobi(path_or_stream): with open(os.path.join(ddir, 'index.txt'), 'wb') as out: print(str(f.index_header), file=out) print('\n\n', file=out) + print(str(f.ctoc).encode('utf-8'), file=out) + print('\n\n', file=out) print(str(f.index_record), file=out) print ('Debug data saved to:', ddir) From 08dff7d7221ecd070f8ac2d155088be85759a4ab Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 20:04:03 -0600 Subject: [PATCH 04/12] ... 
--- src/calibre/ebooks/mobi/debug.py | 71 +++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 9eccd508a0..dd7707e2f8 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -511,7 +511,7 @@ class IndexEntry(object): # {{{ } def __init__(self, ident, entry_type, raw, is_last): - self.id = ident + self.index = ident self.fields = [] self.sub_type = None self.raw = raw @@ -531,10 +531,69 @@ class IndexEntry(object): # {{{ if is_last and self.fields[-1] == 0: self.fields = self.fields[:-1] + self.interpret() + + def interpret(self): + self.offset = self.fields[0] + self.object_size = self.fields[1] + self.label_offset = self.fields[2] + self.depth = self.fields[3] + self.extra = OrderedDict() + self.extra_fields = [] + if self.entry_type == 'subchapter': + self.parent_index = self.fields[4] + self.extra['Parent chapter index'] = 'parent_index' + self.extra_fields = self.fields[5:] + elif self.entry_type == 'article': + self.class_offset = self.fields[4] + self.extra['Class offset in CTOC'] = 'class_offset' + self.parent_index = self.fields[5] + self.extra['Parent section index'] = 'parent_index' + if len(self.fields) > 6: + self.desc_offset = self.fields[6] + self.extra['Decription offset in CTOC'] = 'desc_offset' + if len(self.fields) > 7: + self.author_offset = self.fields[7] + self.extra['Author offset in CTOC'] = 'author_offset' + self.extra_fields = self.fields[8:] + elif self.entry_type == 'chapter_with_subchapters': + self.first_subchapter_index = self.fields[4] + self.last_subchapter_index = self.fields[5] + self.extra['First subchapter index'] = 'first_subchapter_index' + self.extra['Last subchapter index'] = 'last_subchapter_index' + self.extra_fields = self.fields[6:] + elif self.entry_type == 'periodical': + self.class_offset = self.fields[4] + self.extra['Class offset in CTOC'] = 'class_offset' + 
self.first_section_index = self.fields[5] + self.last_section_index = self.fields[6] + self.extra['First section index'] = 'first_section_index' + self.extra['Last section index'] = 'last_section_index' + self.extra_fields = self.fields[7:] + elif self.entry_type == 'section': + self.class_offset = self.fields[4] + self.extra['Class offset in CTOC'] = 'class_offset' + self.periodical_index = self.fields[5] + self.extra['Periodical index'] = 'periodical_index' + self.first_article_index = self.fields[6] + self.last_article_index = self.fields[7] + self.extra['First article index'] = 'first_article_index' + self.extra['Last article index'] = 'last_article_index' + self.extra_fields = self.fields[8:] + def __str__(self): - ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s, length=%d)'%( - self.id, self.entry_type, self.sub_type, len(self.raw))] - ans.append('\tFields (%d): %r'%(len(self.fields), self.fields)) + ans = ['Index Entry(index=%s, entry_type=%s, sub_type=%s, length=%d)'%( + self.index, self.entry_type, self.sub_type, len(self.raw))] + ans.append('\tOffset in HTML: %d'%self.offset) + ans.append('\tObject size in HTML: %d'%self.object_size) + ans.append('\tLabel offset in CTOC: %d'%self.label_offset) + ans.append('\tDepth: %d'%self.depth) + for text, attr in self.extra.iteritems(): + ans.append('\t%s: %d'%(text, getattr(self, attr))) + if self.extra_fields: + ans.append('\tExtra Fields (%d): %r'%(len(self.extra_fields), + self.extra_fields)) + return '\n'.join(ans) # }}} @@ -577,9 +636,9 @@ class IndexRecord(object): # {{{ except: next_off = len(indxt) is_last = True - ident, consumed = decode_hex_number(indxt[off:]) + index, consumed = decode_hex_number(indxt[off:]) entry_type, = u(b'>B', indxt[off+consumed]) - self.indices.append(IndexEntry(ident, entry_type, + self.indices.append(IndexEntry(index, entry_type, indxt[off+consumed+1:next_off], is_last)) From 55987fa6cb801e196cc84f8e7418e7be40db63fe Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 
Jul 2011 22:38:04 -0600 Subject: [PATCH 05/12] Mobi debug: Figured out the TAGX table, use it to properly decode the index entries --- src/calibre/ebooks/mobi/debug.py | 189 +++++++++++++++++-------------- 1 file changed, 105 insertions(+), 84 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index dd7707e2f8..2dd26e9f83 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -377,18 +377,17 @@ class TagX(object): # {{{ def __init__(self, raw, control_byte_count): self.tag = ord(raw[0]) self.num_values = ord(raw[1]) - self.bmask = ord(raw[2]) - self.bitmask = bin(self.bmask) + self.bitmask = ord(raw[2]) # End of file = 1 iff last entry # When it is 1 all others are 0 self.eof = ord(raw[3]) self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0 - and self.bmask == 0) + and self.bitmask == 0) def __repr__(self): - return 'TAGX(tag=%02d, num_values=%d, bitmask=%r (%d), eof=%d)' % (self.tag, - self.num_values, self.bitmask, self.bmask, self.eof) + return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % (self.tag, + self.num_values, bin(self.bitmask), self.eof) # }}} class IndexHeader(object): # {{{ @@ -444,6 +443,7 @@ class IndexHeader(object): # {{{ self.tagx_control_byte_count)) if self.tagx_entries and not self.tagx_entries[-1].is_eof: raise ValueError('TAGX last entry is not EOF') + self.tagx_entries = self.tagx_entries[:-1] idxt0_pos = self.header_length+self.tagx_header_length last_num, consumed = decode_hex_number(raw[idxt0_pos:]) @@ -497,6 +497,81 @@ class IndexHeader(object): # {{{ return '\n'.join(ans) # }}} +class Tag(object): # {{{ + + ''' + Index entries are a collection of tags. Each tag is represented by this + class. 
+ ''' + + TAG_MAP = { + 1: ('offset', 'Offset in HTML'), + 2: ('size', 'Size in HTML'), + 3: ('label_offset', 'Offset to label in CNCX'), + 4: ('depth', 'Depth of this entry in TOC'), + + # The remaining tag types have to be interpreted subject to the type + # of index entry they are present in + } + + INTERPRET_MAP = { + 'subchapter': { + 5 : ('Parent chapter index', 'parent_index') + }, + + 'article' : { + 5 : ('Class offset in CTOC', 'class_offset'), + 21 : ('Parent section index', 'parent_index'), + 22 : ('Description offset in CTOC', 'desc_offset'), + 23 : ('Author offset in CTOC', 'author_offset'), + }, + + 'chapter_with_subchapters' : { + 22 : ('First subchapter index', 'first_subchapter_index'), + 23 : ('Last subchapter index', 'last_subchapter_index'), + }, + + 'periodical' : { + 5 : ('Class offset in CTOC', 'class_offset'), + 22 : ('First section index', 'first_section_index'), + 23 : ('Last section index', 'last_section_index'), + }, + + 'section' : { + 5 : ('Class offset in CTOC', 'class_offset'), + 21 : ('Periodical index', 'periodical_index'), + 22 : ('First article index', 'first_article_index'), + 23 : ('Last article index', 'last_article_index'), + }, + } + + + def __init__(self, tagx, vals, entry_type, ctoc): + self.value = vals if len(vals) > 1 else vals[0] + self.entry_type = entry_type + self.ctoc_value = None + if tagx.tag in self.TAG_MAP: + self.attr, self.desc = self.TAG_MAP[tagx.tag] + else: + try: + td = self.INTERPRET_MAP[entry_type] + except: + raise ValueError('Unknown entry type: %s'%entry_type) + try: + self.desc, self.attr = td[tagx.tag] + except: + raise ValueError('Unknown tag: %d for entry type: %s'%( + tagx.tag, entry_type)) + if '_offset' in self.attr: + self.ctoc_value = ctoc[self.value] + + def __str__(self): + if self.ctoc_value is not None: + return '%s : %r [%r]'%(self.desc, self.value, self.ctoc_value) + return '%s : %r'%(self.desc, self.value) + +# }}} + class IndexEntry(object): # {{{ TYPES = { @@ -510,97 +585,41 @@ 
class IndexEntry(object): # {{{ 0x3f : 'article', } - def __init__(self, ident, entry_type, raw, is_last): + def __init__(self, ident, entry_type, raw, ctoc, tagx_entries): self.index = ident - self.fields = [] - self.sub_type = None self.raw = raw + self.tags = [] try: self.entry_type = self.TYPES[entry_type] except KeyError: raise ValueError('Unknown Index Entry type: %s'%hex(entry_type)) - if self.entry_type in (0xdf, 0xff): - self.subtype = ord(raw[0]) - raw = raw[1:] - while raw: - val, consumed = decint(raw) - raw = raw[consumed:] - self.fields.append(val) - if is_last and self.fields[-1] == 0: - self.fields = self.fields[:-1] + expected_tags = [tag for tag in tagx_entries if tag.bitmask & + entry_type] - self.interpret() - - def interpret(self): - self.offset = self.fields[0] - self.object_size = self.fields[1] - self.label_offset = self.fields[2] - self.depth = self.fields[3] - self.extra = OrderedDict() - self.extra_fields = [] - if self.entry_type == 'subchapter': - self.parent_index = self.fields[4] - self.extra['Parent chapter index'] = 'parent_index' - self.extra_fields = self.fields[5:] - elif self.entry_type == 'article': - self.class_offset = self.fields[4] - self.extra['Class offset in CTOC'] = 'class_offset' - self.parent_index = self.fields[5] - self.extra['Parent section index'] = 'parent_index' - if len(self.fields) > 6: - self.desc_offset = self.fields[6] - self.extra['Decription offset in CTOC'] = 'desc_offset' - if len(self.fields) > 7: - self.author_offset = self.fields[7] - self.extra['Author offset in CTOC'] = 'author_offset' - self.extra_fields = self.fields[8:] - elif self.entry_type == 'chapter_with_subchapters': - self.first_subchapter_index = self.fields[4] - self.last_subchapter_index = self.fields[5] - self.extra['First subchapter index'] = 'first_subchapter_index' - self.extra['Last subchapter index'] = 'last_subchapter_index' - self.extra_fields = self.fields[6:] - elif self.entry_type == 'periodical': - self.class_offset = 
self.fields[4] - self.extra['Class offset in CTOC'] = 'class_offset' - self.first_section_index = self.fields[5] - self.last_section_index = self.fields[6] - self.extra['First section index'] = 'first_section_index' - self.extra['Last section index'] = 'last_section_index' - self.extra_fields = self.fields[7:] - elif self.entry_type == 'section': - self.class_offset = self.fields[4] - self.extra['Class offset in CTOC'] = 'class_offset' - self.periodical_index = self.fields[5] - self.extra['Periodical index'] = 'periodical_index' - self.first_article_index = self.fields[6] - self.last_article_index = self.fields[7] - self.extra['First article index'] = 'first_article_index' - self.extra['Last article index'] = 'last_article_index' - self.extra_fields = self.fields[8:] + for tag in expected_tags: + vals = [] + for i in range(tag.num_values): + if not raw: + raise ValueError('Index entry does not match TAGX header') + val, consumed = decint(raw) + raw = raw[consumed:] + vals.append(val) + self.tags.append(Tag(tag, vals, self.entry_type, ctoc)) def __str__(self): - ans = ['Index Entry(index=%s, entry_type=%s, sub_type=%s, length=%d)'%( - self.index, self.entry_type, self.sub_type, len(self.raw))] - ans.append('\tOffset in HTML: %d'%self.offset) - ans.append('\tObject size in HTML: %d'%self.object_size) - ans.append('\tLabel offset in CTOC: %d'%self.label_offset) - ans.append('\tDepth: %d'%self.depth) - for text, attr in self.extra.iteritems(): - ans.append('\t%s: %d'%(text, getattr(self, attr))) - if self.extra_fields: - ans.append('\tExtra Fields (%d): %r'%(len(self.extra_fields), - self.extra_fields)) - + ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( + self.index, self.entry_type, len(self.tags))] + for tag in self.tags: + ans.append('\t'+str(tag)) return '\n'.join(ans) # }}} class IndexRecord(object): # {{{ - def __init__(self, record): + def __init__(self, record, index_header, ctoc): self.record = record raw = self.record.raw if raw[:4] != b'INDX': @@ 
-632,14 +651,12 @@ class IndexRecord(object): # {{{ for i, off in enumerate(self.index_offsets): try: next_off = self.index_offsets[i+1] - is_last = False except: next_off = len(indxt) - is_last = True index, consumed = decode_hex_number(indxt[off:]) - entry_type, = u(b'>B', indxt[off+consumed]) + entry_type = ord(indxt[off+consumed]) self.indices.append(IndexEntry(index, entry_type, - indxt[off+consumed+1:next_off], is_last)) + indxt[off+consumed+1:next_off], ctoc, index_header.tagx_entries)) def __str__(self): @@ -679,6 +696,9 @@ class CTOC(object) : # {{{ codec) pos += consumed+length + def __getitem__(self, offset): + return self.records.get(offset) + def __str__(self): ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20] for k, v in self.records.iteritems(): @@ -723,7 +743,8 @@ class MOBIFile(object): # {{{ self.ctoc = CTOC(self.records[ pir+2:pir+2+self.index_header.num_of_ctoc_blocks], self.index_header.index_encoding) - self.index_record = IndexRecord(self.records[pir+1]) + self.index_record = IndexRecord(self.records[pir+1], + self.index_header, self.ctoc) def print_header(self, f=sys.stdout): From a389b310c63a5f6bfa63c48de1ab5dad3c33d9e4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 22:40:25 -0600 Subject: [PATCH 06/12] ... 
--- src/calibre/ebooks/mobi/debug.py | 44 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 2dd26e9f83..32578781b8 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -421,7 +421,7 @@ class IndexHeader(object): # {{{ self.ordt_start, = struct.unpack('>I', raw[40:44]) self.ligt_start, = struct.unpack('>I', raw[44:48]) self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52]) - self.num_of_ctoc_blocks, = struct.unpack('>I', raw[52:56]) + self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56]) self.unknown2 = raw[56:180] self.tagx_offset, = struct.unpack(b'>I', raw[180:184]) if self.tagx_offset != self.header_length: @@ -482,7 +482,7 @@ class IndexHeader(object): # {{{ a('ORDT start: %d'%self.ordt_start) a('LIGT start: %d'%self.ligt_start) a('Number of LIGT entries: %d'%self.num_of_ligt_entries) - a('Number of CTOC blocks: %d'%self.num_of_ctoc_blocks) + a('Number of cncx blocks: %d'%self.num_of_cncx_blocks) u(self.unknown2) a('TAGX offset: %d'%self.tagx_offset) u(self.unknown3) @@ -520,10 +520,10 @@ class Tag(object): # {{{ }, 'article' : { - 5 : ('Class offset in CTOC', 'class_offset'), + 5 : ('Class offset in cncx', 'class_offset'), 21 : ('Parent section index', 'parent_index'), - 22 : ('Description offset in CTOC', 'desc_offset'), - 23 : ('Author offset in CTOC', 'author_offset'), + 22 : ('Description offset in cncx', 'desc_offset'), + 23 : ('Author offset in cncx', 'author_offset'), }, 'chapter_with_subchapters' : { @@ -532,13 +532,13 @@ class Tag(object): # {{{ }, 'periodical' : { - 5 : ('Class offset in CTOC', 'class_offset'), + 5 : ('Class offset in cncx', 'class_offset'), 22 : ('First section index', 'first_section_index'), 23 : ('Last section index', 'last_section_index'), }, 'section' : { - 5 : ('Class offset in CTOC', 'class_offset'), + 5 : ('Class offset in cncx', 'class_offset'), 21 : ('Periodical 
index', 'periodical_index'), 22 : ('First article index', 'first_article_index'), 23 : ('Last article index', 'last_article_index'), @@ -546,10 +546,10 @@ class Tag(object): # {{{ } - def __init__(self, tagx, vals, entry_type, ctoc): + def __init__(self, tagx, vals, entry_type, cncx): self.value = vals if len(vals) > 1 else vals[0] self.entry_type = entry_type - self.ctoc_value = None + self.cncx_value = None if tagx.tag in self.TAG_MAP: self.attr, self.desc = self.TAG_MAP[tagx.tag] else: @@ -563,11 +563,11 @@ class Tag(object): # {{{ raise ValueError('Unknown tag: %d for entry type: %s'%( tagx.tag, entry_type)) if '_offset' in self.attr: - self.ctoc_value = ctoc[self.value] + self.cncx_value = cncx[self.value] def __str__(self): - if self.ctoc_value is not None: - return '%s : %r [%r]'%(self.desc, self.value, self.ctoc_value) + if self.cncx_value is not None: + return '%s : %r [%r]'%(self.desc, self.value, self.cncx_value) return '%s : %r'%(self.desc, self.value) # }}} @@ -585,7 +585,7 @@ class IndexEntry(object): # {{{ 0x3f : 'article', } - def __init__(self, ident, entry_type, raw, ctoc, tagx_entries): + def __init__(self, ident, entry_type, raw, cncx, tagx_entries): self.index = ident self.raw = raw self.tags = [] @@ -606,7 +606,7 @@ class IndexEntry(object): # {{{ val, consumed = decint(raw) raw = raw[consumed:] vals.append(val) - self.tags.append(Tag(tag, vals, self.entry_type, ctoc)) + self.tags.append(Tag(tag, vals, self.entry_type, cncx)) def __str__(self): ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( @@ -619,7 +619,7 @@ class IndexEntry(object): # {{{ class IndexRecord(object): # {{{ - def __init__(self, record, index_header, ctoc): + def __init__(self, record, index_header, cncx): self.record = record raw = self.record.raw if raw[:4] != b'INDX': @@ -656,7 +656,7 @@ class IndexRecord(object): # {{{ index, consumed = decode_hex_number(indxt[off:]) entry_type = ord(indxt[off+consumed]) self.indices.append(IndexEntry(index, entry_type, - 
indxt[off+consumed+1:next_off], ctoc, index_header.tagx_entries)) + indxt[off+consumed+1:next_off], cncx, index_header.tagx_entries)) def __str__(self): @@ -682,7 +682,7 @@ class IndexRecord(object): # {{{ # }}} -class CTOC(object) : # {{{ +class CNCX(object) : # {{{ def __init__(self, records, codec): self.records = OrderedDict() @@ -700,7 +700,7 @@ class CTOC(object) : # {{{ return self.records.get(offset) def __str__(self): - ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20] + ans = ['*'*20 + ' cncx (%d strings) '%len(self.records)+ '*'*20] for k, v in self.records.iteritems(): ans.append('%10d : %s'%(k, v)) return '\n'.join(ans) @@ -740,11 +740,11 @@ class MOBIFile(object): # {{{ pir = self.mobi_header.primary_index_record if pir != 0xffffffff: self.index_header = IndexHeader(self.records[pir]) - self.ctoc = CTOC(self.records[ - pir+2:pir+2+self.index_header.num_of_ctoc_blocks], + self.cncx = CNCX(self.records[ + pir+2:pir+2+self.index_header.num_of_cncx_blocks], self.index_header.index_encoding) self.index_record = IndexRecord(self.records[pir+1], - self.index_header, self.ctoc) + self.index_header, self.cncx) def print_header(self, f=sys.stdout): @@ -771,7 +771,7 @@ def inspect_mobi(path_or_stream): with open(os.path.join(ddir, 'index.txt'), 'wb') as out: print(str(f.index_header), file=out) print('\n\n', file=out) - print(str(f.ctoc).encode('utf-8'), file=out) + print(str(f.cncx).encode('utf-8'), file=out) print('\n\n', file=out) print(str(f.index_record), file=out) From 1b136b6fec5ec4c6eba6f14decce2f2ec4b11e67 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 07:05:05 -0600 Subject: [PATCH 07/12] Fix #812750 (Literati (aka Azbooka) does not detect SD Card) --- src/calibre/devices/hanvon/driver.py | 2 +- src/calibre/ebooks/mobi/debug.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/hanvon/driver.py b/src/calibre/devices/hanvon/driver.py index 3ce0fedac0..11b064b783 
100644 --- a/src/calibre/devices/hanvon/driver.py +++ b/src/calibre/devices/hanvon/driver.py @@ -131,7 +131,7 @@ class AZBOOKA(ALEX): description = _('Communicate with the Azbooka') VENDOR_NAME = 'LINUX' - WINDOWS_MAIN_MEM = 'FILE-STOR_GADGET' + WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET' MAIN_MEMORY_VOLUME_LABEL = 'Azbooka Internal Memory' diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 32578781b8..8ffa3aa15b 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -574,6 +574,13 @@ class Tag(object): # {{{ class IndexEntry(object): # {{{ + ''' + The index is made up of entries, each of which is represented by an + instance of this class. Index entries typically point to offsets in the + HTML, specify HTML sizes and point to text strings in the CNCX that are + used in the navigation UI. + ''' + TYPES = { # Present in book type files 0x0f : 'chapter', @@ -619,6 +626,11 @@ class IndexEntry(object): # {{{ class IndexRecord(object): # {{{ + ''' + Represents all indexing information in the MOBI, apart from indexing info + in the trailing data of the text records. + ''' + def __init__(self, record, index_header, cncx): self.record = record raw = self.record.raw @@ -684,6 +696,12 @@ class IndexRecord(object): # {{{ class CNCX(object) : # {{{ + ''' + Parses the records that contain the compiled NCX (all strings from the + NCX). Presents a simple offset : string mapping interface to access the + data. 
+ ''' + def __init__(self, records, codec): self.records = OrderedDict() pos = 0 From 7612ec7ad840353a9679921284177171561c9ddb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 07:10:24 -0600 Subject: [PATCH 08/12] Updated NBOnline and JBPress --- recipes/jbpress.recipe | 11 ++++++++++- recipes/nbonline.recipe | 7 +++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/recipes/jbpress.recipe b/recipes/jbpress.recipe index acfb1c78d6..1048f1fc9a 100644 --- a/recipes/jbpress.recipe +++ b/recipes/jbpress.recipe @@ -1,4 +1,4 @@ -import urllib2 +import urllib2, re from calibre.web.feeds.news import BasicNewsRecipe class JBPress(BasicNewsRecipe): @@ -40,3 +40,12 @@ class JBPress(BasicNewsRecipe): def print_version(self, url): url = urllib2.urlopen(url).geturl() # resolve redirect. return url.replace('/-/', '/print/') + + def preprocess_html(self, soup): + # remove breadcrumb + h3s = soup.findAll('h3') + for h3 in h3s: + if re.compile('^JBpress>').match(h3.string): + h3.extract() + return soup + diff --git a/recipes/nbonline.recipe b/recipes/nbonline.recipe index c5a06edec7..82b7667a5c 100644 --- a/recipes/nbonline.recipe +++ b/recipes/nbonline.recipe @@ -1,11 +1,10 @@ -EMAILADDRESS = 'hoge@foobar.co.jp' from calibre.web.feeds.news import BasicNewsRecipe class NBOnline(BasicNewsRecipe): title = u'Nikkei Business Online' language = 'ja' - description = u'Nikkei Business Online New articles. PLEASE NOTE: You need to edit EMAILADDRESS line of this "nbonline.recipe" file to set your e-mail address which is needed when login. (file is in "Calibre2/resources/recipes" directory.)' + description = u'Nikkei Business Online.\u6CE8\uFF1A\u30E6\u30FC\u30B6\u30FC\u540D\u306Bemail\u30A2\u30C9\u30EC\u30B9\u3068\u30E6\u30FC\u30B6\u30FC\u540D\u3092\u30BB\u30DF\u30B3\u30ED\u30F3\u3067\u533A\u5207\u3063\u3066\u5165\u308C\u3066\u304F\u3060\u3055\u3044\u3002\u4F8B\uFF1Aemail@address.jp;username . 
PLEASE NOTE: You need to put your email address and username into username filed separeted by ; (semi-colon).' __author__ = 'Ado Nishimura' needs_subscription = True oldest_article = 7 @@ -23,8 +22,8 @@ class NBOnline(BasicNewsRecipe): if self.username is not None and self.password is not None: br.open('https://signon.nikkeibp.co.jp/front/login/?ct=p&ts=nbo') br.select_form(name='loginActionForm') - br['email'] = EMAILADDRESS - br['userId'] = self.username + br['email'] = self.username.split(';')[0] + br['userId'] = self.username.split(';')[1] br['password'] = self.password br.submit() return br From 93fef1787ed9df4f0d5a02a849603b44d3c9d18f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 07:17:20 -0600 Subject: [PATCH 09/12] =?UTF-8?q?De=20Luns=20a=20Venres=20by=20Susana=20So?= =?UTF-8?q?telo=20Doc=C3=ADo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- recipes/galicia_confidential.recipe | 22 +++++++-------- recipes/luns_a_venres.recipe | 44 +++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 12 deletions(-) create mode 100644 recipes/luns_a_venres.recipe diff --git a/recipes/galicia_confidential.recipe b/recipes/galicia_confidential.recipe index d07946001e..4aaf434b09 100644 --- a/recipes/galicia_confidential.recipe +++ b/recipes/galicia_confidential.recipe @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- - from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds import Feed @@ -36,14 +35,13 @@ class GC_gl(BasicNewsRecipe): def feed_to_index_append(self, feedObject, masterFeed): - for feed in feedObject: - newArticles = [] - for article in feed.articles: - newArt = { - 'title' : article.title, - 'url' : article.url, - 'date' : article.date - } - newArticles.append(newArt) - masterFeed.append((feed.title,newArticles)) - + for feed in feedObject: + newArticles = [] + for article in feed.articles: + newArt = { + 'title' : article.title, + 'url' : article.url, + 'date' : 
article.date + } + newArticles.append(newArt) + masterFeed.append((feed.title,newArticles)) diff --git a/recipes/luns_a_venres.recipe b/recipes/luns_a_venres.recipe new file mode 100644 index 0000000000..1d7a2c159f --- /dev/null +++ b/recipes/luns_a_venres.recipe @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +from calibre.web.feeds.news import BasicNewsRecipe + +class LV_gl(BasicNewsRecipe): + title = u'De Luns a Venres (RSS)' + __author__ = u'Susana Sotelo Docío' + description = u'O gratuíto galego' + publisher = u'Galiciaé' + category = u'news' + encoding = 'utf-8' + language = 'gl' + direction = 'ltr' + cover_url = 'http://lv.galiciae.com/new_estilos/lv/logo.gif' + oldest_article = 2 + max_articles_per_feed = 200 + center_navbar = False + + feeds = [ + (u'Galicia', u'http://lv.galiciae.com/cache/rss/sec_galicia_gl.rss'), + (u'Cultura', u'http://lv.galiciae.com/cache/rss/sec_cultura_gl.rss'), + (u'Mundo', u'http://lv.galiciae.com/cache/rss/sec_mundo_gl.rss'), + (u'Cidadanía', u'http://lv.galiciae.com/cache/rss/sec_ciudadania_gl.rss'), + (u'Tecnoloxía', u'http://lv.galiciae.com/cache/rss/sec_tecnologia_gl.rss'), + (u'España', u'http://lv.galiciae.com/cache/rss/sec_espana_gl.rss'), + (u'Deportes', u'http://lv.galiciae.com/cache/rss/sec_deportes_gl.rss'), + (u'Economía', u'http://lv.galiciae.com/cache/rss/sec_economia_gl.rss'), + (u'Lercheo', u'http://lv.galiciae.com/cache/rss/sec_gente_gl.rss'), + (u'Medio ambiente', u'http://lv.galiciae.com/cache/rss/sec_medioambiente_gl.rss'), + (u'España/Mundo', u'http://lv.galiciae.com/cache/rss/sec_espanamundo_gl.rss'), + (u'Sociedade', u'http://lv.galiciae.com/cache/rss/sec_sociedad_gl.rss'), + (u'Ciencia', u'http://lv.galiciae.com/cache/rss/sec_ciencia_gl.rss'), + (u'Motor', u'http://lv.galiciae.com/cache/rss/sec_motor_gl.rss'), + (u'Coches', u'http://lv.galiciae.com/cache/rss/sec_coches_gl.rss'), + (u'Motos', u'http://lv.galiciae.com/cache/rss/sec_motos_gl.rss'), + (u'Industriais', 
u'http://lv.galiciae.com/cache/rss/sec_industriales_gl.rss') + ] + + extra_css = u' p{text-align:left} ' + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\nencoding="' + encoding + '"\ntags="' + category + '"\noverride_css=" p {text-align:left; text-indent: 0cm} "' + + def print_version(self, url): + url += '?imprimir&lang=gl' + return url + From 9cc367ae24ee341e507fbd1cf815de527d54195a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 08:09:11 -0600 Subject: [PATCH 10/12] MOBI Input: When extracting images, ignore records that are known as non images faster --- src/calibre/ebooks/mobi/reader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 1173b84266..d704379cf1 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -933,6 +933,9 @@ class MobiReader(object): continue processed_records.append(i) data = self.sections[i][0] + if data[:4] in (b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n'): + # A FLIS, FCIS, SRCS or EOF record, ignore + continue buf = cStringIO.StringIO(data) image_index += 1 try: From c7ea8f4886d84035dde2c99b53cbc1b27c436596 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 09:50:02 -0600 Subject: [PATCH 11/12] Conversion pipeline: When adding/removing entries to the manifest, ignore unparseable URLs instead of erroring out on them --- src/calibre/ebooks/oeb/reader.py | 9 +++++++-- src/calibre/ebooks/oeb/transforms/trimmanifest.py | 5 ++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 5bb6b193f7..9e4b6238a0 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -188,8 +188,13 @@ class OEBReader(object): href, _ = urldefrag(href) if not href: continue - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme + try: + href = 
item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + except: + self.oeb.log.exception( + 'Skipping invalid href: %r'%href) + continue if not scheme and href not in known: new.add(href) elif item.media_type in OEB_STYLES: diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index 95501dbb9b..3d56f0ef3d 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -47,7 +47,10 @@ class ManifestTrimmer(object): item.data is not None: hrefs = [r[2] for r in iterlinks(item.data)] for href in hrefs: - href = item.abshref(urlnormalize(href)) + try: + href = item.abshref(urlnormalize(href)) + except: + continue if href in oeb.manifest.hrefs: found = oeb.manifest.hrefs[href] if found not in used: From c0bb5902ac5d712d9549438ff96ad1e7a1e25f51 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 09:50:33 -0600 Subject: [PATCH 12/12] Mobi debug: Dump text/image and unparsed binary records --- src/calibre/ebooks/mobi/debug.py | 125 +++++++++++++++++++++-- src/calibre/ebooks/mobi/writer2/utils.py | 23 +++++ 2 files changed, 142 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 8ffa3aa15b..2dbe363e7c 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -7,11 +7,13 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime, sys, os +import struct, datetime, sys, os, shutil from collections import OrderedDict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language -from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint +from calibre.ebooks.mobi.writer2.utils import (decode_hex_number, decint, + get_trailing_data) +from calibre.utils.magick.draw import identify_data # PalmDB {{{ class 
PalmDOCAttributes(object): @@ -278,6 +280,7 @@ class MOBIHeader(object): # {{{ self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 self.has_fcis_flis = False self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False + self.extra_data_flags = 0 if self.has_extra_data_flags: self.unknown4 = self.raw[180:192] self.first_content_record, self.last_content_record = \ @@ -726,6 +729,63 @@ class CNCX(object) : # {{{ # }}} +class TextRecord(object): # {{{ + + def __init__(self, idx, record, extra_data_flags, decompress): + self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) + self.raw = decompress(self.raw) + if 0 in self.trailing_data: + self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) + if 1 in self.trailing_data: + self.trailing_data['indexing'] = self.trailing_data.pop(1) + if 2 in self.trailing_data: + self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) + + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.txt'), 'wb') as f: + f.write(self.raw) + with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: + for k, v in self.trailing_data.iteritems(): + raw = '%s : %r\n\n'%(k, v) + f.write(raw.encode('utf-8')) + +# }}} + +class ImageRecord(object): # {{{ + + def __init__(self, idx, record, fmt): + self.raw = record.raw + self.fmt = fmt + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.'+self.fmt), 'wb') as f: + f.write(self.raw) + +# }}} + +class BinaryRecord(object): # {{{ + + def __init__(self, idx, record): + self.raw = record.raw + sig = self.raw[:4] + name = '%06d'%idx + if sig in (b'FCIS', b'FLIS', b'SRCS'): + name += '-' + sig.decode('ascii') + elif sig == b'\xe9\x8e\r\n': + name += '-' + 'EOF' + self.name = name + + def dump(self, folder): + with open(os.path.join(folder, self.name+'.bin'), 'wb') as f: + f.write(self.raw) + +# 
}}} + class MOBIFile(object): # {{{ def __init__(self, stream): @@ -754,7 +814,22 @@ class MOBIFile(object): # {{{ self.mobi_header = MOBIHeader(self.records[0]) + if 'huff' in self.mobi_header.compression.lower(): + huffrecs = [r.raw for r in + xrange(self.mobi_header.huffman_record_offset, + self.mobi_header.huffman_record_offset + + self.mobi_header.huffman_record_count)] + from calibre.ebooks.mobi.huffcdic import HuffReader + huffs = HuffReader(huffrecs) + decompress = huffs.decompress + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + decompress = decompress_doc + else: + decompress = lambda x: x + self.index_header = None + self.indexing_record_nums = set() pir = self.mobi_header.primary_index_record if pir != 0xffffffff: self.index_header = IndexHeader(self.records[pir]) @@ -763,6 +838,34 @@ class MOBIFile(object): # {{{ self.index_header.index_encoding) self.index_record = IndexRecord(self.records[pir+1], self.index_header, self.cncx) + self.indexing_record_nums = set(xrange(pir, + pir+2+self.index_header.num_of_cncx_blocks)) + + + ntr = self.mobi_header.number_of_text_records + fntbr = self.mobi_header.first_non_book_record + fii = self.mobi_header.first_image_index + if fntbr == 0xffffffff: + fntbr = len(self.records) + self.text_records = [TextRecord(r, self.records[r], + self.mobi_header.extra_data_flags, decompress) for r in xrange(1, + min(len(self.records), ntr+1))] + self.image_records, self.binary_records = [], [] + for i in xrange(fntbr, len(self.records)): + if i in self.indexing_record_nums: + continue + r = self.records[i] + fmt = None + if i >= fii and r.raw[:4] not in (b'FLIS', b'FCIS', b'SRCS', + b'\xe9\x8e\r\n'): + try: + width, height, fmt = identify_data(r.raw) + except: + pass + if fmt is not None: + self.image_records.append(ImageRecord(i, r, fmt)) + else: + self.binary_records.append(BinaryRecord(i, r)) def print_header(self, f=sys.stdout): @@ -776,13 +879,16 @@ 
class MOBIFile(object): # {{{ print (str(self.mobi_header).encode('utf-8'), file=f) # }}} -def inspect_mobi(path_or_stream): +def inspect_mobi(path_or_stream, prefix='decompiled'): stream = (path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')) f = MOBIFile(stream) - ddir = 'debug_' + os.path.splitext(os.path.basename(stream.name))[0] - if not os.path.exists(ddir): - os.mkdir(ddir) + ddir = prefix + '_' + os.path.splitext(os.path.basename(stream.name))[0] + try: + shutil.rmtree(ddir) + except: + pass + os.mkdir(ddir) with open(os.path.join(ddir, 'header.txt'), 'wb') as out: f.print_header(f=out) if f.index_header is not None: @@ -793,6 +899,13 @@ def inspect_mobi(path_or_stream): print('\n\n', file=out) print(str(f.index_record), file=out) + for tdir, attr in [('text', 'text_records'), ('images', 'image_records'), + ('binary', 'binary_records')]: + tdir = os.path.join(ddir, tdir) + os.mkdir(tdir) + for rec in getattr(f, attr): + rec.dump(tdir) + print ('Debug data saved to:', ddir) def main(): diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py index 8166bdf328..708b9152d4 100644 --- a/src/calibre/ebooks/mobi/writer2/utils.py +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct +from collections import OrderedDict from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail @@ -150,4 +151,26 @@ def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): scale -= 0.05 return data +def get_trailing_data(record, extra_data_flags): + ''' + Given a text record as a bytestring and the extra data flags from the MOBI + header, return the trailing data as a dictionary, mapping bit number to + data as bytestring. Also returns the record - all trailing data. 
+ + :return: Trailing data, record - trailing data + ''' + data = OrderedDict() + for i in xrange(16, -1, -1): + flag = 2**i + if flag & extra_data_flags: + if i == 0: + # Only the first two bits are used for the size since there can + # never be more than 3 trailing multibyte chars + sz = ord(record[-1]) & 0b11 + consumed = 1 + else: + sz, consumed = decint(record, forward=False) + data[i] = record[-(sz+consumed):-consumed] + record = record[:-(sz+consumed)] + return data, record