From 96d51719af4816fe50c6f6315c1a9bf9982559b7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jul 2011 13:30:13 -0600 Subject: [PATCH 01/14] ... --- src/calibre/ebooks/mobi/writer2/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py index dc9526eb77..cd0ee453c3 100644 --- a/src/calibre/ebooks/mobi/writer2/utils.py +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -161,7 +161,7 @@ def get_trailing_data(record, extra_data_flags): ''' data = OrderedDict() for i in xrange(16, -1, -1): - flag = 2**i + flag = 1 << i # 2**i if flag & extra_data_flags: if i == 0: # Only the first two bits are used for the size since there can From 629da2b324ff5017e3a316bed0722cb591259258 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jul 2011 14:01:41 -0600 Subject: [PATCH 02/14] ... --- src/calibre/ebooks/mobi/debug.py | 62 +------------------ .../ebooks/mobi/{writer2 => }/utils.py | 0 src/calibre/ebooks/mobi/writer2/main.py | 2 +- 3 files changed, 4 insertions(+), 60 deletions(-) rename src/calibre/ebooks/mobi/{writer2 => }/utils.py (100%) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 9bc587c527..971f037479 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -11,7 +11,7 @@ import struct, datetime, sys, os, shutil from collections import OrderedDict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language -from calibre.ebooks.mobi.writer2.utils import (decode_hex_number, decint, +from calibre.ebooks.mobi.utils import (decode_hex_number, decint, get_trailing_data) from calibre.utils.magick.draw import identify_data @@ -738,8 +738,7 @@ class CNCX(object) : # {{{ class TextRecord(object): # {{{ - def __init__(self, idx, record, extra_data_flags, decompress, index_record, - doc_type): + def __init__(self, idx, record, extra_data_flags, decompress): self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) self.raw = decompress(self.raw) if 0 in self.trailing_data: @@ -751,60 +750,6 @@ class TextRecord(object): # {{{ self.idx = idx - if 'indexing' in self.trailing_data and index_record is not None: - self.interpret_indexing(doc_type, index_record.indices) - - def interpret_indexing(self, doc_type, indices): - raw = self.trailing_data['indexing'] - ident, consumed = decint(raw) - raw = raw[consumed:] - entry_type = ident & 0b111 - index_entry_idx = ident >> 3 - index_entry = None - for i in indices: - if i.index == index_entry_idx: - index_entry = i.label - break - self.trailing_data['interpreted_indexing'] = ( - 'Type: %s, Index Entry: %s'%(entry_type, index_entry)) - if doc_type == 2: # Book - self.interpret_book_indexing(raw, entry_type) - - def interpret_book_indexing(self, raw, entry_type): - arg1, consumed = decint(raw) - raw = raw[consumed:] - if arg1 != 0: - raise ValueError('TBS index entry has unknown arg1: %d'% - arg1) - if entry_type == 2: - desc = ('This record has only a single starting or a single' - ' ending point') - if raw: - raise ValueError('TBS index entry has unknown extra bytes:' - ' %r'%raw) - elif entry_type == 3: - desc = ('This record is spanned by a single node (i.e. it' - ' has no start or end points)') - arg2, consumed = decint(raw) - if arg2 != 0: - raise ValueError('TBS index entry has unknown arg2: %d'% - arg2) - elif entry_type == 6: - if len(raw) != 1: - raise ValueError('TBS index entry has unknown extra bytes:' - ' %r'%raw) - num = ord(raw[0]) - # An unmatched starting or ending point each contributes 1 to - # this count. A matched pair of starting and ending points - # together contribute 1 to this count. Note that you can only - # ever have either 1 unmatched start point or 1 unmatched end - # point, never both (logically impossible). - desc = ('This record has %d starting/ending points and/or complete' - ' nodes.')%num - else: - raise ValueError('Unknown TBS index entry type: %d for book'%entry_type) - self.trailing_data['interpreted_indexing'] += ' :: ' + desc - def dump(self, folder): name = '%06d'%self.idx with open(os.path.join(folder, name+'.txt'), 'wb') as f: @@ -910,8 +855,7 @@ class MOBIFile(object): # {{{ if fntbr == 0xffffffff: fntbr = len(self.records) self.text_records = [TextRecord(r, self.records[r], - self.mobi_header.extra_data_flags, decompress, self.index_record, - self.mobi_header.type_raw) for r in xrange(1, + self.mobi_header.extra_data_flags, decompress) for r in xrange(1, min(len(self.records), ntr+1))] self.image_records, self.binary_records = [], [] for i in xrange(fntbr, len(self.records)): diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/utils.py similarity index 100% rename from src/calibre/ebooks/mobi/writer2/utils.py rename to src/calibre/ebooks/mobi/utils.py diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 76976ce81e..2e9d31458a 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -18,7 +18,7 @@ from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.filenames import ascii_filename from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED -from calibre.ebooks.mobi.writer2.utils import (rescale_image, encint) +from calibre.ebooks.mobi.utils import (rescale_image, encint) EXTH_CODES = { 'creator': 100, From ca1367048d90beba6432d2eec569c0ded602c3a1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 00:12:14 -0600 Subject: [PATCH 03/14] Fix regression that broke loading translations from .po files in the working directory --- src/calibre/translations/msgfmt.py | 101 +++++++++++++++++++++++------ src/calibre/utils/localization.py | 6 +- 2 files changed, 83 insertions(+), 24 deletions(-) diff --git a/src/calibre/translations/msgfmt.py b/src/calibre/translations/msgfmt.py index 3c41cee2cd..a27a6c007f 100644 --- a/src/calibre/translations/msgfmt.py +++ b/src/calibre/translations/msgfmt.py @@ -1,20 +1,39 @@ #! /usr/bin/env python # Written by Martin v. Loewis -# Modified by Kovid Goyal """Generate binary message catalog from textual translation description. This program converts a textual Uniforum-style message catalog (.po file) into a binary GNU catalog (.mo file). This is essentially the same function as the GNU msgfmt program, however, it is a simpler implementation. + +Usage: msgfmt.py [OPTIONS] filename.po + +Options: + -o file + --output-file=file + Specify the output file to write to. If omitted, output will go to a + file named filename.mo (based off the input file name). + + -h + --help + Print this message and exit. + + -V + --version + Display version information and exit. """ import sys import os +import getopt import struct import array -__version__ = "1.2" +__version__ = "1.1" + +MESSAGES = {} + def usage(code, msg=''): print >> sys.stderr, __doc__ @@ -23,16 +42,16 @@ def usage(code, msg=''): sys.exit(code) - -def add(id, str, fuzzy, MESSAGES): +def add(id, str, fuzzy): "Add a non-fuzzy translation to the dictionary." + global MESSAGES if not fuzzy and str: MESSAGES[id] = str - -def generate(MESSAGES): +def generate(): "Return the generated output." + global MESSAGES keys = MESSAGES.keys() # the keys are sorted in the .mo file keys.sort() @@ -44,6 +63,7 @@ def generate(MESSAGES): offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id]))) ids += id + '\0' strs += MESSAGES[id] + '\0' + output = '' # The header is 7 32-bit unsigned integers. We don't use hash tables, so # the keys start right after the index tables. # translated string. @@ -71,9 +91,7 @@ def generate(MESSAGES): return output - def make(filename, outfile): - MESSAGES = {} ID = 1 STR = 2 @@ -101,7 +119,7 @@ def make(filename, outfile): lno += 1 # If we get a comment line after a msgstr, this is a new entry if l[0] == '#' and section == STR: - add(msgid, msgstr, fuzzy, MESSAGES) + add(msgid, msgstr, fuzzy) section = None fuzzy = 0 # Record a fuzzy mark @@ -111,16 +129,39 @@ def make(filename, outfile): if l[0] == '#': continue # Now we are in a msgid section, output previous section - if l.startswith('msgid'): + if l.startswith('msgid') and not l.startswith('msgid_plural'): if section == STR: - add(msgid, msgstr, fuzzy, MESSAGES) + add(msgid, msgstr, fuzzy) section = ID l = l[5:] msgid = msgstr = '' + is_plural = False + # This is a message with plural forms + elif l.startswith('msgid_plural'): + if section != ID: + print >> sys.stderr, 'msgid_plural not preceeded by msgid on %s:%d' %\ + (infile, lno) + sys.exit(1) + l = l[12:] + msgid += '\0' # separator of singular and plural + is_plural = True # Now we are in a msgstr section elif l.startswith('msgstr'): section = STR - l = l[6:] + if l.startswith('msgstr['): + if not is_plural: + print >> sys.stderr, 'plural without msgid_plural on %s:%d' %\ + (infile, lno) + sys.exit(1) + l = l.split(']', 1)[1] + if msgstr: + msgstr += '\0' # Separator of the various plural forms + else: + if is_plural: + print >> sys.stderr, 'indexed msgstr required for plural on %s:%d' %\ + (infile, lno) + sys.exit(1) + l = l[6:] # Skip empty lines l = l.strip() if not l: @@ -138,22 +179,40 @@ def make(filename, outfile): sys.exit(1) # Add last entry if section == STR: - add(msgid, msgstr, fuzzy, MESSAGES) + add(msgid, msgstr, fuzzy) # Compute output - output = generate(MESSAGES) + output = generate() + outfile.write(output) + + +def main(): try: - outfile.write(output) - except IOError,msg: - print >> sys.stderr, msg + opts, args = getopt.getopt(sys.argv[1:], 'hVo:', + ['help', 'version', 'output-file=']) + except getopt.error, msg: + usage(1, msg) + outfile = None + # parse options + for opt, arg in opts: + if opt in ('-h', '--help'): + usage(0) + elif opt in ('-V', '--version'): + print >> sys.stderr, "msgfmt.py", __version__ + sys.exit(0) + elif opt in ('-o', '--output-file'): + outfile = arg + # do it + if not args: + print >> sys.stderr, 'No input file given' + print >> sys.stderr, "Try `msgfmt --help' for more information." + return - -def main(outfile, args=sys.argv[1:]): for filename in args: make(filename, outfile) - return 0 + if __name__ == '__main__': - sys.exit(main(sys.stdout)) + main() diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py index 294d8f7640..43b4fbcc89 100644 --- a/src/calibre/utils/localization.py +++ b/src/calibre/utils/localization.py @@ -71,13 +71,13 @@ def set_translators(): lang = get_lang() if lang: buf = iso639 = None - if os.access(lang+'.po', os.R_OK): + mpath = get_lc_messages_path(lang) + if os.access(mpath+'.po', os.R_OK): from calibre.translations.msgfmt import make buf = cStringIO.StringIO() - make(lang+'.po', buf) + make(mpath+'.po', buf) buf = cStringIO.StringIO(buf.getvalue()) - mpath = get_lc_messages_path(lang) if mpath is not None: with ZipFile(P('localization/locales.zip', allow_user_override=False), 'r') as zf: From 576e8aaebfd104372b5c0fe1ef1dd60b88859137 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 08:48:31 -0600 Subject: [PATCH 04/14] Fix #813924 ('zh-TW' come up in language list of News Schedule dialog) --- recipes/united_daily.recipe | 2 +- src/calibre/utils/localization.py | 2 +- src/calibre/web/feeds/recipes/model.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/recipes/united_daily.recipe b/recipes/united_daily.recipe index 1013b3d2b6..25493a43ac 100644 --- a/recipes/united_daily.recipe +++ b/recipes/united_daily.recipe @@ -64,7 +64,7 @@ class UnitedDaily(BasicNewsRecipe): __author__ = 'Eddie Lau' __version__ = '1.1' - language = 'zh-TW' + language = 'zh_TW' publisher = 'United Daily News Group' description = 'United Daily (Taiwan)' category = 'News, Chinese, Taiwan' diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py index 43b4fbcc89..39cb228d60 100644 --- a/src/calibre/utils/localization.py +++ b/src/calibre/utils/localization.py @@ -72,7 +72,7 @@ def set_translators(): if lang: buf = iso639 = None mpath = get_lc_messages_path(lang) - if os.access(mpath+'.po', os.R_OK): + if mpath and os.access(mpath+'.po', os.R_OK): from calibre.translations.msgfmt import make buf = cStringIO.StringIO() make(mpath+'.po', buf) diff --git a/src/calibre/web/feeds/recipes/model.py b/src/calibre/web/feeds/recipes/model.py index 5f8d906e61..40d246b450 100644 --- a/src/calibre/web/feeds/recipes/model.py +++ b/src/calibre/web/feeds/recipes/model.py @@ -217,6 +217,8 @@ class RecipeModel(QAbstractItemModel, SearchQueryParser): self.all_urns.add(urn) if ok(urn): lang = x.get('language', 'und') + if lang: + lang = lang.replace('-', '_') if lang not in lang_map: lang_map[lang] = factory(NewsCategory, new_root, lang) factory(NewsItem, lang_map[lang], urn, x.get('title')) From f5e3378fa7b7263df7cb955d7f5612f0e7325954 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 09:12:39 -0600 Subject: [PATCH 05/14] Fix #814107 (New device compatibility - bq DaVinci) --- src/calibre/devices/android/driver.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index e34852130b..d26489c42f 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -47,10 +47,12 @@ class ANDROID(USBMS): # Google 0x18d1 : { + 0x0001 : [0x0223], 0x4e11 : [0x0100, 0x226, 0x227], - 0x4e12: [0x0100, 0x226, 0x227], - 0x4e21: [0x0100, 0x226, 0x227], - 0xb058: [0x0222, 0x226, 0x227]}, + 0x4e12 : [0x0100, 0x226, 0x227], + 0x4e21 : [0x0100, 0x226, 0x227], + 0xb058 : [0x0222, 0x226, 0x227] + }, # Samsung 0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400], From 1101be2b2d426460a6d45e7ff175e417ac669e6c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 12:15:07 -0600 Subject: [PATCH 06/14] MOBI debug: Dump trailing byte sequences in a format easy to decode --- src/calibre/ebooks/mobi/debug.py | 111 ++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 971f037479..cd7d949087 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -8,7 +8,7 @@ __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct, datetime, sys, os, shutil -from collections import OrderedDict +from collections import OrderedDict, defaultdict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.utils import (decode_hex_number, decint, @@ -625,6 +625,27 @@ class IndexEntry(object): # {{{ return tag.cncx_value return '' + @property + def offset(self): + for tag in self.tags: + if tag.attr == 'offset': + return tag.value + return 0 + + @property + def size(self): + for tag in self.tags: + if tag.attr == 'size': + return tag.value + return 0 + + @property + def depth(self): + for tag in self.tags: + if tag.attr == 'depth': + return tag.value + return 0 + def __str__(self): ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( self.index, self.entry_type, len(self.tags))] @@ -793,6 +814,88 @@ class BinaryRecord(object): # {{{ # }}} +class TBSIndexing(object): # {{{ + + def __init__(self, text_records, indices): + self.record_indices = OrderedDict() + pos = 0 + for r in text_records: + start = pos + pos += len(r.raw) + end = pos - 1 + self.record_indices[r] = x = {'starts':[], 'ends':[], + 'complete':[], 'geom': (start, end)} + for entry in indices: + istart, sz = entry.offset, entry.size + iend = istart + sz - 1 + has_start = istart >= start and istart <= end + has_end = iend >= start and iend <= end + rec = None + if has_start and has_end: + rec = 'complete' + elif has_start and not has_end: + rec = 'starts' + elif not has_start and has_end: + rec = 'ends' + if rec: + x[rec].append(entry) + + def __str__(self): + ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20] + for r, dat in self.record_indices.iteritems(): + ans += self.dump_record(r, dat)[-1] + return '\n'.join(ans) + + def dump(self, bdir): + types = defaultdict(list) + for r, dat in self.record_indices.iteritems(): + tbs_type, strings = self.dump_record(r, dat) + if tbs_type == 0: continue + types[tbs_type] += strings + for typ, strings in types.iteritems(): + with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f: + f.write('\n'.join(strings)) + + def dump_record(self, r, dat): + ans = [] + ans.append('\nRecord #%d: Starts at: %d Ends at: %d'%(r.idx, + dat['geom'][0], dat['geom'][1])) + s, e, c = dat['starts'], dat['ends'], dat['complete'] + ans.append(('\tContains: %d index entries ' + '(%d ends, %d complete, %d starts)')%tuple(map(len, (s+e+c, e, + c, s)))) + byts = bytearray(r.trailing_data.get('indexing', b'')) + sbyts = tuple(hex(b)[2:] for b in byts) + ans.append('TBS bytes: %s'%(' '.join(sbyts))) + for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)): + if entries: + ans.append('\t%s:'%typ) + for x in entries: + ans.append('\t\tIndex Entry: %d (Depth: %d Offset: %d, Size: %d) [%s]'%( + x.index, x.depth, x.offset, x.size, x.label)) + def bin3(num): + ans = bin(num)[2:] + return '0'*(3-len(ans)) + ans + + tbs_type = 0 + if len(byts): + outer, consumed = decint(bytes(byts)) + byts = byts[consumed:] + tbs_type = outer & 0b111 + ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type)) + ans.append('Outer Index entry: %d'%(outer >> 3)) + arg1, consumed = decint(bytes(byts)) + byts = byts[consumed:] + ans.append('Unknown: %d'%arg1) + if byts: + sbyts = tuple(hex(b)[2:] for b in byts) + ans.append('Remaining bytes: %s'%' '.join(sbyts)) + + ans.append('') + return tbs_type, ans + +# }}} + class MOBIFile(object): # {{{ def __init__(self, stream): @@ -874,6 +977,9 @@ class MOBIFile(object): # {{{ else: self.binary_records.append(BinaryRecord(i, r)) + if self.index_record is not None: + self.tbs_indexing = TBSIndexing(self.text_records, + self.index_record.indices) def print_header(self, f=sys.stdout): print (str(self.palmdb).encode('utf-8'), file=f) @@ -905,6 +1011,9 @@ def inspect_mobi(path_or_stream, prefix='decompiled'): print(str(f.cncx).encode('utf-8'), file=out) print('\n\n', file=out) print(str(f.index_record), file=out) + with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out: + print(str(f.tbs_indexing), file=out) + f.tbs_indexing.dump(ddir) for tdir, attr in [('text', 'text_records'), ('images', 'image_records'), ('binary', 'binary_records')]: From 80b5a80145588a205fb475b8b93a873281b88476 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 12:42:14 -0600 Subject: [PATCH 07/14] ... --- src/calibre/ebooks/mobi/debug.py | 35 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index cd7d949087..30e8a9cb44 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -530,21 +530,21 @@ class Tag(object): # {{{ }, 'chapter_with_subchapters' : { - 22 : ('First subchapter index', 'first_subchapter_index'), - 23 : ('Last subchapter index', 'last_subchapter_index'), + 22 : ('First subchapter index', 'first_child_index'), + 23 : ('Last subchapter index', 'last_child_index'), }, 'periodical' : { 5 : ('Class offset in cncx', 'class_offset'), - 22 : ('First section index', 'first_section_index'), - 23 : ('Last section index', 'last_section_index'), + 22 : ('First section index', 'first_child_index'), + 23 : ('Last section index', 'last_child_index'), }, 'section' : { 5 : ('Class offset in cncx', 'class_offset'), - 21 : ('Periodical index', 'periodical_index'), - 22 : ('First article index', 'first_article_index'), - 23 : ('Last article index', 'last_article_index'), + 21 : ('Periodical index', 'parent_index'), + 22 : ('First article index', 'first_child_index'), + 23 : ('Last article index', 'last_child_index'), }, } @@ -646,6 +646,13 @@ class IndexEntry(object): # {{{ return tag.value return 0 + @property + def parent_index(self): + for tag in self.tags: + if tag.attr == 'parent_index': + return tag.value + return -1 + def __str__(self): ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( self.index, self.entry_type, len(self.tags))] @@ -700,6 +707,15 @@ class IndexRecord(object): # {{{ entry_type = ord(indxt[off+consumed]) self.indices.append(IndexEntry(index, entry_type, indxt[off+consumed+1:next_off], cncx, index_header.tagx_entries)) + index = self.indices[-1] + + def get_parent(self, index): + if index.depth < 1: + return None + parent_depth = index.depth - 1 + for p in self.indices: + if p.depth != parent_depth: + continue def __str__(self): @@ -871,8 +887,9 @@ class TBSIndexing(object): # {{{ if entries: ans.append('\t%s:'%typ) for x in entries: - ans.append('\t\tIndex Entry: %d (Depth: %d Offset: %d, Size: %d) [%s]'%( - x.index, x.depth, x.offset, x.size, x.label)) + ans.append(('\t\tIndex Entry: %d (Parent index: %d, ' + 'Depth: %d, Offset: %d, Size: %d) [%s]')%( + x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) def bin3(num): ans = bin(num)[2:] return '0'*(3-len(ans)) + ans From 5618246e6bd84c0721ae34ae24fd4c212a3a41ca Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 12:58:35 -0600 Subject: [PATCH 08/14] Fix #814232 (Medion ereader not detected) --- src/calibre/devices/eb600/driver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py index f09a33d67b..a9baa0a898 100644 --- a/src/calibre/devices/eb600/driver.py +++ b/src/calibre/devices/eb600/driver.py @@ -35,9 +35,9 @@ class EB600(USBMS): PRODUCT_ID = [0x1688] BCD = [0x110] - VENDOR_NAME = ['NETRONIX', 'WOLDER'] - WINDOWS_MAIN_MEM = ['EBOOK', 'MIBUK_GAMMA_6.2'] - WINDOWS_CARD_A_MEM = 'EBOOK' + VENDOR_NAME = ['NETRONIX', 'WOLDER', 'MD86371'] + WINDOWS_MAIN_MEM = ['EBOOK', 'MIBUK_GAMMA_6.2', 'MD86371'] + WINDOWS_CARD_A_MEM = ['EBOOK', 'MD86371'] OSX_MAIN_MEM = 'EB600 Internal Storage Media' OSX_CARD_A_MEM = 'EB600 Card Storage Media' From 7213f7b5a0b1726245ae4900698cd1178609b1bb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 17:41:59 -0600 Subject: [PATCH 09/14] IDG.se by zapt0 --- recipes/idg_se.recipe | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 recipes/idg_se.recipe diff --git a/recipes/idg_se.recipe b/recipes/idg_se.recipe new file mode 100644 index 0000000000..b4e86f9643 --- /dev/null +++ b/recipes/idg_se.recipe @@ -0,0 +1,33 @@ +__license__ = 'GPLv3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class IDGse(BasicNewsRecipe): + title = 'IDG' + description = 'IDG.se' + language = 'se' + __author__ = 'zapt0' + oldest_article = 1 + max_articles_per_feed = 40 + no_stylesheets = True + encoding = 'ISO-8859-1' + remove_javascript = True + + feeds = [(u'Senaste nytt',u'http://feeds.idg.se/idg/vzzs')] + + def print_version(self,url): + return url + '?articleRenderMode=print&m=print' + + def get_cover_url(this): + return 'http://idgmedia.idg.se/polopoly_fs/2.3275!images/idgmedia_logo_75.jpg' + + keep_only_tags = [ + dict(name='h1'), + dict(name='div', attrs={'class':['divColumn1Article']}), + ] + #remove ads + remove_tags = [ + dict(name='div', attrs={'id':['preamble_ad']}), + dict(name='ul', attrs={'class':['share']}) + ] + From 35ce4aecae6d01e82ce5d45c60b26dfff00b54a0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 21:02:11 -0600 Subject: [PATCH 10/14] MOBI debug: Lots of progress on decoding the TBS sequences for hierarchical periodicals --- src/calibre/ebooks/mobi/debug.py | 190 ++++++++++++++++++++++++++++++- src/calibre/ebooks/mobi/utils.py | 8 +- 2 files changed, 191 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 30e8a9cb44..bfd6a20c07 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -653,11 +653,28 @@ class IndexEntry(object): # {{{ return tag.value return -1 + @property + def first_child_index(self): + for tag in self.tags: + if tag.attr == 'first_child_index': + return tag.value + return -1 + + @property + def last_child_index(self): + for tag in self.tags: + if tag.attr == 'last_child_index': + return tag.value + return -1 + def __str__(self): ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( self.index, self.entry_type, len(self.tags))] for tag in self.tags: ans.append('\t'+str(tag)) + if self.first_child_index != -1: + ans.append('\tNumber of children: %d'%(self.last_child_index - + self.first_child_index + 1)) return '\n'.join(ans) # }}} @@ -832,8 +849,10 @@ class BinaryRecord(object): # {{{ class TBSIndexing(object): # {{{ - def __init__(self, text_records, indices): + def __init__(self, text_records, indices, doc_type): self.record_indices = OrderedDict() + self.doc_type = doc_type + self.indices = indices pos = 0 for r in text_records: start = pos @@ -856,6 +875,11 @@ class TBSIndexing(object): # {{{ if rec: x[rec].append(entry) + def get_index(self, idx): + for i in self.indices: + if i.index == idx: return i + raise IndexError('Index %d not found'%idx) + def __str__(self): ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20] for r, dat in self.record_indices.iteritems(): @@ -896,14 +920,17 @@ class TBSIndexing(object): # {{{ tbs_type = 0 if len(byts): - outer, consumed = decint(bytes(byts)) + outer, consumed = decint(byts) byts = byts[consumed:] tbs_type = outer & 0b111 ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type)) ans.append('Outer Index entry: %d'%(outer >> 3)) - arg1, consumed = decint(bytes(byts)) + arg1, consumed = decint(byts) byts = byts[consumed:] ans.append('Unknown: %d'%arg1) + if self.doc_type in (257, 259): # Hierarchical periodical + byts, a = self.interpret_periodical(tbs_type, byts) + ans += a if byts: sbyts = tuple(hex(b)[2:] for b in byts) ans.append('Remaining bytes: %s'%' '.join(sbyts)) @@ -911,6 +938,161 @@ class TBSIndexing(object): # {{{ ans.append('') return tbs_type, ans + def interpret_periodical(self, tbs_type, byts): + ans = [] + if tbs_type == 3: # {{{ + if byts: + arg2, consumed = decint(byts) + byts = byts[consumed:] + ans.append('Unknown: %d'%arg2) + if byts: + arg3, consumed = decint(byts) + byts = byts[consumed:] + fsi = arg3 >> 4 + extra = arg3 & 0b1111 + ans.append('First section index: %d'%fsi) + psi = self.get_index(fsi) + ans.append('Extra bits: %d'%extra) + if byts: + if byts[0] == fsi: + ssi = psi.index+1 + ans.append('First section ends') + byts = byts[1:] + arg, consumed = decint(byts) + raw = byts[:consumed] + byts = byts[consumed:] + flags = arg & 0b1111 + ans.append('Unknown (art index at start of record?):' + ' %d %r'%((arg>>4), raw)) + ans.append('Flags: %d'%flags) + num = 1 + if flags >= 4: + num = byts[0] + byts = byts[1:] + ans.append('Number of articles in closing section: %d'%num) + if flags == 5: + arg, consumed = decint(byts) + ans.append('Unknown: %r'%bytes(byts[:consumed])) + byts = byts[consumed:] + arg, consumed = decint(byts) + byts = byts[consumed:] + off = arg >> 4 + ans.append('Last article of ending section w.r.t. starting' + ' section offset: %d [%d absolute]'%(off, + ssi+off)) + ans.append('Extra bits: %d'%(arg & 0b1111)) + arg, consumed = decint(byts) + byts = byts[consumed:] + off = arg >> 4 + flag = arg & 0b1111 + ans.append('Offset to first article of starting section: %d' + ' [%d absolute]'%(off, ssi+off)) + ans.append('Flags: %d'%flag) + num = 1 + if flag == 4: + num = byts[0] + byts = byts[1:] + ans.append('Number of articles in starting section: %d'%num) + else: + ans.append('First section starts') + off, consumed = decint(byts) + flags = off & 0b1111 + off = off >> 4 + byts = byts[consumed:] + ans.append('Article at start of block as offset from ' + 'parent index: %d [%d absolute]'%(off, psi.index+off)) + ans.append('Flags: %d'%flags) + if flags == 4: + ans.append('Number of articles: %d'%byts[0]) + byts = byts[1:] + # }}} + + elif tbs_type == 7: # {{{ + # This occurs for records that have no section nodes and + # whose parent section's index == 1 + ans.append('Unknown: %r'%bytes(byts[:2])) + byts = byts[2:] + arg, consumed = decint(byts) + byts = byts[consumed:] + ai = arg >> 4 + flags = arg & 0b1111 + num = 1 + if flags == 4: + if not byts: + raise ValueError('Type 7 TBS entry missing article count') + num = byts[0] + byts = byts[1:] + ans.append('Article at start of record: %d'%ai) + ans.append('Number of articles in record: %d'%num) + # }}} + + elif tbs_type == 2: # {{{ + # This occurs for records with no section nodes and whose parent + # section's index != 1 (undefined (records before the first + # section) or > 1) + # This is also used for records that are spanned by an article + # whose parent section index > 1. In this case the flags of the + # vwi referring to the article at the start + # of the record are set to 1 instead of 4. + if byts: + arg, consumed = decint(byts) + byts = byts[consumed:] + flags = (arg & 0b1111) + psi = (arg >> 4) + ans.append('Parent section index: %d'%psi) + psi = self.get_index(psi) + ans.append('Flags: %d'%flags) + if flags == 1: + arg, consumed = decint(byts) + byts = byts[consumed:] + ans.append('Unknown: %d'%arg) + elif flags == 0: + arg, consumed = decint(byts) + byts = byts[consumed:] + flags = arg & 0b1111 + off = arg >> 4 + ans.append('Article at start of block as offset from ' + 'parent index: %d [%d absolute]'%(off, psi.index+off)) + if flags == 4: + num = byts[0] + byts = byts[1:] + ans.append('Number of nodes: %d'%num) + elif flags == 1: + num = byts[0] + byts = byts[1:] + ans.append('EOF: %s'%hex(num)) + else: + raise ValueError('Unknown flag value: %d'%flags) + # }}} + + elif tbs_type == 6: # {{{ + # This is used for records spanned by an article whose parent + # section's index == 1 or for the opening record if it contains the + # periodical start, section 1 start and atleast one article. The + # two cases are distinguidshed by the flags on the article index + # vwi. + unk = byts[0] + byts = byts[1:] + ans.append('Unknown (always 2?): %d'%unk) + arg, consumed = decint(byts) + byts = byts[consumed:] + flags = (arg & 0b1111) + ai = (arg >> 4) + ans.append(('Article index at start of record or first article' + ' index, relative to section 1: %d [%d absolute]'%(ai, ai+1))) + if flags == 1: + arg, consumed = decint(byts) + byts = byts[consumed:] + ans.append('EOF (should be 0): %d'%arg) + elif flags == 4: + num = byts[0] + byts = byts[1:] + ans.append('Number of article nodes in the record: %d'%num) + + # }}} + + return byts, ans + # }}} class MOBIFile(object): # {{{ @@ -996,7 +1178,7 @@ class MOBIFile(object): # {{{ if self.index_record is not None: self.tbs_indexing = TBSIndexing(self.text_records, - self.index_record.indices) + self.index_record.indices, self.mobi_header.type_raw) def print_header(self, f=sys.stdout): print (str(self.palmdb).encode('utf-8'), file=f) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index cd0ee453c3..5192eee43c 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -79,7 +79,7 @@ def encint(value, forward=True): def decint(raw, forward=True): ''' - Read a variable width integer from the bytestring raw and return the + Read a variable width integer from the bytestring or bytearray raw and return the integer and the number of bytes read. If forward is True bytes are read from the start of raw, otherwise from the end of raw. @@ -88,8 +88,10 @@ def decint(raw, forward=True): ''' val = 0 byts = bytearray() - for byte in raw if forward else reversed(raw): - bnum = ord(byte) + src = bytearray(raw) + if not forward: + src.reverse() + for bnum in src: byts.append(bnum & 0b01111111) if bnum & 0b10000000: break From 8ba6341324ad9c95d6cf94a6e3c33e7b47449886 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 22:44:15 -0600 Subject: [PATCH 11/14] MOBI debug: Document all I've learned about TBS so far --- src/calibre/ebooks/mobi/debug.py | 120 +++++++------- src/calibre/ebooks/mobi/tbs_periodicals.rst | 168 ++++++++++++++++++++ 2 files changed, 232 insertions(+), 56 deletions(-) create mode 100644 src/calibre/ebooks/mobi/tbs_periodicals.rst diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index bfd6a20c07..79f2c3483b 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -927,7 +927,7 @@ class TBSIndexing(object): # {{{ ans.append('Outer Index entry: %d'%(outer >> 3)) arg1, consumed = decint(byts) byts = byts[consumed:] - ans.append('Unknown: %d'%arg1) + ans.append('Unknown (vwi: always 0?): %d'%arg1) if self.doc_type in (257, 259): # Hierarchical periodical byts, a = self.interpret_periodical(tbs_type, byts) ans += a @@ -940,6 +940,36 @@ class TBSIndexing(object): # {{{ def interpret_periodical(self, tbs_type, byts): ans = [] + + def tbs_type_6(byts, psi=None): # {{{ + if psi is None: + # Assume parent section is 1 + psi = self.get_index(1) + if byts: + # byts could be empty + arg, consumed = decint(byts) + byts = byts[consumed:] + flags = (arg & 0b1111) + ai = (arg >> 4) + ans.append(('Article index at start of record or first article' + ' index, relative to parent section (fvwi): %d [%d absolute]'%(ai, + ai+psi.index))) + if flags == 1: + arg, consumed = decint(byts) + byts = byts[consumed:] + ans.append('EOF (vwi: should be 0): %d'%arg) + elif flags == 4: + num = byts[0] + byts = byts[1:] + ans.append('Number of article nodes in the record (byte): %d'%num) + elif flags == 0: + pass + else: + raise ValueError('Unknown flags: %d'%flags) + return byts + + # }}} + if tbs_type == 3: # {{{ if byts: arg2, consumed = decint(byts) @@ -1010,20 +1040,37 @@ class TBSIndexing(object): # {{{ elif tbs_type == 7: # {{{ # This occurs for records that have no section nodes and # whose parent section's index == 1 - ans.append('Unknown: %r'%bytes(byts[:2])) + ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2])) byts = byts[2:] arg, consumed = decint(byts) byts = byts[consumed:] ai = arg >> 4 flags = arg & 0b1111 - num = 1 + ans.append('Article at start of record (fvwi): %d'%ai) if flags == 4: - if not byts: - raise ValueError('Type 7 TBS entry missing article count') num = byts[0] byts = byts[1:] - ans.append('Article at start of record: %d'%ai) - ans.append('Number of articles in record: %d'%num) + ans.append('Number of articles in record (byte): %d'%num) + elif flags == 0: + pass + elif flags == 1: + arg, consumed = decint(byts) + byts = byts[consumed:] + ans.append('EOF (vwi: should be 0): %d'%arg) + else: + raise ValueError('Unknown flags value: %d'%flags) + # }}} + + elif tbs_type == 6: # {{{ + # This is used for records spanned by an article whose parent + # section's index == 1 or for the opening record if it contains the + # periodical start, section 1 start and at least one article. The + # two cases are distinguished by the flags on the article index + # vwi. + unk = byts[0] + byts = byts[1:] + ans.append('Unknown (byte: always 2?): %d'%unk) + byts = tbs_type_6(byts) # }}} elif tbs_type == 2: # {{{ @@ -1034,61 +1081,22 @@ class TBSIndexing(object): # {{{ # whose parent section index > 1. In this case the flags of the # vwi referring to the article at the start # of the record are set to 1 instead of 4. - if byts: - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = (arg & 0b1111) - psi = (arg >> 4) - ans.append('Parent section index: %d'%psi) - psi = self.get_index(psi) - ans.append('Flags: %d'%flags) - if flags == 1: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown: %d'%arg) - elif flags == 0: - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = arg & 0b1111 - off = arg >> 4 - ans.append('Article at start of block as offset from ' - 'parent index: %d [%d absolute]'%(off, psi.index+off)) - if flags == 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of nodes: %d'%num) - elif flags == 1: - num = byts[0] - byts = byts[1:] - ans.append('EOF: %s'%hex(num)) - else: - raise ValueError('Unknown flag value: %d'%flags) - # }}} - - elif tbs_type == 6: # {{{ - # This is used for records spanned by an article whose parent - # section's index == 1 or for the opening record if it contains the - # periodical start, section 1 start and atleast one article. The - # two cases are distinguidshed by the flags on the article index - # vwi. - unk = byts[0] - byts = byts[1:] - ans.append('Unknown (always 2?): %d'%unk) arg, consumed = decint(byts) byts = byts[consumed:] flags = (arg & 0b1111) - ai = (arg >> 4) - ans.append(('Article index at start of record or first article' - ' index, relative to section 1: %d [%d absolute]'%(ai, ai+1))) + psi = (arg >> 4) + ans.append('Parent section index (fvwi): %d'%psi) + psi = self.get_index(psi) + ans.append('Flags: %d'%flags) if flags == 1: arg, consumed = decint(byts) byts = byts[consumed:] - ans.append('EOF (should be 0): %d'%arg) - elif flags == 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of article nodes in the record: %d'%num) - + ans.append('Unknown (vwi?: always 0?): %d'%arg) + byts = tbs_type_6(byts, psi=psi) + elif flags == 0: + byts = tbs_type_6(byts, psi=psi) + else: + raise ValueError('Unkown flags: %d'%flags) # }}} return byts, ans diff --git a/src/calibre/ebooks/mobi/tbs_periodicals.rst b/src/calibre/ebooks/mobi/tbs_periodicals.rst new file mode 100644 index 0000000000..4dbae3f295 --- /dev/null +++ b/src/calibre/ebooks/mobi/tbs_periodicals.rst @@ -0,0 +1,168 @@ +Reverse engineering the trailing byte sequences for hierarchical periodicals +=============================================================================== + +In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. + +Opening record +---------------- + +The text record that contains the opening node for the periodical (depth=0 node in the NCX) can have TBS of 3 different forms: + + 1. If it has only the periodical node and no section/article nodes, TBS of type 2, like this:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 1 index entries (0 ends, 0 complete, 1 starts) + TBS bytes: 82 80 + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader] + TBS Type: 010 (2) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + + 2. A periodical and a section node, but no article nodes, TBS type of 6, like this:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 2 index entries (0 ends, 0 complete, 2 starts) + TBS bytes: 86 80 2 + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 93254) [j_x's Google reader] + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 49280) [Ars Technica] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + + 3. If it has both the section 1 node and at least one article node, TBS of type 6, like this:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 4 index entries (0 ends, 1 complete, 3 starts) + TBS bytes: 86 80 2 c4 2 + Complete: + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 549, Size: 1866) [Week in gaming: 3DS review, Crysis 2, George Hotz] + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 79253) [j_x's Google reader] + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 35279) [Ars Technica] + Index Entry: 6 (Parent index: 1, Depth: 2, Offset: 2415, Size: 2764) [Week in Apple: ZFS on Mac OS X, rogue tethering, DUI apps, and more] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + Number of article nodes in the record (byte): 2 + + If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record. + + +Records with no nodes +------------------------ + +These records are spanned by a single article. They are of two types: + + 1. If the parent section index is 1, TBS type of 6, like this:: + + Record #4: Starts at: 12288 Ends at: 16383 + Contains: 0 index entries (0 ends, 0 complete, 0 starts) + TBS bytes: 86 80 2 c1 80 + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + EOF (vwi: should be 0): 0 + + If the record is before the first article, the TBS bytes would be: 86 80 2 + + 2. If the parent section index is > 1, TBS type of 2, like this:: + + Record #14: Starts at: 53248 Ends at: 57343 + Contains: 0 index entries (0 ends, 0 complete, 0 starts) + TBS bytes: 82 80 a0 1 e1 80 + TBS Type: 010 (2) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Parent section index (fvwi): 2 + Flags: 0 + Article index at start of record or first article index, relative to parent section (fvwi): 14 [16 absolute] + EOF (vwi: should be 0): 0 + +Records with only article nodes +----------------------------------- + +Such records have no section transitions (i.e. a section end/section start pair). They have only one or more article nodes. They are of two types: + + 1. If the parent section index is 1, TBS type of 7, like this:: + + Record #6: Starts at: 20480 Ends at: 24575 + Contains: 2 index entries (1 ends, 0 complete, 1 starts) + TBS bytes: 87 80 2 80 1 84 2 + Ends: + Index Entry: 9 (Parent index: 1, Depth: 2, Offset: 16453, Size: 4199) [Vaccine's success spurs whooping cough comeback] + Starts: + Index Entry: 10 (Parent index: 1, Depth: 2, Offset: 20652, Size: 4246) [Apple's mobile products do not violate Nokia patents, says ITC] + TBS Type: 111 (7) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown: '\x02\x80' (vwi?: Always 256) + Article at start of record (fvwi): 8 + Number of articles in record (byte): 2 + + If there was only one article in the record, the last two bytes would be replaced by a single byte: 80 + + If this record is the first record with an article, then the article at the start of the record should be the last section index. At least, that's what kindlegen does, though if you ask me, it should be the first section index. + + + 2. If the parent section index is > 1, TBS type of 2, like this:: + + Record #16: Starts at: 61440 Ends at: 65535 + Contains: 5 index entries (1 ends, 3 complete, 1 starts) + TBS bytes: 82 80 a1 80 1 f4 5 + Ends: + Index Entry: 17 (Parent index: 2, Depth: 2, Offset: 60920, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware] + Complete: + Index Entry: 18 (Parent index: 2, Depth: 2, Offset: 62002, Size: 1016) [Rumour: OS X Lion nearing Golden Master stage] + Index Entry: 19 (Parent index: 2, Depth: 2, Offset: 63018, Size: 1045) [iOS 4.3.1 released] + Index Entry: 20 (Parent index: 2, Depth: 2, Offset: 64063, Size: 972) [Windows 8 'system reset' image leaks] + Starts: + Index Entry: 21 (Parent index: 2, Depth: 2, Offset: 65035, Size: 1057) [Windows Phone 7: Why it's failing] + TBS Type: 010 (2) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Parent section index (fvwi) : 2 + Flags: 1 + Unknown (vwi: always 0?): 0 + Article index at start of record or first article index, relative to parent section (fvwi): 15 [17 absolute] + Number of article nodes in the record (byte): 5 + + If there was only one article in the record, the last two bytes would be replaced by a single byte: f0 + +Records with a section transition +----------------------------------- + +In such a record there is a transition from one section to the next. As such the record must have at least one article ending and one article starting. + +TODO: Note you have to test the case of a single transition and multiple transitions + +Ending record +---------------- + +Logically, ending records must have at least one article ending, one section ending and the periodical ending. They are of TBS type 2, like this:: + + Record #17: Starts at: 65536 Ends at: 68684 + Contains: 4 index entries (3 ends, 1 complete, 0 starts) + TBS bytes: 82 80 c0 4 f4 2 + Ends: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader] + Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 51234, Size: 17451) [Slashdot] + Index Entry: 43 (Parent index: 4, Depth: 2, Offset: 65422, Size: 1717) [US ITC May Reverse Judge's Ruling In Kodak vs. Apple] + Complete: + Index Entry: 44 (Parent index: 4, Depth: 2, Offset: 67139, Size: 1546) [Google Starts Testing Google Music Internally] + TBS Type: 010 (2) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Parent section index (fvwi): 4 + Flags: 0 + Article at start of block as offset from parent index (fvwi): 39 [43 absolute] + Number of nodes (byte): 2 + +If the record had only a single article end, the last two bytes would be replaced with: f0 + From c8545cb7d71f07f5b7011cd85975328ea30ed03d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 22:48:30 -0600 Subject: [PATCH 12/14] ... --- src/calibre/ebooks/mobi/tbs_periodicals.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/tbs_periodicals.rst b/src/calibre/ebooks/mobi/tbs_periodicals.rst index 4dbae3f295..13f12bb98b 100644 --- a/src/calibre/ebooks/mobi/tbs_periodicals.rst +++ b/src/calibre/ebooks/mobi/tbs_periodicals.rst @@ -138,9 +138,9 @@ Such records have no section transitions (i.e. a section end/section start pair) Records with a section transition ----------------------------------- -In such a record there is a transition from one section to the next. As such the record must have at least one article ending and one article starting. +In such a record there is a transition from one section to the next. As such the record must have at least one article ending and one article starting, except in the case of the first section. -TODO: Note you have to test the case of a single transition and multiple transitions +TODO: Note you have to test the cases of first section, a single transition and multiple transitions. Ending record ---------------- From 769c1dc67856179702e16810874540c26e3b9096 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 22 Jul 2011 00:07:38 -0600 Subject: [PATCH 13/14] MOBI debug: Last remaininng TBS type almost completely deciphered --- src/calibre/ebooks/mobi/debug.py | 103 +++++++------------- src/calibre/ebooks/mobi/tbs_periodicals.rst | 21 ++++ 2 files changed, 57 insertions(+), 67 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 79f2c3483b..a4d53d2395 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -941,27 +941,33 @@ class TBSIndexing(object): # {{{ def interpret_periodical(self, tbs_type, byts): ans = [] - def tbs_type_6(byts, psi=None): # {{{ + def tbs_type_6(byts, psi=None, msg=None): # {{{ if psi is None: # Assume parent section is 1 psi = self.get_index(1) + if msg is None: + msg = ('Article index at start of record or first article' + ' index, relative to parent section') if byts: # byts could be empty arg, consumed = decint(byts) byts = byts[consumed:] flags = (arg & 0b1111) ai = (arg >> 4) - ans.append(('Article index at start of record or first article' - ' index, relative to parent section (fvwi): %d [%d absolute]'%(ai, - ai+psi.index))) + ans.append('%s (fvwi): %d [%d absolute]'%(msg, ai, + ai+psi.index)) if flags == 1: arg, consumed = decint(byts) byts = byts[consumed:] ans.append('EOF (vwi: should be 0): %d'%arg) - elif flags == 4: + elif flags in (4, 5): num = byts[0] byts = byts[1:] ans.append('Number of article nodes in the record (byte): %d'%num) + if flags == 5: + arg, consumed = decint(byts) + byts = byts[consumed:] + ans.append('Unknown ??? (vwi)): %d'%(arg)) elif flags == 0: pass else: @@ -971,70 +977,33 @@ class TBSIndexing(object): # {{{ # }}} if tbs_type == 3: # {{{ + arg2, consumed = decint(byts) + byts = byts[consumed:] + ans.append('Unknown (vwi: always 0?): %d'%arg2) + + arg3, consumed = decint(byts) + byts = byts[consumed:] + fsi = arg3 >> 4 + extra = arg3 & 0b1111 + ans.append('First section index (fvwi): %d'%fsi) + psi = self.get_index(fsi) + ans.append('Extra bits (flag: always 0?): %d'%extra) + + byts = tbs_type_6(byts, psi=psi, + msg=('First article of ending section, relative to its' + ' parent\'s index')) if byts: - arg2, consumed = decint(byts) + # We have a transition not just an opening first section + psi = self.get_index(psi.index+1) + arg, consumed = decint(byts) + off = arg >> 4 byts = byts[consumed:] - ans.append('Unknown: %d'%arg2) - if byts: - arg3, consumed = decint(byts) - byts = byts[consumed:] - fsi = arg3 >> 4 - extra = arg3 & 0b1111 - ans.append('First section index: %d'%fsi) - psi = self.get_index(fsi) - ans.append('Extra bits: %d'%extra) - if byts: - if byts[0] == fsi: - ssi = psi.index+1 - ans.append('First section ends') - byts = byts[1:] - arg, consumed = decint(byts) - raw = byts[:consumed] - byts = byts[consumed:] - flags = arg & 0b1111 - ans.append('Unknown (art index at start of record?):' - ' %d %r'%((arg>>4), raw)) - ans.append('Flags: %d'%flags) - num = 1 - if flags >= 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of articles in closing section: %d'%num) - if flags == 5: - arg, consumed = decint(byts) - ans.append('Unknown: %r'%bytes(byts[:consumed])) - byts = byts[consumed:] - arg, consumed = decint(byts) - byts = byts[consumed:] - off = arg >> 4 - ans.append('Last article of ending section w.r.t. starting' - ' section offset: %d [%d absolute]'%(off, - ssi+off)) - ans.append('Extra bits: %d'%(arg & 0b1111)) - arg, consumed = decint(byts) - byts = byts[consumed:] - off = arg >> 4 - flag = arg & 0b1111 - ans.append('Offset to first article of starting section: %d' - ' [%d absolute]'%(off, ssi+off)) - ans.append('Flags: %d'%flag) - num = 1 - if flag == 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of articles in starting section: %d'%num) - else: - ans.append('First section starts') - off, consumed = decint(byts) - flags = off & 0b1111 - off = off >> 4 - byts = byts[consumed:] - ans.append('Article at start of block as offset from ' - 'parent index: %d [%d absolute]'%(off, psi.index+off)) - ans.append('Flags: %d'%flags) - if flags == 4: - ans.append('Number of articles: %d'%byts[0]) - byts = byts[1:] + flags = arg & 0b1111 + ans.append('Last article of ending section w.r.t. starting' + ' section offset (fvwi): %d [%d absolute]'%(off, + psi.index+off)) + ans.append('Flags (always 8?): %d'%flags) + byts = tbs_type_6(byts, psi=psi) # }}} elif tbs_type == 7: # {{{ diff --git a/src/calibre/ebooks/mobi/tbs_periodicals.rst b/src/calibre/ebooks/mobi/tbs_periodicals.rst index 13f12bb98b..9b21a8f3f8 100644 --- a/src/calibre/ebooks/mobi/tbs_periodicals.rst +++ b/src/calibre/ebooks/mobi/tbs_periodicals.rst @@ -142,6 +142,27 @@ In such a record there is a transition from one section to the next. As such the TODO: Note you have to test the cases of first section, a single transition and multiple transitions. + 1. The first section:: + + Record #2: Starts at: 4096 Ends at: 8191 + Contains: 2 index entries (0 ends, 0 complete, 2 starts) + TBS bytes: 83 80 80 90 c0 + Starts: + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica] + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 7766, Size: 1866) [Week in gaming: 3DS review, Crysis 2, George Hotz] + TBS Type: 011 (3) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (vwi: always 0?): 0 + First section index (fvwi) : 1 + Extra bits: 0 + First section starts + Article at start of block as offset from parent index (fvwi): 4 [5 absolute] + Flags: 0 + + If there was more than one article at the start then the last byte would be replaced by: c4 n where n is the number of articles + + Ending record ---------------- From 6f8a5ecd0eb9ffcd1b1bed259342f66972125fe5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 22 Jul 2011 08:30:15 -0600 Subject: [PATCH 14/14] Utrinski Vesnik by Darko Spasovski --- recipes/utrinski.recipe | 71 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 recipes/utrinski.recipe diff --git a/recipes/utrinski.recipe b/recipes/utrinski.recipe new file mode 100644 index 0000000000..5256695079 --- /dev/null +++ b/recipes/utrinski.recipe @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2011, Darko Spasovski ' +''' +utrinski.com.mk +''' + +import re +import datetime +from calibre.web.feeds.news import BasicNewsRecipe + +class UtrinskiVesnik(BasicNewsRecipe): + + __author__ = 'Darko Spasovski' + INDEX = 'http://www.utrinski.com.mk/' + title = 'Utrinski Vesnik' + description = 'Daily Macedonian newspaper' + masthead_url = 'http://www.utrinski.com.mk/images/LogoTop.jpg' + language = 'mk' + remove_javascript = True + publication_type = 'newspaper' + category = 'news, Macedonia' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + ## Remove anything before the start of the article. + (r'', lambda match: ''), + + ## Remove anything after the end of the article. + (r'