Sync to trunk.

This commit is contained in:
John Schember 2011-07-24 21:57:20 -04:00
commit 1e1562495d
15 changed files with 559 additions and 74 deletions

View File

@ -1,39 +1,34 @@
# -*- coding: utf-8 -*-
__license__ = 'GPLv3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1255797795(BasicNewsRecipe):
title = u'Corren'
language = 'sv'
__author__ = 'Jonas Svensson'
simultaneous_downloads = 1
no_stylesheets = True
oldest_article = 7
class AdvancedUserRecipe1311446032(BasicNewsRecipe):
    # Calibre news-download recipe for Corren (corren.se), a regional
    # Swedish newspaper covering Östergötland.
    #
    # NOTE(review): the previous revision of this recipe carried a much
    # larger feed list plus a print_version() URL rewriter; this revision
    # replaces both with a shorter feed list and tag-based cleanup
    # (keep_only_tags / remove_tags) instead of print-page scraping.
    title = 'Corren'
    __author__ = 'Jonas Svensson'
    description = 'News from Sweden'
    publisher = 'Corren'
    category = 'news, politics, Sweden'
    oldest_article = 2              # skip articles older than 2 days
    delay = 1                       # seconds between page fetches
    max_articles_per_feed = 100
    remove_attributes = ['onload']  # drop inline JS event handlers
    timefmt = ''                    # no timestamp appended to titles
    no_stylesheets = True           # strip site CSS for clean e-book output
    use_embedded_content = False    # feeds only link; fetch full pages
    encoding = 'iso-8859-1'
    language = 'sv'

    feeds = [
        (u'Toppnyheter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/')
        ,(u'Ekonomi', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/')
        ,(u'Link\xf6ping', u'http://www.corren.se/inc/RssHandler.ashx?id=4122234')
        ,(u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223,4122224,4122226,4122227,4122228,4122229,4122230')
    ]

    # Keep only the article container and its body text.
    keep_only_tags = [dict(name='div', attrs={'id':'article'}),dict(name='div', attrs={'class':'body'})]

    # Strip sharing toolbars, javascript links and sidebar boxes.
    remove_tags = [
        dict(name='ul',attrs={'class':'functions'})
        ,dict(name='a',attrs={'href':'javascript*'})
        ,dict(name='div',attrs={'class':'box'})
        ,dict(name='div',attrs={'class':'functionsbottom'})
    ]

View File

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
__license__ = 'GPLv3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1311450855(BasicNewsRecipe):
    # Calibre news-download recipe for Dagens Industri (di.se),
    # a Swedish business/economy newspaper.
    title = u'Dagens Industri'
    __author__ = 'Jonas Svensson'
    description = 'Economy news from Sweden'
    publisher = 'DI'
    category = 'news, politics, Sweden'
    oldest_article = 2            # skip articles older than 2 days
    delay = 1                     # seconds between page fetches
    max_articles_per_feed = 100
    no_stylesheets = True         # strip site CSS for clean e-book output
    use_embedded_content = False  # feed entries only link; fetch full pages
    encoding = 'utf-8'
    language = 'sv'

    # Single combined RSS feed for the whole site.
    feeds = [(u'DI', u'http://di.se/rss')]

    # Keep only the (ASP.NET auto-generated id) headline and the article body.
    keep_only_tags = [dict(name='h1', attrs={'id':'ctl00_ExtraWideContentRegion_WideContentRegion_MainRegion_MainContentRegion_MainBodyRegion_headlineNormal'}),dict(name='div', attrs={'id':'articleBody'})]

    # Strip sharing toolbars, page chrome and advertisement containers.
    remove_tags = [
        dict(name='div',attrs={'class':'article-actions clear'})
        ,dict(name='div',attrs={'class':'article-action-popup'})
        ,dict(name='div',attrs={'class':'header'})
        ,dict(name='div',attrs={'class':'content clear'})
        ,dict(name='div',attrs={'id':'articleAdvertisementDiv'})
        ,dict(name='ul',attrs={'class':'action-list'})
    ]

View File

@ -12,7 +12,7 @@ from datetime import date
class Guardian(BasicNewsRecipe):
title = u'The Guardian / The Observer'
title = u'The Guardian and The Observer'
if date.today().weekday() == 6:
base_url = "http://www.guardian.co.uk/theobserver"
else:

View File

@ -12,7 +12,7 @@ from datetime import datetime
from dateutil.tz import tzoffset
from calibre.constants import plugins
from calibre.utils.date import parse_date, local_tz
from calibre.utils.date import parse_date, local_tz, UNDEFINED_DATE
from calibre.ebooks.metadata import author_to_author_sort
_c_speedup = plugins['speedup'][0]
@ -29,8 +29,11 @@ def _c_convert_timestamp(val):
if ret is None:
return parse_date(val, as_utc=False)
year, month, day, hour, minutes, seconds, tzsecs = ret
return datetime(year, month, day, hour, minutes, seconds,
try:
return datetime(year, month, day, hour, minutes, seconds,
tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
except OverflowError:
return UNDEFINED_DATE.astimezone(local_tz)
class Table(object):

View File

@ -128,7 +128,7 @@ class ANDROID(USBMS):
'7', 'A956', 'A955', 'A43', 'ANDROID_PLATFORM', 'TEGRA_2',
'MB860', 'MULTI-CARD', 'MID7015A', 'INCREDIBLE', 'A7EB', 'STREAK',
'MB525', 'ANDROID2.3', 'SGH-I997', 'GT-I5800_CARD', 'MB612',
'GT-S5830_CARD']
'GT-S5830_CARD', 'GT-S5570_CARD']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',

View File

@ -399,6 +399,7 @@ class IndexHeader(object): # {{{
def __init__(self, record):
self.record = record
raw = self.record.raw
#open('/t/index_header.bin', 'wb').write(raw)
if raw[:4] != b'INDX':
raise ValueError('Invalid Primary Index Record')
@ -406,7 +407,7 @@ class IndexHeader(object): # {{{
self.unknown1 = raw[8:16]
self.index_type, = struct.unpack('>I', raw[16:20])
self.index_type_desc = {0: 'normal', 2:
'inflection'}.get(self.index_type, 'unknown')
'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
self.idxt_start, = struct.unpack('>I', raw[20:24])
self.index_count, = struct.unpack('>I', raw[24:28])
self.index_encoding_num, = struct.unpack('>I', raw[28:32])
@ -596,10 +597,11 @@ class IndexEntry(object): # {{{
0x3f : 'article',
}
def __init__(self, ident, entry_type, raw, cncx, tagx_entries):
def __init__(self, ident, entry_type, raw, cncx, tagx_entries, flags=0):
self.index = ident
self.raw = raw
self.tags = []
self.entry_type_raw = entry_type
try:
self.entry_type = self.TYPES[entry_type]
@ -619,6 +621,27 @@ class IndexEntry(object): # {{{
vals.append(val)
self.tags.append(Tag(tag, vals, self.entry_type, cncx))
if flags & 0b10:
# Look for optional description and author
desc_tag = [t for t in tagx_entries if t.tag == 22]
if desc_tag and raw:
val, consumed = decint(raw)
raw = raw[consumed:]
if val:
self.tags.append(Tag(desc_tag[0], [val], self.entry_type,
cncx))
if flags & 0b100:
aut_tag = [t for t in tagx_entries if t.tag == 23]
if aut_tag and raw:
val, consumed = decint(raw)
raw = raw[consumed:]
if val:
self.tags.append(Tag(aut_tag[0], [val], self.entry_type,
cncx))
if raw.replace(b'\x00', b''): # There can be padding null bytes
raise ValueError('Extra bytes in INDX table entry %d: %r'%(self.index, raw))
@property
def label(self):
for tag in self.tags:
@ -669,8 +692,8 @@ class IndexEntry(object): # {{{
return -1
def __str__(self):
ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
self.index, self.entry_type, len(self.tags))]
ans = ['Index Entry(index=%s, entry_type=%s (%s), length=%d)'%(
self.index, self.entry_type, bin(self.entry_type_raw)[2:], len(self.tags))]
for tag in self.tags:
ans.append('\t'+str(tag))
if self.first_child_index != -1:
@ -690,6 +713,7 @@ class IndexRecord(object): # {{{
def __init__(self, record, index_header, cncx):
self.record = record
raw = self.record.raw
if raw[:4] != b'INDX':
raise ValueError('Invalid Primary Index Record')
@ -713,6 +737,9 @@ class IndexRecord(object): # {{{
for i in range(self.idxt_count):
off, = u(b'>H', indices[i*2:(i+1)*2])
self.index_offsets.append(off-192)
rest = indices[(i+1)*2:]
if rest.replace(b'\0', ''): # There can be padding null bytes
raise ValueError('Extra bytes after IDXT table: %r'%rest)
indxt = raw[192:self.idxt_offset]
self.indices = []
@ -723,8 +750,13 @@ class IndexRecord(object): # {{{
next_off = len(indxt)
index, consumed = decode_hex_number(indxt[off:])
entry_type = ord(indxt[off+consumed])
d, flags = 1, 0
if index_header.index_type == 6:
flags = ord(indxt[off+consumed+d])
d += 1
self.indices.append(IndexEntry(index, entry_type,
indxt[off+consumed+1:next_off], cncx, index_header.tagx_entries))
indxt[off+consumed+d:next_off], cncx,
index_header.tagx_entries, flags=flags))
index = self.indices[-1]
def get_parent(self, index):
@ -744,7 +776,7 @@ class IndexRecord(object): # {{{
len(w), not bool(w.replace(b'\0', b'')) ))
a('Header length: %d'%self.header_length)
u(self.unknown1)
a('Header Type: %d'%self.header_type)
a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
u(self.unknown2)
a('IDXT Offset: %d'%self.idxt_offset)
a('IDXT Count: %d'%self.idxt_count)

View File

@ -2,6 +2,7 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from future_builtins import filter
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@ -12,7 +13,9 @@ from cStringIO import StringIO
from collections import OrderedDict
from calibre.ebooks import normalize
from calibre.ebooks.mobi.utils import encint
from calibre.ebook.mobi.writer2 import RECORD_SIZE
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex)
from calibre.ebooks.mobi.langcodes import iana2mobi
def utf8_text(text):
'''
@ -37,7 +40,6 @@ def align_block(raw, multiple=4, pad=b'\0'):
if extra == 0: return raw
return raw + pad*(multiple - extra)
class CNCX(object): # {{{
'''
@ -53,17 +55,11 @@ class CNCX(object): # {{{
for item in toc:
if item is self.toc: continue
label = item.title
klass = item.klass
self.strings[item.title] = 0
if opts.mobi_periodical:
if item.description:
self.strings[item.description] = 0
if item.author:
self.string[item.author] = 0
self.strings[label] = self.strings[klass] = 0
self.strings[item.klass] = 0
self.records = []
offset = 0
buf = StringIO()
for key in tuple(self.strings.iterkeys()):
@ -90,27 +86,441 @@ class CNCX(object): # {{{
return self.strings[string]
# }}}
class IndexEntry(object): # {{{
TAG_VALUES = {
'offset': 1,
'size': 2,
'label_offset': 3,
'depth': 4,
'class_offset': 5,
'parent_index': 21,
'first_child_index': 22,
'last_child_index': 23,
}
RTAG_MAP = dict(TAG_VALUES.itervalues(), TAG_VALUES.iterkeys())
BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,]
def __init__(self, offset, label_offset, depth=0, class_offset=None):
self.offset, self.label_offset = offset, label_offset
self.depth, self.class_offset = depth, class_offset
self.length = 0
self.index = 0
self.parent_index = None
self.first_child_index = None
self.last_child_index = None
@classmethod
def tagx_block(cls, for_periodical=True):
buf = bytearray()
def add_tag(tag, num_values=1):
buf.append(tag)
buf.append(num_values)
# bitmask
buf.append(1 << (cls.BITMASKS.index(tag)))
# eof
buf.append(0)
for tag in xrange(1, 5):
add_tag(tag)
if for_periodical:
for tag in (5, 21, 22, 23):
add_tag(tag)
# End of TAGX record
for i in xrange(3): buf.append(0)
buf.append(1)
header = b'TAGX'
header += pack(b'>I', len(buf)) # table length
header += pack(b'>I', 1) # control byte count
return header + bytes(buf)
@property
def next_offset(self):
return self.offset + self.length
@property
def tag_nums(self):
for i in range(1, 5):
yield i
for attr in ('class_offset', 'parent_index', 'first_child_index',
'last_child_index'):
if getattr(self, attr) is not None:
yield self.TAG_VALUES[attr]
@property
def entry_type(self):
ans = 0
for tag in self.tag_nums:
ans |= (1 << self.BITMASKS[tag]) # 1 << x == 2**x
return ans
@property
def bytestring(self):
buf = StringIO()
buf.write(encode_number_as_hex(self.index))
et = self.entry_type
buf.write(bytes(bytearray([et])))
for tag in self.tag_nums:
attr = self.RTAG_MAP[tag]
val = getattr(self, attr)
buf.write(encint(val))
ans = buf.get_value()
return ans
# }}}
class Indexer(object):
def __init__(self, serializer, number_of_text_records, opts, oeb):
def __init__(self, serializer, number_of_text_records,
size_of_last_text_record, opts, oeb):
self.serializer = serializer
self.number_of_text_records = number_of_text_records
self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
size_of_last_text_record)
self.oeb = oeb
self.log = oeb.log
self.opts = opts
self.cncx = CNCX(oeb.toc, opts)
self.is_periodical = opts.mobi_periodical
self.is_flat_periodical = False
if opts.mobi_periodical:
periodical_node = iter(oeb.toc).next()
sections = tuple(periodical_node)
self.is_flat_periodical = len(sections) == 1
self.records = []
def create_header(self):
buf = StringIO()
self.cncx = CNCX(oeb.toc, opts)
# Ident
if self.is_periodical:
self.indices = self.create_periodical_index()
else:
self.indices = self.create_book_index()
self.records.append(self.create_index_record())
self.records.insert(0, self.create_header())
self.records.extend(self.cncx.records)
def create_index_record(self): # {{{
header_length = 192
buf = StringIO()
indices = self.indices
# Write index entries
offsets = []
for i in indices:
offsets.append(buf.tell())
buf.write(i.bytestring)
index_block = align_block(buf.getvalue())
# Write offsets to index entries as an IDXT block
idxt_block = b'IDXT'
buf.truncate(0)
for offset in offsets:
buf.write(pack(b'>H', header_length+offset))
idxt_block = align_block(idxt_block + buf.getvalue())
body = index_block + idxt_block
header = b'INDX'
buf.truncate(0)
buf.write(pack(b'>I', header_length))
buf.write(b'\0'*4) # Unknown
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
buf.write(b'\0'*4) # Unknown
# IDXT block offset
buf.write(pack(b'>I', header_length + len(index_block)))
# Number of index entries
buf.write(pack(b'>I', len(offsets)))
# Unknown
buf.write(b'\xff'*8)
# Unknown
buf.write(b'\0'*156)
header += buf.getvalue()
ans = header + body
if len(ans) > 0x10000:
raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
return ans
# }}}
def create_header(self): # {{{
buf = StringIO()
tagx_block = IndexEntry.tagx_block(self.is_periodical)
header_length = 192
# Ident 0 - 4
buf.write(b'INDX')
# Header length
buf.write(pack(b'>I', 192))
# Header length 4 - 8
buf.write(pack(b'>I', header_length))
# Index type: 0 - normal, 2 - inflection
# Unknown 8-16
buf.write(b'\0'*8)
# Index type: 0 - normal, 2 - inflection 16 - 20
buf.write(pack(b'>I', 2))
# IDXT offset 20-24
buf.write(pack(b'>I', 0)) # Filled in later
# Number of index records 24-28
buf.write(pack('b>I', len(self.records)))
# Index Encoding 28-32
buf.write(pack(b'>I', 65001)) # utf-8
# Index language 32-36
buf.write(iana2mobi(
str(self.oeb.metadata.language[0])))
# Number of index entries 36-40
buf.write(pack(b'>I', len(self.indices)))
# ORDT offset 40-44
buf.write(pack(b'>I', 0))
# LIGT offset 44-48
buf.write(pack(b'>I', 0))
# Number of LIGT entries 48-52
buf.write(pack(b'>I', 0))
# Number of CNCX records 52-56
buf.write(pack(b'>I', len(self.cncx.records)))
# Unknown 56-180
buf.write(b'\0'*124)
# TAGX offset 180-184
buf.write(pack(b'>I', header_length))
# Unknown 184-192
buf.write(b'\0'*8)
# TAGX block
buf.write(tagx_block)
num = len(self.indices)
# The index of the last entry in the NCX
buf.write(encode_number_as_hex(num-1))
# The number of entries in the NCX
buf.write(pack(b'>H', num))
# Padding
pad = (4 - (buf.tell()%4))%4
if pad:
buf.write(b'\0'*pad)
idxt_offset = buf.tell()
buf.write(b'IDXT')
buf.write(header_length + len(tagx_block))
buf.write(b'\0')
buf.seek(20)
buf.write(pack(b'>I', idxt_offset))
return align_block(buf.getvalue())
# }}}
def create_book_index(self): # {{{
indices = []
seen = set()
id_offsets = self.serializer.id_offsets
for node in self.oeb.toc.iterdescendants():
try:
offset = id_offsets[node.href]
label = self.cncx[node.title]
except:
self.log.warn('TOC item %s not found in document'%node.href)
continue
if offset in seen:
continue
seen.add(offset)
index = IndexEntry(offset, label)
self.indices.append(index)
indices.sort(key=lambda x:x.offset)
# Set lengths
for i, index in indices:
try:
next_offset = indices[i+1].offset
except:
next_offset = self.serializer.body_end_offset
index.length = next_offset - index.offset
# Remove empty nodes
indices = [i for i in indices if i.length > 0]
# Set index values
for i, index in indices:
index.index = i
# Set lengths again to close up any gaps left by filtering
for i, index in indices:
try:
next_offset = indices[i+1].offset
except:
next_offset = self.serializer.body_end_offset
index.length = next_offset - index.offset
return indices
# }}}
def create_periodical_index(self): # {{{
periodical_node = iter(self.oeb.toc).next()
periodical_node_offset = self.serializer.body_start_offset
periodical_node_size = (self.serializer.body_end_offset -
periodical_node_offset)
normalized_sections = []
id_offsets = self.serializer.id_offsets
periodical = IndexEntry(periodical_node_offset,
self.cncx[periodical_node.title],
class_offset=self.cncx[periodical_node.klass])
periodical.length = periodical_node_size
periodical.first_child_index = 1
seen_sec_offsets = set()
seen_art_offsets = set()
for sec in periodical_node:
normalized_articles = []
try:
offset = id_offsets[sec.href]
label = self.cncx[sec.title]
klass = self.cncx[sec.klass]
except:
continue
if offset in seen_sec_offsets:
continue
seen_sec_offsets.add(offset)
section = IndexEntry(offset, label, class_offset=klass, depth=1)
section.parent_index = 0
for art in sec:
try:
offset = id_offsets[art.href]
label = self.cncx[art.title]
klass = self.cncx[art.klass]
except:
continue
if offset in seen_art_offsets:
continue
seen_art_offsets.add(offset)
article = IndexEntry(offset, label, class_offset=klass,
depth=2)
normalized_articles.append(article)
if normalized_articles:
normalized_articles.sort(key=lambda x:x.offset)
normalized_sections.append((section, normalized_articles))
normalized_sections.sort(key=lambda x:x[0].offset)
# Set lengths
for s, x in enumerate(normalized_sections):
sec, normalized_articles = x
try:
sec.length = normalized_sections[s+1].offset - sec.offset
except:
sec.length = self.serializer.body_end_offset - sec.offset
for i, art in enumerate(normalized_articles):
try:
art.length = normalized_articles[i+1].offset - art.offset
except:
art.length = sec.offset + sec.length - art.offset
# Filter
for i, x in list(enumerate(normalized_sections)):
sec, normalized_articles = x
normalized_articles = list(filter(lambda x: x.length > 0,
normalized_articles))
normalized_sections[i] = (sec, normalized_articles)
normalized_sections = list(filter(lambda x: x[0].size > 0 and x[1],
normalized_sections))
# Set indices
i = 0
for sec, normalized_articles in normalized_sections:
i += 1
sec.index = i
for sec, normalized_articles in normalized_sections:
for art in normalized_articles:
i += 1
art.index = i
art.parent_index = sec.index
for sec, normalized_articles in normalized_sections:
sec.first_child_index = normalized_articles[0].index
sec.last_child_index = normalized_articles[-1].index
# Set lengths again to close up any gaps left by filtering
for s, x in enumerate(normalized_sections):
sec, articles = x
try:
next_offset = normalized_sections[s+1].offset
except:
next_offset = self.serializer.body_end_offset
sec.length = next_offset - sec.offset
for a, art in enumerate(articles):
try:
next_offset = articles[a+1].offset
except:
next_offset = sec.next_offset
art.length = next_offset - art.offset
# Sanity check
for s, x in enumerate(normalized_sections):
sec, articles = x
try:
next_sec = normalized_sections[s+1]
except:
if (sec.length == 0 or sec.next_offset !=
self.serializer.body_end_offset):
raise ValueError('Invalid section layout')
else:
if next_sec.offset != sec.next_offset or sec.length == 0:
raise ValueError('Invalid section layout')
for a, art in enumerate(articles):
try:
next_art = articles[a+1]
except:
if (art.length == 0 or art.next_offset !=
sec.next_offset):
raise ValueError('Invalid article layout')
else:
if art.length == 0 or art.next_offset != next_art.offset:
raise ValueError('Invalid article layout')
# Flatten
indices = [periodical]
for sec, articles in normalized_sections:
indices.append(sec)
periodical.last_child_index = sec.index
for sec, articles in normalized_sections:
for a in articles:
indices.append(a)
return indices
# }}}

View File

@ -20,6 +20,7 @@ from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
from calibre.ebooks.mobi.utils import (rescale_image, encint,
encode_trailing_data)
from calibre.ebooks.mobi.writer2.indexer import Indexer
EXTH_CODES = {
'creator': 100,
@ -87,6 +88,14 @@ class MobiWriter(object):
# Indexing {{{
def generate_index(self):
self.primary_index_record_idx = None
try:
self.indexer = Indexer(self.serializer, self.last_text_record_idx,
self.opts, self.oeb)
except:
self.log.exception('Failed to generate MOBI index:')
else:
self.primary_index_record_idx = len(self.records)
self.records.extend(self.indexer.records)
# }}}
def write_uncrossable_breaks(self): # {{{
@ -202,7 +211,6 @@ class MobiWriter(object):
record.write(overlap)
record.write(pack(b'>B', len(overlap)))
self.last_text_record_idx = nrecords
def read_text_record(self, text):
@ -265,8 +273,6 @@ class MobiWriter(object):
# EOF record
self.records.append('\xE9\x8E\x0D\x0A')
self.generate_end_records()
record0 = StringIO()
# The MOBI Header
record0.write(pack(b'>HHIHHHH',

View File

@ -143,6 +143,7 @@ class Serializer(object):
spine.extend([item for item in self.oeb.spine if not item.linear])
for item in spine:
self.serialize_item(item)
self.body_end_offset = buf.tell()
buf.write(b'</body>')
def serialize_item(self, item):

View File

@ -133,6 +133,7 @@ def render_data(mi, use_roman_numbers=True, all_fields=False):
authors = []
formatter = EvalFormatter()
for aut in mi.authors:
link = ''
if mi.author_link_map[aut]:
link = mi.author_link_map[aut]
elif gprefs.get('default_author_link'):

View File

@ -183,7 +183,6 @@ class Quickview(QDialog, Ui_Quickview):
self.items.blockSignals(False)
def indicate_no_items(self):
print 'no items'
self.no_valid_items = True
self.items.clear()
self.items.addItem(QListWidgetItem(_('**No items found**')))

View File

@ -6,6 +6,8 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.utils.filenames import ascii_filename
class StorePlugin(object): # {{{
'''
A plugin representing an online ebook repository (store). The store can
@ -53,7 +55,7 @@ class StorePlugin(object): # {{{
self.gui = gui
self.name = name
self.base_plugin = None
self.config = JSONConfig('store/stores/' + self.name)
self.config = JSONConfig('store/stores/' + ascii_filename(self.name))
def open(self, gui, parent=None, detail_item=None, external=False):
'''

View File

@ -15,6 +15,7 @@ from calibre.gui2 import config, dynamic, open_url
from calibre.gui2.dialogs.plugin_updater import get_plugin_updates_available
URL = 'http://status.calibre-ebook.com/latest'
#URL = 'http://localhost:8000/latest'
NO_CALIBRE_UPDATE = '-0.0.0'
VSEP = '|'

View File

@ -17,7 +17,7 @@ from datetime import datetime
from functools import partial
from calibre.ebooks.metadata import title_sort, author_to_author_sort
from calibre.utils.date import parse_date, isoformat, local_tz
from calibre.utils.date import parse_date, isoformat, local_tz, UNDEFINED_DATE
from calibre import isbytestring, force_unicode
from calibre.constants import iswindows, DEBUG, plugins
from calibre.utils.icu import strcmp
@ -39,8 +39,11 @@ def _c_convert_timestamp(val):
if ret is None:
return parse_date(val, as_utc=False)
year, month, day, hour, minutes, seconds, tzsecs = ret
return datetime(year, month, day, hour, minutes, seconds,
try:
return datetime(year, month, day, hour, minutes, seconds,
tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
except OverflowError:
return UNDEFINED_DATE.astimezone(local_tz)
def _py_convert_timestamp(val):
if val:

View File

@ -401,7 +401,7 @@ with undefined values in the column. Searching for ``true`` will find all books
values in the column. Searching for ``yes`` or ``checked`` will find all books with ``Yes`` in the column.
Searching for ``no`` or ``unchecked`` will find all books with ``No`` in the column.
Hierarchical items (e.g. A.B.C) use an extended syntax to match initial parts of the hierarchy. This is done by adding a period between the exact match indicator (=) and the text. For example, the query ``tags:=.A`` will find the tags `A` and `A.B`, but will not find the tags `AA` or `AA.B`. The query ``tags:=.A.B`` will find the tags `A.B` and `A.C`, but not the tag `A`.
Hierarchical items (e.g. A.B.C) use an extended syntax to match initial parts of the hierarchy. This is done by adding a period between the exact match indicator (=) and the text. For example, the query ``tags:=.A`` will find the tags `A` and `A.B`, but will not find the tags `AA` or `AA.B`. The query ``tags:=.A.B`` will find the tags `A.B` and `A.B.C`, but not the tag `A`.
Identifiers (e.g., isbn, doi, lccn etc) also use an extended syntax. First, note that an identifier has the form ``type:value``, as in ``isbn:123456789``. The extended syntax permits you to specify independently which type and value to search for. Both the type and the value parts of the query can use `equality`, `contains`, or `regular expression` matches. Examples: