Sync to trunk.

2025-07-09 03:04:10 -04:00 · 2012-04-22 10:00:22 -04:00 · 2012-04-22 10:00:22 -04:00 · 4952d01cdd
commit 4952d01cdd
parent 748172b422 fe1e290820
27 changed files with 1439 additions and 311 deletions
--- a/Changelog.yaml
+++ b/Changelog.yaml
@ -19,6 +19,57 @@
 #   new recipes:
 #     - title: 

+- version: 0.8.48
+  date: 2012-04-20
+
+  new features:
+    - title: "Conversion: The search and replace feature has been completely revamped."
+      description: "You can now use any number of search and replace
+      expressions, not just three. You can also store and load frequently used
+      sets of search and replace expressions. Also, the wizard generates its
+      preview in a separate process to protect against crashes/memory leaks."
+      tickets: [983476,983484,983478]
+
+    - title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free."
+
+    - title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X"
+      tickets: [981185] 
+ 
+  bug fixes:
+    - title: "Get Books: Support the new website design of Barnes & Noble"
+
+    - title: "T1 driver: Fix books sent to SD card sometimes resulting in problems when deleted."
+      tickets: [943586]
+
+    - title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'"
+
+    - title: "MOBI Output: Handle background color specified on <td> and <tr> in addition to <table> tags."
+      tickets: [980813]
+
+    - title: "MOBI Output: Fix underline style applied to parent element not getting inherited by <a> children."
+      tickets: [985711]
+
+  improved recipes:
+    - xkcd
+    - Metro Nieuws
+    - Calgary Herald
+    - Orlando Sentinel
+    - countryfile
+    - Heise
+
+  new recipes:
+    - title: Various new Polish news sources
+      author: fenuks
+
+    - title: Various Italian news sources 
+      author: faber1971
+
+    - title: Jakarta Globe 
+      author: rty
+
+    - title: Acim Bilim Dergisi
+      author: thomass
+
 - version: 0.8.47
  date: 2012-04-13

--- a/recipes/acim_bilim_dergisi.recipe
+++ b/recipes/acim_bilim_dergisi.recipe
@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1334868409(BasicNewsRecipe):
+    # Recipe for the monthly online science magazine Açık Bilim
+    # (www.acikbilim.com). All content comes from the single RSS feed listed
+    # in `feeds`; article clean-up is left to auto_cleanup.
+    title          = u'AÇIK BİLİM DERGİSİ'
+    description            =  ' Aylık çevrimiçi bilim dergisi'
+    __author__             = u'thomass'
+    oldest_article = 30
+    max_articles_per_feed = 300
+    auto_cleanup = True
+    encoding               = 'UTF-8'
+    publisher              = 'açık bilim'
+    category               = 'haber, bilim,TR,dergi'
+    language               = 'tr'
+    # Must be the bare word: a trailing blank ('magazine ') no longer compares
+    # equal to 'magazine' wherever the publication type is checked.
+    publication_type = 'magazine'
+    # Metadata handed to the conversion pipeline, built from the values above
+    conversion_options = {
+                            'tags'            : category
+                            ,'language'        : language
+                            ,'publisher'       : publisher
+                            ,'linearize_tables': True
+                         }
+    # NOTE(review): cover_img_url is only assigned here and nothing in this
+    # recipe reads it -- confirm whether cover_url was intended.
+    cover_img_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
+    masthead_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
+
+
+    feeds          = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')]
--- a/recipes/icons/telam.png
+++ b/recipes/icons/telam.png
--- a/recipes/sol_haber.recipe
+++ b/recipes/sol_haber.recipe
@ -0,0 +1,141 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Onur Gungor onurgu@gmail.com'
+__docformat__ = 'restructuredtext en'
+
+'''
+www.sol.org.tr
+'''
+
+import datetime
+
+import re
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class SolHaberRecipe(BasicNewsRecipe):
+    # Recipe for soL Haber (haber.sol.org.tr).
+    #
+    # Instead of RSS feeds the article list is scraped from the site's archive
+    # pages, one page per entry in section_tuples, restricted to the date
+    # range start_date..end_date (yesterday..today, computed when the class
+    # body is executed). See parse_index().
+    title = u'soL Haber'
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    language = 'tr'
+    __author__ = 'Onur Güngör'
+    # Double quotes so the apostrophe survives; 'soL''dan' is two adjacent
+    # literals that concatenate to "soLdan".
+    description = "Hayata soL'dan bakın.."
+    publisher = 'soL Haber'
+    tags = 'news, haberler, siyaset, türkiye, turkey, politics'
+
+
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : tags
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    # Maps the first path component of an article URL to the section title
+    # shown in the generated index.
+    category_dict = { 'sonuncu-kavga':'Sonuncu Kavga',
+                      'devlet-ve-siyaset':'Devlet ve Siyaset',
+                      'ekonomi':'Ekonomi',
+                      'enternasyonal-gundem':'Enternasyonel Gündem',
+                      'kent-gundemleri':'Kent Gündemleri',
+                      'kultur-sanat':'Kültür Sanat',
+                      'dunyadan':'Dünyadan',
+                      'serbest-kursu':'Serbest Kürsü',
+                      'medya':'Medya',
+                      'liseliler':'Liseliler',
+                      'yazarlar':'Köşe Yazıları'}
+
+    end_date = datetime.date.today().isoformat()
+    start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat()
+
+
+    # [fallback section title, archive URL]; %%5B / %%5D are the URL-encoded
+    # square brackets, doubled so they survive the % formatting.
+    section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+                    ['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+                    ['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+                    ['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]]
+
+
+    # Disable stylesheets from site.
+    no_stylesheets = True
+
+    cover_margins = (20, 20, '#ffffff')
+
+    # Raw strings keep the regex escapes (\s) out of Python's own escape handling
+    storybody_reg_exp = r'^\s*(haber|kose)\s*$'
+
+    comments_reg_exp = r'^\s*makale-elestiri\s*$'
+
+    remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})]
+
+    keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})]
+
+    def get_masthead_title(self):
+        '''Return the recipe title followed by the end date in parentheses.'''
+        return self.title + "(" + self.end_date + ")"
+
+    def parse_index(self):
+        '''
+        Build the article index from the archive pages in section_tuples.
+
+        Returns a list of (section title, [article dict, ...]) tuples. Each
+        article dict has the keys title, url, description, date and author.
+        Articles are grouped by the first path component of their URL,
+        translated through category_dict when it is known; the archive page's
+        own title is used when the URL does not match.
+
+        As a side effect cover_url is set from the logo image of each archive
+        page that has one.
+
+        Pages or table rows that lack the expected markup are skipped rather
+        than aborting the whole download.
+        '''
+        articles_dict = dict()
+
+        author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
+        category_regexp = re.compile('^http://.*?/(.+?)/.*$')
+
+        for section_title, section_index_url in self.section_tuples:
+
+            self.log('Bölüm:', section_title, 'URL:', section_index_url)
+
+            soup = self.index_to_soup(section_index_url)
+
+            # find() returns None when the element is absent, so every step
+            # of a chained lookup has to be checked before the next one.
+            logo_div = soup.find('div', id='logo')
+            logo = logo_div.find('img', src=True) if logo_div is not None else None
+            if logo is not None:
+                self.cover_url = logo['src']
+                if self.cover_url.startswith('/'):
+                    self.cover_url = 'http://haber.sol.org.tr'+self.cover_url
+
+            main_div = soup.find('div', id='ana-icerik')
+            view_content = None
+            if main_div is not None:
+                view_content = main_div.find('div', attrs={'class':'view-content'})
+            tbody = view_content.find('tbody') if view_content is not None else None
+            if tbody is None:
+                # Nothing listed for this section; the remaining sections
+                # may still have articles, so keep going.
+                continue
+            rows = tbody.findAll('tr')
+
+            self.log('Row sayısı', len(rows))
+            for row in rows:
+                cells = row.findAll('td')
+                if len(cells) < 3:
+                    # Link is read from the second cell, date from the third
+                    continue
+
+                a = cells[1].find('a', href=True)
+                if a is None:
+                    continue
+
+                url = a['href']
+                title = self.tag_to_string(a)
+
+                if url.startswith('/'):
+                    url = 'http://haber.sol.org.tr'+url
+
+                category = section_title
+                category_match_result = category_regexp.match(url)
+                if category_match_result:
+                    category = category_match_result.group(1)
+
+                date = self.tag_to_string(cells[2])
+
+                author = 'soL haber'
+
+                author_match_result = author_regexp.match(url)
+                if author_match_result:
+                    author = author_match_result.group(1)
+
+                self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author)
+                article = {'title':title, 'url':url, 'description':None, 'date':date, 'author':author}
+                articles_dict.setdefault(category, []).append(article)
+
+        # Unknown categories keep their raw name as the section title
+        return [(self.category_dict.get(category, category), articles)
+                for category, articles in articles_dict.items()]
--- a/recipes/telam.recipe
+++ b/recipes/telam.recipe
@ -0,0 +1,62 @@
+__license__   = 'GPL v3'
+__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.telam.com.ar
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Telam(BasicNewsRecipe):
+    # Recipe for the Telam news agency (www.telam.com.ar). Articles are listed
+    # through the RSS feeds in `feeds` and downloaded from the URL built by
+    # print_version().
+    title                 = 'Telam'
+    __author__            = 'Darko Miletic'
+    description           = 'AGENCIA DE NOTICIAS DE LA REPUBLICA ARGENTINA'
+    publisher             = 'Telam S.E.'
+    category              = 'news, politics, Argentina'
+    oldest_article        = 2
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'utf8'
+    use_embedded_content  = False
+    language              = 'es_AR'
+    remove_empty_feeds    = True
+    publication_type      = 'newsportal'
+    masthead_url          = 'http://www.telam.com.ar/front/imagenes/encabezado/logotelam.jpg'
+    extra_css             = """
+                               body{font-family: Arial,Helvetica,sans-serif }
+                               img{margin-bottom: 0.4em; display:block}
+                            """
+
+    # Metadata for the conversion pipeline, built from the values above
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    # Markup selectors: <meta>/<link> tags, everything before the element with
+    # class 'nota_fecha' and everything after the one with class
+    # 'nota_completa', plus all 'lang' attributes.
+    remove_tags        = [dict(name=['meta','link'])]
+    remove_tags_before = dict(attrs={'class':'nota_fecha'})
+    remove_tags_after  = dict(attrs={'class':'nota_completa'})
+    remove_attributes  = ['lang']
+
+
+    # (feed title, feed URL) pairs
+    feeds = [
+              (u'Ultimas noticias', u'http://www.telam.com.ar/xml/rss/' )
+             ,(u'Politica'        , u'http://www.telam.com.ar/xml/rss/1')
+             ,(u'Economia'        , u'http://www.telam.com.ar/xml/rss/2')
+             ,(u'Sociedad'        , u'http://www.telam.com.ar/xml/rss/3')
+             ,(u'Policiales'      , u'http://www.telam.com.ar/xml/rss/4')
+             ,(u'Internacionales' , u'http://www.telam.com.ar/xml/rss/6')
+             ,(u'Espectaculos'    , u'http://www.telam.com.ar/xml/rss/7')
+             ,(u'Cultura'         , u'http://www.telam.com.ar/xml/rss/8')
+             ,(u'Deportes'        , u'http://www.telam.com.ar/xml/rss/9')
+             ,(u'Telam Investiga' , u'http://www.telam.com.ar/xml/rss/5')
+            ]
+
+    def print_version(self, url):
+        # The article id is whatever follows the last '/' of the article URL;
+        # it is appended to the site's 'imprimir-nota' URL.
+        artid = url.rpartition('/')[2]
+        return 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid
+
+    def preprocess_html(self, soup):
+        # Drop the inline style attribute from every tag that carries one
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
--- a/recipes/tpm_uk.recipe
+++ b/recipes/tpm_uk.recipe
@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
    __author__            = 'Darko Miletic'
    description           = 'Title says it all'
    publisher             = "The Philosophers' Magazine"
+    recipe_disabled = ('This recipe has been disabled as the website has'
+            ' started providing articles only in PDF form')
    category              = 'philosophy, news'
    oldest_article        = 25
    max_articles_per_feed = 200
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@ -4,7 +4,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = u'calibre'
-numeric_version = (0, 8, 47)
+numeric_version = (0, 8, 48)
 __version__   = u'.'.join(map(unicode, numeric_version))
 __author__    = u"Kovid Goyal <kovid@kovidgoyal.net>"

--- a/src/calibre/devices/kobo/driver.py
+++ b/src/calibre/devices/kobo/driver.py
@ -298,7 +298,7 @@ class KOBO(USBMS):
            changed = False
            for i, row in enumerate(cursor):
            #  self.report_progress((i+1) / float(numrows), _('Getting list of books on device...'))
-                if row[3].startswith("file:///usr/local/Kobo/help/"):
+                if not hasattr(row[3], 'startswith') or row[3].startswith("file:///usr/local/Kobo/help/"):
                    # These are internal to the Kobo device and do not exist
                    continue
                path = self.path_from_contentid(row[3], row[5], row[4], oncard)
--- a/src/calibre/ebooks/conversion/plugins/mobi_output.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py
@ -169,6 +169,7 @@ class MOBIOutput(OutputFormatPlugin):
        self.remove_html_cover()
        resources = Resources(oeb, opts, self.is_periodical,
                add_fonts=create_kf8)
+        self.check_for_periodical()

        kf8 = self.create_kf8(resources) if create_kf8 else None

@ -203,7 +204,6 @@ class MOBIOutput(OutputFormatPlugin):
            resources.add_extra_images()
        mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
        mobimlizer(oeb, opts)
-        self.check_for_periodical()
        write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
        from calibre.ebooks.mobi.writer2.main import MobiWriter
        writer = MobiWriter(opts, resources, kf8,
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -559,7 +559,7 @@ class HTMLPreProcessor(object):
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                )

        for rule in self.PREPROCESS + start_rules:
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -316,13 +316,20 @@ class HeuristicProcessor(object):
        '''
        Unwraps lines based on line length and punctuation
        supports a range of html markup and text files
+        
+        the lookahead regex below is meant to look for any non-full stop characters - punctuation
+        characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
+        the reason for this is to prevent false positive wrapping.  False positives are more
+        difficult to detect than false negatives during a manual review of the doc
+        
+        This function intentionally leaves hyphenated content alone as that is handled by the 
+        dehyphenate routine in a separate step
        '''
-        # define the pieces of the regex

-        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        # define the pieces of the regex
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
        soft_hyphen = u"\xad"
-        dash = u"\x2d" # some ocrs doesn't convert dashes to hyphens
        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@ -331,23 +338,19 @@ class HeuristicProcessor(object):
        unwrap_regex = lookahead+line_ending+blanklines+line_opening
        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
-        dash_unwrap_regex = dash+line_ending+blanklines+line_opening

        if format == 'txt':
            unwrap_regex = lookahead+txt_line_wrap
            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
            shy_unwrap_regex = soft_hyphen+txt_line_wrap
-            dash_unwrap_regex = dash+txt_line_wrap

        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
-        dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)

        content = unwrap.sub(' ', content)
        content = em_en_unwrap.sub('', content)
        content = shy_unwrap.sub('', content)
-        content = dash_unwrap.sub('', content)
        return content

    def txt_process(self, match):
@ -460,27 +463,31 @@ class HeuristicProcessor(object):
        return html

    def detect_whitespace(self, html):
-        blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+        blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+        blanks_around_scene_breaks = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
        blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)

        def merge_header_whitespace(match):
            initblanks = match.group('initparas')
-            endblanks = match.group('initparas')
-            heading = match.group('heading')
+            endblanks = match.group('endparas')
+            content = match.group('content')
            top_margin = ''
            bottom_margin = ''
            if initblanks is not None:
                top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
            if endblanks is not None:
-                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'

            if initblanks == None and endblanks == None:
-                return heading
+                return content
+            elif content.find('scenebreak') != -1:
+                return content
            else:
-                heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
-            return heading
+                content = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
+            return content

        html = blanks_around_headings.sub(merge_header_whitespace, html)
+        html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)

        def markup_whitespaces(match):
            blanks = match.group(0)
@ -515,6 +522,12 @@ class HeuristicProcessor(object):
            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
        return html

+    def detect_scene_breaks(self, html):
+        '''
+        Mark up scene break lines and return the modified html.
+
+        A line qualifies when, between self.line_open and self.line_close, it
+        holds nothing but non-word, non-whitespace characters (optionally
+        separated by whitespace). Lines matching common_in_text_beginnings or
+        common_in_text_endings are excluded by the negative lookahead. Each
+        match is replaced by self.scene_break_open + the break text + </p>.
+        '''
+        not_in_text = '(?!(' + self.common_in_text_beginnings + '|.*?' + self.common_in_text_endings + '<))'
+        break_run = r'(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'
+        pattern = self.line_open + not_in_text + break_run + self.line_close
+        matcher = re.compile(pattern, re.IGNORECASE|re.UNICODE)
+        return matcher.sub(self.scene_break_open + r'\g<break>' + '</p>', html)
+
    def markup_user_break(self, replacement_break):
        '''
        Takes string a user supplies and wraps it in markup that will be centered with
@ -781,25 +794,25 @@ class HeuristicProcessor(object):
        if getattr(self.extra_opts, 'format_scene_breaks', False):
            self.log.debug('Formatting scene breaks')
            html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
+            html = self.detect_scene_breaks(html)
            html = self.detect_whitespace(html)
            html = self.detect_soft_breaks(html)
            blanks_count = len(self.any_multi_blank.findall(html))
            if blanks_count >= 1:
                html = self.merge_blanks(html, blanks_count)
-            scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
-            scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+            detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
+            scene_break_count = len(detected_scene_break.findall(html))
            # If the user has enabled scene break replacement, then either softbreaks
            # or 'hard' scene breaks are replaced, depending on which is in use
            # Otherwise separator lines are centered, use a bit larger margin in this case
            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
            if replacement_break:
                replacement_break = self.markup_user_break(replacement_break)
-                if len(scene_break.findall(html)) >= 1:
-                    html = scene_break.sub(replacement_break, html)
+                if scene_break_count >= 1:
+                    html = detected_scene_break.sub(replacement_break, html)
+                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
                else:
                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
-            else:
-                html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)

        if self.deleted_nbsps:
            # put back non-breaking spaces in empty paragraphs so they render correctly
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@ -197,14 +197,18 @@ class OverDrive(Source):
        title_tokens = list(self.get_title_tokens(title,
                strip_joiners=False, strip_subtitle=True))

-        if len(title_tokens) >= len(author_tokens):
+        xref_q = ''
+        if len(author_tokens) <= 1:
            initial_q = ' '.join(title_tokens)
            xref_q = '+'.join(author_tokens)
        else:
            initial_q = ' '.join(author_tokens)
-            xref_q = '+'.join(title_tokens)
-        #log.error('Initial query is %s'%initial_q)
-        #log.error('Cross reference query is %s'%xref_q)
+            for token in title_tokens:
+                if len(xref_q) < len(token):
+                    xref_q = token
+
+        log.error('Initial query is %s'%initial_q)
+        log.error('Cross reference query is %s'%xref_q)

        q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
        query = '{"szKeyword":"'+initial_q+'"}'
@ -219,27 +223,30 @@ class OverDrive(Source):

        # get the search results object
        results = False
+        iterations = 0
        while results == False:
+            iterations += 1
            xreq = mechanize.Request(q_xref)
            xreq.add_header('X-Requested-With', 'XMLHttpRequest')
            xreq.add_header('Referer', q_init_search)
            xreq.add_header('Accept', 'application/json, text/javascript, */*')
            raw = br.open_novisit(xreq).read()
            for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
-                if int(m.group('displayrecords')) >= 1:
-                    results = True
-                elif int(m.group('totalrecords')) >= 1:
-                    if int(m.group('totalrecords')) >= 100:
-                        if xref_q.find('+') != -1:
-                            xref_tokens = xref_q.split('+')
-                            xref_q = xref_tokens[0]
-                            #log.error('xref_q is '+xref_q)
-                    else:
-                        xref_q = ''
-                    xref_q = ''
-                    q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
-                elif int(m.group('totalrecords')) == 0:
+                if int(m.group('totalrecords')) == 0:
                    return ''
+                elif int(m.group('displayrecords')) >= 1:
+                    results = True
+                elif int(m.group('totalrecords')) >= 1 and iterations < 3:
+                    if xref_q.find('+') != -1:
+                        xref_tokens = xref_q.split('+')
+                        xref_q = xref_tokens[0]
+                        for token in xref_tokens:
+                            if len(xref_q) < len(token):
+                                xref_q = token
+                        #log.error('rewrote xref_q, new query is '+xref_q)
+                else:
+                        xref_q = ''
+                q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q

        return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)

@ -263,6 +270,7 @@ class OverDrive(Source):
                else:
                    if creators:
                        creators = creators.split(', ')
+
                    # if an exact match in a preferred format occurs
                    if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
                        return self.format_results(reserveid, od_title, subtitle, series, publisher,
@ -330,9 +338,9 @@ class OverDrive(Source):
    def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
        q = base_url
        if ovrdrv_id is None:
-           return self.overdrive_search(br, log, q, title, author)
+            return self.overdrive_search(br, log, q, title, author)
        else:
-           return self.overdrive_get_record(br, log, q, ovrdrv_id)
+            return self.overdrive_get_record(br, log, q, ovrdrv_id)



@ -461,10 +469,10 @@ if __name__ == '__main__':
        [

            (
-                {'title':'Foundation and Earth',
-                    'authors':['Asimov']},
-                [title_test('Foundation and Earth', exact=True),
-                    authors_test(['Isaac Asimov'])]
+                {'title':'The Sea Kings Daughter',
+                    'authors':['Elizabeth Peters']},
+                [title_test('The Sea Kings Daughter', exact=False),
+                    authors_test(['Elizabeth Peters'])]
            ),

            (
--- a/src/calibre/ebooks/mobi/debug/index.py
+++ b/src/calibre/ebooks/mobi/debug/index.py
@ -17,7 +17,7 @@ from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
 File = namedtuple('File',
    'file_number name divtbl_count start_position length')

-Elem = namedtuple('Elem',
+Elem = namedtuple('Chunk',
    'insert_pos toc_text file_number sequence_number start_pos '
    'length')

@ -110,7 +110,7 @@ class SECTIndex(Index):
             for i, text in enumerate(self.table.iterkeys()):
                tag_map = self.table[text]
                if set(tag_map.iterkeys()) != {2, 3, 4, 6}:
-                    raise ValueError('SECT Index has unknown tags: %s'%
+                    raise ValueError('Chunk Index has unknown tags: %s'%
                            (set(tag_map.iterkeys())-{2, 3, 4, 6}))

                toc_text = self.cncx[tag_map[2][0]]
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@ -198,7 +198,7 @@ def inspect_mobi(mobi_file, ddir):
    with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
        fo.write(str(f.skel_index).encode('utf-8'))

-    with open(os.path.join(ddir, 'sect.record'), 'wb') as fo:
+    with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
        fo.write(str(f.sect_index).encode('utf-8'))

    with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@ -9,11 +9,13 @@ __docformat__ = 'restructuredtext en'

 import struct, string, imghdr, zlib, os
 from collections import OrderedDict
+from io import BytesIO

 from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
 from calibre.ebooks import normalize

 IMAGE_MAX_SIZE = 10 * 1024 * 1024
+RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))

 def decode_string(raw, codec='utf-8', ordt_map=''):
    length, = struct.unpack(b'>B', raw[0])
@ -498,3 +500,107 @@ def write_font_record(data, obfuscate=True, compress=True):

 # }}}

+def create_text_record(text):
+    '''
+    Return a Palmdoc record of size RECORD_SIZE from the text file object.
+    In case the record ends in the middle of a multibyte character return
+    the overlap as well.
+
+    Returns data, overlap: where both are byte strings. overlap is the
+    extra bytes needed to complete the truncated multibyte character.
+
+    On return the stream is positioned at the start of the next record
+    (current position + RECORD_SIZE, clamped to the end of the stream).
+    '''
+    # Remember where this record starts, then seek to the end so that tell()
+    # reports the total length of the stream
+    opos = text.tell()
+    text.seek(0, 2)
+    # npos is the position of the next record
+    npos = min((opos + RECORD_SIZE, text.tell()))
+    # Number of bytes from the next record needed to complete the last
+    # character in this record
+    extra = 0
+
+    # Read a progressively longer tail ending at npos (1 byte, 2 bytes, ...)
+    # until it contains at least one complete utf-8 character.
+    # NOTE(review): with npos == 0 (empty stream) the first seek would target
+    # a negative offset -- confirm callers never pass an empty stream.
+    last = b''
+    while not last.decode('utf-8', 'ignore'):
+        # last contains no valid utf-8 characters
+        size = len(last) + 1
+        text.seek(npos - size)
+        last = text.read(size)
+
+    # last now has one valid utf-8 char and possibly some bytes that belong
+    # to a truncated char
+
+    try:
+        last.decode('utf-8', 'strict')
+    except UnicodeDecodeError:
+        # There are some truncated bytes in last
+        prev = len(last)
+        # Re-read from the same start offset, one byte longer on every pass
+        # (reaching into the next record), until the tail decodes cleanly
+        while True:
+            text.seek(npos - prev)
+            last = text.read(len(last) + 1)
+            try:
+                last.decode('utf-8')
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        # Bytes that had to be borrowed from beyond npos
+        extra = len(last) - prev
+
+    # Read the record itself, then the borrowed bytes that directly follow it;
+    # finally leave the stream at npos so the next call starts on the record
+    # boundary (the overlap bytes are read again as part of the next record).
+    text.seek(opos)
+    data = text.read(RECORD_SIZE)
+    overlap = text.read(extra)
+    text.seek(npos)
+
+    return data, overlap
+
+class CNCX(object): # {{{
+
+    '''
+    Create the CNCX records. These are records containing all the strings from
+    an index. Each record is of the form: <vwi string size><utf-8 encoded
+    string>
+
+    After construction, self.records is the list of record byte strings and
+    self.strings maps every input string to its offset. The offset of a
+    string stored in record number N is N * 0x10000 plus its position inside
+    that record. Indexing the object with a string returns that offset
+    (KeyError for a string that was not passed in).
+
+    Strings longer than MAX_STRING_LENGTH are truncated before encoding.
+    '''
+
+    MAX_STRING_LENGTH = 500
+
+    def __init__(self, strings=()):
+        # Duplicates collapse into one entry; first-seen order is preserved
+        self.strings = OrderedDict((s, 0) for s in strings)
+
+        self.records = []
+        offset = 0
+        buf = BytesIO()
+        for key in tuple(self.strings.iterkeys()):
+            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
+            l = len(utf8)
+            sz_bytes = encint(l)
+            raw = sz_bytes + utf8
+            if 0xfbf8 - buf.tell() < 6 + len(raw):
+                # Records in PDB files cannot be larger than 0x10000, so we
+                # stop well before that.
+                pad = 0xfbf8 - buf.tell()
+                buf.write(b'\0' * pad)
+                self.records.append(buf.getvalue())
+                # io.BytesIO.truncate() leaves the stream position where it
+                # was, so rewind first. Otherwise tell() stays at 0xfbf8,
+                # every following string starts a new record and is written
+                # after a run of NUL padding.
+                buf.seek(0)
+                buf.truncate(0)
+                offset = len(self.records) * 0x10000
+            buf.write(raw)
+            self.strings[key] = offset
+            offset += len(raw)
+
+        # Flush whatever is left in the buffer as the final record
+        val = buf.getvalue()
+        if val:
+            self.records.append(align_block(val))
+
+    def __getitem__(self, string):
+        '''Return the CNCX offset recorded for string.'''
+        return self.strings[string]
+
+    def __bool__(self):
+        '''True when at least one record was created.'''
+        return bool(self.records)
+    __nonzero__ = __bool__
+
+    def __len__(self):
+        '''Number of records.'''
+        return len(self.records)
+
+# }}}
+
+def is_guide_ref_start(ref):
+    '''
+    Return True if the guide reference ref marks the start of the book text.
+
+    A reference qualifies when its title is 'start' or its type is one of
+    'start', 'other.start' or 'text' (both compared case-insensitively).
+    A missing (None or empty) title or type simply does not match; the title
+    is guarded the same way the type already was, instead of raising
+    AttributeError on None.
+    '''
+    if (ref.title or '').lower() == 'start':
+        return True
+    return (ref.type or '').lower() in {'start', 'other.start', 'text'}
+
--- a/src/calibre/ebooks/mobi/writer2/init.py
+++ b/src/calibre/ebooks/mobi/writer2/init.py
@ -12,5 +12,4 @@ UNCOMPRESSED = 1
 PALMDOC = 2
 HUFFDIC = 17480
 PALM_MAX_IMAGE_SIZE = 63 * 1024
-RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))

--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@ -12,56 +12,22 @@ from struct import pack
 from cStringIO import StringIO
 from collections import OrderedDict, defaultdict

-from calibre.ebooks.mobi.writer2 import RECORD_SIZE
 from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
-        encode_tbs, align_block, utf8_text)
+        encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)

-class CNCX(object): # {{{
-
-    '''
-    Create the CNCX records. These are records containing all the strings from
-    the NCX. Each record is of the form: <vwi string size><utf-8 encoded
-    string>
-    '''
-
-    MAX_STRING_LENGTH = 500
+class CNCX(CNCX_): # {{{

    def __init__(self, toc, is_periodical):
-        self.strings = OrderedDict()
-
+        strings = []
        for item in toc.iterdescendants(breadth_first=True):
-            self.strings[item.title] = 0
+            strings.append(item.title)
            if is_periodical:
-                self.strings[item.klass] = 0
+                strings.append(item.klass)
                if item.author:
-                    self.strings[item.author] = 0
+                    strings.append(item.author)
                if item.description:
-                    self.strings[item.description] = 0
-
-        self.records = []
-        offset = 0
-        buf = StringIO()
-        for key in tuple(self.strings.iterkeys()):
-            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
-            l = len(utf8)
-            sz_bytes = encint(l)
-            raw = sz_bytes + utf8
-            if 0xfbf8 - buf.tell() < 6 + len(raw):
-                # Records in PDB files cannot be larger than 0x10000, so we
-                # stop well before that.
-                pad = 0xfbf8 - buf.tell()
-                buf.write(b'\0' * pad)
-                self.records.append(buf.getvalue())
-                buf.truncate(0)
-                offset = len(self.records) * 0x10000
-            buf.write(raw)
-            self.strings[key] = offset
-            offset += len(raw)
-
-        self.records.append(align_block(buf.getvalue()))
-
-    def __getitem__(self, string):
-        return self.strings[string]
+                    strings.append(item.description)
+        CNCX_.__init__(self, strings)
 # }}}

 class TAGX(object): # {{{
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@ -16,9 +16,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer
 from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
+from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
 from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
-        align_block, detect_periodical)
+        align_block, detect_periodical, RECORD_SIZE, create_text_record)
 from calibre.ebooks.mobi.writer2.indexer import Indexer

 EXTH_CODES = {
@ -163,9 +163,7 @@ class MobiWriter(object):

    # }}}

-    # Text {{{
-
-    def generate_text(self):
+    def generate_text(self): # {{{
        self.oeb.logger.info('Serializing markup content...')
        self.serializer = Serializer(self.oeb, self.image_map,
                self.is_periodical,
@ -180,7 +178,7 @@ class MobiWriter(object):
            self.oeb.logger.info('  Compressing markup content...')

        while text.tell() < self.text_length:
-            data, overlap = self.read_text_record(text)
+            data, overlap = create_text_record(text)
            if self.compression == PALMDOC:
                data = compress_doc(data)

@ -197,57 +195,6 @@ class MobiWriter(object):
        if records_size % 4 != 0:
            self.records.append(b'\x00'*(records_size % 4))
            self.first_non_text_record_idx += 1
-
-    def read_text_record(self, text):
-        '''
-        Return a Palmdoc record of size RECORD_SIZE from the text file object.
-        In case the record ends in the middle of a multibyte character return
-        the overlap as well.
-
-        Returns data, overlap: where both are byte strings. overlap is the
-        extra bytes needed to complete the truncated multibyte character.
-        '''
-        opos = text.tell()
-        text.seek(0, 2)
-        # npos is the position of the next record
-        npos = min((opos + RECORD_SIZE, text.tell()))
-        # Number of bytes from the next record needed to complete the last
-        # character in this record
-        extra = 0
-
-        last = b''
-        while not last.decode('utf-8', 'ignore'):
-            # last contains no valid utf-8 characters
-            size = len(last) + 1
-            text.seek(npos - size)
-            last = text.read(size)
-
-        # last now has one valid utf-8 char and possibly some bytes that belong
-        # to a truncated char
-
-        try:
-            last.decode('utf-8', 'strict')
-        except UnicodeDecodeError:
-            # There are some truncated bytes in last
-            prev = len(last)
-            while True:
-                text.seek(npos - prev)
-                last = text.read(len(last) + 1)
-                try:
-                    last.decode('utf-8')
-                except UnicodeDecodeError:
-                    pass
-                else:
-                    break
-            extra = len(last) - prev
-
-        text.seek(opos)
-        data = text.read(RECORD_SIZE)
-        overlap = text.read(extra)
-        text.seek(npos)
-
-        return data, overlap
-
    # }}}

    def generate_record0(self): #  MOBI header {{{
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@ -12,6 +12,7 @@ import re
 from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
        namespace, prefixname, urlnormalize)
 from calibre.ebooks.mobi.mobiml import MBP_NS
+from calibre.ebooks.mobi.utils import is_guide_ref_start

 from collections import defaultdict
 from urlparse import urldefrag
@ -161,9 +162,7 @@ class Serializer(object):
                buf.write(b'title="')
                self.serialize_text(ref.title, quot=True)
                buf.write(b'" ')
-                if (ref.title.lower() == 'start' or
-                    (ref.type and ref.type.lower() in {'start',
-                        'other.start', 'text'})):
+                if is_guide_ref_start(ref):
                    self._start_href = ref.href
            self.serialize_href(ref.href)
            # Space required or won't work, I kid you not
--- a/src/calibre/ebooks/mobi/writer8/header.py
+++ b/src/calibre/ebooks/mobi/writer8/header.py
@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from io import BytesIO
+from collections import OrderedDict
+from struct import pack
+
+from calibre.ebooks.mobi.utils import align_block
+
+# The names below are made available to the expressions in Header.DEFINITION
+
+# A 32 bit value with all bits set
+NULL = 0xffffffff
+# zeroes(x): a byte string consisting of x null bytes
+zeroes = lambda x: b'\0'*x
+# nulls(x): a byte string consisting of x 0xff bytes
+nulls = lambda x: b'\xff'*x
+
+class Header(OrderedDict):
+
+    '''
+    An ordered mapping of field name to value that can render itself as a
+    binary header.
+
+    Subclasses describe their fields in DEFINITION, one ``name = expression``
+    per line. Blank lines and lines starting with # are ignored. A field
+    without an expression defaults to 0. Expressions can use zeroes(), nulls(),
+    NULL and DYN. A DYN field has no default and must be supplied when the
+    header is called.
+
+    POSITIONS maps the name of a field to the name of another field. When
+    rendering, the former is overwritten with the byte offset of the latter.
+    If ALIGN_BLOCK is True, the rendered header is passed through
+    align_block().
+    '''
+
+    HEADER_NAME = b''
+
+    DEFINITION = '''
+    '''
+
+    ALIGN_BLOCK = False
+    POSITIONS = {}
+
+    def __init__(self):
+        OrderedDict.__init__(self)
+        # Names that the expressions in DEFINITION are allowed to use
+        namespace = {'zeroes':zeroes, 'NULL':NULL, 'DYN':None, 'nulls':nulls}
+
+        for raw_line in self.DEFINITION.splitlines():
+            stripped = raw_line.strip()
+            if stripped and not stripped.startswith('#'):
+                field, sep, expr = stripped.partition('=')
+                field, expr = field.strip(), expr.strip()
+                default = eval(expr, namespace) if expr else 0
+                if field in self:
+                    raise ValueError('Duplicate field in definition: %r'%field)
+                self[field] = default
+
+    def __call__(self, **kwargs):
+        ''' Set the fields named in kwargs and return the rendered header as a
+        byte string. Raises KeyError for an unknown field and ValueError if a
+        dynamic field has not been set. '''
+        for field, value in kwargs.iteritems():
+            if field not in self:
+                raise KeyError('Not a valid header field: %r'%field)
+            self[field] = value
+
+        out = BytesIO()
+        out.write(bytes(self.HEADER_NAME))
+        # Offset of every field, measured from the start of the header
+        offsets = {}
+        for field, value in self.iteritems():
+            value = self.format_value(field, value)
+            offsets[field] = out.tell()
+            if value is None:
+                raise ValueError('Dynamic field %r not set'%field)
+            if isinstance(value, (int, long)):
+                # Integers are stored as 4 byte big endian unsigned numbers
+                value = pack(b'>I', value)
+            out.write(value)
+
+        # Back-patch the fields that hold the offset of another field
+        for pointer, target in self.POSITIONS.iteritems():
+            out.seek(offsets[pointer])
+            out.write(pack(b'>I', offsets[target]))
+
+        rendered = out.getvalue()
+        return align_block(rendered) if self.ALIGN_BLOCK else rendered
+
+    def format_value(self, name, val):
+        ''' Hook for subclasses to transform the value of the field name
+        before it is written. The default implementation returns val
+        unchanged. '''
+        return val
+
+
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@ -0,0 +1,332 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from future_builtins import map
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from collections import namedtuple
+from struct import pack
+from io import BytesIO
+
+from calibre.ebooks.mobi.utils import CNCX, encint, align_block
+from calibre.ebooks.mobi.writer8.header import Header
+
+# Description of a single tag in a TAGX table
+TagMeta_ = namedtuple('TagMeta',
+        'name number values_per_entry bitmask end_flag')
+# Build a TagMeta_ from a single sequence of its five fields
+TagMeta = lambda x:TagMeta_(*x)
+# The entry that terminates the tags belonging to one control byte
+EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
+
+# Map of bitmask to the number of left shifts needed to move a count into the
+# position of that mask. It covers the one bit and two bit wide masks and
+# could be extended to four bit wide masks as well.
+mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
+        128:7, 192: 6 }
+
+class IndexHeader(Header): # {{{
+
+    '''
+    The header record of an index. The fixed size part is HEADER_LENGTH bytes
+    long (including the leading INDX) and is followed by the tagx, last_index
+    and idxt fields. All fields marked DYN must be passed in when the header
+    is called. The rendered header is padded with align_block().
+    '''
+
+    HEADER_NAME = b'INDX'
+    ALIGN_BLOCK = True
+    HEADER_LENGTH = 192
+
+    DEFINITION = '''
+    # 4 - 8: Header Length
+    header_length = {header_length}
+
+    # 8 - 16: Unknown
+    unknown1 = zeroes(8)
+
+    # 16 - 20: Index type: 0 - normal 2 - inflection
+    type = 2
+
+    # 20 - 24: IDXT offset (filled in later)
+    idxt_offset
+
+    # 24 - 28: Number of index records
+    num_of_records = 1
+
+    # 28 - 32: Index encoding (65001 = utf-8)
+    encoding = 65001
+
+    # 32 - 36: Unknown
+    unknown2 = NULL
+
+    # 36 - 40: Number of Index entries
+    num_of_entries = DYN
+
+    # 40 - 44: ORDT offset
+    ordt_offset
+
+    # 44 - 48: LIGT offset
+    ligt_offset
+
+    # 48 - 52: Number of ORDT/LIGT? entries
+    num_of_ordt_entries
+
+    # 52 - 56: Number of CNCX records
+    num_of_cncx = DYN
+
+    # 56 - 180: Unknown
+    unknown3 = zeroes(124)
+
+    # 180 - 184: TAGX offset
+    tagx_offset = {header_length}
+
+    # 184 - 192: Unknown
+    unknown4 = zeroes(8)
+
+    # TAGX
+    tagx = DYN
+
+    # Last Index entry
+    last_index = DYN
+
+    # IDXT
+    idxt = DYN
+    '''.format(header_length=HEADER_LENGTH)
+
+    # idxt_offset is overwritten with the offset of the idxt field when the
+    # header is rendered
+    POSITIONS = {'idxt_offset':'idxt'}
+# }}}
+
+class Index(object): # {{{
+
+    '''
+    Base class for the indices. Subclasses define tag_types (the tags that
+    make up the TAGX table, with an EndTagTable after the tags of each control
+    byte), control_byte_count and set self.entries to a list of
+    (leading_text, tags) where tags maps tag names to a value or a sequence
+    of values. Subclasses that store strings also set self.cncx.
+
+    Calling the index renders it and returns the list of records: the index
+    header, a single index record and then the CNCX records, if any.
+    '''
+
+    control_byte_count = 1
+    cncx = CNCX()
+    tag_types = (EndTagTable,)
+
+    HEADER_LENGTH = IndexHeader.HEADER_LENGTH
+
+    @classmethod
+    def generate_tagx(cls):
+        ''' Return the TAGX table: every tag contributes four bytes (number,
+        values per entry, bitmask and end flag). '''
+        header = b'TAGX'
+        byts = bytearray()
+        for tag_meta in cls.tag_types:
+            byts.extend(tag_meta[1:])
+        # table length, control byte count
+        header += pack(b'>II', 12+len(byts), cls.control_byte_count)
+        return header + bytes(byts)
+
+    @classmethod
+    def calculate_control_bytes_for_each_entry(cls, entries):
+        ''' Return a list containing, for every entry, the list of its control
+        bytes. A control byte records how many values the entry has for each
+        of the tags that precede the corresponding EndTagTable. Raises
+        ValueError if the number of control bytes generated for an entry is
+        not control_byte_count. '''
+        control_bytes = []
+        for lead_text, tags in entries:
+            cbs = []
+            ans = 0
+            for (name, number, vpe, mask, endi) in cls.tag_types:
+                if endi == 1:
+                    # End of the tags for this control byte
+                    cbs.append(ans)
+                    ans = 0
+                    continue
+                try:
+                    nvals = len(tags.get(name, ()))
+                except TypeError:
+                    # A single value that is not a sequence
+                    nvals = 1
+                nentries = nvals // vpe
+                shifts = mask_to_bit_shifts[mask]
+                ans |= mask & (nentries << shifts)
+            if len(cbs) != cls.control_byte_count:
+                raise ValueError('The entry %r is invalid'%[lead_text, tags])
+            control_bytes.append(cbs)
+        return control_bytes
+
+    def __call__(self):
+        ''' Render the index and return (and store in self.records) the list
+        of records. Raises ValueError if the entries do not fit into a single
+        index record or if a tag value cannot be encoded. '''
+        self.control_bytes = self.calculate_control_bytes_for_each_entry(
+                self.entries)
+
+        rendered_entries = []
+        index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
+        IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+        last_lead_text = b''
+        too_large = ValueError('Index has too many entries, calibre does not'
+                    ' support generating multiple index records at this'
+                    ' time.')
+
+        for i, x in enumerate(self.entries):
+            control_bytes = self.control_bytes[i]
+            leading_text, tags = x
+            buf.seek(0), buf.truncate(0)
+            leading_text = (leading_text.encode('utf-8') if
+                    isinstance(leading_text, unicode) else leading_text)
+            # The entry starts with the leading text, prefixed by its length
+            # as a single byte
+            raw = bytearray(leading_text)
+            raw.insert(0, len(leading_text))
+            buf.write(bytes(raw))
+            buf.write(bytes(bytearray(control_bytes)))
+            for tag in self.tag_types:
+                values = tags.get(tag.name, None)
+                if values is None: continue
+                try:
+                    len(values)
+                except TypeError:
+                    values = [values]
+                if values:
+                    for val in values:
+                        try:
+                            buf.write(encint(val))
+                        except ValueError:
+                            raise ValueError('Invalid values for %r: %r'%(
+                                tag, values))
+            raw = buf.getvalue()
+            offset = index.tell()
+            if offset + self.HEADER_LENGTH >= 0x10000:
+                raise too_large
+            rendered_entries.append(IndexEntry(offset, len(raw), raw))
+            # The IDXT stores the offset of every entry from the start of the
+            # index record
+            idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
+            index.write(raw)
+            last_lead_text = leading_text
+
+        index_block = align_block(index.getvalue())
+        idxt_block = align_block(b'IDXT' + idxt.getvalue())
+        body = index_block + idxt_block
+        if len(body) + self.HEADER_LENGTH >= 0x10000:
+            raise too_large
+        header = b'INDX'
+        # truncate() on an io.BytesIO does not move the stream position. buf
+        # is still positioned at the end of the last rendered entry, so we
+        # must seek to the start, otherwise the header would be preceded by
+        # null bytes.
+        buf.seek(0), buf.truncate(0)
+        buf.write(pack(b'>I', self.HEADER_LENGTH))
+        buf.write(b'\0'*4) # Unknown
+        buf.write(pack(b'>I', 1)) # Header type? Or index record number?
+        buf.write(b'\0'*4) # Unknown
+
+        # IDXT block offset
+        buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
+
+        # Number of index entries
+        buf.write(pack(b'>I', len(rendered_entries)))
+
+        buf.write(b'\xff'*8) # Unknown
+
+        buf.write(b'\0'*156) # Unknown
+
+        header += buf.getvalue()
+        index_record = header + body
+
+        tagx = self.generate_tagx()
+        idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
+                b'\0')
+        # Last index
+        idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
+        idx += pack(b'>H', len(rendered_entries))
+
+        header = {
+                'num_of_entries': len(rendered_entries),
+                'num_of_cncx': len(self.cncx),
+                'tagx':tagx,
+                'last_index':align_block(idx),
+                'idxt':idxt
+        }
+        header = IndexHeader()(**header)
+        self.records = [header, index_record]
+        self.records.extend(self.cncx.records)
+        return self.records
+# }}}
+
+class SkelIndex(Index):
+
+    ''' The index of skeletons. Every entry is keyed by the name of the
+    skeleton and stores its chunk count and geometry (start position and
+    length). '''
+
+    tag_types = tuple(map(TagMeta, (
+        ('chunk_count', 1, 1, 3, 0),
+        ('geometry',    6, 2, 12, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, skel_table):
+        self.entries = []
+        for skel in skel_table:
+            # For reasons unknown, every value has to be present twice
+            tags = {
+                'chunk_count': (skel.chunk_count,) * 2,
+                'geometry': (skel.start_pos, skel.length) * 2,
+            }
+            self.entries.append((skel.name, tags))
+
+
+class ChunkIndex(Index):
+
+    ''' The index of chunks. Every entry is keyed by the zero padded insert
+    position of the chunk. The selector of every chunk is stored in the CNCX
+    and referenced by its offset. '''
+
+    tag_types = tuple(map(TagMeta, (
+        ('cncx_offset',     2, 1, 1, 0),
+        ('file_number',     3, 1, 2, 0),
+        ('sequence_number', 4, 1, 4, 0),
+        ('geometry',        6, 2, 8, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, chunk_table):
+        self.cncx = CNCX(chunk.selector for chunk in chunk_table)
+
+        self.entries = []
+        for chunk in chunk_table:
+            key = '%010d'%chunk.insert_pos
+            tags = {
+                'cncx_offset': self.cncx[chunk.selector],
+                'file_number': chunk.file_number,
+                'sequence_number': chunk.sequence_number,
+                'geometry': (chunk.start_pos, chunk.length),
+            }
+            self.entries.append((key, tags))
+
+class GuideIndex(Index):
+
+    ''' The index of guide references. Every entry is keyed by the type of
+    the reference. The titles are stored in the CNCX and referenced by their
+    offsets. '''
+
+    tag_types = tuple(map(TagMeta, (
+        ('title',           1, 1, 1, 0),
+        ('pos_fid',         6, 2, 2, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, guide_table):
+        self.cncx = CNCX(ref.title for ref in guide_table)
+
+        self.entries = []
+        for ref in guide_table:
+            tags = {
+                'title': self.cncx[ref.title],
+                'pos_fid': ref.pos_fid,
+            }
+            self.entries.append((ref.type, tags))
+
+
+class NCXIndex(Index):
+
+    ''' The index of Table of Contents entries. It is built from a list of
+    dictionaries, each of which must have the keys index and label and can
+    have any of the other keys handled in __init__. The label, description
+    and author are stored in the CNCX and referenced by their offsets. '''
+
+    control_byte_count = 2
+    tag_types = tuple(map(TagMeta, (
+        ('offset',             1, 1, 1, 0),
+        ('length',             2, 1, 2, 0),
+        ('label',              3, 1, 4, 0),
+        ('depth',              4, 1, 8, 0),
+        ('parent',             21, 1, 16, 0),
+        ('first_child',        22, 1, 32, 0),
+        ('last_child',         23, 1, 64, 0),
+        ('pos_fid',            6, 2, 128, 0),
+        EndTagTable,
+        ('image',              69, 1, 1, 0),
+        ('description',        70, 1, 2, 0),
+        ('author',             71, 1, 4, 0),
+        ('caption',            72, 1, 8, 0),
+        ('attribution',        73, 1, 16, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, toc_table):
+        # Collect all the strings that have to go into the CNCX
+        strings = []
+        for toc_entry in toc_table:
+            strings.append(toc_entry['label'])
+            for optional in ('author', 'description'):
+                text = toc_entry.get(optional, None)
+                if text:
+                    strings.append(text)
+        self.cncx = CNCX(strings)
+
+        # Fields that are copied as is
+        plain_fields = ('offset', 'length', 'depth', 'pos_fid', 'parent',
+                'first_child', 'last_child')
+        # Fields that are replaced by their offset in the CNCX
+        string_fields = ('label', 'description', 'author')
+
+        def to_entry(x):
+            tags = {f: x[f] for f in plain_fields if f in x}
+            tags.update((f, self.cncx[x[f]]) for f in string_fields if f in x)
+            return ('%02x'%x['index'], tags)
+
+        self.entries = [to_entry(x) for x in toc_table]
+
+
+
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@ -9,42 +9,56 @@ __docformat__ = 'restructuredtext en'

 import copy
 from functools import partial
-from collections import defaultdict
+from collections import defaultdict, namedtuple
+from io import BytesIO
+from struct import pack

 import cssutils
 from lxml import etree

 from calibre import isbytestring, force_unicode
-from calibre.ebooks.mobi.utils import to_base
+from calibre.ebooks.mobi.utils import (create_text_record, to_base,
+        is_guide_ref_start)
+from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
        extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
-from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags
+from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
+from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
+        ChunkIndex, GuideIndex)

 XML_DOCS = OEB_DOCS | {SVG_MIME}

 # References to record numbers in KF8 are stored as base-32 encoded integers,
 # with 4 digits
 to_ref = partial(to_base, base=32, min_num_digits=4)
-# References in links are stored with 10 digits
-to_href = partial(to_base, base=32, min_num_digits=10)

 class KF8Writer(object):

    def __init__(self, oeb, opts, resources):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
+        self.compress = not self.opts.dont_compress
        self.log.info('Creating KF8 output')
        self.used_images = set()
        self.resources = resources
-        self.dup_data()
        self.flows = [None] # First flow item is reserved for the text
+        self.records = []

+        self.log('\tGenerating KF8 markup...')
+        self.dup_data()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
+        # Dump the cloned data as it is no longer needed
+        del self._data_cache
+        self.create_text_records()
+        self.log('\tCreating indices...')
+        self.create_fdst_records()
+        self.create_indices()
+        self.create_guide()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
@ -199,7 +213,137 @@ class KF8Writer(object):
                    j += 1

    def chunk_it_up(self):
-        chunker = Chunker(self.oeb, self.data)
-        chunker
+        placeholder_map = {}
+        for placeholder, x in self.link_map.iteritems():
+            href, frag = x
+            aid = self.id_map.get(x, None)
+            if aid is None:
+                aid = self.id_map.get((href, ''))
+            placeholder_map[placeholder] = aid
+        chunker = Chunker(self.oeb, self.data, placeholder_map)

+        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
+            setattr(self, x, getattr(chunker, x))
+
+        self.flows[0] = chunker.text
+
+    def create_text_records(self):
+        ''' Encode all flows as UTF-8, join them and split the result into
+        (optionally compressed) text records, which are appended to
+        self.records. Also sets self.text_length, self.last_text_record_idx
+        and self.first_non_text_record_idx. '''
+        encoded = []
+        for flow in self.flows:
+            if isinstance(flow, unicode):
+                flow = flow.encode('utf-8')
+            encoded.append(flow)
+        self.flows = encoded
+
+        joined = b''.join(encoded)
+        self.text_length = len(joined)
+        stream = BytesIO(joined)
+        count = total = 0
+
+        if self.compress:
+            self.oeb.logger.info('\tCompressing markup...')
+
+        while stream.tell() < self.text_length:
+            payload, overlap = create_text_record(stream)
+            if self.compress:
+                payload = compress_doc(payload)
+
+            # Every record ends with the overlap bytes followed by a single
+            # byte holding the number of overlap bytes
+            record = payload + overlap + pack(b'>B', len(overlap))
+
+            self.records.append(record)
+            total += len(record)
+            count += 1
+
+        self.last_text_record_idx = count
+        self.first_non_text_record_idx = count + 1
+        # Pad so that the next records starts at a 4 byte boundary
+        # NOTE(review): this appends (total % 4) bytes, which only results in
+        # a multiple of 4 when the remainder is 2 -- confirm this is intended
+        remainder = total % 4
+        if remainder != 0:
+            self.records.append(b'\x00'*remainder)
+            self.first_non_text_record_idx += 1
+
+    def create_fdst_records(self):
+        ''' Create the FDST record, which stores the (start, end) offsets of
+        every flow in the joined text. Sets self.fdst_table to the list of
+        offsets and self.fdst_records to a list containing the single
+        record. '''
+        FDST = namedtuple('Flow', 'start end')
+        entries = []
+        self.fdst_table = []
+        for i, flow in enumerate(self.flows):
+            # Every flow starts where the previous one ended
+            start = 0 if i == 0 else self.fdst_table[-1].end
+            self.fdst_table.append(FDST(start, start + len(flow)))
+            entries.extend(self.fdst_table[-1])
+        # The header is: FDST, the offset at which the table of flows starts
+        # (12, i.e. directly after the header) and then the number of flows.
+        rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
+                pack(b'>%dL'%len(entries), *entries))
+        self.fdst_records = [rec]
+
+    def create_indices(self):
+        ''' Create the SKEL, chunk and NCX index records, stored in
+        self.skel_records, self.chunk_records and self.ncx_records. If the
+        ToC has fewer than two entries, no NCX index is created and
+        self.ncx_records is left empty. '''
+        self.skel_records = SkelIndex(self.skel_table)()
+        self.chunk_records = ChunkIndex(self.chunk_table)()
+        self.ncx_records = []
+        toc = self.oeb.toc
+        max_depth = toc.depth()
+        entries = []
+        is_periodical = self.opts.mobi_periodical
+        if toc.count() < 2:
+            self.log.warn('Document has no ToC, MOBI will have no NCX index')
+            return
+
+        # Flatten the ToC into a depth first list
+        fl = toc.iter() if is_periodical else toc.iterdescendants()
+        for i, item in enumerate(fl):
+            entry = {'index':i, 'depth': max_depth - item.depth() - (0 if
+                is_periodical else 1), 'href':item.href, 'label':(item.title or
+                    _('Unknown'))}
+            entries.append(entry)
+            # Since parents come before their children in the list, the
+            # children can pick up the entry of their parent when we get to
+            # them
+            for child in item:
+                child.ncx_parent = entry
+            p = getattr(item, 'ncx_parent', None)
+            if p is not None:
+                entry['parent'] = p['index']
+            if is_periodical:
+                if item.author:
+                    entry['author'] = item.author
+                if item.description:
+                    entry['description'] = item.description
+
+        # Group the entries by parent in a single pass, instead of scanning
+        # all entries for every entry. The order of the children is preserved.
+        children_of = defaultdict(list)
+        for entry in entries:
+            if 'parent' in entry:
+                children_of[entry['parent']].append(entry)
+
+        for entry in entries:
+            children = children_of.get(entry['index'])
+            if children:
+                entry['first_child'] = children[0]['index']
+                entry['last_child'] = children[-1]['index']
+            href = entry.pop('href')
+            href, frag = href.partition('#')[0::2]
+            aid = self.id_map.get((href, frag), None)
+            if aid is None:
+                # Fall back to the start of the file
+                aid = self.id_map.get((href, ''), None)
+            if aid is None:
+                pos, fid = 0, 0
+            else:
+                pos, fid = self.aid_offset_map[aid]
+            chunk = self.chunk_table[pos]
+            offset = chunk.insert_pos + fid
+            length = chunk.length
+            entry['pos_fid'] = (pos, fid)
+            entry['offset'] = offset
+            entry['length'] = length
+
+        self.ncx_records = NCXIndex(entries)()
+
+    def create_guide(self):
+        ''' Create the guide index records from the guide of the book.
+        References whose target cannot be found are skipped. Sets
+        self.guide_table, self.guide_records and self.start_offset (which
+        remains None if there is no usable start reference). '''
+        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
+        self.start_offset = None
+        self.guide_table = []
+        self.guide_records = []
+
+        for ref in self.oeb.guide.values():
+            href, sep, frag = ref.href.partition('#')
+            # Try the exact target first, then the start of the file
+            for key in ((href, frag), (href, '')):
+                aid = self.id_map.get(key)
+                if aid is not None:
+                    break
+            if aid is None:
+                continue
+
+            pos, fid = self.aid_offset_map[aid]
+            # If fid != 0 then we cannot represent the start position as a
+            # single number in the EXTH header, so we do not write it to
+            # EXTH
+            if is_guide_ref_start(ref) and fid == 0:
+                self.start_offset = pos
+            title = ref.title or _('Unknown')
+            self.guide_table.append(GuideRef(title, ref.type, (pos, fid)))
+
+        if self.guide_table:
+            self.guide_records = GuideIndex(self.guide_table)()

--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@ -9,13 +9,19 @@ __docformat__ = 'restructuredtext en'

 import re
 from collections import namedtuple
+from functools import partial

 from lxml import etree

 from calibre.ebooks.oeb.base import XHTML_NS
+from calibre.constants import ispy3
+from calibre.ebooks.mobi.utils import to_base

 CHUNK_SIZE = 8192

+# References in links are stored with 10 digits
+to_href = partial(to_base, base=32, min_num_digits=10)
+
 # Tags to which kindlegen adds the aid attribute
 aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
 'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
@ -48,13 +54,36 @@ def node_from_path(root, path):
        parent = parent[idx]
    return parent

+# Function that converts a codepoint to a unicode character
+mychr = chr if ispy3 else unichr
+
+def tostring(raw, **kwargs):
+    ''' Serialize raw to a byte string in the specified encoding (UTF-8 by
+    default), optionally with an XML declaration.
+
+    lxml *sometimes* represents non-ascii characters as hex entities in
+    attribute values. I can't figure out exactly what circumstances cause it.
+    It seems to happen when serializing a part of a larger tree. Since we need
+    serialization to be the same when serializing full and partial trees, we
+    manually replace all hex entities with their unicode codepoints. '''
+
+    want_declaration = kwargs.pop('xml_declaration', False)
+    target_encoding = kwargs.pop('encoding', 'UTF-8')
+    # Always serialize to unicode, the declaration (if any) is added by hand
+    kwargs.update(encoding=unicode, xml_declaration=False)
+    serialized = etree.tostring(raw, **kwargs)
+    if want_declaration:
+        declaration = '<?xml version="1.0" encoding="%s"?>\n'%target_encoding
+        serialized = declaration + serialized
+
+    def entity_to_char(match):
+        return mychr(int(match.group(1), 16))
+
+    replaced = re.sub(r'&#x([0-9A-Fa-f]+);', entity_to_char, serialized)
+    return replaced.encode(target_encoding)
+
 class Chunk(object):

-    def __init__(self, raw):
+    def __init__(self, raw, parent_tag):
        self.raw = raw
        self.starts_tags = []
        self.ends_tags = []
        self.insert_pos = None
+        self.parent_tag = parent_tag
+        self.parent_is_body = False
+        self.is_last_chunk = False
+        self.is_first_chunk = False

    def __len__(self):
        return len(self.raw)
@ -63,6 +92,17 @@ class Chunk(object):
        self.raw += chunk.raw
        self.ends_tags = chunk.ends_tags

+    def __repr__(self):
+        ''' A short description of this chunk, for debugging '''
+        fields = (len(self.raw), self.insert_pos, self.starts_tags,
+                self.ends_tags)
+        return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%fields
+
+    @property
+    def selector(self):
+        ''' The selector for this chunk: S for the last chunk of a parent
+        that is not the body, P otherwise, followed by an expression matching
+        the aid of the parent tag. '''
+        if self.is_last_chunk and not self.parent_is_body:
+            typ = 'S'
+        else:
+            typ = 'P'
+        return "%s-//*[@aid='%s']"%(typ, self.parent_tag)
+
+    __str__ = __repr__
+
 class Skeleton(object):

    def __init__(self, file_number, item, root, chunks):
@ -76,8 +116,8 @@ class Skeleton(object):
        self.calculate_insert_positions()

    def render(self, root):
-        raw = etree.tostring(root, encoding='UTF-8', xml_declaration=True)
-        raw = raw.replace('<html', '<html xmlns="%s"'%XHTML_NS, 1)
+        raw = tostring(root, xml_declaration=True)
+        raw = raw.replace(b'<html', bytes('<html xmlns="%s"'%XHTML_NS), 1)
        return raw

    def calculate_metrics(self, root):
@ -85,8 +125,7 @@ class Skeleton(object):
        self.metrics = {}
        for tag in root.xpath('//*[@aid]'):
            text = (tag.text or '').encode('utf-8')
-            raw = etree.tostring(tag, encoding='UTF-8', with_tail=True,
-                    xml_declaration=False)
+            raw = tostring(tag, with_tail=True)
            start_length = len(raw.partition(b'>')[0]) + len(text) + 1
            end_length = len(raw.rpartition(b'<')[-1]) + 1
            self.metrics[tag.get('aid')] = Metric(start_length, end_length)
@ -101,19 +140,45 @@ class Skeleton(object):
            for tag in chunk.ends_tags:
                pos += self.metrics[tag].end

+    def rebuild(self):
+        ''' Return the skeleton with the raw data of every chunk spliced in
+        at that chunk's insert_pos, processing the chunks in order. '''
+        rebuilt = self.skeleton
+        for chunk in self.chunks:
+            pos = chunk.insert_pos
+            head, tail = rebuilt[:pos], rebuilt[pos:]
+            rebuilt = head + chunk.raw + tail
+        return rebuilt
+
+    def __len__(self):
+        ''' Combined length of the skeleton and the raw data of all chunks. '''
+        total = len(self.skeleton)
+        for chunk in self.chunks:
+            total += len(chunk.raw)
+        return total
+
+    @property
+    def raw_text(self):
+        ''' The skeleton followed by the raw data of each chunk, in order,
+        joined into a single bytestring. '''
+        parts = [self.skeleton]
+        parts.extend([chunk.raw for chunk in self.chunks])
+        return b''.join(parts)
+
 class Chunker(object):

-    def __init__(self, oeb, data_func):
+    def __init__(self, oeb, data_func, placeholder_map):
        self.oeb, self.log = oeb, oeb.log
        self.data = data_func
+        self.placeholder_map = placeholder_map

        self.skeletons = []

+        # Set this to a list to enable dumping of the original and rebuilt
+        # html files for debugging
+        orig_dumps = None
+
        for i, item in enumerate(self.oeb.spine):
            root = self.remove_namespaces(self.data(item))
            body = root.xpath('//body')[0]
            body.tail = '\n'

+            if orig_dumps is not None:
+                orig_dumps.append(tostring(root, xml_declaration=True,
+                    with_tail=True))
+                orig_dumps[-1] = close_self_closing_tags(
+                        orig_dumps[-1].replace(b'<html',
+                        bytes('<html xmlns="%s"'%XHTML_NS), 1))
+
            # First pass: break up document into rendered strings of length no
            # more than CHUNK_SIZE
            chunks = []
@ -128,6 +193,18 @@ class Chunker(object):
            # for all chunks
            self.skeletons.append(Skeleton(i, item, root, chunks))

+        if orig_dumps:
+            self.dump(orig_dumps)
+
+        # Create the SKEL and Chunk tables
+        self.skel_table = []
+        self.chunk_table = []
+        self.create_tables()
+
+        # Set internal links
+        text = b''.join(x.raw_text for x in self.skeletons)
+        self.text = self.set_internal_links(text)
+
    def remove_namespaces(self, root):
        lang = None
        for attr, val in root.attrib.iteritems():
@ -160,34 +237,33 @@ class Chunker(object):

        return nroot

-
    def step_into_tag(self, tag, chunks):
        aid = tag.get('aid')
+        is_body = tag.tag == 'body'

        first_chunk_idx = len(chunks)

        # First handle any text
        if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
-            chunks.extend(self.chunk_up_text(tag.text))
+            chunks.extend(self.chunk_up_text(tag.text, aid))
            tag.text = None

        # Now loop over children
        for child in list(tag):
-            raw = etree.tostring(child, encoding='UTF-8',
-                    xml_declaration=False, with_tail=False)
+            raw = tostring(child, with_tail=False)
            raw = close_self_closing_tags(raw)
            if len(raw) > CHUNK_SIZE and child.get('aid', None):
                self.step_into_tag(child, chunks)
                if child.tail and child.tail.strip(): # Leave pure whitespace
-                    chunks.extend(self.chunk_up_text(child.tail))
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
                    child.tail = None
            else:
                if len(raw) > CHUNK_SIZE:
                    self.log.warn('Tag %s has no aid and a too large chunk'
                            ' size. Adding anyway.'%child.tag)
-                chunks.append(Chunk(raw))
+                chunks.append(Chunk(raw, aid))
                if child.tail:
-                    chunks.extend(self.chunk_up_text(child.tail))
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
                tag.remove(child)

        if len(chunks) <= first_chunk_idx and chunks:
@ -197,8 +273,15 @@ class Chunker(object):
        if chunks:
            chunks[first_chunk_idx].starts_tags.append(aid)
            chunks[-1].ends_tags.append(aid)
+            my_chunks = chunks[first_chunk_idx:]
+            if my_chunks:
+                my_chunks[0].is_first_chunk = True
+                my_chunks[-1].is_last_chunk = True
+                if is_body:
+                    for chunk in my_chunks:
+                        chunk.parent_is_body = True

-    def chunk_up_text(self, text):
+    def chunk_up_text(self, text, parent_tag):
        text = text.encode('utf-8')
        ans = []

@ -214,7 +297,7 @@ class Chunker(object):
        while rest:
            start, rest = split_multibyte_text(rest)
            ans.append(b'<span class="AmznBigTextBlock">' + start + '</span>')
-        return [Chunk(x) for x in ans]
+        return [Chunk(x, parent_tag) for x in ans]

    def merge_small_chunks(self, chunks):
        ans = chunks[:1]
@ -230,3 +313,99 @@ class Chunker(object):
                prev.merge(chunk)
        return ans

+    def create_tables(self):
+        ''' Populate self.skel_table and self.chunk_table from
+        self.skeletons.
+
+        As a side effect every skeleton gets a start_pos attribute: the sum
+        of the lengths (skeleton plus chunks) of all skeletons before it.
+        Rows are appended to self.chunk_table, so it must already be a
+        list. '''
+        Skel = namedtuple('Skel',
+                'file_number name chunk_count start_pos length')
+        sp = 0
+        for s in self.skeletons:
+            s.start_pos = sp
+            sp += len(s)
+        # Iterate with ``s`` itself: the row fields are read from the
+        # comprehension variable and not from a name left over from the loop
+        # above, so there is one distinct row per skeleton.
+        self.skel_table = [Skel(s.file_number, 'SKEL%010d'%s.file_number,
+            len(s.chunks), s.start_pos, len(s.skeleton)) for s in self.skeletons]
+
+        Chunk = namedtuple('Chunk',
+            'insert_pos selector file_number sequence_number start_pos length')
+        # num is the sequence number of the chunk across all skeletons
+        num = 0
+        for skel in self.skeletons:
+            # cp is a running position that starts at the skeleton's start_pos
+            # and advances by the length of each chunk
+            cp = skel.start_pos
+            for chunk in skel.chunks:
+                self.chunk_table.append(
+                    Chunk(chunk.insert_pos + skel.start_pos, chunk.selector,
+                        skel.file_number, num, cp, len(chunk.raw)))
+                cp += len(chunk.raw)
+                num += 1
+
+    def set_internal_links(self, text):
+        ''' Update the internal link placeholders to point to the correct
+        location, based on the chunk table.
+
+        :param text: Bytestring that is scanned for tags carrying an aid
+                     attribute and for kindle:pos:fid placeholders.
+        :return: text with every placeholder that has an entry in
+                 self.placeholder_map replaced by its resolved link.
+                 Placeholders without an entry are left unchanged.
+
+        Raises ValueError if no entry of self.chunk_table contains or follows
+        the offset of an aid. A KeyError propagates if self.placeholder_map
+        names an aid that was not found in text. As a side effect the
+        aid -> (sequence_number, offset) map is stored in
+        self.aid_offset_map. '''
+        # A kindle:pos:fid link contains two base 32 numbers of the form
+        # XXXX:YYYYYYYYYY
+        # The first number is an index into the chunk table and the second is
+        # an offset from the start of the chunk to the start of the tag pointed
+        # to by the link.
+        aid_map = {} # Map of aid to (pos, fid)
+        for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
+            # offset is the position in text of the '<' that opens the tag
+            offset = match.start()
+            pos_fid = None
+            # Take the first chunk table entry whose range contains the
+            # offset, or failing that the first entry that starts after it
+            for chunk in self.chunk_table:
+                if chunk.insert_pos <= offset < chunk.insert_pos + chunk.length:
+                    pos_fid = (chunk.sequence_number, offset-chunk.insert_pos)
+                    break
+                if chunk.insert_pos > offset:
+                    # This aid is in the skeleton, not in a chunk, so we use
+                    # the chunk immediately after
+                    pos_fid = (chunk.sequence_number, 0)
+                    break
+            if pos_fid is None:
+                raise ValueError('Could not find chunk for aid: %r'%
+                        match.group(1))
+            aid_map[match.group(1)] = pos_fid
+
+        self.aid_offset_map = aid_map
+
+        def to_placeholder(aid):
+            # to_base and to_href are defined outside this method; presumably
+            # they render the two numbers in the base 32 form described above
+            # (TODO confirm)
+            pos, fid = aid_map[aid]
+            pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
+            return bytes(':'.join((pos, fid)))
+
+        # self.placeholder_map maps a placeholder to an aid; resolve every aid
+        # to its final XXXX:YYYYYYYYYY text up front
+        placeholder_map = {bytes(k):to_placeholder(v) for k, v in
+                self.placeholder_map.iteritems()}
+
+        # Now update the links
+        def sub(match):
+            raw = match.group()
+            pl = match.group(1)
+            try:
+                # The last 15 bytes of the match are the dummy '0000:' plus
+                # ten digits; swap them for the resolved pos:fid pair
+                return raw[:-15] + placeholder_map[pl]
+            except KeyError:
+                # Not one of our placeholders, leave the link as it is
+                pass
+            return raw
+
+        return re.sub(br'<[^>]+(kindle:pos:fid:0000:\d{10})', sub, text)
+
+    def dump(self, orig_dumps):
+        ''' Debugging aid. Writes orig_dumps[i] and the rebuilt HTML of
+        skeleton i to the orig and rebuilt sub-directories of a "skeleton"
+        directory inside the system temporary directory, replacing that
+        directory if it already exists. Raises ValueError if any pair of
+        files differs, otherwise logs that they are identical. '''
+        import tempfile, shutil, os
+        join = os.path.join
+        tdir = join(tempfile.gettempdir(), 'skeleton')
+        self.log('Skeletons dumped to:', tdir)
+        if os.path.exists(tdir):
+            shutil.rmtree(tdir)
+        orig, rebuilt = join(tdir, 'orig'), join(tdir, 'rebuilt')
+        os.makedirs(orig)
+        os.makedirs(rebuilt)
+        mismatch = False
+        for i, skeleton in enumerate(self.skeletons):
+            oraw, rraw = orig_dumps[i], skeleton.rebuild()
+            name = '%04d.html'%i
+            # Write the original first, then the rebuilt version
+            for dest, data in ((orig, oraw), (rebuilt, rraw)):
+                with open(join(dest, name),  'wb') as f:
+                    f.write(data)
+            if oraw != rraw:
+                mismatch = True
+        if not mismatch:
+            self.log('Skeleton HTML before and after is identical.')
+            return
+        raise ValueError('The before and after HTML differs. Run a diff '
+                'tool on the orig and rebuilt directories')
+
+
--- a/src/calibre/gui2/convert/search_and_replace.ui
+++ b/src/calibre/gui2/convert/search_and_replace.ui
@ -137,7 +137,7 @@
     <item>
      <widget class="QPushButton" name="sr_load">
       <property name="toolTip">
-        <string>Load a listof expression from a previously saved file</string>
+        <string>Load a list of expressions from a previously saved file</string>
       </property>
       <property name="text">
        <string>&amp;Load</string>
@ -147,7 +147,7 @@
     <item>
      <widget class="QPushButton" name="sr_save">
       <property name="toolTip">
-        <string>Save this list of expression so that you can re-use it easily</string>
+        <string>Save this list of expressions so that you can re-use it easily</string>
       </property>
       <property name="text">
        <string>&amp;Save</string>
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@ -228,8 +228,8 @@ class PostInstall:
            from calibre.utils.smtp import option_parser as smtp_op
            from calibre.library.server.main import option_parser as serv_op
            from calibre.ebooks.epub.fix.main import option_parser as fix_op
-            any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
-                'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2', 'odt', 'lrf', 'snb']
+            from calibre.ebooks import BOOK_EXTENSIONS
+            input_formats = sorted(all_input_formats())
            bc = os.path.join(os.path.dirname(self.opts.staging_sharedir),
                'bash-completion')
            if os.path.exists(bc):
@ -249,11 +249,11 @@ class PostInstall:
            self.info('Installing bash completion to', f)
            with open(f, 'wb') as f:
                f.write('# calibre Bash Shell Completion\n')
-                f.write(opts_and_exts('calibre', guiop, any_formats))
+                f.write(opts_and_exts('calibre', guiop, BOOK_EXTENSIONS))
                f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf']))
                f.write(opts_and_exts('ebook-meta', metaop, list(meta_filetypes())))
                f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
-                f.write(opts_and_exts('ebook-viewer', viewer_op, any_formats))
+                f.write(opts_and_exts('ebook-viewer', viewer_op, input_formats))
                f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
                f.write(opts_and_words('calibre-smtp', smtp_op, []))
                f.write(opts_and_words('calibre-server', serv_op, []))
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -29,7 +29,7 @@ It can convert every input format in the following list, to every output format.
    PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers.
    PDB is also a generic format. |app| supports eReder, Plucker, PML and zTxt PDB files.
    DJVU support is only for converting DJVU files that contain embedded text. These are typically generated by OCR software.
-    MOBI books can be of two types Mobi6 and KF8. |app| currently fully supports Mobi6 and supports conversion from, but not to, KF8
+    MOBI books can be of two types, Mobi6 and KF8. |app| fully supports both. MOBI files often have .azw or .azw3 file extensions.

 .. _best-source-formats:

--- a/src/calibre/translations/calibre.pot
+++ b/src/calibre/translations/calibre.pot
@ -4,9 +4,9 @@
 #
 msgid ""
 msgstr ""
-"Project-Id-Version: calibre 0.8.47\n"
-"POT-Creation-Date: 2012-04-13 09:24+IST\n"
-"PO-Revision-Date: 2012-04-13 09:24+IST\n"
+"Project-Id-Version: calibre 0.8.48\n"
+"POT-Creation-Date: 2012-04-20 14:41+IST\n"
+"PO-Revision-Date: 2012-04-20 14:41+IST\n"
 "Last-Translator: Automatically generated\n"
 "Language-Team: LANGUAGE\n"
 "MIME-Version: 1.0\n"
@ -24,8 +24,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/db/cache.py:106
 #: /home/kovid/work/calibre/src/calibre/db/cache.py:109
 #: /home/kovid/work/calibre/src/calibre/db/cache.py:120
-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:317
-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:318
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:319
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:320
 #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:100
 #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:101
 #: /home/kovid/work/calibre/src/calibre/devices/jetbook/driver.py:74
@ -36,9 +36,9 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/nook/driver.py:71
 #: /home/kovid/work/calibre/src/calibre/devices/prs500/books.py:267
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:661
-#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:337
-#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:338
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:493
+#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:347
+#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:348
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:496
 #: /home/kovid/work/calibre/src/calibre/ebooks/chm/metadata.py:57
 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/chm_input.py:109
 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/chm_input.py:112
@ -109,7 +109,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/ebooks/mobi/writer2/indexer.py:497
 #: /home/kovid/work/calibre/src/calibre/ebooks/odt/input.py:168
 #: /home/kovid/work/calibre/src/calibre/ebooks/odt/input.py:170
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:836
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:850
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:353
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:356
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:360
@ -183,14 +183,15 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:580
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:2087
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:2241
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3303
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:2657
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:3305
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3442
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3307
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3444
 #: /home/kovid/work/calibre/src/calibre/library/server/content.py:250
 #: /home/kovid/work/calibre/src/calibre/library/server/content.py:251
 #: /home/kovid/work/calibre/src/calibre/library/server/mobile.py:245
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:157
 #: /home/kovid/work/calibre/src/calibre/library/server/opds.py:160
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:163
 #: /home/kovid/work/calibre/src/calibre/library/server/xml.py:79
 #: /home/kovid/work/calibre/src/calibre/utils/localization.py:162
 #: /home/kovid/work/calibre/src/calibre/utils/podofo/__init__.py:46
@ -894,15 +895,15 @@ msgstr ""
 msgid "Communicate with Android phones."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:167
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:168
 msgid "Comma separated list of directories to send e-books to on the device. The first one that exists will be used"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:257
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:259
 msgid "Communicate with S60 phones."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:276
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:278
 msgid "Communicate with WebOS tablets."
 msgstr ""

@ -1002,8 +1003,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/nook/driver.py:102
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:448
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:471
-#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:558
-#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:577
+#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:568
+#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:587
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1051
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1057
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1092
@ -1013,7 +1014,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/tag_browser/model.py:1165
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:346
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:359
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3160
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3162
 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:187
 msgid "News"
 msgstr ""
@ -1021,8 +1022,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/apple/driver.py:2727
 #: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi.py:65
 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi.py:65
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3116
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3134
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3118
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3136
 msgid "Catalog"
 msgstr ""

@ -1067,20 +1068,20 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:111
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:337
 #: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:155
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:141
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:144
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:147
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:215
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:222
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:245
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:150
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:218
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:225
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:248
 msgid "Getting list of books on device..."
 msgstr ""

 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:264
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:268
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:324
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:274
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:276
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:277
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:279
 msgid "Transferring books to device..."
 msgstr ""

@ -1088,8 +1089,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:344
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:474
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:509
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:298
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:329
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:301
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:332
 msgid "Adding books to device metadata listing..."
 msgstr ""

@ -1099,8 +1100,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:126
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:426
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:458
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:335
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:356
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:338
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:359
 msgid "Removing books from device..."
 msgstr ""

@ -1108,13 +1109,13 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:374
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:462
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:469
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:363
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:368
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:366
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:371
 msgid "Removing books from device metadata listing..."
 msgstr ""

 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:442
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:398
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:401
 msgid "Sending metadata to device..."
 msgstr ""

@ -1364,11 +1365,11 @@ msgstr ""
 msgid "If you have a custom column in your library that you use to store the page count of books, you can have calibre use that information, instead of calculating a page count. Specify the name of the custom column here, for example, #pages. "
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:415
+#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:419
 msgid "Communicate with the Kindle DX eBook reader."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:424
+#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:428
 msgid "Communicate with the Kindle Fire"
 msgstr ""

@ -1900,31 +1901,31 @@ msgid "Modify the document text and structure using common patterns. Disabled by
 msgstr ""

 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:157
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:16
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:20
 msgid "Modify the document text and structure using user defined patterns."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:166
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:167
 msgid "Control auto-detection of document structure."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:176
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:177
 msgid "Control the automatic generation of a Table of Contents. By default, if the source file has a Table of Contents, it will be used in preference to the automatically generated one."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:186
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:187
 msgid "Options to set metadata in the output"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:189
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:190
 msgid "Options to help with debugging the conversion"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:217
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:219
 msgid "List builtin recipe names. You can create an ebook from a builtin recipe like this: ebook-convert \"Recipe Name.recipe\" output.epub"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:289
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:322
 msgid "Output saved to"
 msgstr ""

@ -2163,48 +2164,48 @@ msgstr ""
 msgid "Comic"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:23
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:21
 msgid "When present, use author sort field as author."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:27
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:25
 msgid "Don't add Table of Contents to the book. Useful if the book has its own table of contents."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:30
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:28
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/htmltoc.py:57
 msgid "Title for any generated in-line table of contents."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:34
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:32
 msgid "Disable compression of the file contents."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:37
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:35
 msgid "Tag marking book to be filed with Personal Docs"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:41
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:39
 msgid "Ignore margins in the input document. If False, then the MOBI output plugin will try to convert margins specified in the input document, otherwise it will ignore them."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:47
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:45
 msgid "When adding the Table of Contents to the book, add it at the start of the book instead of the end. Not recommended."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:51
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:49
 msgid "Extract the contents of the MOBI file to the specified directory. If the directory already exists, it will be deleted."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:56
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:54
 msgid "Enable sharing of book content via Facebook etc.  on the Kindle. WARNING: Using this feature means that  the book will not auto sync its last read position  on multiple devices. Complain to Amazon."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:63
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:61
 msgid "By default calibre converts all images to JPEG format in the output MOBI file. This is for maximum compatibility as some older MOBI viewers have problems with other image formats. This option tells calibre not to do this. Useful if your document contains lots of GIF/PNG images that become very large when converted to JPEG."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:114
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:103
 #: /home/kovid/work/calibre/src/calibre/ebooks/epub/periodical.py:125
 msgid "All articles"
 msgstr ""
@ -2714,27 +2715,31 @@ msgstr ""
 msgid "Replacement to replace the text found with sr3-search."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:733
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:632
+msgid "Path to a file containing search and replace regular expressions. The file must contain alternating lines of regular expression followed by replacement pattern (which can be an empty line). The regular expression must be in the python regex syntax and the file must be UTF-8 encoded."
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:741
 msgid "Could not find an ebook inside the archive"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:791
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:799
 msgid "Values of series index and rating must be numbers. Ignoring"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:798
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:806
 msgid "Failed to parse date/time"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:957
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:965
 msgid "Converting input to HTML..."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:984
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:992
 msgid "Running transforms on ebook..."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:1088
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:1096
 msgid "Creating"
 msgstr ""

@ -3032,7 +3037,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/store/search/models.py:41
 #: /home/kovid/work/calibre/src/calibre/gui2/store/stores/mobileread/models.py:23
 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:375
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:580
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:583
 msgid "Title"
 msgstr ""

@ -3200,7 +3205,7 @@ msgid ""
 msgstr ""

 #: /home/kovid/work/calibre/src/calibre/ebooks/metadata/opf2.py:1434
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1244
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1258
 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:937
 #: /home/kovid/work/calibre/src/calibre/gui2/store/search/models.py:41
 msgid "Cover"
@ -3310,70 +3315,70 @@ msgstr ""
 msgid "No details available"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1245
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1259
 msgid "Title Page"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1246
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/htmltoc.py:15
 #: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:57
 #: /home/kovid/work/calibre/src/calibre/gui2/viewer/main_ui.py:199
 msgid "Table of Contents"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1247
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1261
 msgid "Index"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1248
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1262
 msgid "Glossary"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1249
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1263
 msgid "Acknowledgements"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1250
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1264
 msgid "Bibliography"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1251
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1265
 msgid "Colophon"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1252
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1266
 msgid "Copyright"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1253
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1267
 msgid "Dedication"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1254
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1268
 msgid "Epigraph"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1255
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1269
 msgid "Foreword"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1256
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1270
 msgid "List of Illustrations"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1257
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1271
 msgid "List of Tables"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1258
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1272
 msgid "Notes"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1259
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1273
 msgid "Preface"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1274
 msgid "Main Text"
 msgstr ""

@ -4073,7 +4078,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/actions/choose_library.py:147
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/toolbar.py:58
 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:171
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:126
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:129
 #, python-format
 msgid "%d books"
 msgstr ""
@ -5043,7 +5048,7 @@ msgid "Selected books have no formats"
 msgstr ""

 #: /home/kovid/work/calibre/src/calibre/gui2/actions/view.py:153
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:128
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:138
 msgid "Choose the format to view"
 msgstr ""

@ -5416,7 +5421,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/pdf_output_ui.py:54
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/pmlz_output_ui.py:46
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/rb_output_ui.py:33
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:147
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:110
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/snb_output_ui.py:42
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/structure_detection_ui.py:59
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/toc_ui.py:70
@ -6631,23 +6636,32 @@ msgstr ""
 msgid "RB Output"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:134
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:146
 msgid "No formats available"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:135
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:147
 msgid "Cannot build regex using the GUI builder without a book."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:144
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:156
 msgid "Could not open file"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:145
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:157
 msgid "Could not open the file, do you have it open in another program?"
 msgstr ""

 #: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:175
+msgid "Failed to generate markup for testing. Click \"Show Details\" to learn more."
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:181
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:185
+msgid "Failed to generate preview"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:191
 msgid "Open book"
 msgstr ""

@ -6699,50 +6713,124 @@ msgstr ""
 msgid "Preview"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:15
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:19
 msgid ""
 "Search\n"
 "&\n"
 "Replace"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:29
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:32
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:35
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:43
 msgid "&Search Regular Expression"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:72
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:56
+msgid "Replacement Text"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:56
+msgid "Search Regular Expression"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:100
+msgid "Load Calibre Search-Replace definitions file"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:102
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:119
+msgid "Calibre Search-Replace definitions file"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:110
+msgid "Failed to read"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:111
+#, python-format
+msgid "Failed to load patterns from %s, click Show details to learn more."
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:117
+msgid "Save Calibre Search-Replace definitions file"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:168
+msgid "Unused Search & Replace definition"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:169
+msgid "The search / replace definition being edited has not been added to the list of definitions. Do you wish to continue with the conversion (the definition will not be used)?"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:180
 #: /home/kovid/work/calibre/src/calibre/gui2/widgets.py:112
 msgid "Invalid regular expression"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:73
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:181
 #: /home/kovid/work/calibre/src/calibre/gui2/widgets.py:113
 #, python-format
 msgid "Invalid regular expression: %s"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:148
-msgid "First expression"
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:265
+msgid "The list of search/replace definitions that will be applied to this conversion."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:149
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:151
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:153
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:111
+msgid "Search/Replace Definition Edit"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:112
 msgid "&Replacement Text"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:150
-msgid "Second Expression"
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:113
+msgid "Add the current expression to the list of expressions that will be applied"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:152
-msgid "Third expression"
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:114
+msgid "&Add"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:154
-msgid "<p>Search and replace uses <i>regular expressions</i>. See the <a href=\"http://manual.calibre-ebook.com/regexp.html\">regular expressions tutorial</a> to get started with regular expressions. Also clicking the wizard buttons below will allow you to test your regular expression against the current input document."
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:115
+msgid "Edit the currently selected expression"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:116
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:59
+msgid "&Change"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:117
+msgid "Remove the currently selected expression"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:118
+#: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:886
+msgid "&Remove"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:119
+msgid "Load a list of expressions from a previously saved file"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:120
+msgid "&Load"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:121
+msgid "Save this list of expressions so that you can re-use it easily"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:122
+#: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:131
+#: /usr/src/qt-everywhere-opensource-src-4.8.0/src/gui/widgets/qdialogbuttonbox.cpp:661
+msgid "&Save"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:123
+msgid "<p>Search and replace uses <i>regular expressions</i>. See the <a href=\"http://manual.calibre-ebook.com/regexp.html\">regular expressions tutorial</a> to get started with regular expressions. Also clicking the wizard button below will allow you to test your regular expression against the current input document. When you are happy with an expression, click the Add button to add it to the list of expressions."
 msgstr ""

 #: /home/kovid/work/calibre/src/calibre/gui2/convert/single.py:181
@ -7808,7 +7896,7 @@ msgstr ""

 #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/confirm_delete_location_ui.py:77
 #: /home/kovid/work/calibre/src/calibre/gui2/layout.py:73
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:233
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:236
 msgid "Library"
 msgstr ""

@ -7843,7 +7931,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/create_custom_column.py:35
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/create_custom_column.py:76
 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:365
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:579
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:582
 msgid "Date"
 msgstr ""

@ -10811,10 +10899,6 @@ msgstr ""
 msgid "T&rim"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:886
-msgid "&Remove"
-msgstr ""
-
 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:892
 msgid "Download co&ver"
 msgstr ""
@ -12867,11 +12951,6 @@ msgid ""
 "of a search term by changing the value box then pressing Save."
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:131
-#: /usr/src/qt-everywhere-opensource-src-4.8.0/src/gui/widgets/qdialogbuttonbox.cpp:661
-msgid "&Save"
-msgstr ""
-
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:132
 msgid "Make &user categories from:"
 msgstr ""
@ -14924,10 +15003,6 @@ msgstr ""
 msgid "<p>Choose a location for your books. When you add books to calibre, they will be copied here. Use an <b>empty folder</b> for a new calibre library:"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:59
-msgid "&Change"
-msgstr ""
-
 #: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:60
 msgid "If you have an existing calibre library, it will be copied to the new location. If a calibre library already exists at the new location, calibre will switch to using it."
 msgstr ""
@ -15984,17 +16059,17 @@ msgstr ""
 msgid "%(tt)sAverage rating is %(rating)3.1f"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3468
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3470
 #, python-format
 msgid "<p>Migrating old database to ebook library in %s<br><center>"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3497
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3499
 #, python-format
 msgid "Copying <b>%s</b>"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3514
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3516
 msgid "Compacting database"
 msgstr ""

@ -16198,7 +16273,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/library/server/ajax.py:295
 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:341
 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:625
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:579
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:582
 msgid "Newest"
 msgstr ""

@ -16355,40 +16430,40 @@ msgstr ""
 msgid "The full interface gives you many more features, but it may not work well on a small screen"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:126
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:129
 #, python-format
 msgid "%d book"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:149
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:152
 #, python-format
 msgid "%d items"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:167
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:170
 #, python-format
 msgid "RATING: %s<br />"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:170
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:173
 #, python-format
 msgid "TAGS: %s<br />"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:175
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:178
 #, python-format
 msgid "SERIES: %(series)s [%(sidx)s]<br />"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:273
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:276
 msgid "Books in your library"
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:279
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:282
 msgid "By "
 msgstr ""

-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:280
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:283
 msgid "Books sorted by "
 msgstr ""