From 441b4e20cc9661579e09b6bfa6aa00b19c54eb3d Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sun, 24 Apr 2011 22:43:25 +0800
Subject: [PATCH 01/37] re-factored the query logic for overdrive to handle
 titles including punctuation

---
 src/calibre/ebooks/metadata/sources/overdrive.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py
index 759da45610..62a3ca2091 100755
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -198,12 +198,16 @@ class OverDrive(Source):
         title_tokens = list(self.get_title_tokens(title,
                 strip_joiners=False, strip_subtitle=True))
 
-        if len(title_tokens) >= len(author_tokens):
+        xref_q = ''
+        if len(author_tokens) <= 1:
             initial_q = ' '.join(title_tokens)
             xref_q = '+'.join(author_tokens)
         else:
             initial_q = ' '.join(author_tokens)
-            xref_q = '+'.join(title_tokens)
+            for token in title_tokens:
+                if len(xref_q) < len(token):
+                    xref_q = token
+
         #log.error('Initial query is %s'%initial_q)
         #log.error('Cross reference query is %s'%xref_q)
 
@@ -234,10 +238,12 @@ class OverDrive(Source):
                         if xref_q.find('+') != -1:
                             xref_tokens = xref_q.split('+')
                             xref_q = xref_tokens[0]
-                            #log.error('xref_q is '+xref_q)
+                            for token in xref_tokens:
+                                if len(xref_q) < len(token):
+                                    xref_q = token
+                            #log.error('rewrote xref_q, new query is '+xref_q)
                     else:
                         xref_q = ''
-                    xref_q = ''
                     q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
                 elif int(m.group('totalrecords')) == 0:
                     return ''
@@ -264,6 +270,7 @@ class OverDrive(Source):
                 else:
                     if creators:
                         creators = creators.split(', ')
+
                     # if an exact match in a preferred format occurs
                     if ((author and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
                         return self.format_results(reserveid, od_title, subtitle, series, publisher,

From 88a54e805405cbc72f7eb72ba469a711a3285777 Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Mon, 25 Apr 2011 10:51:20 +0800
Subject: [PATCH 02/37] fix indentation of return statements in find_ovrdrv_data

---
 src/calibre/ebooks/metadata/sources/overdrive.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py
index 62a3ca2091..67eac7e337 100755
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -337,9 +337,9 @@ class OverDrive(Source):
     def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
         q = base_url
         if ovrdrv_id is None:
-           return self.overdrive_search(br, log, q, title, author)
+            return self.overdrive_search(br, log, q, title, author)
         else:
-           return self.overdrive_get_record(br, log, q, ovrdrv_id)
+            return self.overdrive_get_record(br, log, q, ovrdrv_id)
 
 
 

From a1bbba3198f0945ade6d4d70f51a87f22c1d284a Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sun, 7 Aug 2011 22:18:47 +0800
Subject: [PATCH 03/37] fix a number of issues with scene break formatting in
 heuristics

---
 src/calibre/ebooks/conversion/utils.py | 38 +++++++++++++++++---------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 7488df4609..9962335da3 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -451,27 +451,33 @@ class HeuristicProcessor(object):
         return html
 
     def detect_whitespace(self, html):
-        blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+        blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+        blanks_around_scene_breaks = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
         blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
 
         def merge_header_whitespace(match):
             initblanks = match.group('initparas')
-            endblanks = match.group('initparas')
-            heading = match.group('heading')
+            endblanks = match.group('endparas')
+            content = match.group('content')
             top_margin = ''
             bottom_margin = ''
             if initblanks is not None:
+                print "initial blanks are:\n"+initblanks
                 top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
             if endblanks is not None:
-                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+                print "endblanks blanks are:\n"+endblanks
+                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
 
             if initblanks == None and endblanks == None:
-                return heading
+                return content
+            elif content.find('scenebreak') != -1:
+                return content
             else:
-                heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
-            return heading
+                content = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
+            return content
 
         html = blanks_around_headings.sub(merge_header_whitespace, html)
+        html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
 
         def markup_whitespaces(match):
             blanks = match.group(0)
@@ -506,6 +512,12 @@ class HeuristicProcessor(object):
             html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
         return html
 
+    def detect_scene_breaks(self, html):
+        scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
+        scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+        html = scene_breaks.sub(self.scene_break_open+'\g<break>'+'</p>', html)
+        return html
+
     def markup_user_break(self, replacement_break):
         '''
         Takes string a user supplies and wraps it in markup that will be centered with
@@ -765,25 +777,25 @@ class HeuristicProcessor(object):
         # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
         if getattr(self.extra_opts, 'format_scene_breaks', False):
             html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
+            html = self.detect_scene_breaks(html)
             html = self.detect_whitespace(html)
             html = self.detect_soft_breaks(html)
             blanks_count = len(self.any_multi_blank.findall(html))
             if blanks_count >= 1:
                 html = self.merge_blanks(html, blanks_count)
-            scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
-            scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+            detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
+            scene_break_count = len(detected_scene_break.findall(html))
             # If the user has enabled scene break replacement, then either softbreaks
             # or 'hard' scene breaks are replaced, depending on which is in use
             # Otherwise separator lines are centered, use a bit larger margin in this case
             replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
             if replacement_break:
                 replacement_break = self.markup_user_break(replacement_break)
-                if len(scene_break.findall(html)) >= 1:
-                    html = scene_break.sub(replacement_break, html)
+                if scene_break_count >= 1:
+                    html = detected_scene_break.sub(replacement_break, html)
+                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
                 else:
                     html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
-            else:
-                html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
 
         if self.deleted_nbsps:
             # put back non-breaking spaces in empty paragraphs so they render correctly

From d07b4556e97c15b373fbf6b40c4fcc29b3872c10 Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Mon, 8 Aug 2011 03:51:56 +0800
Subject: [PATCH 04/37] fix issue where overdrive can get in an infinite loop

---
 .../ebooks/metadata/sources/overdrive.py      | 43 ++++++++++---------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py
index 1faacaa3ef..0af41538b0 100755
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -208,8 +208,8 @@ class OverDrive(Source):
                 if len(xref_q) < len(token):
                     xref_q = token
 
-        #log.error('Initial query is %s'%initial_q)
-        #log.error('Cross reference query is %s'%xref_q)
+        log.error('Initial query is %s'%initial_q)
+        log.error('Cross reference query is %s'%xref_q)
 
         q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
         query = '{"szKeyword":"'+initial_q+'"}'
@@ -224,29 +224,30 @@ class OverDrive(Source):
 
         # get the search results object
         results = False
+        iterations = 0
         while results == False:
+            iterations += 1
             xreq = mechanize.Request(q_xref)
             xreq.add_header('X-Requested-With', 'XMLHttpRequest')
             xreq.add_header('Referer', q_init_search)
             xreq.add_header('Accept', 'application/json, text/javascript, */*')
             raw = br.open_novisit(xreq).read()
             for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
-                if int(m.group('displayrecords')) >= 1:
-                    results = True
-                elif int(m.group('totalrecords')) >= 1:
-                    if int(m.group('totalrecords')) >= 100:
-                        if xref_q.find('+') != -1:
-                            xref_tokens = xref_q.split('+')
-                            xref_q = xref_tokens[0]
-                            for token in xref_tokens:
-                                if len(xref_q) < len(token):
-                                    xref_q = token
-                            #log.error('rewrote xref_q, new query is '+xref_q)
-                    else:
-                        xref_q = ''
-                    q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
-                elif int(m.group('totalrecords')) == 0:
+                if int(m.group('totalrecords')) == 0:
                     return ''
+                elif int(m.group('displayrecords')) >= 1:
+                    results = True
+                elif int(m.group('totalrecords')) >= 1 and iterations < 3:
+                    if xref_q.find('+') != -1:
+                        xref_tokens = xref_q.split('+')
+                        xref_q = xref_tokens[0]
+                        for token in xref_tokens:
+                            if len(xref_q) < len(token):
+                                xref_q = token
+                        #log.error('rewrote xref_q, new query is '+xref_q)
+                else:
+                        xref_q = ''
+                q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
 
         return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)
 
@@ -461,10 +462,10 @@ if __name__ == '__main__':
         [
 
             (
-                {'title':'Foundation and Earth',
-                    'authors':['Asimov']},
-                [title_test('Foundation and Earth', exact=True),
-                    authors_test(['Isaac Asimov'])]
+                {'title':'The Sea Kings Daughter',
+                    'authors':['Elizabeth Peters']},
+                [title_test('The Sea Kings Daughter', exact=False),
+                    authors_test(['Elizabeth Peters'])]
             ),
 
             (

From b72aaf7fa60f5ed2508ac1e96834c67ad3cc99ff Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 07:55:24 +0530
Subject: [PATCH 05/37] Acim Bilim Dergisi by thomass

---
 recipes/acim_bilim_dergisi.recipe | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 recipes/acim_bilim_dergisi.recipe

diff --git a/recipes/acim_bilim_dergisi.recipe b/recipes/acim_bilim_dergisi.recipe
new file mode 100644
index 0000000000..5d674fe93a
--- /dev/null
+++ b/recipes/acim_bilim_dergisi.recipe
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1334868409(BasicNewsRecipe):
+    title          = u'AÇIK BİLİM DERGİSİ'
+    description            =  ' Aylık çevrimiçi bilim dergisi'
+    __author__             = u'thomass'
+    oldest_article = 30
+    max_articles_per_feed = 300
+    auto_cleanup = True
+    encoding               = 'UTF-8'
+    publisher              = 'açık bilim'
+    category               = 'haber, bilim,TR,dergi'
+    language               = 'tr'
+    publication_type = 'magazine '
+    conversion_options = {
+                            'tags'            : category
+                            ,'language'        : language
+                            ,'publisher'       : publisher
+                            ,'linearize_tables': True
+                         }
+    cover_img_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
+    masthead_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
+
+
+    feeds          = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')]

From 6545602342c4309f02f552ffe926507754818668 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 08:14:03 +0530
Subject: [PATCH 06/37] version 0.8.48

---
 Changelog.yaml           | 51 ++++++++++++++++++++++++++++++++++++++++
 src/calibre/constants.py |  2 +-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/Changelog.yaml b/Changelog.yaml
index 17f3ebcf97..01425ec2ca 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,57 @@
 #   new recipes:
 #     - title: 
 
+- version: 0.8.48
+  date: 2012-04-20
+
+  new features:
+    - title: "Conversion: The search and replace feature has been completely revamped."
+      description: "You can now use any number of search and replace
+      expressions, not just three. You can also store and load frequently used
+      sets of search and replace expressions. Also, the wizard generates its
+      preview in a separate process to protect against crashes/memory leaks."
+      tickets: [983476,983484,983478]
+
+    - title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free."
+
+    - title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X"
+      tickets: [981185] 
+ 
+  bug fixes:
+    - title: "Get Books: Support the new website design of Barnes & Noble"
+
+    - title: "T1 driver: Fix books sent to SD card sometimes resulting in problems when deleted."
+      tickets: [943586]
+
+    - title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'"
+
+    - title: "MOBI Output: Handle background color specified on <td> and <tr> in addition to <table> tags."
+      tickets: [980813]
+
+    - title: "MOBI Output: Fix underline style applied to parent element not getting inherited by <a> children."
+      tickets: [985711]
+
+  improved recipes:
+    - xkcd
+    - Metro Nieuws
+    - Calgary Herald
+    - Orlando Sentinel
+    - countryfile
+    - Heise
+
+  new recipes:
+    - title: Various new Polish news sources
+      author: fenuks
+
+    - title: Various Italian news sources 
+      author: faber1971
+
+    - title: Jakarta Globe 
+      author: rty
+
+    - title: Acim Bilim Dergisi
+      author: thomass
+
 - version: 0.8.47
   date: 2012-04-13
 
diff --git a/src/calibre/constants.py b/src/calibre/constants.py
index 402fef4c67..1db9c90466 100644
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@@ -4,7 +4,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = u'calibre'
-numeric_version = (0, 8, 47)
+numeric_version = (0, 8, 48)
 __version__   = u'.'.join(map(unicode, numeric_version))
 __author__    = u"Kovid Goyal <kovid@kovidgoyal.net>"
 

From 15ec14ab52844bfa88fc6cf3bb712f3f78b6c3a6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 08:18:25 +0530
Subject: [PATCH 07/37] KOBO driver: skip rows whose content id is not a string

---
 src/calibre/devices/kobo/driver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py
index f68ea8feff..1384ec0810 100644
--- a/src/calibre/devices/kobo/driver.py
+++ b/src/calibre/devices/kobo/driver.py
@@ -298,7 +298,7 @@ class KOBO(USBMS):
             changed = False
             for i, row in enumerate(cursor):
             #  self.report_progress((i+1) / float(numrows), _('Getting list of books on device...'))
-                if row[3].startswith("file:///usr/local/Kobo/help/"):
+                if not hasattr(row[3], 'startswith') or row[3].startswith("file:///usr/local/Kobo/help/"):
                     # These are internal to the Kobo device and do not exist
                     continue
                 path = self.path_from_contentid(row[3], row[5], row[4], oncard)

From eb2d0761b0682f9f4b1580672c51b1f12252357d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 10:38:41 +0530
Subject: [PATCH 08/37] IGN:Tag release

---
 src/calibre/translations/calibre.pot | 331 ++++++++++++++++-----------
 1 file changed, 203 insertions(+), 128 deletions(-)

diff --git a/src/calibre/translations/calibre.pot b/src/calibre/translations/calibre.pot
index 7c3bb3b302..5b0c096104 100644
--- a/src/calibre/translations/calibre.pot
+++ b/src/calibre/translations/calibre.pot
@@ -4,9 +4,9 @@
 #
 msgid ""
 msgstr ""
-"Project-Id-Version: calibre 0.8.47\n"
-"POT-Creation-Date: 2012-04-13 09:24+IST\n"
-"PO-Revision-Date: 2012-04-13 09:24+IST\n"
+"Project-Id-Version: calibre 0.8.48\n"
+"POT-Creation-Date: 2012-04-20 08:19+IST\n"
+"PO-Revision-Date: 2012-04-20 08:19+IST\n"
 "Last-Translator: Automatically generated\n"
 "Language-Team: LANGUAGE\n"
 "MIME-Version: 1.0\n"
@@ -24,8 +24,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/db/cache.py:106
 #: /home/kovid/work/calibre/src/calibre/db/cache.py:109
 #: /home/kovid/work/calibre/src/calibre/db/cache.py:120
-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:317
-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:318
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:319
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:320
 #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:100
 #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:101
 #: /home/kovid/work/calibre/src/calibre/devices/jetbook/driver.py:74
@@ -36,9 +36,9 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/nook/driver.py:71
 #: /home/kovid/work/calibre/src/calibre/devices/prs500/books.py:267
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:661
-#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:337
-#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:338
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:493
+#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:347
+#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:348
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:496
 #: /home/kovid/work/calibre/src/calibre/ebooks/chm/metadata.py:57
 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/chm_input.py:109
 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/chm_input.py:112
@@ -109,7 +109,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/ebooks/mobi/writer2/indexer.py:497
 #: /home/kovid/work/calibre/src/calibre/ebooks/odt/input.py:168
 #: /home/kovid/work/calibre/src/calibre/ebooks/odt/input.py:170
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:836
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:850
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:353
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:356
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:360
@@ -183,14 +183,15 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:580
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:2087
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:2241
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3303
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:2657
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:3305
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3442
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3307
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3444
 #: /home/kovid/work/calibre/src/calibre/library/server/content.py:250
 #: /home/kovid/work/calibre/src/calibre/library/server/content.py:251
 #: /home/kovid/work/calibre/src/calibre/library/server/mobile.py:245
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:157
 #: /home/kovid/work/calibre/src/calibre/library/server/opds.py:160
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:163
 #: /home/kovid/work/calibre/src/calibre/library/server/xml.py:79
 #: /home/kovid/work/calibre/src/calibre/utils/localization.py:162
 #: /home/kovid/work/calibre/src/calibre/utils/podofo/__init__.py:46
@@ -894,15 +895,15 @@ msgstr ""
 msgid "Communicate with Android phones."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:167
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:168
 msgid "Comma separated list of directories to send e-books to on the device. The first one that exists will be used"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:257
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:259
 msgid "Communicate with S60 phones."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:276
+#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:278
 msgid "Communicate with WebOS tablets."
 msgstr ""
 
@@ -1002,8 +1003,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/nook/driver.py:102
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:448
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:471
-#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:558
-#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:577
+#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:568
+#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:587
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1051
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1057
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1092
@@ -1013,7 +1014,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/tag_browser/model.py:1165
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:346
 #: /home/kovid/work/calibre/src/calibre/library/database2.py:359
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3160
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3162
 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:187
 msgid "News"
 msgstr ""
@@ -1021,8 +1022,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/apple/driver.py:2727
 #: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi.py:65
 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi.py:65
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3116
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3134
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3118
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3136
 msgid "Catalog"
 msgstr ""
 
@@ -1067,20 +1068,20 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:111
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:337
 #: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:155
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:141
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:144
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:147
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:215
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:222
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:245
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:150
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:218
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:225
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:248
 msgid "Getting list of books on device..."
 msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:264
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:268
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:324
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:274
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:276
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:277
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:279
 msgid "Transferring books to device..."
 msgstr ""
 
@@ -1088,8 +1089,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:344
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:474
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:509
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:298
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:329
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:301
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:332
 msgid "Adding books to device metadata listing..."
 msgstr ""
 
@@ -1099,8 +1100,8 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:126
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:426
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:458
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:335
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:356
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:338
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:359
 msgid "Removing books from device..."
 msgstr ""
 
@@ -1108,13 +1109,13 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:374
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:462
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:469
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:363
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:368
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:366
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:371
 msgid "Removing books from device metadata listing..."
 msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:442
-#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:398
+#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:401
 msgid "Sending metadata to device..."
 msgstr ""
 
@@ -1364,11 +1365,11 @@ msgstr ""
 msgid "If you have a custom column in your library that you use to store the page count of books, you can have calibre use that information, instead of calculating a page count. Specify the name of the custom column here, for example, #pages. "
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:415
+#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:419
 msgid "Communicate with the Kindle DX eBook reader."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:424
+#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:428
 msgid "Communicate with the Kindle Fire"
 msgstr ""
 
@@ -1900,31 +1901,31 @@ msgid "Modify the document text and structure using common patterns. Disabled by
 msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:157
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:16
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:20
 msgid "Modify the document text and structure using user defined patterns."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:166
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:167
 msgid "Control auto-detection of document structure."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:176
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:177
 msgid "Control the automatic generation of a Table of Contents. By default, if the source file has a Table of Contents, it will be used in preference to the automatically generated one."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:186
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:187
 msgid "Options to set metadata in the output"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:189
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:190
 msgid "Options to help with debugging the conversion"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:217
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:219
 msgid "List builtin recipe names. You can create an ebook from a builtin recipe like this: ebook-convert \"Recipe Name.recipe\" output.epub"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:289
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:322
 msgid "Output saved to"
 msgstr ""
 
@@ -2163,48 +2164,48 @@ msgstr ""
 msgid "Comic"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:23
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:21
 msgid "When present, use author sort field as author."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:27
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:25
 msgid "Don't add Table of Contents to the book. Useful if the book has its own table of contents."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:30
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:28
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/htmltoc.py:57
 msgid "Title for any generated in-line table of contents."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:34
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:32
 msgid "Disable compression of the file contents."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:37
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:35
 msgid "Tag marking book to be filed with Personal Docs"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:41
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:39
 msgid "Ignore margins in the input document. If False, then the MOBI output plugin will try to convert margins specified in the input document, otherwise it will ignore them."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:47
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:45
 msgid "When adding the Table of Contents to the book, add it at the start of the book instead of the end. Not recommended."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:51
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:49
 msgid "Extract the contents of the MOBI file to the specified directory. If the directory already exists, it will be deleted."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:56
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:54
 msgid "Enable sharing of book content via Facebook etc.  on the Kindle. WARNING: Using this feature means that  the book will not auto sync its last read position  on multiple devices. Complain to Amazon."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:63
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:61
 msgid "By default calibre converts all images to JPEG format in the output MOBI file. This is for maximum compatibility as some older MOBI viewers have problems with other image formats. This option tells calibre not to do this. Useful if your document contains lots of GIF/PNG images that become very large when converted to JPEG."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:114
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:103
 #: /home/kovid/work/calibre/src/calibre/ebooks/epub/periodical.py:125
 msgid "All articles"
 msgstr ""
@@ -2714,27 +2715,31 @@ msgstr ""
 msgid "Replacement to replace the text found with sr3-search."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:733
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:632
+msgid "Path to a file containing search and replace regular expressions. The file must contain alternating lines of regular expression followed by replacement pattern (which can be an empty line). The regular expression must be in the python regex syntax and the file must be UTF-8 encoded."
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:741
 msgid "Could not find an ebook inside the archive"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:791
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:799
 msgid "Values of series index and rating must be numbers. Ignoring"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:798
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:806
 msgid "Failed to parse date/time"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:957
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:965
 msgid "Converting input to HTML..."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:984
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:992
 msgid "Running transforms on ebook..."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:1088
+#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:1096
 msgid "Creating"
 msgstr ""
 
@@ -3032,7 +3037,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/store/search/models.py:41
 #: /home/kovid/work/calibre/src/calibre/gui2/store/stores/mobileread/models.py:23
 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:375
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:580
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:583
 msgid "Title"
 msgstr ""
 
@@ -3200,7 +3205,7 @@ msgid ""
 msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/ebooks/metadata/opf2.py:1434
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1244
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1258
 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:937
 #: /home/kovid/work/calibre/src/calibre/gui2/store/search/models.py:41
 msgid "Cover"
@@ -3310,70 +3315,70 @@ msgstr ""
 msgid "No details available"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1245
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1259
 msgid "Title Page"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1246
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/htmltoc.py:15
 #: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:57
 #: /home/kovid/work/calibre/src/calibre/gui2/viewer/main_ui.py:199
 msgid "Table of Contents"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1247
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1261
 msgid "Index"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1248
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1262
 msgid "Glossary"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1249
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1263
 msgid "Acknowledgements"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1250
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1264
 msgid "Bibliography"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1251
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1265
 msgid "Colophon"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1252
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1266
 msgid "Copyright"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1253
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1267
 msgid "Dedication"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1254
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1268
 msgid "Epigraph"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1255
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1269
 msgid "Foreword"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1256
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1270
 msgid "List of Illustrations"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1257
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1271
 msgid "List of Tables"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1258
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1272
 msgid "Notes"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1259
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1273
 msgid "Preface"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260
+#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1274
 msgid "Main Text"
 msgstr ""
 
@@ -4073,7 +4078,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/actions/choose_library.py:147
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/toolbar.py:58
 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:171
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:126
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:129
 #, python-format
 msgid "%d books"
 msgstr ""
@@ -5043,7 +5048,7 @@ msgid "Selected books have no formats"
 msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/gui2/actions/view.py:153
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:128
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:138
 msgid "Choose the format to view"
 msgstr ""
 
@@ -5416,7 +5421,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/pdf_output_ui.py:54
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/pmlz_output_ui.py:46
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/rb_output_ui.py:33
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:147
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:110
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/snb_output_ui.py:42
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/structure_detection_ui.py:59
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/toc_ui.py:70
@@ -6631,23 +6636,32 @@ msgstr ""
 msgid "RB Output"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:134
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:146
 msgid "No formats available"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:135
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:147
 msgid "Cannot build regex using the GUI builder without a book."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:144
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:156
 msgid "Could not open file"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:145
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:157
 msgid "Could not open the file, do you have it open in another program?"
 msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:175
+msgid "Failed to generate markup for testing. Click \"Show Details\" to learn more."
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:181
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:185
+msgid "Failed to generate preview"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:191
 msgid "Open book"
 msgstr ""
 
@@ -6699,50 +6713,124 @@ msgstr ""
 msgid "Preview"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:15
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:19
 msgid ""
 "Search\n"
 "&\n"
 "Replace"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:29
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:32
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:35
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:43
 msgid "&Search Regular Expression"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:72
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:56
+msgid "Replacement Text"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:56
+msgid "Search Regular Expression"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:100
+msgid "Load Calibre Search-Replace definitions file"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:102
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:119
+msgid "Calibre Search-Replace definitions file"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:110
+msgid "Failed to read"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:111
+#, python-format
+msgid "Failed to load patterns from %s, click Show details to learn more."
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:117
+msgid "Save Calibre Search-Replace definitions file"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:168
+msgid "Unused Search & Replace definition"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:169
+msgid "The search / replace definition being edited  has not been added to the list of definitions. Do you wish to continue with the conversion (the definition will not be used)?"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:180
 #: /home/kovid/work/calibre/src/calibre/gui2/widgets.py:112
 msgid "Invalid regular expression"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:73
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:181
 #: /home/kovid/work/calibre/src/calibre/gui2/widgets.py:113
 #, python-format
 msgid "Invalid regular expression: %s"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:148
-msgid "First expression"
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:265
+msgid "The list of search/replace definitions that will be applied to this conversion."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:149
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:151
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:153
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:111
+msgid "Search/Replace Definition Edit"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:112
 msgid "&Replacement Text"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:150
-msgid "Second Expression"
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:113
+msgid "Add the current expression to the list of expressions that will be applied"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:152
-msgid "Third expression"
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:114
+msgid "&Add"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:154
-msgid "<p>Search and replace uses <i>regular expressions</i>. See the <a href=\"http://manual.calibre-ebook.com/regexp.html\">regular expressions tutorial</a> to get started with regular expressions. Also clicking the wizard buttons below will allow you to test your regular expression against the current input document."
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:115
+msgid "Edit the currently selected expression"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:116
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:59
+msgid "&Change"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:117
+msgid "Remove the currently selected expression"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:118
+#: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:886
+msgid "&Remove"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:119
+msgid "Load a listof expression from a previously saved file"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:120
+msgid "&Load"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:121
+msgid "Save this list of expression so that you can re-use it easily"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:122
+#: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:131
+#: /usr/src/qt-everywhere-opensource-src-4.8.0/src/gui/widgets/qdialogbuttonbox.cpp:661
+msgid "&Save"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:123
+msgid "<p>Search and replace uses <i>regular expressions</i>. See the <a href=\"http://manual.calibre-ebook.com/regexp.html\">regular expressions tutorial</a> to get started with regular expressions. Also clicking the wizard button below will allow you to test your regular expression against the current input document. When you are happy with an expression, click the Add button to add it to the list of expressions."
 msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/gui2/convert/single.py:181
@@ -7808,7 +7896,7 @@ msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/confirm_delete_location_ui.py:77
 #: /home/kovid/work/calibre/src/calibre/gui2/layout.py:73
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:233
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:236
 msgid "Library"
 msgstr ""
 
@@ -7843,7 +7931,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/create_custom_column.py:35
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/create_custom_column.py:76
 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:365
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:579
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:582
 msgid "Date"
 msgstr ""
 
@@ -10811,10 +10899,6 @@ msgstr ""
 msgid "T&rim"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:886
-msgid "&Remove"
-msgstr ""
-
 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:892
 msgid "Download co&ver"
 msgstr ""
@@ -12867,11 +12951,6 @@ msgid ""
 "of a search term by changing the value box then pressing Save."
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:131
-#: /usr/src/qt-everywhere-opensource-src-4.8.0/src/gui/widgets/qdialogbuttonbox.cpp:661
-msgid "&Save"
-msgstr ""
-
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:132
 msgid "Make &user categories from:"
 msgstr ""
@@ -14924,10 +15003,6 @@ msgstr ""
 msgid "<p>Choose a location for your books. When you add books to calibre, they will be copied here. Use an <b>empty folder</b> for a new calibre library:"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:59
-msgid "&Change"
-msgstr ""
-
 #: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:60
 msgid "If you have an existing calibre library, it will be copied to the new location. If a calibre library already exists at the new location, calibre will switch to using it."
 msgstr ""
@@ -15984,17 +16059,17 @@ msgstr ""
 msgid "%(tt)sAverage rating is %(rating)3.1f"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3468
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3470
 #, python-format
 msgid "<p>Migrating old database to ebook library in %s<br><center>"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3497
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3499
 #, python-format
 msgid "Copying <b>%s</b>"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/database2.py:3514
+#: /home/kovid/work/calibre/src/calibre/library/database2.py:3516
 msgid "Compacting database"
 msgstr ""
 
@@ -16198,7 +16273,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/library/server/ajax.py:295
 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:341
 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:625
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:579
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:582
 msgid "Newest"
 msgstr ""
 
@@ -16355,40 +16430,40 @@ msgstr ""
 msgid "The full interface gives you many more features, but it may not work well on a small screen"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:126
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:129
 #, python-format
 msgid "%d book"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:149
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:152
 #, python-format
 msgid "%d items"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:167
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:170
 #, python-format
 msgid "RATING: %s<br />"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:170
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:173
 #, python-format
 msgid "TAGS: %s<br />"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:175
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:178
 #, python-format
 msgid "SERIES: %(series)s [%(sidx)s]<br />"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:273
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:276
 msgid "Books in your library"
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:279
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:282
 msgid "By "
 msgstr ""
 
-#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:280
+#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:283
 msgid "Books sorted by "
 msgstr ""
 

From abe11a17dcaf51dd5fc5b5a21ba6b148676b0d4f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 14:19:22 +0530
Subject: [PATCH 09/37] Finished testing the KF8 chunker

---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 71 +++++++++++++++++++--
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index b8c90e0e36..04555467f3 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -13,6 +13,7 @@ from collections import namedtuple
 from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML_NS
+from calibre.constants import ispy3
 
 CHUNK_SIZE = 8192
 
@@ -48,6 +49,24 @@ def node_from_path(root, path):
         parent = parent[idx]
     return parent
 
+mychr = chr if ispy3 else unichr
+
+def tostring(raw, **kwargs):
+    ''' lxml *sometimes* represents non-ascii characters as hex entities in
+    attribute values. I can't figure out exactly what circumstances cause it.
+    It seems to happen when serializing a part of a larger tree. Since we need
+    serialization to be the same when serializing full and partial trees, we
+    manually replace all hex entities with their unicode codepoints. '''
+
+    xml_declaration = kwargs.pop('xml_declaration', False)
+    kwargs['encoding'] = unicode
+    kwargs['xml_declaration'] = False
+    ans = etree.tostring(raw, **kwargs)
+    if xml_declaration:
+        ans = '<?xml version="1.0" encoding="UTF-8"?>\n' + ans
+    return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:mychr(int(m.group(1), 16)),
+            ans)
+
 class Chunk(object):
 
     def __init__(self, raw):
@@ -63,6 +82,12 @@ class Chunk(object):
         self.raw += chunk.raw
         self.ends_tags = chunk.ends_tags
 
+    def __repr__(self):
+        return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%(
+                len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags)
+
+    __str__ = __repr__
+
 class Skeleton(object):
 
     def __init__(self, file_number, item, root, chunks):
@@ -76,8 +101,8 @@ class Skeleton(object):
         self.calculate_insert_positions()
 
     def render(self, root):
-        raw = etree.tostring(root, encoding='UTF-8', xml_declaration=True)
-        raw = raw.replace('<html', '<html xmlns="%s"'%XHTML_NS, 1)
+        raw = tostring(root, xml_declaration=True)
+        raw = raw.replace(b'<html', bytes('<html xmlns="%s"'%XHTML_NS), 1)
         return raw
 
     def calculate_metrics(self, root):
@@ -85,8 +110,7 @@ class Skeleton(object):
         self.metrics = {}
         for tag in root.xpath('//*[@aid]'):
             text = (tag.text or '').encode('utf-8')
-            raw = etree.tostring(tag, encoding='UTF-8', with_tail=True,
-                    xml_declaration=False)
+            raw = tostring(tag, with_tail=True)
             start_length = len(raw.partition(b'>')[0]) + len(text) + 1
             end_length = len(raw.rpartition(b'<')[-1]) + 1
             self.metrics[tag.get('aid')] = Metric(start_length, end_length)
@@ -101,6 +125,13 @@ class Skeleton(object):
             for tag in chunk.ends_tags:
                 pos += self.metrics[tag].end
 
+    def rebuild(self):
+        ans = self.skeleton
+        for chunk in self.chunks:
+            i = chunk.insert_pos
+            ans = ans[:i] + chunk.raw + ans[i:]
+        return ans
+
 class Chunker(object):
 
     def __init__(self, oeb, data_func):
@@ -109,10 +140,20 @@ class Chunker(object):
 
         self.skeletons = []
 
+        # Set this to a list to enable dumping of the original and rebuilt
+        # html files for debugging
+        self.orig_dumps = []
+
         for i, item in enumerate(self.oeb.spine):
             root = self.remove_namespaces(self.data(item))
             body = root.xpath('//body')[0]
             body.tail = '\n'
+            if self.orig_dumps is not None:
+                self.orig_dumps.append(tostring(root, xml_declaration=True,
+                    with_tail=True))
+                self.orig_dumps[-1] = close_self_closing_tags(
+                        self.orig_dumps[-1].replace(b'<html',
+                        bytes('<html xmlns="%s"'%XHTML_NS), 1))
 
             # First pass: break up document into rendered strings of length no
             # more than CHUNK_SIZE
@@ -128,6 +169,9 @@ class Chunker(object):
             # for all chunks
             self.skeletons.append(Skeleton(i, item, root, chunks))
 
+        if self.orig_dumps:
+            self.dump()
+
     def remove_namespaces(self, root):
         lang = None
         for attr, val in root.attrib.iteritems():
@@ -173,8 +217,7 @@ class Chunker(object):
 
         # Now loop over children
         for child in list(tag):
-            raw = etree.tostring(child, encoding='UTF-8',
-                    xml_declaration=False, with_tail=False)
+            raw = tostring(child, with_tail=False)
             raw = close_self_closing_tags(raw)
             if len(raw) > CHUNK_SIZE and child.get('aid', None):
                 self.step_into_tag(child, chunks)
@@ -230,3 +273,19 @@ class Chunker(object):
                 prev.merge(chunk)
         return ans
 
+    def dump(self):
+        import tempfile, shutil, os
+        tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
+        self.log('Skeletons dumped to:', tdir)
+        if os.path.exists(tdir):
+            shutil.rmtree(tdir)
+        orig = os.path.join(tdir, 'orig')
+        rebuilt = os.path.join(tdir, 'rebuilt')
+        for x in (orig, rebuilt):
+            os.makedirs(x)
+        for i, skeleton in enumerate(self.skeletons):
+            with open(os.path.join(orig, '%04d.html'%i),  'wb') as f:
+                f.write(self.orig_dumps[i])
+            with open(os.path.join(rebuilt, '%04d.html'%i),  'wb') as f:
+                f.write(skeleton.rebuild())
+

From 819b76f6575716ab35b43bf70af3b6ef42f8af93 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 14:20:09 +0530
Subject: [PATCH 10/37] ...

---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 04555467f3..d369e36b9d 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -65,7 +65,7 @@ def tostring(raw, **kwargs):
     if xml_declaration:
         ans = '<?xml version="1.0" encoding="UTF-8"?>\n' + ans
     return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:mychr(int(m.group(1), 16)),
-            ans)
+            ans).encode('utf-8')
 
 class Chunk(object):
 

From 67d93b84d69d093da5b62dc6546b624d281f77ec Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 14:22:42 +0530
Subject: [PATCH 11/37] ...

---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index d369e36b9d..4e83791962 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -59,13 +59,14 @@ def tostring(raw, **kwargs):
     manually replace all hex entities with their unicode codepoints. '''
 
     xml_declaration = kwargs.pop('xml_declaration', False)
+    encoding = kwargs.pop('encoding', 'UTF-8')
     kwargs['encoding'] = unicode
     kwargs['xml_declaration'] = False
     ans = etree.tostring(raw, **kwargs)
     if xml_declaration:
-        ans = '<?xml version="1.0" encoding="UTF-8"?>\n' + ans
+        ans = '<?xml version="1.0" encoding="%s"?>\n'%encoding + ans
     return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:mychr(int(m.group(1), 16)),
-            ans).encode('utf-8')
+            ans).encode(encoding)
 
 class Chunk(object):
 

From f2b734a12095a707d697a508524a80a834a8315f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 14:25:03 +0530
Subject: [PATCH 12/37] ...

---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 4e83791962..f28fbb8588 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -143,7 +143,7 @@ class Chunker(object):
 
         # Set this to a list to enable dumping of the original and rebuilt
         # html files for debugging
-        self.orig_dumps = []
+        self.orig_dumps = None
 
         for i, item in enumerate(self.oeb.spine):
             root = self.remove_namespaces(self.data(item))

From 85d357bd47583c13bf849f8008a34699edcf6891 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 14:25:33 +0530
Subject: [PATCH 13/37] ...

---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index f28fbb8588..201d2b63d4 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -149,6 +149,7 @@ class Chunker(object):
             root = self.remove_namespaces(self.data(item))
             body = root.xpath('//body')[0]
             body.tail = '\n'
+
             if self.orig_dumps is not None:
                 self.orig_dumps.append(tostring(root, xml_declaration=True,
                     with_tail=True))

From 4b93ebc99068aa5493dcce549f84a9ee9f094488 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 14:41:53 +0530
Subject: [PATCH 14/37] Fix #986070 (Typos in tooltip messages in trunk)

---
 src/calibre/gui2/convert/search_and_replace.ui | 4 ++--
 src/calibre/translations/calibre.pot           | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/gui2/convert/search_and_replace.ui b/src/calibre/gui2/convert/search_and_replace.ui
index 850f59885e..2497855abd 100644
--- a/src/calibre/gui2/convert/search_and_replace.ui
+++ b/src/calibre/gui2/convert/search_and_replace.ui
@@ -137,7 +137,7 @@
      <item>
       <widget class="QPushButton" name="sr_load">
        <property name="toolTip">
-        <string>Load a listof expression from a previously saved file</string>
+        <string>Load a list of expressions from a previously saved file</string>
        </property>
        <property name="text">
         <string>&amp;Load</string>
@@ -147,7 +147,7 @@
      <item>
       <widget class="QPushButton" name="sr_save">
        <property name="toolTip">
-        <string>Save this list of expression so that you can re-use it easily</string>
+        <string>Save this list of expressions so that you can re-use it easily</string>
        </property>
        <property name="text">
         <string>&amp;Save</string>
diff --git a/src/calibre/translations/calibre.pot b/src/calibre/translations/calibre.pot
index 5b0c096104..aecd4e2625 100644
--- a/src/calibre/translations/calibre.pot
+++ b/src/calibre/translations/calibre.pot
@@ -5,8 +5,8 @@
 msgid ""
 msgstr ""
 "Project-Id-Version: calibre 0.8.48\n"
-"POT-Creation-Date: 2012-04-20 08:19+IST\n"
-"PO-Revision-Date: 2012-04-20 08:19+IST\n"
+"POT-Creation-Date: 2012-04-20 14:41+IST\n"
+"PO-Revision-Date: 2012-04-20 14:41+IST\n"
 "Last-Translator: Automatically generated\n"
 "Language-Team: LANGUAGE\n"
 "MIME-Version: 1.0\n"

From 13abe2bb6efb537bd2b5d404a7eda1c81ce80b1f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 18:49:22 +0530
Subject: [PATCH 15/37] KF8 Output: Text processing layer is complete

---
 src/calibre/ebooks/mobi/utils.py            |  51 ++++++++
 src/calibre/ebooks/mobi/writer2/__init__.py |   1 -
 src/calibre/ebooks/mobi/writer2/indexer.py  |   3 +-
 src/calibre/ebooks/mobi/writer2/main.py     |  61 +--------
 src/calibre/ebooks/mobi/writer8/main.py     |  25 +++-
 src/calibre/ebooks/mobi/writer8/skeleton.py | 136 ++++++++++++++++++--
 6 files changed, 201 insertions(+), 76 deletions(-)

diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index 0ae992f438..fe5cd7eaf2 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -14,6 +14,7 @@ from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
 from calibre.ebooks import normalize
 
 IMAGE_MAX_SIZE = 10 * 1024 * 1024
+RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
 
 def decode_string(raw, codec='utf-8', ordt_map=''):
     length, = struct.unpack(b'>B', raw[0])
@@ -498,3 +499,53 @@ def write_font_record(data, obfuscate=True, compress=True):
 
 # }}}
 
+def create_text_record(text):
+    '''
+    Read the next Palmdoc text record (at most RECORD_SIZE bytes) from the
+    seekable file object text, starting at its current position. Returns
+    (data, overlap), both byte strings: overlap holds the extra bytes, read
+    from the following record, that complete a multibyte UTF-8 character cut
+    off at the end of data (empty if none). On return, text is positioned
+    at the start of the next record, so the overlap bytes are read again.
+    '''
+    opos = text.tell()
+    text.seek(0, 2)
+    # npos is the position of the next record (capped at the end of the file)
+    npos = min((opos + RECORD_SIZE, text.tell()))
+    # Number of bytes from the next record needed to complete the last
+    # character in this record
+    extra = 0
+
+    last = b''
+    while not last.decode('utf-8', 'ignore'):
+        # last contains no valid utf-8 characters, so grow it backwards by one byte
+        size = len(last) + 1
+        text.seek(npos - size)
+        last = text.read(size)
+
+    # last now has one valid utf-8 char and possibly some bytes that belong
+    # to a truncated char
+
+    try:
+        last.decode('utf-8', 'strict')
+    except UnicodeDecodeError:
+        # There are some truncated bytes in last; extend it forwards until it decodes
+        prev = len(last)
+        while True:
+            text.seek(npos - prev)
+            last = text.read(len(last) + 1)
+            try:
+                last.decode('utf-8')
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        extra = len(last) - prev
+
+    text.seek(opos)
+    data = text.read(RECORD_SIZE)
+    overlap = text.read(extra)
+    text.seek(npos)
+
+    return data, overlap
+
diff --git a/src/calibre/ebooks/mobi/writer2/__init__.py b/src/calibre/ebooks/mobi/writer2/__init__.py
index bc8dbbf7de..df3dcefb94 100644
--- a/src/calibre/ebooks/mobi/writer2/__init__.py
+++ b/src/calibre/ebooks/mobi/writer2/__init__.py
@@ -12,5 +12,4 @@ UNCOMPRESSED = 1
 PALMDOC = 2
 HUFFDIC = 17480
 PALM_MAX_IMAGE_SIZE = 63 * 1024
-RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
 
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index e349172d95..134fbadc60 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -12,9 +12,8 @@ from struct import pack
 from cStringIO import StringIO
 from collections import OrderedDict, defaultdict
 
-from calibre.ebooks.mobi.writer2 import RECORD_SIZE
 from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
-        encode_tbs, align_block, utf8_text)
+        encode_tbs, align_block, utf8_text, RECORD_SIZE)
 
 class CNCX(object): # {{{
 
diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py
index b7a0d76424..c930609489 100644
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@@ -16,9 +16,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer
 from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
+from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
 from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
-        align_block, detect_periodical)
+        align_block, detect_periodical, RECORD_SIZE, create_text_record)
 from calibre.ebooks.mobi.writer2.indexer import Indexer
 
 EXTH_CODES = {
@@ -163,9 +163,7 @@ class MobiWriter(object):
 
     # }}}
 
-    # Text {{{
-
-    def generate_text(self):
+    def generate_text(self): # {{{
         self.oeb.logger.info('Serializing markup content...')
         self.serializer = Serializer(self.oeb, self.image_map,
                 self.is_periodical,
@@ -180,7 +178,7 @@ class MobiWriter(object):
             self.oeb.logger.info('  Compressing markup content...')
 
         while text.tell() < self.text_length:
-            data, overlap = self.read_text_record(text)
+            data, overlap = create_text_record(text)
             if self.compression == PALMDOC:
                 data = compress_doc(data)
 
@@ -197,57 +195,6 @@ class MobiWriter(object):
         if records_size % 4 != 0:
             self.records.append(b'\x00'*(records_size % 4))
             self.first_non_text_record_idx += 1
-
-    def read_text_record(self, text):
-        '''
-        Return a Palmdoc record of size RECORD_SIZE from the text file object.
-        In case the record ends in the middle of a multibyte character return
-        the overlap as well.
-
-        Returns data, overlap: where both are byte strings. overlap is the
-        extra bytes needed to complete the truncated multibyte character.
-        '''
-        opos = text.tell()
-        text.seek(0, 2)
-        # npos is the position of the next record
-        npos = min((opos + RECORD_SIZE, text.tell()))
-        # Number of bytes from the next record needed to complete the last
-        # character in this record
-        extra = 0
-
-        last = b''
-        while not last.decode('utf-8', 'ignore'):
-            # last contains no valid utf-8 characters
-            size = len(last) + 1
-            text.seek(npos - size)
-            last = text.read(size)
-
-        # last now has one valid utf-8 char and possibly some bytes that belong
-        # to a truncated char
-
-        try:
-            last.decode('utf-8', 'strict')
-        except UnicodeDecodeError:
-            # There are some truncated bytes in last
-            prev = len(last)
-            while True:
-                text.seek(npos - prev)
-                last = text.read(len(last) + 1)
-                try:
-                    last.decode('utf-8')
-                except UnicodeDecodeError:
-                    pass
-                else:
-                    break
-            extra = len(last) - prev
-
-        text.seek(opos)
-        data = text.read(RECORD_SIZE)
-        overlap = text.read(extra)
-        text.seek(npos)
-
-        return data, overlap
-
     # }}}
 
     def generate_record0(self): #  MOBI header {{{
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index 79ff7c3d96..b924a4df7c 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -19,15 +19,13 @@ from calibre.ebooks.mobi.utils import to_base
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
         extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
-from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags
+from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
 
 XML_DOCS = OEB_DOCS | {SVG_MIME}
 
 # References to record numbers in KF8 are stored as base-32 encoded integers,
 # with 4 digits
 to_ref = partial(to_base, base=32, min_num_digits=4)
-# References in links are stored with 10 digits
-to_href = partial(to_base, base=32, min_num_digits=10)
 
 class KF8Writer(object):
 
@@ -167,7 +165,7 @@ class KF8Writer(object):
         self.link_map = {}
         count = 0
         hrefs = {item.href for item in self.oeb.spine}
-        for item in self.oeb.spine:
+        for i, item in enumerate(self.oeb.spine):
             root = self.data(item)
 
             for a in XPath('//h:a[@href]')(root):
@@ -176,7 +174,8 @@ class KF8Writer(object):
                 href, _, frag = ref.partition('#')
                 href = urlnormalize(href)
                 if href in hrefs:
-                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
+                    placeholder = 'kindle:pos:fid:%04d:off:%s'%(i,
+                            to_href(count))
                     self.link_map[placeholder] = (href, frag)
                     a.set('href', placeholder)
 
@@ -199,7 +198,19 @@ class KF8Writer(object):
                     j += 1
 
     def chunk_it_up(self):
-        chunker = Chunker(self.oeb, self.data)
-        chunker
+        placeholder_map = {}
+        for placeholder, x in self.link_map.iteritems():
+            href, frag = x
+            aid = self.id_map.get(x, None)
+            if aid is None:
+                aid = self.id_map.get((href, ''))
+            placeholder_map[placeholder] = aid
+        chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress,
+                placeholder_map)
+
+        for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records',
+                'last_text_record_idx', 'first_non_text_record_idx',
+                'text_length'):
+            setattr(self, x, getattr(chunker, x))
 
 
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 201d2b63d4..da3b9407bd 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -9,14 +9,22 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from collections import namedtuple
+from io import BytesIO
+from struct import pack
+from functools import partial
 
 from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.constants import ispy3
+from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.compression.palmdoc import compress_doc
 
 CHUNK_SIZE = 8192
 
+# References in links are stored with 10 digits
+to_href = partial(to_base, base=32, min_num_digits=10)
+
 # Tags to which kindlegen adds the aid attribute
 aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
 'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
@@ -70,11 +78,15 @@ def tostring(raw, **kwargs):
 
 class Chunk(object):
 
-    def __init__(self, raw):
+    def __init__(self, raw, parent_tag):
         self.raw = raw
         self.starts_tags = []
         self.ends_tags = []
         self.insert_pos = None
+        self.parent_tag = parent_tag
+        self.parent_is_body = False
+        self.is_last_chunk = False
+        self.is_first_chunk = False
 
     def __len__(self):
         return len(self.raw)
@@ -87,6 +99,11 @@ class Chunk(object):
         return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%(
                 len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags)
 
+    @property
+    def selector(self):
+        typ = 'S' if (self.is_last_chunk and not self.parent_is_body) else 'P'
+        return "%s-//*[@aid='%s']"%(typ, self.parent_tag)
+
     __str__ = __repr__
 
 class Skeleton(object):
@@ -133,11 +150,20 @@ class Skeleton(object):
             ans = ans[:i] + chunk.raw + ans[i:]
         return ans
 
+    def __len__(self):
+        return len(self.skeleton) + sum([len(x.raw) for x in self.chunks])
+
+    @property
+    def raw_text(self):
+        return b''.join([self.skeleton] + [x.raw for x in self.chunks])
+
 class Chunker(object):
 
-    def __init__(self, oeb, data_func):
+    def __init__(self, oeb, data_func, compress, placeholder_map):
         self.oeb, self.log = oeb, oeb.log
         self.data = data_func
+        self.compress = compress
+        self.placeholder_map = placeholder_map
 
         self.skeletons = []
 
@@ -174,6 +200,19 @@ class Chunker(object):
         if self.orig_dumps:
             self.dump()
 
+        # Create the SKEL and Chunk tables
+        self.skel_table = []
+        self.chunk_table = []
+        self.create_tables()
+
+        # Set internal links
+        text = b''.join(x.raw_text for x in self.skeletons)
+        text = self.set_internal_links(text)
+
+        # Create text records
+        self.records = []
+        self.create_text_records(text)
+
     def remove_namespaces(self, root):
         lang = None
         for attr, val in root.attrib.iteritems():
@@ -206,15 +245,15 @@ class Chunker(object):
 
         return nroot
 
-
     def step_into_tag(self, tag, chunks):
         aid = tag.get('aid')
+        is_body = tag.tag == 'body'
 
         first_chunk_idx = len(chunks)
 
         # First handle any text
         if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
-            chunks.extend(self.chunk_up_text(tag.text))
+            chunks.extend(self.chunk_up_text(tag.text, aid))
             tag.text = None
 
         # Now loop over children
@@ -224,15 +263,15 @@ class Chunker(object):
             if len(raw) > CHUNK_SIZE and child.get('aid', None):
                 self.step_into_tag(child, chunks)
                 if child.tail and child.tail.strip(): # Leave pure whitespace
-                    chunks.extend(self.chunk_up_text(child.tail))
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
                     child.tail = None
             else:
                 if len(raw) > CHUNK_SIZE:
                     self.log.warn('Tag %s has no aid and a too large chunk'
                             ' size. Adding anyway.'%child.tag)
-                chunks.append(Chunk(raw))
+                chunks.append(Chunk(raw, aid))
                 if child.tail:
-                    chunks.extend(self.chunk_up_text(child.tail))
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
                 tag.remove(child)
 
         if len(chunks) <= first_chunk_idx and chunks:
@@ -242,8 +281,15 @@ class Chunker(object):
         if chunks:
             chunks[first_chunk_idx].starts_tags.append(aid)
             chunks[-1].ends_tags.append(aid)
+            my_chunks = chunks[first_chunk_idx:]
+            if my_chunks:
+                my_chunks[0].is_first_chunk = True
+                my_chunks[-1].is_last_chunk = True
+                if is_body:
+                    for chunk in my_chunks:
+                        chunk.parent_is_body = True
 
-    def chunk_up_text(self, text):
+    def chunk_up_text(self, text, parent_tag):
         text = text.encode('utf-8')
         ans = []
 
@@ -259,7 +305,7 @@ class Chunker(object):
         while rest:
             start, rest = split_multibyte_text(rest)
             ans.append(b'<span class="AmznBigTextBlock">' + start + '</span>')
-        return [Chunk(x) for x in ans]
+        return [Chunk(x, parent_tag) for x in ans]
 
     def merge_small_chunks(self, chunks):
         ans = chunks[:1]
@@ -275,6 +321,77 @@ class Chunker(object):
                 prev.merge(chunk)
         return ans
 
+    def create_tables(self):
+        ''' Fill self.skel_table and self.chunk_table from self.skeletons. '''
+        Skel = namedtuple('Skel', 'file_number name chunk_count start_pos length')
+        sp = 0
+        for s in self.skeletons:
+            s.start_pos = sp
+            sp += len(s)
+        self.skel_table = [Skel(s.file_number, 'SKEL%010d'%s.file_number,
+            len(s.chunks), s.start_pos, len(s.skeleton)) for s in self.skeletons]
+
+        Chunk = namedtuple('Chunk',
+            'insert_pos selector file_number sequence_number start_pos length')
+        num = cp = 0
+        for skel in self.skeletons:
+            cp = skel.start_pos
+            for chunk in skel.chunks:
+                self.chunk_table.append(
+                    Chunk(chunk.insert_pos + skel.start_pos, chunk.selector,
+                        skel.file_number, num, cp, len(chunk.raw)))
+                cp += len(chunk.raw)
+                num += 1
+
+    def set_internal_links(self, text):
+        # First find the start pos of all tags with aids
+        aid_map = {}
+        for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
+            aid_map[match.group(1)] = match.start()
+        self.aid_offset_map = aid_map
+        placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in
+                self.placeholder_map.iteritems()}
+
+        # Now update the links
+        def sub(match):
+            raw = match.group()
+            pl = match.group(1)
+            try:
+                return raw[:-10] + placeholder_map[pl]
+            except KeyError:
+                pass
+            return raw
+
+        return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
+
+    def create_text_records(self, text):
+        self.text_length = len(text)
+        text = BytesIO(text)
+        nrecords = 0
+        records_size = 0
+
+        if self.compress:
+            self.oeb.logger.info('  Compressing markup content...')
+
+        while text.tell() < self.text_length:
+            data, overlap = create_text_record(text)
+            if self.compress:
+                data = compress_doc(data)
+
+            data += overlap
+            data += pack(b'>B', len(overlap))
+
+            self.records.append(data)
+            records_size += len(data)
+            nrecords += 1
+
+        self.last_text_record_idx = nrecords
+        self.first_non_text_record_idx = nrecords + 1
+        # Pad so that the next records starts at a 4 byte boundary
+        if records_size % 4 != 0:
+            self.records.append(b'\x00'*(records_size % 4))
+            self.first_non_text_record_idx += 1
+
     def dump(self):
         import tempfile, shutil, os
         tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
@@ -291,3 +408,4 @@ class Chunker(object):
             with open(os.path.join(rebuilt, '%04d.html'%i),  'wb') as f:
                 f.write(skeleton.rebuild())
 
+

From a6efef3d3159348665323cda2f9e8c9bffd2d990 Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Fri, 20 Apr 2012 21:52:57 +0800
Subject: [PATCH 16/37] removed dash unwrap regression from bug #822744

---
 src/calibre/ebooks/conversion/utils.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 26b800f55b..e2a02702df 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -322,7 +322,6 @@ class HeuristicProcessor(object):
         lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
         soft_hyphen = u"\xad"
-        dash = u"\x2d" # some ocrs doesn't convert dashes to hyphens
         line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
         line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@@ -331,23 +330,19 @@ class HeuristicProcessor(object):
         unwrap_regex = lookahead+line_ending+blanklines+line_opening
         em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
         shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
-        dash_unwrap_regex = dash+line_ending+blanklines+line_opening
 
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
             em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
             shy_unwrap_regex = soft_hyphen+txt_line_wrap
-            dash_unwrap_regex = dash+txt_line_wrap
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
         em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
         shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
-        dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
 
         content = unwrap.sub(' ', content)
         content = em_en_unwrap.sub('', content)
         content = shy_unwrap.sub('', content)
-        content = dash_unwrap.sub('', content)
         return content
 
     def txt_process(self, match):

From 7800024bac39d901c575f4369dd4528691faaf90 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 20:04:13 +0530
Subject: [PATCH 17/37] ...

---
 src/calibre/ebooks/mobi/writer8/main.py     | 61 ++++++++++++++++++---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 42 +-------------
 2 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index b924a4df7c..d8ef501eb6 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -9,13 +9,16 @@ __docformat__ = 'restructuredtext en'
 
 import copy
 from functools import partial
-from collections import defaultdict
+from collections import defaultdict, namedtuple
+from io import BytesIO
+from struct import pack
 
 import cssutils
 from lxml import etree
 
 from calibre import isbytestring, force_unicode
-from calibre.ebooks.mobi.utils import to_base
+from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
         extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
@@ -31,11 +34,14 @@ class KF8Writer(object):
 
     def __init__(self, oeb, opts, resources):
         self.oeb, self.opts, self.log = oeb, opts, oeb.log
+        self.compress = not self.opts.dont_compress
         self.log.info('Creating KF8 output')
         self.used_images = set()
         self.resources = resources
         self.dup_data()
         self.flows = [None] # First flow item is reserved for the text
+        self.records = []
+        self.fdst_table = []
 
         self.replace_resource_links()
         self.extract_css_into_flows()
@@ -43,6 +49,10 @@ class KF8Writer(object):
         self.replace_internal_links_with_placeholders()
         self.insert_aid_attributes()
         self.chunk_it_up()
+        # Dump the cloned data as it is no longer needed
+        del self._data_cache
+        self.create_text_records()
+        self.create_fdst_table()
 
     def dup_data(self):
         ''' Duplicate data so that any changes we make to markup/CSS only
@@ -205,12 +215,49 @@ class KF8Writer(object):
             if aid is None:
                 aid = self.id_map.get((href, ''))
             placeholder_map[placeholder] = aid
-        chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress,
-                placeholder_map)
+        chunker = Chunker(self.oeb, self.data, placeholder_map)
 
-        for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records',
-                'last_text_record_idx', 'first_non_text_record_idx',
-                'text_length'):
+        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
             setattr(self, x, getattr(chunker, x))
 
+        self.flows[0] = chunker.text
+
+    def create_text_records(self):
+        self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x
+                in self.flows]
+        text = b''.join(self.flows)
+        self.text_length = len(text)
+        text = BytesIO(text)
+        nrecords = 0
+        records_size = 0
+
+        if self.compress:
+            self.oeb.logger.info('  Compressing markup content...')
+
+        while text.tell() < self.text_length:
+            data, overlap = create_text_record(text)
+            if self.compress:
+                data = compress_doc(data)
+
+            data += overlap
+            data += pack(b'>B', len(overlap))
+
+            self.records.append(data)
+            records_size += len(data)
+            nrecords += 1
+
+        self.last_text_record_idx = nrecords
+        self.first_non_text_record_idx = nrecords + 1
+        # Pad so that the next record starts at a 4 byte boundary
+        if records_size % 4 != 0:
+            self.records.append(b'\x00'*(records_size % 4))
+            self.first_non_text_record_idx += 1
+
+    def create_fdst_table(self):
+        FDST = namedtuple('Flow', 'start end')
+        for i, flow in enumerate(self.flows):
+            start = 0 if i == 0 else self.fdst_table[-1].end
+            self.fdst_table.append(FDST(start, start + len(flow)))
+
+
 
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index da3b9407bd..eff03c9de4 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -9,16 +9,13 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from collections import namedtuple
-from io import BytesIO
-from struct import pack
 from functools import partial
 
 from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.constants import ispy3
-from calibre.ebooks.mobi.utils import create_text_record, to_base
-from calibre.ebooks.compression.palmdoc import compress_doc
+from calibre.ebooks.mobi.utils import to_base
 
 CHUNK_SIZE = 8192
 
@@ -159,10 +156,9 @@ class Skeleton(object):
 
 class Chunker(object):
 
-    def __init__(self, oeb, data_func, compress, placeholder_map):
+    def __init__(self, oeb, data_func, placeholder_map):
         self.oeb, self.log = oeb, oeb.log
         self.data = data_func
-        self.compress = compress
         self.placeholder_map = placeholder_map
 
         self.skeletons = []
@@ -207,11 +203,7 @@ class Chunker(object):
 
         # Set internal links
         text = b''.join(x.raw_text for x in self.skeletons)
-        text = self.set_internal_links(text)
-
-        # Create text records
-        self.records = []
-        self.create_text_records(text)
+        self.text = self.set_internal_links(text)
 
     def remove_namespaces(self, root):
         lang = None
@@ -364,34 +356,6 @@ class Chunker(object):
 
         return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
 
-    def create_text_records(self, text):
-        self.text_length = len(text)
-        text = BytesIO(text)
-        nrecords = 0
-        records_size = 0
-
-        if self.compress:
-            self.oeb.logger.info('  Compressing markup content...')
-
-        while text.tell() < self.text_length:
-            data, overlap = create_text_record(text)
-            if self.compress:
-                data = compress_doc(data)
-
-            data += overlap
-            data += pack(b'>B', len(overlap))
-
-            self.records.append(data)
-            records_size += len(data)
-            nrecords += 1
-
-        self.last_text_record_idx = nrecords
-        self.first_non_text_record_idx = nrecords + 1
-        # Pad so that the next records starts at a 4 byte boundary
-        if records_size % 4 != 0:
-            self.records.append(b'\x00'*(records_size % 4))
-            self.first_non_text_record_idx += 1
-
     def dump(self):
         import tempfile, shutil, os
         tdir = os.path.join(tempfile.gettempdir(), 'skeleton')

From fbcd3eb279b8acd0d450d42069827fda8d9c9d0f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 20:11:12 +0530
Subject: [PATCH 18/37] ...

---
 recipes/tpm_uk.recipe                   | 2 ++
 src/calibre/ebooks/mobi/writer8/main.py | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes/tpm_uk.recipe b/recipes/tpm_uk.recipe
index aa042de951..0ccad32fa9 100644
--- a/recipes/tpm_uk.recipe
+++ b/recipes/tpm_uk.recipe
@@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
     __author__            = 'Darko Miletic'
     description           = 'Title says it all'
     publisher             = "The Philosophers' Magazine"
+    recipe_disabled = ('This recipe has been disabled as the website has'
+            ' started providing articles only in PDF form')
     category              = 'philosophy, news'
     oldest_article        = 25
     max_articles_per_feed = 200
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index d8ef501eb6..c9334b22a3 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -259,5 +259,3 @@ class KF8Writer(object):
             start = 0 if i == 0 else self.fdst_table[-1].end
             self.fdst_table.append(FDST(start, start + len(flow)))
 
-
-

From 03ed4010f58ebb8499d0b9d49ecc5c275214a3d4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 20:49:23 +0530
Subject: [PATCH 19/37] ...

---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index eff03c9de4..4b39d0cb15 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -165,7 +165,7 @@ class Chunker(object):
 
         # Set this to a list to enable dumping of the original and rebuilt
         # html files for debugging
-        self.orig_dumps = None
+        orig_dumps = None
 
         for i, item in enumerate(self.oeb.spine):
             root = self.remove_namespaces(self.data(item))
@@ -193,8 +193,8 @@ class Chunker(object):
             # for all chunks
             self.skeletons.append(Skeleton(i, item, root, chunks))
 
-        if self.orig_dumps:
-            self.dump()
+        if orig_dumps:
+            self.dump(orig_dumps)
 
         # Create the SKEL and Chunk tables
         self.skel_table = []
@@ -356,7 +356,7 @@ class Chunker(object):
 
         return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
 
-    def dump(self):
+    def dump(self, orig_dumps):
         import tempfile, shutil, os
         tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
         self.log('Skeletons dumped to:', tdir)
@@ -368,7 +368,7 @@ class Chunker(object):
             os.makedirs(x)
         for i, skeleton in enumerate(self.skeletons):
             with open(os.path.join(orig, '%04d.html'%i),  'wb') as f:
-                f.write(self.orig_dumps[i])
+                f.write(orig_dumps[i])
             with open(os.path.join(rebuilt, '%04d.html'%i),  'wb') as f:
                 f.write(skeleton.rebuild())
 

From 8d44e8d83f4b7f84051463117cef5cfcfdad5252 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 21:08:27 +0530
Subject: [PATCH 20/37] ...

---
 src/calibre/ebooks/mobi/writer8/main.py     | 9 ++++-----
 src/calibre/ebooks/mobi/writer8/skeleton.py | 9 +++++++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index c9334b22a3..430d695fd1 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -175,7 +175,7 @@ class KF8Writer(object):
         self.link_map = {}
         count = 0
         hrefs = {item.href for item in self.oeb.spine}
-        for i, item in enumerate(self.oeb.spine):
+        for item in self.oeb.spine:
             root = self.data(item)
 
             for a in XPath('//h:a[@href]')(root):
@@ -184,8 +184,7 @@ class KF8Writer(object):
                 href, _, frag = ref.partition('#')
                 href = urlnormalize(href)
                 if href in hrefs:
-                    placeholder = 'kindle:pos:fid:%04d:off:%s'%(i,
-                            to_href(count))
+                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
                     self.link_map[placeholder] = (href, frag)
                     a.set('href', placeholder)
 
@@ -201,9 +200,9 @@ class KF8Writer(object):
                     aid = aidbase + j
                     tag.attrib['aid'] = to_base(aid, base=32)
                     if tag.tag == XHTML('body'):
-                        self.id_map[(item.href, '')] = tag.attrib['aid']
+                        self.id_map[(item.href, '')] = (i, tag.attrib['aid'])
                     if id_ is not None:
-                        self.id_map[(item.href, id_)] = tag.attrib['aid']
+                        self.id_map[(item.href, id_)] = (i, tag.attrib['aid'])
 
                     j += 1
 
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 4b39d0cb15..494aa30def 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -341,7 +341,12 @@ class Chunker(object):
         for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
             aid_map[match.group(1)] = match.start()
         self.aid_offset_map = aid_map
-        placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in
+
+        def to_placeholder(x):
+            file_number, aid = x
+            return bytes('%04d:%s'%(file_number, to_href(aid_map[aid])))
+
+        placeholder_map = {bytes(k):to_placeholder(v) for k, v in
                 self.placeholder_map.iteritems()}
 
         # Now update the links
@@ -349,7 +354,7 @@ class Chunker(object):
             raw = match.group()
             pl = match.group(1)
             try:
-                return raw[:-10] + placeholder_map[pl]
+                return raw[:-15] + placeholder_map[pl]
             except KeyError:
                 pass
             return raw

From 6c631e0e64ce2ce7604367ebed60457d51924af2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 21:15:12 +0530
Subject: [PATCH 21/37] ...

---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 494aa30def..d04f119316 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -371,10 +371,19 @@ class Chunker(object):
         rebuilt = os.path.join(tdir, 'rebuilt')
         for x in (orig, rebuilt):
             os.makedirs(x)
+        error = False
         for i, skeleton in enumerate(self.skeletons):
+            oraw, rraw = orig_dumps[i], skeleton.rebuild()
             with open(os.path.join(orig, '%04d.html'%i),  'wb') as f:
-                f.write(orig_dumps[i])
+                f.write(oraw)
             with open(os.path.join(rebuilt, '%04d.html'%i),  'wb') as f:
-                f.write(skeleton.rebuild())
+                f.write(rraw)
+            if oraw != rraw:
+                error = True
+        if error:
+            raise ValueError('The before and after HTML differs. Run a diff '
+                    'tool on the orig and rebuilt directories')
+        else:
+            self.log('Skeleton HTML before and after is identical.')
 
 

From 9217e6bed381211842f56d25c9ed4957324a2b7e Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sat, 21 Apr 2012 00:24:32 +0800
Subject: [PATCH 22/37] remove full stop punctuation from the line unwrap
 heuristic

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index e2a02702df..24528d1fb8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -319,7 +319,7 @@ class HeuristicProcessor(object):
         '''
         # define the pieces of the regex
 
-        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
         soft_hyphen = u"\xad"
         line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"

From 740c812de24e38120b33ba7d094ad288bc7cf234 Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sat, 21 Apr 2012 00:52:13 +0800
Subject: [PATCH 23/37] expanded comments

---
 src/calibre/ebooks/conversion/utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 24528d1fb8..6dc3973213 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -316,9 +316,17 @@ class HeuristicProcessor(object):
         '''
         Unwraps lines based on line length and punctuation
         supports a range of html markup and text files
+        
+        the lookahead regex below is meant to look for any non-full stop characters - punctuation
+        characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
+        the reason for this is to prevent false positive wrapping.  False positives are more
+        difficult to detect than false negatives during a manual review of the doc
+        
+        This function intentionally leaves hyphenated content alone as that is handled by the 
+        dehyphenate routine in a future step
         '''
-        # define the pieces of the regex
 
+        # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
         soft_hyphen = u"\xad"

From c5c3354b338ac0bfc3fb1329923179638c02f7b1 Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sat, 21 Apr 2012 00:54:35 +0800
Subject: [PATCH 24/37] ...

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 6dc3973213..acfa80e877 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -323,7 +323,7 @@ class HeuristicProcessor(object):
         difficult to detect than false negatives during a manual review of the doc
         
         This function intentionally leaves hyphenated content alone as that is handled by the 
-        dehyphenate routine in a future step
+        dehyphenate routine in a separate step
         '''
 
         # define the pieces of the regex

From b717749138e144155edc86c7d61ff8c1413e7d9a Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sat, 21 Apr 2012 00:59:30 +0800
Subject: [PATCH 25/37] fix the pattern in preprocess

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c526cba8a9..16acaad383 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -559,7 +559,7 @@ class HTMLPreProcessor(object):
                 end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:

From 081897ae5723958830db099240dd461c521b822f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 22:39:32 +0530
Subject: [PATCH 26/37] KF8 Output: Start work on the index layer

---
 src/calibre/ebooks/mobi/utils.py           | 46 +++++++++++++
 src/calibre/ebooks/mobi/writer2/indexer.py | 49 +++-----------
 src/calibre/ebooks/mobi/writer8/index.py   | 78 ++++++++++++++++++++++
 3 files changed, 132 insertions(+), 41 deletions(-)
 create mode 100644 src/calibre/ebooks/mobi/writer8/index.py

diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index fe5cd7eaf2..319af30f86 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
 
 import struct, string, imghdr, zlib, os
 from collections import OrderedDict
+from io import BytesIO
 
 from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
 from calibre.ebooks import normalize
@@ -549,3 +550,48 @@ def create_text_record(text):
 
     return data, overlap
 
+class CNCX(object): # {{{
+
+    '''
+    Create the CNCX records. These are records containing all the strings from
+    an index. Each record is of the form: <vwi string size><utf-8 encoded
+    string>
+    '''
+
+    MAX_STRING_LENGTH = 500
+
+    def __init__(self, strings=()):
+        self.strings = OrderedDict((s, 0) for s in strings)
+
+        self.records = []
+        offset = 0
+        buf = BytesIO()
+        for key in tuple(self.strings.iterkeys()):
+            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
+            l = len(utf8)
+            sz_bytes = encint(l)
+            raw = sz_bytes + utf8
+            if 0xfbf8 - buf.tell() < 6 + len(raw):
+                # Records in PDB files cannot be larger than 0x10000, so we
+                # stop well before that.
+                pad = 0xfbf8 - buf.tell()
+                buf.write(b'\0' * pad)
+                self.records.append(buf.getvalue())
+                buf.truncate(0)
+                offset = len(self.records) * 0x10000
+            buf.write(raw)
+            self.strings[key] = offset
+            offset += len(raw)
+
+        self.records.append(align_block(buf.getvalue()))
+
+    def __getitem__(self, string):
+        return self.strings[string]
+
+    def __bool__(self):
+        return bool(self.records)
+    __nonzero__ = __bool__
+
+# }}}
+
+
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index 134fbadc60..be926a80a0 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -13,54 +13,21 @@ from cStringIO import StringIO
 from collections import OrderedDict, defaultdict
 
 from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
-        encode_tbs, align_block, utf8_text, RECORD_SIZE)
+        encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
 
-class CNCX(object): # {{{
-
-    '''
-    Create the CNCX records. These are records containing all the strings from
-    the NCX. Each record is of the form: <vwi string size><utf-8 encoded
-    string>
-    '''
-
-    MAX_STRING_LENGTH = 500
+class CNCX(CNCX_): # {{{
 
     def __init__(self, toc, is_periodical):
-        self.strings = OrderedDict()
-
+        strings = []
         for item in toc.iterdescendants(breadth_first=True):
-            self.strings[item.title] = 0
+            strings.append(item.title)
             if is_periodical:
-                self.strings[item.klass] = 0
+                strings.append(item.klass)
                 if item.author:
-                    self.strings[item.author] = 0
+                    strings.append(item.author)
                 if item.description:
-                    self.strings[item.description] = 0
-
-        self.records = []
-        offset = 0
-        buf = StringIO()
-        for key in tuple(self.strings.iterkeys()):
-            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
-            l = len(utf8)
-            sz_bytes = encint(l)
-            raw = sz_bytes + utf8
-            if 0xfbf8 - buf.tell() < 6 + len(raw):
-                # Records in PDB files cannot be larger than 0x10000, so we
-                # stop well before that.
-                pad = 0xfbf8 - buf.tell()
-                buf.write(b'\0' * pad)
-                self.records.append(buf.getvalue())
-                buf.truncate(0)
-                offset = len(self.records) * 0x10000
-            buf.write(raw)
-            self.strings[key] = offset
-            offset += len(raw)
-
-        self.records.append(align_block(buf.getvalue()))
-
-    def __getitem__(self, string):
-        return self.strings[string]
+                    strings.append(item.description)
+        CNCX_.__init__(self, strings)
 # }}}
 
 class TAGX(object): # {{{
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
new file mode 100644
index 0000000000..a2b991a612
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from future_builtins import map
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from collections import namedtuple
+from struct import pack
+
+from calibre.ebooks.mobi.utils import CNCX
+
+TagMeta = namedtuple('TagMeta',
+        'name number values_per_entry bitmask end_flag')
+EndTagTable = TagMeta('eof', 0, 0, 0, 1)
+
+class Index(object):
+
+    control_byte_count = 1
+    cncx = CNCX()
+    tag_types = (EndTagTable,)
+
+    @classmethod
+    def generate_tagx(cls):
+        header = b'TAGX'
+        byts = bytearray()
+        for tag_meta in cls.tag_types:
+            byts.extend(tag_meta[1:])
+        # table length, control byte count
+        header += pack(b'>II', 12+len(byts), cls.control_byte_count)
+        return header + bytes(byts)
+
+class SkelIndex(Index):
+
+    tag_types = tuple(map(TagMeta, (
+        ('chunk_count', 1, 1, 3, 0),
+        ('geometry',    6, 2, 12, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, skel_table):
+        self.entries = [
+                (s.name, {
+                    # Don't ask me why these entries have to be repeated twice
+                    'chunk_count':(s.chunk_count, s.chunk_count),
+                    'geometry':(s.start_pos, s.length, s.start_pos, s.length),
+                    }) for s in skel_table
+        ]
+
+
+class ChunkIndex(Index):
+
+    tag_types = tuple(map(TagMeta, (
+        ('cncx_offset',     2, 1, 1, 0),
+        ('file_number',     3, 1, 2, 0),
+        ('sequence_number', 4, 1, 4, 0),
+        ('geometry',        6, 2, 8, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, chunk_table):
+        self.cncx = CNCX(c.selector for c in chunk_table)
+
+        self.entries = [
+                ('%010d'%c.insert_pos, {
+
+                    'cncx_offset':self.cncx[c.selector],
+                    'file_number':c.file_number,
+                    'sequence_number':c.sequence_number,
+                    'geometry':(c.start_pos, c.length),
+                    }) for s in chunk_table
+        ]
+
+
+

From 22ee4152416a98e84a587f9fcf1a1f5aa52f4960 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 20 Apr 2012 23:32:53 +0530
Subject: [PATCH 27/37] ...

---
 src/calibre/ebooks/mobi/writer8/index.py | 55 ++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
index a2b991a612..1ee20857fb 100644
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -10,13 +10,20 @@ __docformat__ = 'restructuredtext en'
 
 from collections import namedtuple
 from struct import pack
+from io import BytesIO
 
-from calibre.ebooks.mobi.utils import CNCX
+from calibre.ebooks.mobi.utils import CNCX, encint
 
 TagMeta = namedtuple('TagMeta',
         'name number values_per_entry bitmask end_flag')
 EndTagTable = TagMeta('eof', 0, 0, 0, 1)
 
+# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
+# could also be extended to 4 bit wide ones as well
+mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
+        128:7, 192: 6 }
+
+
 class Index(object):
 
     control_byte_count = 1
@@ -33,6 +40,50 @@ class Index(object):
         header += pack(b'>II', 12+len(byts), cls.control_byte_count)
         return header + bytes(byts)
 
+    @classmethod
+    def calculate_control_bytes_for_each_entry(cls, entries):
+        control_bytes = []
+        for lead_text, tags in entries:
+            cbs = []
+            ans = 0
+            for (name, number, vpe, mask, endi) in cls.tag_types:
+                if endi == 1:
+                    cbs.append(ans)
+                    ans = 0
+                    continue
+                nvals = len(tags.get(name, ()))
+                nentries = nvals // vpe
+                shifts = mask_to_bit_shifts[mask]
+                ans |= mask & (nentries << shifts)
+            if len(cbs) != cls.control_byte_count:
+                raise ValueError('The entry %r is invalid'%[lead_text, tags])
+            control_bytes.append(cbs)
+        return control_bytes
+
+    def build_records(self):
+        self.control_bytes = self.calculate_control_bytes_for_each_entry(
+                self.entries)
+
+        self.rendered_entries = []
+        offset = 0
+        IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+        for i, x in enumerate(self.entries):
+            control_bytes = self.control_bytes[i]
+            leading_text, tags = x
+            buf = BytesIO()
+            raw = bytearray(leading_text)
+            raw.insert(0, len(leading_text))
+            buf.write(bytes(raw))
+            buf.write(control_bytes)
+            for tag in self.tag_types:
+                values = tags.get(tag.name, None)
+                if values:
+                    for val in values:
+                        buf.write(encint(val))
+            raw = buf.getvalue()
+            self.rendered_entries.append(IndexEntry(offset, len(raw), raw))
+            offset += len(raw)
+
 class SkelIndex(Index):
 
     tag_types = tuple(map(TagMeta, (
@@ -74,5 +125,3 @@ class ChunkIndex(Index):
                     }) for s in chunk_table
         ]
 
-
-

From 5d3e24e1053e6078dfe3a7e9a0fe135baeb69286 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 21 Apr 2012 07:50:27 +0530
Subject: [PATCH 28/37] Fix set_internal_links()

---
 src/calibre/ebooks/mobi/writer8/main.py     |  4 +-
 src/calibre/ebooks/mobi/writer8/skeleton.py | 42 +++++++++++++++------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index 430d695fd1..955fbab460 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -200,9 +200,9 @@ class KF8Writer(object):
                     aid = aidbase + j
                     tag.attrib['aid'] = to_base(aid, base=32)
                     if tag.tag == XHTML('body'):
-                        self.id_map[(item.href, '')] = (i, tag.attrib['aid'])
+                        self.id_map[(item.href, '')] = tag.attrib['aid']
                     if id_ is not None:
-                        self.id_map[(item.href, id_)] = (i, tag.attrib['aid'])
+                        self.id_map[(item.href, id_)] = tag.attrib['aid']
 
                     j += 1
 
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index d04f119316..4da540cac6 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -172,11 +172,11 @@ class Chunker(object):
             body = root.xpath('//body')[0]
             body.tail = '\n'
 
-            if self.orig_dumps is not None:
-                self.orig_dumps.append(tostring(root, xml_declaration=True,
+            if orig_dumps is not None:
+                orig_dumps.append(tostring(root, xml_declaration=True,
                     with_tail=True))
-                self.orig_dumps[-1] = close_self_closing_tags(
-                        self.orig_dumps[-1].replace(b'<html',
+                orig_dumps[-1] = close_self_closing_tags(
+                        orig_dumps[-1].replace(b'<html',
                         bytes('<html xmlns="%s"'%XHTML_NS), 1))
 
             # First pass: break up document into rendered strings of length no
@@ -336,15 +336,35 @@ class Chunker(object):
                 num += 1
 
     def set_internal_links(self, text):
-        # First find the start pos of all tags with aids
-        aid_map = {}
+        # A kindle pos:fid link contains two base 32 numbers of the form
+        # XXXX:YYYYYYYYYY
+        # The first number is an index into the chunk table and the second is
+        # an offset from the start of the chunk to the start of the tag pointed
+        # to by the link.
+        aid_map = {} # Map of aid to (pos, fid)
         for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
-            aid_map[match.group(1)] = match.start()
+            offset = match.start()
+            pos_fid = None
+            for chunk in self.chunk_table:
+                if chunk.insert_pos <= offset < chunk.insert_pos + chunk.length:
+                    pos_fid = (chunk.sequence_number, offset-chunk.insert_pos)
+                    break
+                if chunk.insert_pos > offset:
+                    # This aid is in the skeleton, not in a chunk, so we use
+                    # the chunk immediately after
+                    pos_fid = (chunk.sequence_number, 0)
+                    break
+            if pos_fid is None:
+                raise ValueError('Could not find chunk for aid: %r'%
+                        match.group(1))
+            aid_map[match.group(1)] = (to_base(chunk.sequence_number,
+                                            base=32, min_num_digits=4),
+                                    to_href(offset-chunk.insert_pos))
+
         self.aid_offset_map = aid_map
 
-        def to_placeholder(x):
-            file_number, aid = x
-            return bytes('%04d:%s'%(file_number, to_href(aid_map[aid])))
+        def to_placeholder(aid):
+            return bytes(':'.join(aid_map[aid]))
 
         placeholder_map = {bytes(k):to_placeholder(v) for k, v in
                 self.placeholder_map.iteritems()}
@@ -359,7 +379,7 @@ class Chunker(object):
                 pass
             return raw
 
-        return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
+        return re.sub(br'<[^>]+(kindle:pos:fid:0000:\d{10})', sub, text)
 
     def dump(self, orig_dumps):
         import tempfile, shutil, os

From 5c72ad513b982741b6dc0777d89cda837f9566a8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 21 Apr 2012 07:52:27 +0530
Subject: [PATCH 29/37] ...

---
 src/calibre/ebooks/mobi/writer8/skeleton.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 4da540cac6..8f0a3795db 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -336,7 +336,9 @@ class Chunker(object):
                 num += 1
 
     def set_internal_links(self, text):
-        # A kindle pos:fid link contains two base 32 numbers of the form
+        ''' Update the internal link placeholders to point to the correct
+        location, based on the chunk table.'''
+        # A kindle:pos:fid link contains two base 32 numbers of the form
         # XXXX:YYYYYYYYYY
         # The first number is an index into the chunk table and the second is
         # an offset from the start of the chunk to the start of the tag pointed

From 9ab4ff1840a7b3735a6e94e4c1465295285bfc4f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 21 Apr 2012 11:15:31 +0530
Subject: [PATCH 30/37] A nice framework for generating MOBI header records

---
 .../ebooks/conversion/plugins/mobi_output.py  |   2 +-
 src/calibre/ebooks/mobi/debug/index.py        |   4 +-
 src/calibre/ebooks/mobi/debug/mobi8.py        |   2 +-
 src/calibre/ebooks/mobi/utils.py              |   7 +-
 src/calibre/ebooks/mobi/writer8/header.py     |  77 +++++++++++
 src/calibre/ebooks/mobi/writer8/index.py      | 125 +++++++++++++++++-
 6 files changed, 206 insertions(+), 11 deletions(-)
 create mode 100644 src/calibre/ebooks/mobi/writer8/header.py

diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py
index 89ab91f8eb..971d11df3b 100644
--- a/src/calibre/ebooks/conversion/plugins/mobi_output.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py
@@ -169,6 +169,7 @@ class MOBIOutput(OutputFormatPlugin):
         self.remove_html_cover()
         resources = Resources(oeb, opts, self.is_periodical,
                 add_fonts=create_kf8)
+        self.check_for_periodical()
 
         kf8 = self.create_kf8(resources) if create_kf8 else None
 
@@ -203,7 +204,6 @@ class MOBIOutput(OutputFormatPlugin):
             resources.add_extra_images()
         mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
         mobimlizer(oeb, opts)
-        self.check_for_periodical()
         write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
         from calibre.ebooks.mobi.writer2.main import MobiWriter
         writer = MobiWriter(opts, resources, kf8,
diff --git a/src/calibre/ebooks/mobi/debug/index.py b/src/calibre/ebooks/mobi/debug/index.py
index 1af1611918..94f252e231 100644
--- a/src/calibre/ebooks/mobi/debug/index.py
+++ b/src/calibre/ebooks/mobi/debug/index.py
@@ -17,7 +17,7 @@ from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
 File = namedtuple('File',
     'file_number name divtbl_count start_position length')
 
-Elem = namedtuple('Elem',
+Elem = namedtuple('Chunk',
     'insert_pos toc_text file_number sequence_number start_pos '
     'length')
 
@@ -110,7 +110,7 @@ class SECTIndex(Index):
              for i, text in enumerate(self.table.iterkeys()):
                 tag_map = self.table[text]
                 if set(tag_map.iterkeys()) != {2, 3, 4, 6}:
-                    raise ValueError('SECT Index has unknown tags: %s'%
+                    raise ValueError('Chunk Index has unknown tags: %s'%
                             (set(tag_map.iterkeys())-{2, 3, 4, 6}))
 
                 toc_text = self.cncx[tag_map[2][0]]
diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py
index 1c61690d42..e3e26af0b1 100644
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@@ -198,7 +198,7 @@ def inspect_mobi(mobi_file, ddir):
     with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
         fo.write(str(f.skel_index).encode('utf-8'))
 
-    with open(os.path.join(ddir, 'sect.record'), 'wb') as fo:
+    with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
         fo.write(str(f.sect_index).encode('utf-8'))
 
     with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index 319af30f86..aa59ee2217 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -583,7 +583,9 @@ class CNCX(object): # {{{
             self.strings[key] = offset
             offset += len(raw)
 
-        self.records.append(align_block(buf.getvalue()))
+        val = buf.getvalue()
+        if val:
+            self.records.append(align_block(val))
 
     def __getitem__(self, string):
         return self.strings[string]
@@ -592,6 +594,9 @@ class CNCX(object): # {{{
         return bool(self.records)
     __nonzero__ = __bool__
 
+    def __len__(self):
+        return len(self.records)
+
 # }}}
 
 
diff --git a/src/calibre/ebooks/mobi/writer8/header.py b/src/calibre/ebooks/mobi/writer8/header.py
new file mode 100644
index 0000000000..31571d0f5f
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/header.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from io import BytesIO
+from collections import OrderedDict
+from struct import pack
+
+from calibre.ebooks.mobi.utils import align_block
+
+NULL = 0xffffffff
+zeroes = lambda x: b'\0'*x
+nulls = lambda x: b'\xff'*x
+
+class Header(OrderedDict):
+
+    HEADER_NAME = b''
+
+    DEFINITION = '''
+    '''
+
+    ALIGN_BLOCK = False
+    POSITIONS = {}
+
+    def __init__(self):
+        OrderedDict.__init__(self)
+
+        for line in self.DEFINITION.splitlines():
+            line = line.strip()
+            if not line or line.startswith('#'): continue
+            name, val = [x.strip() for x in line.partition('=')[0::2]]
+            if val:
+                val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None,
+                    'nulls':nulls})
+            else:
+                val = 0
+            if name in self:
+                raise ValueError('Duplicate field in definition: %r'%name)
+            self[name] = val
+
+    def __call__(self, **kwargs):
+        positions = {}
+        for name, val in kwargs.iteritems():
+            if name not in self:
+                raise KeyError('Not a valid header field: %r'%name)
+            self[name] = val
+
+        buf = BytesIO()
+        buf.write(bytes(self.HEADER_NAME))
+        for name, val in self.iteritems():
+            val = self.format_value(name, val)
+            positions[name] = buf.tell()
+            if val is None:
+                raise ValueError('Dynamic field %r not set'%name)
+            if isinstance(val, (int, long)):
+                val = pack(b'>I', val)
+            buf.write(val)
+
+        for pos_field, field in self.POSITIONS.iteritems():
+            buf.seek(positions[pos_field])
+            buf.write(pack(b'>I', positions[field]))
+
+        ans = buf.getvalue()
+        if self.ALIGN_BLOCK:
+            ans = align_block(ans)
+        return ans
+
+
+    def format_value(self, name, val):
+        return val
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
index 1ee20857fb..153e140b06 100644
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -12,7 +12,8 @@ from collections import namedtuple
 from struct import pack
 from io import BytesIO
 
-from calibre.ebooks.mobi.utils import CNCX, encint
+from calibre.ebooks.mobi.utils import CNCX, encint, align_block
+from calibre.ebooks.mobi.writer8.header import Header
 
 TagMeta = namedtuple('TagMeta',
         'name number values_per_entry bitmask end_flag')
@@ -23,13 +24,79 @@ EndTagTable = TagMeta('eof', 0, 0, 0, 1)
 mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
         128:7, 192: 6 }
 
+class IndexHeader(Header): # {{{
 
-class Index(object):
+    HEADER_NAME = b'INDX'
+    ALIGN_BLOCK = True
+    HEADER_LENGTH = 192
+
+    DEFINITION = '''
+    # 4 - 8: Header Length
+    header_length = {header_length}
+
+    # 8 - 16: Unknown
+    unknown1 = zeroes(8)
+
+    # 16 - 20: Index type: 0 - normal 2 - inflection
+    type = 2
+
+    # 20 - 24: IDXT offset (filled in later)
+    idxt_offset
+
+    # 24 - 28: Number of index records
+    num_of_records = 1
+
+    # 28 - 32: Index encoding (65001 = utf-8)
+    encoding = 65001
+
+    # 32 - 36: Unknown
+    unknown2 = NULL
+
+    # 36 - 40: Number of Index entries
+    num_of_entries = DYN
+
+    # 40 - 44: ORDT offset
+    ordt_offset
+
+    # 44 - 48: LIGT offset
+    ligt_offset
+
+    # 48 - 52: Number of ORDT/LIGT? entries
+    num_of_ordt_entries
+
+    # 52 - 56: Number of CNCX records
+    num_of_cncx = DYN
+
+    # 56 - 180: Unknown
+    unknown3 = zeroes(124)
+
+    # 180 - 184: TAGX offset
+    tagx_offset = {header_length}
+
+    # 184 - 192: Unknown
+    unknown4 = zeroes(8)
+
+    # TAGX
+    tagx = DYN
+
+    # Last Index entry
+    last_index = DYN
+
+    # IDXT
+    idxt = DYN
+    '''.format(header_length=HEADER_LENGTH)
+
+    POSITIONS = {'idxt_offset':'idxt'}
+# }}}
+
+class Index(object): # {{{
 
     control_byte_count = 1
     cncx = CNCX()
     tag_types = (EndTagTable,)
 
+    HEADER_LENGTH = IndexHeader.HEADER_LENGTH
+
     @classmethod
     def generate_tagx(cls):
         header = b'TAGX'
@@ -60,17 +127,18 @@ class Index(object):
             control_bytes.append(cbs)
         return control_bytes
 
-    def build_records(self):
+    def __call__(self):
         self.control_bytes = self.calculate_control_bytes_for_each_entry(
                 self.entries)
 
-        self.rendered_entries = []
+        rendered_entries = []
         offset = 0
+        index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
         IndexEntry = namedtuple('IndexEntry', 'offset length raw')
         for i, x in enumerate(self.entries):
             control_bytes = self.control_bytes[i]
             leading_text, tags = x
-            buf = BytesIO()
+            buf.truncate(0)
             raw = bytearray(leading_text)
             raw.insert(0, len(leading_text))
             buf.write(bytes(raw))
@@ -81,8 +149,53 @@ class Index(object):
                     for val in values:
                         buf.write(encint(val))
             raw = buf.getvalue()
-            self.rendered_entries.append(IndexEntry(offset, len(raw), raw))
+            rendered_entries.append(IndexEntry(offset, len(raw), raw))
+            idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
             offset += len(raw)
+            index.write(raw)
+
+        index_block = align_block(index.getvalue())
+        idxt_block = align_block(b'IDXT' + idxt.getvalue())
+        body = index_block + idxt_block
+        if len(body) + self.HEADER_LENGTH >= 0x10000:
+            raise ValueError('Index has too many entries, calibre does not'
+                    ' support generating multiple index records at this'
+                    ' time.')
+
+        header = b'INDX'
+        buf.truncate(0)
+        buf.write(pack(b'>I', self.HEADER_LENGTH))
+        buf.write(b'\0'*4) # Unknown
+        buf.write(pack(b'>I', 1)) # Header type? Or index record number?
+        buf.write(b'\0'*4) # Unknown
+
+        # IDXT block offset
+        buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
+
+        # Number of index entries
+        buf.write(pack(b'>I', len(rendered_entries)))
+
+        buf.write(b'\xff'*8) # Unknown
+
+        buf.write(b'\0'*156) # Unknown
+
+        header += buf.getvalue()
+        index_record = header + body
+
+        tagx = self.generate_tagx()
+        idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
+                b'\0')
+        header = {
+                'num_of_entries': len(rendered_entries),
+                'num_of_cncx': len(self.cncx),
+                'tagx':tagx,
+                'idxt':idxt
+        }
+        header = IndexHeader()(**header)
+        self.records = [header, index_record]
+        self.records.extend(self.cncx.records)
+        return self.records
+# }}}
 
 class SkelIndex(Index):
 

From 687586f9a16f55b7c675690e130c7a61be145f7e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 21 Apr 2012 13:20:14 +0530
Subject: [PATCH 31/37] ...

---
 src/calibre/manual/faq.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index a248962abd..f0d9aa8bd3 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -29,7 +29,7 @@ It can convert every input format in the following list, to every output format.
     PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers.
     PDB is also a generic format. |app| supports eReder, Plucker, PML and zTxt PDB files.
     DJVU support is only for converting DJVU files that contain embedded text. These are typically generated by OCR software.
-    MOBI books can be of two types Mobi6 and KF8. |app| currently fully supports Mobi6 and supports conversion from, but not to, KF8
+    MOBI books can be of two types: Mobi6 and KF8. |app| fully supports both. MOBI files often have .azw or .azw3 file extensions.
 
 .. _best-source-formats:
 

From 0db1fcb10396f81e7a1bbf13e7900d125eeeb88d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 21 Apr 2012 23:38:52 +0530
Subject: [PATCH 32/37] Fix #986598 (New recipe for News agency Telam)

---
 recipes/icons/telam.png | Bin 0 -> 1992 bytes
 recipes/telam.recipe    |  62 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 recipes/icons/telam.png
 create mode 100644 recipes/telam.recipe

diff --git a/recipes/icons/telam.png b/recipes/icons/telam.png
new file mode 100644
index 0000000000000000000000000000000000000000..f86dcc1dbf16db5e41411f0dd20c4af478336877
GIT binary patch
literal 1992
zcmV;(2RHbMP)<h;3K|Lk000e1NJLTq001BW001Be0ssI2{21+{000M%Nkl<Zc$~$S
zd5B$C9mmf(cfa?oGxO$kw#<^uWSJ(@VlYXY)YvAX)QC7jK@=$>t)QrX=s)%^L5e6e
zVnBo<NGTK~ZAnsVm1;w?HQOX+j7i8gnaP{?-n{MJcek_M<C)GhW@>6IDE|Jqk9#@q
ze9mwCo*xuKfd8ez|35%3NC03A$aN4ffK&|u1mS-HkO<)gc)GSQ>-pCz*kqUjTGT+^
zNfom8X#Y@mT7&l^-5Vf3x<38mU(dZ&ZO?~3@x;}b7P0i8j4lTMMWmfJI2cyik(
zyBzD|0p!8ooGHKk?upY;He}73z6wHbkoq@G-HLD%lb)#IA(rN#Cl1tn?nj?~Y|F&D
zk8<V1fKZ6nj-Gz`O1;VLl%d~BOk!Nefx$J><<}|{$bB4NPdHK)04ZgD<3TN)+`04V
z&+ND-V+9}(;*GP{UYeS6oq-ucje~H^Bzt=LH#&MpE~6^8TuK?f+$>)y-8eUO`GSBv
zR1D;$HSGtV+V<7QxBn|+1>oFF>G@+<j`+rqfhJew9@#v&);9BKIdUMO0BHa$zmo)9
ztA;NfJN~;fr##Qk=6ing<tMgpTKAEFvKRjBz4O1Ft$kx)?AsfQ1I4r~OvVI8Nx%`q
zSdr)DR91k3kYN}$;<{l2k(6uhPYxV<bL#wt!NC{5^>lYCbq|1izW(?5A0IjP)Wpzt
zKJl>5S;-|=IH`r&g}5~Bb;jje9b7~_R-?89Svn2BIJ*Av@nP8zK^W?ev;X&R{qoqU
zuRXT)+1*dv9jIkMsU80O;L!&k=$hQV@o#hWw_B~tHF$<YiVB8}g`Ysg1*^rxF~d6Y
ztEyYtx@!I2$wxH=UwHLTLw$vISO0;-?^&$*{8M{Y<@0wKXzBR3JvU|2iLuqce&^f|
z&)nR1lbjKDf2z=-DMOa30qzhNF)PQbkl9eP8tI{G>pDmhgix*7c<z<gpF4Es4>)&H
z>wEQv7lME0brC>0Z#2BlV($xoIs9@fnpgBST4XW9r|m>rXn2aEatr1x#3<k<n9sx{
zhU%?GrTP9TKRZ4$ht-m|5F7e+wd<$nukHQw`6i)Dv03r}xGj9@^4#f=b7284kxruf
zfxv`<Vla<+$~AmFZkTZz0Ohvy31iXDL7o7?Gx=<QqJn}{oeROMQF`B;I6l)#?f_)(
zR$KnzfSM@Bo1w*MonYq4BcN1RzL+E#(#m;3fM6`#gbO9sG+rmdBM1Sgn1%_kf^f_v
zPbb1s$^kfdX)dOG89)*?-JIi?xu~XCjU0)Ju&qE5`+?%yc~PsWVxcXlP+`0P5Kkpl
zn3ytf0Keb-zyKPlNXLN+5(_X3K>IQbTZ^J2&~hySDiqJ;xhz(QXa_!uTTy?r;wrHS
zSgzy?jan^_K|6A(9a2ptgvvxn@3GD!K(zvpG7bR>7DXK;Ntm${S;KG<AS=pHGgc^4
zth|7MbWK5So?LILRJa~Rc||KZVT2Gv5!MvHC5yy^h)`Ww{CArOpfOK|i#veH4LQ)l
zjucQ3Tn>?nP`8b|C|U@BN<Ezfb{oz!jVFu+Hji$qUJ_8(jiOW%(l+*~=w`8!2qLc{
zq(=1;fSRKbWijUpFhUrI52d(<K1i~rsuKfi3Y^<GPGT7w(Ts36Wl<2QnwR#1Du;>J
za&TNHWKk-FDC@k}P|}IK>y%UFifX_n*P$WzOBQ>VsYAI3G?}C&fDF}0PuF0^(J5^s
zkd9p@i%G;hL|MOjk?Ng(o9b$aF#!ZqT4NBpVfWy$scM{&<1;fkq6;S-Lk=PXYH*z$
zj~8tPe$Z$#6~p<|qr<&^yOP7a0aOBTJ7glJ6%mb*+JffO5QOqtmOTlhEQ_wvWUN;T
zIGQJLwpyu~S!fhg#R5ndf&aM;gQFd(CAC=AUfVKuZC;;AXMh`;j8UlcvbutOf}KjF
zNTKfYJdq?axFC?330L`Z<GC&sDbr_<&cqt%NW**zMy3qUYTeq?@1)hcv}nm5?(3f%
zAG^w3-VCfdo5dE>3GnMh;YS{tRfQQZWWq$yxXyweH6GcpK7wHX$xBU?13aC`aDl);
z!802QJ9-N%#BRw<OaJ=L&0iVHM|y~5_=O8R0(3eT2@Shx>dtYa&L~2#lg4XC#@FPm
z>5DfGTx}GyIWX7_U65D3@AlGX)(xntvO?_cSYM&|{m(x++R>#(F$^e^V%OpbL@{Ht
zROump8)|Bjs5tz>mI(xuz5Cyw+^%Me02ro)ck~(Gd}K=|W!$~z3V`%$)9C77P43#X
zaZKoh6YiO^sWjPQ86e6q?-;le`CGe&C&tEqaq!4%wOYGx807~?66<@T-TQVtG?G=o
zO1sC{BEOW}@X+y-uS`vyC{-M!6^P!<IBlTFv~RGDXLs+uS-bt>yC=L%-@w|5Z9SPU
zjr8vt$msh0rivAqL|mFLpS^J5otgQoR~pmYrKCNIw2gzqnS5uRwE8>x)~+63GniRt
zTb+*npUy=eGR~M!Xb_Q*l7(iVB(%39qnf6nDO}aFx+bgf-%pPpW4gYxa%o5Tdf&lR
a+CKo7=wztDq%i#e0000<MNUMnLSTX;d(;sC

literal 0
HcmV?d00001

diff --git a/recipes/telam.recipe b/recipes/telam.recipe
new file mode 100644
index 0000000000..c2dbfee1d7
--- /dev/null
+++ b/recipes/telam.recipe
@@ -0,0 +1,62 @@
+__license__   = 'GPL v3'
+__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.telam.com.ar
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Telam(BasicNewsRecipe):
+    title                 = 'Telam'
+    __author__            = 'Darko Miletic'
+    description           = 'AGENCIA DE NOTICIAS DE LA REPUBLICA ARGENTINA'
+    publisher             = 'Telam S.E.'
+    category              = 'news, politics, Argentina'
+    oldest_article        = 2
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'utf8'
+    use_embedded_content  = False
+    language              = 'es_AR'
+    remove_empty_feeds    = True
+    publication_type      = 'newsportal'
+    masthead_url          = 'http://www.telam.com.ar/front/imagenes/encabezado/logotelam.jpg'
+    extra_css             = """
+                               body{font-family: Arial,Helvetica,sans-serif }
+                               img{margin-bottom: 0.4em; display:block}
+                            """
+
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    remove_tags        = [dict(name=['meta','link'])]
+    remove_tags_before = dict(attrs={'class':'nota_fecha'})
+    remove_tags_after  = dict(attrs={'class':'nota_completa'})
+    remove_attributes  = ['lang']
+
+
+    feeds = [
+              (u'Ultimas noticias', u'http://www.telam.com.ar/xml/rss/' )
+             ,(u'Politica'        , u'http://www.telam.com.ar/xml/rss/1')
+             ,(u'Economia'        , u'http://www.telam.com.ar/xml/rss/2')
+             ,(u'Sociedad'        , u'http://www.telam.com.ar/xml/rss/3')
+             ,(u'Policiales'      , u'http://www.telam.com.ar/xml/rss/4')
+             ,(u'Internacionales' , u'http://www.telam.com.ar/xml/rss/6')
+             ,(u'Espectaculos'    , u'http://www.telam.com.ar/xml/rss/7')
+             ,(u'Cultura'         , u'http://www.telam.com.ar/xml/rss/8')
+             ,(u'Deportes'        , u'http://www.telam.com.ar/xml/rss/9')
+             ,(u'Telam Investiga' , u'http://www.telam.com.ar/xml/rss/5')
+            ]
+
+    def print_version(self, url):
+        artid = url.rpartition('/')[2]
+        return 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup

From e4a55aae564adfa92bcef668f020982b82a38aab Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 22 Apr 2012 10:17:06 +0530
Subject: [PATCH 33/37] KF8 Output: Create NCX and Guide records

---
 src/calibre/ebooks/mobi/utils.py              |   4 +
 src/calibre/ebooks/mobi/writer2/serializer.py |   5 +-
 src/calibre/ebooks/mobi/writer8/index.py      | 118 ++++++++++++++++--
 src/calibre/ebooks/mobi/writer8/main.py       |  99 ++++++++++++++-
 src/calibre/ebooks/mobi/writer8/skeleton.py   |   8 +-
 5 files changed, 208 insertions(+), 26 deletions(-)

diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index aa59ee2217..3b8ce61ba8 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -599,4 +599,8 @@ class CNCX(object): # {{{
 
 # }}}
 
+def is_guide_ref_start(ref):
+    return (ref.title.lower() == 'start' or
+            (ref.type and ref.type.lower() in {'start',
+                    'other.start', 'text'}))
 
diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py
index d8d63bcff4..2dda657a93 100644
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -12,6 +12,7 @@ import re
 from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
         namespace, prefixname, urlnormalize)
 from calibre.ebooks.mobi.mobiml import MBP_NS
+from calibre.ebooks.mobi.utils import is_guide_ref_start
 
 from collections import defaultdict
 from urlparse import urldefrag
@@ -161,9 +162,7 @@ class Serializer(object):
                 buf.write(b'title="')
                 self.serialize_text(ref.title, quot=True)
                 buf.write(b'" ')
-                if (ref.title.lower() == 'start' or
-                    (ref.type and ref.type.lower() in {'start',
-                        'other.start', 'text'})):
+                if is_guide_ref_start(ref):
                     self._start_href = ref.href
             self.serialize_href(ref.href)
             # Space required or won't work, I kid you not
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
index 153e140b06..1cf9f02d4b 100644
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -15,9 +15,10 @@ from io import BytesIO
 from calibre.ebooks.mobi.utils import CNCX, encint, align_block
 from calibre.ebooks.mobi.writer8.header import Header
 
-TagMeta = namedtuple('TagMeta',
+TagMeta_ = namedtuple('TagMeta',
         'name number values_per_entry bitmask end_flag')
-EndTagTable = TagMeta('eof', 0, 0, 0, 1)
+TagMeta = lambda x:TagMeta_(*x)
+EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
 
 # map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
 # could also be extended to 4 bit wide ones as well
@@ -118,7 +119,10 @@ class Index(object): # {{{
                     cbs.append(ans)
                     ans = 0
                     continue
-                nvals = len(tags.get(name, ()))
+                try:
+                    nvals = len(tags.get(name, ()))
+                except TypeError:
+                    nvals = 1
                 nentries = nvals // vpe
                 shifts = mask_to_bit_shifts[mask]
                 ans |= mask & (nentries << shifts)
@@ -132,36 +136,51 @@ class Index(object): # {{{
                 self.entries)
 
         rendered_entries = []
-        offset = 0
         index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
         IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+        last_lead_text = b''
+        too_large = ValueError('Index has too many entries, calibre does not'
+                    ' support generating multiple index records at this'
+                    ' time.')
+
         for i, x in enumerate(self.entries):
             control_bytes = self.control_bytes[i]
             leading_text, tags = x
-            buf.truncate(0)
+            buf.seek(0), buf.truncate(0)
+            leading_text = (leading_text.encode('utf-8') if
+                    isinstance(leading_text, unicode) else leading_text)
             raw = bytearray(leading_text)
             raw.insert(0, len(leading_text))
             buf.write(bytes(raw))
-            buf.write(control_bytes)
+            buf.write(bytes(bytearray(control_bytes)))
             for tag in self.tag_types:
                 values = tags.get(tag.name, None)
+                if values is None: continue
+                try:
+                    len(values)
+                except TypeError:
+                    values = [values]
                 if values:
                     for val in values:
-                        buf.write(encint(val))
+                        try:
+                            buf.write(encint(val))
+                        except ValueError:
+                            raise ValueError('Invalid values for %r: %r'%(
+                                tag, values))
             raw = buf.getvalue()
+            offset = index.tell()
+            if offset + self.HEADER_LENGTH >= 0x10000:
+                raise too_large
             rendered_entries.append(IndexEntry(offset, len(raw), raw))
             idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
-            offset += len(raw)
             index.write(raw)
+            last_lead_text = leading_text
 
         index_block = align_block(index.getvalue())
         idxt_block = align_block(b'IDXT' + idxt.getvalue())
         body = index_block + idxt_block
         if len(body) + self.HEADER_LENGTH >= 0x10000:
-            raise ValueError('Index has too many entries, calibre does not'
-                    ' support generating multiple index records at this'
-                    ' time.')
-
+            raise too_large
         header = b'INDX'
         buf.truncate(0)
         buf.write(pack(b'>I', self.HEADER_LENGTH))
@@ -185,10 +204,15 @@ class Index(object): # {{{
         tagx = self.generate_tagx()
         idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
                 b'\0')
+        # Last index
+        idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
+        idx += pack(b'>H', len(rendered_entries))
+
         header = {
                 'num_of_entries': len(rendered_entries),
                 'num_of_cncx': len(self.cncx),
                 'tagx':tagx,
+                'last_index':align_block(idx),
                 'idxt':idxt
         }
         header = IndexHeader()(**header)
@@ -235,6 +259,74 @@ class ChunkIndex(Index):
                     'file_number':c.file_number,
                     'sequence_number':c.sequence_number,
                     'geometry':(c.start_pos, c.length),
-                    }) for s in chunk_table
+                    }) for c in chunk_table
         ]
 
+class GuideIndex(Index):
+
+    tag_types = tuple(map(TagMeta, (
+        ('title',           1, 1, 1, 0),
+        ('pos_fid',         6, 2, 2, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, guide_table):
+        self.cncx = CNCX(c.title for c in guide_table)
+
+        self.entries = [
+                (r.type, {
+
+                    'title':self.cncx[r.title],
+                    'pos_fid':r.pos_fid,
+                    }) for r in guide_table
+        ]
+
+
+class NCXIndex(Index):
+
+    control_byte_count = 2
+    tag_types = tuple(map(TagMeta, (
+        ('offset',             1, 1, 1, 0),
+        ('length',             2, 1, 2, 0),
+        ('label',              3, 1, 4, 0),
+        ('depth',              4, 1, 8, 0),
+        ('parent',             21, 1, 16, 0),
+        ('first_child',        22, 1, 32, 0),
+        ('last_child',         23, 1, 64, 0),
+        ('pos_fid',            6, 2, 128, 0),
+        EndTagTable,
+        ('image',              69, 1, 1, 0),
+        ('description',        70, 1, 2, 0),
+        ('author',             71, 1, 4, 0),
+        ('caption',            72, 1, 8, 0),
+        ('attribution',        73, 1, 16, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, toc_table):
+        strings = []
+        for entry in toc_table:
+            strings.append(entry['label'])
+            aut = entry.get('author', None)
+            if aut:
+                strings.append(aut)
+            desc = entry.get('description', None)
+            if desc:
+                strings.append(desc)
+        self.cncx = CNCX(strings)
+
+        def to_entry(x):
+            ans = {}
+            for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
+                    'first_child', 'last_child'):
+                if f in x:
+                    ans[f] = x[f]
+            for f in ('label', 'description', 'author'):
+                if f in x:
+                    ans[f] = self.cncx[x[f]]
+            return ('%02x'%x['index'], ans)
+
+        self.entries = list(map(to_entry, toc_table))
+
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index 955fbab460..76492cb9a9 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -17,12 +17,15 @@ import cssutils
 from lxml import etree
 
 from calibre import isbytestring, force_unicode
-from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.mobi.utils import (create_text_record, to_base,
+        is_guide_ref_start)
 from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
         extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
 from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
+from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
+        ChunkIndex, GuideIndex)
 
 XML_DOCS = OEB_DOCS | {SVG_MIME}
 
@@ -38,11 +41,11 @@ class KF8Writer(object):
         self.log.info('Creating KF8 output')
         self.used_images = set()
         self.resources = resources
-        self.dup_data()
         self.flows = [None] # First flow item is reserved for the text
         self.records = []
-        self.fdst_table = []
 
+        self.log('\tGenerating KF8 markup...')
+        self.dup_data()
         self.replace_resource_links()
         self.extract_css_into_flows()
         self.extract_svg_into_flows()
@@ -52,7 +55,10 @@ class KF8Writer(object):
         # Dump the cloned data as it is no longer needed
         del self._data_cache
         self.create_text_records()
-        self.create_fdst_table()
+        self.log('\tCreating indices...')
+        self.create_fdst_records()
+        self.create_indices()
+        self.create_guide()
 
     def dup_data(self):
         ''' Duplicate data so that any changes we make to markup/CSS only
@@ -231,7 +237,7 @@ class KF8Writer(object):
         records_size = 0
 
         if self.compress:
-            self.oeb.logger.info('  Compressing markup content...')
+            self.oeb.logger.info('\tCompressing markup...')
 
         while text.tell() < self.text_length:
             data, overlap = create_text_record(text)
@@ -252,9 +258,90 @@ class KF8Writer(object):
             self.records.append(b'\x00'*(records_size % 4))
             self.first_non_text_record_idx += 1
 
-    def create_fdst_table(self):
+    def create_fdst_records(self):
         FDST = namedtuple('Flow', 'start end')
+        entries = []
+        self.fdst_table = []
         for i, flow in enumerate(self.flows):
             start = 0 if i == 0 else self.fdst_table[-1].end
             self.fdst_table.append(FDST(start, start + len(flow)))
+            entries.extend(self.fdst_table[-1])
+        rec = (b'FDST' + pack(b'>LL', len(self.fdst_table), 12) +
+                pack(b'>%dL'%len(entries), *entries))
+        self.fdst_records = [rec]
+
+    def create_indices(self):
+        self.skel_records = SkelIndex(self.skel_table)()
+        self.chunk_records = ChunkIndex(self.chunk_table)()
+        self.ncx_records = []
+        toc = self.oeb.toc
+        max_depth = toc.depth()
+        entries = []
+        is_periodical = self.opts.mobi_periodical
+        if toc.count() < 2:
+            self.log.warn('Document has no ToC, MOBI will have no NCX index')
+            return
+
+        # Flatten the ToC into a depth first list
+        fl = toc.iter() if is_periodical else toc.iterdescendants()
+        for i, item in enumerate(fl):
+            entry = {'index':i, 'depth': max_depth - item.depth() - (0 if
+                is_periodical else 1), 'href':item.href, 'label':(item.title or
+                    _('Unknown'))}
+            entries.append(entry)
+            for child in item:
+                child.ncx_parent = entry
+            p = getattr(item, 'ncx_parent', None)
+            if p is not None:
+                entry['parent'] = p['index']
+            if is_periodical:
+                if item.author:
+                    entry['author'] = item.author
+                if item.description:
+                    entry['description'] = item.description
+
+        for entry in entries:
+            children = [e for e in entries if e.get('parent', -1) == entry['index']]
+            if children:
+                entry['first_child'] = children[0]['index']
+                entry['last_child'] = children[-1]['index']
+            href = entry.pop('href')
+            href, frag = href.partition('#')[0::2]
+            aid = self.id_map.get((href, frag), None)
+            if aid is None:
+                aid = self.id_map.get((href, ''), None)
+            if aid is None:
+                pos, fid = 0, 0
+            else:
+                pos, fid = self.aid_offset_map[aid]
+            chunk = self.chunk_table[pos]
+            offset = chunk.insert_pos + fid
+            length = chunk.length
+            entry['pos_fid'] = (pos, fid)
+            entry['offset'] = offset
+            entry['length'] = length
+
+        self.ncx_records = NCXIndex(entries)()
+
+    def create_guide(self):
+        self.start_offset = None
+        self.guide_table = []
+        self.guide_records = []
+        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
+        for ref in self.oeb.guide:
+            ref = self.oeb.guide[ref]
+            href, frag = ref.href.partition('#')[0::2]
+            aid = self.id_map.get((href, frag), None)
+            if aid is None:
+                aid = self.id_map.get((href, ''))
+            if aid is None:
+                continue
+            pos, fid = self.aid_offset_map[aid]
+            if is_guide_ref_start(ref):
+                self.start_offset = pos
+            self.guide_table.append(GuideRef(ref.title or
+                _('Unknown'), ref.type, (pos, fid)))
+
+        if self.guide_table:
+            self.guide_records = GuideIndex(self.guide_table)()
 
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 8f0a3795db..398c684e43 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -359,14 +359,14 @@ class Chunker(object):
             if pos_fid is None:
                 raise ValueError('Could not find chunk for aid: %r'%
                         match.group(1))
-            aid_map[match.group(1)] = (to_base(chunk.sequence_number,
-                                            base=32, min_num_digits=4),
-                                    to_href(offset-chunk.insert_pos))
+            aid_map[match.group(1)] = pos_fid
 
         self.aid_offset_map = aid_map
 
         def to_placeholder(aid):
-            return bytes(':'.join(aid_map[aid]))
+            pos, fid = aid_map[aid]
+            pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
+            return bytes(':'.join((pos, fid)))
 
         placeholder_map = {bytes(k):to_placeholder(v) for k, v in
                 self.placeholder_map.iteritems()}

From 3269b8c3611ec68855f60c46c675cde2a4e3dc5e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 22 Apr 2012 10:20:47 +0530
Subject: [PATCH 34/37] ...

---
 src/calibre/ebooks/mobi/writer8/main.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index 76492cb9a9..ffc806cb5a 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -337,7 +337,10 @@ class KF8Writer(object):
             if aid is None:
                 continue
             pos, fid = self.aid_offset_map[aid]
-            if is_guide_ref_start(ref):
+            if is_guide_ref_start(ref) and fid == 0:
+                # If fid != 0 then we cannot represent the start position as a
+                # single number in the EXTH header, so we do not write it to
+                # EXTH
                 self.start_offset = pos
             self.guide_table.append(GuideRef(ref.title or
                 _('Unknown'), ref.type, (pos, fid)))

From e5e2bfd8f359df52428d000662613bab89b1a621 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 22 Apr 2012 10:34:28 +0530
Subject: [PATCH 35/37] ...

---
 src/calibre/ebooks/mobi/writer8/main.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index ffc806cb5a..e061da7df6 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -328,8 +328,7 @@ class KF8Writer(object):
         self.guide_table = []
         self.guide_records = []
         GuideRef = namedtuple('GuideRef', 'title type pos_fid')
-        for ref in self.oeb.guide:
-            ref = self.oeb.guide[ref]
+        for ref in self.oeb.guide.values():
             href, frag = ref.href.partition('#')[0::2]
             aid = self.id_map.get((href, frag), None)
             if aid is None:

From e0002deb1fba920695c88147b415d583ac79f517 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 22 Apr 2012 12:48:29 +0530
Subject: [PATCH 36/37] Sol Haber by Onur Gungor

---
 recipes/sol_haber.recipe | 141 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 recipes/sol_haber.recipe

diff --git a/recipes/sol_haber.recipe b/recipes/sol_haber.recipe
new file mode 100644
index 0000000000..29db88019c
--- /dev/null
+++ b/recipes/sol_haber.recipe
@@ -0,0 +1,141 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Onur Gungor onurgu@gmail.com'
+__docformat__ = 'restructuredtext en'
+
+'''
+www.sol.org.tr
+'''
+
+import datetime
+
+import re
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class SolHaberRecipe(BasicNewsRecipe):
+    '''Recipe for the archive pages of soL Haber (haber.sol.org.tr).'''
+
+    title = u'soL Haber'
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    language = 'tr'
+    __author__ = 'Onur Güngör'
+    # One literal: 'soL''dan' is two adjacent strings and drops the apostrophe
+    description = "Hayata soL'dan bakın.."
+    publisher = 'soL Haber'
+    tags = 'news, haberler, siyaset, türkiye, turkey, politics'
+
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : tags
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    # Maps a category slug taken from an article URL to its display title
+    category_dict = { 'sonuncu-kavga':'Sonuncu Kavga',
+                      'devlet-ve-siyaset':'Devlet ve Siyaset',
+                      'ekonomi':'Ekonomi',
+                      'enternasyonal-gundem':'Enternasyonel Gündem',
+                      'kent-gundemleri':'Kent Gündemleri',
+                      'kultur-sanat':'Kültür Sanat',
+                      'dunyadan':'Dünyadan',
+                      'serbest-kursu':'Serbest Kürsü',
+                      'medya':'Medya',
+                      'liseliler':'Liseliler',
+                      'yazarlar':'Köşe Yazıları'}
+
+    # The archive is queried for a one day window that ends today
+    end_date = datetime.date.today().isoformat()
+    start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat()
+
+    section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+                    ['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+                    ['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+                    ['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]]
+
+    # Disable stylesheets from site.
+    no_stylesheets = True
+
+    cover_margins = (20, 20, '#ffffff')
+
+    # Raw strings so that \s reaches the regex engine unchanged
+    storybody_reg_exp = r'^\s*(haber|kose)\s*$'
+    comments_reg_exp = r'^\s*makale-elestiri\s*$'
+
+    remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})]
+
+    keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})]
+
+    def get_masthead_title(self):
+        '''Return the title followed by the end date in parentheses.'''
+        return self.title + "(" + self.end_date + ")"
+
+    def parse_index(self):
+        '''
+        Scrape every archive page in section_tuples and return a list of
+        (section title, list of article dicts) tuples. Articles are grouped
+        by the first path component of their URL, falling back to the title
+        of the archive section when the URL does not match.
+        '''
+        articles_dict = dict()
+
+        author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
+        category_regexp = re.compile('^http://.*?/(.+?)/.*$')
+
+        for section_title, section_index_url in self.section_tuples:
+            self.log('Bölüm:', section_title, 'URL:', section_index_url)
+            soup = self.index_to_soup(section_index_url)
+
+            # Any container may be missing, so check before descending
+            logo_div = soup.find('div', id='logo')
+            logo = None if logo_div is None else logo_div.find('img', src=True)
+            if logo is not None:
+                self.cover_url = logo['src']
+                if self.cover_url.startswith('/'):
+                    self.cover_url = 'http://haber.sol.org.tr'+self.cover_url
+
+            content = soup.find('div', id='ana-icerik')
+            if content is not None:
+                content = content.find('div', attrs={'class':'view-content'})
+            tbody = None if content is None else content.find('tbody')
+            if tbody is None:
+                # An empty section must not hide the sections after it
+                continue
+            rows = tbody.findAll('tr')
+
+            self.log('Row sayısı', len(rows))
+            for row in rows:
+                cells = row.findAll('td')
+                a = cells[1].find('a', href=True) if len(cells) > 2 else None
+                if a is None:
+                    # Not an article row
+                    continue
+
+                url = a['href']
+                title = self.tag_to_string(a)
+
+                if url.startswith('/'):
+                    url = 'http://haber.sol.org.tr'+url
+
+                category = section_title
+                category_match_result = category_regexp.match(url)
+                if category_match_result:
+                    category = category_match_result.group(1)
+
+                date = self.tag_to_string(cells[2])
+                author = 'soL haber'
+                author_match_result = author_regexp.match(url)
+                if author_match_result:
+                    author = author_match_result.group(1)
+
+                self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author)
+                article = {'title':title, 'url':url, 'description':None, 'date':date, 'author':author}
+                articles_dict.setdefault(category, []).append(article)
+
+        return [(self.category_dict.get(category, category), articles)
+                for category, articles in articles_dict.items()]

From fe1e29082003058efbcdaf4f8610021bc3b393f1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 22 Apr 2012 15:52:12 +0530
Subject: [PATCH 37/37] Bash completion for ebook-viewer should complete all
 file types for which calibre has an input plugin

---
 src/calibre/linux.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index 64bc9a5a0b..e3bfe04e75 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -228,8 +228,8 @@ class PostInstall:
             from calibre.utils.smtp import option_parser as smtp_op
             from calibre.library.server.main import option_parser as serv_op
             from calibre.ebooks.epub.fix.main import option_parser as fix_op
-            any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
-                'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2', 'odt', 'lrf', 'snb']
+            from calibre.ebooks import BOOK_EXTENSIONS
+            input_formats = sorted(all_input_formats())
             bc = os.path.join(os.path.dirname(self.opts.staging_sharedir),
                 'bash-completion')
             if os.path.exists(bc):
@@ -249,11 +249,11 @@ class PostInstall:
             self.info('Installing bash completion to', f)
             with open(f, 'wb') as f:
                 f.write('# calibre Bash Shell Completion\n')
-                f.write(opts_and_exts('calibre', guiop, any_formats))
+                f.write(opts_and_exts('calibre', guiop, BOOK_EXTENSIONS))
                 f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf']))
                 f.write(opts_and_exts('ebook-meta', metaop, list(meta_filetypes())))
                 f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
-                f.write(opts_and_exts('ebook-viewer', viewer_op, any_formats))
+                f.write(opts_and_exts('ebook-viewer', viewer_op, input_formats))
                 f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
                 f.write(opts_and_words('calibre-smtp', smtp_op, []))
                 f.write(opts_and_words('calibre-server', serv_op, []))