From e80fcc13fcffef68f7eccb7d0f135f08dce91f12 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 25 Apr 2009 12:22:39 -0700
Subject: [PATCH 1/3] More miscellaneous fixes (empty href fragments, single-format selection)

---
 src/calibre/ebooks/oeb/base.py                 | 5 ++++-
 src/calibre/ebooks/oeb/transforms/structure.py | 1 +
 src/calibre/gui2/dialogs/metadata_single.py    | 7 +++++--
 3 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 33bb44840b..9d8598c766 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -941,7 +941,10 @@ class Manifest(object):
             href = urlunparse(purl)
             path, frag = urldefrag(href)
             if not path:
-                return '#'.join((self.href, frag))
+                if frag:
+                    return '#'.join((self.href, frag))
+                else:
+                    return self.href
             if '/' not in self.href:
                 return href
             dirname = os.path.dirname(self.href)
diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py
index 197a265139..605cdaa7cf 100644
--- a/src/calibre/ebooks/oeb/transforms/structure.py
+++ b/src/calibre/ebooks/oeb/transforms/structure.py
@@ -102,6 +102,7 @@ class DetectStructure(object):
                                 play_order=self.oeb.toc.next_play_order())
 
 
+
     def elem_to_link(self, item, elem, counter):
         text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
         text = text[:100].strip()
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index e3e2080cc0..4d5471caf0 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -159,9 +159,12 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         row = self.formats.currentRow()
         fmt = self.formats.item(row)
         if fmt is None:
-            error_dialog(self, _('No format selected'),
+            if self.formats.count() == 1:
+                fmt = self.formats.item(0)
+            if fmt is None:
+                error_dialog(self, _('No format selected'),
                     _('No format selected')).exec_()
-            return
+                return
         ext = fmt.ext.lower()
         if fmt.path is None:
             stream = self.db.format(self.row, ext, as_file=True)

From 0d07ad2610b8b58d237075392353fb35e45d2ae7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 25 Apr 2009 14:12:23 -0700
Subject: [PATCH 2/3] Strip NUL bytes (\0) from HTML before parsing

---
 src/calibre/ebooks/conversion/preprocess.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index fb55ee74fb..42e6654127 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -26,9 +26,9 @@ def sanitize_head(match):
 def chap_head(match):
     chap = match.group('chap')
     title = match.group('title')
-    if not title: 
+    if not title:
                return '<h1>'+chap+'</h1><br/>'
-    else: 
+    else:
                return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
 
 
@@ -49,19 +49,19 @@ def line_length(raw, percent):
     total = sum(lengths)
     avg = total / len(lengths)
     max_line = avg * 2
-    
+
     lengths = sorted(lengths)
     for i in range(len(lengths) - 1, -1, -1):
         if lengths[i] > max_line:
             del lengths[i]
-    
+
     if percent > 1:
         percent = 1
     if percent < 0:
         percent = 0
 
     index = int(len(lengths) * percent) - 1
-    
+
     return lengths[index]
 
 
@@ -110,17 +110,17 @@ class HTMLPreProcessor(object):
 
                   # Remove non breaking spaces
                   (re.compile(ur'\u00a0'), lambda match : ' '),
-                  
+
                   # Detect Chapters to match default XPATH in GUI
                   (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
                   (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
- 
+
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
-                  
+
                   # Un wrap lines
                   (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
-                  
+
                   # Clean up spaces
                   (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
                   # Add space before and after italics
@@ -157,6 +157,7 @@ class HTMLPreProcessor(object):
     def __call__(self, html, remove_special_chars=None):
         if remove_special_chars is not None:
             html = remove_special_chars.sub('', html)
+        html = html.replace('\0', '')
         if self.is_baen(html):
             rules = []
         elif self.is_book_designer(html):
@@ -166,7 +167,7 @@ class HTMLPreProcessor(object):
             #line_length_rules = [
             #    (re.compile('%i' % line_length(html, .85)), lambda match:)
             #]
-            
+
             rules = self.PDFTOHTML # + line_length_rules
         else:
             rules = []

From d253544a1f311aa692e78e5ff333af6d870fece3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 25 Apr 2009 14:38:23 -0700
Subject: [PATCH 3/3] Implement a --page-breaks-before option

---
 src/calibre/ebooks/conversion/cli.py           | 2 +-
 src/calibre/ebooks/conversion/plumber.py       | 8 ++++++++
 src/calibre/ebooks/oeb/transforms/structure.py | 8 ++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index ae0af532ab..e12686a36c 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -128,7 +128,7 @@ def add_pipeline_options(parser, plumber):
                   [
                       'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
                       'prefer_metadata_cover', 'remove_first_image',
-                      'insert_comments',
+                      'insert_comments', 'page_breaks_before',
                   ]
                   ),
 
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index f55d677d08..da9c9f11e2 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -227,6 +227,14 @@ OptionRecommendation(name='extra_css',
                 'rules.')
         ),
 
+OptionRecommendation(name='page_breaks_before',
+            recommended_value="//*[name()='h1' or name()='h2']",
+            level=OptionRecommendation.LOW,
+            help=_('An XPath expression. Page breaks are inserted '
+            'before the specified elements.')
+        ),
+
+
 OptionRecommendation(name='margin_top',
         recommended_value=5.0, level=OptionRecommendation.LOW,
         help=_('Set the top margin in pts. Default is %default')),
diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py
index 605cdaa7cf..8ec3c7737a 100644
--- a/src/calibre/ebooks/oeb/transforms/structure.py
+++ b/src/calibre/ebooks/oeb/transforms/structure.py
@@ -45,6 +45,14 @@ class DetectStructure(object):
                 if not node.title or regexp.search(node.title) is not None:
                     self.oeb.toc.remove(node)
 
+        if opts.page_breaks_before is not None:
+            pb_xpath = XPath(opts.page_breaks_before)
+            for item in oeb.spine:
+                for elem in pb_xpath(item.data):
+                    style = elem.get('style', '')
+                    if style:
+                        style += '; '
+                    elem.set('style', style+'page-break-before:always')
 
     def detect_chapters(self):
         self.detected_chapters = []