From 74243ac0b9277f5468bcdb58e54b397862da76e1 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 10 Jan 2011 17:07:40 +0800
Subject: [PATCH 001/118] preprocess tweaks

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 src/calibre/ebooks/conversion/utils.py      | 4 ++--
 src/calibre/ebooks/txt/processor.py         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 08a46cb8d9..f994888f19 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -360,7 +360,7 @@ class HTMLPreProcessor(object):
                   (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
 
                   # Center separator lines
-                  (re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+                  (re.compile(u'<br>\s*(?P<break>([*#•✦=]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
 
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 52d1bcc619..9177b5e53b 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -155,9 +155,9 @@ class PreProcessor(object):
 
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
             [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
             [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
             [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"],  # Numeric Chapters, no dot or colon
             [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
@@ -357,6 +357,6 @@ class PreProcessor(object):
         html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
 
         # Center separator lines
-        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
 
         return html
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 6a1a106681..ef9920185f 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -147,7 +147,7 @@ def detect_paragraph_type(txt):
             if .15 <= print_percent <= .75:
                 return 'print'
         elif .15 <= block_percent <= .75:
-            return 'block'     
+            return 'block'
 
         # Assume unformatted text with hardbreaks if nothing else matches        
         return 'unformatted'

From 9832b7118b592679541ab357de02e426e1f48a19 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 11 Jan 2011 11:27:25 +0800
Subject: [PATCH 002/118] chapter detection tweaks

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9177b5e53b..cfa57a28c3 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -151,11 +151,11 @@ class PreProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
 
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
+            [r"([A-Z-]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
             [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
             [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles

From 60b5d3853fc41cd17111287bfa2fff9b6bcab096 Mon Sep 17 00:00:00 2001
From: Hiroshi Miura <miurahr@linux.com>
Date: Fri, 14 Jan 2011 07:24:07 +0900
Subject: [PATCH 003/118] fix nikkei_sub economy

---
 resources/recipes/nikkei_sub_economy.recipe | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe
index 2dd8f1add8..8e7a68dfe7 100644
--- a/resources/recipes/nikkei_sub_economy.recipe
+++ b/resources/recipes/nikkei_sub_economy.recipe
@@ -27,6 +27,9 @@ class NikkeiNet_sub_economy(BasicNewsRecipe):
                        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
                        {'class':"cmn-article_keyword cmn-clearfix"},
                        {'class':"cmn-print_headline cmn-clearfix"},
+                       {'class':"cmn-article_list"},
+                       dict(id="ABOUT-NIKKEI"),
+                       {'class':"cmn-sub_market"},
                          ]
     remove_tags_after = {'class':"cmn-pr_list"}
 

From b0a9c9659cda37d6cda41b22cd765713fb29f308 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Thu, 13 Jan 2011 19:58:09 -0500
Subject: [PATCH 004/118] Add heuristic options. Remove options that they
 replace.

---
 src/calibre/ebooks/conversion/cli.py     |  23 ++++-
 src/calibre/ebooks/conversion/plumber.py | 117 ++++++++++++++---------
 2 files changed, 92 insertions(+), 48 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 3178fe1b43..f825776c9c 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -126,8 +126,21 @@ def add_pipeline_options(parser, plumber):
                       'margin_top', 'margin_left', 'margin_right',
                       'margin_bottom', 'change_justification',
                       'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
-                      'asciiize', 'remove_header', 'header_regex',
-                      'remove_footer', 'footer_regex',
+                      'asciiize',
+                  ]
+                  ),
+                  
+              'HEURISTICS' : (
+                  _('Modify the document text and strucutre using common patterns.'),
+                  [
+                      'enable_heuristics', 'markup_chapter_headings',
+                      'italicize_common_cases', 'fix_indents',
+                      'html_unwrap_factor', 'unwrap_lines',
+                      'delete_blank_paragraphs', 'format_scene_breaks',
+                      'dehyphenate',
+                      'sr1_search', 'sr1_replace',
+                      'sr2_search', 'sr2_replace',
+                      'sr3_search', 'sr3_replace',
                   ]
                   ),
 
@@ -137,7 +150,6 @@ def add_pipeline_options(parser, plumber):
                       'chapter', 'chapter_mark',
                       'prefer_metadata_cover', 'remove_first_image',
                       'insert_metadata', 'page_breaks_before',
-                      'preprocess_html', 'html_unwrap_factor',
                   ]
                   ),
 
@@ -164,8 +176,9 @@ def add_pipeline_options(parser, plumber):
 
               }
 
-    group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
-            'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
+    group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
+            'STRUCTURE DETECTION', 'TABLE OF CONTENTS',
+            'METADATA', 'DEBUG']
 
     for group in group_order:
         desc, options = groups[group]
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 9b22fb46ec..3ec4e104f9 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -376,23 +376,6 @@ OptionRecommendation(name='insert_metadata',
             )
         ),
 
-OptionRecommendation(name='preprocess_html',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Attempt to detect and correct hard line breaks and other '
-            'problems in the source file. This may make things worse, so use '
-            'with care.'
-            )
-        ),
-
-OptionRecommendation(name='html_unwrap_factor',
-        recommended_value=0.40, level=OptionRecommendation.LOW,
-        help=_('Scale used to determine the length at which a line should '
-            'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
-            'default is 0.40, just below the median line length. This will unwrap typical books '
-            ' with hard line breaks, but should be reduced if the line length is variable.'
-            )
-        ),
-
 OptionRecommendation(name='smarten_punctuation',
         recommended_value=False, level=OptionRecommendation.LOW,
         help=_('Convert plain quotes, dashes and ellipsis to their '
@@ -401,32 +384,6 @@ OptionRecommendation(name='smarten_punctuation',
             )
         ),
 
-OptionRecommendation(name='remove_header',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Use a regular expression to try and remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='header_regex',
-        recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-        level=OptionRecommendation.LOW,
-        help=_('The regular expression to use to remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='remove_footer',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Use a regular expression to try and remove the footer.'
-            )
-        ),
-
-OptionRecommendation(name='footer_regex',
-        recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-        level=OptionRecommendation.LOW,
-        help=_('The regular expression to use to remove the footer.'
-            )
-        ),
-
 OptionRecommendation(name='read_metadata_from_opf',
             recommended_value=None, level=OptionRecommendation.LOW,
             short_switch='m',
@@ -526,7 +483,81 @@ OptionRecommendation(name='pubdate',
 OptionRecommendation(name='timestamp',
     recommended_value=None, level=OptionRecommendation.LOW,
     help=_('Set the book timestamp (used by the date column in calibre).')),
+    
+OptionRecommendation(name='enable_heuristics',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Enable heurisic processing. This option must be set for any '
+           'heuristic processing to take place.')),
 
+OptionRecommendation(name='markup_chapter_headings',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Detect chapter headings and sub headings. Change ' 
+           'them to h1 and h2 tags.')),
+           
+OptionRecommendation(name='italicize_common_cases',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Look for common words and patterns that denote '
+           'italics and italicize them.')),
+           
+OptionRecommendation(name='fix_indents',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Turn indentation created from multiple &nbsp; entities '
+           'into CSS indents.')),
+           
+OptionRecommendation(name='html_unwrap_factor',
+    recommended_value=0.40, level=OptionRecommendation.LOW,
+    help=_('Scale used to determine the length at which a line should '
+            'be unwrapped. Valid values are a decimal between 0 and 1. The '
+            'default is 0.4, just below the median line length.')),
+            
+OptionRecommendation(name='unwrap_lines',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Unwrap lines.')),
+    
+OptionRecommendation(name='delete_blank_paragraphs',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Remove empyt paragraphs from the document')),
+    
+OptionRecommendation(name='format_scene_breaks',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Replace soft scene breaks that use multiple blank lines '
+           'with horizontal rules.')),
+
+OptionRecommendation(name='dehyphenate',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Combine words that are separated by a hyphen. '
+           'This is for cases where a word is hyphenated across '
+           'two lines to denote the characters from a single word.')),
+    
+OptionRecommendation(name='sr1_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr1-replace.')),
+    
+OptionRecommendation(name='sr1_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replace characters (can be lambda expression) to '
+           'replace the text found with sr1-search.')),
+
+OptionRecommendation(name='sr2_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr2-replace.')),
+
+OptionRecommendation(name='sr2_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replace characters (can be lambda expression) to '
+           'replace the text found with sr2-search.')),
+
+OptionRecommendation(name='sr3_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr3-replace.')),
+
+OptionRecommendation(name='sr3_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replace characters (can be lambda expression) to '
+           'replace the text found with sr3-search.')),
 ]
         # }}}
 

From 8676ddd30fba0df90eb62e7c1c84c3fd3dc13f39 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 18:12:17 +0800
Subject: [PATCH 005/118] updated heuristics help messages

---
 src/calibre/ebooks/conversion/plumber.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 3ec4e104f9..50d0646c7d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -491,8 +491,8 @@ OptionRecommendation(name='enable_heuristics',
 
 OptionRecommendation(name='markup_chapter_headings',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Detect chapter headings and sub headings. Change ' 
-           'them to h1 and h2 tags.')),
+    help=_('Detect unformatted chapter headings and sub headings. Change ' 
+           'them to h2 and h3 tags.')),
            
 OptionRecommendation(name='italicize_common_cases',
     recommended_value=False, level=OptionRecommendation.LOW,
@@ -508,26 +508,30 @@ OptionRecommendation(name='html_unwrap_factor',
     recommended_value=0.40, level=OptionRecommendation.LOW,
     help=_('Scale used to determine the length at which a line should '
             'be unwrapped. Valid values are a decimal between 0 and 1. The '
-            'default is 0.4, just below the median line length.')),
+            'default is 0.4, just below the median line length.  If only a '
+            'few lines in the document require unwrapping this value should '
+            'be reduced')),
             
 OptionRecommendation(name='unwrap_lines',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Unwrap lines.')),
+    help=_('Unwrap lines using punctuation and other formatting clues.')),
     
 OptionRecommendation(name='delete_blank_paragraphs',
     recommended_value=True, level=OptionRecommendation.LOW,
-    help=_('Remove empyt paragraphs from the document')),
+    help=_('Remove empty paragraphs from the document when they exist between '
+           'every other paragraph')),
     
 OptionRecommendation(name='format_scene_breaks',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Replace soft scene breaks that use multiple blank lines '
-           'with horizontal rules.')),
+    help=_('Detects left aligned scene break markers and center aligns them. '
+           'Replace soft scene breaks that use multiple blank lines with'
+           'horizontal rules.')),
 
 OptionRecommendation(name='dehyphenate',
     recommended_value=True, level=OptionRecommendation.LOW,
-    help=_('Combine words that are separated by a hyphen. '
-           'This is for cases where a word is hyphenated across '
-           'two lines to denote the characters from a single word.')),
+    help=_('Analyses hyphenated words throughout the document.  The '
+           'document itself is used as a dictionary to determine whether hyphens '
+           'should be retained or removed.')),
     
 OptionRecommendation(name='sr1_search',
     recommended_value='', level=OptionRecommendation.LOW,

From 90c978bb1076e7afb7843df96959cc365d17332d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 21:33:47 +0800
Subject: [PATCH 006/118] tied enable heuristics to preprocess, moved various
 pieces to functions

---
 src/calibre/customize/conversion.py      |   2 +-
 src/calibre/ebooks/conversion/plumber.py |   4 +-
 src/calibre/ebooks/conversion/utils.py   | 117 ++++++++++++++---------
 src/calibre/ebooks/lit/input.py          |   2 +-
 4 files changed, 78 insertions(+), 47 deletions(-)

diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index ec83600a49..a9e573ffa0 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,7 +160,7 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError()
 
-    def preprocess_html(self, opts, html):
+    def heuristics(self, opts, html):
         '''
         This method is called by the conversion pipeline on all HTML before it
         is parsed. It is meant to be used to do any required preprocessing on
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 50d0646c7d..a40c17a743 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -1012,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
     Create an OEBBook.
     '''
     from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
-            opts.preprocess_html, opts)
+    html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
+            opts.enable_heuristics, opts)
     if not encoding:
         encoding = None
     oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index dac93fa2e2..44d4235b6c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,6 +113,11 @@ class PreProcessor(object):
         return wordcount.words
 
     def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        '''
+        Searches for common chapter headings throughout the document
+        attempts multiple patterns based on likelihood of a match
+        with minimum false positives.  Exits after finding a successful pattern
+        '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
         # minimum of chapters to search for
         self.min_chapters = 1
@@ -185,6 +190,10 @@ class PreProcessor(object):
         return html
 
     def punctuation_unwrap(self, length, content, format):
+        '''
+        Unwraps lines based on line length and punctuation
+        supports range of potential html markup and text files
+        '''
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
@@ -201,53 +210,38 @@ class PreProcessor(object):
         return content
 
 
-    def __call__(self, html):
-        self.log("*********  Preprocessing HTML  *********")
+    def text_process_pre(self, html):
+        pre = re.compile(r'<pre>', re.IGNORECASE)
+        if len(pre.findall(html)) == 1:
+            self.log("Running Text Processing")
+            from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+            separate_paragraphs_single_line
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
+            html = outerhtml.sub('\g<text>', html)
+            html = separate_paragraphs_single_line(html)
+            html = preserve_spaces(html)
+            html = convert_basic(html, epub_split_size_kb=0)
+        else:
+            # Add markup naively
+            # TODO - find out if there are cases where there are more than one <pre> tag or
+            # other types of unmarked html and handle them in some better fashion
+            add_markup = re.compile('(?<!>)(\n)')
+            html = add_markup.sub('</p>\n<p>', html)
+        return html
 
-        # Count the words in the document to estimate how many chapters to look for and whether
-        # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
-
-        if totalwords < 50:
-            self.log("not enough text, not preprocessing")
-            return html
-
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+    def arrange_htm_line_endings(self, html):
         html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
         html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
+        return html
 
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and  mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            pre = re.compile(r'<pre>', re.IGNORECASE)
-            if len(pre.findall(html)) == 1:
-                self.log("Running Text Processing")
-                from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
-                separate_paragraphs_single_line
-                outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-                html = outerhtml.sub('\g<text>', html)
-                html = separate_paragraphs_single_line(html)
-                html = preserve_spaces(html)
-                html = convert_basic(html, epub_split_size_kb=0)
-            else:
-                # Add markup naively
-                # TODO - find out if there are cases where there are more than one <pre> tag or
-                # other types of unmarked html and handle them in some better fashion
-                add_markup = re.compile('(?<!>)(\n)')
-                html = add_markup.sub('</p>\n<p>', html)
-
-        ###### Mark Indents/Cleanup ######
-        #
-        # Replace series of non-breaking spaces with text-indent
+    def fix_nbsp_indents(self, html):
         txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
         html = txtindent.sub(self.insert_indent, html)
         if self.found_indents > 1:
             self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+        return html
+
+    def cleanup_markup(self, html):
         # remove remaining non-breaking spaces
         html = re.sub(ur'\u00a0', ' ', html)
         # Get rid of various common microsoft specific tags which can cause issues later
@@ -259,27 +253,64 @@ class PreProcessor(object):
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        return html
+
+
+    def __call__(self, html):
+        self.log("*********  Preprocessing HTML  *********")
+
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        totalwords = 0
+        totalwords = self.get_word_count(html)
+
+        if totalwords < 50:
+            self.log("flow is too short, not running heuristics")
+            return html
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = self.arrange_htm_line_endings(html)
+
+
+        ###### Check Markup ######
+        #
+        # some lit files don't have any <p> tags or equivalent (generally just plain text between
+        # <pre> tags), check and  mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+            self.log("not enough paragraph markers, adding now")
+            # check if content is in pre tags, use txt processor to mark up if so
+            html = self.text_process_pre(html)
+
+        ###### Mark Indents/Cleanup ######
+        #
+        # Replace series of non-breaking spaces with text-indent
+        html = self.fix_nbsp_indents(html)
+        
+        html = self.cleanup_markup(html)
+
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
-        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
-        # paragraph spacing then delete blank lines to clean up spacing
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
+        if getattr(self.extra_opts, 'delete_blank_paragraphs', False):
+            print "configured to delete blank paragraphs"
         if len(lines) > 1:
             self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
-            'remove_paragraph_spacing', False):
+            'delete_blank_paragraphs', False):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
             elif float(len(blanklines)) / float(len(lines)) > 0.40:
                 blanks_between_paragraphs = True
-                #print "blanks between paragraphs is marked True"
+                print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
 
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 46a5e75977..d0ecf008b7 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -53,7 +53,7 @@ class LITInput(InputFormatPlugin):
                         pre.append(ne)
 
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         self.options = options
         preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
         return preprocessor(html)

From 4893fa5d3a5ff8f4a3e3ebf8915ff6611c9c3921 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 22:18:44 +0800
Subject: [PATCH 007/118] fixed the other plugins using preprocess

---
 src/calibre/ebooks/html/input.py | 2 +-
 src/calibre/ebooks/lrf/input.py  | 2 +-
 src/calibre/ebooks/mobi/input.py | 2 +-
 src/calibre/ebooks/pdb/input.py  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 1f07f4ca41..479f852c77 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -486,7 +486,7 @@ class HTMLInput(InputFormatPlugin):
             return (None, None)
         return (None, raw)
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         self.options = options
         preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
         return preprocessor(html)
diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py
index 70529c0a04..05c8731da5 100644
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@@ -420,7 +420,7 @@ class LRFInput(InputFormatPlugin):
         styles.write()
         return os.path.abspath('content.opf')
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         self.options = options
         preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
         return preprocessor(html)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 9ab7996a74..584be71fe4 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -39,7 +39,7 @@ class MOBIInput(InputFormatPlugin):
                 accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
         return mr.created_opf_path
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index 1b665bf94e..b0e7746c7e 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -33,7 +33,7 @@ class PDBInput(InputFormatPlugin):
 
         return opf
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         self.options = options
         preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
         return preprocessor(html)

From 80ed2e7d4ee1f94b6f6ffe1297b7058345a8d22a Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 22:32:03 +0800
Subject: [PATCH 008/118] Always print the delete_blank_paragraphs setting

---
 src/calibre/ebooks/conversion/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 44d4235b6c..bfb23c45aa 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -299,8 +299,7 @@ class PreProcessor(object):
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
-        if getattr(self.extra_opts, 'delete_blank_paragraphs', False):
-            print "configured to delete blank paragraphs"
+        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', True))
         if len(lines) > 1:
             self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")

From 65f9eff665042099269114f2905bfc30eef0a456 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 23:43:38 +0800
Subject: [PATCH 009/118] remove heuristics from pdb input

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 src/calibre/ebooks/pdb/input.py        | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bfb23c45aa..286fad1aaa 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -299,7 +299,7 @@ class PreProcessor(object):
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
-        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', True))
+        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', False))
         if len(lines) > 1:
             self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index b0e7746c7e..de210e0a6d 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -32,8 +32,3 @@ class PDBInput(InputFormatPlugin):
         opf = reader.extract_content(os.getcwd())
 
         return opf
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)

From f46749863850242e58e92a2f337a6abb1be03486 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 00:11:21 +0800
Subject: [PATCH 010/118] preserve soft breaks when deleting blank paragraphs

---
 src/calibre/ebooks/conversion/utils.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 286fad1aaa..96bd303933 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -294,8 +294,8 @@ class PreProcessor(object):
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
+        blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
@@ -303,11 +303,8 @@ class PreProcessor(object):
         if len(lines) > 1:
             self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
-            if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
-            'delete_blank_paragraphs', False):
-                self.log("deleting blank lines")
-                html = blankreg.sub('', html)
-            elif float(len(blanklines)) / float(len(lines)) > 0.40:
+                    
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
                 blanks_between_paragraphs = True
                 print "blanks between paragraphs is marked True"
             else:
@@ -319,7 +316,12 @@ class PreProcessor(object):
 
         html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
 
-
+        if blanks_between_paragraphs and getattr(self.extra_opts,
+        'delete_blank_paragraphs', False):
+            self.log("deleting blank lines")
+            html = multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = blankreg.sub('', html)
+            
         ###### Unwrap lines ######
         #
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags

From 680de553652677aae6e532c2b4ece965454457db Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 18:50:27 -0500
Subject: [PATCH 011/118] Beginnings of Heuristics GUI options widget.

---
 src/calibre/gui2/convert/heuristics.py        |  62 +++++
 src/calibre/gui2/convert/heuristics.ui        | 219 ++++++++++++++++++
 src/calibre/gui2/convert/single.py            |   4 +-
 .../gui2/convert/structure_detection.py       |  32 +--
 .../gui2/convert/structure_detection.ui       | 113 +--------
 src/calibre/gui2/preferences/conversion.py    |   5 +-
 6 files changed, 293 insertions(+), 142 deletions(-)
 create mode 100644 src/calibre/gui2/convert/heuristics.py
 create mode 100644 src/calibre/gui2/convert/heuristics.ui

diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
new file mode 100644
index 0000000000..132652701a
--- /dev/null
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.gui2.convert.heuristics_ui import Ui_Form
+from calibre.gui2.convert import Widget
+from calibre.gui2 import error_dialog
+
+class HeuristicsWidget(Widget, Ui_Form):
+
+    TITLE = _('Heuristics')
+    HELP  = _('')
+    COMMIT_NAME = 'heuristics'
+
+    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+        Widget.__init__(self, parent,
+                ['enable_heuristics', 'markup_chapter_headings',
+                 'italicize_common_cases', 'fix_indents',
+                 'html_unwrap_factor', 'unwrap_lines',
+                 'delete_blank_paragraphs', 'format_scene_breaks',
+                 'dehyphenate',
+                 'sr1_search', 'sr1_replace',
+                 'sr2_search', 'sr2_replace',
+                 'sr3_search', 'sr3_replace']
+                )
+        self.db, self.book_id = db, book_id
+        self.initialize_options(get_option, get_help, db, book_id)
+        self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
+        self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
+        self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
+        self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
+        self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
+        self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
+
+    def break_cycles(self):
+        Widget.break_cycles(self)
+        self.opt_sr1_search.break_cycles()
+        self.opt_sr1_replace.break_cycles()
+        self.opt_sr2_search.break_cycles()
+        self.opt_sr2_replace.break_cycles()
+        self.opt_sr3_search.break_cycles()
+        self.opt_sr3_replace.break_cycles()
+
+    def pre_commit_check(self):
+        for x in ('sr1-search', 'sr1-replace', 'sr2-search', 'sr2-replace', 'sr3-search', 'sr3-replace',):
+            x = getattr(self, 'opt_'+x)
+            try:
+                pat = unicode(x.regex)
+                re.compile(pat)
+            except Exception, err:
+                error_dialog(self, _('Invalid regular expression'),
+                             _('Invalid regular expression: %s')%err).exec_()
+                return False
+            
+    def set_value_handler(self, g, val):
+        if val is None and g is self.opt_html_unwrap_factor:
+            g.setValue(0.0)
+            return True
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
new file mode 100644
index 0000000000..2c103ff5b6
--- /dev/null
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -0,0 +1,219 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Form</class>
+ <widget class="QWidget" name="Form">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>657</width>
+    <height>479</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Form</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout">
+   <item>
+    <widget class="QCheckBox" name="opt_enable_heuristics">
+     <property name="text">
+      <string>&amp;Preprocess input file to possibly improve structure detection</string>
+     </property>
+    </widget>
+   </item>
+   <item>
+    <widget class="QGroupBox" name="groupBox">
+     <property name="title">
+      <string>Heuristics</string>
+     </property>
+     <layout class="QGridLayout" name="gridLayout_2">
+      <item row="0" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_unwrap_lines">
+        <property name="text">
+         <string>Unwrap lines</string>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <spacer name="horizontalSpacer">
+        <property name="orientation">
+         <enum>Qt::Horizontal</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>131</width>
+          <height>22</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+      <item row="1" column="1">
+       <widget class="QLabel" name="huf_label">
+        <property name="text">
+         <string>Line &amp;un-wrap factor during preprocess:</string>
+        </property>
+        <property name="buddy">
+         <cstring>opt_html_unwrap_factor</cstring>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="2">
+       <widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
+        <property name="toolTip">
+         <string/>
+        </property>
+        <property name="maximum">
+         <double>1.000000000000000</double>
+        </property>
+        <property name="singleStep">
+         <double>0.050000000000000</double>
+        </property>
+        <property name="value">
+         <double>0.400000000000000</double>
+        </property>
+       </widget>
+      </item>
+      <item row="2" column="0" colspan="3">
+       <widget class="QCheckBox" name="opt_markup_chapter_headings">
+        <property name="text">
+         <string>markup_chapter_headings</string>
+        </property>
+       </widget>
+      </item>
+      <item row="3" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_delete_blank_paragraphs">
+        <property name="text">
+         <string>Delete blank lines between paragraphs</string>
+        </property>
+       </widget>
+      </item>
+      <item row="4" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_format_scene_breaks">
+        <property name="text">
+         <string>format_scene_breaks</string>
+        </property>
+       </widget>
+      </item>
+      <item row="5" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_dehyphenate">
+        <property name="text">
+         <string>dehyphenate</string>
+        </property>
+       </widget>
+      </item>
+      <item row="6" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_italicize_common_cases">
+        <property name="text">
+         <string>italicize_common_cases</string>
+        </property>
+       </widget>
+      </item>
+      <item row="7" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_fix_indents">
+        <property name="text">
+         <string>fix_indents</string>
+        </property>
+       </widget>
+      </item>
+      <item row="8" column="0">
+       <spacer name="verticalSpacer">
+        <property name="orientation">
+         <enum>Qt::Vertical</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>131</width>
+          <height>95</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+      <item row="1" column="3">
+       <spacer name="horizontalSpacer_2">
+        <property name="orientation">
+         <enum>Qt::Horizontal</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>40</width>
+          <height>20</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+     </layout>
+    </widget>
+   </item>
+   <item>
+    <widget class="QGroupBox" name="groupBox_2">
+     <property name="title">
+      <string>Search and Replace</string>
+     </property>
+     <layout class="QGridLayout" name="gridLayout">
+      <item row="0" column="0">
+       <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
+      </item>
+      <item row="0" column="1">
+       <widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
+      </item>
+      <item row="1" column="0">
+       <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
+      </item>
+      <item row="1" column="1">
+       <widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
+      </item>
+      <item row="2" column="0">
+       <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
+      </item>
+      <item row="2" column="1">
+       <widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
+      </item>
+     </layout>
+    </widget>
+   </item>
+  </layout>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>RegexEdit</class>
+   <extends>QWidget</extends>
+   <header>regex_builder.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>opt_enable_heuristics</sender>
+   <signal>toggled(bool)</signal>
+   <receiver>opt_html_unwrap_factor</receiver>
+   <slot>setEnabled(bool)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>328</x>
+     <y>87</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>481</x>
+     <y>113</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>opt_enable_heuristics</sender>
+   <signal>toggled(bool)</signal>
+   <receiver>huf_label</receiver>
+   <slot>setEnabled(bool)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>295</x>
+     <y>88</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>291</x>
+     <y>105</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+</ui>
diff --git a/src/calibre/gui2/convert/single.py b/src/calibre/gui2/convert/single.py
index 7fa8c29835..0337b779a0 100644
--- a/src/calibre/gui2/convert/single.py
+++ b/src/calibre/gui2/convert/single.py
@@ -16,6 +16,7 @@ from calibre.ebooks.conversion.config import GuiRecommendations, save_specifics,
 from calibre.gui2.convert.single_ui import Ui_Dialog
 from calibre.gui2.convert.metadata import MetadataWidget
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -170,6 +171,7 @@ class Config(ResizableDialog, Ui_Dialog):
         self.mw = widget_factory(MetadataWidget)
         self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text()))
         lf = widget_factory(LookAndFeelWidget)
+        hw = widget_factory(HeuristicsWidget)
         ps = widget_factory(PageSetupWidget)
         sd = widget_factory(StructureDetectionWidget)
         toc = widget_factory(TOCWidget)
@@ -203,7 +205,7 @@ class Config(ResizableDialog, Ui_Dialog):
             if not c: break
             self.stack.removeWidget(c)
 
-        widgets = [self.mw, lf, ps, sd, toc]
+        widgets = [self.mw, lf, hw, ps, sd, toc]
         if input_widget is not None:
             widgets.append(input_widget)
         if output_widget is not None:
diff --git a/src/calibre/gui2/convert/structure_detection.py b/src/calibre/gui2/convert/structure_detection.py
index 3f350d4508..2c64303ee7 100644
--- a/src/calibre/gui2/convert/structure_detection.py
+++ b/src/calibre/gui2/convert/structure_detection.py
@@ -6,8 +6,6 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re
-
 from calibre.gui2.convert.structure_detection_ui import Ui_Form
 from calibre.gui2.convert import Widget
 from calibre.gui2 import error_dialog
@@ -24,12 +22,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
         Widget.__init__(self, parent,
                 ['chapter', 'chapter_mark',
                 'remove_first_image',
-                'insert_metadata', 'page_breaks_before',
-                'preprocess_html', 'remove_header', 'header_regex',
-                'remove_footer', 'footer_regex','html_unwrap_factor']
+                'insert_metadata', 'page_breaks_before']
                 )
-        self.opt_html_unwrap_factor.setEnabled(False)
-        self.huf_label.setEnabled(False)
         self.db, self.book_id = db, book_id
         for x in ('pagebreak', 'rule', 'both', 'none'):
             self.opt_chapter_mark.addItem(x)
@@ -37,28 +31,11 @@ class StructureDetectionWidget(Widget, Ui_Form):
         self.opt_chapter.set_msg(_('Detect chapters at (XPath expression):'))
         self.opt_page_breaks_before.set_msg(_('Insert page breaks before '
             '(XPath expression):'))
-        self.opt_header_regex.set_msg(_('Header regular expression:'))
-        self.opt_header_regex.set_book_id(book_id)
-        self.opt_header_regex.set_db(db)
-        self.opt_footer_regex.set_msg(_('Footer regular expression:'))
-        self.opt_footer_regex.set_book_id(book_id)
-        self.opt_footer_regex.set_db(db)
-
+        
     def break_cycles(self):
         Widget.break_cycles(self)
-        self.opt_header_regex.break_cycles()
-        self.opt_footer_regex.break_cycles()
 
     def pre_commit_check(self):
-        for x in ('header_regex', 'footer_regex'):
-            x = getattr(self, 'opt_'+x)
-            try:
-                pat = unicode(x.regex)
-                re.compile(pat)
-            except Exception, err:
-                error_dialog(self, _('Invalid regular expression'),
-                             _('Invalid regular expression: %s')%err).exec_()
-                return False
         for x in ('chapter', 'page_breaks_before'):
             x = getattr(self, 'opt_'+x)
             if not x.check():
@@ -66,8 +43,3 @@ class StructureDetectionWidget(Widget, Ui_Form):
                 _('The XPath expression %s is invalid.')%x.text).exec_()
                 return False
         return True
-
-    def set_value_handler(self, g, val):
-        if val is None and g is self.opt_html_unwrap_factor:
-            g.setValue(0.0)
-            return True
diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui
index 21fe365e99..b690a68b0a 100644
--- a/src/calibre/gui2/convert/structure_detection.ui
+++ b/src/calibre/gui2/convert/structure_detection.ui
@@ -41,17 +41,17 @@
      </property>
     </widget>
    </item>
-   <item row="5" column="0" colspan="2">
+   <item row="3" column="0" colspan="2">
     <widget class="QCheckBox" name="opt_insert_metadata">
      <property name="text">
       <string>Insert &amp;metadata as page at start of book</string>
      </property>
     </widget>
    </item>
-   <item row="11" column="0" colspan="3">
+   <item row="5" column="0" colspan="3">
     <widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
    </item>
-   <item row="12" column="0" colspan="3">
+   <item row="6" column="0" colspan="3">
     <spacer name="verticalSpacer">
      <property name="orientation">
       <enum>Qt::Vertical</enum>
@@ -64,72 +64,6 @@
      </property>
     </spacer>
    </item>
-   <item row="8" column="0" colspan="2">
-    <widget class="QCheckBox" name="opt_remove_footer">
-     <property name="text">
-      <string>Remove F&amp;ooter</string>
-     </property>
-    </widget>
-   </item>
-   <item row="6" column="0" colspan="2">
-    <widget class="QCheckBox" name="opt_remove_header">
-     <property name="text">
-      <string>Remove H&amp;eader</string>
-     </property>
-    </widget>
-   </item>
-   <item row="7" column="0" colspan="3">
-    <widget class="RegexEdit" name="opt_header_regex" native="true"/>
-   </item>
-   <item row="9" column="0" colspan="3">
-    <widget class="RegexEdit" name="opt_footer_regex" native="true"/>
-   </item>
-   <item row="4" column="1">
-    <widget class="QLabel" name="huf_label">
-     <property name="text">
-      <string>Line &amp;un-wrap factor during preprocess:</string>
-     </property>
-     <property name="buddy">
-      <cstring>opt_html_unwrap_factor</cstring>
-     </property>
-    </widget>
-   </item>
-   <item row="4" column="2">
-    <widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
-     <property name="toolTip">
-      <string/>
-     </property>
-     <property name="maximum">
-      <double>1.000000000000000</double>
-     </property>
-     <property name="singleStep">
-      <double>0.050000000000000</double>
-     </property>
-     <property name="value">
-      <double>0.400000000000000</double>
-     </property>
-    </widget>
-   </item>
-   <item row="4" column="0">
-    <spacer name="horizontalSpacer">
-     <property name="orientation">
-      <enum>Qt::Horizontal</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>40</width>
-       <height>20</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
-   <item row="3" column="0" colspan="2">
-    <widget class="QCheckBox" name="opt_preprocess_html">
-     <property name="text">
-      <string>&amp;Preprocess input file to possibly improve structure detection</string>
-     </property>
-    </widget>
-   </item>
   </layout>
  </widget>
  <customwidgets>
@@ -139,46 +73,7 @@
    <header>convert/xpath_wizard.h</header>
    <container>1</container>
   </customwidget>
-  <customwidget>
-   <class>RegexEdit</class>
-   <extends>QWidget</extends>
-   <header>regex_builder.h</header>
-   <container>1</container>
-  </customwidget>
  </customwidgets>
  <resources/>
- <connections>
-  <connection>
-   <sender>opt_preprocess_html</sender>
-   <signal>toggled(bool)</signal>
-   <receiver>opt_html_unwrap_factor</receiver>
-   <slot>setEnabled(bool)</slot>
-   <hints>
-    <hint type="sourcelabel">
-     <x>328</x>
-     <y>87</y>
-    </hint>
-    <hint type="destinationlabel">
-     <x>481</x>
-     <y>113</y>
-    </hint>
-   </hints>
-  </connection>
-  <connection>
-   <sender>opt_preprocess_html</sender>
-   <signal>toggled(bool)</signal>
-   <receiver>huf_label</receiver>
-   <slot>setEnabled(bool)</slot>
-   <hints>
-    <hint type="sourcelabel">
-     <x>295</x>
-     <y>88</y>
-    </hint>
-    <hint type="destinationlabel">
-     <x>291</x>
-     <y>105</y>
-    </hint>
-   </hints>
-  </connection>
- </connections>
+ <connections/>
 </ui>
diff --git a/src/calibre/gui2/preferences/conversion.py b/src/calibre/gui2/preferences/conversion.py
index 0063d4a341..a20872cee0 100644
--- a/src/calibre/gui2/preferences/conversion.py
+++ b/src/calibre/gui2/preferences/conversion.py
@@ -12,6 +12,7 @@ from calibre.ebooks.conversion.plumber import Plumber
 from calibre.utils.logging import Log
 from calibre.gui2.preferences.conversion_ui import Ui_Form
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -82,8 +83,8 @@ class Base(ConfigWidgetBase, Ui_Form):
 class CommonOptions(Base):
 
     def load_conversion_widgets(self):
-        self.conversion_widgets = [LookAndFeelWidget, PageSetupWidget,
-                StructureDetectionWidget, TOCWidget]
+        self.conversion_widgets = [LookAndFeelWidget, HeuristicsWidget,
+                PageSetupWidget, StructureDetectionWidget, TOCWidget]
 
 class InputOptions(Base):
 

From 7d75b065126f1bc5feade93c991db0b6fd261073 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 18:53:02 -0500
Subject: [PATCH 012/118] Change heuristic options to default False to maintain
 consistency.

---
 src/calibre/ebooks/conversion/plumber.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index a40c17a743..2e88baea4e 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -500,7 +500,7 @@ OptionRecommendation(name='italicize_common_cases',
            'italics and italicize them.')),
            
 OptionRecommendation(name='fix_indents',
-    recommended_value=True, level=OptionRecommendation.LOW,
+    recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Turn indentation created from multiple &nbsp; entities '
            'into CSS indents.')),
            
@@ -517,7 +517,7 @@ OptionRecommendation(name='unwrap_lines',
     help=_('Unwrap lines using punctuation and other formatting clues.')),
     
 OptionRecommendation(name='delete_blank_paragraphs',
-    recommended_value=True, level=OptionRecommendation.LOW,
+    recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Remove empty paragraphs from the document when they exist between '
            'every other paragraph')),
     
@@ -528,7 +528,7 @@ OptionRecommendation(name='format_scene_breaks',
            'horizontal rules.')),
 
 OptionRecommendation(name='dehyphenate',
-    recommended_value=True, level=OptionRecommendation.LOW,
+    recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Analyses hyphenated words throughout the document.  The '
            'document itself is used as a dictionary to determine whether hyphens '
            'should be retained or removed.')),

From 8f4d60073f982185522bdc20cfbaf0aea0f9de0c Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 19:24:31 -0500
Subject: [PATCH 013/118] Finish new GUI option widgets.

---
 src/calibre/ebooks/conversion/cli.py          | 16 +++--
 src/calibre/gui2/convert/bulk.py              |  6 +-
 src/calibre/gui2/convert/heuristics.py        | 68 ++++++++++---------
 src/calibre/gui2/convert/heuristics.ui        | 47 ++-----------
 .../gui2/convert/search_and_replace.py        | 57 ++++++++++++++++
 .../gui2/convert/search_and_replace.ui        | 47 +++++++++++++
 src/calibre/gui2/convert/single.py            |  4 +-
 .../gui2/convert/structure_detection.ui       |  6 +-
 src/calibre/gui2/preferences/conversion.py    |  4 +-
 9 files changed, 172 insertions(+), 83 deletions(-)
 create mode 100644 src/calibre/gui2/convert/search_and_replace.py
 create mode 100644 src/calibre/gui2/convert/search_and_replace.ui

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index f825776c9c..91f0f95348 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -131,18 +131,24 @@ def add_pipeline_options(parser, plumber):
                   ),
                   
               'HEURISTICS' : (
-                  _('Modify the document text and strucutre using common patterns.'),
+                  _('Modify the document text and structure using common patterns.'),
                   [
                       'enable_heuristics', 'markup_chapter_headings',
                       'italicize_common_cases', 'fix_indents',
                       'html_unwrap_factor', 'unwrap_lines',
                       'delete_blank_paragraphs', 'format_scene_breaks',
                       'dehyphenate',
+                  ]
+                  ),
+                  
+              'SEARCH AND REPLACE' : (
+                 _('Modify the document text and structure using user defined patterns.'),
+                 [
                       'sr1_search', 'sr1_replace',
                       'sr2_search', 'sr2_replace',
                       'sr3_search', 'sr3_replace',
-                  ]
-                  ),
+                 ]
+              ),
 
               'STRUCTURE DETECTION' : (
                   _('Control auto-detection of document structure.'),
@@ -177,8 +183,8 @@ def add_pipeline_options(parser, plumber):
               }
 
     group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
-            'STRUCTURE DETECTION', 'TABLE OF CONTENTS',
-            'METADATA', 'DEBUG']
+            'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
+            'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
 
     for group in group_order:
         desc, options = groups[group]
diff --git a/src/calibre/gui2/convert/bulk.py b/src/calibre/gui2/convert/bulk.py
index 198f6144e4..b97ab1a2dc 100644
--- a/src/calibre/gui2/convert/bulk.py
+++ b/src/calibre/gui2/convert/bulk.py
@@ -11,6 +11,8 @@ from calibre.gui2.convert.single import Config, sort_formats_by_preference, \
 from calibre.customize.ui import available_output_formats
 from calibre.gui2 import ResizableDialog
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -69,6 +71,8 @@ class BulkConfig(Config):
 
         self.setWindowTitle(_('Bulk Convert'))
         lf = widget_factory(LookAndFeelWidget)
+        hw = widget_factory(HeuristicsWidget)
+        sr = widget_factory(SearchAndReplaceWidget)
         ps = widget_factory(PageSetupWidget)
         sd = widget_factory(StructureDetectionWidget)
         toc = widget_factory(TOCWidget)
@@ -90,7 +94,7 @@ class BulkConfig(Config):
             if not c: break
             self.stack.removeWidget(c)
 
-        widgets = [lf, ps, sd, toc]
+        widgets = [lf, hw, sr, ps, sd, toc]
         if output_widget is not None:
             widgets.append(output_widget)
         for w in widgets:
diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 132652701a..2b9df50457 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -4,16 +4,15 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import re
+from PyQt4.Qt import Qt
 
 from calibre.gui2.convert.heuristics_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.gui2 import error_dialog
 
 class HeuristicsWidget(Widget, Ui_Form):
 
     TITLE = _('Heuristics')
-    HELP  = _('')
+    HELP  = _('Modify the document text and structure using common patterns.')
     COMMIT_NAME = 'heuristics'
 
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
@@ -22,41 +21,48 @@ class HeuristicsWidget(Widget, Ui_Form):
                  'italicize_common_cases', 'fix_indents',
                  'html_unwrap_factor', 'unwrap_lines',
                  'delete_blank_paragraphs', 'format_scene_breaks',
-                 'dehyphenate',
-                 'sr1_search', 'sr1_replace',
-                 'sr2_search', 'sr2_replace',
-                 'sr3_search', 'sr3_replace']
+                 'dehyphenate']
                 )
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
-        self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
-        self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
-        self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
-        self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
-        self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
-        self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
+        
+        self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
+        self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)
+        
+        self.enable_heuristics(self.opt_enable_heuristics.checkState())
 
     def break_cycles(self):
         Widget.break_cycles(self)
-        self.opt_sr1_search.break_cycles()
-        self.opt_sr1_replace.break_cycles()
-        self.opt_sr2_search.break_cycles()
-        self.opt_sr2_replace.break_cycles()
-        self.opt_sr3_search.break_cycles()
-        self.opt_sr3_replace.break_cycles()
-
-    def pre_commit_check(self):
-        for x in ('sr1-search', 'sr1-replace', 'sr2-search', 'sr2-replace', 'sr3-search', 'sr3-replace',):
-            x = getattr(self, 'opt_'+x)
-            try:
-                pat = unicode(x.regex)
-                re.compile(pat)
-            except Exception, err:
-                error_dialog(self, _('Invalid regular expression'),
-                             _('Invalid regular expression: %s')%err).exec_()
-                return False
-            
+        
+        self.opt_enable_heuristics.stateChanged.disconnect()
+        self.opt_unwrap_lines.stateChanged.disconnect()
+        
     def set_value_handler(self, g, val):
         if val is None and g is self.opt_html_unwrap_factor:
             g.setValue(0.0)
             return True
+
+    def enable_heuristics(self, state):
+        if state == Qt.Checked:
+            state = True
+        else:
+            state = False
+        self.opt_markup_chapter_headings.setEnabled(state)
+        self.opt_italicize_common_cases.setEnabled(state)
+        self.opt_fix_indents.setEnabled(state)
+        self.opt_delete_blank_paragraphs.setEnabled(state)
+        self.opt_format_scene_breaks.setEnabled(state)
+        self.opt_dehyphenate.setEnabled(state)
+        
+        self.opt_unwrap_lines.setEnabled(state)
+        if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
+            self.opt_html_unwrap_factor.setEnabled(True)
+        else:
+            self.opt_html_unwrap_factor.setEnabled(False)
+
+    def enable_unwrap(self, state):
+        if state == Qt.Checked:
+            state = True
+        else:
+            state = False
+        self.opt_html_unwrap_factor.setEnabled(state)
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index 2c103ff5b6..e64e79e1df 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -76,7 +76,7 @@
       <item row="2" column="0" colspan="3">
        <widget class="QCheckBox" name="opt_markup_chapter_headings">
         <property name="text">
-         <string>markup_chapter_headings</string>
+         <string>Detect and markup unformatted chapter headings and sub headings</string>
         </property>
        </widget>
       </item>
@@ -90,28 +90,28 @@
       <item row="4" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_format_scene_breaks">
         <property name="text">
-         <string>format_scene_breaks</string>
+         <string>Ensure scene breaks are consistently formatted</string>
         </property>
        </widget>
       </item>
       <item row="5" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_dehyphenate">
         <property name="text">
-         <string>dehyphenate</string>
+         <string>Remove unnecessary hyphens</string>
         </property>
        </widget>
       </item>
       <item row="6" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_italicize_common_cases">
         <property name="text">
-         <string>italicize_common_cases</string>
+         <string>Italicize common words and patterns</string>
         </property>
        </widget>
       </item>
       <item row="7" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_fix_indents">
         <property name="text">
-         <string>fix_indents</string>
+         <string>Replace entity indents with CSS indents</string>
         </property>
        </widget>
       </item>
@@ -123,7 +123,7 @@
         <property name="sizeHint" stdset="0">
          <size>
           <width>131</width>
-          <height>95</height>
+          <height>35</height>
          </size>
         </property>
        </spacer>
@@ -144,43 +144,8 @@
      </layout>
     </widget>
    </item>
-   <item>
-    <widget class="QGroupBox" name="groupBox_2">
-     <property name="title">
-      <string>Search and Replace</string>
-     </property>
-     <layout class="QGridLayout" name="gridLayout">
-      <item row="0" column="0">
-       <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
-      </item>
-      <item row="0" column="1">
-       <widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
-      </item>
-      <item row="1" column="0">
-       <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
-      </item>
-      <item row="1" column="1">
-       <widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
-      </item>
-      <item row="2" column="0">
-       <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
-      </item>
-      <item row="2" column="1">
-       <widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
-      </item>
-     </layout>
-    </widget>
-   </item>
   </layout>
  </widget>
- <customwidgets>
-  <customwidget>
-   <class>RegexEdit</class>
-   <extends>QWidget</extends>
-   <header>regex_builder.h</header>
-   <container>1</container>
-  </customwidget>
- </customwidgets>
  <resources/>
  <connections>
   <connection>
diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
new file mode 100644
index 0000000000..860cc11d4e
--- /dev/null
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from PyQt4.Qt import Qt
+
+from calibre.gui2.convert.search_and_replace_ui import Ui_Form
+from calibre.gui2.convert import Widget
+from calibre.gui2 import error_dialog
+
+class SearchAndReplaceWidget(Widget, Ui_Form):
+
+    TITLE = _('Search and Replace')
+    HELP  = _('Modify the document text and structure using user defined patterns.')
+    COMMIT_NAME = 'search_and_replace'
+
+    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+        Widget.__init__(self, parent,
+                ['sr1_search', 'sr1_replace',
+                 'sr2_search', 'sr2_replace',
+                 'sr3_search', 'sr3_replace']
+                )
+        self.db, self.book_id = db, book_id
+        self.initialize_options(get_option, get_help, db, book_id)
+        self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
+        self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
+        self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
+        self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
+        self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
+        self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
+        
+    def break_cycles(self):
+        Widget.break_cycles(self)
+        
+        self.opt_sr1_search.break_cycles()
+        self.opt_sr1_replace.break_cycles()
+        self.opt_sr2_search.break_cycles()
+        self.opt_sr2_replace.break_cycles()
+        self.opt_sr3_search.break_cycles()
+        self.opt_sr3_replace.break_cycles()
+
+    def pre_commit_check(self):
+        for x in ('sr1-search', 'sr1-replace', 'sr2-search', 'sr2-replace', 'sr3-search', 'sr3-replace',):
+            x = getattr(self, 'opt_'+x)
+            try:
+                pat = unicode(x.regex)
+                re.compile(pat)
+            except Exception, err:
+                error_dialog(self, _('Invalid regular expression'),
+                             _('Invalid regular expression: %s')%err).exec_()
+                return False
+            
+
diff --git a/src/calibre/gui2/convert/search_and_replace.ui b/src/calibre/gui2/convert/search_and_replace.ui
new file mode 100644
index 0000000000..5913f2c098
--- /dev/null
+++ b/src/calibre/gui2/convert/search_and_replace.ui
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Form</class>
+ <widget class="QWidget" name="Form">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>657</width>
+    <height>479</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Form</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout">
+   <item>
+    <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
+   </item>
+  </layout>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>RegexEdit</class>
+   <extends>QWidget</extends>
+   <header>regex_builder.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections/>
+</ui>
diff --git a/src/calibre/gui2/convert/single.py b/src/calibre/gui2/convert/single.py
index 0337b779a0..8826d398f5 100644
--- a/src/calibre/gui2/convert/single.py
+++ b/src/calibre/gui2/convert/single.py
@@ -17,6 +17,7 @@ from calibre.gui2.convert.single_ui import Ui_Dialog
 from calibre.gui2.convert.metadata import MetadataWidget
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
 from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -172,6 +173,7 @@ class Config(ResizableDialog, Ui_Dialog):
         self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text()))
         lf = widget_factory(LookAndFeelWidget)
         hw = widget_factory(HeuristicsWidget)
+        sr = widget_factory(SearchAndReplaceWidget)
         ps = widget_factory(PageSetupWidget)
         sd = widget_factory(StructureDetectionWidget)
         toc = widget_factory(TOCWidget)
@@ -205,7 +207,7 @@ class Config(ResizableDialog, Ui_Dialog):
             if not c: break
             self.stack.removeWidget(c)
 
-        widgets = [self.mw, lf, hw, ps, sd, toc]
+        widgets = [self.mw, lf, hw, sr, ps, sd, toc]
         if input_widget is not None:
             widgets.append(input_widget)
         if output_widget is not None:
diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui
index b690a68b0a..262894d42d 100644
--- a/src/calibre/gui2/convert/structure_detection.ui
+++ b/src/calibre/gui2/convert/structure_detection.ui
@@ -14,10 +14,10 @@
    <string>Form</string>
   </property>
   <layout class="QGridLayout" name="gridLayout">
-   <item row="0" column="1" colspan="2">
+   <item row="0" column="0" colspan="3">
     <widget class="XPathEdit" name="opt_chapter" native="true"/>
    </item>
-   <item row="1" column="0" colspan="2">
+   <item row="1" column="0">
     <widget class="QLabel" name="label">
      <property name="text">
       <string>Chapter &amp;mark:</string>
@@ -27,7 +27,7 @@
      </property>
     </widget>
    </item>
-   <item row="1" column="2">
+   <item row="1" column="1" colspan="2">
     <widget class="QComboBox" name="opt_chapter_mark">
      <property name="minimumContentsLength">
       <number>20</number>
diff --git a/src/calibre/gui2/preferences/conversion.py b/src/calibre/gui2/preferences/conversion.py
index a20872cee0..0a8fc375ea 100644
--- a/src/calibre/gui2/preferences/conversion.py
+++ b/src/calibre/gui2/preferences/conversion.py
@@ -13,6 +13,7 @@ from calibre.utils.logging import Log
 from calibre.gui2.preferences.conversion_ui import Ui_Form
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
 from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -84,7 +85,8 @@ class CommonOptions(Base):
 
     def load_conversion_widgets(self):
         self.conversion_widgets = [LookAndFeelWidget, HeuristicsWidget,
-                PageSetupWidget, StructureDetectionWidget, TOCWidget]
+                SearchAndReplaceWidget, PageSetupWidget,
+                StructureDetectionWidget, TOCWidget]
 
 class InputOptions(Base):
 

From 9d29e46a2cbe82879caa4ceeccb51cefbe153ed4 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 19:28:36 -0500
Subject: [PATCH 014/118] Remove unused Qt import from search_and_replace.py

---
 src/calibre/gui2/convert/search_and_replace.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index 860cc11d4e..de9033a46e 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
 
 import re
 
-from PyQt4.Qt import Qt
-
 from calibre.gui2.convert.search_and_replace_ui import Ui_Form
 from calibre.gui2.convert import Widget
 from calibre.gui2 import error_dialog

From e8133432fd1280468a98666e05532e8e1da6d5b3 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 09:06:05 +0800
Subject: [PATCH 015/118] Allow typographic apostrophe in default chapter title regex

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96bd303933..417f3a1e5b 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -156,7 +156,7 @@ class PreProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
 
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],

From f85ba4e3261b4e64c84722087471824fbf12278e Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 21:15:34 -0500
Subject: [PATCH 016/118] Fix sr key. Change footer and header removal to
 generic search and replace options.

---
 src/calibre/ebooks/conversion/cli.py        |  2 +-
 src/calibre/ebooks/conversion/plumber.py    |  9 ++---
 src/calibre/ebooks/conversion/preprocess.py | 42 +++++++++++----------
 3 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 91f0f95348..db1ec0857d 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -183,7 +183,7 @@ def add_pipeline_options(parser, plumber):
               }
 
     group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
-            'SEARCH AND REPLACE' 'STRUCTURE DETECTION',
+            'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
             'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
 
     for group in group_order:
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 2e88baea4e..a12dbd48e1 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -540,8 +540,7 @@ OptionRecommendation(name='sr1_search',
     
 OptionRecommendation(name='sr1_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters (can be lambda expression) to '
-           'replace the text found with sr1-search.')),
+    help=_('Replace characters to replace the text found with sr1-search.')),
 
 OptionRecommendation(name='sr2_search',
     recommended_value='', level=OptionRecommendation.LOW,
@@ -550,8 +549,7 @@ OptionRecommendation(name='sr2_search',
 
 OptionRecommendation(name='sr2_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters (can be lambda expression) to '
-           'replace the text found with sr2-search.')),
+    help=_('Replace characters to replace the text found with sr2-search.')),
 
 OptionRecommendation(name='sr3_search',
     recommended_value='', level=OptionRecommendation.LOW,
@@ -560,8 +558,7 @@ OptionRecommendation(name='sr3_search',
 
 OptionRecommendation(name='sr3_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters (can be lambda expression) to '
-           'replace the text found with sr3-search.')),
+    help=_('Replace characters to replace the text found with sr3-search.')),
 ]
         # }}}
 
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 08a46cb8d9..35a311d58f 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -436,27 +436,29 @@ class HTMLPreProcessor(object):
         if not getattr(self.extra_opts, 'keep_ligatures', False):
             html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
 
+        if getattr(self.extra_opts, 'sr3_search', None):
+            try:
+                rules.insert(0,  (re.compile(self.extra_opts.sr3_search), self.extra_opts.sr3_replace))
+            except:
+                import traceback
+                print 'Failed to parse sr3-search regexp'
+                traceback.print_exc()
+        if getattr(self.extra_opts, 'sr2_search', None):
+            try:
+                rules.insert(0, (re.compile(self.extra_opts.sr2_search), self.extra_opts.sr2_replace))
+            except:
+                import traceback
+                print 'Failed to parse sr2-search regexp'
+                traceback.print_exc()
+        if getattr(self.extra_opts, 'sr1_search', None):
+            try:
+                rules.insert(0, (re.compile(self.extra_opts.sr1_search), self.extra_opts.sr1_replace))
+            except:
+                import traceback
+                print 'Failed to parse sr1-search regexp'
+                traceback.print_exc()
+
         end_rules = []
-        if getattr(self.extra_opts, 'remove_header', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.header_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_header regexp'
-                traceback.print_exc()
-
-        if getattr(self.extra_opts, 'remove_footer', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_footer regexp'
-                traceback.print_exc()
-
         # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
             # unwrap/delete soft hyphens

From 33793372759002f287f837a4a51cdc6767501035 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 10:50:01 +0800
Subject: [PATCH 017/118] tied rtf input to heuristics, removed option to not
 include softbreaks, users can combine delete_blank_paragraphs and
 remove_paragraph_spacing to achieve desired results

---
 src/calibre/ebooks/rtf/input.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 92ac8a2519..2f931d1d04 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -320,11 +320,10 @@ class RTFInput(InputFormatPlugin):
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
             # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
-            if not getattr(self.opts, 'remove_paragraph_spacing', False):
-                res = re.sub('\s*<body>', '<body>', res)
-                res = re.sub('(?<=\n)\n{2}',
-                        u'<p>\u00a0</p>\n'.encode('utf-8'), res)
-            if self.opts.preprocess_html:
+            res = re.sub('\s*<body>', '<body>', res)
+            res = re.sub('(?<=\n)\n{2}',
+                    u'<p>\u00a0</p>\n'.encode('utf-8'), res)
+            if self.opts.enable_heuristics:
                 preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
                 res = preprocessor(res.decode('utf-8')).encode('utf-8')
             f.write(res)

From 6f252bb1050a6a7d66dcad365fb3992088f9fe86 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 13:34:35 +0800
Subject: [PATCH 018/118] tied all the new heuristics options to
 preprocess.utils

---
 src/calibre/ebooks/conversion/plumber.py |   2 +-
 src/calibre/ebooks/conversion/utils.py   | 177 ++++++++++++-----------
 2 files changed, 97 insertions(+), 82 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index a12dbd48e1..48b965f624 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -523,7 +523,7 @@ OptionRecommendation(name='delete_blank_paragraphs',
     
 OptionRecommendation(name='format_scene_breaks',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Detects left aligned scene break markers and center aligns them. '
+    help=_('left aligned scene break markers are center aligned. '
            'Replace soft scene breaks that use multiple blank lines with'
            'horizontal rules.')),
 
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 417f3a1e5b..68afc464a0 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -18,6 +18,11 @@ class PreProcessor(object):
         self.html_preprocess_sections = 0
         self.found_indents = 0
         self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.min_chapters = 1
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -120,7 +125,6 @@ class PreProcessor(object):
         '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
         # minimum of chapters to search for
-        self.min_chapters = 1
         if wordcount > 7000:
             self.min_chapters = int(ceil(wordcount / 7000.))
         #print "minimum chapters required are: "+str(self.min_chapters)
@@ -192,21 +196,28 @@ class PreProcessor(object):
     def punctuation_unwrap(self, length, content, format):
         '''
         Unwraps lines based on line length and punctuation
-        supports range of potential html markup and text files
+        supports a range of html markup and text files
         '''
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
-        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
+        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
-        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
         txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
 
         unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+
         content = unwrap.sub(' ', content)
+        content = em_en_unwrap.sub('', content)
         return content
 
 
@@ -253,8 +264,38 @@ class PreProcessor(object):
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        self.deleted_nbsps = True
         return html
 
+    def analyze_line_endings(self, html):
+        '''
+        determines the type of html line ending used most commonly in a document
+        use before calling docanalysis functions
+        '''
+        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+        paras = len(paras_reg.findall(html))
+        spans = len(spans_reg.findall(html))
+        if spans > 1:
+            if float(paras) / float(spans) < 0.75:
+                return 'spanned_html'
+            else:
+                return 'html'
+        else:
+            return 'html'
+
+    def analyze_blanks(self, html):
+        blanklines = self.blankreg.findall(html)
+        lines = self.linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
+                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
+                    
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                return True
+            else:
+                return False
+
 
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
@@ -281,97 +322,69 @@ class PreProcessor(object):
             # check if content is in pre tags, use txt processor to mark up if so
             html = self.text_process_pre(html)
 
-        ###### Mark Indents/Cleanup ######
-        #
         # Replace series of non-breaking spaces with text-indent
-        html = self.fix_nbsp_indents(html)
+        if getattr(self.extra_opts, 'fix_indents', True):
+            html = self.fix_nbsp_indents(html)
         
         html = self.cleanup_markup(html)
 
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
-        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
-        # blank paragraphs then delete blank lines to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        blanks_between_paragraphs = False
-        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', False))
-        if len(lines) > 1:
-            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
-                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
-                    
-            if float(len(blanklines)) / float(len(lines)) > 0.40:
-                blanks_between_paragraphs = True
-                print "blanks between paragraphs is marked True"
-            else:
-                blanks_between_paragraphs = False
+        # Determine whether the document uses interleaved blank lines
+        blanks_between_paragraphs = self.analyze_blanks(html)
 
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
-        #
 
-        html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+        if getattr(self.extra_opts, 'markup_chapter_headings', True):
+            html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
 
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts,
         'delete_blank_paragraphs', False):
             self.log("deleting blank lines")
-            html = multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
-            html = blankreg.sub('', html)
+            html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = self.blankreg.sub('', html)
             
         ###### Unwrap lines ######
-        #
-        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
-        # that lines can be un-wrapped across page boundaries
-        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
-        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
-        paras = len(paras_reg.findall(html))
-        spans = len(spans_reg.findall(html))
-        if spans > 1:
-            if float(paras) / float(spans) < 0.75:
-                format = 'spanned_html'
-            else:
-                format = 'html'
-        else:
-            format = 'html'
-        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-        # more of the lines break in the same region of the document then unwrapping is required
-        docanalysis = DocAnalysis(format, html)
-        hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+unicode(hardbreaks))
-        # Calculate Length
-        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-        length = docanalysis.line_length(unwrap_factor)
-        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
-        if hardbreaks or unwrap_factor < 0.4:
-            self.log("Unwrapping required, unwrapping Lines")
-            # Unwrap em/en dashes
-            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
-            # Dehyphenate
-            self.log("Unwrapping/Removing hyphens")
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html', length)
-            self.log("Done dehyphenating")
-            # Unwrap lines using punctation and line length
-            #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            html = self.punctuation_unwrap(length, html, 'html')
-            #check any remaining hyphens, but only unwrap if there is a match
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html_cleanup', length)
-        else:
-            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-            self.log("Cleaning up hyphenation")
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html_cleanup', length)
-            self.log("Done dehyphenating")
+        if getattr(self.extra_opts, 'unwrap_lines', True):
+            # Determine line ending type
+            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+            # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+            # that lines can be un-wrapped across page boundaries
+            format = self.analyze_line_endings(html)
 
-        # delete soft hyphens
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+            # more of the lines break in the same region of the document then unwrapping is required
+            docanalysis = DocAnalysis(format, html)
+            hardbreaks = docanalysis.line_histogram(.50)
+            self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+            # Calculate Length
+            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+            length = docanalysis.line_length(unwrap_factor)
+            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+
+            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+            if hardbreaks or unwrap_factor < 0.4:
+                self.log("Unwrapping required, unwrapping Lines")
+                # Dehyphenate with line length limiters
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html', length)
+                html = self.punctuation_unwrap(length, html, 'html')
+                #check any remaining hyphens, but only unwrap if there is a match
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html_cleanup', length)
+
+        if getattr(self.extra_opts, 'dehyphenate', True):
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Fixing hyphenated content")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            # delete soft hyphens
+            html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters:
@@ -385,10 +398,12 @@ class PreProcessor(object):
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        # put back non-breaking spaces in empty paragraphs to preserve original formatting
-        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
+        if getattr(self.extra_opts, 'dehyphenate', True):
+            # Center separator lines
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 
-        # Center separator lines
-        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+        if self.deleted_nbsps:
+            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
 
         return html

From e581c8c5dedf0e68fa5c3ca5a06d660546e9996c Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 15:40:59 +0800
Subject: [PATCH 019/118] created sub-functions for text processing, added soft
 hyphens to punctuation unwrap

---
 src/calibre/ebooks/conversion/utils.py | 43 +++++++++++++++-----------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 68afc464a0..99685e90d1 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -19,6 +19,7 @@ class PreProcessor(object):
         self.found_indents = 0
         self.extra_opts = extra_opts
         self.deleted_nbsps = False
+        self.totalwords = 0
         self.min_chapters = 1
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
@@ -201,6 +202,7 @@ class PreProcessor(object):
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
+        soft_hyphen = "\xad"
         line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
         line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@@ -208,10 +210,12 @@ class PreProcessor(object):
 
         unwrap_regex = lookahead+line_ending+blanklines+line_opening
         em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
 
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
             em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
         em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
@@ -220,18 +224,21 @@ class PreProcessor(object):
         content = em_en_unwrap.sub('', content)
         return content
 
+    def txt_process(self, match):
+        from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+        separate_paragraphs_single_line
+        content = match.group('text')
+        content = separate_paragraphs_single_line(content)
+        content = preserve_spaces(content)
+        content = convert_basic(content, epub_split_size_kb=0)
+        return content
 
-    def text_process_pre(self, html):
+    def markup_pre(self, html):
         pre = re.compile(r'<pre>', re.IGNORECASE)
-        if len(pre.findall(html)) == 1:
+        if len(pre.findall(html)) >= 1:
             self.log("Running Text Processing")
-            from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
-            separate_paragraphs_single_line
             outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-            html = outerhtml.sub('\g<text>', html)
-            html = separate_paragraphs_single_line(html)
-            html = preserve_spaces(html)
-            html = convert_basic(html, epub_split_size_kb=0)
+            html = outerhtml.sub(self.txt_process, html)
         else:
             # Add markup naively
             # TODO - find out if there are cases where there are more than one <pre> tag or
@@ -302,25 +309,26 @@ class PreProcessor(object):
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
+        try:
+            self.totalwords = self.get_word_count(html)
+        except:
+            self.log("Can't get wordcount")
 
-        if totalwords < 50:
+        if 0 < self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
 
-
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and  mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            html = self.text_process_pre(html)
+            # markup using text processing
+            html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', True):
@@ -338,7 +346,7 @@ class PreProcessor(object):
         # detect chapters/sections to match xpath or splitting logic
 
         if getattr(self.extra_opts, 'markup_chapter_headings', True):
-            html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+            html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
 
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
@@ -383,8 +391,6 @@ class PreProcessor(object):
             self.log("Fixing hyphenated content")
             dehyphenator = Dehyphenator()
             html = dehyphenator(html,'html_cleanup', length)
-            # delete soft hyphens
-            html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters:
@@ -392,13 +398,14 @@ class PreProcessor(object):
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
+
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', True):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 

From a44d29e840acd0eb14b43093e0a4c178da4a69a6 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 20:13:51 +0800
Subject: [PATCH 020/118] only run cleanup_markup when required, begin
 markup_chapters rewrite

---
 src/calibre/ebooks/conversion/utils.py | 35 +++++++++++++++-----------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 99685e90d1..ec175061cc 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -25,6 +25,16 @@ class PreProcessor(object):
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
+        self.chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings", 'common'],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
+            ]
+
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
@@ -163,18 +173,8 @@ class PreProcessor(object):
 
         default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
 
-        chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"],  # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
-            ]
-
         # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+        for [chapter_type, lookahead_ignorecase, log_message, type_name] in self.chapter_types:
             if self.html_preprocess_sections >= self.min_chapters:
                 break
             full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
@@ -303,6 +303,12 @@ class PreProcessor(object):
             else:
                 return False
 
+    def cleanup_required(self):
+        for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
+            if getattr(self.extra_opts, option, False):
+                return True
+        return False
+
 
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
@@ -333,8 +339,9 @@ class PreProcessor(object):
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', True):
             html = self.fix_nbsp_indents(html)
-        
-        html = self.cleanup_markup(html)
+
+        if self.cleanup_required():
+            html = self.cleanup_markup(html)
 
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
@@ -393,7 +400,7 @@ class PreProcessor(object):
             html = dehyphenator(html,'html_cleanup', length)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < self.min_chapters:
+        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)

From 1301fe69d16e452644944efbd2447f61fd6fe4fb Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 20:53:16 +0800
Subject: [PATCH 021/118] started multi-pass chapter analysis

---
 src/calibre/ebooks/conversion/utils.py | 71 +++++++++++++++++---------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index ec175061cc..2a88d371cc 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -21,20 +21,12 @@ class PreProcessor(object):
         self.deleted_nbsps = False
         self.totalwords = 0
         self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
-        self.chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings", 'common'],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings", 'plain_number'],  # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
-            ]
-
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
@@ -60,6 +52,14 @@ class PreProcessor(object):
                 " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap
 
+    def analyze_title_matches(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.chapters_no_title = self.chapters_no_title + 1
+        else:
+            self.chapters_with_title = self.chapters_with_title + 1
+
     def insert_indent(self, match):
         pstyle = match.group('formatting')
         span = match.group('span')
@@ -173,20 +173,43 @@ class PreProcessor(object):
 
         default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
 
-        # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message, type_name] in self.chapter_types:
-            if self.html_preprocess_sections >= self.min_chapters:
-                break
-            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-            if lookahead_ignorecase:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            else:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-            html = chapdetect.sub(self.chapter_head, html)
+        chapter_types = [
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
+            ]
+
+        def recurse_patterns(html, analyze):
+            # Start with most typical chapter headings, get more aggressive until one works
+            for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types:
+                if self.html_preprocess_sections >= self.min_chapters:
+                    break
+                full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+                n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+                self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+                if lookahead_ignorecase:
+                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                    chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+                else:
+                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                    chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+                if analyze:
+                    hits = len(chapdetect.findall(html))
+                    print unicode(type_name)+" had "+unicode(hits)+" hits"
+                    chapdetect.sub(self.analyze_title_matches, html)
+                    print unicode(self.chapters_no_title)+" chapters with no title"
+                    print unicode(self.chapters_with_title)+" chapters with titles"
+                else:
+                    html = chapdetect.sub(self.chapter_head, html)
+                    return html
+
+        recurse_patterns(html, True)
+        html = recurse_patterns(html, False)
 
         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:

From fabd4f5fdfdc56d98c96ffa2b8b726fcf60c340b Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 08:36:15 -0500
Subject: [PATCH 022/118] Clean up search and replace GUI widget.

---
 .../gui2/convert/search_and_replace.py        | 14 ++--
 .../gui2/convert/search_and_replace.ui        | 68 ++++++++++++++++---
 2 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index de9033a46e..36a496c520 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -24,25 +24,19 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
                 )
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
-        self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
-        self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
-        self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
-        self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
-        self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
-        self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
+        self.opt_sr1_search.set_msg(_('Regular Expression'))
+        self.opt_sr2_search.set_msg(_('Regular Expression'))
+        self.opt_sr3_search.set_msg(_('Regular Expression'))
         
     def break_cycles(self):
         Widget.break_cycles(self)
         
         self.opt_sr1_search.break_cycles()
-        self.opt_sr1_replace.break_cycles()
         self.opt_sr2_search.break_cycles()
-        self.opt_sr2_replace.break_cycles()
         self.opt_sr3_search.break_cycles()
-        self.opt_sr3_replace.break_cycles()
 
     def pre_commit_check(self):
-        for x in ('sr1-search', 'sr1-replace', 'sr2-search', 'sr2-replace', 'sr3-search', 'sr3-replace',):
+        for x in ('sr1-search', 'sr2-search', 'sr3-search'):
             x = getattr(self, 'opt_'+x)
             try:
                 pat = unicode(x.regex)
diff --git a/src/calibre/gui2/convert/search_and_replace.ui b/src/calibre/gui2/convert/search_and_replace.ui
index 5913f2c098..ed500a4dd0 100644
--- a/src/calibre/gui2/convert/search_and_replace.ui
+++ b/src/calibre/gui2/convert/search_and_replace.ui
@@ -13,24 +13,72 @@
   <property name="windowTitle">
    <string>Form</string>
   </property>
-  <layout class="QVBoxLayout" name="verticalLayout">
-   <item>
+  <layout class="QGridLayout" name="gridLayout">
+   <item row="0" column="1">
+    <widget class="QLabel" name="label_4">
+     <property name="text">
+      <string>Search</string>
+     </property>
+    </widget>
+   </item>
+   <item row="0" column="2">
+    <widget class="QLabel" name="label_5">
+     <property name="text">
+      <string>Replace</string>
+     </property>
+    </widget>
+   </item>
+   <item row="1" column="0">
+    <widget class="QLabel" name="label">
+     <property name="text">
+      <string>1.</string>
+     </property>
+    </widget>
+   </item>
+   <item row="1" column="1">
     <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
    </item>
-   <item>
-    <widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
+   <item row="1" column="2">
+    <widget class="QLineEdit" name="opt_sr1_replace"/>
    </item>
-   <item>
+   <item row="2" column="0">
+    <widget class="QLabel" name="label_2">
+     <property name="text">
+      <string>2.</string>
+     </property>
+    </widget>
+   </item>
+   <item row="2" column="1">
     <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
    </item>
-   <item>
-    <widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
+   <item row="2" column="2">
+    <widget class="QLineEdit" name="opt_sr2_replace"/>
    </item>
-   <item>
+   <item row="3" column="0">
+    <widget class="QLabel" name="label_3">
+     <property name="text">
+      <string>3.</string>
+     </property>
+    </widget>
+   </item>
+   <item row="3" column="1">
     <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
    </item>
-   <item>
-    <widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
+   <item row="3" column="2">
+    <widget class="QLineEdit" name="opt_sr3_replace"/>
+   </item>
+   <item row="4" column="1">
+    <spacer name="verticalSpacer">
+     <property name="orientation">
+      <enum>Qt::Vertical</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>20</width>
+       <height>330</height>
+      </size>
+     </property>
+    </spacer>
    </item>
   </layout>
  </widget>

From cfaa113f9557b9359208409a538302d9ec0af1d4 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 09:05:08 -0500
Subject: [PATCH 023/118] Move italic marking to preprocessor. Have TXT input
 use the preprocessor for heuristics. Change preprocessor getattr to default
 to False otherwise every option set to off will run.

---
 src/calibre/ebooks/conversion/utils.py       | 46 +++++++++++++---
 src/calibre/ebooks/txt/heuristicprocessor.py | 58 --------------------
 src/calibre/ebooks/txt/input.py              | 13 +++--
 src/calibre/ebooks/txt/processor.py          |  5 --
 4 files changed, 48 insertions(+), 74 deletions(-)
 delete mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 2a88d371cc..56d4339d8c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -128,6 +128,36 @@ class PreProcessor(object):
         wordcount = get_wordcount_obj(word_count_text)
         return wordcount.words
 
+    def markup_italicis(self, html):
+        ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.',
+        ]
+        
+        ITALICIZE_STYLE_PATS = [
+            r'(?msu)_(?P<words>.+?)_',
+            r'(?msu)/(?P<words>[^<>]+?)/',
+            r'(?msu)~~(?P<words>.+?)~~',
+            r'(?msu)\*(?P<words>.+?)\*',
+            r'(?msu)~(?P<words>.+?)~',
+            r'(?msu)_/(?P<words>[^<>]+?)/_',
+            r'(?msu)_\*(?P<words>.+?)\*_',
+            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
+            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
+            r'(?msu)/:(?P<words>[^<>]+?):/',
+            r'(?msu)\|:(?P<words>.+?):\|',
+        ]
+        
+        for word in ITALICIZE_WORDS:
+            html = html.replace(word, '<i>%s</i>' % word)
+
+        for pat in ITALICIZE_STYLE_PATS:
+            html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
+
+        return html
+
     def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
         '''
         Searches for common chapter headings throughout the document
@@ -360,7 +390,7 @@ class PreProcessor(object):
             html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
-        if getattr(self.extra_opts, 'fix_indents', True):
+        if getattr(self.extra_opts, 'fix_indents', False):
             html = self.fix_nbsp_indents(html)
 
         if self.cleanup_required():
@@ -375,19 +405,21 @@ class PreProcessor(object):
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
 
-        if getattr(self.extra_opts, 'markup_chapter_headings', True):
+        if getattr(self.extra_opts, 'markup_chapter_headings', False):
             html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
 
+        if getattr(self.extra_opts, 'italicize_common_cases', False): 
+            html = self.markup_italicis(html)
+
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
-        if blanks_between_paragraphs and getattr(self.extra_opts,
-        'delete_blank_paragraphs', False):
+        if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log("deleting blank lines")
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
             
         ###### Unwrap lines ######
-        if getattr(self.extra_opts, 'unwrap_lines', True):
+        if getattr(self.extra_opts, 'unwrap_lines', False):
             # Determine line ending type
             # Some OCR sourced files have line breaks in the html using a combination of span & p tags
             # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
@@ -416,7 +448,7 @@ class PreProcessor(object):
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html_cleanup', length)
 
-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
             self.log("Fixing hyphenated content")
             dehyphenator = Dehyphenator()
@@ -435,7 +467,7 @@ class PreProcessor(object):
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        if getattr(self.extra_opts, 'format_scene_breaks', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
deleted file mode 100644
index b9d18fd23a..0000000000
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL 3'
-__copyright__ = '2011, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
-
-import re
-
-from calibre import prepare_string_for_xml
-
-class TXTHeuristicProcessor(object):
-
-    def __init__(self):
-        self.ITALICIZE_WORDS = [
-            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
-            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
-            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
-            'Mlle.', 'Mons.', 'PS.', 'PPS.',
-        ]
-        self.ITALICIZE_STYLE_PATS = [
-            r'(?msu)_(?P<words>.+?)_',
-            r'(?msu)/(?P<words>[^<>]+?)/',
-            r'(?msu)~~(?P<words>.+?)~~',
-            r'(?msu)\*(?P<words>.+?)\*',
-            r'(?msu)~(?P<words>.+?)~',
-            r'(?msu)_/(?P<words>[^<>]+?)/_',
-            r'(?msu)_\*(?P<words>.+?)\*_',
-            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
-            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
-            r'(?msu)/:(?P<words>[^<>]+?):/',
-            r'(?msu)\|:(?P<words>.+?):\|',
-        ]
-
-    def process_paragraph(self, paragraph):
-        for word in self.ITALICIZE_WORDS:
-            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
-        for pat in self.ITALICIZE_STYLE_PATS:
-            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
-        return paragraph
-
-    def convert(self, txt, title='', epub_split_size_kb=0):
-        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
-        txt = clean_txt(txt)
-        txt = split_txt(txt, epub_split_size_kb)
-
-        processed = []
-        for line in txt.split('\n\n'):
-            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-
-        txt = u'\n'.join(processed)
-        txt = re.sub('[ ]{2,}', ' ', txt)
-        html = HTML_TEMPLATE % (title, txt)
-
-        from calibre.ebooks.conversion.utils import PreProcessor
-        pp = PreProcessor()
-        html = pp.markup_chapters(html, pp.get_word_count(html), False)
-
-        return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 0b0bd6d570..5cffbafe21 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    convert_heuristic, normalize_line_endings, convert_textile
+    normalize_line_endings, convert_textile
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -126,11 +126,16 @@ class TXTInput(InputFormatPlugin):
                 txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
 
             flow_size = getattr(options, 'flow_size', 0)
+            html = convert_basic(txt, epub_split_size_kb=flow_size)
 
             if options.formatting_type == 'heuristic':
-                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
-            else:
-                html = convert_basic(txt, epub_split_size_kb=flow_size)
+                setattr(options, 'enable_heuristics', True)
+                setattr(options, 'markup_chapter_headings', True)
+                setattr(options, 'italicize_common_cases', True)
+                setattr(options, 'fix_indents', True)
+                setattr(options, 'delete_blank_paragraphs', True)
+                setattr(options, 'format_scene_breaks', True)
+                setattr(options, 'dehyphenate', True)
 
         # Dehyphenate in cleanup mode for missed txt and markdown conversion
         dehyphenator = Dehyphenator()
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e1979063c0..9fd8af0d70 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -12,7 +12,6 @@ import os, re
 
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 from calibre.ebooks.conversion.preprocess import DocAnalysis
 from calibre.utils.cleantext import clean_ascii_chars
 
@@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
 
     return HTML_TEMPLATE % (title, u'\n'.join(lines))
 
-def convert_heuristic(txt, title='', epub_split_size_kb=0):
-    tp = TXTHeuristicProcessor()
-    return tp.convert(txt, title, epub_split_size_kb)
-
 def convert_markdown(txt, title='', disable_toc=False):
     from calibre.ebooks.markdown import markdown
     md = markdown.Markdown(

From 946f1cf6c0e332898d34a7cf41680b6b2e3fce7b Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 22:07:20 +0800
Subject: [PATCH 024/118] added option for renumbering heading tags

---
 src/calibre/ebooks/conversion/cli.py     |  2 +-
 src/calibre/ebooks/conversion/plumber.py |  8 +++++++-
 src/calibre/ebooks/conversion/utils.py   | 15 ++++++++-------
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index db1ec0857d..c9612d97b9 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
                       'italicize_common_cases', 'fix_indents',
                       'html_unwrap_factor', 'unwrap_lines',
                       'delete_blank_paragraphs', 'format_scene_breaks',
-                      'dehyphenate',
+                      'dehyphenate', 'renumber_headings',
                   ]
                   ),
                   
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 48b965f624..b8c45dfa14 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -532,7 +532,13 @@ OptionRecommendation(name='dehyphenate',
     help=_('Analyses hyphenated words throughout the document.  The '
            'document itself is used as a dictionary to determine whether hyphens '
            'should be retained or removed.')),
-    
+
+OptionRecommendation(name='renumber_headings',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Looks for occurences of sequential <h1> or <h2> tags. '
+           'The tags are renumbered to prevent splitting in the middle '
+           'of chapter headings.')),
+
 OptionRecommendation(name='sr1_search',
     recommended_value='', level=OptionRecommendation.LOW,
     help=_('Search pattern (regular expression) to be replaced with '
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 2a88d371cc..4c62d2c06f 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -416,7 +416,7 @@ class PreProcessor(object):
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html_cleanup', length)
 
-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
             self.log("Fixing hyphenated content")
             dehyphenator = Dehyphenator()
@@ -429,13 +429,14 @@ class PreProcessor(object):
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
+        if getattr(self.extra_opts, 'renumber_headings', True):
+            # search for places where a first or second level heading is immediately followed by another
+            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+            # headings and titles, images, etc
+            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        if getattr(self.extra_opts, 'format_scene_breaks', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 

From 81c365b3a9efd8b546fa096bbe4eb737e607b6ba Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 22:41:38 +0800
Subject: [PATCH 025/118] Default renumber_headings to False when the option is unset

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96d386bf78..3693d11cee 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -461,7 +461,7 @@ class PreProcessor(object):
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
-        if getattr(self.extra_opts, 'renumber_headings', True):
+        if getattr(self.extra_opts, 'renumber_headings', False):
             # search for places where a first or second level heading is immediately followed by another
             # top level heading.  demote the second heading to h3 to prevent splitting between chapter
             # headings and titles, images, etc

From 0edf1e550ea35f4b63208138ecd07c3d5dcb6856 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 22:47:51 +0800
Subject: [PATCH 026/118] Also compile and apply the shy_unwrap regex when unwrapping content

---
 src/calibre/ebooks/conversion/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 3693d11cee..305346d496 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -272,9 +272,11 @@ class PreProcessor(object):
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
         em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
 
         content = unwrap.sub(' ', content)
         content = em_en_unwrap.sub('', content)
+        content = shy_unwrap.sub('', content)
         return content
 
     def txt_process(self, match):

From d6256ef452c130c471a184a9517b50e247e6f854 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 11:06:43 -0500
Subject: [PATCH 027/118] Add renumber_headings option to GUI.

---
 src/calibre/gui2/convert/heuristics.py |  3 ++-
 src/calibre/gui2/convert/heuristics.ui | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 2b9df50457..904804f32e 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -21,7 +21,7 @@ class HeuristicsWidget(Widget, Ui_Form):
                  'italicize_common_cases', 'fix_indents',
                  'html_unwrap_factor', 'unwrap_lines',
                  'delete_blank_paragraphs', 'format_scene_breaks',
-                 'dehyphenate']
+                 'dehyphenate', 'renumber_headings']
                 )
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
@@ -53,6 +53,7 @@ class HeuristicsWidget(Widget, Ui_Form):
         self.opt_delete_blank_paragraphs.setEnabled(state)
         self.opt_format_scene_breaks.setEnabled(state)
         self.opt_dehyphenate.setEnabled(state)
+        self.opt_renumber_headings.setEnabled(state)
         
         self.opt_unwrap_lines.setEnabled(state)
         if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index e64e79e1df..c5f3c2cb3e 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -6,7 +6,7 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>657</width>
+    <width>811</width>
     <height>479</height>
    </rect>
   </property>
@@ -80,42 +80,42 @@
         </property>
        </widget>
       </item>
-      <item row="3" column="0" colspan="2">
+      <item row="4" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_delete_blank_paragraphs">
         <property name="text">
          <string>Delete blank lines between paragraphs</string>
         </property>
        </widget>
       </item>
-      <item row="4" column="0" colspan="2">
+      <item row="5" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_format_scene_breaks">
         <property name="text">
          <string>Ensure scene breaks are consistently formatted</string>
         </property>
        </widget>
       </item>
-      <item row="5" column="0" colspan="2">
+      <item row="6" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_dehyphenate">
         <property name="text">
          <string>Remove unnecessary hyphens</string>
         </property>
        </widget>
       </item>
-      <item row="6" column="0" colspan="2">
+      <item row="7" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_italicize_common_cases">
         <property name="text">
          <string>Italicize common words and patterns</string>
         </property>
        </widget>
       </item>
-      <item row="7" column="0" colspan="2">
+      <item row="8" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_fix_indents">
         <property name="text">
          <string>Replace entity indents with CSS indents</string>
         </property>
        </widget>
       </item>
-      <item row="8" column="0">
+      <item row="9" column="0">
        <spacer name="verticalSpacer">
         <property name="orientation">
          <enum>Qt::Vertical</enum>
@@ -141,6 +141,13 @@
         </property>
        </spacer>
       </item>
+      <item row="3" column="0">
+       <widget class="QCheckBox" name="opt_renumber_headings">
+        <property name="text">
+         <string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
+        </property>
+       </widget>
+      </item>
      </layout>
     </widget>
    </item>

From 64796696ae0bec276c798bcc12e8b6d10a878788 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 12:35:02 -0500
Subject: [PATCH 028/118] Enable heuristic processing over the entire
 conversion pipeline when option is enabled.

---
 src/calibre/customize/conversion.py         | 12 ------------
 src/calibre/ebooks/conversion/plumber.py    |  6 ++----
 src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------
 src/calibre/ebooks/conversion/utils.py      |  4 ++--
 src/calibre/ebooks/html/input.py            |  7 -------
 src/calibre/ebooks/lit/input.py             |  9 +--------
 src/calibre/ebooks/lrf/input.py             |  9 ---------
 src/calibre/ebooks/pdb/input.py             |  1 -
 src/calibre/ebooks/rtf/input.py             |  1 -
 9 files changed, 11 insertions(+), 50 deletions(-)

diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index a9e573ffa0..b77ac81587 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError()
 
-    def heuristics(self, opts, html):
-        '''
-        This method is called by the conversion pipeline on all HTML before it
-        is parsed. It is meant to be used to do any required preprocessing on
-        the HTML, like removing hard line breaks, etc.
-
-        :param html: A unicode string
-        :return: A unicode string
-        '''
-        return html
-
-
     def convert(self, stream, options, file_ext, log, accelerators):
         '''
         This method must be implemented in sub-classes. It must return
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index b8c45dfa14..249f848661 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
                 self.opts_to_mi(self.user_metadata)
             if not hasattr(self.oeb, 'manifest'):
                 self.oeb = create_oebbook(self.log, self.oeb, self.opts,
-                        self.input_plugin,
                         encoding=self.input_plugin.output_encoding)
             self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
             self.opts.is_image_collection = self.input_plugin.is_image_collection
@@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
         self.log(self.output_fmt.upper(), 'output written to', self.output)
         self.flush()
 
-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
         encoding='utf-8', populate=True):
     '''
     Create an OEBBook.
     '''
     from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
-            opts.enable_heuristics, opts)
+    html_preprocessor = HTMLPreProcessor(log, opts)
     if not encoding:
         encoding = None
     oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 35a311d58f..abaff77f33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
                      (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                       lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                      ]
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
+    def __init__(self, log=None, extra_opts=None):
+        self.log = log
         self.extra_opts = extra_opts
 
     def is_baen(self, src):
@@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
             unidecoder = Unidecoder()
             html = unidecoder.decode(html)
 
-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)
 
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 305346d496..48806e78e7 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj
 
-class PreProcessor(object):
+class HeuristicProcessor(object):
 
     def __init__(self, extra_opts=None, log=None):
         self.log = default_log if log is None else log
@@ -366,7 +366,7 @@ class PreProcessor(object):
 
 
     def __call__(self, html):
-        self.log("*********  Preprocessing HTML  *********")
+        self.log("*********  Heuristic processing HTML  *********")
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 479f852c77..ed0bf7b3ef 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
             self.log.exception('Failed to read CSS file: %r'%link)
             return (None, None)
         return (None, raw)
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index d0ecf008b7..7b822b68a6 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
+from calibre.ebooks.conversion.utils import HeuristicProcessor
 
 
 class LITInput(InputFormatPlugin):
@@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
                     for elem in body:
                         ne = copy.deepcopy(elem)
                         pre.append(ne)
-
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py
index 05c8731da5..70f3c3a15a 100644
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@@ -12,7 +12,6 @@ from copy import deepcopy
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 from calibre import guess_type
 
 class Canvas(etree.XSLTExtension):
@@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
             f.write(result)
         styles.write()
         return os.path.abspath('content.opf')
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
-
-
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index de210e0a6d..cd861216af 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -9,7 +9,6 @@ import os
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class PDBInput(InputFormatPlugin):
 
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 2f931d1d04..d3849bc5f5 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,7 +7,6 @@ import os, glob, re, textwrap
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 
 border_style_map = {
         'single' : 'solid',

From 60c50f39442b09872fb5aeb98a3be2bea3f4ec56 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 01:46:04 +0800
Subject: [PATCH 029/118] tied mobi into preprocess

---
 src/calibre/ebooks/conversion/utils.py |  5 +++--
 src/calibre/ebooks/mobi/input.py       | 11 ++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 305346d496..9825585cbf 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -236,7 +236,7 @@ class PreProcessor(object):
                     print unicode(self.chapters_with_title)+" chapters with titles"
                 else:
                     html = chapdetect.sub(self.chapter_head, html)
-                    return html
+            return html
 
         recurse_patterns(html, True)
         html = recurse_patterns(html, False)
@@ -322,7 +322,8 @@ class PreProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, & italics tags
+        # Get rid of empty span, bold, font, & italics tags
+        html = re.sub(r'\s*<font[^>]*>\s*</font>\s*', '', html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 584be71fe4..4f3a087065 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class MOBIInput(InputFormatPlugin):
 
@@ -40,10 +41,6 @@ class MOBIInput(InputFormatPlugin):
         return mr.created_opf_path
 
     def heuristics(self, options, html):
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
-        return html
-
+        self.options = options
+        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+        return preprocessor(html)

From d354a085b8e06f3283231a18fecbf2ee775f52bd Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 01:53:49 +0800
Subject: [PATCH 030/118] Remove heuristics hook from MOBI input plugin

---
 src/calibre/ebooks/mobi/input.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 4f3a087065..8188027e01 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -5,7 +5,6 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class MOBIInput(InputFormatPlugin):
 
@@ -40,7 +39,3 @@ class MOBIInput(InputFormatPlugin):
                 accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
         return mr.created_opf_path
 
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)

From 8f345212da4d3c0289e54babd2b01cd4bf4fd767 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 14:32:20 -0500
Subject: [PATCH 031/118] Fix issue with disabling checkbox.

---
 src/calibre/gui2/convert/heuristics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 904804f32e..525d5ba2f1 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -53,7 +53,7 @@ class HeuristicsWidget(Widget, Ui_Form):
         self.opt_delete_blank_paragraphs.setEnabled(state)
         self.opt_format_scene_breaks.setEnabled(state)
         self.opt_dehyphenate.setEnabled(state)
-        self.opt_renumber_headings(state)
+        self.opt_renumber_headings.setEnabled(state)
         
         self.opt_unwrap_lines.setEnabled(state)
         if state and self.opt_unwrap_lines.checkState() == Qt.Checked:

From 9134d51377c1813fadec856867646bce8b74d762 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 20:32:21 -0500
Subject: [PATCH 032/118] Clean up GUI option widgets.

---
 src/calibre/gui2/convert/heuristics.ui        |  65 +++---
 .../gui2/convert/search_and_replace.py        |   8 +-
 .../gui2/convert/search_and_replace.ui        | 206 +++++++++++++-----
 .../gui2/convert/structure_detection.ui       |  15 +-
 src/calibre/gui2/convert/xexp_edit.ui         |  21 +-
 5 files changed, 199 insertions(+), 116 deletions(-)

diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index c5f3c2cb3e..1578b7146c 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -6,8 +6,8 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>811</width>
-    <height>479</height>
+    <width>938</width>
+    <height>470</height>
    </rect>
   </property>
   <property name="windowTitle">
@@ -26,7 +26,7 @@
      <property name="title">
       <string>Heuristics</string>
      </property>
-     <layout class="QGridLayout" name="gridLayout_2">
+     <layout class="QGridLayout" name="gridLayout">
       <item row="0" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_unwrap_lines">
         <property name="text">
@@ -34,19 +34,6 @@
         </property>
        </widget>
       </item>
-      <item row="1" column="0">
-       <spacer name="horizontalSpacer">
-        <property name="orientation">
-         <enum>Qt::Horizontal</enum>
-        </property>
-        <property name="sizeHint" stdset="0">
-         <size>
-          <width>131</width>
-          <height>22</height>
-         </size>
-        </property>
-       </spacer>
-      </item>
       <item row="1" column="1">
        <widget class="QLabel" name="huf_label">
         <property name="text">
@@ -73,13 +60,33 @@
         </property>
        </widget>
       </item>
-      <item row="2" column="0" colspan="3">
+      <item row="1" column="3">
+       <spacer name="horizontalSpacer_2">
+        <property name="orientation">
+         <enum>Qt::Horizontal</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>40</width>
+          <height>20</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+      <item row="2" column="0" colspan="4">
        <widget class="QCheckBox" name="opt_markup_chapter_headings">
         <property name="text">
          <string>Detect and markup unformatted chapter headings and sub headings</string>
         </property>
        </widget>
       </item>
+      <item row="3" column="0" colspan="4">
+       <widget class="QCheckBox" name="opt_renumber_headings">
+        <property name="text">
+         <string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
+        </property>
+       </widget>
+      </item>
       <item row="4" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_delete_blank_paragraphs">
         <property name="text">
@@ -87,7 +94,7 @@
         </property>
        </widget>
       </item>
-      <item row="5" column="0" colspan="2">
+      <item row="5" column="0" colspan="3">
        <widget class="QCheckBox" name="opt_format_scene_breaks">
         <property name="text">
          <string>Ensure scene breaks are consistently formatted</string>
@@ -115,7 +122,7 @@
         </property>
        </widget>
       </item>
-      <item row="9" column="0">
+      <item row="9" column="0" colspan="2">
        <spacer name="verticalSpacer">
         <property name="orientation">
          <enum>Qt::Vertical</enum>
@@ -128,26 +135,6 @@
         </property>
        </spacer>
       </item>
-      <item row="1" column="3">
-       <spacer name="horizontalSpacer_2">
-        <property name="orientation">
-         <enum>Qt::Horizontal</enum>
-        </property>
-        <property name="sizeHint" stdset="0">
-         <size>
-          <width>40</width>
-          <height>20</height>
-         </size>
-        </property>
-       </spacer>
-      </item>
-      <item row="3" column="0">
-       <widget class="QCheckBox" name="opt_renumber_headings">
-        <property name="text">
-         <string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
-        </property>
-       </widget>
-      </item>
      </layout>
     </widget>
    </item>
diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index 36a496c520..fff75a29ba 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -12,7 +12,7 @@ from calibre.gui2 import error_dialog
 
 class SearchAndReplaceWidget(Widget, Ui_Form):
 
-    TITLE = _('Search and Replace')
+    TITLE = _('Search &\nReplace')
     HELP  = _('Modify the document text and structure using user defined patterns.')
     COMMIT_NAME = 'search_and_replace'
 
@@ -24,9 +24,9 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
                 )
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
-        self.opt_sr1_search.set_msg(_('Regular Expression'))
-        self.opt_sr2_search.set_msg(_('Regular Expression'))
-        self.opt_sr3_search.set_msg(_('Regular Expression'))
+        self.opt_sr1_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr2_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr3_search.set_msg(_('Search Regular Expression'))
         
     def break_cycles(self):
         Widget.break_cycles(self)
diff --git a/src/calibre/gui2/convert/search_and_replace.ui b/src/calibre/gui2/convert/search_and_replace.ui
index ed500a4dd0..e0e9570f8c 100644
--- a/src/calibre/gui2/convert/search_and_replace.ui
+++ b/src/calibre/gui2/convert/search_and_replace.ui
@@ -6,80 +6,176 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>657</width>
-    <height>479</height>
+    <width>198</width>
+    <height>350</height>
    </rect>
   </property>
+  <property name="sizePolicy">
+   <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+    <horstretch>0</horstretch>
+    <verstretch>0</verstretch>
+   </sizepolicy>
+  </property>
   <property name="windowTitle">
    <string>Form</string>
   </property>
-  <layout class="QGridLayout" name="gridLayout">
-   <item row="0" column="1">
-    <widget class="QLabel" name="label_4">
-     <property name="text">
-      <string>Search</string>
+  <layout class="QGridLayout" name="gridLayout_4">
+   <property name="sizeConstraint">
+    <enum>QLayout::SetDefaultConstraint</enum>
+   </property>
+   <item row="0" column="0">
+    <widget class="QGroupBox" name="groupBox">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
      </property>
-    </widget>
-   </item>
-   <item row="0" column="2">
-    <widget class="QLabel" name="label_5">
-     <property name="text">
-      <string>Replace</string>
+     <property name="title">
+      <string>1.</string>
      </property>
+     <layout class="QGridLayout" name="gridLayout_2">
+      <property name="sizeConstraint">
+       <enum>QLayout::SetMinimumSize</enum>
+      </property>
+      <item row="0" column="0">
+       <widget class="RegexEdit" name="opt_sr1_search" native="true">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <widget class="QLabel" name="label_4">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="text">
+         <string>Replacement Text</string>
+        </property>
+       </widget>
+      </item>
+      <item row="2" column="0">
+       <widget class="QLineEdit" name="opt_sr1_replace">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+     </layout>
     </widget>
    </item>
    <item row="1" column="0">
-    <widget class="QLabel" name="label">
-     <property name="text">
-      <string>1.</string>
+    <widget class="QGroupBox" name="groupBox_2">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
      </property>
-    </widget>
-   </item>
-   <item row="1" column="1">
-    <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
-   </item>
-   <item row="1" column="2">
-    <widget class="QLineEdit" name="opt_sr1_replace"/>
-   </item>
-   <item row="2" column="0">
-    <widget class="QLabel" name="label_2">
-     <property name="text">
+     <property name="title">
       <string>2.</string>
      </property>
+     <layout class="QGridLayout" name="gridLayout">
+      <property name="sizeConstraint">
+       <enum>QLayout::SetMinimumSize</enum>
+      </property>
+      <item row="0" column="0">
+       <widget class="RegexEdit" name="opt_sr2_search" native="true">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <widget class="QLabel" name="label_5">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="text">
+         <string>Replacement Text</string>
+        </property>
+       </widget>
+      </item>
+      <item row="2" column="0">
+       <widget class="QLineEdit" name="opt_sr2_replace">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+     </layout>
     </widget>
    </item>
-   <item row="2" column="1">
-    <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
-   </item>
-   <item row="2" column="2">
-    <widget class="QLineEdit" name="opt_sr2_replace"/>
-   </item>
-   <item row="3" column="0">
-    <widget class="QLabel" name="label_3">
-     <property name="text">
+   <item row="2" column="0">
+    <widget class="QGroupBox" name="groupBox_3">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
+     </property>
+     <property name="title">
       <string>3.</string>
      </property>
+     <layout class="QGridLayout" name="gridLayout_3">
+      <property name="sizeConstraint">
+       <enum>QLayout::SetMinimumSize</enum>
+      </property>
+      <item row="0" column="0">
+       <widget class="RegexEdit" name="opt_sr3_search" native="true">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <widget class="QLabel" name="label_6">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="text">
+         <string>Replacement Text</string>
+        </property>
+       </widget>
+      </item>
+      <item row="2" column="0">
+       <widget class="QLineEdit" name="opt_sr3_replace">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+     </layout>
     </widget>
    </item>
-   <item row="3" column="1">
-    <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
-   </item>
-   <item row="3" column="2">
-    <widget class="QLineEdit" name="opt_sr3_replace"/>
-   </item>
-   <item row="4" column="1">
-    <spacer name="verticalSpacer">
-     <property name="orientation">
-      <enum>Qt::Vertical</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>20</width>
-       <height>330</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
   </layout>
  </widget>
  <customwidgets>
diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui
index 262894d42d..ef0677a67c 100644
--- a/src/calibre/gui2/convert/structure_detection.ui
+++ b/src/calibre/gui2/convert/structure_detection.ui
@@ -27,7 +27,7 @@
      </property>
     </widget>
    </item>
-   <item row="1" column="1" colspan="2">
+   <item row="1" column="1">
     <widget class="QComboBox" name="opt_chapter_mark">
      <property name="minimumContentsLength">
       <number>20</number>
@@ -64,6 +64,19 @@
      </property>
     </spacer>
    </item>
+   <item row="1" column="2">
+    <spacer name="horizontalSpacer">
+     <property name="orientation">
+      <enum>Qt::Horizontal</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>40</width>
+       <height>20</height>
+      </size>
+     </property>
+    </spacer>
+   </item>
   </layout>
  </widget>
  <customwidgets>
diff --git a/src/calibre/gui2/convert/xexp_edit.ui b/src/calibre/gui2/convert/xexp_edit.ui
index 7e89ec5d43..4b26eb8dcf 100644
--- a/src/calibre/gui2/convert/xexp_edit.ui
+++ b/src/calibre/gui2/convert/xexp_edit.ui
@@ -6,8 +6,8 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>422</width>
-    <height>64</height>
+    <width>434</width>
+    <height>74</height>
    </rect>
   </property>
   <property name="windowTitle">
@@ -53,13 +53,13 @@
    <item row="0" column="1">
     <widget class="QToolButton" name="button">
      <property name="toolTip">
-      <string>Use a wizard to help construct the XPath expression</string>
+      <string>Use a wizard to help construct the Regular expression</string>
      </property>
      <property name="text">
       <string>...</string>
      </property>
      <property name="icon">
-      <iconset resource="../../../../resources/images.qrc">
+      <iconset>
        <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
      </property>
      <property name="iconSize">
@@ -70,19 +70,6 @@
      </property>
     </widget>
    </item>
-   <item row="0" column="2">
-    <spacer name="horizontalSpacer">
-     <property name="orientation">
-      <enum>Qt::Horizontal</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>20</width>
-       <height>20</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
   </layout>
  </widget>
  <customwidgets>

From a21bf30ff8207ec9a4e644d1ffbfcbee58a346c6 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 16 Jan 2011 08:41:16 +0000
Subject: [PATCH 033/118] 1) finish JSON-storage of template function source
 code 2) fix for #8388. This fix was chosen because it emulates the behavior
 in 0.7.38, where get_metadata returned empty author lists

---
 .../gui2/preferences/template_functions.py        | 15 +++++++++++++--
 src/calibre/library/database2.py                  |  5 ++++-
 src/calibre/library/sqlite.py                     |  2 +-
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/calibre/gui2/preferences/template_functions.py b/src/calibre/gui2/preferences/template_functions.py
index 2e16b0f4c3..8ffd65b2b5 100644
--- a/src/calibre/gui2/preferences/template_functions.py
+++ b/src/calibre/gui2/preferences/template_functions.py
@@ -5,7 +5,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import traceback
+import json, traceback
 
 from calibre.gui2 import error_dialog
 from calibre.gui2.preferences import ConfigWidgetBase, test_widget
@@ -73,6 +73,12 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
         self.textBrowser.setHtml(help_text)
 
     def initialize(self):
+        try:
+            with open(P('template-functions.json'), 'rb') as f:
+                self.builtin_source_dict = json.load(f, encoding='utf-8')
+        except:
+            self.builtin_source_dict = {}
+
         self.funcs = formatter_functions.get_functions()
         self.builtins = formatter_functions.get_builtins()
 
@@ -179,8 +185,13 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
         func = self.funcs[txt]
         self.argument_count.setValue(func.arg_count)
         self.documentation.setText(func.doc)
-        self.program.setPlainText(func.program_text)
         if txt in self.builtins:
+            if hasattr(func, 'program_text'):
+                self.program.setPlainText(func.program_text)
+            elif txt in self.builtin_source_dict:
+                self.program.setPlainText(self.builtin_source_dict[txt])
+            else:
+                self.program.setPlainText(_('function source code not available'))
             self.documentation.setReadOnly(True)
             self.argument_count.setReadOnly(True)
             self.program.setReadOnly(True)
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 3a2109e01e..df094347b8 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -690,7 +690,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         mi = Metadata(None)
 
         aut_list = row[fm['au_map']]
-        aut_list = [p.split(':::') for p in aut_list.split(':#:')]
+        if not aut_list:
+            aut_list = []
+        else:
+            aut_list = [p.split(':::') for p in aut_list.split(':#:')]
         aum = []
         aus = {}
         for (author, author_sort) in aut_list:
diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py
index 83f19b8711..622d6b8459 100644
--- a/src/calibre/library/sqlite.py
+++ b/src/calibre/library/sqlite.py
@@ -100,7 +100,7 @@ class AumSortedConcatenate(object):
         keys = self.ans.keys()
         l = len(keys)
         if l == 0:
-            return 'Unknown:::Unknown'
+            return None
         if l == 1:
             return self.ans[keys[0]]
         return ':#:'.join([self.ans[v] for v in sorted(keys)])

From e0b2d0b62a6b1f4c4a2f37cdcdffa5e1744b892b Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 16 Jan 2011 10:34:05 +0000
Subject: [PATCH 034/118] Add function documentation to template editing dialog
 (F2 on a composite column)

---
 src/calibre/gui2/dialogs/template_dialog.py | 40 +++++++++-
 src/calibre/gui2/dialogs/template_dialog.ui | 81 ++++++++++++++++-----
 2 files changed, 103 insertions(+), 18 deletions(-)

diff --git a/src/calibre/gui2/dialogs/template_dialog.py b/src/calibre/gui2/dialogs/template_dialog.py
index 60d4025ef9..62accdc842 100644
--- a/src/calibre/gui2/dialogs/template_dialog.py
+++ b/src/calibre/gui2/dialogs/template_dialog.py
@@ -3,8 +3,11 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __license__   = 'GPL v3'
 
+import json
+
 from PyQt4.Qt import Qt, QDialog, QDialogButtonBox
 from calibre.gui2.dialogs.template_dialog_ui import Ui_TemplateDialog
+from calibre.utils.formatter_functions import formatter_functions
 
 class TemplateDialog(QDialog, Ui_TemplateDialog):
 
@@ -17,9 +20,44 @@ class TemplateDialog(QDialog, Ui_TemplateDialog):
         self.setWindowFlags(self.windowFlags()&(~Qt.WindowContextHelpButtonHint))
         self.setWindowIcon(icon)
 
+        self.textbox.setTabStopWidth(10)
+        self.source_code.setTabStopWidth(10)
+        self.documentation.setReadOnly(True)
+        self.source_code.setReadOnly(True)
+
         if text is not None:
             self.textbox.setPlainText(text)
-        self.textbox.setTabStopWidth(50)
         self.buttonBox.button(QDialogButtonBox.Ok).setText(_('&OK'))
         self.buttonBox.button(QDialogButtonBox.Cancel).setText(_('&Cancel'))
 
+        try:
+            with open(P('template-functions.json'), 'rb') as f:
+                self.builtin_source_dict = json.load(f, encoding='utf-8')
+        except:
+            self.builtin_source_dict = {}
+
+        self.funcs = formatter_functions.get_functions()
+        self.builtins = formatter_functions.get_builtins()
+
+        func_names = sorted(self.funcs)
+        self.function.clear()
+        self.function.addItem('')
+        self.function.addItems(func_names)
+        self.function.setCurrentIndex(0)
+        self.function.currentIndexChanged[str].connect(self.function_changed)
+
+        print self.textbox.tabStopWidth()
+        print self.source_code.tabStopWidth()
+
+    def function_changed(self, toWhat):
+        name = unicode(toWhat)
+        self.source_code.clear()
+        self.documentation.clear()
+        if name in self.funcs:
+            self.documentation.setPlainText(self.funcs[name].doc)
+            if name in self.builtins:
+                if name in self.builtin_source_dict:
+                    self.source_code.setPlainText(self.builtin_source_dict[name])
+            else:
+                self.source_code.setPlainText(self.funcs[name].program_text)
+
diff --git a/src/calibre/gui2/dialogs/template_dialog.ui b/src/calibre/gui2/dialogs/template_dialog.ui
index a30d6ef273..e1980a8397 100644
--- a/src/calibre/gui2/dialogs/template_dialog.ui
+++ b/src/calibre/gui2/dialogs/template_dialog.ui
@@ -6,8 +6,8 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>500</width>
-    <height>235</height>
+    <width>588</width>
+    <height>546</height>
    </rect>
   </property>
   <property name="sizePolicy">
@@ -19,21 +19,68 @@
   <property name="windowTitle">
    <string>Edit Comments</string>
   </property>
-   <layout class="QVBoxLayout" name="verticalLayout">
-    <item>
-     <widget class="QPlainTextEdit" name="textbox"/>
-    </item>
-    <item>
-     <widget class="QDialogButtonBox" name="buttonBox">
-      <property name="orientation">
-       <enum>Qt::Horizontal</enum>
-      </property>
-      <property name="standardButtons">
-       <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set>
-      </property>
-     </widget>
-    </item>
-   </layout>
+  <layout class="QVBoxLayout" name="verticalLayout">
+   <item>
+    <widget class="QPlainTextEdit" name="textbox"/>
+   </item>
+   <item>
+    <widget class="QDialogButtonBox" name="buttonBox">
+     <property name="orientation">
+      <enum>Qt::Horizontal</enum>
+     </property>
+     <property name="standardButtons">
+      <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set>
+     </property>
+    </widget>
+   </item>
+   <item>
+    <layout class="QGridLayout" name="gridLayout">
+     <item row="0" column="0">
+      <widget class="QLabel" name="label">
+       <property name="text">
+        <string>Function name:</string>
+       </property>
+      </widget>
+     </item>
+     <item row="0" column="1">
+      <widget class="QComboBox" name="function"/>
+     </item>
+     <item row="1" column="0">
+      <widget class="QLabel" name="label_2">
+       <property name="text">
+        <string>Documentation:</string>
+       </property>
+       <property name="alignment">
+        <set>Qt::AlignLeading|Qt::AlignLeft|Qt::AlignTop</set>
+       </property>
+      </widget>
+     </item>
+     <item row="2" column="0">
+      <widget class="QLabel" name="label_3">
+       <property name="text">
+        <string>Python code:</string>
+       </property>
+       <property name="alignment">
+        <set>Qt::AlignLeading|Qt::AlignLeft|Qt::AlignTop</set>
+       </property>
+      </widget>
+     </item>
+     <item row="1" column="1">
+      <widget class="QPlainTextEdit" name="documentation">
+       <property name="maximumSize">
+        <size>
+         <width>16777215</width>
+         <height>75</height>
+        </size>
+       </property>
+      </widget>
+     </item>
+     <item row="2" column="1">
+      <widget class="QPlainTextEdit" name="source_code"/>
+     </item>
+    </layout>
+   </item>
+  </layout>
  </widget>
  <resources/>
  <connections>

From 4a9e8bcb2fd0d60438e3265defb358891de6bc75 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 16 Jan 2011 10:48:43 +0000
Subject: [PATCH 035/118] Remove some print statements.

---
 src/calibre/gui2/dialogs/template_dialog.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/calibre/gui2/dialogs/template_dialog.py b/src/calibre/gui2/dialogs/template_dialog.py
index 62accdc842..174056ef80 100644
--- a/src/calibre/gui2/dialogs/template_dialog.py
+++ b/src/calibre/gui2/dialogs/template_dialog.py
@@ -46,9 +46,6 @@ class TemplateDialog(QDialog, Ui_TemplateDialog):
         self.function.setCurrentIndex(0)
         self.function.currentIndexChanged[str].connect(self.function_changed)
 
-        print self.textbox.tabStopWidth()
-        print self.source_code.tabStopWidth()
-
     def function_changed(self, toWhat):
         name = unicode(toWhat)
         self.source_code.clear()

From 5c4154bb0d37a273e4925c2daa4dc15b4e9f752b Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 16 Jan 2011 11:26:34 +0000
Subject: [PATCH 036/118] Make date.format_date render UNDEFINED_DATE as ''.
 This makes composite columns and templates behave in the same fashion as the
 GUI has for some time.

---
 src/calibre/utils/date.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/utils/date.py b/src/calibre/utils/date.py
index f025a0c9bf..2551b90788 100644
--- a/src/calibre/utils/date.py
+++ b/src/calibre/utils/date.py
@@ -148,6 +148,9 @@ def format_date(dt, format, assume_utc=False, as_utc=False):
         if len(mo.group(0)) == 2: return '%02d'%(dt.year % 100)
         return '%04d'%dt.year
 
+    if dt == UNDEFINED_DATE:
+        return ''
+
     format = re.sub('d{1,4}', format_day, format)
     format = re.sub('M{1,4}', format_month, format)
     return re.sub('yyyy|yy', format_year, format)

From 1272988089814321248ffe0c58232f1d061a67a3 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 20:11:52 +0800
Subject: [PATCH 037/118] enabled hyphen removal across the entire document
 text, refactored logic to reduce false positives, added verbose debug output

---
 src/calibre/ebooks/conversion/preprocess.py | 47 +++++++++-----
 src/calibre/ebooks/conversion/utils.py      | 69 +++++++++++----------
 src/calibre/ebooks/txt/input.py             |  4 +-
 3 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index abaff77f33..9dedd05e33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -174,13 +174,19 @@ class Dehyphenator(object):
     retain hyphens.
     '''
 
-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = default_log if log is None else log
+        self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -191,31 +197,44 @@ class Dehyphenator(object):
             wraptags = ''
         hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
-        lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
         try:
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
         if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("    Cleanup:returned dehyphenated word: " + str(dehyphenated))
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("        Cleanup:returned hyphenated word: " + str(hyphenated))
                 return hyphenated
             else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log("            Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                 return firsthalf+u'\u2014'+wraptags+secondhalf
 
         else:
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("     returned dehyphenated word: " + str(dehyphenated))
                 return dehyphenated
             else:
-                #print "           returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("          returned hyphenated word: " + str(hyphenated))
                 return hyphenated
 
     def __call__(self, html, format, length=1):
@@ -228,7 +247,7 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)') # for later, not called anywhere yet
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         elif format == 'txt_cleanup':
@@ -512,7 +531,7 @@ class HTMLPreProcessor(object):
 
         if is_pdftohtml and length > -1:
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html', length)
 
         if is_pdftohtml:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96a9a4783d..4a118d423c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -322,11 +322,11 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, font, & italics tags
-        html = re.sub(r'\s*<font[^>]*>\s*</font>\s*', '', html)
+        # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         self.deleted_nbsps = True
         return html
 
@@ -376,27 +376,31 @@ class HeuristicProcessor(object):
         except:
             self.log("Can't get wordcount")
 
-        if 0 < self.totalwords < 50:
+        print "found "+unicode(self.totalwords)+" words in the flow"
+        if self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
 
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and  mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # markup using text processing
-            html = self.markup_pre(html)
+        if self.cleanup_required():
+            ###### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and  mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', False):
             html = self.fix_nbsp_indents(html)
 
         if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
             html = self.cleanup_markup(html)
 
         # ADE doesn't render <br />, change to empty paragraphs
@@ -420,26 +424,26 @@ class HeuristicProcessor(object):
             self.log("deleting blank lines")
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
+
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        format = self.analyze_line_endings(html)
+
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+        # Calculate Length
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
+        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
             
         ###### Unwrap lines ######
         if getattr(self.extra_opts, 'unwrap_lines', False):
-            # Determine line ending type
-            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-            # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
-            # that lines can be un-wrapped across page boundaries
-            format = self.analyze_line_endings(html)
-
-            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-            # more of the lines break in the same region of the document then unwrapping is required
-            docanalysis = DocAnalysis(format, html)
-            hardbreaks = docanalysis.line_histogram(.50)
-            self.log("Hard line breaks check returned "+unicode(hardbreaks))
-
-            # Calculate Length
-            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-            length = docanalysis.line_length(unwrap_factor)
-            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-
             # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
             if hardbreaks or unwrap_factor < 0.4:
                 self.log("Unwrapping required, unwrapping Lines")
@@ -447,15 +451,16 @@ class HeuristicProcessor(object):
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html', length)
                 html = self.punctuation_unwrap(length, html, 'html')
-                #check any remaining hyphens, but only unwrap if there is a match
-                dehyphenator = Dehyphenator()
+                # unwrap remaining hyphens based on line length, but only remove if there is a match
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                 html = dehyphenator(html,'html_cleanup', length)
 
         if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
             self.log("Fixing hyphenated content")
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5cffbafe21..8bf33c4837 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
                     log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
 
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
             txt = dehyphenator(txt,'txt', length)
 
             # We don't check for block because the processor assumes block.
@@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
                 setattr(options, 'dehyphenate', True)
 
         # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        dehyphenator = Dehyphenator()
+        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
         html = dehyphenator(html,'txt_cleanup', length)
         html = dehyphenator(html,'html_cleanup', length)
 

From 89dd86056e727de35ff844cf712051b96a96e712 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 20:26:52 +0800
Subject: [PATCH 038/118] Drop stale 'not called anywhere yet' regex comment

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 9dedd05e33..d1d275eb97 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -247,7 +247,7 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         elif format == 'txt_cleanup':

From e0d1de2ce8832eb55abacf85edbfdcb1fb5d549e Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 20:54:17 +0800
Subject: [PATCH 039/118] removed hyphen removal from text input that's covered
 by the heuristics option

---
 src/calibre/ebooks/txt/input.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 8bf33c4837..39bfb4b132 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -137,11 +137,6 @@ class TXTInput(InputFormatPlugin):
                 setattr(options, 'format_scene_breaks', True)
                 setattr(options, 'dehyphenate', True)
 
-        # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
-        html = dehyphenator(html,'txt_cleanup', length)
-        html = dehyphenator(html,'html_cleanup', length)
-
         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')
         for opt in html_input.options:

From b2626bace330aba990a35e5caf15a33193a43652 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 08:55:26 -0500
Subject: [PATCH 040/118] Fix search_and_replace option names.

---
 src/calibre/gui2/convert/search_and_replace.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index fff75a29ba..34c6cdf1e9 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -36,7 +36,7 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
         self.opt_sr3_search.break_cycles()
 
     def pre_commit_check(self):
-        for x in ('sr1-search', 'sr2-search', 'sr3-search'):
+        for x in ('sr1_search', 'sr2_search', 'sr3_search'):
             x = getattr(self, 'opt_'+x)
             try:
                 pat = unicode(x.regex)

From 68587e8679b70463c51bd66bdd78339ea9838a8a Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 09:16:05 -0500
Subject: [PATCH 041/118] Fix GUI dialog errors preventing them from returning
 properly.

---
 src/calibre/gui2/convert/heuristics.py         | 7 +++++--
 src/calibre/gui2/convert/search_and_replace.py | 3 +--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 525d5ba2f1..4735782f52 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -34,8 +34,11 @@ class HeuristicsWidget(Widget, Ui_Form):
     def break_cycles(self):
         Widget.break_cycles(self)
         
-        self.opt_enable_heuristics.stateChanged.disconnect()
-        self.opt_unwrap_lines.stateChanged.disconnect()
+        try:
+            self.opt_enable_heuristics.stateChanged.disconnect()
+            self.opt_unwrap_lines.stateChanged.disconnect()
+        except:
+            pass
         
     def set_value_handler(self, g, val):
         if val is None and g is self.opt_html_unwrap_factor:
diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index 34c6cdf1e9..af944a74d1 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -45,5 +45,4 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
                 error_dialog(self, _('Invalid regular expression'),
                              _('Invalid regular expression: %s')%err).exec_()
                 return False
-            
-
+        return True

From d271747cffafb7723ff15ba5628d6e0a96ac98c7 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 09:44:39 -0500
Subject: [PATCH 042/118] Simplify more GUI widgets.

---
 src/calibre/gui2/convert/pdb_output.py | 19 ++++-----------
 src/calibre/gui2/convert/pdf_output.py | 33 ++++++--------------------
 src/calibre/gui2/convert/txt_output.py | 18 ++++----------
 src/calibre/gui2/widgets.py            | 26 --------------------
 4 files changed, 15 insertions(+), 81 deletions(-)

diff --git a/src/calibre/gui2/convert/pdb_output.py b/src/calibre/gui2/convert/pdb_output.py
index ec6b7abb08..bf1d5048e2 100644
--- a/src/calibre/gui2/convert/pdb_output.py
+++ b/src/calibre/gui2/convert/pdb_output.py
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
 
 from calibre.gui2.convert.pdb_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.ebooks.pdb import FORMAT_WRITERS
-from calibre.gui2.widgets import BasicComboModel
 
 format_model = None
 
@@ -21,17 +19,8 @@ class PluginWidget(Widget, Ui_Form):
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
         self.db, self.book_id = db, book_id
+
+        for x in get_option('format').option.choices:
+            self.opt_format.addItem(x)
+        
         self.initialize_options(get_option, get_help, db, book_id)
-
-        default = self.opt_format.currentText()
-
-        global format_model
-        if format_model is None:
-            format_model = BasicComboModel(FORMAT_WRITERS.keys())
-        self.format_model = format_model
-        self.opt_format.setModel(self.format_model)
-
-        default_index = self.opt_format.findText(default)
-        format_index = self.opt_format.findText('doc')
-        self.opt_format.setCurrentIndex(default_index if default_index != -1 else format_index if format_index != -1 else 0)
-
diff --git a/src/calibre/gui2/convert/pdf_output.py b/src/calibre/gui2/convert/pdf_output.py
index 5d6a595079..1c526939c2 100644
--- a/src/calibre/gui2/convert/pdf_output.py
+++ b/src/calibre/gui2/convert/pdf_output.py
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
 
 from calibre.gui2.convert.pdf_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.ebooks.pdf.pageoptions import PAPER_SIZES, ORIENTATIONS
-from calibre.gui2.widgets import BasicComboModel
 
 paper_size_model = None
 orientation_model = None
@@ -23,28 +21,11 @@ class PluginWidget(Widget, Ui_Form):
         Widget.__init__(self, parent, ['paper_size',
             'orientation', 'preserve_cover_aspect_ratio'])
         self.db, self.book_id = db, book_id
+        
+        for x in get_option('paper_size').option.choices:
+            self.opt_paper_size.addItem(x)
+        for x in get_option('orientation').option.choices:
+            self.opt_orientation.addItem(x)
+        
         self.initialize_options(get_option, get_help, db, book_id)
-
-        default_paper_size = self.opt_paper_size.currentText()
-        default_orientation = self.opt_orientation.currentText()
-
-        global paper_size_model
-        if paper_size_model is None:
-            paper_size_model = BasicComboModel(PAPER_SIZES.keys())
-        self.paper_size_model = paper_size_model
-        self.opt_paper_size.setModel(self.paper_size_model)
-
-        default_paper_size_index = self.opt_paper_size.findText(default_paper_size)
-        letter_index = self.opt_paper_size.findText('letter')
-        self.opt_paper_size.setCurrentIndex(default_paper_size_index if default_paper_size_index != -1 else letter_index if letter_index != -1 else 0)
-
-        global orientation_model
-        if orientation_model is None:
-            orientation_model = BasicComboModel(ORIENTATIONS.keys())
-        self.orientation_model = orientation_model
-        self.opt_orientation.setModel(self.orientation_model)
-
-        default_orientation_index = self.opt_orientation.findText(default_orientation)
-        orientation_index = self.opt_orientation.findText('portrait')
-        self.opt_orientation.setCurrentIndex(default_orientation_index if default_orientation_index != -1 else orientation_index if orientation_index != -1 else 0)
-
+        
\ No newline at end of file
diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py
index 9a228bd4cf..21a9e60bed 100644
--- a/src/calibre/gui2/convert/txt_output.py
+++ b/src/calibre/gui2/convert/txt_output.py
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
 
 from calibre.gui2.convert.txt_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.ebooks.txt.newlines import TxtNewlines
-from calibre.gui2.widgets import BasicComboModel
 
 newline_model = None
 
@@ -24,16 +22,8 @@ class PluginWidget(Widget, Ui_Form):
         'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
         'txt_output_encoding'])
         self.db, self.book_id = db, book_id
+        
+        for x in get_option('newline').option.choices:
+            self.opt_newline.addItem(x)
+        
         self.initialize_options(get_option, get_help, db, book_id)
-
-        default = self.opt_newline.currentText()
-
-        global newline_model
-        if newline_model is None:
-            newline_model = BasicComboModel(TxtNewlines.NEWLINE_TYPES.keys())
-        self.newline_model = newline_model
-        self.opt_newline.setModel(self.newline_model)
-
-        default_index = self.opt_newline.findText(default)
-        system_index = self.opt_newline.findText('system')
-        self.opt_newline.setCurrentIndex(default_index if default_index != -1 else system_index if system_index != -1 else 0)
diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
index f2ff783a76..28c5de4322 100644
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@@ -311,32 +311,6 @@ class FontFamilyModel(QAbstractListModel):
     def index_of(self, family):
         return self.families.index(family.strip())
 
-class BasicComboModel(QAbstractListModel):
-
-    def __init__(self, items, *args):
-        QAbstractListModel.__init__(self, *args)
-        self.items = [i for i in items]
-        self.items.sort()
-
-    def rowCount(self, *args):
-        return len(self.items)
-
-    def data(self, index, role):
-        try:
-            item = self.items[index.row()]
-        except:
-            traceback.print_exc()
-            return NONE
-        if role == Qt.DisplayRole:
-            return QVariant(item)
-        if role == Qt.FontRole:
-            return QVariant(QFont(item))
-        return NONE
-
-    def index_of(self, item):
-        return self.items.index(item.strip())
-
-
 class BasicListItem(QListWidgetItem):
 
     def __init__(self, text, user_data=None):

From f7650de369d1dec7e3ee82744b55f292870335f7 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 09:51:03 -0500
Subject: [PATCH 043/118] TXT Output GUI widget: Disable markdown options when
 markdown is not enabled.

---
 src/calibre/gui2/convert/txt_output.py | 28 ++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py
index 21a9e60bed..a16dd68014 100644
--- a/src/calibre/gui2/convert/txt_output.py
+++ b/src/calibre/gui2/convert/txt_output.py
@@ -4,6 +4,8 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+from PyQt4.Qt import Qt
+
 from calibre.gui2.convert.txt_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
 
@@ -21,9 +23,27 @@ class PluginWidget(Widget, Ui_Form):
         ['newline', 'max_line_length', 'force_max_line_length',
         'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
         'txt_output_encoding'])
-        self.db, self.book_id = db, book_id
-        
+        self.db, self.book_id = db, book_id        
         for x in get_option('newline').option.choices:
-            self.opt_newline.addItem(x)
-        
+            self.opt_newline.addItem(x)        
         self.initialize_options(get_option, get_help, db, book_id)
+
+        self.opt_markdown_format.stateChanged.connect(self.enable_markdown_format)
+        self.enable_markdown_format(self.opt_markdown_format.checkState())
+
+    def break_cycles(self):
+        Widget.break_cycles(self)
+        
+        try:
+            self.opt_markdown_format.stateChanged.disconnect()
+        except:
+            pass
+        
+    def enable_markdown_format(self, state):
+        if state == Qt.Checked:
+            state = True
+        else:
+            state = False
+        self.opt_keep_links.setEnabled(state)
+        self.opt_keep_image_references.setEnabled(state)
+        
\ No newline at end of file

From 9148320a8bd9263f3495a28cd4d32f6cfa467c35 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 10:00:23 -0500
Subject: [PATCH 044/118] Heuristic class: use log.debug to reduce output during
 CLI conversion.

---
 src/calibre/ebooks/conversion/utils.py | 42 +++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 4a118d423c..15522d25e6 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -35,12 +35,12 @@ class HeuristicProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
 
@@ -48,7 +48,7 @@ class HeuristicProcessor(object):
         chap = match.group('section')
         styles = match.group('styles')
         self.html_preprocess_sections = self.html_preprocess_sections + 1
-        self.log("marked " + unicode(self.html_preprocess_sections) +
+        self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                 " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap
 
@@ -91,7 +91,7 @@ class HeuristicProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+        self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
                 unicode(tot_htm_ends) + " marked up endings")
 
         if percent > 1:
@@ -100,7 +100,7 @@ class HeuristicProcessor(object):
             percent = 0
 
         min_lns = tot_ln_fds * percent
-        self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+        self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True
 
@@ -171,7 +171,7 @@ class HeuristicProcessor(object):
         #print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+        self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
 
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
@@ -221,7 +221,7 @@ class HeuristicProcessor(object):
                     break
                 full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
                 n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-                self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+                self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
                 if lookahead_ignorecase:
                     chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
                     chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
@@ -244,7 +244,7 @@ class HeuristicProcessor(object):
         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:
             words_per_chptr = wordcount / self.html_preprocess_sections
-        self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+        self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
         return html
 
     def punctuation_unwrap(self, length, content, format):
@@ -291,7 +291,7 @@ class HeuristicProcessor(object):
     def markup_pre(self, html):
         pre = re.compile(r'<pre>', re.IGNORECASE)
         if len(pre.findall(html)) >= 1:
-            self.log("Running Text Processing")
+            self.log.debug("Running Text Processing")
             outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
             html = outerhtml.sub(self.txt_process, html)
         else:
@@ -311,7 +311,7 @@ class HeuristicProcessor(object):
         txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
         html = txtindent.sub(self.insert_indent, html)
         if self.found_indents > 1:
-            self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+            self.log.debug("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
         return html
 
     def cleanup_markup(self, html):
@@ -351,7 +351,7 @@ class HeuristicProcessor(object):
         blanklines = self.blankreg.findall(html)
         lines = self.linereg.findall(html)
         if len(lines) > 1:
-            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
+            self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
                     
             if float(len(blanklines)) / float(len(lines)) > 0.40:
@@ -367,18 +367,18 @@ class HeuristicProcessor(object):
 
 
     def __call__(self, html):
-        self.log("*********  Heuristic processing HTML  *********")
+        self.log.debug("*********  Heuristic processing HTML  *********")
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
         try:
             self.totalwords = self.get_word_count(html)
         except:
-            self.log("Can't get wordcount")
+            self.log.warn("Can't get wordcount")
 
         print "found "+unicode(self.totalwords)+" words in the flow"
         if self.totalwords < 50:
-            self.log("flow is too short, not running heuristics")
+            self.log.warn("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
@@ -391,7 +391,7 @@ class HeuristicProcessor(object):
             # <pre> tags), check and  mark up line endings if required before proceeding
             # fix indents must run after this step
             if self.no_markup(html, 0.1):
-                self.log("not enough paragraph markers, adding now")
+                self.log.debug("not enough paragraph markers, adding now")
                 # markup using text processing
                 html = self.markup_pre(html)
 
@@ -421,7 +421,7 @@ class HeuristicProcessor(object):
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
-            self.log("deleting blank lines")
+            self.log.debug("deleting blank lines")
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
 
@@ -435,18 +435,18 @@ class HeuristicProcessor(object):
         # more of the lines break in the same region of the document then unwrapping is required
         docanalysis = DocAnalysis(format, html)
         hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+        self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))
 
         # Calculate Length
         unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
         length = docanalysis.line_length(unwrap_factor)
-        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+        self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")
             
         ###### Unwrap lines ######
         if getattr(self.extra_opts, 'unwrap_lines', False):
             # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
             if hardbreaks or unwrap_factor < 0.4:
-                self.log("Unwrapping required, unwrapping Lines")
+                self.log.debug("Unwrapping required, unwrapping Lines")
                 # Dehyphenate with line length limiters
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html', length)
@@ -457,14 +457,14 @@ class HeuristicProcessor(object):
 
         if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-            self.log("Fixing hyphenated content")
+            self.log.debug("Fixing hyphenated content")
             dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html_cleanup', length)
             html = dehyphenator(html, 'individual_words', length)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
-            self.log("Looking for more split points based on punctuation,"
+            self.log.debug("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)

From f8971944fb6b72836f61f8989861db06c3ce415a Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 23:22:17 +0800
Subject: [PATCH 045/118] made replace optional for users who just want
 equivalent of old feature, eliminate requirement to populate replace box

---
 src/calibre/ebooks/conversion/preprocess.py | 34 ++++++++-------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d1d275eb97..0ceed67bf9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -453,27 +453,19 @@ class HTMLPreProcessor(object):
         if not getattr(self.extra_opts, 'keep_ligatures', False):
             html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
 
-        if getattr(self.extra_opts, 'sr3_search', None):
-            try:
-                rules.insert(0,  (re.compile(self.extra_opts.sr3_search), self.extra_opts.sr3_replace))
-            except:
-                import traceback
-                print 'Failed to parse sr3-search regexp'
-                traceback.print_exc()
-        if getattr(self.extra_opts, 'sr2_search', None):
-            try:
-                rules.insert(0, (re.compile(self.extra_opts.sr2_search), self.extra_opts.sr2_replace))
-            except:
-                import traceback
-                print 'Failed to parse sr2-search regexp'
-                traceback.print_exc()
-        if getattr(self.extra_opts, 'sr1_search', None):
-            try:
-                rules.insert(0, (re.compile(self.extra_opts.sr1_search), self.extra_opts.sr1_replace))
-            except:
-                import traceback
-                print 'Failed to parse sr1-search regexp'
-                traceback.print_exc()
+        for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
+            replace_pattern = ''
+            if getattr(self.extra_opts, search, None):
+                search_pattern = getattr(self.extra_opts, search, None)
+                if getattr(self.extra_opts, replace, None):
+                    replace_pattern = getattr(self.extra_opts, replace, None)
+                try:
+                    rules.insert(0,  (re.compile(search_pattern), replace_pattern))
+                except:
+                    import traceback
+                    print 'Failed to parse sr3-search regexp'
+                    traceback.print_exc()
+
 
         end_rules = []
         # delete soft hyphens - moved here so it's executed after header/footer removal

From e131f99db8e5a80a68e4e28a152848effb7ece76 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 09:18:37 -0700
Subject: [PATCH 046/118] Cleanup layout of bulk metadata edit dialog

---
 src/calibre/gui2/dialogs/metadata_bulk.ui | 236 +++++++++++-----------
 1 file changed, 115 insertions(+), 121 deletions(-)

diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui
index 9240cd1af8..d52bc2cb89 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.ui
+++ b/src/calibre/gui2/dialogs/metadata_bulk.ui
@@ -75,13 +75,31 @@
              </property>
             </widget>
            </item>
-           <item row="1" column="1">
-            <widget class="QCheckBox" name="auto_author_sort">
-             <property name="text">
-              <string>A&amp;utomatically set author sort</string>
+           <item row="0" column="1">
+            <widget class="EnComboBox" name="authors">
+             <property name="editable">
+              <bool>true</bool>
              </property>
             </widget>
            </item>
+           <item row="1" column="1">
+            <layout class="QHBoxLayout" name="horizontalLayout_2">
+             <item>
+              <widget class="QCheckBox" name="auto_author_sort">
+               <property name="text">
+                <string>A&amp;utomatically set author sort</string>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QCheckBox" name="swap_title_and_author">
+               <property name="text">
+                <string>&amp;Swap title and author</string>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
            <item row="2" column="0">
             <widget class="QLabel" name="label_8">
              <property name="text">
@@ -95,7 +113,7 @@
              </property>
             </widget>
            </item>
-           <item row="2" column="1" colspan="2">
+           <item row="2" column="1">
             <widget class="EnLineEdit" name="author_sort">
              <property name="toolTip">
               <string>Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles.</string>
@@ -115,7 +133,7 @@
              </property>
             </widget>
            </item>
-           <item row="3" column="1" colspan="2">
+           <item row="3" column="1">
             <widget class="QSpinBox" name="rating">
              <property name="toolTip">
               <string>Rating of this book. 0-5 stars</string>
@@ -156,7 +174,7 @@
              </property>
             </widget>
            </item>
-           <item row="4" column="1" colspan="2">
+           <item row="4" column="1">
             <widget class="EnComboBox" name="publisher">
              <property name="editable">
               <bool>true</bool>
@@ -220,7 +238,7 @@
               <string>Check this box to remove all tags from the books.</string>
              </property>
              <property name="text">
-              <string>Remove all</string>
+              <string>Remove &amp;all</string>
              </property>
             </widget>
            </item>
@@ -241,52 +259,35 @@
             </widget>
            </item>
            <item row="7" column="1">
-            <layout class="QHBoxLayout" name="HLayout_34">
-             <item>
-              <widget class="EnComboBox" name="series">
-               <property name="toolTip">
-                <string>List of known series. You can add new series.</string>
-               </property>
-               <property name="whatsThis">
-                <string>List of known series. You can add new series.</string>
-               </property>
-               <property name="editable">
-                <bool>true</bool>
-               </property>
-               <property name="insertPolicy">
-                <enum>QComboBox::InsertAlphabetically</enum>
-               </property>
-               <property name="sizeAdjustPolicy">
-                <enum>QComboBox::AdjustToContents</enum>
-               </property>
-              </widget>
-             </item>
-             <item>
-              <widget class="QCheckBox" name="clear_series">
-               <property name="toolTip">
-                <string>If checked, the series will be cleared</string>
-               </property>
-               <property name="text">
-                <string>Clear series</string>
-               </property>
-              </widget>
-             </item>
-             <item>
-              <spacer name="HSpacer_344">
-               <property name="orientation">
-                <enum>Qt::Horizontal</enum>
-               </property>
-               <property name="sizeHint" stdset="0">
-                <size>
-                 <width>20</width>
-                 <height>0</height>
-                </size>
-               </property>
-              </spacer>
-             </item>
-            </layout>
+            <widget class="EnComboBox" name="series">
+             <property name="toolTip">
+              <string>List of known series. You can add new series.</string>
+             </property>
+             <property name="whatsThis">
+              <string>List of known series. You can add new series.</string>
+             </property>
+             <property name="editable">
+              <bool>true</bool>
+             </property>
+             <property name="insertPolicy">
+              <enum>QComboBox::InsertAlphabetically</enum>
+             </property>
+             <property name="sizeAdjustPolicy">
+              <enum>QComboBox::AdjustToContents</enum>
+             </property>
+            </widget>
            </item>
-           <item row="8" column="1" colspan="2">
+           <item row="7" column="2">
+            <widget class="QCheckBox" name="clear_series">
+             <property name="toolTip">
+              <string>If checked, the series will be cleared</string>
+             </property>
+             <property name="text">
+              <string>&amp;Clear series</string>
+             </property>
+            </widget>
+           </item>
+           <item row="8" column="1">
             <layout class="QHBoxLayout" name="HLayout_3">
              <item>
               <widget class="QCheckBox" name="autonumber_series">
@@ -297,7 +298,7 @@ you selected them. So if you selected Book A and then Book B,
 Book A will have series number 1 and Book B series number 2.</string>
                </property>
                <property name="text">
-                <string>Automatically number books in this series</string>
+                <string>&amp;Automatically number books in this series</string>
                </property>
               </widget>
              </item>
@@ -312,7 +313,7 @@ for that series. Checking this box will tell calibre to start numbering
 from the value in the box</string>
                </property>
                <property name="text">
-                <string>Force numbers to start with </string>
+                <string>&amp;Force numbers to start with:</string>
                </property>
               </widget>
              </item>
@@ -332,19 +333,6 @@ from the value in the box</string>
                </property>
               </widget>
              </item>
-             <item>
-              <spacer name="HSpacer_34">
-               <property name="orientation">
-                <enum>Qt::Horizontal</enum>
-               </property>
-               <property name="sizeHint" stdset="0">
-                <size>
-                 <width>20</width>
-                 <height>10</height>
-                </size>
-               </property>
-              </spacer>
-             </item>
             </layout>
            </item>
            <item row="9" column="0">
@@ -358,59 +346,56 @@ from the value in the box</string>
             </widget>
            </item>
            <item row="9" column="1">
-            <widget class="QComboBox" name="remove_format"/>
-           </item>
-           <item row="0" column="1">
-            <widget class="EnComboBox" name="authors">
-             <property name="editable">
-              <bool>true</bool>
-             </property>
-            </widget>
-           </item>
-           <item row="11" column="0" colspan="2">
-            <widget class="QCheckBox" name="swap_title_and_author">
-             <property name="text">
-              <string>&amp;Swap title and author</string>
-             </property>
-            </widget>
-           </item>
-           <item row="12" column="0" colspan="2">
-            <widget class="QCheckBox" name="change_title_to_title_case">
-             <property name="toolTip">
-              <string>Force the title to be in title case. If both this and swap authors are checked,
-title and author are swapped before the title case is set</string>
-             </property>
-             <property name="text">
-              <string>Change title to title case</string>
-             </property>
-            </widget>
-           </item>
-           <item row="10" column="0" colspan="2">
-            <widget class="QCheckBox" name="remove_conversion_settings">
-             <property name="toolTip">
-              <string>Remove stored conversion settings for the selected books.
-
-Future conversion of these books will use the default settings.</string>
-             </property>
-             <property name="text">
-              <string>Remove &amp;stored conversion settings for the selected books</string>
-             </property>
-            </widget>
-           </item>
-           <item row="14" column="0" colspan="3">
-            <spacer name="verticalSpacer_2">
-             <property name="orientation">
-              <enum>Qt::Vertical</enum>
-             </property>
-             <property name="sizeHint" stdset="0">
+            <widget class="QComboBox" name="remove_format">
+             <property name="maximumSize">
               <size>
-               <width>20</width>
-               <height>40</height>
+               <width>120</width>
+               <height>16777215</height>
               </size>
              </property>
-            </spacer>
+            </widget>
            </item>
-           <item row="13" column="0" colspan="3">
+           <item row="10" column="0" colspan="3">
+            <layout class="QHBoxLayout" name="horizontalLayout_3">
+             <item>
+              <widget class="QCheckBox" name="change_title_to_title_case">
+               <property name="toolTip">
+                <string>Force the title to be in title case. If both this and swap authors are checked,
+title and author are swapped before the title case is set</string>
+               </property>
+               <property name="text">
+                <string>Change title to title &amp;case</string>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <spacer name="horizontalSpacer">
+               <property name="orientation">
+                <enum>Qt::Horizontal</enum>
+               </property>
+               <property name="sizeHint" stdset="0">
+                <size>
+                 <width>40</width>
+                 <height>20</height>
+                </size>
+               </property>
+              </spacer>
+             </item>
+             <item>
+              <widget class="QCheckBox" name="remove_conversion_settings">
+               <property name="toolTip">
+                <string>Remove stored conversion settings for the selected books.
+
+Future conversion of these books will use the default settings.</string>
+               </property>
+               <property name="text">
+                <string>Remove &amp;stored conversion settings for the selected books</string>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
+           <item row="11" column="0" colspan="3">
             <widget class="QGroupBox" name="groupBox">
              <property name="title">
               <string>Change &amp;cover</string>
@@ -440,6 +425,19 @@ Future conversion of these books will use the default settings.</string>
              </layout>
             </widget>
            </item>
+           <item row="12" column="0">
+            <spacer name="verticalSpacer_2">
+             <property name="orientation">
+              <enum>Qt::Vertical</enum>
+             </property>
+             <property name="sizeHint" stdset="0">
+              <size>
+               <width>20</width>
+               <height>40</height>
+              </size>
+             </property>
+            </spacer>
+           </item>
           </layout>
          </widget>
          <widget class="QWidget" name="tab">
@@ -902,14 +900,10 @@ not multiple and the destination field is multiple</string>
   <tabstop>remove_tags</tabstop>
   <tabstop>remove_all_tags</tabstop>
   <tabstop>series</tabstop>
-  <tabstop>clear_series</tabstop>
   <tabstop>autonumber_series</tabstop>
   <tabstop>series_numbering_restarts</tabstop>
   <tabstop>series_start_number</tabstop>
   <tabstop>remove_format</tabstop>
-  <tabstop>remove_conversion_settings</tabstop>
-  <tabstop>swap_title_and_author</tabstop>
-  <tabstop>change_title_to_title_case</tabstop>
   <tabstop>button_box</tabstop>
   <tabstop>search_field</tabstop>
   <tabstop>search_mode</tabstop>

From 73e60f8c7e70893294f32f594401dcb6f19aacf0 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 11:20:11 -0500
Subject: [PATCH 047/118] Fix search and replace.

---
 src/calibre/ebooks/conversion/preprocess.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index b35a163044..d2bdba4928 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -458,7 +458,10 @@ class HTMLPreProcessor(object):
             if search_pattern:
                 try:
                     search_re = re.compile(search_pattern)
-                    rules.insert(0,  (search_re, getattr(self.extra_opts, replace, '')))
+                    replace_txt = getattr(self.extra_opts, replace, '')
+                    if replace_txt == None:
+                        replace_txt = ''
+                    rules.insert(0, (search_re, replace_txt))
                 except Exception as e:
                     self.log.error('Failed to parse %s regexp because %s' % (search, e))
 

From d18910510ba19700690bf596b39025d3ea0cebde Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 11:42:43 -0500
Subject: [PATCH 048/118] Fix Regex builder in search and replace.

---
 src/calibre/gui2/convert/search_and_replace.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index af944a74d1..c85e4fe414 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -25,8 +25,14 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
         self.opt_sr1_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr1_search.set_book_id(book_id)
+        self.opt_sr1_search.set_db(db)
         self.opt_sr2_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr2_search.set_book_id(book_id)
+        self.opt_sr2_search.set_db(db)
         self.opt_sr3_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr3_search.set_book_id(book_id)
+        self.opt_sr3_search.set_db(db)
         
     def break_cycles(self):
         Widget.break_cycles(self)

From b83b89ce7437a8e0b7547cf6ac8a27112ee243ae Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 10:09:14 -0700
Subject: [PATCH 049/118] Fix #7568 (Allow bulk editing of Published date)

---
 src/calibre/gui2/dialogs/metadata_bulk.py   |  31 ++-
 src/calibre/gui2/dialogs/metadata_bulk.ui   | 108 ++++++--
 src/calibre/gui2/dialogs/metadata_single.py |  13 +-
 src/calibre/gui2/dialogs/metadata_single.ui | 287 ++++++++++----------
 4 files changed, 274 insertions(+), 165 deletions(-)

diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py
index 5ea8f00148..da6e92c26a 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.py
+++ b/src/calibre/gui2/dialogs/metadata_bulk.py
@@ -15,15 +15,16 @@ from calibre.ebooks.metadata import string_to_authors, authors_to_string
 from calibre.ebooks.metadata.book.base import composite_formatter
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.gui2.custom_column_widgets import populate_metadata_page
-from calibre.gui2 import error_dialog, ResizableDialog
+from calibre.gui2 import error_dialog, ResizableDialog, UNDEFINED_QDATE
 from calibre.gui2.progress_indicator import ProgressIndicator
 from calibre.utils.config import dynamic
 from calibre.utils.titlecase import titlecase
 from calibre.utils.icu import sort_key, capitalize
-from calibre.utils.config import prefs
+from calibre.utils.config import prefs, tweaks
 from calibre.utils.magick.draw import identify_data
+from calibre.utils.date import qt_to_dt
 
-def get_cover_data(path):
+def get_cover_data(path): # {{{
     old = prefs['read_file_metadata']
     if not old:
         prefs['read_file_metadata'] = True
@@ -46,7 +47,7 @@ def get_cover_data(path):
         prefs['read_file_metadata'] = old
 
     return cdata, area
-
+# }}}
 
 
 class MyBlockingBusy(QDialog): # {{{
@@ -132,7 +133,8 @@ class MyBlockingBusy(QDialog): # {{{
         remove_all, remove, add, au, aus, do_aus, rating, pub, do_series, \
             do_autonumber, do_remove_format, remove_format, do_swap_ta, \
             do_remove_conv, do_auto_author, series, do_series_restart, \
-            series_start_value, do_title_case, cover_action, clear_series = self.args
+            series_start_value, do_title_case, cover_action, clear_series, \
+            pubdate = self.args
 
 
         # first loop: do author and title. These will commit at the end of each
@@ -209,6 +211,9 @@ class MyBlockingBusy(QDialog): # {{{
             if clear_series:
                 self.db.set_series(id, '', notify=False, commit=False)
 
+            if pubdate is not None:
+                self.db.set_pubdate(id, pubdate, notify=False, commit=False)
+
             if do_series:
                 if do_series_restart:
                     if self.series_start_value is None:
@@ -288,6 +293,12 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         self.series.editTextChanged.connect(self.series_changed)
         self.tag_editor_button.clicked.connect(self.tag_editor)
         self.autonumber_series.stateChanged[int].connect(self.auto_number_changed)
+        self.pubdate.setMinimumDate(UNDEFINED_QDATE)
+        pubdate_format = tweaks['gui_pubdate_display_format']
+        if pubdate_format is not None:
+            self.pubdate.setDisplayFormat(pubdate_format)
+        self.pubdate.setSpecialValueText(_('Undefined'))
+        self.clear_pubdate_button.clicked.connect(self.clear_pubdate)
 
         if len(self.db.custom_field_keys(include_composites=False)) == 0:
             self.central_widget.removeTab(1)
@@ -304,6 +315,9 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         self.central_widget.setCurrentIndex(tab)
         self.exec_()
 
+    def clear_pubdate(self, *args):
+        self.pubdate.setDate(UNDEFINED_QDATE)
+
     def button_clicked(self, which):
         if which == self.button_box.button(QDialogButtonBox.Apply):
             self.do_again = True
@@ -783,6 +797,10 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         do_remove_conv = self.remove_conversion_settings.isChecked()
         do_auto_author = self.auto_author_sort.isChecked()
         do_title_case = self.change_title_to_title_case.isChecked()
+        pubdate = None
+        if self.apply_pubdate.isChecked():
+            pubdate = qt_to_dt(self.pubdate.date())
+
         cover_action = None
         if self.cover_remove.isChecked():
             cover_action = 'remove'
@@ -794,7 +812,8 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         args = (remove_all, remove, add, au, aus, do_aus, rating, pub, do_series,
                 do_autonumber, do_remove_format, remove_format, do_swap_ta,
                 do_remove_conv, do_auto_author, series, do_series_restart,
-                series_start_value, do_title_case, cover_action, clear_series)
+                series_start_value, do_title_case, cover_action, clear_series,
+                pubdate)
 
         bb = MyBlockingBusy(_('Applying changes to %d books.\nPhase {0} {1}%%.')
                 %len(self.ids), args, self.db, self.ids,
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui
index d52bc2cb89..b14c31c9d1 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.ui
+++ b/src/calibre/gui2/dialogs/metadata_bulk.ui
@@ -220,6 +220,9 @@
              <property name="text">
               <string>&amp;Remove tags:</string>
              </property>
+             <property name="alignment">
+              <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
+             </property>
              <property name="buddy">
               <cstring>remove_tags</cstring>
              </property>
@@ -260,6 +263,12 @@
            </item>
            <item row="7" column="1">
             <widget class="EnComboBox" name="series">
+             <property name="sizePolicy">
+              <sizepolicy hsizetype="Preferred" vsizetype="Fixed">
+               <horstretch>0</horstretch>
+               <verstretch>0</verstretch>
+              </sizepolicy>
+             </property>
              <property name="toolTip">
               <string>List of known series. You can add new series.</string>
              </property>
@@ -273,7 +282,10 @@
               <enum>QComboBox::InsertAlphabetically</enum>
              </property>
              <property name="sizeAdjustPolicy">
-              <enum>QComboBox::AdjustToContents</enum>
+              <enum>QComboBox::AdjustToMinimumContentsLengthWithIcon</enum>
+             </property>
+             <property name="minimumContentsLength">
+              <number>40</number>
              </property>
             </widget>
            </item>
@@ -335,27 +347,52 @@ from the value in the box</string>
              </item>
             </layout>
            </item>
-           <item row="9" column="0">
-            <widget class="QLabel" name="label_5">
+           <item row="10" column="0">
+            <widget class="QLabel" name="label_9">
              <property name="text">
-              <string>Remove &amp;format:</string>
+              <string>&amp;Published:</string>
+             </property>
+             <property name="alignment">
+              <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
              </property>
              <property name="buddy">
-              <cstring>remove_format</cstring>
+              <cstring>pubdate</cstring>
              </property>
             </widget>
            </item>
-           <item row="9" column="1">
-            <widget class="QComboBox" name="remove_format">
-             <property name="maximumSize">
-              <size>
-               <width>120</width>
-               <height>16777215</height>
-              </size>
+           <item row="10" column="1">
+            <layout class="QHBoxLayout" name="horizontalLayout_4">
+             <item>
+              <widget class="QDateEdit" name="pubdate">
+               <property name="displayFormat">
+                <string>MMM yyyy</string>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QToolButton" name="clear_pubdate_button">
+               <property name="toolTip">
+                <string>Clear published date</string>
+               </property>
+               <property name="text">
+                <string>...</string>
+               </property>
+               <property name="icon">
+                <iconset resource="../../../../resources/images.qrc">
+                 <normaloff>:/images/trash.png</normaloff>:/images/trash.png</iconset>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
+           <item row="10" column="2">
+            <widget class="QCheckBox" name="apply_pubdate">
+             <property name="text">
+              <string>&amp;Apply date</string>
              </property>
             </widget>
            </item>
-           <item row="10" column="0" colspan="3">
+           <item row="13" column="0" colspan="3">
             <layout class="QHBoxLayout" name="horizontalLayout_3">
              <item>
               <widget class="QCheckBox" name="change_title_to_title_case">
@@ -395,7 +432,7 @@ Future conversion of these books will use the default settings.</string>
              </item>
             </layout>
            </item>
-           <item row="11" column="0" colspan="3">
+           <item row="14" column="0" colspan="3">
             <widget class="QGroupBox" name="groupBox">
              <property name="title">
               <string>Change &amp;cover</string>
@@ -425,7 +462,7 @@ Future conversion of these books will use the default settings.</string>
              </layout>
             </widget>
            </item>
-           <item row="12" column="0">
+           <item row="15" column="0">
             <spacer name="verticalSpacer_2">
              <property name="orientation">
               <enum>Qt::Vertical</enum>
@@ -438,6 +475,42 @@ Future conversion of these books will use the default settings.</string>
              </property>
             </spacer>
            </item>
+           <item row="12" column="0">
+            <spacer name="verticalSpacer">
+             <property name="orientation">
+              <enum>Qt::Vertical</enum>
+             </property>
+             <property name="sizeType">
+              <enum>QSizePolicy::Fixed</enum>
+             </property>
+             <property name="sizeHint" stdset="0">
+              <size>
+               <width>20</width>
+               <height>15</height>
+              </size>
+             </property>
+            </spacer>
+           </item>
+           <item row="11" column="0">
+            <widget class="QLabel" name="label_5">
+             <property name="text">
+              <string>Remove &amp;format:</string>
+             </property>
+             <property name="buddy">
+              <cstring>remove_format</cstring>
+             </property>
+            </widget>
+           </item>
+           <item row="11" column="1">
+            <widget class="QComboBox" name="remove_format">
+             <property name="maximumSize">
+              <size>
+               <width>120</width>
+               <height>16777215</height>
+              </size>
+             </property>
+            </widget>
+           </item>
           </layout>
          </widget>
          <widget class="QWidget" name="tab">
@@ -798,8 +871,8 @@ not multiple and the destination field is multiple</string>
                <rect>
                 <x>0</x>
                 <y>0</y>
-                <width>197</width>
-                <height>60</height>
+                <width>826</width>
+                <height>313</height>
                </rect>
               </property>
               <layout class="QGridLayout" name="testgrid">
@@ -903,7 +976,6 @@ not multiple and the destination field is multiple</string>
   <tabstop>autonumber_series</tabstop>
   <tabstop>series_numbering_restarts</tabstop>
   <tabstop>series_start_number</tabstop>
-  <tabstop>remove_format</tabstop>
   <tabstop>button_box</tabstop>
   <tabstop>search_field</tabstop>
   <tabstop>search_mode</tabstop>
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index ede605343b..e4efdf0470 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -16,7 +16,7 @@ from PyQt4.Qt import SIGNAL, QObject, Qt, QTimer, QDate, \
 
 from calibre.gui2 import error_dialog, file_icon_provider, dynamic, \
                            choose_files, choose_images, ResizableDialog, \
-                           warning_dialog, question_dialog
+                           warning_dialog, question_dialog, UNDEFINED_QDATE
 from calibre.gui2.dialogs.metadata_single_ui import Ui_MetadataSingleDialog
 from calibre.gui2.dialogs.fetch_metadata import FetchMetadata
 from calibre.gui2.dialogs.tag_editor import TagEditor
@@ -491,11 +491,15 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         self.formats.setAcceptDrops(True)
         self.cover_changed = False
         self.cpixmap = None
-        self.pubdate.setMinimumDate(QDate(100,1,1))
+        self.pubdate.setMinimumDate(UNDEFINED_QDATE)
         pubdate_format = tweaks['gui_pubdate_display_format']
         if pubdate_format is not None:
             self.pubdate.setDisplayFormat(pubdate_format)
-        self.date.setMinimumDate(QDate(100,1,1))
+        self.date.setMinimumDate(UNDEFINED_QDATE)
+        self.pubdate.setSpecialValueText(_('Undefined'))
+        self.date.setSpecialValueText(_('Undefined'))
+        self.clear_pubdate_button.clicked.connect(self.clear_pubdate)
+
 
         self.connect(self.cover, SIGNAL('cover_changed(PyQt_PyObject)'), self.cover_dropped)
         QObject.connect(self.cover_button, SIGNAL("clicked(bool)"), \
@@ -615,6 +619,9 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
 
         self.show()
 
+    def clear_pubdate(self, *args):
+        self.pubdate.setDate(UNDEFINED_QDATE)
+
     def create_custom_column_editors(self):
         w = self.central_widget.widget(1)
         layout = w.layout()
diff --git a/src/calibre/gui2/dialogs/metadata_single.ui b/src/calibre/gui2/dialogs/metadata_single.ui
index 6d31342dcf..60c221be1a 100644
--- a/src/calibre/gui2/dialogs/metadata_single.ui
+++ b/src/calibre/gui2/dialogs/metadata_single.ui
@@ -100,6 +100,112 @@
                     </property>
                    </widget>
                   </item>
+                  <item row="0" column="2" rowspan="4">
+                   <layout class="QVBoxLayout" name="verticalLayout_7">
+                    <item>
+                     <spacer name="verticalSpacer_3">
+                      <property name="orientation">
+                       <enum>Qt::Vertical</enum>
+                      </property>
+                      <property name="sizeHint" stdset="0">
+                       <size>
+                        <width>20</width>
+                        <height>40</height>
+                       </size>
+                      </property>
+                     </spacer>
+                    </item>
+                    <item>
+                     <widget class="QToolButton" name="auto_title_sort">
+                      <property name="toolTip">
+                       <string>Automatically create the title sort entry based on the current title entry.
+Using this button to create title sort will change title sort from red to green.</string>
+                      </property>
+                      <property name="text">
+                       <string>...</string>
+                      </property>
+                      <property name="icon">
+                       <iconset resource="../../../../resources/images.qrc">
+                        <normaloff>:/images/auto_author_sort.png</normaloff>:/images/auto_author_sort.png</iconset>
+                      </property>
+                     </widget>
+                    </item>
+                    <item>
+                     <spacer name="verticalSpacer">
+                      <property name="orientation">
+                       <enum>Qt::Vertical</enum>
+                      </property>
+                      <property name="sizeHint" stdset="0">
+                       <size>
+                        <width>20</width>
+                        <height>40</height>
+                       </size>
+                      </property>
+                     </spacer>
+                    </item>
+                    <item>
+                     <widget class="QToolButton" name="swap_button">
+                      <property name="toolTip">
+                       <string>Swap the author and title</string>
+                      </property>
+                      <property name="text">
+                       <string>...</string>
+                      </property>
+                      <property name="icon">
+                       <iconset resource="../../../../resources/images.qrc">
+                        <normaloff>:/images/swap.png</normaloff>:/images/swap.png</iconset>
+                      </property>
+                      <property name="iconSize">
+                       <size>
+                        <width>16</width>
+                        <height>16</height>
+                       </size>
+                      </property>
+                     </widget>
+                    </item>
+                    <item>
+                     <spacer name="verticalSpacer_2">
+                      <property name="orientation">
+                       <enum>Qt::Vertical</enum>
+                      </property>
+                      <property name="sizeHint" stdset="0">
+                       <size>
+                        <width>20</width>
+                        <height>40</height>
+                       </size>
+                      </property>
+                     </spacer>
+                    </item>
+                    <item>
+                     <widget class="QToolButton" name="auto_author_sort">
+                      <property name="toolTip">
+                       <string>Automatically create the author sort entry based on the current author entry.
+Using this button to create author sort will change author sort from red to green.</string>
+                      </property>
+                      <property name="text">
+                       <string>...</string>
+                      </property>
+                      <property name="icon">
+                       <iconset resource="../../../../resources/images.qrc">
+                        <normaloff>:/images/auto_author_sort.png</normaloff>:/images/auto_author_sort.png</iconset>
+                      </property>
+                     </widget>
+                    </item>
+                    <item>
+                     <spacer name="verticalSpacer_4">
+                      <property name="orientation">
+                       <enum>Qt::Vertical</enum>
+                      </property>
+                      <property name="sizeHint" stdset="0">
+                       <size>
+                        <width>20</width>
+                        <height>40</height>
+                       </size>
+                      </property>
+                     </spacer>
+                    </item>
+                   </layout>
+                  </item>
                   <item row="1" column="0">
                    <widget class="QLabel" name="label">
                     <property name="text">
@@ -226,6 +332,31 @@ If the box is colored green, then text matches the individual author's sort stri
                     </property>
                    </widget>
                   </item>
+                  <item row="6" column="1">
+                   <layout class="QHBoxLayout" name="_2">
+                    <item>
+                     <widget class="TagsLineEdit" name="tags">
+                      <property name="toolTip">
+                       <string>Tags categorize the book. This is particularly useful while searching. &lt;br&gt;&lt;br&gt;They can be any words or phrases, separated by commas.</string>
+                      </property>
+                     </widget>
+                    </item>
+                   </layout>
+                  </item>
+                  <item row="6" column="2">
+                   <widget class="QToolButton" name="tag_editor_button">
+                    <property name="toolTip">
+                     <string>Open Tag Editor</string>
+                    </property>
+                    <property name="text">
+                     <string>Open Tag Editor</string>
+                    </property>
+                    <property name="icon">
+                     <iconset resource="../../../../resources/images.qrc">
+                      <normaloff>:/images/chapters.png</normaloff>:/images/chapters.png</iconset>
+                    </property>
+                   </widget>
+                  </item>
                   <item row="7" column="0">
                    <widget class="QLabel" name="label_7">
                     <property name="text">
@@ -265,6 +396,20 @@ If the box is colored green, then text matches the individual author's sort stri
                     </item>
                    </layout>
                   </item>
+                  <item row="7" column="2">
+                   <widget class="QToolButton" name="remove_series_button">
+                    <property name="toolTip">
+                     <string>Remove unused series (Series that have no books)</string>
+                    </property>
+                    <property name="text">
+                     <string>...</string>
+                    </property>
+                    <property name="icon">
+                     <iconset resource="../../../../resources/images.qrc">
+                      <normaloff>:/images/trash.png</normaloff>:/images/trash.png</iconset>
+                    </property>
+                   </widget>
+                  </item>
                   <item row="8" column="1" colspan="2">
                    <widget class="QDoubleSpinBox" name="series_index">
                     <property name="enabled">
@@ -330,7 +475,7 @@ If the box is colored green, then text matches the individual author's sort stri
                     </property>
                    </widget>
                   </item>
-                  <item row="11" column="1" colspan="2">
+                  <item row="11" column="1">
                    <widget class="QDateEdit" name="pubdate">
                     <property name="displayFormat">
                      <string>MMM yyyy</string>
@@ -340,144 +485,10 @@ If the box is colored green, then text matches the individual author's sort stri
                     </property>
                    </widget>
                   </item>
-                  <item row="0" column="2" rowspan="4">
-                   <layout class="QVBoxLayout" name="verticalLayout_7">
-                    <item>
-                     <spacer name="verticalSpacer_3">
-                      <property name="orientation">
-                       <enum>Qt::Vertical</enum>
-                      </property>
-                      <property name="sizeHint" stdset="0">
-                       <size>
-                        <width>20</width>
-                        <height>40</height>
-                       </size>
-                      </property>
-                     </spacer>
-                    </item>
-                    <item>
-                     <widget class="QToolButton" name="auto_title_sort">
-                      <property name="toolTip">
-                       <string>Automatically create the title sort entry based on the current title entry.
-Using this button to create title sort will change title sort from red to green.</string>
-                      </property>
-                      <property name="text">
-                       <string>...</string>
-                      </property>
-                      <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
-                        <normaloff>:/images/auto_author_sort.png</normaloff>:/images/auto_author_sort.png</iconset>
-                      </property>
-                     </widget>
-                    </item>
-                    <item>
-                     <spacer name="verticalSpacer">
-                      <property name="orientation">
-                       <enum>Qt::Vertical</enum>
-                      </property>
-                      <property name="sizeHint" stdset="0">
-                       <size>
-                        <width>20</width>
-                        <height>40</height>
-                       </size>
-                      </property>
-                     </spacer>
-                    </item>
-                    <item>
-                     <widget class="QToolButton" name="swap_button">
-                      <property name="toolTip">
-                       <string>Swap the author and title</string>
-                      </property>
-                      <property name="text">
-                       <string>...</string>
-                      </property>
-                      <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
-                        <normaloff>:/images/swap.png</normaloff>:/images/swap.png</iconset>
-                      </property>
-                      <property name="iconSize">
-                       <size>
-                        <width>16</width>
-                        <height>16</height>
-                       </size>
-                      </property>
-                     </widget>
-                    </item>
-                    <item>
-                     <spacer name="verticalSpacer_2">
-                      <property name="orientation">
-                       <enum>Qt::Vertical</enum>
-                      </property>
-                      <property name="sizeHint" stdset="0">
-                       <size>
-                        <width>20</width>
-                        <height>40</height>
-                       </size>
-                      </property>
-                     </spacer>
-                    </item>
-                    <item>
-                     <widget class="QToolButton" name="auto_author_sort">
-                      <property name="toolTip">
-                       <string>Automatically create the author sort entry based on the current author entry.
-Using this button to create author sort will change author sort from red to green.</string>
-                      </property>
-                      <property name="text">
-                       <string>...</string>
-                      </property>
-                      <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
-                        <normaloff>:/images/auto_author_sort.png</normaloff>:/images/auto_author_sort.png</iconset>
-                      </property>
-                     </widget>
-                    </item>
-                    <item>
-                     <spacer name="verticalSpacer_4">
-                      <property name="orientation">
-                       <enum>Qt::Vertical</enum>
-                      </property>
-                      <property name="sizeHint" stdset="0">
-                       <size>
-                        <width>20</width>
-                        <height>40</height>
-                       </size>
-                      </property>
-                     </spacer>
-                    </item>
-                   </layout>
-                  </item>
-                  <item row="6" column="1">
-                   <layout class="QHBoxLayout" name="_2">
-                    <item>
-                     <widget class="TagsLineEdit" name="tags">
-                      <property name="toolTip">
-                       <string>Tags categorize the book. This is particularly useful while searching. &lt;br&gt;&lt;br&gt;They can be any words or phrases, separated by commas.</string>
-                      </property>
-                     </widget>
-                    </item>
-                   </layout>
-                  </item>
-                  <item row="6" column="2">
-                   <widget class="QToolButton" name="tag_editor_button">
+                  <item row="11" column="2">
+                   <widget class="QToolButton" name="clear_pubdate_button">
                     <property name="toolTip">
-                     <string>Open Tag Editor</string>
-                    </property>
-                    <property name="text">
-                     <string>Open Tag Editor</string>
-                    </property>
-                    <property name="icon">
-                     <iconset resource="../../../../resources/images.qrc">
-                      <normaloff>:/images/chapters.png</normaloff>:/images/chapters.png</iconset>
-                    </property>
-                   </widget>
-                  </item>
-                  <item row="7" column="2">
-                   <widget class="QToolButton" name="remove_series_button">
-                    <property name="toolTip">
-                     <string>Remove unused series (Series that have no books)</string>
-                    </property>
-                    <property name="text">
-                     <string>...</string>
+                     <string>Clear published date</string>
                     </property>
                     <property name="icon">
                      <iconset resource="../../../../resources/images.qrc">

From 4e5d5bbce0ed89e1ff33836ac2222243fbb0cb24 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 11:45:18 -0700
Subject: [PATCH 050/118] MOBI Input: Special case handling of empty div tags
 with a defined height used as paragraph separators. Fixes #8391 (formatting
 issues when converting from .azw to .mobi. Not duplicating space between
 paragraphs.)

---
 src/calibre/ebooks/mobi/reader.py | 12 +++++++++++-
 src/calibre/manual/faq.rst        |  5 +++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index e07418f41c..2f397006a1 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -542,7 +542,17 @@ class MobiReader(object):
                         elif tag.tag == 'img':
                             tag.set('height', height)
                         else:
-                            styles.append('margin-top: %s' % self.ensure_unit(height))
+                            if tag.tag == 'div' and not tag.text and \
+                                    (not tag.tail or not tag.tail.strip()) and \
+                                    not len(list(tag.iterdescendants())):
+                                # Paragraph spacer
+                                # Insert nbsp so that the element is never
+                                # discarded by a renderer
+                                tag.text = u'\u00a0' # nbsp
+                                styles.append('height: %s' %
+                                        self.ensure_unit(height))
+                            else:
+                                styles.append('margin-top: %s' % self.ensure_unit(height))
             if attrib.has_key('width'):
                 width = attrib.pop('width').strip()
                 if width and re.search(r'\d+', width):
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index 0e8c101620..ee72bf6fdb 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -450,6 +450,11 @@ How do I use purchased EPUB books with |app|?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Most purchased EPUB books have `DRM <http://wiki.mobileread.com/wiki/DRM>`_. This prevents |app| from opening them. You can still use |app| to store and transfer them to your e-book reader. First, you must authorize your reader on a windows machine with Adobe Digital Editions. Once this is done, EPUB books transferred with |app| will work fine on your reader. When you purchase an epub book from a website, you will get an ".acsm" file. This file should be opened with Adobe Digital Editions, which will then download the actual ".epub" e-book. The e-book file will be stored in the folder "My Digital Editions", from where you can add it to |app|.
 
+I am getting a "Permission Denied" error?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A permission denied error can occur because of many possible reasons, none of them having anything to do with |app|. You can get permission denied errors if you are using an SD card with write protect enabled. Or if you, or some program you used changed the file permissions of the files in question to read only. Or if there is a filesystem error on the device which caused your operating system to mount the filesystem in read only mode or mark a particular file as read only pending recovery. Or if the files have their owner set to a user other than you. You will need to fix the underlying cause of the permissions error before resuming to use |app|. Read the error message carefully, see what file it points to and fix the permissions on that file.
+
 Can I have the comment metadata show up on my reader?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

From 91ccd9ad749a0ef88fff14971312c77d1f3838ae Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 12:31:19 -0700
Subject: [PATCH 051/118] Add AZW to the default list of internally viewed
 formats

---
 src/calibre/gui2/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py
index 6a9becee50..c94b99f141 100644
--- a/src/calibre/gui2/__init__.py
+++ b/src/calibre/gui2/__init__.py
@@ -85,7 +85,7 @@ def _config():
     c.add_opt('LRF_ebook_viewer_options', default=None,
               help=_('Options for the LRF ebook viewer'))
     c.add_opt('internally_viewed_formats', default=['LRF', 'EPUB', 'LIT',
-        'MOBI', 'PRC', 'HTML', 'FB2', 'PDB', 'RB', 'SNB'],
+        'MOBI', 'PRC', 'AZW', 'HTML', 'FB2', 'PDB', 'RB', 'SNB'],
               help=_('Formats that are viewed using the internal viewer'))
     c.add_opt('column_map', default=ALL_COLUMNS,
               help=_('Columns to be displayed in the book list'))

From 4d428dfa9a90023876413cab30f6d38fefdac620 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 12:34:49 -0700
Subject: [PATCH 052/118] Bulk metadata edit: Check apply date automatically
 whenever user changes the date in the pubdate field

---
 src/calibre/gui2/dialogs/metadata_bulk.py | 4 ++++
 src/calibre/gui2/dialogs/metadata_bulk.ui | 7 +++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py
index da6e92c26a..302766a92d 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.py
+++ b/src/calibre/gui2/dialogs/metadata_bulk.py
@@ -299,6 +299,7 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
             self.pubdate.setDisplayFormat(pubdate_format)
         self.pubdate.setSpecialValueText(_('Undefined'))
         self.clear_pubdate_button.clicked.connect(self.clear_pubdate)
+        self.pubdate.dateChanged.connect(self.do_apply_pubdate)
 
         if len(self.db.custom_field_keys(include_composites=False)) == 0:
             self.central_widget.removeTab(1)
@@ -315,6 +316,9 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         self.central_widget.setCurrentIndex(tab)
         self.exec_()
 
+    def do_apply_pubdate(self, *args):
+        self.apply_pubdate.setChecked(True)
+
     def clear_pubdate(self, *args):
         self.pubdate.setDate(UNDEFINED_QDATE)
 
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui
index b14c31c9d1..5690a8e555 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.ui
+++ b/src/calibre/gui2/dialogs/metadata_bulk.ui
@@ -367,6 +367,9 @@ from the value in the box</string>
                <property name="displayFormat">
                 <string>MMM yyyy</string>
                </property>
+               <property name="calendarPopup">
+                <bool>true</bool>
+               </property>
               </widget>
              </item>
              <item>
@@ -871,8 +874,8 @@ not multiple and the destination field is multiple</string>
                <rect>
                 <x>0</x>
                 <y>0</y>
-                <width>826</width>
-                <height>313</height>
+                <width>197</width>
+                <height>60</height>
                </rect>
               </property>
               <layout class="QGridLayout" name="testgrid">

From 497b381bfa8ce1d5c8de65da03cc31b2b109ed10 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 12:45:11 -0700
Subject: [PATCH 053/118] The update found link now opens the update
 notification dialog instead of going straight to the download page

---
 src/calibre/gui2/init.py   | 14 +++++++++++---
 src/calibre/gui2/update.py |  9 ++++-----
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/calibre/gui2/init.py b/src/calibre/gui2/init.py
index fc70f0579d..95af265856 100644
--- a/src/calibre/gui2/init.py
+++ b/src/calibre/gui2/init.py
@@ -148,7 +148,6 @@ class StatusBar(QStatusBar): # {{{
                 self.get_version() + ' ' + _('created by Kovid Goyal')
         self.device_string = ''
         self.update_label = QLabel('')
-        self.update_label.setOpenExternalLinks(True)
         self.addPermanentWidget(self.update_label)
         self.update_label.setVisible(False)
         self._font = QFont()
@@ -174,8 +173,9 @@ class StatusBar(QStatusBar): # {{{
         self.clearMessage()
 
     def new_version_available(self, ver, url):
-        msg = (u'<span style="color:red; font-weight: bold">%s: <a href="%s">%s<a></span>') % (
-                _('Update found'), url, ver)
+        msg = (u'<span style="color:red; font-weight: bold">%s: <a'
+               ' href="update:%s">%s<a></span>') % (
+                _('Update found'), ver, ver)
         self.update_label.setText(msg)
         self.update_label.setCursor(Qt.PointingHandCursor)
         self.update_label.setVisible(True)
@@ -240,6 +240,14 @@ class LayoutMixin(object): # {{{
             self.status_bar.addPermanentWidget(button)
         self.status_bar.addPermanentWidget(self.jobs_button)
         self.setStatusBar(self.status_bar)
+        self.status_bar.update_label.linkActivated.connect(self.update_link_clicked)
+
+    def update_link_clicked(self, url):
+        print 11111111, url
+        url = unicode(url)
+        if url.startswith('update:'):
+            version = url.partition(':')[-1]
+            self.update_found(version, force=True)
 
     def finalize_layout(self):
         self.status_bar.initialize(self.system_tray_icon)
diff --git a/src/calibre/gui2/update.py b/src/calibre/gui2/update.py
index 30cfe8f5e4..9929d50a7e 100644
--- a/src/calibre/gui2/update.py
+++ b/src/calibre/gui2/update.py
@@ -52,8 +52,7 @@ class UpdateNotification(QDialog):
         self.label = QLabel('<p>'+
             _('%s has been updated to version <b>%s</b>. '
             'See the <a href="http://calibre-ebook.com/whats-new'
-            '">new features</a>. Visit the download pa'
-            'ge?')%(__appname__, version))
+            '">new features</a>.')%(__appname__, version))
         self.label.setOpenExternalLinks(True)
         self.label.setWordWrap(True)
         self.setWindowTitle(_('Update available!'))
@@ -94,13 +93,13 @@ class UpdateMixin(object):
                     type=Qt.QueuedConnection)
             self.update_checker.start()
 
-    def update_found(self, version):
+    def update_found(self, version, force=False):
         os = 'windows' if iswindows else 'osx' if isosx else 'linux'
         url = 'http://calibre-ebook.com/download_%s'%os
         self.status_bar.new_version_available(version, url)
 
-        if config.get('new_version_notification') and \
-                dynamic.get('update to version %s'%version, True):
+        if force or (config.get('new_version_notification') and \
+                dynamic.get('update to version %s'%version, True)):
             self._update_notification__ = UpdateNotification(version,
                     parent=self)
             self._update_notification__.show()

From 69c6ad02110d780f131aa41bf92317b8d82838fe Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 12:49:46 -0700
Subject: [PATCH 054/118] ...

---
 src/calibre/gui2/init.py          | 1 -
 src/calibre/manual/conversion.rst | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/init.py b/src/calibre/gui2/init.py
index 95af265856..ebd670c8fa 100644
--- a/src/calibre/gui2/init.py
+++ b/src/calibre/gui2/init.py
@@ -243,7 +243,6 @@ class LayoutMixin(object): # {{{
         self.status_bar.update_label.linkActivated.connect(self.update_link_clicked)
 
     def update_link_clicked(self, url):
-        print 11111111, url
         url = unicode(url)
         if url.startswith('update:'):
             version = url.partition(':')[-1]
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 4b2b169d72..71639ca749 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -547,6 +547,7 @@ Some limitations of PDF input are:
     * Extraction of vector images and tables from within the document is also not supported.
     * Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
     * Some PDFs store their images upside down with a rotation instruction, |app| currently doesn't support that instruction, so the images will be rotated in the output as well. 
+    * Links and Tables of Contents are not supported
 
 To re-iterate **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an
 output ranging anywhere from decent to unusable, depending on the input PDF.

From 188a96caeb39966d0cc5e9fa1bd91e0ef6a77ac4 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 16:40:27 -0500
Subject: [PATCH 055/118] Make TagsLineEdit into a generic CompleteLineEdit
 class. Use CompleteLineEdit for Author completion in the main GUI window.

---
 src/calibre/gui2/convert/metadata.py        |  2 +-
 src/calibre/gui2/convert/metadata.ui        |  4 +-
 src/calibre/gui2/custom_column_widgets.py   | 12 ++--
 src/calibre/gui2/dialogs/metadata_bulk.py   |  8 +--
 src/calibre/gui2/dialogs/metadata_bulk.ui   |  6 +-
 src/calibre/gui2/dialogs/metadata_single.py |  4 +-
 src/calibre/gui2/dialogs/metadata_single.ui |  4 +-
 src/calibre/gui2/dialogs/search.py          |  2 +-
 src/calibre/gui2/dialogs/search.ui          |  4 +-
 src/calibre/gui2/library/delegates.py       | 54 +++++++++++++++++-
 src/calibre/gui2/library/views.py           |  9 ++-
 src/calibre/gui2/widgets.py                 | 62 ++++++++++++---------
 src/calibre/library/database.py             |  3 +
 13 files changed, 118 insertions(+), 56 deletions(-)

diff --git a/src/calibre/gui2/convert/metadata.py b/src/calibre/gui2/convert/metadata.py
index d3744bb614..de03033060 100644
--- a/src/calibre/gui2/convert/metadata.py
+++ b/src/calibre/gui2/convert/metadata.py
@@ -75,7 +75,7 @@ class MetadataWidget(Widget, Ui_Form):
             self.publisher.setCurrentIndex(self.publisher.findText(mi.publisher))
         self.author_sort.setText(mi.author_sort if mi.author_sort else '')
         self.tags.setText(', '.join(mi.tags if mi.tags else []))
-        self.tags.update_tags_cache(self.db.all_tags())
+        self.tags.update_items_cache(self.db.all_tags())
         self.comment.setPlainText(mi.comments if mi.comments else '')
         if mi.series:
             self.series.setCurrentIndex(self.series.findText(mi.series))
diff --git a/src/calibre/gui2/convert/metadata.ui b/src/calibre/gui2/convert/metadata.ui
index a594f47b5d..5735193424 100644
--- a/src/calibre/gui2/convert/metadata.ui
+++ b/src/calibre/gui2/convert/metadata.ui
@@ -190,7 +190,7 @@
         </widget>
        </item>
        <item row="4" column="1">
-        <widget class="TagsLineEdit" name="tags">
+        <widget class="CompleteLineEdit" name="tags">
          <property name="toolTip">
           <string>Tags categorize the book. This is particularly useful while searching. &lt;br&gt;&lt;br&gt;They can be any words or phrases, separated by commas.</string>
          </property>
@@ -310,7 +310,7 @@
    <header>widgets.h</header>
   </customwidget>
   <customwidget>
-   <class>TagsLineEdit</class>
+   <class>CompleteLineEdit</class>
    <extends>QLineEdit</extends>
    <header>widgets.h</header>
   </customwidget>
diff --git a/src/calibre/gui2/custom_column_widgets.py b/src/calibre/gui2/custom_column_widgets.py
index ec18675359..d80909c4bb 100644
--- a/src/calibre/gui2/custom_column_widgets.py
+++ b/src/calibre/gui2/custom_column_widgets.py
@@ -14,7 +14,7 @@ from PyQt4.Qt import QComboBox, QLabel, QSpinBox, QDoubleSpinBox, QDateEdit, \
         QPushButton
 
 from calibre.utils.date import qt_to_dt, now
-from calibre.gui2.widgets import TagsLineEdit, EnComboBox
+from calibre.gui2.widgets import CompleteLineEdit, EnComboBox
 from calibre.gui2.comments_editor import Editor as CommentsEditor
 from calibre.gui2 import UNDEFINED_QDATE, error_dialog
 from calibre.utils.config import tweaks
@@ -212,7 +212,7 @@ class Text(Base):
         values = self.all_values = list(self.db.all_custom(num=self.col_id))
         values.sort(key=sort_key)
         if self.col_metadata['is_multiple']:
-            w = TagsLineEdit(parent, values)
+            w = CompleteLineEdit(parent, values)
             w.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Preferred)
         else:
             w = EnComboBox(parent)
@@ -226,7 +226,7 @@ class Text(Base):
         val = self.normalize_db_val(val)
         if self.col_metadata['is_multiple']:
             self.setter(val)
-            self.widgets[1].update_tags_cache(self.all_values)
+            self.widgets[1].update_items_cache(self.all_values)
         else:
             idx = None
             for i, c in enumerate(self.all_values):
@@ -656,7 +656,7 @@ class RemoveTags(QWidget):
         layout.setSpacing(5)
         layout.setContentsMargins(0, 0, 0, 0)
 
-        self.tags_box = TagsLineEdit(parent, values)
+        self.tags_box = CompleteLineEdit(parent, values)
         layout.addWidget(self.tags_box, stretch = 1)
         # self.tags_box.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Preferred)
 
@@ -678,7 +678,7 @@ class BulkText(BulkBase):
         values = self.all_values = list(self.db.all_custom(num=self.col_id))
         values.sort(key=sort_key)
         if self.col_metadata['is_multiple']:
-            w = TagsLineEdit(parent, values)
+            w = CompleteLineEdit(parent, values)
             w.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Preferred)
             self.widgets = [QLabel('&'+self.col_metadata['name']+': ' +
                                    _('tags to add'), parent), w]
@@ -697,7 +697,7 @@ class BulkText(BulkBase):
 
     def initialize(self, book_ids):
         if self.col_metadata['is_multiple']:
-            self.widgets[1].update_tags_cache(self.all_values)
+            self.widgets[1].update_items_cache(self.all_values)
         else:
             val = self.get_initial_value(book_ids)
             self.initial_val = val = self.normalize_db_val(val)
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py
index 302766a92d..c7d5add912 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.py
+++ b/src/calibre/gui2/dialogs/metadata_bulk.py
@@ -279,8 +279,8 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         self.changed = False
 
         all_tags = self.db.all_tags()
-        self.tags.update_tags_cache(all_tags)
-        self.remove_tags.update_tags_cache(all_tags)
+        self.tags.update_items_cache(all_tags)
+        self.remove_tags.update_items_cache(all_tags)
 
         self.initialize_combos()
 
@@ -751,8 +751,8 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         if d.result() == QDialog.Accepted:
             tag_string = ', '.join(d.tags)
             self.tags.setText(tag_string)
-            self.tags.update_tags_cache(self.db.all_tags())
-            self.remove_tags.update_tags_cache(self.db.all_tags())
+            self.tags.update_items_cache(self.db.all_tags())
+            self.remove_tags.update_items_cache(self.db.all_tags())
 
     def auto_number_changed(self, state):
         if state:
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui
index 5690a8e555..b826f8d48d 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.ui
+++ b/src/calibre/gui2/dialogs/metadata_bulk.ui
@@ -195,7 +195,7 @@
             </widget>
            </item>
            <item row="5" column="1">
-            <widget class="TagsLineEdit" name="tags">
+            <widget class="CompleteLineEdit" name="tags">
              <property name="toolTip">
               <string>Tags categorize the book. This is particularly useful while searching. &lt;br&gt;&lt;br&gt;They can be any words or phrases, separated by commas.</string>
              </property>
@@ -229,7 +229,7 @@
             </widget>
            </item>
            <item row="6" column="1">
-            <widget class="TagsLineEdit" name="remove_tags">
+            <widget class="CompleteLineEdit" name="remove_tags">
              <property name="toolTip">
               <string>Comma separated list of tags to remove from the books. </string>
              </property>
@@ -955,7 +955,7 @@ not multiple and the destination field is multiple</string>
    <header>widgets.h</header>
   </customwidget>
   <customwidget>
-   <class>TagsLineEdit</class>
+   <class>CompleteLineEdit</class>
    <extends>QLineEdit</extends>
    <header>widgets.h</header>
   </customwidget>
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index e4efdf0470..139b8a2ebe 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -556,7 +556,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         tags = self.db.tags(row)
         self.original_tags = ', '.join(tags.split(',')) if tags else ''
         self.tags.setText(self.original_tags)
-        self.tags.update_tags_cache(self.db.all_tags())
+        self.tags.update_items_cache(self.db.all_tags())
         rating = self.db.rating(row)
         if rating > 0:
             self.rating.setValue(int(rating/2.))
@@ -776,7 +776,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         if d.result() == QDialog.Accepted:
             tag_string = ', '.join(d.tags)
             self.tags.setText(tag_string)
-            self.tags.update_tags_cache(self.db.all_tags())
+            self.tags.update_items_cache(self.db.all_tags())
 
 
     def fetch_metadata(self):
diff --git a/src/calibre/gui2/dialogs/metadata_single.ui b/src/calibre/gui2/dialogs/metadata_single.ui
index 60c221be1a..b95267a618 100644
--- a/src/calibre/gui2/dialogs/metadata_single.ui
+++ b/src/calibre/gui2/dialogs/metadata_single.ui
@@ -335,7 +335,7 @@ If the box is colored green, then text matches the individual author's sort stri
                   <item row="6" column="1">
                    <layout class="QHBoxLayout" name="_2">
                     <item>
-                     <widget class="TagsLineEdit" name="tags">
+                     <widget class="CompleteLineEdit" name="tags">
                       <property name="toolTip">
                        <string>Tags categorize the book. This is particularly useful while searching. &lt;br&gt;&lt;br&gt;They can be any words or phrases, separated by commas.</string>
                       </property>
@@ -842,7 +842,7 @@ If the box is colored green, then text matches the individual author's sort stri
    <header>widgets.h</header>
   </customwidget>
   <customwidget>
-   <class>TagsLineEdit</class>
+   <class>CompleteLineEdit</class>
    <extends>QLineEdit</extends>
    <header>widgets.h</header>
   </customwidget>
diff --git a/src/calibre/gui2/dialogs/search.py b/src/calibre/gui2/dialogs/search.py
index 62a0f8a9f1..4f72fa915e 100644
--- a/src/calibre/gui2/dialogs/search.py
+++ b/src/calibre/gui2/dialogs/search.py
@@ -42,7 +42,7 @@ class SearchDialog(QDialog, Ui_Dialog):
         self.series_box.setAutoCompletionCaseSensitivity(Qt.CaseInsensitive)
 
         all_tags = db.all_tags()
-        self.tags_box.update_tags_cache(all_tags)
+        self.tags_box.update_items_cache(all_tags)
 
         self.box_last_values = copy.deepcopy(box_values)
         if self.box_last_values:
diff --git a/src/calibre/gui2/dialogs/search.ui b/src/calibre/gui2/dialogs/search.ui
index 6848a45506..519ae8c462 100644
--- a/src/calibre/gui2/dialogs/search.ui
+++ b/src/calibre/gui2/dialogs/search.ui
@@ -279,7 +279,7 @@
         </widget>
        </item>
        <item row="4" column="1">
-        <widget class="TagsLineEdit" name="tags_box">
+        <widget class="CompleteLineEdit" name="tags_box">
          <property name="toolTip">
           <string>Enter tags separated by spaces</string>
          </property>
@@ -360,7 +360,7 @@
    <header>widgets.h</header>
   </customwidget>
   <customwidget>
-   <class>TagsLineEdit</class>
+   <class>CompleteLineEdit</class>
    <extends>QLineEdit</extends>
    <header>widgets.h</header>
   </customwidget>
diff --git a/src/calibre/gui2/library/delegates.py b/src/calibre/gui2/library/delegates.py
index b41fd78dc3..af8f9c4d8a 100644
--- a/src/calibre/gui2/library/delegates.py
+++ b/src/calibre/gui2/library/delegates.py
@@ -16,7 +16,7 @@ from PyQt4.Qt import QColor, Qt, QModelIndex, QSize, \
                      QComboBox, QTextDocument
 
 from calibre.gui2 import UNDEFINED_QDATE, error_dialog
-from calibre.gui2.widgets import EnLineEdit, TagsLineEdit
+from calibre.gui2.widgets import EnLineEdit, CompleteLineEdit
 from calibre.utils.date import now, format_date
 from calibre.utils.config import tweaks
 from calibre.utils.formatter import validation_formatter
@@ -173,9 +173,9 @@ class TagsDelegate(QStyledItemDelegate): # {{{
         if self.db:
             col = index.model().column_map[index.column()]
             if not index.model().is_custom_column(col):
-                editor = TagsLineEdit(parent, self.db.all_tags())
+                editor = CompleteLineEdit(parent, self.db.all_tags())
             else:
-                editor = TagsLineEdit(parent,
+                editor = CompleteLineEdit(parent,
                         sorted(list(self.db.all_custom(label=self.db.field_metadata.key_to_label(col))),
                                key=sort_key))
                 return editor
@@ -184,6 +184,54 @@ class TagsDelegate(QStyledItemDelegate): # {{{
         return editor
 # }}}
 
+class AuthorsDelegate(QStyledItemDelegate): # {{{
+    def __init__(self, parent):
+        QStyledItemDelegate.__init__(self, parent)
+        self.db = None
+
+    def set_database(self, db):
+        self.db = db
+
+    def createEditor(self, parent, option, index):
+        if self.db:
+            col = index.model().column_map[index.column()]
+            if not index.model().is_custom_column(col):
+                editor = CompleteLineEdit(parent, self.db.all_author_names(), '&', True)
+            else:
+                editor = CompleteLineEdit(parent,
+                        sorted(list(self.db.all_custom(label=self.db.field_metadata.key_to_label(col))),
+                               key=sort_key), '&', True)
+                return editor
+        else:
+            editor = EnLineEdit(parent)
+        return editor
+# }}}
+
+class CompleteDelegate(QStyledItemDelegate): # {{{
+    def __init__(self, parent, sep, items_func_name, space_before_sep=False):
+        QStyledItemDelegate.__init__(self, parent)
+        self.sep = sep
+        self.items_func_name = items_func_name
+        self.space_before_sep = space_before_sep
+
+    def set_database(self, db):
+        self.db = db
+
+    def createEditor(self, parent, option, index):
+        if self.db and hasattr(self.db, self.items_func_name):
+            col = index.model().column_map[index.column()]
+            if not index.model().is_custom_column(col):
+                editor = CompleteLineEdit(parent, getattr(self.db, self.items_func_name)(),
+                    self.sep, self.space_before_sep)
+            else:
+                editor = CompleteLineEdit(parent,
+                    sorted(list(self.db.all_custom(label=self.db.field_metadata.key_to_label(col))),
+                    key=sort_key), self.sep, self.space_before_sep)
+        else:
+            editor = EnLineEdit(parent)
+        return editor
+# }}}
+
 class CcDateDelegate(QStyledItemDelegate): # {{{
     '''
     Delegate for custom columns dates. Because this delegate stores the
diff --git a/src/calibre/gui2/library/views.py b/src/calibre/gui2/library/views.py
index 3ff0fc3cd7..61161cd5e6 100644
--- a/src/calibre/gui2/library/views.py
+++ b/src/calibre/gui2/library/views.py
@@ -13,7 +13,7 @@ from PyQt4.Qt import QTableView, Qt, QAbstractItemView, QMenu, pyqtSignal, \
     QPoint, QPixmap, QUrl, QImage, QPainter, QColor, QRect
 
 from calibre.gui2.library.delegates import RatingDelegate, PubDateDelegate, \
-    TextDelegate, DateDelegate, TagsDelegate, CcTextDelegate, \
+    TextDelegate, DateDelegate, CompleteDelegate, CcTextDelegate, \
     CcBoolDelegate, CcCommentsDelegate, CcDateDelegate, CcTemplateDelegate, \
     CcEnumDelegate
 from calibre.gui2.library.models import BooksModel, DeviceBooksModel
@@ -76,8 +76,8 @@ class BooksView(QTableView): # {{{
         self.rating_delegate = RatingDelegate(self)
         self.timestamp_delegate = DateDelegate(self)
         self.pubdate_delegate = PubDateDelegate(self)
-        self.tags_delegate = TagsDelegate(self)
-        self.authors_delegate = TextDelegate(self)
+        self.tags_delegate = CompleteDelegate(self, ',', 'all_tags')
+        self.authors_delegate = CompleteDelegate(self, '&', 'all_author_names', True)
         self.series_delegate = TextDelegate(self)
         self.publisher_delegate = TextDelegate(self)
         self.text_delegate = TextDelegate(self)
@@ -410,8 +410,7 @@ class BooksView(QTableView): # {{{
         self.save_state()
         self._model.set_database(db)
         self.tags_delegate.set_database(db)
-        self.authors_delegate.set_auto_complete_function(
-                lambda: [(x, y.replace('|', ',')) for (x, y) in db.all_authors()])
+        self.authors_delegate.set_database(db)
         self.series_delegate.set_auto_complete_function(db.all_series)
         self.publisher_delegate.set_auto_complete_function(db.all_publishers)
 
diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
index f2ff783a76..6e2d52f835 100644
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@@ -426,46 +426,47 @@ class EnLineEdit(LineEditECM, QLineEdit):
     pass
 
 
-class TagsCompleter(QCompleter):
+class ItemsCompleter(QCompleter):
 
     '''
     A completer object that completes a list of tags. It is used in conjunction
     with a CompleterLineEdit.
     '''
 
-    def __init__(self, parent, all_tags):
-        QCompleter.__init__(self, all_tags, parent)
-        self.all_tags = set(all_tags)
+    def __init__(self, parent, all_items):
+        QCompleter.__init__(self, all_items, parent)
+        self.all_items = set(all_items)
 
-    def update(self, text_tags, completion_prefix):
-        tags = list(self.all_tags.difference(text_tags))
-        model = QStringListModel(tags, self)
+    def update(self, text_items, completion_prefix):
+        items = list(self.all_items.difference(text_items))
+        model = QStringListModel(items, self)
         self.setModel(model)
 
         self.setCompletionPrefix(completion_prefix)
         if completion_prefix.strip() != '':
             self.complete()
 
-    def update_tags_cache(self, tags):
-        self.all_tags = set(tags)
-        model = QStringListModel(tags, self)
+    def update_items_cache(self, items):
+        self.all_items = set(items)
+        model = QStringListModel(items, self)
         self.setModel(model)
 
 
-class TagsLineEdit(EnLineEdit):
+class CompleteLineEdit(EnLineEdit):
 
     '''
     A QLineEdit that can complete parts of text separated by separator.
     '''
 
-    def __init__(self, parent=0, tags=[]):
+    def __init__(self, parent=0, complete_items=[], sep=',', space_before_sep=False):
         EnLineEdit.__init__(self, parent)
 
-        self.separator = ','
+        self.separator = sep
+        self.space_before_sep = space_before_sep
 
         self.connect(self, SIGNAL('textChanged(QString)'), self.text_changed)
 
-        self.completer = TagsCompleter(self, tags)
+        self.completer = ItemsCompleter(self, complete_items)
         self.completer.setCaseSensitivity(Qt.CaseInsensitive)
 
         self.connect(self,
@@ -476,32 +477,43 @@ class TagsLineEdit(EnLineEdit):
 
         self.completer.setWidget(self)
 
-    def update_tags_cache(self, tags):
-        self.completer.update_tags_cache(tags)
+    def update_items_cache(self, complete_items):
+        self.completer.update_items_cache(complete_items)
+        
+    def set_separator(self, sep):
+        self.separator = sep
+        
+    def set_space_before_sep(self, space_before):
+        self.space_before_sep = space_before
 
     def text_changed(self, text):
         all_text = unicode(text)
         text = all_text[:self.cursorPosition()]
-        prefix = text.split(',')[-1].strip()
+        prefix = text.split(self.separator)[-1].strip()
 
-        text_tags = []
+        text_items = []
         for t in all_text.split(self.separator):
             t1 = unicode(t).strip()
             if t1 != '':
-                text_tags.append(t)
-        text_tags = list(set(text_tags))
+                text_items.append(t)
+        text_items = list(set(text_items))
 
         self.emit(SIGNAL('text_changed(PyQt_PyObject, PyQt_PyObject)'),
-            text_tags, prefix)
+            text_items, prefix)
 
     def complete_text(self, text):
         cursor_pos = self.cursorPosition()
         before_text = unicode(self.text())[:cursor_pos]
         after_text = unicode(self.text())[cursor_pos:]
-        prefix_len = len(before_text.split(',')[-1].strip())
-        self.setText('%s%s%s %s' % (before_text[:cursor_pos - prefix_len],
-            text, self.separator, after_text))
-        self.setCursorPosition(cursor_pos - prefix_len + len(text) + 2)
+        prefix_len = len(before_text.split(self.separator)[-1].strip())
+        if self.space_before_sep:
+            complete_text_pat = '%s%s %s %s'
+            len_extra = 3
+        else:
+            complete_text_pat = '%s%s%s %s'
+            len_extra = 2
+        self.setText(complete_text_pat % (before_text[:cursor_pos - prefix_len], text, self.separator, after_text))
+        self.setCursorPosition(cursor_pos - prefix_len + len(text) + len_extra)
 
 
 class EnComboBox(QComboBox):
diff --git a/src/calibre/library/database.py b/src/calibre/library/database.py
index 6016dbd03e..e2ad8796a0 100644
--- a/src/calibre/library/database.py
+++ b/src/calibre/library/database.py
@@ -1059,6 +1059,9 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
     def all_authors(self):
         return [ (i[0], i[1]) for i in \
                 self.conn.get('SELECT id, name FROM authors')]
+        
+    def all_author_names(self):
+        return [i[0].strip() for i in self.conn.get('SELECT name FROM authors') if i[0].strip()]
 
     def all_publishers(self):
         return [ (i[0], i[1]) for i in \

From cfa57f63df7e4fe8d683082e780b7df31c52fabb Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 17:14:26 -0500
Subject: [PATCH 056/118] GUI: Have Authors combo boxes use completion using &
 just like tags use completion with ,

---
 src/calibre/gui2/convert/metadata.py        |  3 +
 src/calibre/gui2/convert/metadata.ui        | 69 +++++++++------------
 src/calibre/gui2/dialogs/metadata_bulk.py   |  4 ++
 src/calibre/gui2/dialogs/metadata_bulk.ui   | 19 +++---
 src/calibre/gui2/dialogs/metadata_single.py |  4 ++
 src/calibre/gui2/dialogs/metadata_single.ui | 51 +++++++--------
 src/calibre/gui2/dialogs/search.py          |  3 +
 src/calibre/gui2/dialogs/search.ui          | 13 ++--
 src/calibre/gui2/widgets.py                 | 16 +++++
 9 files changed, 104 insertions(+), 78 deletions(-)

diff --git a/src/calibre/gui2/convert/metadata.py b/src/calibre/gui2/convert/metadata.py
index de03033060..5f39202e26 100644
--- a/src/calibre/gui2/convert/metadata.py
+++ b/src/calibre/gui2/convert/metadata.py
@@ -68,6 +68,9 @@ class MetadataWidget(Widget, Ui_Form):
     def initialize_metadata_options(self):
         self.initialize_combos()
         self.author.editTextChanged.connect(self.deduce_author_sort)
+        self.author.set_separator('&')
+        self.author.set_space_before_sep(True)
+        self.author.update_items_cache(self.db.all_author_names())
 
         mi = self.db.get_metadata(self.book_id, index_is_id=True)
         self.title.setText(mi.title)
diff --git a/src/calibre/gui2/convert/metadata.ui b/src/calibre/gui2/convert/metadata.ui
index 5735193424..24b7c0904a 100644
--- a/src/calibre/gui2/convert/metadata.ui
+++ b/src/calibre/gui2/convert/metadata.ui
@@ -20,38 +20,8 @@
       <string>Book Cover</string>
      </property>
      <layout class="QGridLayout" name="_2">
-      <item row="0" column="0">
-       <layout class="QHBoxLayout" name="_3">
-        <item>
-         <widget class="ImageView" name="cover" native="true">
-          <property name="sizePolicy">
-           <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
-            <horstretch>0</horstretch>
-            <verstretch>0</verstretch>
-           </sizepolicy>
-          </property>
-         </widget>
-        </item>
-       </layout>
-      </item>
-      <item row="2" column="0">
-       <widget class="QCheckBox" name="opt_prefer_metadata_cover">
-        <property name="text">
-         <string>Use cover from &amp;source file</string>
-        </property>
-        <property name="checked">
-         <bool>true</bool>
-        </property>
-       </widget>
-      </item>
       <item row="1" column="0">
        <layout class="QVBoxLayout" name="_4">
-        <property name="spacing">
-         <number>6</number>
-        </property>
-        <property name="margin">
-         <number>0</number>
-        </property>
         <item>
          <widget class="QLabel" name="label_5">
           <property name="text">
@@ -64,12 +34,6 @@
         </item>
         <item>
          <layout class="QHBoxLayout" name="_5">
-          <property name="spacing">
-           <number>6</number>
-          </property>
-          <property name="margin">
-           <number>0</number>
-          </property>
           <item>
            <widget class="QLineEdit" name="cover_path">
             <property name="readOnly">
@@ -86,7 +50,7 @@
              <string>...</string>
             </property>
             <property name="icon">
-             <iconset resource="../../../../resources/images.qrc">
+             <iconset>
               <normaloff>:/images/document_open.png</normaloff>:/images/document_open.png</iconset>
             </property>
            </widget>
@@ -95,6 +59,30 @@
         </item>
        </layout>
       </item>
+      <item row="2" column="0">
+       <widget class="QCheckBox" name="opt_prefer_metadata_cover">
+        <property name="text">
+         <string>Use cover from &amp;source file</string>
+        </property>
+        <property name="checked">
+         <bool>true</bool>
+        </property>
+       </widget>
+      </item>
+      <item row="0" column="0">
+       <layout class="QHBoxLayout" name="_3">
+        <item>
+         <widget class="ImageView" name="cover" native="true">
+          <property name="sizePolicy">
+           <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
+            <horstretch>0</horstretch>
+            <verstretch>0</verstretch>
+           </sizepolicy>
+          </property>
+         </widget>
+        </item>
+       </layout>
+      </item>
      </layout>
      <zorder>opt_prefer_metadata_cover</zorder>
      <zorder></zorder>
@@ -255,7 +243,7 @@
         </widget>
        </item>
        <item row="1" column="1">
-        <widget class="EnComboBox" name="author">
+        <widget class="CompleteComboBox" name="author">
          <property name="editable">
           <bool>true</bool>
          </property>
@@ -320,6 +308,11 @@
    <header>calibre/gui2/widgets.h</header>
    <container>1</container>
   </customwidget>
+  <customwidget>
+   <class>CompleteComboBox</class>
+   <extends>QComboBox</extends>
+   <header>widgets.h</header>
+  </customwidget>
  </customwidgets>
  <tabstops>
   <tabstop>title</tabstop>
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py
index c7d5add912..2b3a319663 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.py
+++ b/src/calibre/gui2/dialogs/metadata_bulk.py
@@ -726,6 +726,10 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
             name = name.strip().replace('|', ',')
             self.authors.addItem(name)
         self.authors.setEditText('')
+        
+        self.authors.set_separator('&')
+        self.authors.set_space_before_sep(True)
+        self.authors.update_items_cache(self.db.all_author_names())
 
     def initialize_series(self):
         all_series = self.db.all_series()
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui
index b826f8d48d..3950026325 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.ui
+++ b/src/calibre/gui2/dialogs/metadata_bulk.ui
@@ -14,7 +14,7 @@
    <string>Edit Meta information</string>
   </property>
   <property name="windowIcon">
-   <iconset resource="../../../../resources/images.qrc">
+   <iconset>
     <normaloff>:/images/edit_input.png</normaloff>:/images/edit_input.png</iconset>
   </property>
   <layout class="QGridLayout" name="gridLayout_2">
@@ -45,7 +45,7 @@
         <x>0</x>
         <y>0</y>
         <width>842</width>
-        <height>589</height>
+        <height>553</height>
        </rect>
       </property>
       <layout class="QVBoxLayout" name="verticalLayout_2">
@@ -76,7 +76,7 @@
             </widget>
            </item>
            <item row="0" column="1">
-            <widget class="EnComboBox" name="authors">
+            <widget class="CompleteComboBox" name="authors">
              <property name="editable">
               <bool>true</bool>
              </property>
@@ -210,7 +210,7 @@
               <string>Open Tag Editor</string>
              </property>
              <property name="icon">
-              <iconset resource="../../../../resources/images.qrc">
+              <iconset>
                <normaloff>:/images/chapters.png</normaloff>:/images/chapters.png</iconset>
              </property>
             </widget>
@@ -381,7 +381,7 @@ from the value in the box</string>
                 <string>...</string>
                </property>
                <property name="icon">
-                <iconset resource="../../../../resources/images.qrc">
+                <iconset>
                  <normaloff>:/images/trash.png</normaloff>:/images/trash.png</iconset>
                </property>
               </widget>
@@ -874,8 +874,8 @@ not multiple and the destination field is multiple</string>
                <rect>
                 <x>0</x>
                 <y>0</y>
-                <width>197</width>
-                <height>60</height>
+                <width>231</width>
+                <height>82</height>
                </rect>
               </property>
               <layout class="QGridLayout" name="testgrid">
@@ -964,6 +964,11 @@ not multiple and the destination field is multiple</string>
    <extends>QLineEdit</extends>
    <header>widgets.h</header>
   </customwidget>
+  <customwidget>
+   <class>CompleteComboBox</class>
+   <extends>QComboBox</extends>
+   <header>widgets.h</header>
+  </customwidget>
  </customwidgets>
  <tabstops>
   <tabstop>authors</tabstop>
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index 139b8a2ebe..4ca2072317 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -724,6 +724,10 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
             au = _('Unknown')
         au = ' & '.join([a.strip().replace('|', ',') for a in au.split(',')])
         self.authors.setEditText(au)
+        
+        self.authors.set_separator('&')
+        self.authors.set_space_before_sep(True)
+        self.authors.update_items_cache(self.db.all_author_names())
 
     def initialize_series(self):
         self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)
diff --git a/src/calibre/gui2/dialogs/metadata_single.ui b/src/calibre/gui2/dialogs/metadata_single.ui
index b95267a618..b2f42937da 100644
--- a/src/calibre/gui2/dialogs/metadata_single.ui
+++ b/src/calibre/gui2/dialogs/metadata_single.ui
@@ -20,7 +20,7 @@
    <string>Edit Meta Information</string>
   </property>
   <property name="windowIcon">
-   <iconset resource="../../../../resources/images.qrc">
+   <iconset>
     <normaloff>:/images/edit_input.png</normaloff>:/images/edit_input.png</iconset>
   </property>
   <property name="sizeGripEnabled">
@@ -43,8 +43,8 @@
        <rect>
         <x>0</x>
         <y>0</y>
-        <width>986</width>
-        <height>677</height>
+        <width>955</width>
+        <height>665</height>
        </rect>
       </property>
       <layout class="QVBoxLayout" name="verticalLayout_5">
@@ -125,7 +125,7 @@ Using this button to create title sort will change title sort from red to green.
                        <string>...</string>
                       </property>
                       <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
+                       <iconset>
                         <normaloff>:/images/auto_author_sort.png</normaloff>:/images/auto_author_sort.png</iconset>
                       </property>
                      </widget>
@@ -152,7 +152,7 @@ Using this button to create title sort will change title sort from red to green.
                        <string>...</string>
                       </property>
                       <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
+                       <iconset>
                         <normaloff>:/images/swap.png</normaloff>:/images/swap.png</iconset>
                       </property>
                       <property name="iconSize">
@@ -186,7 +186,7 @@ Using this button to create author sort will change author sort from red to gree
                        <string>...</string>
                       </property>
                       <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
+                       <iconset>
                         <normaloff>:/images/auto_author_sort.png</normaloff>:/images/auto_author_sort.png</iconset>
                       </property>
                      </widget>
@@ -240,7 +240,7 @@ Using this button to create author sort will change author sort from red to gree
                    </widget>
                   </item>
                   <item row="2" column="1">
-                   <widget class="EnComboBox" name="authors">
+                   <widget class="CompleteComboBox" name="authors">
                     <property name="editable">
                      <bool>true</bool>
                     </property>
@@ -352,7 +352,7 @@ If the box is colored green, then text matches the individual author's sort stri
                      <string>Open Tag Editor</string>
                     </property>
                     <property name="icon">
-                     <iconset resource="../../../../resources/images.qrc">
+                     <iconset>
                       <normaloff>:/images/chapters.png</normaloff>:/images/chapters.png</iconset>
                     </property>
                    </widget>
@@ -405,7 +405,7 @@ If the box is colored green, then text matches the individual author's sort stri
                      <string>...</string>
                     </property>
                     <property name="icon">
-                     <iconset resource="../../../../resources/images.qrc">
+                     <iconset>
                       <normaloff>:/images/trash.png</normaloff>:/images/trash.png</iconset>
                     </property>
                    </widget>
@@ -491,7 +491,7 @@ If the box is colored green, then text matches the individual author's sort stri
                      <string>Clear published date</string>
                     </property>
                     <property name="icon">
-                     <iconset resource="../../../../resources/images.qrc">
+                     <iconset>
                       <normaloff>:/images/trash.png</normaloff>:/images/trash.png</iconset>
                     </property>
                    </widget>
@@ -550,15 +550,9 @@ If the box is colored green, then text matches the individual author's sort stri
                   </item>
                   <item>
                    <layout class="QVBoxLayout" name="_4">
-                    <property name="spacing">
-                     <number>6</number>
-                    </property>
                     <property name="sizeConstraint">
                      <enum>QLayout::SetMaximumSize</enum>
                     </property>
-                    <property name="margin">
-                     <number>0</number>
-                    </property>
                     <item>
                      <widget class="QLabel" name="label_5">
                       <property name="text">
@@ -571,19 +565,13 @@ If the box is colored green, then text matches the individual author's sort stri
                     </item>
                     <item>
                      <layout class="QHBoxLayout" name="_5">
-                      <property name="spacing">
-                       <number>6</number>
-                      </property>
-                      <property name="margin">
-                       <number>0</number>
-                      </property>
                       <item>
                        <widget class="QPushButton" name="cover_button">
                         <property name="text">
                          <string>&amp;Browse</string>
                         </property>
                         <property name="icon">
-                         <iconset resource="../../../../resources/images.qrc">
+                         <iconset>
                           <normaloff>:/images/document_open.png</normaloff>:/images/document_open.png</iconset>
                         </property>
                        </widget>
@@ -597,7 +585,7 @@ If the box is colored green, then text matches the individual author's sort stri
                          <string>T&amp;rim</string>
                         </property>
                         <property name="icon">
-                         <iconset resource="../../../../resources/images.qrc">
+                         <iconset>
                           <normaloff>:/images/trim.png</normaloff>:/images/trim.png</iconset>
                         </property>
                        </widget>
@@ -611,7 +599,7 @@ If the box is colored green, then text matches the individual author's sort stri
                          <string>&amp;Remove</string>
                         </property>
                         <property name="icon">
-                         <iconset resource="../../../../resources/images.qrc">
+                         <iconset>
                           <normaloff>:/images/trash.png</normaloff>:/images/trash.png</iconset>
                         </property>
                        </widget>
@@ -702,7 +690,7 @@ If the box is colored green, then text matches the individual author's sort stri
                        <string>...</string>
                       </property>
                       <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
+                       <iconset>
                         <normaloff>:/images/add_book.png</normaloff>:/images/add_book.png</iconset>
                       </property>
                       <property name="iconSize">
@@ -722,7 +710,7 @@ If the box is colored green, then text matches the individual author's sort stri
                        <string>...</string>
                       </property>
                       <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
+                       <iconset>
                         <normaloff>:/images/trash.png</normaloff>:/images/trash.png</iconset>
                       </property>
                       <property name="iconSize">
@@ -742,7 +730,7 @@ If the box is colored green, then text matches the individual author's sort stri
                        <string>...</string>
                       </property>
                       <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
+                       <iconset>
                         <normaloff>:/images/book.png</normaloff>:/images/book.png</iconset>
                       </property>
                       <property name="iconSize">
@@ -762,7 +750,7 @@ If the box is colored green, then text matches the individual author's sort stri
                        <string/>
                       </property>
                       <property name="icon">
-                       <iconset resource="../../../../resources/images.qrc">
+                       <iconset>
                         <normaloff>:/images/edit_input.png</normaloff>:/images/edit_input.png</iconset>
                       </property>
                       <property name="iconSize">
@@ -863,6 +851,11 @@ If the box is colored green, then text matches the individual author's sort stri
    <header location="global">calibre/gui2/comments_editor.h</header>
    <container>1</container>
   </customwidget>
+  <customwidget>
+   <class>CompleteComboBox</class>
+   <extends>QComboBox</extends>
+   <header>widgets.h</header>
+  </customwidget>
  </customwidgets>
  <tabstops>
   <tabstop>title</tabstop>
diff --git a/src/calibre/gui2/dialogs/search.py b/src/calibre/gui2/dialogs/search.py
index 4f72fa915e..95c7cf9225 100644
--- a/src/calibre/gui2/dialogs/search.py
+++ b/src/calibre/gui2/dialogs/search.py
@@ -31,6 +31,9 @@ class SearchDialog(QDialog, Ui_Dialog):
         self.authors_box.setEditText('')
         self.authors_box.completer().setCompletionMode(QCompleter.PopupCompletion)
         self.authors_box.setAutoCompletionCaseSensitivity(Qt.CaseInsensitive)
+        self.authors_box.set_separator('&')
+        self.authors_box.set_space_before_sep(True)
+        self.authors_box.update_items_cache(self.db.all_author_names())
 
         all_series = db.all_series()
         all_series.sort(key=lambda x : sort_key(x[1]))
diff --git a/src/calibre/gui2/dialogs/search.ui b/src/calibre/gui2/dialogs/search.ui
index 519ae8c462..06ea9de379 100644
--- a/src/calibre/gui2/dialogs/search.ui
+++ b/src/calibre/gui2/dialogs/search.ui
@@ -6,15 +6,15 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>731</width>
-    <height>411</height>
+    <width>752</width>
+    <height>472</height>
    </rect>
   </property>
   <property name="windowTitle">
    <string>Advanced Search</string>
   </property>
   <property name="windowIcon">
-   <iconset resource="../../../../resources/images.qrc">
+   <iconset>
     <normaloff>:/images/search.png</normaloff>:/images/search.png</iconset>
   </property>
   <layout class="QGridLayout" name="gridLayout_2">
@@ -265,7 +265,7 @@
         </widget>
        </item>
        <item row="2" column="1">
-        <widget class="EnComboBox" name="authors_box">
+        <widget class="CompleteComboBox" name="authors_box">
          <property name="toolTip">
           <string>Enter an author's name. Only one author can be used.</string>
          </property>
@@ -364,6 +364,11 @@
    <extends>QLineEdit</extends>
    <header>widgets.h</header>
   </customwidget>
+  <customwidget>
+   <class>CompleteComboBox</class>
+   <extends>QComboBox</extends>
+   <header>widgets.h</header>
+  </customwidget>
  </customwidgets>
  <tabstops>
   <tabstop>all</tabstop>
diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
index 6e2d52f835..0bb5ee7634 100644
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@@ -540,6 +540,22 @@ class EnComboBox(QComboBox):
             idx = 0
         self.setCurrentIndex(idx)
 
+class CompleteComboBox(EnComboBox):
+    
+    def __init__(self, *args):
+        EnComboBox.__init__(self, *args)
+        self.setLineEdit(CompleteLineEdit(self))
+
+    def update_items_cache(self, complete_items):
+        self.lineEdit().update_items_cache(complete_items)
+        
+    def set_separator(self, sep):
+        self.lineEdit().set_separator(sep)
+        
+    def set_space_before_sep(self, space_before):
+        self.lineEdit().set_space_before_sep(space_before)
+
+
 class HistoryLineEdit(QComboBox):
 
     lost_focus = pyqtSignal()

From ca0e545253e3d4b5a6bf641cac78e9b816f10d81 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 16:51:26 -0700
Subject: [PATCH 057/118] E-book viewer: Display cover when viewing FB2 files

---
 src/calibre/ebooks/fb2/input.py    | 18 +++++++++++-------
 src/calibre/ebooks/oeb/iterator.py |  2 +-
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py
index b019873d39..3d3ec69833 100644
--- a/src/calibre/ebooks/fb2/input.py
+++ b/src/calibre/ebooks/fb2/input.py
@@ -104,13 +104,17 @@ class FB2Input(InputFormatPlugin):
         entries = [(f, guess_type(f)[0]) for f in os.listdir('.')]
         opf.create_manifest(entries)
         opf.create_spine(['index.xhtml'])
-
-        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
-            href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
-            if href is not None:
-                if href.startswith('#'):
-                    href = href[1:]
-                opf.guide.set_cover(os.path.abspath(href))
+        if mi.cover_data and mi.cover_data[1]:
+            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
+                f.write(mi.cover_data[1])
+            opf.guide.set_cover(os.path.abspath('fb2_cover_calibre_mi.jpg'))
+        else:
+            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
+                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
+                if href is not None:
+                    if href.startswith('#'):
+                        href = href[1:]
+                    opf.guide.set_cover(os.path.abspath(href))
 
         opf.render(open('metadata.opf', 'wb'))
         return os.path.join(os.getcwd(), 'metadata.opf')
diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py
index 6820709b3e..08b4369078 100644
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@@ -227,7 +227,7 @@ class EbookIterator(object):
                 self.log.warn('Missing spine item:', repr(spath))
 
         cover = self.opf.cover
-        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover:
+        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
             cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
             chtml = (TITLEPAGE%os.path.relpath(cover, self.base).replace(os.sep,
                 '/')).encode('utf-8')

From 7a79f8d98d2556803177769797e5b51e169e48a1 Mon Sep 17 00:00:00 2001
From: Shixin Zeng <zeng.shixin@gmail.com>
Date: Sun, 16 Jan 2011 19:43:13 -0600
Subject: [PATCH 058/118] make use_embedded_content settable per feed

---
 src/calibre/web/feeds/news.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index ee5b11c5f6..dd32d3749f 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -901,10 +901,7 @@ class BasicNewsRecipe(Recipe):
         if self.test:
             feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
-
-        if self.use_embedded_content is None:
-            self.use_embedded_content = feeds[0].has_embedded_content()
-
+        
         index = os.path.join(self.output_dir, 'index.html')
 
         html = self.feeds2index(feeds)
@@ -939,7 +936,9 @@ class BasicNewsRecipe(Recipe):
                     url = None
                 if not url:
                     continue
-                func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
+                func, arg = (self.fetch_embedded_article, article) \
+                            if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
+                            else \
                             ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
                               else self.fetch_article), url)
                 req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),

From 34da8b73ccf55b846c6e612f923682f1abe3f09c Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 21:35:53 -0500
Subject: [PATCH 059/118] Fix bug #8381: reference \t and \T PML indents
 properly.

---
 src/calibre/ebooks/pml/pmlconverter.py | 35 ++++++++++++++++++--------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index 10e5871d31..a0814ee0dd 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -34,18 +34,15 @@ class PML_HTMLizer(object):
         'ra',
         'c',
         'r',
-        't',
         's',
         'l',
         'k',
-        'T',
         'FN',
         'SB',
     ]
 
     STATES_VALUE_REQ = [
         'a',
-        'T',
         'FN',
         'SB',
     ]
@@ -96,8 +93,6 @@ class PML_HTMLizer(object):
         'Sb': 'sb',
         'c': 'c',
         'r': 'r',
-        't': 't',
-        'T': 'T',
         'i': 'i',
         'I': 'i',
         'u': 'u',
@@ -133,8 +128,6 @@ class PML_HTMLizer(object):
     DIV_STATES = [
         'c',
         'r',
-        't',
-        'T',
         'FN',
         'SB',
     ]
@@ -255,8 +248,6 @@ class PML_HTMLizer(object):
 
         for key, val in self.state.items():
             if val[0]:
-                if key == 'T':
-                    self.state['T'][0] = False
                 if key in self.DIV_STATES:
                     div.append(key)
                 elif key in self.SPAN_STATES:
@@ -506,6 +497,9 @@ class PML_HTMLizer(object):
         self.toc = TOC()
         self.file_name = file_name
 
+        indent_state = {'t': False, 'T': False}
+        adv_indent_val = ''
+
         for s in self.STATES:
             self.state[s] = [False, ''];
 
@@ -515,6 +509,8 @@ class PML_HTMLizer(object):
 
             parsed = []
             empty = True
+            basic_indent = indent_state['t']
+            adv_indent = indent_state['T']
 
             # Must use StringIO, cStringIO does not support unicode
             line = StringIO.StringIO(line)
@@ -527,7 +523,7 @@ class PML_HTMLizer(object):
                 if c == '\\':
                     c = line.read(1)
 
-                    if c in 'qcrtTiIuobBlk':
+                    if c in 'qcriIuobBlk':
                         text = self.process_code(c, line)
                     elif c in 'FS':
                         l = line.read(1)
@@ -574,6 +570,15 @@ class PML_HTMLizer(object):
                     elif c == 'w':
                         empty = False
                         text = '<hr width="%s" />' % self.code_value(line)
+                    elif c == 't':
+                        indent_state[c] = not indent_state[c]
+                        if indent_state[c]:
+                            basic_indent = True
+                    elif c == 'T':
+                        indent_state[c] = not indent_state[c]
+                        if indent_state[c]:
+                            adv_indent = True
+                            adv_indent_val = self.code_value(line)
                     elif c == '-':
                         empty = False
                         text = '&shy;'
@@ -590,6 +595,16 @@ class PML_HTMLizer(object):
             if not empty:
                 text = self.end_line()
                 parsed.append(text)
+                
+                if basic_indent:
+                    parsed.insert(0, self.STATES_TAGS['t'][0])
+                    parsed.append(self.STATES_TAGS['t'][1])
+                elif adv_indent:
+                    parsed.insert(0, self.STATES_TAGS['T'][0] % adv_indent_val)
+                    parsed.append(self.STATES_TAGS['T'][1])
+                    indent_state['T'] = False
+                    adv_indent_val = ''
+                
                 output.append(u''.join(parsed))
             line.close()
 

From 899263f3b34febbe3f7c1fc435017b14b502a802 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 16 Jan 2011 20:37:04 -0700
Subject: [PATCH 060/118] Updated Nature News

---
 resources/recipes/freenature.recipe | 66 +++++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 3 deletions(-)

diff --git a/resources/recipes/freenature.recipe b/resources/recipes/freenature.recipe
index cf06e7163d..0b287842ec 100644
--- a/resources/recipes/freenature.recipe
+++ b/resources/recipes/freenature.recipe
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
 import re
 
 class NatureNews(BasicNewsRecipe):
@@ -10,17 +11,76 @@ class NatureNews(BasicNewsRecipe):
     max_articles_per_feed = 50
 
     no_stylesheets = True
-    remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
-    remove_tags_after  = dict(name='h2', attrs={'id':'comments'})
+    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
+#    remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
+#    remove_tags_after  = dict(name='h2', attrs={'id':'comments'})
     remove_tags = [
        dict(name='h2', attrs={'id':'comments'}),
        dict(attrs={'alt':'Advertisement'}),
        dict(name='div', attrs={'class':'ad'}),
-    ] 
+       dict(attrs={'class':'Z3988'}),
+       dict(attrs={'class':['formatpublished','type-of-article','cleardiv','disclaimer','buttons','comments xoxo']}),
+       dict(name='a', attrs={'href':'#comments'}),
+       dict(name='h2',attrs={'class':'subheading plusicon icon-add-comment'})
+    ]
 
     preprocess_regexps = [
         (re.compile(r'<p>ADVERTISEMENT</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
         ]
 
+    extra_css             = '''
+                            .author { text-align: right; font-size: small; line-height:1em; margin-top:0px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                            .imagedescription { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                            .imagecredit { font-size: x-small; font-style: normal; font-weight: bold}
+                            '''
+
     feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')]
 
+    def preprocess_html(self,soup):
+        '''Flatten the byline, intro, image caption and reference list of an
+        article into plain divs, then drop whatever follows the entry content.'''
+        # The author name is slightly buried - dig it up
+        author = soup.find('p', {'class':'byline'})
+        # A byline without the author span is left untouched instead of crashing
+        authornamediv = author.find('span',{'class':'author fn'}) if author else None
+        if authornamediv:
+            authornamelink = authornamediv.find('a')
+            authorname = (authornamelink or authornamediv).contents[0]
+            # Stick the author's name in the byline tag
+            tag = Tag(soup,'div')
+            tag['class'] = 'author'
+            tag.insert(0,authorname.strip())
+            author.replaceWith(tag)
+
+        # Change the intro from a p to a div
+        intro = soup.find('p',{'class':'intro'})
+        if intro and intro.contents:
+            tag = Tag(soup,'div')
+            tag['class'] = 'intro'
+            tag.insert(0,intro.contents[0])
+            intro.replaceWith(tag)
+
+        # Change span class=imagedescription to div
+        descr = soup.find('span',{'class':'imagedescription'})
+        if descr:
+            tag = Tag(soup,'div')
+            tag['class'] = 'imagedescription'
+            tag.insert(0,descr.renderContents())
+            descr.replaceWith(tag)
+
+        # The references are in a list, let's make them simpler
+        # NOTE(review): the id is spelled 'article-refrences' - confirm against the site markup
+        reflistcont =  soup.find('ul',{'id':'article-refrences'})
+        if reflistcont and reflistcont.li:
+            reflist = reflistcont.li.renderContents()
+            tag = Tag(soup,'div')
+            tag['class'] = 'article-references'
+            tag.insert(0,reflist)
+            reflistcont.replaceWith(tag)
+
+        # Within the id=content div, we need to remove all the stuff after the end of the class=entry-content
+        entrycontent = soup.find('div',{'class':'entry-content'})
+        for nextSibling in (entrycontent.findNextSiblings() if entrycontent else []):
+            nextSibling.extract()
+
+        return soup

From 2ebf94812e8ed82491b7579333f66cde7ce15096 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 17 Jan 2011 17:13:19 +0800
Subject: [PATCH 061/118] started updating manuals, fix lit postprocess to
 handle content in pre tags correctly

---
 src/calibre/ebooks/conversion/plumber.py |  6 ++-
 src/calibre/ebooks/lit/input.py          |  9 ++--
 src/calibre/manual/conversion.rst        | 59 ++++++++++++++++++++----
 3 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 249f848661..6fdf7ddc68 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -492,7 +492,9 @@ OptionRecommendation(name='enable_heuristics',
 OptionRecommendation(name='markup_chapter_headings',
     recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Detect unformatted chapter headings and sub headings. Change ' 
-           'them to h2 and h3 tags.')),
+           'them to h2 and h3 tags.  This setting will not create a TOC, '
+           'but can be used in conjunction with structure detection to create '
+           'one.')),
            
 OptionRecommendation(name='italicize_common_cases',
     recommended_value=False, level=OptionRecommendation.LOW,
@@ -501,7 +503,7 @@ OptionRecommendation(name='italicize_common_cases',
            
 OptionRecommendation(name='fix_indents',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Turn indentation created from multiple &nbsp; entities '
+    help=_('Turn indentation created from multiple non-breaking space entities '
            'into CSS indents.')),
            
 OptionRecommendation(name='html_unwrap_factor',
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 7b822b68a6..ff8955939e 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin):
         from calibre.ebooks.lit.reader import LitReader
         from calibre.ebooks.conversion.plumber import create_oebbook
         self.log = log
-        return create_oebbook(log, stream, options, self, reader=LitReader)
+        return create_oebbook(log, stream, options, reader=LitReader)
 
     def postprocess_book(self, oeb, opts, log):
         from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
@@ -39,10 +39,13 @@ class LITInput(InputFormatPlugin):
                 body = body[0]
                 if len(body) == 1 and body[0].tag == XHTML('pre'):
                     pre = body[0]
-                    from calibre.ebooks.txt.processor import convert_basic
+                    from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+                    separate_paragraphs_single_line
                     from lxml import etree
                     import copy
-                    html = convert_basic(pre.text).replace('<html>',
+                    html = separate_paragraphs_single_line(pre.text)
+                    html = preserve_spaces(html)
+                    html = convert_basic(html).replace('<html>',
                             '<html xmlns="%s">'%XHTML_NS)
                     root = etree.fromstring(html)
                     body = XPath('//h:body')(root)
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 4b2b169d72..3383708b72 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -255,6 +255,46 @@ you are producing are meant for a particular device type, choose the correspondi
 
 The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
 
+.. _heuristic-processing:
+
+Heuristic Processing
+---------------------
+
+:guilabel:`Preprocess input`
+    This option activates various algorithms that try to detect and correct common cases of
+    badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
+    Turn this option on if your input document suffers from bad formatting. But be aware that in
+    some cases, this option can lead to worse results, so use with care.
+
+:guilabel:`Line-unwrap factor`
+    This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
+    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
+    than the length of 40% of all lines in the document. 
+
+:guilabel:`Unwrap lines`
+    Lorem ipsum
+    
+:guilabel:`Detect and markup unformatted chapter headings and sub headings`
+    Lorem ipsum
+
+:guilabel:`Renumber sequences of <h1> or <h2> tags to prevent splitting`
+    Lorem ipsum
+    
+:guilabel:`Delete blank lines between paragraphs`
+    Lorem ipsum
+
+:guilabel:`Ensure scene breaks are consistently formatted`
+    Lorem ipsum
+
+:guilabel:`Remove unnecessary hyphens`
+    Lorem ipsum
+
+:guilabel:`Italicize common words and patterns`
+    Lorem ipsum
+
+:guilabel:`Replace entity indents with CSS indents`
+    Lorem ipsum
+
 .. _structure-detection:
 
 Structure Detection
@@ -330,16 +370,6 @@ There are a few more options in this section.
     two covers. This option will simply remove the first image from the source document, thereby
     ensuring that the converted book has only one cover, the one specified in |app|.
 
-:guilabel:`Preprocess input`
-    This option activates various algorithms that try to detect and correct common cases of
-    badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
-    Turn this option on if your input document suffers from bad formatting. But be aware that in
-    some cases, this option can lead to worse results, so use with care.
-
-:guilabel:`Line-unwrap factor`
-    This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
-    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
-    than the length of 40% of all lines in the document. 
     
 Table of Contents
 ------------------
@@ -500,6 +530,9 @@ more blank lines are a paragraph boundary::
 
 TXT input supports a number of options to differentiate how paragraphs are detected.
 
+    :guilabel:`Auto`
+        Analyzes the text file and attempts to determine how paragraphs are defined.
+
     :guilabel:`Treat each line as a paragraph`
         Assumes that every line is a paragraph::
 
@@ -518,6 +551,12 @@ TXT input supports a number of options to differentiate how paragraphs are detec
             This is the
             third.
 
+    :guilabel:`Unformatted`
+        Assumes that the document has no formatting, but does use hard line breaks.  Punctuation
+        and median line length are used to attempt to re-create paragraphs.
+
+    :guilabel:`Process using Textile`
+
     :guilabel:`Process using markdown`
         |app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown
         allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,

From 05730e1886c8562e819364c43a7fa58c172392d6 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 17 Jan 2011 19:00:14 +0800
Subject: [PATCH 062/118] insert horizontal rules for softbreaks when option is
 enabled

---
 src/calibre/ebooks/conversion/utils.py |  5 +++++
 src/calibre/manual/conversion.rst      | 15 +++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 15522d25e6..d9e5246223 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -23,6 +23,7 @@ class HeuristicProcessor(object):
         self.min_chapters = 1
         self.chapters_no_title = 0
         self.chapters_with_title = 0
+        self.blanks_deleted = False
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
@@ -422,6 +423,7 @@ class HeuristicProcessor(object):
         # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
+            self.blanks_deleted = True
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
 
@@ -479,6 +481,9 @@ class HeuristicProcessor(object):
         if getattr(self.extra_opts, 'format_scene_breaks', False):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
+            if not self.blanks_deleted:
+                html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:
             # put back non-breaking spaces in empty paragraphs to preserve original formatting
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 3383708b72..96a8e30e3c 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -530,17 +530,18 @@ more blank lines are a paragraph boundary::
 
 TXT input supports a number of options to differentiate how paragraphs are detected.
 
-    :guilabel:`Auto`
-        Analyzes the text file and attempts to determine how paragraphs are defined.
+    :guilabel:`Paragraph Style: Auto`
+        Analyzes the text file and attempts to automatically determine how paragraphs are defined.  This
+        option will generally work fine; if you get undesirable results, try one of the manual options.
 
-    :guilabel:`Treat each line as a paragraph`
+    :guilabel:`Paragraph Style: Single`
         Assumes that every line is a paragraph::
 
             This is the first.
             This is the second.
             This is the third.
         
-    :guilabel:`Assume print formatting`
+    :guilabel:`Paragraph Style: Print`
         Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
         the next line that starts with an indent is reached::
 
@@ -551,11 +552,13 @@ TXT input supports a number of options to differentiate how paragraphs are detec
             This is the
             third.
 
-    :guilabel:`Unformatted`
+    :guilabel:`Paragraph Style: Unformatted`
         Assumes that the document has no formatting, but does use hard line breaks.  Punctuation
         and median line length are used to attempt to re-create paragraphs.
 
-    :guilabel:`Process using Textile`
+    :guilabel:`Formatting Style: Auto`
+
+    :guilabel:`Formatting Style: Heuristic`
 
     :guilabel:`Process using markdown`
         |app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown

From a661a17f1f027ab2cbf9b006e85a2a366d4e4c37 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Mon, 17 Jan 2011 11:27:10 +0000
Subject: [PATCH 063/118] Change formatter to show an error if an unknown
 function is used.

---
 src/calibre/utils/formatter.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index 49b807ff1c..740e67bee8 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -274,9 +274,9 @@ class TemplateFormatter(string.Formatter):
                     colon += 1
 
                 funcs = formatter_functions.get_functions()
-                if fmt[colon:p] in funcs:
-                    field = fmt[colon:p]
-                    func = funcs[field]
+                fname = fmt[colon:p]
+                if fname in funcs:
+                    func = funcs[fname]
                     if func.arg_count == 2:
                         # only one arg expected. Don't bother to scan. Avoids need
                         # for escaping characters
@@ -292,6 +292,8 @@ class TemplateFormatter(string.Formatter):
                     else:
                         val = func.eval_(self, self.kwargs, self.book, self.locals,
                                         val, *args).strip()
+                else:
+                    return _('%s: unknown function')%fname
         if val:
             val = self._do_format(val, dispfmt)
         if not val:

From 20d8d908ee70d69c68bba1bb221e8e850fb51da7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 08:37:07 -0700
Subject: [PATCH 064/118] ...

---
 src/calibre/devices/sne/driver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/devices/sne/driver.py b/src/calibre/devices/sne/driver.py
index 04e5cd0d76..bb8d34c59c 100644
--- a/src/calibre/devices/sne/driver.py
+++ b/src/calibre/devices/sne/driver.py
@@ -33,6 +33,6 @@ class SNE(USBMS):
     STORAGE_CARD_VOLUME_LABEL = 'SNE Storage Card'
 
     EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Books'
-    SUPPORTS_SUB_DIRS = False
+    SUPPORTS_SUB_DIRS = True
 
 

From 9013a5d97dc73e6f3ea416b6b30a2cdf3522fcc0 Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Mon, 17 Jan 2011 08:53:18 -0700
Subject: [PATCH 065/118] GwR catalog 1.0 revisions

---
 resources/catalog/section_list_templates.py |  39 ++
 src/calibre/gui2/actions/catalog.py         |  18 +-
 src/calibre/library/catalog.py              | 550 ++++++++------------
 3 files changed, 279 insertions(+), 328 deletions(-)
 create mode 100644 resources/catalog/section_list_templates.py

diff --git a/resources/catalog/section_list_templates.py b/resources/catalog/section_list_templates.py
new file mode 100644
index 0000000000..de73147fcf
--- /dev/null
+++ b/resources/catalog/section_list_templates.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+# Title templates for the catalog section lists; src/calibre/library/catalog.py execfile()s this file.
+'''
+    Available fields:
+    {title}          Title of the book
+    {series}         Series name
+    {series_index}   Number of the book in the series
+    {rating}         Rating
+    {rating_parens}  Rating, in parentheses
+    {pubyear}        Year the book was published
+    {pubyear_parens} Year the book was published, in parentheses
+'''
+# Books by Author
+by_authors_normal_title_template = '{title} {pubyear_parens}'
+by_authors_series_title_template = '[{series_index}] {title} {pubyear_parens}'
+
+# Books by Title (the series template is applied when book['series'] is set)
+by_titles_normal_title_template = '{title}'
+by_titles_series_title_template = '{title} ({series} [{series_index}])'
+
+# Books by Series
+by_series_title_template = '[{series_index}] {title} {pubyear_parens}'
+
+# Books by Genre
+by_genres_normal_title_template = '{title} {pubyear_parens}'
+by_genres_series_title_template = '{series_index}. {title} {pubyear_parens}'
+
+# Recently Added
+by_recently_added_normal_title_template = '{title}'
+by_recently_added_series_title_template = '{title} ({series} [{series_index}])'
+
+# By Month added
+by_month_added_normal_title_template = '{title} {pubyear_parens}'
+by_month_added_series_title_template = '[{series_index}] {title} {pubyear_parens}'
\ No newline at end of file
diff --git a/src/calibre/gui2/actions/catalog.py b/src/calibre/gui2/actions/catalog.py
index 6d3bb539a2..1650c80d70 100644
--- a/src/calibre/gui2/actions/catalog.py
+++ b/src/calibre/gui2/actions/catalog.py
@@ -5,11 +5,11 @@ __license__   = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, shutil
+import re, os, shutil
 
 from PyQt4.Qt import QModelIndex
 
-from calibre.gui2 import error_dialog, choose_dir
+from calibre.gui2 import choose_dir, error_dialog, info_dialog, warning_dialog
 from calibre.gui2.tools import generate_catalog
 from calibre.utils.config import dynamic
 from calibre.gui2.actions import InterfaceAction
@@ -55,10 +55,16 @@ class GenerateCatalogAction(InterfaceAction):
 
     def catalog_generated(self, job):
         if job.result:
-            # Error during catalog generation
-            return error_dialog(self.gui, _('Catalog generation terminated'),
-                    job.result,
-                    show=True)
+            # Problems during catalog generation
+            dialog_title = job.result.pop(0)
+            if re.match('warning:', job.result[0].lower()):
+                job.result.append("Catalog generation complete.")
+                warning_dialog(self.gui, dialog_title, '\n'.join(job.result), show=True)
+            else:
+                job.result.append("Catalog generation terminated.")
+                error_dialog(self.gui, dialog_title,'\n'.join(job.result),show=True)
+                return
+
         if job.failed:
             return self.gui.job_exception(job)
         id = self.gui.library_view.model().add_catalog(job.catalog_file_path, job.catalog_title)
diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 087d40c4eb..f1c5e3ae65 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -546,9 +546,9 @@ class EPUB_MOBI(CatalogPlugin):
     name = 'Catalog_EPUB_MOBI'
     description = 'EPUB/MOBI catalog generator'
     supported_platforms = ['windows', 'osx', 'linux']
-    minimum_calibre_version = (0, 6, 34)
+    minimum_calibre_version = (0, 7, 40)
     author = 'Greg Riker'
-    version = (0, 0, 1)
+    version = (1, 0, 0)
     file_types = set(['epub','mobi'])
 
     THUMB_SMALLEST = "1.0"
@@ -900,15 +900,7 @@ class EPUB_MOBI(CatalogPlugin):
         '''
         Generates catalog source files from calibre database
 
-        Implementation notes
-        - 'Marker tags' in a book's metadata are used to flag special conditions:
-                    (Defaults)
-                    '~' : Do not catalog this book
-                    '+' : Mark this book as read (check mark) in lists
-                    '*' : Display trailing text as 'Note: <text>' in top frame next to cover
-            '[<source>] : Source of content (e.g., Amazon, Project Gutenberg).  Do not create genre
-
-        - Program flow
+        Flow of control:
             gui2.actions.catalog:generate_catalog()
             gui2.tools:generate_catalog() or library.cli:command_catalog()
             called from gui2.convert.gui_conversion:gui_catalog()
@@ -953,7 +945,7 @@ class EPUB_MOBI(CatalogPlugin):
             self.__creator = opts.creator
             self.__db = db
             self.__descriptionClip = opts.descriptionClip
-            self.__error = None
+            self.__error = []
             self.__generateForKindle = True if (self.opts.fmt == 'mobi' and \
                                        self.opts.output_profile and \
                                        self.opts.output_profile.startswith("kindle")) else False
@@ -1033,6 +1025,22 @@ class EPUB_MOBI(CatalogPlugin):
                 # +1 thumbs
                 self.__totalSteps += 3
 
+            # Load section list templates
+            templates = ['by_authors_normal_title_template',
+                         'by_authors_series_title_template',
+                         'by_titles_normal_title_template',
+                         'by_titles_series_title_template',
+                         'by_series_title_template',
+                         'by_genres_normal_title_template',
+                         'by_genres_series_title_template',
+                         'by_recently_added_normal_title_template',
+                         'by_recently_added_series_title_template',
+                         'by_month_added_normal_title_template',
+                         'by_month_added_series_title_template']
+            execfile(P(os.path.join('catalog','section_list_templates.py')),locals())
+            for t in templates:
+                setattr(self,t,eval(t))
+
         # Accessors
         if True:
             '''
@@ -1420,26 +1428,12 @@ class EPUB_MOBI(CatalogPlugin):
             '''
 
             self.updateProgressFullStep("Sorting database")
-
-            '''
-            # Sort titles case-insensitive, by author
-            self.booksByAuthor = sorted(self.booksByTitle,
-                                 key=lambda x:(x['author_sort'].upper(), x['author_sort'].upper()))
-            '''
-
             self.booksByAuthor = list(self.booksByTitle)
-            self.booksByAuthor.sort(self.author_compare)
-
-            if False and self.verbose:
-                self.opts.log.info("fetchBooksByAuthor(): %d books" % len(self.booksByAuthor))
-                self.opts.log.info(" %-30s %-20s %s" % ('title', 'series', 'series_index'))
-                for title in self.booksByAuthor:
-                    self.opts.log.info((u" %-30s %-20s%5s " % \
-                                        (title['title'][:30],
-                                         title['series'][:20] if title['series'] else '',
-                                         title['series_index'],
-                                         )).encode('utf-8'))
-                raise SystemExit
+            self.booksByAuthor = sorted(self.booksByAuthor, key=self.booksByAuthorSorter_author)
+#             for book in self.booksByAuthor:
+#                 print "{0:<30}  {1:<30}  {2:<30}".format(book['title'],book['author'],book['author_sort'])
+#             print
+#             stop
 
             # Build the unique_authors set from existing data
             authors = [(record['author'], record['author_sort'].capitalize()) for record in self.booksByAuthor]
@@ -1457,16 +1451,17 @@ class EPUB_MOBI(CatalogPlugin):
                     multiple_authors = True
 
                 if author != current_author and i:
-                    # Warn, exit if friendly matches previous, but sort doesn't
+                    # Exit if author matches previous, but author_sort doesn't match
                     if author[0] == current_author[0]:
                         error_msg = _('''
-\n*** Metadata error ***
-Inconsistent Author Sort values for Author '{0}', unable to continue building catalog.
+Inconsistent Author Sort values for Author '{0}', unable to continue building catalog.\n
 Select all books by '{0}', apply correct Author Sort value in Edit Metadata dialog,
 then rebuild the catalog.\n''').format(author[0])
-
+                        self.opts.log.warn('\n*** Metadata error ***')
                         self.opts.log.warn(error_msg)
-                        self.error = error_msg
+
+                        self.error.append('Metadata error')
+                        self.error.append(error_msg)
                         return False
 
                     # New author, save the previous author/sort/count
@@ -1496,16 +1491,8 @@ then rebuild the catalog.\n''').format(author[0])
             return True
 
         def fetchBooksByTitle(self):
-
             self.updateProgressFullStep("Fetching database")
 
-            # Get the database as a dictionary
-            # Sort by title
-            # Search is a string like this:
-            # not tag:<exclude_tag> author:"Riker"
-            # So we need to merge opts.exclude_tag with opts.search_text
-            # not tag:"~" author:"Riker"
-
             self.opts.sort_by = 'title'
 
             # Merge opts.exclude_tags with opts.search_text
@@ -1528,7 +1515,6 @@ then rebuild the catalog.\n''').format(author[0])
                 else:
                     self.opts.search_text = search_phrase
 
-            #print "fetchBooksByTitle(): opts.search_text: %s" % self.opts.search_text
             # Fetch the database as a dictionary
             data = self.plugin.search_sort_db(self.db, self.opts)
             data = self.processExclusions(data)
@@ -1536,8 +1522,6 @@ then rebuild the catalog.\n''').format(author[0])
             # Populate this_title{} from data[{},{}]
             titles = []
             for record in data:
-                if False:
-                    print "available record metadata:\n%s" % sorted(record.keys())
                 this_title = {}
 
                 this_title['id'] = record['id']
@@ -1547,7 +1531,6 @@ then rebuild the catalog.\n''').format(author[0])
                 if record['series']:
                     this_title['series'] = record['series']
                     this_title['series_index'] = record['series_index']
-                    this_title['title'] = self.generateSeriesTitle(this_title)
                 else:
                     this_title['series'] = None
                     this_title['series_index'] = 0.0
@@ -1572,7 +1555,12 @@ then rebuild the catalog.\n''').format(author[0])
                     this_title['publisher'] = re.sub('&', '&amp;', record['publisher'])
 
                 this_title['rating'] = record['rating'] if record['rating'] else 0
-                this_title['date'] = strftime(u'%B %Y', record['pubdate'].timetuple())
+
+                if re.match('0100-01-01',str(record['pubdate'].date())):
+                    this_title['date'] = None
+                else:
+                    this_title['date'] = strftime(u'%B %Y', record['pubdate'].timetuple())
+
                 this_title['timestamp'] = record['timestamp']
 
                 if record['comments']:
@@ -1646,7 +1634,7 @@ then rebuild the catalog.\n''').format(author[0])
                                                                title['title_sort'][0:40])).decode('mac-roman'))
                 return True
             else:
-                self.error = _("No books found to catalog.\nCheck 'Excluded books' criteria in E-book options.")
+                self.error.append( _("No books found to catalog.\nCheck 'Excluded books' criteria in E-book options."))
                 return False
 
         def fetchBookmarks(self):
@@ -1748,13 +1736,12 @@ then rebuild the catalog.\n''').format(author[0])
                 self.bookmarked_books = {}
 
         def generateHTMLDescriptions(self):
-            # Write each title to a separate HTML file in contentdir
+            '''
+            Write each title to a separate HTML file in contentdir
+            '''
             self.updateProgressFullStep("'Descriptions'")
 
             for (title_num, title) in enumerate(self.booksByTitle):
-                if False:
-                    self.opts.log.info("%3s: %s - %s" % (title['id'], title['title'], title['author']))
-
                 self.updateProgressMicroStep("Description %d of %d" % \
                                              (title_num, len(self.booksByTitle)),
                                              float(title_num*100/len(self.booksByTitle))/100)
@@ -1768,8 +1755,9 @@ then rebuild the catalog.\n''').format(author[0])
                 outfile.close()
 
         def generateHTMLByTitle(self):
-            # Write books by title A-Z to HTML file
-
+            '''
+            Write books by title A-Z to HTML file
+            '''
             self.updateProgressFullStep("'Titles'")
 
             soup = self.generateHTMLEmptyHeader("Books By Alpha Title")
@@ -1807,22 +1795,11 @@ then rebuild the catalog.\n''').format(author[0])
             current_letter = ""
 
             # Re-sort title list without leading series/series_index
+            # Incoming title <series> <series_index>: <title>
             if not self.useSeriesPrefixInTitlesSection:
                 nspt = deepcopy(self.booksByTitle)
-                for book in nspt:
-                    if book['series']:
-                        tokens = book['title'].partition(':')
-                        book['title'] = '%s (%s)' % (tokens[2].strip(), tokens[0])
-                        book['title_sort'] = self.generateSortTitle(book['title'])
-                nspt = sorted(nspt,
-                                     key=lambda x:(x['title_sort'].upper(), x['title_sort'].upper()))
+                nspt = sorted(nspt, key=lambda x:(x['title_sort'].upper(), x['title_sort'].upper()))
                 self.booksByTitle_noSeriesPrefix = nspt
-                if False and self.verbose:
-                    self.opts.log.info("no_series_prefix_titles: %d books" % len(nspt))
-                    self.opts.log.info(" %-40s %-40s" % ('title', 'title_sort'))
-                    for title in nspt:
-                        self.opts.log.info((u" %-40s %-40s" % (title['title'][0:40],
-                                                               title['title_sort'][0:40])).encode('utf-8'))
 
             # Loop through the books by title
             title_list = self.booksByTitle
@@ -1878,7 +1855,14 @@ then rebuild the catalog.\n''').format(author[0])
                 aTag = Tag(soup, "a")
                 if self.opts.generate_descriptions:
                     aTag['href'] = "book_%d.html" % (int(float(book['id'])))
-                aTag.insert(0,escape(book['title']))
+
+                # Generate the title from the template
+                args = self.generateFormatArgs(book)
+                if book['series']:
+                    formatted_title = self.by_titles_series_title_template.format(**args).rstrip()
+                else:
+                    formatted_title = self.by_titles_normal_title_template.format(**args).rstrip()
+                aTag.insert(0,NavigableString(escape(formatted_title)))
                 pBookTag.insert(ptc, aTag)
                 ptc += 1
 
@@ -1916,7 +1900,9 @@ then rebuild the catalog.\n''').format(author[0])
             self.htmlFileList_1.append("content/ByAlphaTitle.html")
 
         def generateHTMLByAuthor(self):
-            # Write books by author A-Z
+            '''
+            Write books by author A-Z
+            '''
             self.updateProgressFullStep("'Authors'")
 
             friendly_name = "Authors"
@@ -1953,7 +1939,8 @@ then rebuild the catalog.\n''').format(author[0])
             current_author = ''
             current_letter = ''
             current_series = None
-            for book in self.booksByAuthor:
+            for book in sorted(self.booksByAuthor, key = self.booksByAuthorSorter_author_sort):
+
                 book_count += 1
                 if self.letter_or_symbol(book['author_sort'][0].upper()) != current_letter :
                     # Start a new letter with Index letter
@@ -2067,14 +2054,18 @@ then rebuild the catalog.\n''').format(author[0])
                 aTag = Tag(soup, "a")
                 if self.opts.generate_descriptions:
                     aTag['href'] = "book_%d.html" % (int(float(book['id'])))
-                # Use series, series index if avail else title, + year of publication
+
+                # Generate the title from the template
+                args = self.generateFormatArgs(book)
                 if current_series:
-                    aTag.insert(0,'%s (%s)' % (escape(book['title'][len(book['series'])+1:]),
-                                               book['date'].split()[1]))
+                    #aTag.insert(0,'%s%s' % (escape(book['title'][len(book['series'])+1:]),pubyear))
+                    formatted_title = self.by_authors_series_title_template.format(**args).rstrip()
                 else:
-                    aTag.insert(0,'%s (%s)' % (escape(book['title']),
-                                               book['date'].split()[1]))
+                    #aTag.insert(0,'%s%s' % (escape(book['title']), pubyear))
+                    formatted_title = self.by_authors_normal_title_template.format(**args).rstrip()
                     non_series_books += 1
+                aTag.insert(0,NavigableString(escape(formatted_title)))
+
                 pBookTag.insert(ptc, aTag)
                 ptc += 1
 
@@ -2111,7 +2102,6 @@ then rebuild the catalog.\n''').format(author[0])
             # Add the divTag to the body
             body.insert(btc, divTag)
 
-
             # Write the generated file to contentdir
             outfile_spec = "%s/ByAlphaAuthor.html" % (self.contentDir)
             outfile = open(outfile_spec, 'w')
@@ -2120,13 +2110,15 @@ then rebuild the catalog.\n''').format(author[0])
             self.htmlFileList_1.append("content/ByAlphaAuthor.html")
 
         def generateHTMLByDateAdded(self):
-            # Write books by reverse chronological order
+            '''
+            Write books by reverse chronological order
+            '''
             self.updateProgressFullStep("'Recently Added'")
 
             def add_books_to_HTML_by_month(this_months_list, dtc):
                 if len(this_months_list):
 
-                    this_months_list.sort(self.author_compare)
+                    this_months_list = sorted(this_months_list, key=self.booksByAuthorSorter_author_sort)
 
                     # Create a new month anchor
                     date_string = strftime(u'%B %Y', current_date.timetuple())
@@ -2156,16 +2148,6 @@ then rebuild the catalog.\n''').format(author[0])
                             divTag.insert(dtc,pAuthorTag)
                             dtc += 1
 
-                        '''
-                        # Insert an <hr /> between non-series and series
-                        if not current_series and non_series_books and new_entry['series']:
-                            # Insert an <hr />
-                            hrTag = Tag(soup,'hr')
-                            hrTag['class'] = "series_divider"
-                            divTag.insert(dtc,hrTag)
-                            dtc += 1
-                        '''
-
                         # Check for series
                         if new_entry['series'] and new_entry['series'] != current_series:
                             # Start a new series
@@ -2213,11 +2195,15 @@ then rebuild the catalog.\n''').format(author[0])
                         aTag = Tag(soup, "a")
                         if self.opts.generate_descriptions:
                             aTag['href'] = "book_%d.html" % (int(float(new_entry['id'])))
+
+                        # Generate the title from the template
+                        args = self.generateFormatArgs(new_entry)
                         if current_series:
-                            aTag.insert(0,escape(new_entry['title'][len(new_entry['series'])+1:]))
+                            formatted_title = self.by_month_added_series_title_template.format(**args).rstrip()
                         else:
-                            aTag.insert(0,escape(new_entry['title']))
+                            formatted_title = self.by_month_added_normal_title_template.format(**args).rstrip()
                             non_series_books += 1
+                        aTag.insert(0,NavigableString(escape(formatted_title)))
                         pBookTag.insert(ptc, aTag)
                         ptc += 1
 
@@ -2265,7 +2251,14 @@ then rebuild the catalog.\n''').format(author[0])
                         aTag = Tag(soup, "a")
                         if self.opts.generate_descriptions:
                             aTag['href'] = "book_%d.html" % (int(float(new_entry['id'])))
-                        aTag.insert(0,escape(new_entry['title']))
+
+                        # Generate the title from the template
+                        args = self.generateFormatArgs(new_entry)
+                        if new_entry['series']:
+                            formatted_title = self.by_recently_added_series_title_template.format(**args).rstrip()
+                        else:
+                            formatted_title = self.by_recently_added_normal_title_template.format(**args).rstrip()
+                        aTag.insert(0,NavigableString(escape(formatted_title)))
                         pBookTag.insert(ptc, aTag)
                         ptc += 1
 
@@ -2323,17 +2316,12 @@ then rebuild the catalog.\n''').format(author[0])
             divTag = Tag(soup, "div")
             dtc = 0
 
-            # Add books by date range
+            # >>>> Books by date range <<<<
             if self.useSeriesPrefixInTitlesSection:
                 self.booksByDateRange = sorted(self.booksByTitle,
                                  key=lambda x:(x['timestamp'], x['timestamp']),reverse=True)
             else:
                 nspt = deepcopy(self.booksByTitle)
-                for book in nspt:
-                    if book['series']:
-                        tokens = book['title'].partition(':')
-                        book['title'] = '%s (%s)' % (tokens[2].strip(), tokens[0])
-                        book['title_sort'] = self.generateSortTitle(book['title'])
                 self.booksByDateRange = sorted(nspt, key=lambda x:(x['timestamp'], x['timestamp']),reverse=True)
 
             date_range_list = []
@@ -2356,15 +2344,6 @@ then rebuild the catalog.\n''').format(author[0])
                 dtc = add_books_to_HTML_by_date_range(date_range_list, date_range, dtc)
                 date_range_list = [book]
 
-            '''
-            if books_added_in_date_range:
-                # Add an <hr> separating date ranges from months
-                hrTag = Tag(soup,'hr')
-                hrTag['class'] = "description_divider"
-                divTag.insert(dtc,hrTag)
-                dtc += 1
-            '''
-
             # >>>> Books by month <<<<
             # Sort titles case-insensitive for by month using series prefix
             self.booksByMonth = sorted(self.booksByTitle,
@@ -2395,7 +2374,9 @@ then rebuild the catalog.\n''').format(author[0])
             self.htmlFileList_2.append("content/ByDateAdded.html")
 
         def generateHTMLByDateRead(self):
-            # Write books by active bookmarks
+            '''
+            Write books by active bookmarks
+            '''
             friendly_name = 'Recently Read'
             self.updateProgressFullStep("'%s'" % friendly_name)
             if not self.bookmarked_books:
@@ -2533,32 +2514,6 @@ then rebuild the catalog.\n''').format(author[0])
             self.booksByDateRead = sorted(bookmarked_books,
                              key=lambda x:(x['bookmark_timestamp'], x['bookmark_timestamp']),reverse=True)
 
-            '''
-            # >>>> Recently by date range <<<<
-            date_range_list = []
-            today_time = datetime.datetime.utcnow()
-            today_time.replace(hour=23, minute=59, second=59)
-            books_added_in_date_range = False
-            for (i, date) in enumerate(self.DATE_RANGE):
-                date_range_limit = self.DATE_RANGE[i]
-                if i:
-                    date_range = '%d to %d days ago' % (self.DATE_RANGE[i-1], self.DATE_RANGE[i])
-                else:
-                    date_range = 'Last %d days' % (self.DATE_RANGE[i])
-
-                for book in self.booksByDateRead:
-                    bookmark_time = datetime.datetime.utcfromtimestamp(book['bookmark_timestamp'])
-                    delta = today_time-bookmark_time
-                    if delta.days <= date_range_limit:
-                        date_range_list.append(book)
-                        books_added_in_date_range = True
-                    else:
-                        break
-
-                dtc = add_books_to_HTML_by_date_range(date_range_list, date_range, dtc)
-                date_range_list = [book]
-            '''
-
             # >>>> Recently read by day <<<<
             current_date = datetime.date.fromordinal(1)
             todays_list = []
@@ -2713,10 +2668,15 @@ then rebuild the catalog.\n''').format(author[0])
                 # Use series, series index if avail else just title
                 #aTag.insert(0,'%d. %s &middot; %s' % (book['series_index'],escape(book['title']), ' & '.join(book['authors'])))
 
-                # Link to book
-                aTag.insert(0,'%d. %s (%s)' % (book['series_index'],
-                                               escape(book['title']),
-                                               strftime(u'%Y', book['pubdate'].timetuple())))
+                # Reassert 'date' since this is the result of a new search
+                if re.match('0100-01-01',str(book['pubdate'].date())):
+                    book['date'] = None
+                else:
+                    book['date'] = strftime(u'%B %Y', book['pubdate'].timetuple())
+
+                args = self.generateFormatArgs(book)
+                formatted_title = self.by_series_title_template.format(**args).rstrip()
+                aTag.insert(0,NavigableString(escape(formatted_title)))
                 pBookTag.insert(ptc, aTag)
                 ptc += 1
 
@@ -2760,10 +2720,11 @@ then rebuild the catalog.\n''').format(author[0])
             self.htmlFileList_1.append("content/BySeries.html")
 
         def generateHTMLByTags(self):
-            # Generate individual HTML files for each tag, e.g. Fiction, Nonfiction ...
-            # Note that special tags - ~+*[] -  have already been filtered from books[]
-            # There may be synonomous tags
-
+            '''
+            Generate individual HTML files for each tag, e.g. Fiction, Nonfiction ...
+            Note that special tags have already been filtered from books[]
+            There may be synonymous tags
+            '''
             self.updateProgressFullStep("'Genres'")
 
             self.genre_tags_dict = self.filterDbTags(self.db.all_tags())
@@ -2787,6 +2748,8 @@ then rebuild the catalog.\n''').format(author[0])
                         this_book['tags'] = book['tags']
                         this_book['id'] = book['id']
                         this_book['series'] = book['series']
+                        this_book['series_index'] = book['series_index']
+                        this_book['date'] = book['date']
                         normalized_tag = self.genre_tags_dict[friendly_tag]
                         genre_tag_list = [key for genre in genre_list for key in genre]
                         if normalized_tag in genre_tag_list:
@@ -2843,13 +2806,7 @@ then rebuild the catalog.\n''').format(author[0])
                             unique_authors.append((current_author[0], current_author[1], books_by_current_author))
                         else:
                             books_by_current_author += 1
-                    '''
-                    # Extract the unique entries
-                    unique_authors = []
-                    for author in authors:
-                        if not author in unique_authors:
-                            unique_authors.append(author)
-                    '''
+
                     # Write the genre book list as an article
                     titles_spanned = self.generateHTMLByGenre(genre, True if index==0 else False,
                                           genre_tag_set[genre],
@@ -2863,18 +2820,14 @@ then rebuild the catalog.\n''').format(author[0])
                                               'books':genre_tag_set[genre],
                                               'titles_spanned':titles_spanned})
 
-            if False and self.opts.verbose:
-                for genre in master_genre_list:
-                    print "genre['tag']: %s" % genre['tag']
-                    for book in genre['books']:
-                        print book['title']
             self.genres = master_genre_list
 
         def generateThumbnails(self):
-            # Generate a thumbnail per cover.  If a current thumbnail exists, skip
-            # If a cover doesn't exist, use default
-            # Return list of active thumbs
-
+            '''
+            Generate a thumbnail per cover.  If a current thumbnail exists, skip
+            If a cover doesn't exist, use default
+            Return list of active thumbs
+            '''
             self.updateProgressFullStep("'Thumbnails'")
             thumbs = ['thumbnail_default.jpg']
             image_dir = "%s/images" % self.catalogPath
@@ -2886,45 +2839,51 @@ then rebuild the catalog.\n''').format(author[0])
 
                 thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
                 thumb_generated = True
+                valid_cover = True
                 try:
-                    self.generateThumbnail(title, image_dir, thumb_file)
                     thumbs.append("thumbnail_%d.jpg" % int(title['id']))
+                    self.generateThumbnail(title, image_dir, thumb_file)
                 except:
+                    if 'cover' in title and os.path.exists(title['cover']):
+                        valid_cover = False
+                        self.opts.log.warn(" *** Invalid cover file for '%s' ***" % (title['title']))
+                        if not self.error:
+                            self.error.append('Invalid cover files')
+                        self.error.append("Warning: invalid cover file for '%s', default cover substituted.\n" % (title['title']))
                     thumb_generated = False
 
-
                 if not thumb_generated:
-                    # Use default cover
-                    if False and self.verbose:
-                        self.opts.log.warn(" using default cover for '%s'" % \
-                        (title['title']))
+                    self.opts.log.warn(" using default cover for '%s'" % (title['title']))
                     # Check to make sure default is current
                     # Check to see if thumbnail exists
-                    thumb_fp = "%s/thumbnail_default.jpg" % (image_dir)
-                    cover = "%s/DefaultCover.png" % (self.catalogPath)
+                    default_thumb_fp = os.path.join(image_dir,"thumbnail_default.jpg")
+                    cover = os.path.join(self.catalogPath, "DefaultCover.png")
                     if not os.path.exists(cover):
                         shutil.copyfile(I('book.png'), cover)
 
-                    if os.path.isfile(thumb_fp):
+                    if os.path.isfile(default_thumb_fp):
                         # Check to see if default cover is newer than thumbnail
                         # os.path.getmtime() = modified time
                         # os.path.ctime() = creation time
                         cover_timestamp = os.path.getmtime(cover)
-                        thumb_timestamp = os.path.getmtime(thumb_fp)
+                        thumb_timestamp = os.path.getmtime(default_thumb_fp)
                         if thumb_timestamp < cover_timestamp:
                             if False and self.verbose:
                                 self.opts.log.warn("updating thumbnail_default for %s" % title['title'])
-                            #title['cover'] = "%s/DefaultCover.jpg" % self.catalogPath
+                            #title['cover'] = os.path.join(self.catalogPath,"DefaultCover.jpg")
                             title['cover'] = cover
-                            self.generateThumbnail(title, image_dir, "thumbnail_default.jpg")
+                            self.generateThumbnail(title, image_dir,
+                                                "thumbnail_default.jpg" if valid_cover else thumb_file)
                     else:
                         if False and self.verbose:
                             self.opts.log.warn(" generating new thumbnail_default.jpg")
-                        #title['cover'] = "%s/DefaultCover.jpg" % self.catalogPath
+                        #title['cover'] = os.path.join(self.catalogPath,"DefaultCover.jpg")
                         title['cover'] = cover
-                        self.generateThumbnail(title, image_dir, "thumbnail_default.jpg")
+                        self.generateThumbnail(title, image_dir,
+                                                "thumbnail_default.jpg" if valid_cover else thumb_file)
 
-            # Write the thumb_width to the file validating cache contents
+
+            # Write thumb_width to the file, validating cache contents
             # Allows detection of aborted catalog builds
             with ZipFile(self.__archive_path, mode='a') as zfw:
                 zfw.writestr('thumb_width', self.opts.thumb_width)
@@ -3162,15 +3121,17 @@ then rebuild the catalog.\n''').format(author[0])
                 navLabelTag = Tag(ncx_soup, "navLabel")
                 textTag = Tag(ncx_soup, "text")
                 if book['series']:
-                    tokens = list(book['title'].partition(':'))
+                    series_index = str(book['series_index'])
+                    if series_index.endswith('.0'):
+                        series_index = series_index[:-2]
                     if self.generateForKindle:
                         # Don't include Author for Kindle
-                        textTag.insert(0, NavigableString(self.formatNCXText('%s (%s)' % \
-                                                      (tokens[2].strip(), tokens[0]), dest='title')))
+                        textTag.insert(0, NavigableString(self.formatNCXText('%s (%s [%s])' %
+                                       (book['title'], book['series'], series_index), dest='title')))
                     else:
                         # Include Author for non-Kindle
-                        textTag.insert(0, NavigableString(self.formatNCXText('%s &middot; %s (%s)' % \
-                                                      (tokens[2].strip(), book['author'], tokens[0]), dest='title')))
+                        textTag.insert(0, NavigableString(self.formatNCXText('%s (%s [%s]) &middot; %s ' %
+                                       (book['title'], book['series'], series_index, book['author']), dest='title')))
                 else:
                     if self.generateForKindle:
                         # Don't include Author for Kindle
@@ -3725,43 +3686,6 @@ then rebuild the catalog.\n''').format(author[0])
                     add_to_master_date_range_list(current_titles_list)
                 current_titles_list = [book['title']]
 
-            '''
-            # Add *article* entries for each populated date range
-            # master_date_range_list{}: [0]:titles list [1]:datestr
-            for books_by_date_range in master_date_range_list:
-                navPointByDateRangeTag = Tag(soup, 'navPoint')
-                navPointByDateRangeTag['class'] = "article"
-                navPointByDateRangeTag['id'] = "%s-ID" %  books_by_date_range[1].replace(' ','')
-                navPointTag['playOrder'] = self.playOrder
-                self.playOrder += 1
-                navLabelTag = Tag(soup, 'navLabel')
-                textTag = Tag(soup, 'text')
-                textTag.insert(0, NavigableString(books_by_date_range[1]))
-                navLabelTag.insert(0, textTag)
-                navPointByDateRangeTag.insert(0,navLabelTag)
-                contentTag = Tag(soup, 'content')
-                contentTag['src'] = "%s#bdr_%s" % (HTML_file,
-                    books_by_date_range[1].replace(' ',''))
-
-                navPointByDateRangeTag.insert(1,contentTag)
-
-                if self.generateForKindle:
-                    cmTag = Tag(soup, '%s' % 'calibre:meta')
-                    cmTag['name'] = "description"
-                    cmTag.insert(0, NavigableString(books_by_date_range[0]))
-                    navPointByDateRangeTag.insert(2, cmTag)
-
-                    cmTag = Tag(soup, '%s' % 'calibre:meta')
-                    cmTag['name'] = "author"
-                    navStr = '%d titles' % books_by_date_range[2] if books_by_date_range[2] > 1 else \
-                             '%d title' % books_by_date_range[2]
-                    cmTag.insert(0, NavigableString(navStr))
-                    navPointByDateRangeTag.insert(3, cmTag)
-
-                navPointTag.insert(nptc, navPointByDateRangeTag)
-                nptc += 1
-            '''
-
             # Create an NCX article entry for each populated day
             # Loop over the booksByDate list, find start of each month,
             # add description_preview_count titles
@@ -3944,7 +3868,8 @@ then rebuild the catalog.\n''').format(author[0])
             outfile = open("%s/%s.ncx" % (self.catalogPath, self.basename), 'w')
             outfile.write(self.ncxSoup.prettify())
 
-        # Helpers
+
+        # --------------- Helpers ---------------
         def author_to_author_sort(self, author):
             tokens = author.split()
             tokens = tokens[-1:] + tokens[:-1]
@@ -3952,45 +3877,39 @@ then rebuild the catalog.\n''').format(author[0])
                 tokens[0] += ','
             return ' '.join(tokens).capitalize()
 
-        def author_compare(self,x,y):
-            # Return -1 if x<y
-            # Return  0 if x==y
-            # Return  1 if x>y
-
-            # Different authors - sort by author_sort
-            if x['author_sort'].capitalize() > y['author_sort'].capitalize():
-                return 1
-            elif x['author_sort'].capitalize() < y['author_sort'].capitalize():
-                return -1
+        def booksByAuthorSorter_author_sort(self, book):
+            '''
+            Return a sort key built from author_sort: non-series books (by title_sort) before series books (by series, then zero-padded series_index)
+            '''
+            if not book['series']:
+                key = '%s %s' % (book['author_sort'],
+                                 book['title_sort'].capitalize())
             else:
-                # Same author
-                if x['series'] != y['series']:
-                    # One title is a series, the other is not
-                    if not x['series']:
-                        # Sort regular titles < series titles
-                        return -1
-                    elif not y['series']:
-                        return 1
+                index = book['series_index']
+                integer = int(index)
+                fraction = index-integer
+                series_index = '%04d%s' % (integer, str('%0.4f' % fraction).lstrip('0'))
+                key = '%s ~%s %s' % (book['author_sort'],
+                                     self.generateSortTitle(book['series']),
+                                     series_index)
+            return key
 
-                    # Different series
-                    if x['title_sort'].lstrip() > y['title_sort'].lstrip():
-                        return 1
-                    else:
-                        return -1
-                else:
-                    # Same series
-                    if x['series'] == y['series']:
-                        if float(x['series_index']) > float(y['series_index']):
-                            return 1
-                        elif float(x['series_index']) < float(y['series_index']):
-                            return -1
-                        else:
-                            return 0
-                    else:
-                        if x['series'] > y['series']:
-                            return 1
-                        else:
-                            return -1
+        def booksByAuthorSorter_author(self, book):
+            '''
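+            Return a sort key built from author_to_author_sort(author): non-series books (by title_sort) before series books (by series, then zero-padded series_index)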
+            '''
+            if not book['series']:
+                key = '%s %s' % (self.author_to_author_sort(book['author']),
+                                 book['title_sort'].capitalize())
+            else:
+                index = book['series_index']
+                integer = int(index)
+                fraction = index-integer
+                series_index = '%04d%s' % (integer, str('%0.4f' % fraction).lstrip('0'))
+                key = '%s ~%s %s' % (self.author_to_author_sort(book['author']),
+                                     self.generateSortTitle(book['series']),
+                                     series_index)
+            return key
 
         def calculateThumbnailSize(self):
             ''' Calculate thumbnail dimensions based on device DPI.  Scale Kindle by 50% '''
@@ -4155,6 +4074,20 @@ then rebuild the catalog.\n''').format(author[0])
             # Strip white space to ''
             return re.sub("\W","", author)
 
+        def generateFormatArgs(self, book):
+            series_index = str(book['series_index'])
+            if series_index.endswith('.0'):
+                series_index = series_index[:-2]
+            args = dict(
+                    title = book['title'],
+                    series = book['series'],
+                    series_index = series_index,
+                    rating = self.generateRatingString(book),
+                    rating_parens = '(%s)' % self.generateRatingString(book) if 'rating' in book else '',
+                    pubyear = book['date'].split()[1] if book['date'] else '',
+                    pubyear_parens = "(%s)" % book['date'].split()[1] if book['date'] else '')
+            return args
+
         def generateHTMLByGenre(self, genre, section_head, books, outfile):
             # Write an HTML file of this genre's book list
             # Return a list with [(first_author, first_book), (last_author, last_book)]
@@ -4201,16 +4134,6 @@ then rebuild the catalog.\n''').format(author[0])
                     divTag.insert(dtc,pAuthorTag)
                     dtc += 1
 
-                '''
-                # Insert an <hr /> between non-series and series
-                if not current_series and non_series_books and book['series']:
-                    # Insert an <hr />
-                    hrTag = Tag(soup,'hr')
-                    hrTag['class'] = "series_divider"
-                    divTag.insert(dtc,hrTag)
-                    dtc += 1
-                '''
-
                 # Check for series
                 if book['series'] and book['series'] != current_series:
                     # Start a new series
@@ -4235,17 +4158,6 @@ then rebuild the catalog.\n''').format(author[0])
                 pBookTag = Tag(soup, "p")
                 ptc = 0
 
-                '''
-                # This if clause does not display MISSING_SYMBOL for wishlist items
-                # If this is the wishlist_tag genre, don't show missing symbols
-                # normalized_wishlist_tag = self.genre_tags_dict[self.opts.wishlist_tag]
-                if self.opts.wishlist_tag in book['tags'] and \
-                   self.genre_tags_dict[self.opts.wishlist_tag] != genre:
-                    pBookTag['class'] = "wishlist_item"
-                    pBookTag.insert(ptc,NavigableString(self.MISSING_SYMBOL))
-                    ptc += 1
-                '''
-
                 #  book with read|reading|unread symbol or wishlist item
                 if self.opts.wishlist_tag in book.get('tags', []):
                         pBookTag['class'] = "wishlist_item"
@@ -4271,12 +4183,18 @@ then rebuild the catalog.\n''').format(author[0])
                 aTag = Tag(soup, "a")
                 if self.opts.generate_descriptions:
                     aTag['href'] = "book_%d.html" % (int(float(book['id'])))
-                # Use series, series index if avail else just title
+
+                # Generate the title from the template
+                args = self.generateFormatArgs(book)
                 if current_series:
-                    aTag.insert(0,escape(book['title'][len(book['series'])+1:]))
+                    #aTag.insert(0,escape(book['title'][len(book['series'])+1:]))
+                    formatted_title = self.by_genres_series_title_template.format(**args).rstrip()
                 else:
-                    aTag.insert(0,escape(book['title']))
+                    #aTag.insert(0,escape(book['title']))
+                    formatted_title = self.by_genres_normal_title_template.format(**args).rstrip()
                     non_series_books += 1
+                aTag.insert(0,NavigableString(escape(formatted_title)))
+
                 pBookTag.insert(ptc, aTag)
                 ptc += 1
 
@@ -4322,36 +4240,21 @@ then rebuild the catalog.\n''').format(author[0])
                             xmlns=XHTML_NS,
                             )
 
-                generated_html = P('catalog/template.xhtml',
+                generated_html = P(os.path.join('catalog','template.xhtml'),
                         data=True).decode('utf-8').format(**args)
                 generated_html = substitute_entites(generated_html)
                 return BeautifulSoup(generated_html)
 
-            if False:
-                print "title metadata:\n%s" % ', '.join(sorted(book.keys()))
-            if False:
-                for item in sorted(book.keys()):
-                    try:
-                        print "%s: %s%s" % (item, book[item][:50], '...' if len(book[item])>50 else '')
-                    except:
-                        print "%s: %s" % (item, book[item])
-
             # Generate the template arguments
-            css = P('catalog/stylesheet.css', data=True).decode('utf-8')
-            title_str = escape(book['title'])
-
-            # Title/series
+            css = P(os.path.join('catalog','stylesheet.css'), data=True).decode('utf-8')
+            title_str = title = escape(book['title'])
+            series = ''
+            series_index = ''
             if book['series']:
-                series_id, _, title = book['title'].partition(':')
-                title = escape(title.strip())
                 series = escape(book['series'])
                 series_index = str(book['series_index'])
                 if series_index.endswith('.0'):
                     series_index = series_index[:-2]
-            else:
-                title = escape(book['title'])
-                series = ''
-                series_index = ''
 
             # Author, author_prefix (read|reading|none symbol or missing symbol)
             author = book['author']
@@ -4392,7 +4295,10 @@ then rebuild the catalog.\n''').format(author[0])
 
             # Date of publication
             pubdate = book['date']
-            pubmonth, pubyear = pubdate.split(' ')
+            if pubdate:
+                pubmonth, pubyear = pubdate.split(' ')
+            else:
+                pubmonth = pubyear = ''
 
             # Thumb
             _soup = BeautifulSoup('<html>',selfClosingTags=['img'])
@@ -4525,7 +4431,7 @@ then rebuild the catalog.\n''').format(author[0])
         def generateMastheadImage(self, out_path):
             from calibre.ebooks.conversion.config import load_defaults
             from calibre.utils.fonts import fontconfig
-            font_path = default_font = P('fonts/liberation/LiberationSerif-Bold.ttf')
+            font_path = default_font = P(os.path.join('fonts','liberation','LiberationSerif-Bold.ttf'))
             recs = load_defaults('mobi_output')
             masthead_font_family = recs.get('masthead_font', 'Default')
 
@@ -4562,16 +4468,15 @@ then rebuild the catalog.\n''').format(author[0])
             draw.text((left, top), text, fill=(0,0,0), font=font)
             img.save(open(out_path, 'wb'), 'GIF')
 
-        def generateSeriesTitle(self, title):
-            if float(title['series_index']) - int(title['series_index']):
-                series_title = '%s %4.2f: %s' % (title['series'],
-                                                title['series_index'],
-                                                title['title'])
-            else:
-                series_title = '%s %d: %s' % (title['series'],
-                                             title['series_index'],
-                                             title['title'])
-            return series_title
+        def generateRatingString(self, book):
+            rating = ''
+            if 'rating' in book:
+                stars = int(book['rating']) / 2
+                if stars:
+                    star_string = self.FULL_RATING_SYMBOL * stars
+                    empty_stars = self.EMPTY_RATING_SYMBOL * (5 - stars)
+                    rating = '%s%s' % (star_string,empty_stars)
+            return rating
 
         def generateShortDescription(self, description, dest=None):
             # Truncate the description, on word boundaries if necessary
@@ -4610,9 +4515,11 @@ then rebuild the catalog.\n''').format(author[0])
                 raise RuntimeError
 
         def generateSortTitle(self, title):
-            # Generate a string suitable for sorting from the title
-            # Ignore leading stop words
-            # Optionally convert leading numbers to strings
+            '''
+            Generate a string suitable for sorting from the title
+            Ignore leading stop words
+            Optionally convert leading numbers to strings
+            '''
             from calibre.ebooks.metadata import title_sort
 
             # Strip stop words
@@ -4912,10 +4819,10 @@ then rebuild the catalog.\n''').format(author[0])
 
         class NotImplementedError:
             def __init__(self, error):
-                self.error = error
+                self.error.append(error)
 
             def logerror(self):
-                self.opts.log.info('%s not implemented' % self.error)
+                self.opts.log.info('%s not implemented' % error)
 
     def run(self, path_to_output, opts, db, notification=DummyReporter()):
         opts.log = log
@@ -4982,11 +4889,12 @@ then rebuild the catalog.\n''').format(author[0])
         if opts_dict['ids']:
             build_log.append(" book count: %d" % len(opts_dict['ids']))
 
-        sections_list = ['Authors']
         '''
+        sections_list = []
         if opts.generate_authors:
             sections_list.append('Authors')
         '''
+        sections_list = ['Authors']
         if opts.generate_titles:
             sections_list.append('Titles')
         if opts.generate_genres:
@@ -5042,7 +4950,7 @@ then rebuild the catalog.\n''').format(author[0])
             if catalog_source_built:
                 log.info(" Completed catalog source generation\n")
             else:
-                log.warn(" No database hits with supplied criteria")
+                log.warn(" *** Errors during catalog generation, check log for details ***")
 
         if catalog_source_built:
             recommendations = []
@@ -5072,8 +4980,6 @@ then rebuild the catalog.\n''').format(author[0])
                             abort_after_input_dump=False)
             plumber.merge_ui_recommendations(recommendations)
             plumber.run()
-            # returns to gui2.actions.catalog:catalog_generated()
-            return None
-        else:
-            # returns to gui2.actions.catalog:catalog_generated()
-            return catalog.error
+
+        # returns to gui2.actions.catalog:catalog_generated()
+        return catalog.error

From a0aa719bb0b8de97a12c96c41a4bff70f656b213 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 17 Jan 2011 23:53:27 +0800
Subject: [PATCH 066/118] implemented multi-pass analysis for chapter detection

---
 src/calibre/ebooks/conversion/utils.py | 93 +++++++++++++++++++-------
 1 file changed, 68 insertions(+), 25 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index d9e5246223..1a691b2e14 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -21,6 +21,7 @@ class HeuristicProcessor(object):
         self.deleted_nbsps = False
         self.totalwords = 0
         self.min_chapters = 1
+        self.max_chapters = 150
         self.chapters_no_title = 0
         self.chapters_with_title = 0
         self.blanks_deleted = False
@@ -132,7 +133,7 @@ class HeuristicProcessor(object):
     def markup_italicis(self, html):
         ITALICIZE_WORDS = [
             'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
-            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
             'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
             'Mlle.', 'Mons.', 'PS.', 'PPS.',
         ]
@@ -166,9 +167,11 @@ class HeuristicProcessor(object):
         with minimum false positives.  Exits after finding a successful pattern
         '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
-        # minimum of chapters to search for
+        # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
         if wordcount > 7000:
             self.min_chapters = int(ceil(wordcount / 7000.))
+            self.max_chapters = int(ceil(wordcount / 100.))
         #print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
@@ -202,44 +205,84 @@ class HeuristicProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
+
+        analysis_result = []
 
         chapter_types = [
-            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
-            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
             ]
 
         def recurse_patterns(html, analyze):
             # Start with most typical chapter headings, get more aggressive until one works
-            for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types:
+            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
+                n_lookahead = ''
+                hits = 0
+                self.chapters_no_title = 0
+                self.chapters_with_title = 0
+
+                if n_lookahead_req:
+                    lp_n_lookahead_open = n_lookahead_open
+                    lp_n_lookahead_close = n_lookahead_close
+                else:
+                    lp_n_lookahead_open = ''
+                    lp_n_lookahead_close = ''
+
+                if strict_title:
+                    lp_title = default_title
+                else:
+                    lp_title = simple_title
+                 
+                if ignorecase:
+                    arg_ignorecase = r'(?i)'
+                else:
+                    arg_ignorecase = ''
+
+                if title_req:
+                    lp_opt_title_open = ''
+                    lp_opt_title_close = ''        
+                else:
+                    lp_opt_title_open = opt_title_open
+                    lp_opt_title_close = opt_title_close
+
                 if self.html_preprocess_sections >= self.min_chapters:
                     break
                 full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-                n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-                self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-                if lookahead_ignorecase:
-                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                    chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-                else:
-                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                    chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+                if n_lookahead_req:
+                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+                if not analyze:
+                    self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+
+                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker)
+
                 if analyze:
                     hits = len(chapdetect.findall(html))
-                    print unicode(type_name)+" had "+unicode(hits)+" hits"
-                    chapdetect.sub(self.analyze_title_matches, html)
-                    print unicode(self.chapters_no_title)+" chapters with no title"
-                    print unicode(self.chapters_with_title)+" chapters with titles"
+                    if hits:
+                        chapdetect.sub(self.analyze_title_matches, html)
+                        if float(self.chapters_with_title) / float(hits) > .5:
+                            title_req = True
+                            strict_title = False
+                        self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        if type_name == 'common':
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                        elif self.min_chapters <= hits < self.max_chapters:
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                            break
                 else:
                     html = chapdetect.sub(self.chapter_head, html)
             return html
 
         recurse_patterns(html, True)
+        chapter_types = analysis_result
         html = recurse_patterns(html, False)
 
         words_per_chptr = wordcount
@@ -293,7 +336,7 @@ class HeuristicProcessor(object):
         pre = re.compile(r'<pre>', re.IGNORECASE)
         if len(pre.findall(html)) >= 1:
             self.log.debug("Running Text Processing")
-            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
             html = outerhtml.sub(self.txt_process, html)
         else:
             # Add markup naively

From 30922f75f20b0c9e96adc53e0951574057ace50e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 09:26:45 -0700
Subject: [PATCH 067/118] Fix #8430 (Economist Print Edition no longer requires
 a password)

---
 resources/recipes/economist.recipe      | 9 +++++----
 resources/recipes/economist_free.recipe | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/resources/recipes/economist.recipe b/resources/recipes/economist.recipe
index 01ee8e0baf..95b4a2ae05 100644
--- a/resources/recipes/economist.recipe
+++ b/resources/recipes/economist.recipe
@@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString
 
-import mechanize, string, urllib, time, re
+import string, time, re
 
 class Economist(BasicNewsRecipe):
 
@@ -18,19 +18,19 @@ class Economist(BasicNewsRecipe):
 
     __author__ = "Kovid Goyal"
     INDEX = 'http://www.economist.com/printedition'
-    description = ('Global news and current affairs from a European perspective.'
-            ' Needs a subscription from ')+INDEX
+    description = 'Global news and current affairs from a European perspective.'
 
     oldest_article = 7.0
     cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
     remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
             dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
     keep_only_tags = [dict(id='ec-article-body')]
-    needs_subscription = True
+    needs_subscription = False
     no_stylesheets = True
     preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
         lambda x:'</html>')]
 
+    '''
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         br.open('http://www.economist.com')
@@ -50,6 +50,7 @@ class Economist(BasicNewsRecipe):
                     }))
         br.open(req).read()
         return br
+    '''
 
     def parse_index(self):
         try:
diff --git a/resources/recipes/economist_free.recipe b/resources/recipes/economist_free.recipe
index 1a783521f6..321c7d29ce 100644
--- a/resources/recipes/economist_free.recipe
+++ b/resources/recipes/economist_free.recipe
@@ -7,12 +7,12 @@ from lxml import html
 
 class Economist(BasicNewsRecipe):
 
-    title = 'The Economist (free)'
+    title = 'The Economist (RSS)'
     language = 'en'
 
     __author__ = "Kovid Goyal"
     description = ('Global news and current affairs from a European perspective.'
-            ' Much slower than the subscription based version.')
+            ' Much slower than the print edition based version.')
 
     oldest_article = 7.0
     cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'

From 0d91ea7055c24b98b9117517abf669cae3f6f53a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 09:39:06 -0700
Subject: [PATCH 068/118] When auto converting books and the device is
 unplugged, do not raise an error. Fixes #8426 (Exception when attempting to
 send books to a Kindle that was sleeping)

---
 src/calibre/gui2/device.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index 944ce03305..734d8cd56c 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -1018,7 +1018,8 @@ class DeviceMixin(object): # {{{
         ids = [self.library_view.model().id(r) \
                for r in self.library_view.selectionModel().selectedRows()] \
                                 if send_ids is None else send_ids
-        if not self.device_manager or not ids or len(ids) == 0:
+        if not self.device_manager or not ids or len(ids) == 0 or \
+                not self.device_manager.is_device_connected:
             return
 
         settings = self.device_manager.device.settings()

From 61d365c25b35a265175f1c9d26ae573fb4998965 Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Mon, 17 Jan 2011 09:59:46 -0700
Subject: [PATCH 069/118] GwR catalog 1.0 revisions

---
 src/calibre/library/catalog.py | 46 ++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index f1c5e3ae65..cd50cf4378 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -1429,12 +1429,29 @@ class EPUB_MOBI(CatalogPlugin):
 
             self.updateProgressFullStep("Sorting database")
             self.booksByAuthor = list(self.booksByTitle)
-            self.booksByAuthor = sorted(self.booksByAuthor, key=self.booksByAuthorSorter_author)
-#             for book in self.booksByAuthor:
-#                 print "{0:<30}  {1:<30}  {2:<30}".format(book['title'],book['author'],book['author_sort'])
-#             print
-#             stop
 
+            # Test for author_sort mismatches
+            self.booksByAuthor = sorted(self.booksByAuthor, key=self.booksByAuthorSorter_author)
+            # Build the unique_authors set from existing data
+            authors = [(record['author'], record['author_sort']) for record in self.booksByAuthor]
+            current_author = authors[0]
+            for (i,author) in enumerate(authors):
+                if author != current_author and i:
+                    # Exit if author matches previous, but author_sort doesn't match
+                    if author[0] == current_author[0]:
+                        error_msg = _('''
+Inconsistent Author Sort values for Author '{0}' ('{1}' <> '{2}'), unable to build catalog.\n
+Select all books by '{0}', apply correct Author Sort value in Edit Metadata dialog,
+then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
+                        self.opts.log.warn('\n*** Metadata error ***')
+                        self.opts.log.warn(error_msg)
+
+                        self.error.append('Metadata error')
+                        self.error.append(error_msg)
+                        return False
+
+
+            self.booksByAuthor = sorted(self.booksByAuthor, key=self.booksByAuthorSorter_author_sort)
             # Build the unique_authors set from existing data
             authors = [(record['author'], record['author_sort'].capitalize()) for record in self.booksByAuthor]
 
@@ -1450,20 +1467,6 @@ class EPUB_MOBI(CatalogPlugin):
                     # Note that current_author and author are tuples: (friendly, sort)
                     multiple_authors = True
 
-                if author != current_author and i:
-                    # Exit if author matches previous, but author_sort doesn't match
-                    if author[0] == current_author[0]:
-                        error_msg = _('''
-Inconsistent Author Sort values for Author '{0}', unable to continue building catalog.\n
-Select all books by '{0}', apply correct Author Sort value in Edit Metadata dialog,
-then rebuild the catalog.\n''').format(author[0])
-                        self.opts.log.warn('\n*** Metadata error ***')
-                        self.opts.log.warn(error_msg)
-
-                        self.error.append('Metadata error')
-                        self.error.append(error_msg)
-                        return False
-
                     # New author, save the previous author/sort/count
                     unique_authors.append((current_author[0], icu_title(current_author[1]),
                                            books_by_current_author))
@@ -1939,7 +1942,8 @@ then rebuild the catalog.\n''').format(author[0])
             current_author = ''
             current_letter = ''
             current_series = None
-            for book in sorted(self.booksByAuthor, key = self.booksByAuthorSorter_author_sort):
+            #for book in sorted(self.booksByAuthor, key = self.booksByAuthorSorter_author_sort):
+            for book in self.booksByAuthor:
 
                 book_count += 1
                 if self.letter_or_symbol(book['author_sort'][0].upper()) != current_letter :
@@ -2118,7 +2122,7 @@ then rebuild the catalog.\n''').format(author[0])
             def add_books_to_HTML_by_month(this_months_list, dtc):
                 if len(this_months_list):
 
-                    this_months_list = sorted(this_months_list, key=self.booksByAuthorSorter_author_sort)
+                    #this_months_list = sorted(this_months_list, key=self.booksByAuthorSorter_author_sort)
 
                     # Create a new month anchor
                     date_string = strftime(u'%B %Y', current_date.timetuple())

From 1d24b14bc21a933067cc140760b59af0c4d709b2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 09:59:50 -0700
Subject: [PATCH 070/118] ...

---
 src/calibre/manual/faq.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index ee72bf6fdb..b473893673 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -101,6 +101,17 @@ We just need some information from you:
 Once you send us the output for a particular operating system, support for the device in that operating system
 will appear in the next release of |app|.
 
+My device is not being detected by |app|?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Follow these steps to find the problem:
+
+    * Make sure that you are connecting only a single device to your computer at a time. Do not have another |app| supported device, such as an iPhone or iPad, connected at the same time.
+    * Make sure you are running the latest version of |app|. The latest version can always be downloaded from `http://calibre-ebook.com/download`_.
+    * Ensure your operating system is seeing the device. That is, the device should be mounted as a disk that you can access using Windows Explorer or whatever file management program your computer uses.
+    * In calibre, go to Preferences->Plugins->Device Interface plugin and make sure the plugin for your device is enabled.
+    * If all the above steps fail, go to Preferences->Miscellaneous, click debug device detection with your device attached, and post the output as a ticket on `http://bugs.calibre-ebook.com`_.
+
 How does |app| manage collections on my SONY reader?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

From 5e2a2d71a61bfdb85d191461a2d7be5a252436e6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 10:11:22 -0700
Subject: [PATCH 071/118] Better error message in debug log when failed to
 fetch a news article

---
 src/calibre/web/feeds/news.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index dd32d3749f..6215132e4b 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -839,7 +839,13 @@ class BasicNewsRecipe(Recipe):
         fetcher.image_url_processor = self.image_url_processor
         res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
         if not res or not os.path.exists(res):
-            raise Exception(_('Could not fetch article. Run with -vv to see the reason'))
+            msg = _('Could not fetch article.') + ' '
+            if self.debug:
+                msg += _('The debug traceback is available earlier in this log')
+            else:
+                msg += _('Run with -vv to see the reason')
+            raise Exception(msg)
+
         return res, path, failures
 
     def fetch_article(self, url, dir, f, a, num_of_feeds):
@@ -901,7 +907,7 @@ class BasicNewsRecipe(Recipe):
         if self.test:
             feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
-        
+
         index = os.path.join(self.output_dir, 'index.html')
 
         html = self.feeds2index(feeds)

From 46ab37e98f310c8134c44c79bf2da14422ec0bd7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 10:17:34 -0700
Subject: [PATCH 072/118] iHNed by Karel Bilek

---
 resources/recipes/ihned.recipe | 182 +++++++++++++++++++++++++++++++++
 1 file changed, 182 insertions(+)
 create mode 100644 resources/recipes/ihned.recipe

diff --git a/resources/recipes/ihned.recipe b/resources/recipes/ihned.recipe
new file mode 100644
index 0000000000..daf63e19ed
--- /dev/null
+++ b/resources/recipes/ihned.recipe
@@ -0,0 +1,182 @@
+import re, time
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class IHNed(BasicNewsRecipe):
+    """Calibre news recipe for the Czech news site iHNed.cz."""
+
+    stahnout_vsechny = False
+        #True   = download all listed articles, regardless of their date
+        #False  = download only today's articles (dated the day the script is run)
+
+    title       = 'iHNed'
+    __author__  = 'Karel Bílek'
+    language = 'cs'
+    description = 'Zprávy z iHNed.cz'
+    timefmt = ' [%a, %d %b, %Y]'
+    needs_subscription = False
+    remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
+                 dict(style=['text-align: center;']),
+                 dict(id=['r-bfull']),
+                 dict(name=['script', 'noscript', 'style'])]
+    encoding = 'windows-1250'
+    no_stylesheets = True
+    remove_tags_before = dict(attrs={'class':'d-nadtit'})
+    remove_tags_after = dict(attrs={'class':'like'})
+
+    conversion_options = {
+      'linearize_tables' : True,
+    }
+
+
+
+    def preprocess_html(self, soup):
+        """Unlink <h1> headlines and turn site-relative hrefs into absolute URLs; returns soup."""
+        def makeurl(wat):
+            return "http://ihned.cz"+wat
+
+        for h1 in soup.findAll('h1'):
+            a = h1.find('a')
+            if a:
+                string = a.string
+                if string:
+                    a.replaceWith(string) #this headline's own link, not the first <a> of the page
+        for a in soup.findAll('a', href=True):
+            cil = str(a['href'])
+            if cil.startswith("/") or cil.startswith("index"):
+                a['href'] = makeurl(cil)
+        return soup
+
+
+    def parse_index(self):
+        """Build the feed index: a list of (section name, list of article dicts) tuples."""
+        def makeurl(wat):
+            if wat.startswith("/") or wat.startswith("index"):
+                return "http://ihned.cz"+wat
+            else:
+                return wat
+
+
+        articles = {} #result: section name -> list of article dicts
+        key = None #current section
+        ans = [] #all section names
+
+        articles["Hlavní"] = []
+        ans.append("Hlavní")
+
+        was = {} #titles already listed under "Hlavní"; the section pages skip them
+
+        def parse_subpage(url, name):
+            articles[name] = []
+            ans.append(name)
+
+            #collect the articles of one section page into articles[name]
+            soup = self.index_to_soup(url)
+            otvirak = soup.find(True, attrs={'class':['otv']})
+            if otvirak:
+
+                #'otv' block; same extraction as for the front page below (duplicated code)
+                a = otvirak.find('a', href=True)
+                title = self.tag_to_string(a, use_alt=True).strip()
+                txt = otvirak.find(True, attrs={'class':['txt']})
+                description = ''
+                if txt:
+                    match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
+                    if match:
+                        description = match.group(1)
+
+                pubdate = strftime('%d. %m.')
+                if title not in was:
+                    articles[name].append(
+                          dict(title=title, url=makeurl(a['href']), date=pubdate,
+                                description=description,
+                                content=''))
+
+            otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
+            if otv234:
+                for ow in otv234.findAll(True, attrs={'class':['ow']}):
+                    a = ow.find('a', href=True)
+                    title = self.tag_to_string(a, use_alt=True).strip()
+                    description=''
+                    prx = ow.find(True, attrs={'class':['prx']})
+                    if prx:
+                        description = str(prx.string)
+                    nfo = ow.find(True, attrs={'class':['nfo']})
+                    pubdate = ''
+                    if nfo:
+                        dtime = time.localtime()
+                        day = dtime[2]
+                        month = dtime[1]
+
+                        pubdate = strftime('%d. %m.')
+
+                        match = re.search(r'([0-9]+)\.([0-9]+)\.', str(nfo))
+                        #no "day.month." found in the nfo block -> treated as "not from today" instead of crashing
+                        if self.stahnout_vsechny or (match and day == int(match.group(1)) and month == int(match.group(2))):
+                            if title not in was:
+                                articles[name].append(
+                                      dict(title=title, url=makeurl(a['href']), date=pubdate,
+                                            description=description,
+                                            content=''))
+
+
+
+
+
+        #front page: the 'otv' block and every 'otv2-5' block are listed under "Hlavní"
+        soup = self.index_to_soup('http://ihned.cz/')
+        otvirak = soup.find(True, attrs={'class':['otv']})
+        if otvirak:
+            a = otvirak.find('a', href=True)
+            title = self.tag_to_string(a, use_alt=True).strip()
+            txt = otvirak.find(True, attrs={'class':['txt']})
+            description = ''
+            if txt:
+                match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
+                if match:
+                    description = match.group(1)
+
+            pubdate = strftime('%d. %m.')
+            feed = "Hlavní"
+            articles[feed].append(
+                      dict(title=title, url=makeurl(a['href']), date=pubdate,
+                            description=description,
+                            content=''))
+            was[title]=1
+
+        otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
+        if otvirak2345:
+            for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
+                a = otv2.find('a', attrs={'class':['tit2']}, href=True)
+                title = self.tag_to_string(a, use_alt=True).strip()
+                description=''
+                span = otv2.find('span')
+                if span:
+                    match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
+                    if match:
+                        description = match.group(1)
+                feed = "Hlavní"
+                pubdate = strftime('%d. %m.')
+                articles[feed].append(
+                          dict(title=title, url=makeurl(a['href']), date=pubdate,
+                                description=description,
+                                content=''))
+                was[title]=1
+
+
+        parse_subpage("http://komentare.ihned.cz/", "Komentáře")
+        parse_subpage("http://domaci.ihned.cz", "Domácí")
+        parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
+        parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí")
+        parse_subpage("http://finweb.ihned.cz/", "Finance")
+        parse_subpage("http://digiweb.ihned.cz/", "DigiWeb")
+        parse_subpage("http://kultura.ihned.cz/", "Kultura")
+        parse_subpage("http://sport.ihned.cz/", "Sport")
+
+        #sort the sections (the weight key must match the section name "Hlavní" exactly)
+        ans = self.sort_index_by(ans, {'Hlavní':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})
+
+        #return them, but only the sections that have an entry in articles
+        ans = [(key, articles[key]) for key in ans if key in articles]
+        return ans
+

From 3ca18da2cfc48a1ce3a201245eeee8ed005f0541 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 18 Jan 2011 01:17:42 +0800
Subject: [PATCH 073/118] fix pdf preprocess call

---
 src/calibre/ebooks/conversion/preprocess.py | 13 +++----------
 src/calibre/ebooks/conversion/utils.py      |  7 ++++---
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d2bdba4928..54639df93c 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -473,12 +473,6 @@ class HTMLPreProcessor(object):
             # unwrap/delete soft hyphens with formatting
             end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
 
-        # Make the more aggressive chapter marking regex optional with the preprocess option to
-        # reduce false positives and move after header/footer removal
-        if getattr(self.extra_opts, 'preprocess_html', None):
-            if is_pdftohtml:
-                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
-
         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             docanalysis = DocAnalysis('pdf', html)
@@ -525,11 +519,10 @@ class HTMLPreProcessor(object):
             html = dehyphenator(html,'html', length)
 
         if is_pdftohtml:
-            from calibre.ebooks.conversion.utils import PreProcessor
-            pdf_markup = PreProcessor(self.extra_opts, None)
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
             totalwords = 0
-            totalwords = pdf_markup.get_word_count(html)
-            if totalwords > 7000:
+            if pdf_markup.get_word_count(html) > 7000:
                 html = pdf_markup.markup_chapters(html, totalwords, True)
 
         #dump(html, 'post-preprocess')
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 1a691b2e14..888d24d791 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -170,9 +170,9 @@ class HeuristicProcessor(object):
         # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
         # or pdf page numbers from being treated as TOC markers
         if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 7000.))
-            self.max_chapters = int(ceil(wordcount / 100.))
-        #print "minimum chapters required are: "+str(self.min_chapters)
+            self.min_chapters = int(ceil(wordcount / 15000.))
+            self.max_chapters = int(ceil(wordcount / 1200.))
+        print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -272,6 +272,7 @@ class HeuristicProcessor(object):
                             title_req = True
                             strict_title = False
                         self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        print "max chapters is "+str(self.max_chapters)
                         if type_name == 'common':
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                         elif self.min_chapters <= hits < self.max_chapters:

From e18e5a5db9fc1682fafed636d2d6187cebbf8a0e Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Mon, 17 Jan 2011 10:29:38 -0700
Subject: [PATCH 074/118] GwR catalog 1.0 revisions

---
 src/calibre/gui2/actions/catalog.py |  2 ++
 src/calibre/library/catalog.py      | 14 +++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/calibre/gui2/actions/catalog.py b/src/calibre/gui2/actions/catalog.py
index 1650c80d70..be6e7bfe60 100644
--- a/src/calibre/gui2/actions/catalog.py
+++ b/src/calibre/gui2/actions/catalog.py
@@ -56,6 +56,8 @@ class GenerateCatalogAction(InterfaceAction):
     def catalog_generated(self, job):
         if job.result:
             # Problems during catalog generation
+            # job.result is a list - the first entry is the intended title for the dialog
+            # Subsequent strings are error messages
             dialog_title = job.result.pop(0)
             if re.match('warning:', job.result[0].lower()):
                 job.result.append("Catalog generation complete.")
diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index cd50cf4378..cf02e9b792 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -1637,7 +1637,10 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                                                                title['title_sort'][0:40])).decode('mac-roman'))
                 return True
             else:
-                self.error.append( _("No books found to catalog.\nCheck 'Excluded books' criteria in E-book options."))
+                error_msg = _("No books found to catalog.\nCheck 'Excluded books' criteria in E-book options.\n")
+                self.opts.log.error('*** ' + error_msg + ' ***')
+                self.error.append(_('No books available to include in catalog'))
+                self.error.append(error_msg)
                 return False
 
         def fetchBookmarks(self):
@@ -3164,8 +3167,13 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                     # Add the author tag
                     cmTag = Tag(ncx_soup, '%s' % 'calibre:meta')
                     cmTag['name'] = "author"
-                    navStr = '%s | %s' % (self.formatNCXText(book['author'], dest='author'),
-                          book['date'].split()[1])
+
+                    if book['date']:
+                        navStr = '%s | %s' % (self.formatNCXText(book['author'], dest='author'),
+                              book['date'].split()[1])
+                    else:
+                        navStr = '%s' % (self.formatNCXText(book['author'], dest='author'))
+
                     if 'tags' in book and len(book['tags']):
                         navStr = self.formatNCXText(navStr + ' | ' + ' &middot; '.join(sorted(book['tags'])), dest='author')
                     cmTag.insert(0, NavigableString(navStr))

From 5f2e4a1f3f0a7c596cf0db87dfcebdc7b274e7f6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 10:34:38 -0700
Subject: [PATCH 075/118] kath.net by Bobus

---
 resources/recipes/kath_net.recipe | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 resources/recipes/kath_net.recipe

diff --git a/resources/recipes/kath_net.recipe b/resources/recipes/kath_net.recipe
new file mode 100644
index 0000000000..7c469adbe8
--- /dev/null
+++ b/resources/recipes/kath_net.recipe
@@ -0,0 +1,16 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1295262156(BasicNewsRecipe):
+    """Recipe for the kath.net feed; articles are fetched through their print=yes URL."""
+    title = u'kath.net'
+    __author__ = 'Bobus'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    extra_css = 'td.textb {font-size: medium;}'
+
+    feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')]
+
+    def print_version(self, url):
+        """Return the article url with the "&print=yes" argument appended."""
+        return "".join([url, "&print=yes"])
+

From 1a20df3291576cb346c42584b14d044f80255bde Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 17 Jan 2011 13:17:33 -0500
Subject: [PATCH 076/118] Modify italicize patterns to reduce false positives.

---
 src/calibre/ebooks/conversion/utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 888d24d791..bfb5f1c153 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -139,17 +139,17 @@ class HeuristicProcessor(object):
         ]
         
         ITALICIZE_STYLE_PATS = [
-            r'(?msu)_(?P<words>.+?)_',
-            r'(?msu)/(?P<words>[^<>]+?)/',
-            r'(?msu)~~(?P<words>.+?)~~',
-            r'(?msu)\*(?P<words>.+?)\*',
-            r'(?msu)~(?P<words>.+?)~',
-            r'(?msu)_/(?P<words>[^<>]+?)/_',
-            r'(?msu)_\*(?P<words>.+?)\*_',
-            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
-            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
-            r'(?msu)/:(?P<words>[^<>]+?):/',
-            r'(?msu)\|:(?P<words>.+?):\|',
+            r'(?msu)_(?P<words>[^\s][^<>_]+?[^\s])?_',
+            r'(?msu)/(?P<words>[^\s][^<>/]+?[^\s])?/',
+            r'(?msu)~~(?P<words>[^\s][^<>~]+?[^\s])?~~',
+            r'(?msu)\*(?P<words>[^\s][^<>\*]+?[^\s])?\*',
+            r'(?msu)~(?P<words>[^\s][^<>~]+?[^\s])?~',
+            r'(?msu)_/(?P<words>[^\s][^<>/_]+?[^\s])?/_',
+            r'(?msu)_\*(?P<words>[^\s][^<>\*_]+?[^\s])?\*_',
+            r'(?msu)\*/(?P<words>[^\s][^<>/\*]+?[^\s])?/\*',
+            r'(?msu)_\*/(?P<words>[^\s][^<>\*_]+?[^\s])?/\*_',
+            r'(?msu)/:(?P<words>[^\s][^<>:/]+?[^\s])?:/',
+            r'(?msu)\|:(?P<words>[^\s][^<>:\|]+?[^\s])?:\|',
         ]
         
         for word in ITALICIZE_WORDS:

From f965037fb44184957d9bc20dc3efb1ce1adee6a9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 12:22:30 -0700
Subject: [PATCH 077/118] MOBI Output: Fix bug that could cause a link pointing
 to the start of a section to go to a point later in the section if the
 section contained an empty id attribute

---
 src/calibre/ebooks/mobi/writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py
index cd6674c2e2..ed102ecc80 100644
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@@ -251,7 +251,7 @@ class Serializer(object):
         tag = prefixname(elem.tag, nsrmap)
         # Previous layers take care of @name
         id = elem.attrib.pop('id', None)
-        if id is not None:
+        if id:
             href = '#'.join((item.href, id))
             offset = self.anchor_offset or buffer.tell()
             self.id_offsets[urlnormalize(href)] = offset

From aa28b379517028b5e7951c8aa94f2226a99c918d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 13:03:34 -0700
Subject: [PATCH 078/118] Fix #8424 (Dilbert retrieval fails)

---
 resources/recipes/dilbert.recipe | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/recipes/dilbert.recipe b/resources/recipes/dilbert.recipe
index 2c3268da2f..56aa4af8c9 100644
--- a/resources/recipes/dilbert.recipe
+++ b/resources/recipes/dilbert.recipe
@@ -28,7 +28,7 @@ class DilbertBig(BasicNewsRecipe):
                             ,'publisher'       : publisher
                          }
 
-    feeds = [(u'Dilbert', u'http://feeds.dilbert.com/DilbertDailyStrip' )]
+    feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip' )]
 
     def get_article_url(self, article):
         return article.get('feedburner_origlink', None)

From 84d1dd94d23db7b53f47051ecb3cfd5c47965f0b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 13:10:10 -0700
Subject: [PATCH 079/118] Make postprocess_html in the NY Times recipes more
 robust

---
 resources/recipes/nytimes.recipe     | 204 +++++++++++++------------
 resources/recipes/nytimes_sub.recipe | 214 +++++++++++++++------------
 2 files changed, 229 insertions(+), 189 deletions(-)

diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe
index 6f80f4f85f..7e313e5727 100644
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -586,105 +586,125 @@ class NYTimes(BasicNewsRecipe):
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
+		try:
+			if self.one_picture_per_article:
+				# Remove all images after first
+				largeImg = soup.find(True, {'class':'articleSpanImage'})
+				inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+				if largeImg:
+					for inlineImg in inlineImgs:
+						inlineImg.extract()
+				else:
+					if inlineImgs:
+						firstImg = inlineImgs[0]
+						for inlineImg in inlineImgs[1:]:
+							inlineImg.extract()
+						# Move firstImg before article body
+						cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
+						if cgFirst:
+							# Strip all sibling NavigableStrings: noise
+							navstrings = cgFirst.findAll(text=True, recursive=False)
+							[ns.extract() for ns in navstrings]
+							headline_found = False
+							tag = cgFirst.find(True)
+							insertLoc = 0
+							while True:
+								insertLoc += 1
+								if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
+										headline_found = True
+										break
+								tag = tag.nextSibling
+								if not tag:
+									headline_found = False
+									break
+							if headline_found:
+								cgFirst.insert(insertLoc,firstImg)
+						else:
+							self.log(">>> No class:'columnGroup first' found <<<")
+		except:
+			self.log("ERROR: One picture per article in postprocess_html")
 
-        if self.one_picture_per_article:
-            # Remove all images after first
-            largeImg = soup.find(True, {'class':'articleSpanImage'})
-            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
-            if largeImg:
-                for inlineImg in inlineImgs:
-                    inlineImg.extract()
-            else:
-                if inlineImgs:
-                    firstImg = inlineImgs[0]
-                    for inlineImg in inlineImgs[1:]:
-                        inlineImg.extract()
-                    # Move firstImg before article body
-                    cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
-                    if cgFirst:
-                        # Strip all sibling NavigableStrings: noise
-                        navstrings = cgFirst.findAll(text=True, recursive=False)
-                        [ns.extract() for ns in navstrings]
-                        headline_found = False
-                        tag = cgFirst.find(True)
-                        insertLoc = 0
-                        while True:
-                            insertLoc += 1
-                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
-                                    headline_found = True
-                                    break
-                            tag = tag.nextSibling
-                            if not tag:
-                                headline_found = False
-                                break
-                        if headline_found:
-                            cgFirst.insert(insertLoc,firstImg)
-                    else:
-                        self.log(">>> No class:'columnGroup first' found <<<")
+		try:
+			# Change captions to italic
+			for caption in soup.findAll(True, {'class':'caption'}) :
+				if caption and len(caption) > 0:
+					cTag = Tag(soup, "p", [("class", "caption")])
+					c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+					mp_off = c.find("More Photos")
+					if mp_off >= 0:
+						c = c[:mp_off]
+					cTag.insert(0, c)
+					caption.replaceWith(cTag)
+		except:
+			self.log("ERROR:  Problem in change captions to italic")
 
-        # Change captions to italic
-        for caption in soup.findAll(True, {'class':'caption'}) :
-            if caption and caption.contents[0]:
-                cTag = Tag(soup, "p", [("class", "caption")])
-                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
-                mp_off = c.find("More Photos")
-                if mp_off >= 0:
-                    c = c[:mp_off]
-                cTag.insert(0, c)
-                caption.replaceWith(cTag)
+		try:
+			# Change <nyt_headline> to <h2>
+			h1 = soup.find('h1')
+			if h1:
+				headline = h1.find("nyt_headline")
+				if headline:
+					tag = Tag(soup, "h2")
+					tag['class'] = "headline"
+					tag.insert(0, self.fixChars(headline.contents[0]))
+					h1.replaceWith(tag)
+			else:
+				# Blog entry - replace headline, remove <hr> tags
+				headline = soup.find('title')
+				if headline:
+					tag = Tag(soup, "h2")
+					tag['class'] = "headline"
+					tag.insert(0, self.fixChars(headline.contents[0]))
+					soup.insert(0, tag)
+					hrs = soup.findAll('hr')
+					for hr in hrs:
+						hr.extract()
+		except:
+			self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
 
-        # Change <nyt_headline> to <h2>
-        h1 = soup.find('h1')
-        if h1:
-            headline = h1.find("nyt_headline")
-            if headline:
-                tag = Tag(soup, "h2")
-                tag['class'] = "headline"
-                tag.insert(0, self.fixChars(headline.contents[0]))
-                h1.replaceWith(tag)
-        else:
-            # Blog entry - replace headline, remove <hr> tags
-            headline = soup.find('title')
-            if headline:
-                tag = Tag(soup, "h2")
-                tag['class'] = "headline"
-                tag.insert(0, self.fixChars(headline.contents[0]))
-                soup.insert(0, tag)
-                hrs = soup.findAll('hr')
-                for hr in hrs:
-                    hr.extract()
+		try:
+			# Change <h1> to <h3> - used in editorial blogs
+			masthead = soup.find("h1")
+			if masthead:
+				# Nuke the href
+				if masthead.a:
+					del(masthead.a['href'])
+				tag = Tag(soup, "h3")
+				tag.insert(0, self.fixChars(masthead.contents[0]))
+				masthead.replaceWith(tag)
+		except:
+			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
 
-        # Change <h1> to <h3> - used in editorial blogs
-        masthead = soup.find("h1")
-        if masthead:
-            # Nuke the href
-            if masthead.a:
-                del(masthead.a['href'])
-            tag = Tag(soup, "h3")
-            tag.insert(0, self.fixChars(masthead.contents[0]))
-            masthead.replaceWith(tag)
+		try:
+			# Change <span class="bold"> to <b>
+			for subhead in soup.findAll(True, {'class':'bold'}) :
+				if subhead.contents:
+					bTag = Tag(soup, "b")
+					bTag.insert(0, subhead.contents[0])
+					subhead.replaceWith(bTag)
+		except:
+			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
 
-        # Change <span class="bold"> to <b>
-        for subhead in soup.findAll(True, {'class':'bold'}) :
-            if subhead.contents:
-                bTag = Tag(soup, "b")
-                bTag.insert(0, subhead.contents[0])
-                subhead.replaceWith(bTag)
+		try:
+			divTag = soup.find('div',attrs={'id':'articleBody'})
+			if divTag:
+				divTag['class'] = divTag['id']
+		except:
+			self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
 
-        divTag = soup.find('div',attrs={'id':'articleBody'})
-        if divTag:
-            divTag['class'] = divTag['id']
+		try:
+			# Add class="authorId" to <div> so we can format with CSS
+			divTag = soup.find('div',attrs={'id':'authorId'})
+			if divTag and divTag.contents[0]:
+				tag = Tag(soup, "p")
+				tag['class'] = "authorId"
+				tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+								 use_alt=False)))
+				divTag.replaceWith(tag)
+		except:
+			self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
 
-        # Add class="authorId" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'authorId'})
-        if divTag and divTag.contents[0]:
-            tag = Tag(soup, "p")
-            tag['class'] = "authorId"
-            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
-                             use_alt=False)))
-            divTag.replaceWith(tag)
-
-        return soup
+		return soup
 
     def populate_article_metadata(self, article, soup, first):
         shortparagraph = ""
diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index 8ac7c735f7..8f92852237 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -586,105 +586,125 @@ class NYTimes(BasicNewsRecipe):
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
+		try:
+			if self.one_picture_per_article:
+				# Remove all images after first
+				largeImg = soup.find(True, {'class':'articleSpanImage'})
+				inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+				if largeImg:
+					for inlineImg in inlineImgs:
+						inlineImg.extract()
+				else:
+					if inlineImgs:
+						firstImg = inlineImgs[0]
+						for inlineImg in inlineImgs[1:]:
+							inlineImg.extract()
+						# Move firstImg before article body
+						cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
+						if cgFirst:
+							# Strip all sibling NavigableStrings: noise
+							navstrings = cgFirst.findAll(text=True, recursive=False)
+							[ns.extract() for ns in navstrings]
+							headline_found = False
+							tag = cgFirst.find(True)
+							insertLoc = 0
+							while True:
+								insertLoc += 1
+								if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
+										headline_found = True
+										break
+								tag = tag.nextSibling
+								if not tag:
+									headline_found = False
+									break
+							if headline_found:
+								cgFirst.insert(insertLoc,firstImg)
+						else:
+							self.log(">>> No class:'columnGroup first' found <<<")
+		except:
+			self.log("ERROR: One picture per article in postprocess_html")
+									
+		try:
+			# Change captions to italic
+			for caption in soup.findAll(True, {'class':'caption'}) :
+				if caption and len(caption) > 0:
+					cTag = Tag(soup, "p", [("class", "caption")])
+					c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+					mp_off = c.find("More Photos")
+					if mp_off >= 0:
+						c = c[:mp_off]
+					cTag.insert(0, c)
+					caption.replaceWith(cTag)
+		except:
+			self.log("ERROR:  Problem in change captions to italic")
+		
+		try:
+			# Change <nyt_headline> to <h2>
+			h1 = soup.find('h1')
+			if h1:
+				headline = h1.find("nyt_headline")
+				if headline:
+					tag = Tag(soup, "h2")
+					tag['class'] = "headline"
+					tag.insert(0, self.fixChars(headline.contents[0]))
+					h1.replaceWith(tag)
+			else:
+				# Blog entry - replace headline, remove <hr> tags
+				headline = soup.find('title')
+				if headline:
+					tag = Tag(soup, "h2")
+					tag['class'] = "headline"
+					tag.insert(0, self.fixChars(headline.contents[0]))
+					soup.insert(0, tag)
+					hrs = soup.findAll('hr')
+					for hr in hrs:
+						hr.extract()
+		except:
+			self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
 
-        if self.one_picture_per_article:
-            # Remove all images after first
-            largeImg = soup.find(True, {'class':'articleSpanImage'})
-            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
-            if largeImg:
-                for inlineImg in inlineImgs:
-                    inlineImg.extract()
-            else:
-                if inlineImgs:
-                    firstImg = inlineImgs[0]
-                    for inlineImg in inlineImgs[1:]:
-                        inlineImg.extract()
-                    # Move firstImg before article body
-                    cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
-                    if cgFirst:
-                        # Strip all sibling NavigableStrings: noise
-                        navstrings = cgFirst.findAll(text=True, recursive=False)
-                        [ns.extract() for ns in navstrings]
-                        headline_found = False
-                        tag = cgFirst.find(True)
-                        insertLoc = 0
-                        while True:
-                            insertLoc += 1
-                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
-                                    headline_found = True
-                                    break
-                            tag = tag.nextSibling
-                            if not tag:
-                                headline_found = False
-                                break
-                        if headline_found:
-                            cgFirst.insert(insertLoc,firstImg)
-                    else:
-                        self.log(">>> No class:'columnGroup first' found <<<")
+		try:
+			# Change <h1> to <h3> - used in editorial blogs
+			masthead = soup.find("h1")
+			if masthead:
+				# Nuke the href
+				if masthead.a:
+					del(masthead.a['href'])
+				tag = Tag(soup, "h3")
+				tag.insert(0, self.fixChars(masthead.contents[0]))
+				masthead.replaceWith(tag)
+		except:
+			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
 
-        # Change captions to italic
-        for caption in soup.findAll(True, {'class':'caption'}) :
-            if caption and caption.contents[0]:
-                cTag = Tag(soup, "p", [("class", "caption")])
-                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
-                mp_off = c.find("More Photos")
-                if mp_off >= 0:
-                    c = c[:mp_off]
-                cTag.insert(0, c)
-                caption.replaceWith(cTag)
-
-        # Change <nyt_headline> to <h2>
-        h1 = soup.find('h1')
-        if h1:
-            headline = h1.find("nyt_headline")
-            if headline:
-                tag = Tag(soup, "h2")
-                tag['class'] = "headline"
-                tag.insert(0, self.fixChars(headline.contents[0]))
-                h1.replaceWith(tag)
-        else:
-            # Blog entry - replace headline, remove <hr> tags
-            headline = soup.find('title')
-            if headline:
-                tag = Tag(soup, "h2")
-                tag['class'] = "headline"
-                tag.insert(0, self.fixChars(headline.contents[0]))
-                soup.insert(0, tag)
-                hrs = soup.findAll('hr')
-                for hr in hrs:
-                    hr.extract()
-
-        # Change <h1> to <h3> - used in editorial blogs
-        masthead = soup.find("h1")
-        if masthead:
-            # Nuke the href
-            if masthead.a:
-                del(masthead.a['href'])
-            tag = Tag(soup, "h3")
-            tag.insert(0, self.fixChars(masthead.contents[0]))
-            masthead.replaceWith(tag)
-
-        # Change <span class="bold"> to <b>
-        for subhead in soup.findAll(True, {'class':'bold'}) :
-            if subhead.contents:
-                bTag = Tag(soup, "b")
-                bTag.insert(0, subhead.contents[0])
-                subhead.replaceWith(bTag)
-
-        divTag = soup.find('div',attrs={'id':'articleBody'})
-        if divTag:
-            divTag['class'] = divTag['id']
-
-        # Add class="authorId" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'authorId'})
-        if divTag and divTag.contents[0]:
-            tag = Tag(soup, "p")
-            tag['class'] = "authorId"
-            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
-                             use_alt=False)))
-            divTag.replaceWith(tag)
-
-        return soup
+		try:		
+			# Change <span class="bold"> to <b>
+			for subhead in soup.findAll(True, {'class':'bold'}) :
+				if subhead.contents:
+					bTag = Tag(soup, "b")
+					bTag.insert(0, subhead.contents[0])
+					subhead.replaceWith(bTag)
+		except:
+			self.log("ERROR:  Problem in Change <span class=bold> to <b>")
+		
+		try:		
+			divTag = soup.find('div',attrs={'id':'articleBody'})
+			if divTag:
+				divTag['class'] = divTag['id']
+		except:
+			self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
+			
+		try:	
+			# Add class="authorId" to <div> so we can format with CSS
+			divTag = soup.find('div',attrs={'id':'authorId'})
+			if divTag and divTag.contents[0]:
+				tag = Tag(soup, "p")
+				tag['class'] = "authorId"
+				tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+								 use_alt=False)))
+				divTag.replaceWith(tag)		
+		except:
+			self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
+		
+		return soup
     def populate_article_metadata(self, article, soup, first):
         shortparagraph = ""
         try:

From 927c389e91ffe47880ed1b0949b421d449f7ad4b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 13:20:12 -0700
Subject: [PATCH 080/118] Fix #8436 (Add tag when using Add ISBN Dialog)

---
 src/calibre/gui2/actions/add.py           |  9 ++--
 src/calibre/gui2/dialogs/add_from_isbn.py |  7 +++
 src/calibre/gui2/dialogs/add_from_isbn.ui | 52 +++++++++++++++++++----
 3 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py
index 9917c542ae..6fa53d6290 100644
--- a/src/calibre/gui2/actions/add.py
+++ b/src/calibre/gui2/actions/add.py
@@ -91,13 +91,14 @@ class AddAction(InterfaceAction):
                 self.gui.library_view.model().db.import_book(MetaInformation(None), [])
             self.gui.library_view.model().books_added(num)
 
-    def add_isbns(self, books):
+    def add_isbns(self, books, add_tags=()):  # immutable default; only used when truthy
         from calibre.ebooks.metadata import MetaInformation
         ids = set([])
+        db = self.gui.library_view.model().db
+
         for x in books:
             mi = MetaInformation(None)
             mi.isbn = x['isbn']
-            db = self.gui.library_view.model().db
             if x['path'] is not None:
                 ids.add(db.import_book(mi, [x['path']]))
             else:
@@ -109,6 +110,8 @@ class AddAction(InterfaceAction):
             self.gui.iactions['Edit Metadata'].do_download_metadata(ids)
         finally:
             config['overwrite_author_title_metadata'] = orig
+        if add_tags and ids:
+            db.bulk_modify_tags(ids, add=add_tags)
 
 
     def files_dropped(self, paths):
@@ -166,7 +169,7 @@ class AddAction(InterfaceAction):
         from calibre.gui2.dialogs.add_from_isbn import AddFromISBN
         d = AddFromISBN(self.gui)
         if d.exec_() == d.Accepted:
-            self.add_isbns(d.books)
+            self.add_isbns(d.books, add_tags=d.set_tags)
 
     def add_books(self, *args):
         '''
diff --git a/src/calibre/gui2/dialogs/add_from_isbn.py b/src/calibre/gui2/dialogs/add_from_isbn.py
index f93cddecd5..433b70291c 100644
--- a/src/calibre/gui2/dialogs/add_from_isbn.py
+++ b/src/calibre/gui2/dialogs/add_from_isbn.py
@@ -12,6 +12,7 @@ from PyQt4.Qt import QDialog, QApplication
 from calibre.gui2.dialogs.add_from_isbn_ui import Ui_Dialog
 from calibre.ebooks.metadata import check_isbn
 from calibre.constants import iswindows
+from calibre.gui2 import gprefs
 
 class AddFromISBN(QDialog, Ui_Dialog):
 
@@ -25,7 +26,9 @@ class AddFromISBN(QDialog, Ui_Dialog):
 
         self.isbns = []
         self.books = []
+        self.set_tags = []
         self.paste_button.clicked.connect(self.paste)
+        self.add_tags.setText(', '.join(gprefs.get('add from ISBN tags', [])))
 
     def paste(self, *args):
         app = QApplication.instance()
@@ -37,6 +40,10 @@ class AddFromISBN(QDialog, Ui_Dialog):
             self.isbn_box.setPlainText(new)
 
     def accept(self, *args):
+        tags = unicode(self.add_tags.text()).strip().split(',')
+        tags = list(filter(None, [x.strip() for x in tags]))
+        gprefs['add from ISBN tags'] = tags
+        self.set_tags = tags
         for line in unicode(self.isbn_box.toPlainText()).strip().splitlines():
             line = line.strip()
             if not line:
diff --git a/src/calibre/gui2/dialogs/add_from_isbn.ui b/src/calibre/gui2/dialogs/add_from_isbn.ui
index e37c4ed769..f598e6f1d8 100644
--- a/src/calibre/gui2/dialogs/add_from_isbn.ui
+++ b/src/calibre/gui2/dialogs/add_from_isbn.ui
@@ -18,8 +18,19 @@
     <normaloff>:/images/add_book.png</normaloff>:/images/add_book.png</iconset>
   </property>
   <layout class="QGridLayout" name="gridLayout">
-   <item row="0" column="0">
-    <widget class="QPlainTextEdit" name="isbn_box"/>
+   <item row="0" column="0" rowspan="2">
+    <layout class="QVBoxLayout" name="verticalLayout_2">
+     <item>
+      <widget class="QPlainTextEdit" name="isbn_box"/>
+     </item>
+     <item>
+      <widget class="QPushButton" name="paste_button">
+       <property name="text">
+        <string>&amp;Paste from clipboard</string>
+       </property>
+      </widget>
+     </item>
+    </layout>
    </item>
    <item row="0" column="1">
     <widget class="QLabel" name="label">
@@ -34,6 +45,36 @@
      </property>
     </widget>
    </item>
+   <item row="1" column="1">
+    <layout class="QVBoxLayout" name="verticalLayout">
+     <item>
+      <widget class="QLabel" name="label_2">
+       <property name="text">
+        <string>&amp;Tags to set on created book entries:</string>
+       </property>
+       <property name="buddy">
+        <cstring>add_tags</cstring>
+       </property>
+      </widget>
+     </item>
+     <item>
+      <widget class="QLineEdit" name="add_tags"/>
+     </item>
+     <item>
+      <spacer name="verticalSpacer">
+       <property name="orientation">
+        <enum>Qt::Vertical</enum>
+       </property>
+       <property name="sizeHint" stdset="0">
+        <size>
+         <width>20</width>
+         <height>40</height>
+        </size>
+       </property>
+      </spacer>
+     </item>
+    </layout>
+   </item>
    <item row="2" column="0" colspan="2">
     <widget class="QDialogButtonBox" name="buttonBox">
      <property name="orientation">
@@ -44,13 +85,6 @@
      </property>
     </widget>
    </item>
-   <item row="1" column="0">
-    <widget class="QPushButton" name="paste_button">
-     <property name="text">
-      <string>&amp;Paste from clipboard</string>
-     </property>
-    </widget>
-   </item>
   </layout>
  </widget>
  <resources>

From e1fd50d72f2eefb70928d20dab094f0402172bae Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Mon, 17 Jan 2011 15:39:23 -0700
Subject: [PATCH 081/118] GwR fix for TOC discontinuity, default cover swapping

---
 src/calibre/library/catalog.py | 37 ++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index fa5041bfec..13df6625d4 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -1360,6 +1360,7 @@ class EPUB_MOBI(CatalogPlugin):
                 return False
             self.fetchBookmarks()
             if self.opts.generate_descriptions:
+                self.generateThumbnails()
                 self.generateHTMLDescriptions()
             self.generateHTMLByAuthor()
             if self.opts.generate_titles:
@@ -1372,8 +1373,7 @@ class EPUB_MOBI(CatalogPlugin):
                 self.generateHTMLByDateAdded()
                 if self.generateRecentlyRead:
                     self.generateHTMLByDateRead()
-            if self.opts.generate_descriptions:
-                self.generateThumbnails()
+
             self.generateOPF()
             self.generateNCXHeader()
             self.generateNCXByAuthor("Authors")
@@ -1452,6 +1452,12 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
 
 
             self.booksByAuthor = sorted(self.booksByAuthor, key=self.booksByAuthorSorter_author_sort)
+
+#             for book in self.booksByAuthor:
+#                 print '{0:<10} {1:<5} {2:<20} {3:<20} {4:<20} {5:<20}'.format(book['series'], book['series_index'], book['title'],
+#                                                 book['author'], book['authors'],book['author_sort'])
+#             print
+
             # Build the unique_authors set from existing data
             authors = [(record['author'], record['author_sort'].capitalize()) for record in self.booksByAuthor]
 
@@ -2848,23 +2854,26 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                 thumb_generated = True
                 valid_cover = True
                 try:
-                    thumbs.append("thumbnail_%d.jpg" % int(title['id']))
                     self.generateThumbnail(title, image_dir, thumb_file)
+                    thumbs.append("thumbnail_%d.jpg" % int(title['id']))
                 except:
                     if 'cover' in title and os.path.exists(title['cover']):
                         valid_cover = False
-                        self.opts.log.warn(" *** Invalid cover file for '%s' ***" % (title['title']))
+                        self.opts.log.warn(" *** Invalid cover file for '%s'***" %
+                                             (title['title']))
                         if not self.error:
                             self.error.append('Invalid cover files')
                         self.error.append("Warning: invalid cover file for '%s', default cover substituted.\n" % (title['title']))
+
                     thumb_generated = False
 
                 if not thumb_generated:
-                    self.opts.log.warn(" using default cover for '%s'" % (title['title']))
-                    # Check to make sure default is current
-                    # Check to see if thumbnail exists
+                    self.opts.log.warn(" using default cover for '%s' (%d)" % (title['title'], title['id']))
+                    # Confirm thumb exists, default is current
                     default_thumb_fp = os.path.join(image_dir,"thumbnail_default.jpg")
                     cover = os.path.join(self.catalogPath, "DefaultCover.png")
+                    title['cover'] = cover
+
                     if not os.path.exists(cover):
                         shutil.copyfile(I('book.png'), cover)
 
@@ -2877,17 +2886,15 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                         if thumb_timestamp < cover_timestamp:
                             if False and self.verbose:
                                 self.opts.log.warn("updating thumbnail_default for %s" % title['title'])
-                            #title['cover'] = os.path.join(self.catalogPath,"DefaultCover.jpg")
-                            title['cover'] = cover
                             self.generateThumbnail(title, image_dir,
                                                 "thumbnail_default.jpg" if valid_cover else thumb_file)
                     else:
                         if False and self.verbose:
                             self.opts.log.warn(" generating new thumbnail_default.jpg")
-                        #title['cover'] = os.path.join(self.catalogPath,"DefaultCover.jpg")
-                        title['cover'] = cover
                         self.generateThumbnail(title, image_dir,
                                                 "thumbnail_default.jpg" if valid_cover else thumb_file)
+                    # Clear the book's cover property
+                    title['cover'] = None
 
 
             # Write thumb_width to the file, validating cache contents
@@ -3881,7 +3888,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
             outfile.write(self.ncxSoup.prettify())
 
 
-        # --------------- Helpers ---------------
+        # ======================== Helpers ========================
         def author_to_author_sort(self, author):
             tokens = author.split()
             tokens = tokens[-1:] + tokens[:-1]
@@ -3894,14 +3901,14 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
             Sort non-series books before series books
             '''
             if not book['series']:
-                key = '%s %s' % (book['author_sort'],
+                key = '%s %s' % (book['author_sort'].capitalize(),
                                  book['title_sort'].capitalize())
             else:
                 index = book['series_index']
                 integer = int(index)
                 fraction = index-integer
                 series_index = '%04d%s' % (integer, str('%0.4f' % fraction).lstrip('0'))
-                key = '%s ~%s %s' % (book['author_sort'],
+                key = '%s ~%s %s' % (book['author_sort'].capitalize(),
                                      self.generateSortTitle(book['series']),
                                      series_index)
             return key
@@ -4315,7 +4322,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
             # Thumb
             _soup = BeautifulSoup('<html>',selfClosingTags=['img'])
             thumb = Tag(_soup,"img")
-            if 'cover' in book:
+            if 'cover' in book and book['cover']:
                 thumb['src']  = "../images/thumbnail_%d.jpg" % int(book['id'])
             else:
                 thumb['src']  = "../images/thumbnail_default.jpg"

From af04ca87cefa556c025e5cecaf5c92934b1264ff Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Mon, 17 Jan 2011 15:52:25 -0700
Subject: [PATCH 082/118] GwR fix for TOC discontinuity, default cover swapping

---
 src/calibre/library/catalog.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 13df6625d4..8edf266cfb 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -4314,10 +4314,9 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
 
             # Date of publication
             pubdate = book['date']
-            if pubdate:
-                pubmonth, pubyear = pubdate.split(' ')
-            else:
-                pubmonth = pubyear = ''
+            pubmonth, pubyear = pubdate.split() if pubdate else ('', '')  # keep old guard for a missing date
+            if pubyear == '101':
+                pubdate = pubmonth = pubyear = ''
 
             # Thumb
             _soup = BeautifulSoup('<html>',selfClosingTags=['img'])

From e82dd54242eb86382f55a2840338acea526b4fb1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 17:42:40 -0700
Subject: [PATCH 083/118] Fix #8409 (bulk edit date)

---
 src/calibre/gui2/dialogs/metadata_bulk.py |  24 ++++-
 src/calibre/gui2/dialogs/metadata_bulk.ui | 117 +++++++++++++++-------
 2 files changed, 100 insertions(+), 41 deletions(-)

diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py
index 2b3a319663..6e6b553dba 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.py
+++ b/src/calibre/gui2/dialogs/metadata_bulk.py
@@ -49,7 +49,6 @@ def get_cover_data(path): # {{{
     return cdata, area
 # }}}
 
-
 class MyBlockingBusy(QDialog): # {{{
 
     do_one_signal = pyqtSignal()
@@ -134,7 +133,7 @@ class MyBlockingBusy(QDialog): # {{{
             do_autonumber, do_remove_format, remove_format, do_swap_ta, \
             do_remove_conv, do_auto_author, series, do_series_restart, \
             series_start_value, do_title_case, cover_action, clear_series, \
-            pubdate = self.args
+            pubdate, adddate = self.args
 
 
         # first loop: do author and title. These will commit at the end of each
@@ -214,6 +213,9 @@ class MyBlockingBusy(QDialog): # {{{
             if pubdate is not None:
                 self.db.set_pubdate(id, pubdate, notify=False, commit=False)
 
+            if adddate is not None:
+                self.db.set_timestamp(id, adddate, notify=False, commit=False)
+
             if do_series:
                 if do_series_restart:
                     if self.series_start_value is None:
@@ -300,6 +302,10 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         self.pubdate.setSpecialValueText(_('Undefined'))
         self.clear_pubdate_button.clicked.connect(self.clear_pubdate)
         self.pubdate.dateChanged.connect(self.do_apply_pubdate)
+        self.adddate.setMinimumDate(UNDEFINED_QDATE)
+        self.adddate.setSpecialValueText(_('Undefined'))
+        self.clear_adddate_button.clicked.connect(self.clear_adddate)
+        self.adddate.dateChanged.connect(self.do_apply_adddate)
 
         if len(self.db.custom_field_keys(include_composites=False)) == 0:
             self.central_widget.removeTab(1)
@@ -322,6 +328,12 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
     def clear_pubdate(self, *args):
         self.pubdate.setDate(UNDEFINED_QDATE)
 
+    def do_apply_adddate(self, *args):
+        self.apply_adddate.setChecked(True)
+
+    def clear_adddate(self, *args):
+        self.adddate.setDate(UNDEFINED_QDATE)
+
     def button_clicked(self, which):
         if which == self.button_box.button(QDialogButtonBox.Apply):
             self.do_again = True
@@ -726,7 +738,7 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
             name = name.strip().replace('|', ',')
             self.authors.addItem(name)
         self.authors.setEditText('')
-        
+
         self.authors.set_separator('&')
         self.authors.set_space_before_sep(True)
         self.authors.update_items_cache(self.db.all_author_names())
@@ -805,9 +817,11 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
         do_remove_conv = self.remove_conversion_settings.isChecked()
         do_auto_author = self.auto_author_sort.isChecked()
         do_title_case = self.change_title_to_title_case.isChecked()
-        pubdate = None
+        pubdate = adddate = None
         if self.apply_pubdate.isChecked():
             pubdate = qt_to_dt(self.pubdate.date())
+        if self.apply_adddate.isChecked():
+            adddate = qt_to_dt(self.adddate.date())
 
         cover_action = None
         if self.cover_remove.isChecked():
@@ -821,7 +835,7 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
                 do_autonumber, do_remove_format, remove_format, do_swap_ta,
                 do_remove_conv, do_auto_author, series, do_series_restart,
                 series_start_value, do_title_case, cover_action, clear_series,
-                pubdate)
+                pubdate, adddate)
 
         bb = MyBlockingBusy(_('Applying changes to %d books.\nPhase {0} {1}%%.')
                 %len(self.ids), args, self.db, self.ids,
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui
index 8db74b343d..f8ae926be6 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.ui
+++ b/src/calibre/gui2/dialogs/metadata_bulk.ui
@@ -347,6 +347,51 @@ from the value in the box</string>
              </item>
             </layout>
            </item>
+           <item row="9" column="0">
+            <widget class="QLabel" name="label_10">
+             <property name="text">
+              <string>&amp;Date:</string>
+             </property>
+             <property name="alignment">
+              <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
+             </property>
+             <property name="buddy">
+              <cstring>adddate</cstring>
+             </property>
+            </widget>
+           </item>
+           <item row="9" column="1">
+            <layout class="QHBoxLayout" name="horizontalLayout_5">
+             <item>
+              <widget class="QDateEdit" name="adddate">
+               <property name="displayFormat">
+                <string>d MMM yyyy</string>
+               </property>
+               <property name="calendarPopup">
+                <bool>true</bool>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QToolButton" name="clear_adddate_button">
+               <property name="text">
+                <string>...</string>
+               </property>
+               <property name="icon">
+                <iconset resource="../../../../resources/images.qrc">
+                 <normaloff>:/images/trash.png</normaloff>:/images/trash.png</iconset>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
+           <item row="9" column="2">
+            <widget class="QCheckBox" name="apply_adddate">
+             <property name="text">
+              <string>&amp;Apply date</string>
+             </property>
+            </widget>
+           </item>
            <item row="10" column="0">
             <widget class="QLabel" name="label_9">
              <property name="text">
@@ -395,6 +440,42 @@ from the value in the box</string>
              </property>
             </widget>
            </item>
+           <item row="11" column="0">
+            <widget class="QLabel" name="label_5">
+             <property name="text">
+              <string>Remove &amp;format:</string>
+             </property>
+             <property name="buddy">
+              <cstring>remove_format</cstring>
+             </property>
+            </widget>
+           </item>
+           <item row="11" column="1">
+            <widget class="QComboBox" name="remove_format">
+             <property name="maximumSize">
+              <size>
+               <width>120</width>
+               <height>16777215</height>
+              </size>
+             </property>
+            </widget>
+           </item>
+           <item row="12" column="0">
+            <spacer name="verticalSpacer">
+             <property name="orientation">
+              <enum>Qt::Vertical</enum>
+             </property>
+             <property name="sizeType">
+              <enum>QSizePolicy::Fixed</enum>
+             </property>
+             <property name="sizeHint" stdset="0">
+              <size>
+               <width>20</width>
+               <height>15</height>
+              </size>
+             </property>
+            </spacer>
+           </item>
            <item row="13" column="0" colspan="3">
             <layout class="QHBoxLayout" name="horizontalLayout_3">
              <item>
@@ -478,42 +559,6 @@ Future conversion of these books will use the default settings.</string>
              </property>
             </spacer>
            </item>
-           <item row="12" column="0">
-            <spacer name="verticalSpacer">
-             <property name="orientation">
-              <enum>Qt::Vertical</enum>
-             </property>
-             <property name="sizeType">
-              <enum>QSizePolicy::Fixed</enum>
-             </property>
-             <property name="sizeHint" stdset="0">
-              <size>
-               <width>20</width>
-               <height>15</height>
-              </size>
-             </property>
-            </spacer>
-           </item>
-           <item row="11" column="0">
-            <widget class="QLabel" name="label_5">
-             <property name="text">
-              <string>Remove &amp;format:</string>
-             </property>
-             <property name="buddy">
-              <cstring>remove_format</cstring>
-             </property>
-            </widget>
-           </item>
-           <item row="11" column="1">
-            <widget class="QComboBox" name="remove_format">
-             <property name="maximumSize">
-              <size>
-               <width>120</width>
-               <height>16777215</height>
-              </size>
-             </property>
-            </widget>
-           </item>
           </layout>
          </widget>
          <widget class="QWidget" name="tab">

From 5376137156d4dc1e86aeddc8c1238f16fc46754b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 18:01:05 -0700
Subject: [PATCH 084/118] ...

---
 src/calibre/gui2/ui.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py
index 9eb202d761..6c6e41e0a5 100644
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@@ -485,7 +485,8 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
             if 'calibre.ebooks.DRMError' in job.details:
                 if not minz:
                     from calibre.gui2.dialogs.drm_error import DRMErrorMessage
-                    d = DRMErrorMessage(self, job.description.split(':')[-1])
+                    d = DRMErrorMessage(self, _('Cannot convert') + ' ' +
+                        job.description.split(':')[-1].partition('(')[-1][:-1])
                     d.setModal(False)
                     d.show()
                     self._modeless_dialogs.append(d)

From 8f38a56699d727e6338aaf3b228cb1a438b9fe76 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 18:01:35 -0700
Subject: [PATCH 085/118] Use html editor widget for comments in convert dialog
 as well

---
 src/calibre/gui2/comments_editor.py  |  5 ++
 src/calibre/gui2/convert/metadata.py |  6 +-
 src/calibre/gui2/convert/metadata.ui | 85 ++++++++++------------------
 3 files changed, 40 insertions(+), 56 deletions(-)

diff --git a/src/calibre/gui2/comments_editor.py b/src/calibre/gui2/comments_editor.py
index 013d13f9e7..04bc5284ed 100644
--- a/src/calibre/gui2/comments_editor.py
+++ b/src/calibre/gui2/comments_editor.py
@@ -593,6 +593,11 @@ class Editor(QWidget): # {{{
     def code_dirtied(self, *args):
         self.source_dirty = True
 
+    def hide_toolbars(self):
+        self.toolbar1.setVisible(False)
+        self.toolbar2.setVisible(False)
+        self.toolbar3.setVisible(False)
+
 # }}}
 
 if __name__ == '__main__':
diff --git a/src/calibre/gui2/convert/metadata.py b/src/calibre/gui2/convert/metadata.py
index 5f39202e26..23cac74cf8 100644
--- a/src/calibre/gui2/convert/metadata.py
+++ b/src/calibre/gui2/convert/metadata.py
@@ -18,6 +18,7 @@ from calibre.ebooks.metadata.opf2 import metadata_to_opf
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.gui2.convert import Widget
 from calibre.utils.icu import sort_key
+from calibre.library.comments import comments_to_html
 
 def create_opf_file(db, book_id):
     mi = db.get_metadata(book_id, index_is_id=True)
@@ -57,6 +58,7 @@ class MetadataWidget(Widget, Ui_Form):
             self.initialize_metadata_options()
         self.initialize_options(get_option, get_help, db, book_id)
         self.connect(self.cover_button, SIGNAL("clicked()"), self.select_cover)
+        self.comment.hide_toolbars()
 
     def deduce_author_sort(self, *args):
         au = unicode(self.author.currentText())
@@ -79,7 +81,7 @@ class MetadataWidget(Widget, Ui_Form):
         self.author_sort.setText(mi.author_sort if mi.author_sort else '')
         self.tags.setText(', '.join(mi.tags if mi.tags else []))
         self.tags.update_items_cache(self.db.all_tags())
-        self.comment.setPlainText(mi.comments if mi.comments else '')
+        self.comment.html = comments_to_html(mi.comments) if mi.comments else ''
         if mi.series:
             self.series.setCurrentIndex(self.series.findText(mi.series))
         if mi.series_index is not None:
@@ -154,7 +156,7 @@ class MetadataWidget(Widget, Ui_Form):
         author_sort = unicode(self.author_sort.text()).strip()
         if author_sort:
             mi.author_sort = author_sort
-        comments = unicode(self.comment.toPlainText()).strip()
+        comments = self.comment.html
         if comments:
             mi.comments = comments
         mi.series_index = float(self.series_index.value())
diff --git a/src/calibre/gui2/convert/metadata.ui b/src/calibre/gui2/convert/metadata.ui
index 8db4cfa2a1..61c27594c4 100644
--- a/src/calibre/gui2/convert/metadata.ui
+++ b/src/calibre/gui2/convert/metadata.ui
@@ -20,30 +20,6 @@
       <string>Book Cover</string>
      </property>
      <layout class="QGridLayout" name="_2">
-      <item row="0" column="0">
-       <layout class="QHBoxLayout" name="_3">
-        <item>
-         <widget class="ImageView" name="cover" native="true">
-          <property name="sizePolicy">
-           <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
-            <horstretch>0</horstretch>
-            <verstretch>0</verstretch>
-           </sizepolicy>
-          </property>
-         </widget>
-        </item>
-       </layout>
-      </item>
-      <item row="2" column="0">
-       <widget class="QCheckBox" name="opt_prefer_metadata_cover">
-        <property name="text">
-         <string>Use cover from &amp;source file</string>
-        </property>
-        <property name="checked">
-         <bool>true</bool>
-        </property>
-       </widget>
-      </item>
       <item row="1" column="0">
        <layout class="QVBoxLayout" name="_4">
         <property name="spacing">
@@ -95,6 +71,30 @@
         </item>
        </layout>
       </item>
+      <item row="2" column="0">
+       <widget class="QCheckBox" name="opt_prefer_metadata_cover">
+        <property name="text">
+         <string>Use cover from &amp;source file</string>
+        </property>
+        <property name="checked">
+         <bool>true</bool>
+        </property>
+       </widget>
+      </item>
+      <item row="0" column="0">
+       <layout class="QHBoxLayout" name="_3">
+        <item>
+         <widget class="ImageView" name="cover" native="true">
+          <property name="sizePolicy">
+           <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
+            <horstretch>0</horstretch>
+            <verstretch>0</verstretch>
+           </sizepolicy>
+          </property>
+         </widget>
+        </item>
+       </layout>
+      </item>
      </layout>
      <zorder>opt_prefer_metadata_cover</zorder>
      <zorder></zorder>
@@ -264,35 +264,7 @@
       </layout>
      </item>
      <item>
-      <widget class="QGroupBox" name="groupBox_2">
-       <property name="sizePolicy">
-        <sizepolicy hsizetype="Minimum" vsizetype="Minimum">
-         <horstretch>0</horstretch>
-         <verstretch>0</verstretch>
-        </sizepolicy>
-       </property>
-       <property name="maximumSize">
-        <size>
-         <width>16777215</width>
-         <height>200</height>
-        </size>
-       </property>
-       <property name="title">
-        <string>Comments</string>
-       </property>
-       <layout class="QGridLayout" name="_8">
-        <item row="0" column="0">
-         <widget class="QTextEdit" name="comment">
-          <property name="maximumSize">
-           <size>
-            <width>16777215</width>
-            <height>180</height>
-           </size>
-          </property>
-         </widget>
-        </item>
-       </layout>
-      </widget>
+      <widget class="Editor" name="comment" native="true"/>
      </item>
     </layout>
    </item>
@@ -325,6 +297,12 @@
    <header>calibre/gui2/widgets.h</header>
    <container>1</container>
   </customwidget>
+  <customwidget>
+   <class>Editor</class>
+   <extends>QWidget</extends>
+   <header>calibre/gui2/comments_editor.h</header>
+   <container>1</container>
+  </customwidget>
  </customwidgets>
  <tabstops>
   <tabstop>title</tabstop>
@@ -334,7 +312,6 @@
   <tabstop>tags</tabstop>
   <tabstop>series</tabstop>
   <tabstop>series_index</tabstop>
-  <tabstop>comment</tabstop>
   <tabstop>cover_path</tabstop>
   <tabstop>cover_button</tabstop>
   <tabstop>opt_prefer_metadata_cover</tabstop>

From 798c361bcdca8634c7a1ab79704e63294a08a73b Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 17 Jan 2011 20:38:51 -0500
Subject: [PATCH 086/118] TXT Input: Add textile option to options.

---
 src/calibre/ebooks/txt/input.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 0b0bd6d570..cca9a74250 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -34,7 +34,7 @@ class TXTInput(InputFormatPlugin):
                    'starts a paragraph.'
                    '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
         OptionRecommendation(name='formatting_type', recommended_value='auto',
-            choices=['auto', 'none', 'heuristic', 'markdown'],
+            choices=['auto', 'none', 'heuristic', 'textile', 'markdown'],
             help=_('Formatting used within the document.'
                    '* auto: Automatically decide which formatting processor to use.\n'
                    '* none: Do not process the document formatting. Everything is a '

From 39bffe09836da0a47fba0efd235d4664120241f8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 20:43:27 -0700
Subject: [PATCH 087/118] Updated Ars Technica

---
 resources/recipes/ars_technica.recipe | 51 +++++++++++++++++++++------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/resources/recipes/ars_technica.recipe b/resources/recipes/ars_technica.recipe
index 3997ee4645..3a955d5e15 100644
--- a/resources/recipes/ars_technica.recipe
+++ b/resources/recipes/ars_technica.recipe
@@ -1,6 +1,5 @@
-
 __license__   = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 arstechnica.com
 '''
@@ -9,19 +8,26 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 
-class ArsTechnica2(BasicNewsRecipe):
+class ArsTechnica(BasicNewsRecipe):
     title                 = u'Ars Technica'
     language              = 'en'
-    __author__            = 'Darko Miletic and Sujata Raman'
+    __author__            = 'Darko Miletic, Sujata Raman, Alexis Rohou'
     description           = 'The art of technology'
     publisher             = 'Ars Technica'
     category              = 'news, IT, technology'
-    oldest_article        = 2
+    oldest_article        = 5
     max_articles_per_feed = 100
     no_stylesheets        = True
     encoding              = 'utf-8'
     use_embedded_content  = False
-    extra_css             = ' body {font-family: Arial,Helvetica,sans-serif} .title{text-align: left} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} '
+    extra_css             = 	'''
+				body {font-family: Arial,Helvetica,sans-serif}
+				.title{text-align: left}
+				.byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
+				.news-item-figure-caption-text{font-size:small; font-style:italic}
+				.news-item-figure-caption-byline{font-size:small; font-style:italic; font-weight:bold}
+				'''
+    ignoreEtcArticles     = True	# Etc feed items can be ignored, as they're not real stories
 
     conversion_options = {
                              'comments'  : description
@@ -31,10 +37,10 @@ class ArsTechnica2(BasicNewsRecipe):
                          }
 
 
-    preprocess_regexps = [
-                (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
-               ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
-                         ]
+    #preprocess_regexps = [
+    #            (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
+    #           ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
+    #                     ]
 
     keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]
 
@@ -42,7 +48,7 @@ class ArsTechnica2(BasicNewsRecipe):
                      dict(name=['object','link','embed'])
                     ,dict(name='div', attrs={'class':'read-more-link'})
                   ]
-    remove_attributes=['width','height']
+    #remove_attributes=['width','height']
 
     feeds = [
               (u'Infinite Loop (Apple content)'        , u'http://feeds.arstechnica.com/arstechnica/apple/'      )
@@ -56,6 +62,7 @@ class ArsTechnica2(BasicNewsRecipe):
              ,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
             ]
 
+    # This deals with multi-page stories
     def append_page(self, soup, appendtag, position):
         pager = soup.find('div',attrs={'class':'pager'})
         if pager:
@@ -81,6 +88,7 @@ class ArsTechnica2(BasicNewsRecipe):
 
 
     def preprocess_html(self, soup):
+	# Adds line breaks near the byline (not sure why this is needed)
         ftag = soup.find('div', attrs={'class':'byline'})
         if ftag:
            brtag = Tag(soup,'br')
@@ -88,12 +96,33 @@ class ArsTechnica2(BasicNewsRecipe):
            ftag.insert(4,brtag)
            ftag.insert(5,brtag2)
 
+	# Remove style items
         for item in soup.findAll(style=True):
            del item['style']
 
+	# Remove id
+	for item in soup.findAll(id=True):
+		del item['id']
+
+	# For some reason, links to authors don't have the domainname
+	a_author = soup.find('a',{'href':re.compile("^/author")})
+	if a_author:
+		a_author['href'] = 'http://arstechnica.com'+a_author['href']
+
+	# within div class news-item-figure, we need to grab images
+
+	# Deal with multi-page stories
         self.append_page(soup, soup.body, 3)
 
         return soup
 
     def get_article_url(self, article):
+	# If the article title starts with Etc:, don't return it
+	if self.ignoreEtcArticles:
+		article_title = article.get('title',None)
+		if re.match('Etc: ',article_title) is not None:
+			return None
+
+	# The actual article is in a guid tag
         return article.get('guid',  None).rpartition('?')[0]
+

From de60086a688c87116a593154c49398a72f25a2d0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 21:08:07 -0700
Subject: [PATCH 088/118] Updated Seattle Times

---
 resources/recipes/seattle_times.recipe | 57 +++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/resources/recipes/seattle_times.recipe b/resources/recipes/seattle_times.recipe
index 7fcea9cae5..cd7f96fc8b 100644
--- a/resources/recipes/seattle_times.recipe
+++ b/resources/recipes/seattle_times.recipe
@@ -21,16 +21,53 @@ class SeattleTimes(BasicNewsRecipe):
     encoding              = 'cp1252'
     language = 'en'
 
-
-    html2lrf_options = [
-                          '--comment'  , description
-                        , '--category' , category
-                        , '--publisher', publisher
-                        ]
-
-    html2epub_options  = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
-
-    feeds              = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]
+    feeds              = [
+                          (u'Top Stories',
+                              u'http://seattletimes.nwsource.com/rss/home.xml'),
+                          #(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')
+                          (u'Business & Technology',
+                              u'http://seattletimes.nwsource.com/rss/businesstechnology.xml'),
+                          (u'Personal Technology',
+                              u'http://seattletimes.nwsource.com/rss/personaltechnology.xml'),
+                          (u'Entertainment & the Arts',
+                              u'http://seattletimes.nwsource.com/rss/artsentertainment.xml'),
+                          (u'Health',
+                              u'http://seattletimes.nwsource.com/rss/health.xml'),
+                          (u'Living',
+                              u'http://seattletimes.nwsource.com/rss/living.xml'),
+                          (u'Local News',
+                              u'http://seattletimes.nwsource.com/rss/localnews.xml'),
+                          (u'Nation & World',
+                              u'http://seattletimes.nwsource.com/rss/nationworld.xml'),
+                          (u'Opinion',
+                              u'http://seattletimes.nwsource.com/rss/opinion.xml'),
+                          (u'Politics',
+                              u'http://seattletimes.nwsource.com/rss/politics.xml'),
+                          (u'Sports',
+                              u'http://seattletimes.nwsource.com/rss/sports.xml'),
+                          (u'Nicole Brodeur',
+                              u'http://seattletimes.nwsource.com/rss/nicolebrodeur.xml'),
+                          (u'Danny Westneat',
+                              u'http://seattletimes.nwsource.com/rss/dannywestneat.xml'),
+                          (u'Jerry Large',
+                              u'http://seattletimes.nwsource.com/rss/jerrylarge.xml'),
+                          (u'Ron Judd',
+                              u'http://seattletimes.nwsource.com/rss/ronjudd.xml'),
+                          (u'Education',
+                              u'http://seattletimes.nwsource.com/rss/education.xml'),
+                          (u'Letters to the Editor',
+                              u'http://seattletimes.nwsource.com/rss/northwestvoices.xml'),
+                          (u'Travel',
+                              u'http://seattletimes.nwsource.com/rss/travel.xml'),
+                          (u'Outdoors',
+                              u'http://seattletimes.nwsource.com/rss/outdoors.xml'),
+                          (u'Steve Kelley',
+                              u'http://seattletimes.nwsource.com/rss/stevekelley.xml'),
+                          (u'Jerry Brewer',
+                              u'http://seattletimes.nwsource.com/rss/jerrybrewer.xml'),
+                          (u'Most Read Articles',
+                              u'http://seattletimes.nwsource.com/rss/mostreadarticles.xml'),
+                         ]
 
     remove_tags        = [
                              dict(name=['object','link','script'])

From 1768d5cdab9520d0295c7c01f574e441f01cd4f4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 21:23:16 -0700
Subject: [PATCH 089/118] ...

---
 src/calibre/devices/eb600/driver.py |  2 +-
 src/calibre/devices/misc.py         |  4 ++--
 src/calibre/gui2/wizard/__init__.py | 15 +++++++++++++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py
index 3201229699..95f6dc6ab0 100644
--- a/src/calibre/devices/eb600/driver.py
+++ b/src/calibre/devices/eb600/driver.py
@@ -178,7 +178,7 @@ class INVESBOOK(EB600):
 
 class BOOQ(EB600):
     name = 'Booq Device Interface'
-    gui_name = 'Booq'
+    gui_name = 'bq Reader'
 
     FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'pdf', 'doc', 'rtf', 'txt', 'html']
 
diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py
index ecd12ac61d..aaf948f25e 100644
--- a/src/calibre/devices/misc.py
+++ b/src/calibre/devices/misc.py
@@ -33,8 +33,8 @@ class PALMPRE(USBMS):
 
 class AVANT(USBMS):
     name           = 'Booq Avant Device Interface'
-    gui_name       = 'Avant'
-    description    = _('Communicate with the Booq Avant')
+    gui_name       = 'bq Avant'
+    description    = _('Communicate with the Bq Avant')
     author         = 'Kovid Goyal'
     supported_platforms = ['windows', 'osx', 'linux']
 
diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index 4e5e79bbdf..8144dcabf3 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -111,7 +111,7 @@ class Kobo(Device):
     id = 'kobo'
 
 class Booq(Device):
-    name = 'Booq Reader'
+    name = 'bq Classic'
     manufacturer = 'Booq'
     output_profile = 'sony'
     output_format = 'EPUB'
@@ -125,7 +125,18 @@ class TheBook(Device):
     id = 'thebook'
 
 class Avant(Booq):
-    name = 'Booq Avant'
+    name = 'bq Avant'
+
+class AvantXL(Booq):
+    name = 'bq Avant XL'
+    output_profile = 'ipad'
+
+class BooqPocketPlus(Booq):
+    name = 'bq Pocket Plus'
+    output_profile = 'sony300'
+
+class BooqCervantes(Booq):
+    name = 'bq Cervantes'
 
 class Sony300(Sony505):
 

From dccbc65fc98e3227ebb5e03f90bdeb7e5ca1ade2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jan 2011 21:50:00 -0700
Subject: [PATCH 090/118] ...

---
 src/calibre/gui2/preferences/plugins.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/calibre/gui2/preferences/plugins.py b/src/calibre/gui2/preferences/plugins.py
index c53c634ab4..b00a485566 100644
--- a/src/calibre/gui2/preferences/plugins.py
+++ b/src/calibre/gui2/preferences/plugins.py
@@ -196,6 +196,12 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
     def modify_plugin(self, op=''):
         index = self.plugin_view.currentIndex()
         if index.isValid():
+            if not index.parent().isValid():
+                name = unicode(index.data().toString())
+                return error_dialog(self, _('Error'), '<p>'+
+                        _('Select an actual plugin under <b>%s</b> to customize')%name,
+                        show=True, show_copy_button=False)
+
             plugin = self._plugin_model.index_to_plugin(index)
             if op == 'toggle':
                 if not plugin.can_be_disabled:

From 539f24213d0410f413c4802dc3ae83bcd338c783 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 18 Jan 2011 12:51:55 +0800
Subject: [PATCH 091/118] tweaked chapter thresholds

---
 src/calibre/ebooks/conversion/utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bfb5f1c153..4d017b7df4 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -21,7 +21,6 @@ class HeuristicProcessor(object):
         self.deleted_nbsps = False
         self.totalwords = 0
         self.min_chapters = 1
-        self.max_chapters = 150
         self.chapters_no_title = 0
         self.chapters_with_title = 0
         self.blanks_deleted = False
@@ -169,9 +168,12 @@ class HeuristicProcessor(object):
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
         # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
         # or pdf page numbers from being treated as TOC markers
+        max_chapters = 150
+        typical_chapters = 7000.
         if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 15000.))
-            self.max_chapters = int(ceil(wordcount / 1200.))
+            if wordcount > 200000:
+                typical_chapters = 15000.
+            self.min_chapters = int(ceil(wordcount / typical_chapters))
         print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))

From e72ceb5c59ef96a7d67b8aaa675b8b90a057a642 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 18 Jan 2011 16:35:40 +0800
Subject: [PATCH 092/118] updated docs and labels

---
 src/calibre/ebooks/conversion/cli.py   |   4 +-
 src/calibre/gui2/convert/heuristics.py |   2 +-
 src/calibre/gui2/convert/heuristics.ui |   2 +-
 src/calibre/manual/conversion.rst      | 100 ++++++++++++++++++-------
 4 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index c9612d97b9..b5c057b0f9 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -130,7 +130,7 @@ def add_pipeline_options(parser, plumber):
                   ]
                   ),
                   
-              'HEURISTICS' : (
+              'HEURISTIC PROCESSING' : (
                   _('Modify the document text and structure using common patterns.'),
                   [
                       'enable_heuristics', 'markup_chapter_headings',
@@ -182,7 +182,7 @@ def add_pipeline_options(parser, plumber):
 
               }
 
-    group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
+    group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING',
             'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
             'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
 
diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 4735782f52..6739c199b7 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -11,7 +11,7 @@ from calibre.gui2.convert import Widget
 
 class HeuristicsWidget(Widget, Ui_Form):
 
-    TITLE = _('Heuristics')
+    TITLE = _('Heuristic Processing')
     HELP  = _('Modify the document text and structure using common patterns.')
     COMMIT_NAME = 'heuristics'
 
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index 1578b7146c..8048bef204 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -24,7 +24,7 @@
    <item>
     <widget class="QGroupBox" name="groupBox">
      <property name="title">
-      <string>Heuristics</string>
+      <string>Heuristic Processing</string>
      </property>
      <layout class="QGridLayout" name="gridLayout">
       <item row="0" column="0" colspan="2">
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index c392df9a5e..94a3a60721 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -260,40 +260,72 @@ The Output profile also controls the screen size. This will cause, for example,
 Heuristic Processing
 ---------------------
 
-:guilabel:`Preprocess input`
-    This option activates various algorithms that try to detect and correct common cases of
-    badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
-    Turn this option on if your input document suffers from bad formatting. But be aware that in
-    some cases, this option can lead to worse results, so use with care.
+Heuristic Processing provides a variety of functions which can be used to try to detect and correct 
+common problems in poorly formatted input documents.  Use these functions if your input document suffers 
+from bad formatting. Because these functions rely on common patterns, be aware that in some cases an 
+option may lead to worse results, so use with care.  As an example, several of these options will
+remove all non-breaking-space entities.
 
-:guilabel:`Line-unwrap factor`
-    This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
-    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
-    than the length of 40% of all lines in the document. 
+:guilabel:`Preprocess input`
+    This option activates |app|'s Heuristic Processing stage of the conversion pipeline.
+    This must be enabled in order for the various sub-functions to be applied.
 
 :guilabel:`Unwrap lines`
-    Lorem ipsum
+    Enabling this option will cause |app| to attempt to detect and correct hard line breaks that exist 
+    within a document using punctuation clues and line length.  |app| will first attempt to detect whether 
+    hard line breaks exist; if they do not appear to exist, |app| will not attempt to unwrap lines.  The 
+    line-unwrap factor can be reduced if you want to 'force' |app| to unwrap lines.
+
+:guilabel:`Line-unwrap factor`
+    This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
+    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
+    than the length of 40% of all lines in the document.  If your document only has a few line breaks which need
+    correction, then this value should be reduced to somewhere between 0.1 and 0.2.
     
 :guilabel:`Detect and markup unformatted chapter headings and sub headings`
-    Lorem ipsum
+    If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
+    |app| can use this option to attempt to detect them and surround them with heading tags. &lt;h2&gt; tags are used 
+    for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.  This function will 
+    not create a TOC, but in many cases it will cause |app|'s default chapter detection settings to correctly
+    detect chapters and build a TOC.  Adjust the Xpath under Structure Detection if a TOC is not automatically
+    created.  The inserted heading tags are not formatted; to apply formatting, use the 'extra_css' option under
+    the Look and Feel conversion settings.  For example, to center heading tags, use the following::
+
+        h2, h3 { text-align: center }
+
+:guilabel:`Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags`
+    Some publishers format chapter headings using multiple &lt;h1&gt; or &lt;h2&gt; tags sequentially.  
+    |app|'s default conversion settings will cause such titles to be split into two pieces.  This option 
+    will re-number the heading tags to prevent splitting.
 
-:guilabel:`Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting`
-    Lorem ipsum
-    
 :guilabel:`Delete blank lines between paragraphs`
-    Lorem ipsum
+    This option will cause |app| to analyze blank lines included within the document.  If every paragraph is interleaved
+    with a blank line, then |app| will remove all those blank paragraphs.  Sequences of multiple blank lines will be
+    considered scene breaks and retained as a single paragraph.  This option differs from the 'Remove Paragraph Spacing' 
+    option under 'Look and Feel' in that it actually modifies the HTML content, while the other option modifies the document
+    styles.  This option can also remove paragraphs which were inserted using |app|'s 'Insert blank line' option.
 
 :guilabel:`Ensure scene breaks are consistently formatted`
-    Lorem ipsum
+    With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.  
+    It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
+    page width.  Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and 
+    thus become difficult to distinguish.
 
 :guilabel:`Remove unnecessary hyphens`
-    Lorem ipsum
+    |app| will analyze all hyphenated content in the document when this option is enabled.  The document itself is used
+    as a dictionary for analysis.  This allows |app| to accurately remove hyphens for any words in the document in any language, 
+    along with made-up and obscure scientific words.  The primary drawback is words appearing only a single time in the document 
+    will not be changed.  Analysis happens in two passes, the first pass analyzes line endings.  Lines are only unwrapped if the 
+    word exists with or without a hyphen in the document.  The second pass analyzes all hyphenated words throughout the document, 
+    and hyphens are removed if the word exists elsewhere in the document without a hyphen.
 
 :guilabel:`Italicize common words and patterns`
-    Lorem ipsum
+    When enabled, |app| will look for common words and patterns that denote italics and italicize them.  Examples are common text
+    conventions such as ~word~ or phrases that should generally be italicized, e.g. Latin phrases like 'etc.' or 'et cetera'.
 
 :guilabel:`Replace entity indents with CSS indents`
-    Lorem ipsum
+    Some documents use a convention of defining text indents using non-breaking space entities.  When this option is enabled |app| will
+    attempt to detect this sort of formatting and convert it to a 3% text indent using CSS.
 
 .. _structure-detection:
 
@@ -518,15 +550,10 @@ at `mobileread <http://www.mobileread.com/forums/showthread.php?t=28313>`_.
 Convert TXT documents
 ~~~~~~~~~~~~~~~~~~~~~~
 
-TXT documents have no well defined way to specify formatting like bold, italics, etc, or document structure like paragraphs, headings, sections and so on.
-Since TXT documents provide no way to explicitly mark parts of
-the text, by default |app| only groups lines in the input document into paragraphs. The default is to assume one or
-more blank lines are a paragraph boundary::
-
-    This is the first.
-    
-    This is the
-    second paragraph.
+TXT documents have no well defined way to specify formatting like bold, italics, etc, or document 
+structure like paragraphs, headings, sections and so on, but there are a variety of conventions commonly 
+used.  By default |app| attempts automatic detection of the correct formatting and markup based on those
+conventions.
 
 TXT input supports a number of options to differentiate how paragraphs are detected.
 
@@ -534,6 +561,14 @@ TXT input supports a number of options to differentiate how paragraphs are detec
         Analyzes the text file and attempts to automatically determine how paragraphs are defined.  This
         option will generally work fine, if you achieve undesirable results try one of the manual options.
 
+    :guilabel:`Paragraph Style: Block`
+        Assumes one or more blank lines are a paragraph boundary::
+        
+            This is the first.
+    
+            This is the
+            second paragraph.
+
     :guilabel:`Paragraph Style: Single`
         Assumes that every line is a paragraph::
 
@@ -557,16 +592,23 @@ TXT input supports a number of options to differentiate how paragraphs are detec
         and median line length are used to attempt to re-create paragraphs.
 
     :guilabel:`Formatting Style: Auto`
+        Attempts to detect the type of formatting markup being used.  If no markup is used then heuristic
+        formatting will be applied.
 
     :guilabel:`Formatting Style: Heuristic`
+        Analyzes the document for common chapter headings, scene breaks, and italicized words and applies the
+        appropriate HTML markup during conversion.
 
-    :guilabel:`Process using markdown`
+    :guilabel:`Formatting Style: Markdown`
         |app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown
         allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
         lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath detection
         expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document.
         You can learn more about the markdown syntax at `daringfireball <http://daringfireball.net/projects/markdown/syntax>`_.
 
+    :guilabel:`Formatting Style: None`
+        Applies no special formatting to the text; the document is converted to HTML with no other changes.
+
 
 Convert PDF documents
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

From 041b8c14f2f45c070f98b50c54427d205085815d Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Tue, 18 Jan 2011 03:17:28 -0700
Subject: [PATCH 093/118] GwR change capitalize()

---
 src/calibre/library/catalog.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 8edf266cfb..fe92486462 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -19,6 +19,7 @@ from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.config import config_dir
 from calibre.utils.date import format_date, isoformat, now as nowf
+from calibre.utils.icu import capitalize
 from calibre.utils.logging import default_log as log
 from calibre.utils.zipfile import ZipFile, ZipInfo
 from calibre.utils.magick.draw import thumbnail
@@ -1459,7 +1460,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
 #             print
 
             # Build the unique_authors set from existing data
-            authors = [(record['author'], record['author_sort'].capitalize()) for record in self.booksByAuthor]
+            authors = [(record['author'], capitalize(record['author_sort'])) for record in self.booksByAuthor]
 
             # authors[] contains a list of all book authors, with multiple entries for multiple books by author
             #        authors[]: (([0]:friendly  [1]:sort))
@@ -2756,7 +2757,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                         this_book = {}
                         this_book['author'] = book['author']
                         this_book['title'] = book['title']
-                        this_book['author_sort'] = book['author_sort'].capitalize()
+                        this_book['author_sort'] = capitalize(book['author_sort'])
                         this_book['read'] = book['read']
                         this_book['tags'] = book['tags']
                         this_book['id'] = book['id']
@@ -3901,14 +3902,14 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
             Sort non-series books before series books
             '''
             if not book['series']:
-                key = '%s %s' % (book['author_sort'].capitalize(),
-                                 book['title_sort'].capitalize())
+                key = '%s %s' % (capitalize(book['author_sort']),
+                                 capitalize(book['title_sort']))
             else:
                 index = book['series_index']
                 integer = int(index)
                 fraction = index-integer
                 series_index = '%04d%s' % (integer, str('%0.4f' % fraction).lstrip('0'))
-                key = '%s ~%s %s' % (book['author_sort'].capitalize(),
+                key = '%s ~%s %s' % (capitalize(book['author_sort']),
                                      self.generateSortTitle(book['series']),
                                      series_index)
             return key
@@ -3919,7 +3920,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
             '''
             if not book['series']:
                 key = '%s %s' % (self.author_to_author_sort(book['author']),
-                                 book['title_sort'].capitalize())
+                                 capitalize(book['title_sort']))
             else:
                 index = book['series_index']
                 integer = int(index)
@@ -4570,7 +4571,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                         if self.letter_or_symbol(word[0]) != word[0]:
                             if word[0] > 'A' or (ord('9') < ord(word[0]) < ord('A')) :
                                 translated.append('/')
-                        translated.append(word.capitalize())
+                        translated.append(capitalize(word))
 
                 else:
                     if re.search('[0-9]+',word[0]):

From cff26ebcbba92ba1bf9d65e7dcc4393b156677f2 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 18 Jan 2011 06:45:53 -0500
Subject: [PATCH 094/118] Rework Italicize patterns to match less false
 positives.

---
 src/calibre/ebooks/conversion/utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bfb5f1c153..5fc986b7d8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -139,17 +139,17 @@ class HeuristicProcessor(object):
         ]
         
         ITALICIZE_STYLE_PATS = [
-            r'(?msu)_(?P<words>[^\s][^<>_]+?[^\s])?_',
-            r'(?msu)/(?P<words>[^\s][^<>/]+?[^\s])?/',
-            r'(?msu)~~(?P<words>[^\s][^<>~]+?[^\s])?~~',
-            r'(?msu)\*(?P<words>[^\s][^<>\*]+?[^\s])?\*',
-            r'(?msu)~(?P<words>[^\s][^<>~]+?[^\s])?~',
-            r'(?msu)_/(?P<words>[^\s][^<>/_]+?[^\s])?/_',
-            r'(?msu)_\*(?P<words>[^\s][^<>\*_]+?[^\s])?\*_',
-            r'(?msu)\*/(?P<words>[^\s][^<>/\*]+?[^\s])?/\*',
-            r'(?msu)_\*/(?P<words>[^\s][^<>\*_]+?[^\s])?/\*_',
-            r'(?msu)/:(?P<words>[^\s][^<>:/]+?[^\s])?:/',
-            r'(?msu)\|:(?P<words>[^\s][^<>:\|]+?[^\s])?:\|',
+            r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
+            r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
+            r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=\s)',
+            r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=\s)',
+            r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=\s)',
+            r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=\s)',
+            r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=\s)',
+            r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=\s)',
+            r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=\s)',
+            r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
+            r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
         ]
         
         for word in ITALICIZE_WORDS:

From 4fd784a9c17bf6f286819656ce2baf82f9f9bada Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 18 Jan 2011 20:28:53 +0800
Subject: [PATCH 095/118] ...

---
 src/calibre/ebooks/conversion/utils.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 5984723aa3..bcc6f5a236 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -496,12 +496,9 @@ class HeuristicProcessor(object):
             if hardbreaks or unwrap_factor < 0.4:
                 self.log.debug("Unwrapping required, unwrapping Lines")
                 # Dehyphenate with line length limiters
-                dehyphenator = Dehyphenator()
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                 html = dehyphenator(html,'html', length)
                 html = self.punctuation_unwrap(length, html, 'html')
-                # unwrap remaining hyphens based on line length, but only remove if there is a match
-                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
-                html = dehyphenator(html,'html_cleanup', length)
 
         if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed

From fc2ae0d4b54dfaa4c81bddfd080a3989a3aa63fd Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Tue, 18 Jan 2011 06:21:17 -0700
Subject: [PATCH 096/118] GwR revisions to catalog generator

---
 resources/catalog/section_list_templates.py |  3 ++
 src/calibre/library/catalog.py              | 40 +++++++++------------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/resources/catalog/section_list_templates.py b/resources/catalog/section_list_templates.py
index de73147fcf..7f92fad6ac 100644
--- a/resources/catalog/section_list_templates.py
+++ b/resources/catalog/section_list_templates.py
@@ -6,6 +6,8 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 '''
+    These templates control the content of titles displayed in the various sections
+
     Available fields:
     {title}          Title of the book
     {series}         Series name
@@ -14,6 +16,7 @@ __docformat__ = 'restructuredtext en'
     {rating_parens}  Rating, in parentheses
     {pubyear}        Year the book was published
     {pubyear_parens} Year the book was published, in parentheses
+
 '''
 # Books by Author
 by_authors_normal_title_template = '{title} {pubyear_parens}'
diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index fe92486462..ea02c29fa7 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -1027,17 +1027,12 @@ class EPUB_MOBI(CatalogPlugin):
                 self.__totalSteps += 3
 
             # Load section list templates
-            templates = ['by_authors_normal_title_template',
-                         'by_authors_series_title_template',
-                         'by_titles_normal_title_template',
-                         'by_titles_series_title_template',
-                         'by_series_title_template',
-                         'by_genres_normal_title_template',
-                         'by_genres_series_title_template',
-                         'by_recently_added_normal_title_template',
-                         'by_recently_added_series_title_template',
-                         'by_month_added_normal_title_template',
-                         'by_month_added_series_title_template']
+            templates = []
+            with open(P('catalog/section_list_templates.py'), 'r') as f:
+                for line in f:
+                    t = re.match("(by_.+_template)",line)
+                    if t:
+                        templates.append(t.group(1))
             execfile(P('catalog/section_list_templates.py'), locals())
             for t in templates:
                 setattr(self,t,eval(t))
@@ -1441,7 +1436,9 @@ class EPUB_MOBI(CatalogPlugin):
                     # Exit if author matches previous, but author_sort doesn't match
                     if author[0] == current_author[0]:
                         error_msg = _('''
-Inconsistent Author Sort values for Author '{0}' ('{1}' <> '{2}'), unable to build catalog.\n
+Inconsistent Author Sort values for Author '{0}':
+'{1}' <> '{2}',
+unable to build catalog.\n
 Select all books by '{0}', apply correct Author Sort value in Edit Metadata dialog,
 then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                         self.opts.log.warn('\n*** Metadata error ***')
@@ -1450,15 +1447,11 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                         self.error.append('Metadata error')
                         self.error.append(error_msg)
                         return False
+                    current_author = author
 
 
             self.booksByAuthor = sorted(self.booksByAuthor, key=self.booksByAuthorSorter_author_sort)
 
-#             for book in self.booksByAuthor:
-#                 print '{0:<10} {1:<5} {2:<20} {3:<20} {4:<20} {5:<20}'.format(book['series'], book['series_index'], book['title'],
-#                                                 book['author'], book['authors'],book['author_sort'])
-#             print
-
             # Build the unique_authors set from existing data
             authors = [(record['author'], capitalize(record['author_sort'])) for record in self.booksByAuthor]
 
@@ -1566,7 +1559,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
 
                 this_title['rating'] = record['rating'] if record['rating'] else 0
 
-                if re.match('0100-01-01',str(record['pubdate'].date())):
+                if re.match('0101-01-01',str(record['pubdate'].date())):
                     this_title['date'] = None
                 else:
                     this_title['date'] = strftime(u'%B %Y', record['pubdate'].timetuple())
@@ -2683,7 +2676,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                 #aTag.insert(0,'%d. %s &middot; %s' % (book['series_index'],escape(book['title']), ' & '.join(book['authors'])))
 
                 # Reassert 'date' since this is the result of a new search
-                if re.match('0100-01-01',str(book['pubdate'].date())):
+                if re.match('0101-01-01',str(book['pubdate'].date())):
                     book['date'] = None
                 else:
                     book['date'] = strftime(u'%B %Y', book['pubdate'].timetuple())
@@ -4314,10 +4307,11 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                 formats = ' &middot; '.join(formats)
 
             # Date of publication
-            pubdate = book['date']
-            pubmonth, pubyear = pubdate.split()
-            if pubyear == '101':
-                pubdate = pubmonth = pubyear = ''
+            if book['date']:
+                pubdate = book['date']
+                pubmonth, pubyear = pubdate.split()
+            else:
+                pubdate = pubyear = pubmonth = ''
 
             # Thumb
             _soup = BeautifulSoup('<html>',selfClosingTags=['img'])

From 4e93f9d761d865f8e8bc431cb52e0164e598c921 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 08:24:27 -0700
Subject: [PATCH 097/118] Fix #8444 (Calibre doesn't detect Acer LumiRead 600
 on Win7, USB 2.0)

---
 src/calibre/devices/misc.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py
index aaf948f25e..9f8dbcb379 100644
--- a/src/calibre/devices/misc.py
+++ b/src/calibre/devices/misc.py
@@ -193,6 +193,9 @@ class LUMIREAD(USBMS):
 
     THUMBNAIL_HEIGHT = 200
 
+    VENDOR_NAME = 'ACER'
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'LUMIREAD_600'
+
     def upload_cover(self, path, filename, metadata, filepath):
         if metadata.thumbnail and metadata.thumbnail[-1]:
             cfilepath = filepath.replace('/', os.sep)

From 492fb4c5266b2af61c44e6d97707c597db57d6f0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 08:48:21 -0700
Subject: [PATCH 098/118] Email: Fix bug when connecting to SMTP relays that
 use MD5 auth

---
 src/calibre/utils/smtplib.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/utils/smtplib.py b/src/calibre/utils/smtplib.py
index d6f3fb0b69..9992039d00 100755
--- a/src/calibre/utils/smtplib.py
+++ b/src/calibre/utils/smtplib.py
@@ -554,6 +554,8 @@ class SMTP:
 
         def encode_cram_md5(challenge, user, password):
             challenge = base64.decodestring(challenge)
+            if isinstance(password, unicode): # Added by Kovid, see http://bugs.python.org/issue5285
+                password = password.encode('utf-8')
             response = user + " " + hmac.HMAC(password, challenge).hexdigest()
             return encode_base64(response, eol="")
 

From 74d48de82c914883670370de59326305ddfc5686 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 09:06:28 -0700
Subject: [PATCH 099/118] Support for the SmartQ T7

---
 src/calibre/devices/android/driver.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index 5a82882dfa..277070020b 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -36,7 +36,7 @@ class ANDROID(USBMS):
 
             # Google
             0x18d1 : { 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12: [0x0100, 0x226,
-                0x227], 0x4e21: [0x0100, 0x226, 0x227]},
+                0x227], 0x4e21: [0x0100, 0x226, 0x227], 0xb058: [0x0222]},
 
             # Samsung
             0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400],
@@ -64,12 +64,13 @@ class ANDROID(USBMS):
     EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN)
 
     VENDOR_NAME      = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
-            'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS']
+            'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS',
+            'TELECHIP']
     WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
             '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
             'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
             'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
-            'SGH-T849', '_MB300', 'A70S']
+            'SGH-T849', '_MB300', 'A70S', 'S_ANDROID']
     WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
             'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
             'A70S']

From 80065cb443021536762bf0fdf8d479b1b06bbd0d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 00:18:11 +0800
Subject: [PATCH 100/118] tweaked chapter_markup for false positives/negatives

---
 src/calibre/ebooks/conversion/utils.py | 10 ++++------
 src/calibre/manual/conversion.rst      | 12 ++++++++----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bcc6f5a236..812a863717 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -174,7 +174,7 @@ class HeuristicProcessor(object):
             if wordcount > 200000:
                 typical_chapters = 15000.
             self.min_chapters = int(ceil(wordcount / typical_chapters))
-        print "minimum chapters required are: "+str(self.min_chapters)
+        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -208,12 +208,12 @@ class HeuristicProcessor(object):
         n_lookahead_close = ")"
 
         default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
-        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
 
         analysis_result = []
 
         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
             [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
             [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
@@ -274,10 +274,9 @@ class HeuristicProcessor(object):
                             title_req = True
                             strict_title = False
                         self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
-                        print "max chapters is "+str(self.max_chapters)
                         if type_name == 'common':
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
-                        elif self.min_chapters <= hits < self.max_chapters:
+                        elif self.min_chapters <= hits < max_chapters:
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                             break
                 else:
@@ -423,7 +422,6 @@ class HeuristicProcessor(object):
         except:
             self.log.warn("Can't get wordcount")
 
-        print "found "+unicode(self.totalwords)+" words in the flow"
         if self.totalwords < 50:
             self.log.warn("flow is too short, not running heuristics")
             return html
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 94a3a60721..e7c09a57a5 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -285,10 +285,14 @@ remove all non-breaking-space entities.
 :guilabel:`Detect and markup unformatted chapter headings and sub headings`
     If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
     |app| can use this option to attempt detection them and surround them with heading tags. &lt;h2&gt; tags are used 
-    for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.  This function will 
-    not create a TOC, but in many cases it will cause |app|'s default chapter detection settings to correctly
-    detect chapters and build a TOC.  Adjust the Xpath under Structure Detection if a TOC is not automatically
-    created.  The inserted heading tags are not formatted, to apply formatting use the 'extra_css' option under
+    for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.  
+    
+    This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings 
+    to correctly detect chapters and build a TOC.  Adjust the Xpath under Structure Detection if a TOC is not automatically
+    created.  If there are no other headings used in the document then setting "//h:h2" under Structure Detection would
+    be the easiest way to create a TOC for the document.
+    
+    The inserted headings are not formatted, to apply formatting use the 'extra_css' option under
     the Look and Feel conversion settings.  For example, to center heading tags, use the following::
 
         h2, h3 { text-align: center }

From b4c5cd0122b0afd540862c1eee7708a3be1d9baa Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 00:41:59 +0800
Subject: [PATCH 101/118] fix calls to create_oebbook, mark several strings as
 unicode

---
 src/calibre/ebooks/chm/input.py        | 2 +-
 src/calibre/ebooks/conversion/utils.py | 6 +++---
 src/calibre/ebooks/html/input.py       | 2 +-
 src/calibre/ebooks/snb/input.py        | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py
index c4b124fe98..89efa2b4d1 100644
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin):
     def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
         from calibre.ebooks.conversion.plumber import create_oebbook
         from calibre.ebooks.oeb.base import DirContainer
-        oeb = create_oebbook(log, None, opts, self,
+        oeb = create_oebbook(log, None, opts,
                 encoding=opts.input_encoding, populate=False)
         self.oeb = oeb
 
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 812a863717..9ae8e5ab6f 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -299,9 +299,9 @@ class HeuristicProcessor(object):
         supports a range of html markup and text files
         '''
         # define the pieces of the regex
-        lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
-        em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
-        soft_hyphen = "\xad"
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
+        soft_hyphen = u"\xad"
         line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
         line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index ed0bf7b3ef..080faffae6 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -295,7 +295,7 @@ class HTMLInput(InputFormatPlugin):
             return oeb
 
         from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, stream.name, opts, self,
+        return create_oebbook(log, stream.name, opts,
                 encoding=opts.input_encoding)
 
     def is_case_sensitive(self, path):
diff --git a/src/calibre/ebooks/snb/input.py b/src/calibre/ebooks/snb/input.py
index d2acb257aa..100ac1447f 100755
--- a/src/calibre/ebooks/snb/input.py
+++ b/src/calibre/ebooks/snb/input.py
@@ -41,7 +41,7 @@ class SNBInput(InputFormatPlugin):
             raise ValueError("Invalid SNB file")
         log.debug("Handle meta data ...")
         from calibre.ebooks.conversion.plumber import create_oebbook
-        oeb = create_oebbook(log, None, options, self,
+        oeb = create_oebbook(log, None, options,
                 encoding=options.input_encoding, populate=False)
         meta = snbFile.GetFileStream('snbf/book.snbf')
         if meta != None:

From fca9ccc67d87647363d436742a7c02795f15183d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 10:17:56 -0700
Subject: [PATCH 102/118] Fix for shortcuts and combobox delegates

---
 src/calibre/gui2/library/delegates.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/calibre/gui2/library/delegates.py b/src/calibre/gui2/library/delegates.py
index ea614aa817..e9b9255c64 100644
--- a/src/calibre/gui2/library/delegates.py
+++ b/src/calibre/gui2/library/delegates.py
@@ -353,6 +353,17 @@ class CcCommentsDelegate(QStyledItemDelegate): # {{{
         model.setData(index, QVariant(editor.textbox.html), Qt.EditRole)
 # }}}
 
+class DelegateCB(QComboBox): # {{{
+
+    def __init__(self, parent):
+        QComboBox.__init__(self, parent)
+
+    def event(self, e):
+        if e.type() == e.ShortcutOverride:
+            e.accept()
+        return QComboBox.event(self, e)
+# }}}
+
 class CcBoolDelegate(QStyledItemDelegate): # {{{
     def __init__(self, parent):
         '''
@@ -361,7 +372,7 @@ class CcBoolDelegate(QStyledItemDelegate): # {{{
         QStyledItemDelegate.__init__(self, parent)
 
     def createEditor(self, parent, option, index):
-        editor = QComboBox(parent)
+        editor = DelegateCB(parent)
         items = [_('Y'), _('N'), ' ']
         icons = [I('ok.png'), I('list_remove.png'), I('blank.png')]
         if tweaks['bool_custom_columns_are_tristate'] == 'no':

From 2a8ebdb76680f484b9fd2140f9b74c434e35f126 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 10:21:57 -0700
Subject: [PATCH 103/118] ...

---
 resources/recipes/ihned.recipe | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/recipes/ihned.recipe b/resources/recipes/ihned.recipe
index daf63e19ed..a74f9e5649 100644
--- a/resources/recipes/ihned.recipe
+++ b/resources/recipes/ihned.recipe
@@ -5,7 +5,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 class IHNed(BasicNewsRecipe):
 
 
-    stahnout_vsechny = False
+    stahnout_vsechny = True
         #True   = stahuje vsechny z homepage
         #False  = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten)
 

From 5e9fbd8f66fdecd3b7d41dbfe5af3df6d172ac46 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 10:26:35 -0700
Subject: [PATCH 104/118] ...

---
 src/calibre/gui2/device_drivers/configwidget.ui | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/gui2/device_drivers/configwidget.ui b/src/calibre/gui2/device_drivers/configwidget.ui
index f4902a7387..619d7052e8 100644
--- a/src/calibre/gui2/device_drivers/configwidget.ui
+++ b/src/calibre/gui2/device_drivers/configwidget.ui
@@ -85,6 +85,9 @@
    </item>
    <item row="2" column="0">
     <widget class="QCheckBox" name="opt_use_subdirs">
+     <property name="toolTip">
+      <string>If checked, books are placed into sub directories based on their metadata on the device. If unchecked, books are all put into the top level directory.</string>
+     </property>
      <property name="text">
       <string>Use sub directories</string>
      </property>

From 25fa78ed6e1a9469cb64c4f7a3fcfae3779a64ce Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Tue, 18 Jan 2011 17:31:03 +0000
Subject: [PATCH 105/118] Use DelegateCB with enum columns

---
 src/calibre/gui2/library/delegates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/library/delegates.py b/src/calibre/gui2/library/delegates.py
index e9b9255c64..ae9d6e2f71 100644
--- a/src/calibre/gui2/library/delegates.py
+++ b/src/calibre/gui2/library/delegates.py
@@ -292,7 +292,7 @@ class CcEnumDelegate(QStyledItemDelegate): # {{{
     def createEditor(self, parent, option, index):
         m = index.model()
         col = m.column_map[index.column()]
-        editor = QComboBox(parent)
+        editor = DelegateCB(parent)
         editor.addItem('')
         for v in m.custom_columns[col]['display']['enum_values']:
             editor.addItem(v)

From 01584b07841f27494b897b4b398576a0bbbb9746 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 01:32:22 +0800
Subject: [PATCH 106/118] removed rtf preprocess call

---
 src/calibre/ebooks/rtf/input.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index d3d8c78dbd..ca6f2c7b95 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -321,9 +321,6 @@ class RTFInput(InputFormatPlugin):
             res = re.sub('\s*<body>', '<body>', res)
             res = re.sub('(?<=\n)\n{2}',
                     u'<p>\u00a0</p>\n'.encode('utf-8'), res)
-            if self.opts.enable_heuristics:
-                preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
-                res = preprocessor(res.decode('utf-8')).encode('utf-8')
             f.write(res)
         self.write_inline_css(inline_class, border_styles)
         stream.seek(0)

From f8182c38043e663ba3c53d334529f8e55fe89608 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 10:47:24 -0700
Subject: [PATCH 107/118] Add an is_date_undefined method to utils.date

---
 src/calibre/utils/date.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/calibre/utils/date.py b/src/calibre/utils/date.py
index 2551b90788..d970ed228d 100644
--- a/src/calibre/utils/date.py
+++ b/src/calibre/utils/date.py
@@ -46,6 +46,14 @@ local_tz = _local_tz = SafeLocalTimeZone()
 
 UNDEFINED_DATE = datetime(101,1,1, tzinfo=utc_tz)
 
+def is_date_undefined(qt_or_dt):
+    d = qt_or_dt
+    if hasattr(d, 'toString'):
+        d = datetime(d.year(), d.month(), d.day(), tzinfo=utc_tz)
+    return d.year == UNDEFINED_DATE.year and \
+            d.month == UNDEFINED_DATE.month and \
+            d.day == UNDEFINED_DATE.day
+
 def parse_date(date_string, assume_utc=False, as_utc=True, default=None):
     '''
     Parse a date/time string into a timezone aware datetime object. The timezone

From 6c92177944f2a49e3d50d6b5d7ddcd84119016e9 Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Tue, 18 Jan 2011 10:51:56 -0700
Subject: [PATCH 108/118] GwR revisions to catalog generator

---
 src/calibre/library/catalog.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index ea02c29fa7..16e90aaf0c 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -18,7 +18,7 @@ from calibre.ebooks.chardet import substitute_entites
 from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.config import config_dir
-from calibre.utils.date import format_date, isoformat, now as nowf
+from calibre.utils.date import format_date, isoformat, now as nowf, UNDEFINED_DATE, utc_tz
 from calibre.utils.icu import capitalize
 from calibre.utils.logging import default_log as log
 from calibre.utils.zipfile import ZipFile, ZipInfo
@@ -1559,6 +1559,8 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
 
                 this_title['rating'] = record['rating'] if record['rating'] else 0
 
+                #pubdate = record['pubdate'].astimezone(utc_tz)
+                #if pubdate == UNDEFINED_DATE:
                 if re.match('0101-01-01',str(record['pubdate'].date())):
                     this_title['date'] = None
                 else:
@@ -2676,6 +2678,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                 #aTag.insert(0,'%d. %s &middot; %s' % (book['series_index'],escape(book['title']), ' & '.join(book['authors'])))
 
                 # Reassert 'date' since this is the result of a new search
+                #if book['pubdate'] == UNDEFINED_DATE:  # tz doesn't match
                 if re.match('0101-01-01',str(book['pubdate'].date())):
                     book['date'] = None
                 else:

From 8c8583b7298dccb243a4c15aaa467ff87af08949 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 10:58:29 -0700
Subject: [PATCH 109/118] Update El Pais

---
 resources/recipes/el_pais.recipe | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/resources/recipes/el_pais.recipe b/resources/recipes/el_pais.recipe
index 2e358060b8..4da3384093 100644
--- a/resources/recipes/el_pais.recipe
+++ b/resources/recipes/el_pais.recipe
@@ -9,13 +9,14 @@ __docformat__ = 'restructuredtext en'
 elpais.es
 '''
 
+from time import strftime
+
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class ElPais(BasicNewsRecipe):
     __author__        = 'Kovid Goyal & Lorenzo Vigentini & Jordi Balcells'
     description   = 'Main daily newspaper from Spain'
 
-    cover_url      = 'http://www.elpais.com/im/tit_logo_global.gif'
     title          = u'El Pais'
     publisher      = u'Ediciones El Pa\xeds SL'
     category       = 'News, politics, culture, economy, general interest'
@@ -62,6 +63,6 @@ class ElPais(BasicNewsRecipe):
                         (u'Vi\xf1etas', u'http://www.elpais.com/rss/feed.html?feedId=17058')
                         ]
 
-def print_version(self, url):
-    url = url+'?print=1'
-    return url
+    def get_cover_url(self):
+        return 'http://img5.kiosko.net/' + strftime("%Y/%m/%d") + '/es/elpais.750.jpg'
+

From f178ce16f19c0aa8f149447cf9d287691b66fa52 Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Tue, 18 Jan 2011 10:58:57 -0700
Subject: [PATCH 110/118] GwR revisions to catalog generator

---
 src/calibre/library/catalog.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 16e90aaf0c..ae600a29f9 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -18,7 +18,7 @@ from calibre.ebooks.chardet import substitute_entites
 from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.config import config_dir
-from calibre.utils.date import format_date, isoformat, now as nowf, UNDEFINED_DATE, utc_tz
+from calibre.utils.date import format_date, isoformat, is_date_undefined, now as nowf
 from calibre.utils.icu import capitalize
 from calibre.utils.logging import default_log as log
 from calibre.utils.zipfile import ZipFile, ZipInfo
@@ -1559,9 +1559,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
 
                 this_title['rating'] = record['rating'] if record['rating'] else 0
 
-                #pubdate = record['pubdate'].astimezone(utc_tz)
-                #if pubdate == UNDEFINED_DATE:
-                if re.match('0101-01-01',str(record['pubdate'].date())):
+                if is_date_undefined(record['pubdate']):
                     this_title['date'] = None
                 else:
                     this_title['date'] = strftime(u'%B %Y', record['pubdate'].timetuple())
@@ -2677,9 +2675,7 @@ then rebuild the catalog.\n''').format(author[0],author[1],current_author[1])
                 # Use series, series index if avail else just title
                 #aTag.insert(0,'%d. %s &middot; %s' % (book['series_index'],escape(book['title']), ' & '.join(book['authors'])))
 
-                # Reassert 'date' since this is the result of a new search
-                #if book['pubdate'] == UNDEFINED_DATE:  # tz doesn't match
-                if re.match('0101-01-01',str(book['pubdate'].date())):
+                if is_date_undefined(book['pubdate']):
                     book['date'] = None
                 else:
                     book['date'] = strftime(u'%B %Y', book['pubdate'].timetuple())

From 3c45dba7ccb24e6328236c65c04c43b2378d5d03 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 02:01:40 +0800
Subject: [PATCH 111/118] ...

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 src/calibre/ebooks/conversion/utils.py      | 6 +++---
 src/calibre/ebooks/txt/input.py             | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index da20af6e8a..bbd71ede3a 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -175,7 +175,7 @@ class Dehyphenator(object):
     '''
 
     def __init__(self, verbose=0, log=None):
-        self.log = default_log if log is None else log
+        self.log = log
         self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9ae8e5ab6f..4663eeccdf 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -92,8 +92,8 @@ class HeuristicProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
-                unicode(tot_htm_ends) + " marked up endings")
+        #self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+        #        unicode(tot_htm_ends) + " marked up endings")
 
         if percent > 1:
             percent = 1
@@ -101,7 +101,7 @@ class HeuristicProcessor(object):
             percent = 0
 
         min_lns = tot_ln_fds * percent
-        self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+        #self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True
 
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index c918d145f4..dd14de2d20 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -118,11 +118,11 @@ class TXTInput(InputFormatPlugin):
                 txt = separate_paragraphs_print_formatted(txt)
 
             if options.paragraph_type == 'unformatted':
-                from calibre.ebooks.conversion.utils import PreProcessor
+                from calibre.ebooks.conversion.utils import HeuristicProcessor
                 # get length
 
                 # unwrap lines based on punctuation
-                preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
+                preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
                 txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
 
             flow_size = getattr(options, 'flow_size', 0)

From ca89710f65059c1148a6da1d44b040b47a4f8335 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 02:18:49 +0800
Subject: [PATCH 112/118] doc update

---
 src/calibre/manual/conversion.rst | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index e7c09a57a5..2bc5687262 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -331,6 +331,22 @@ remove all non-breaking-space entities.
     Some documents use a convention of defining text indents using non-breaking space entities.  When this option is enabled |app| will
     attempt to detect this sort of formatting and convert them to a 3% text indent using css.
 
+.. _search-replace:
+
+Search & Replace
+---------------------
+
+These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
+behind page headers and footers in the text. These options use regular expressions to try and detect
+the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
+by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
+your document.  These options can also be used for generic search and replace of any content by additionally 
+specifying a replacement expression.
+
+The search works by using a python regular expression. All matched text is simply removed from
+the document or replaced using the replacement pattern. You can learn more about regular expressions and 
+their syntax at http://docs.python.org/library/re.html.
+
 .. _structure-detection:
 
 Structure Detection
@@ -374,21 +390,6 @@ which means that |app| will insert page breaks before every `<h1>` and `<h2>` ta
     
     The default expressions may change depending on the input format you are converting.
 
-Removing headers and footers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
-behind page headers and footers in the text. These options use regular expressions to try and detect
-the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
-by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
-your document.
-
-The header and footer regular expressions are used in conjunction with the remove header and footer options.
-If the remove option is not enabled the regular expression will not be applied to remove the matched text.
-The removal works by using a python regular expression. All matched text is simply removed from
-the document. You can learn more about regular expressions and their syntax at
-http://docs.python.org/library/re.html.
-
 Miscellaneous
 ~~~~~~~~~~~~~~
 

From 54fb874621bb7c56c35f930633fb226e58f244fb Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 13:00:20 -0700
Subject: [PATCH 113/118] ...

---
 resources/recipes/nytimes_sub.recipe | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index 8f92852237..cdacc42d92 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-
 
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
@@ -23,6 +24,10 @@ class NYTimes(BasicNewsRecipe):
     webEdition = False
     oldest_article = 7
 
+    # replace paid Kindle Version:  the name will be changed to "The New York Times" to cause
+    # previous paid versions of the new york times to be sent to the back issues folder on the kindle
+    replaceKindleVersion = False
+
     # includeSections: List of sections to include. If empty, all sections found will be included.
     # Otherwise, only the sections named will be included. For example,
     #
@@ -94,6 +99,10 @@ class NYTimes(BasicNewsRecipe):
         title='New York Times (Web)'
         description = 'New York Times on the Web'
         needs_subscription = True
+    elif replaceKindleVersion:
+	title='The New York Times'
+        description = 'Today\'s New York Times'
+        needs_subscription = True
     else:
         title='New York Times'
         description = 'Today\'s New York Times'
@@ -623,7 +632,7 @@ class NYTimes(BasicNewsRecipe):
 							self.log(">>> No class:'columnGroup first' found <<<")
 		except:
 			self.log("ERROR: One picture per article in postprocess_html")
-									
+
 		try:
 			# Change captions to italic
 			for caption in soup.findAll(True, {'class':'caption'}) :
@@ -637,7 +646,7 @@ class NYTimes(BasicNewsRecipe):
 					caption.replaceWith(cTag)
 		except:
 			self.log("ERROR:  Problem in change captions to italic")
-		
+
 		try:
 			# Change <nyt_headline> to <h2>
 			h1 = soup.find('h1')
@@ -675,7 +684,7 @@ class NYTimes(BasicNewsRecipe):
 		except:
 			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
 
-		try:		
+		try:
 			# Change <span class="bold"> to <b>
 			for subhead in soup.findAll(True, {'class':'bold'}) :
 				if subhead.contents:
@@ -684,15 +693,15 @@ class NYTimes(BasicNewsRecipe):
 					subhead.replaceWith(bTag)
 		except:
 			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
-		
-		try:		
+
+		try:
 			divTag = soup.find('div',attrs={'id':'articleBody'})
 			if divTag:
 				divTag['class'] = divTag['id']
 		except:
 			self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
-			
-		try:	
+
+		try:
 			# Add class="authorId" to <div> so we can format with CSS
 			divTag = soup.find('div',attrs={'id':'authorId'})
 			if divTag and divTag.contents[0]:
@@ -700,10 +709,10 @@ class NYTimes(BasicNewsRecipe):
 				tag['class'] = "authorId"
 				tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
 								 use_alt=False)))
-				divTag.replaceWith(tag)		
+				divTag.replaceWith(tag)
 		except:
 			self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
-		
+
 		return soup
     def populate_article_metadata(self, article, soup, first):
         shortparagraph = ""

From c7c563e0bbaee6820e986d49f987dd3a720ef808 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Tue, 18 Jan 2011 20:22:15 +0000
Subject: [PATCH 114/118] Fix #8441: Custom Meta Data tab requiring
 capitalization where main GUI does not.

---
 src/calibre/gui2/widgets.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
index 0bb5ee7634..8d3af55bd9 100644
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@@ -479,10 +479,10 @@ class CompleteLineEdit(EnLineEdit):
 
     def update_items_cache(self, complete_items):
         self.completer.update_items_cache(complete_items)
-        
+
     def set_separator(self, sep):
         self.separator = sep
-        
+
     def set_space_before_sep(self, space_before):
         self.space_before_sep = space_before
 
@@ -527,7 +527,7 @@ class EnComboBox(QComboBox):
     def __init__(self, *args):
         QComboBox.__init__(self, *args)
         self.setLineEdit(EnLineEdit(self))
-        self.setAutoCompletionCaseSensitivity(Qt.CaseSensitive)
+        self.setAutoCompletionCaseSensitivity(Qt.CaseInsensitive)
         self.setMinimumContentsLength(20)
 
     def text(self):
@@ -541,17 +541,17 @@ class EnComboBox(QComboBox):
         self.setCurrentIndex(idx)
 
 class CompleteComboBox(EnComboBox):
-    
+
     def __init__(self, *args):
         EnComboBox.__init__(self, *args)
         self.setLineEdit(CompleteLineEdit(self))
 
     def update_items_cache(self, complete_items):
         self.lineEdit().update_items_cache(complete_items)
-        
+
     def set_separator(self, sep):
         self.lineEdit().set_separator(sep)
-        
+
     def set_space_before_sep(self, space_before):
         self.lineEdit().set_space_before_sep(space_before)
 

From ebda738c8136455df0dd71cb857e8ca620386982 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 13:49:12 -0700
Subject: [PATCH 115/118] Updated NY Times

---
 resources/recipes/nytimes_sub.recipe | 275 ++++++++++++++++-----------
 1 file changed, 161 insertions(+), 114 deletions(-)

diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index cdacc42d92..2424113e31 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -159,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
                             'relatedSearchesModule',
                             'side_tool',
                             'singleAd',
+                            'entry entry-utility', #added for DealBook
+                            'entry-tags', #added for DealBook
+                            'footer promos clearfix', #added for DealBook
+                            'footer links clearfix', #added for DealBook
+                            'inlineImage module', #added for DealBook
                             re.compile('^subNavigation'),
                             re.compile('^leaderboard'),
                             re.compile('^module'),
@@ -192,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
                             'side_index',
                             'side_tool',
                             'toolsRight',
+                            'skybox', #added for DealBook
+                            'TopAd', #added for DealBook
+                            'related-content', #added for DealBook
                             ]),
                    dict(name=['script', 'noscript', 'style','form','hr'])]
     no_stylesheets = True
@@ -246,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
     def exclude_url(self,url):
         if not url.startswith("http"):
             return True
-        if not url.endswith(".html"):
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
             return True
         if 'nytimes.com' not in url:
             return True
@@ -569,7 +577,6 @@ class NYTimes(BasicNewsRecipe):
 
 
     def preprocess_html(self, soup):
-
         if self.webEdition & (self.oldest_article>0):
             date_tag = soup.find(True,attrs={'class': ['dateline','date']})
             if date_tag:
@@ -592,128 +599,168 @@ class NYTimes(BasicNewsRecipe):
                 img_div = soup.find('div','inlineImage module')
                 if img_div:
                     img_div.extract()
+
+
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
-		try:
-			if self.one_picture_per_article:
-				# Remove all images after first
-				largeImg = soup.find(True, {'class':'articleSpanImage'})
-				inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
-				if largeImg:
-					for inlineImg in inlineImgs:
-						inlineImg.extract()
-				else:
-					if inlineImgs:
-						firstImg = inlineImgs[0]
-						for inlineImg in inlineImgs[1:]:
-							inlineImg.extract()
-						# Move firstImg before article body
-						cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
-						if cgFirst:
-							# Strip all sibling NavigableStrings: noise
-							navstrings = cgFirst.findAll(text=True, recursive=False)
-							[ns.extract() for ns in navstrings]
-							headline_found = False
-							tag = cgFirst.find(True)
-							insertLoc = 0
-							while True:
-								insertLoc += 1
-								if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
-										headline_found = True
-										break
-								tag = tag.nextSibling
-								if not tag:
-									headline_found = False
-									break
-							if headline_found:
-								cgFirst.insert(insertLoc,firstImg)
-						else:
-							self.log(">>> No class:'columnGroup first' found <<<")
-		except:
-			self.log("ERROR: One picture per article in postprocess_html")
 
-		try:
-			# Change captions to italic
-			for caption in soup.findAll(True, {'class':'caption'}) :
-				if caption and len(caption) > 0:
-					cTag = Tag(soup, "p", [("class", "caption")])
-					c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
-					mp_off = c.find("More Photos")
-					if mp_off >= 0:
-						c = c[:mp_off]
-					cTag.insert(0, c)
-					caption.replaceWith(cTag)
-		except:
-			self.log("ERROR:  Problem in change captions to italic")
+        try:
+                if self.one_picture_per_article:
+                        # Remove all images after first
+                        largeImg = soup.find(True, {'class':'articleSpanImage'})
+                        inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+                        if largeImg:
+                                for inlineImg in inlineImgs:
+                                        inlineImg.extract()
+                        else:
+                                if inlineImgs:
+                                        firstImg = inlineImgs[0]
+                                        for inlineImg in inlineImgs[1:]:
+                                                inlineImg.extract()
+                                        # Move firstImg before article body
+                                        cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
+                                        if cgFirst:
+                                                # Strip all sibling NavigableStrings: noise
+                                                navstrings = cgFirst.findAll(text=True, recursive=False)
+                                                [ns.extract() for ns in navstrings]
+                                                headline_found = False
+                                                tag = cgFirst.find(True)
+                                                insertLoc = 0
+                                                while True:
+                                                        insertLoc += 1
+                                                        if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
+                                                                        headline_found = True
+                                                                        break
+                                                        tag = tag.nextSibling
+                                                        if not tag:
+                                                                headline_found = False
+                                                                break
+                                                if headline_found:
+                                                        cgFirst.insert(insertLoc,firstImg)
+                                        else:
+                                                self.log(">>> No class:'columnGroup first' found <<<")
+        except:
+                self.log("ERROR: One picture per article in postprocess_html")
 
-		try:
-			# Change <nyt_headline> to <h2>
-			h1 = soup.find('h1')
-			if h1:
-				headline = h1.find("nyt_headline")
-				if headline:
-					tag = Tag(soup, "h2")
-					tag['class'] = "headline"
-					tag.insert(0, self.fixChars(headline.contents[0]))
-					h1.replaceWith(tag)
-			else:
-				# Blog entry - replace headline, remove <hr> tags
-				headline = soup.find('title')
-				if headline:
-					tag = Tag(soup, "h2")
-					tag['class'] = "headline"
-					tag.insert(0, self.fixChars(headline.contents[0]))
-					soup.insert(0, tag)
-					hrs = soup.findAll('hr')
-					for hr in hrs:
-						hr.extract()
-		except:
-			self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
+        try:
+                # Change captions to italic
+                for caption in soup.findAll(True, {'class':'caption'}) :
+                        if caption and len(caption) > 0:
+                                cTag = Tag(soup, "p", [("class", "caption")])
+                                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+                                mp_off = c.find("More Photos")
+                                if mp_off >= 0:
+                                        c = c[:mp_off]
+                                cTag.insert(0, c)
+                                caption.replaceWith(cTag)
+        except:
+                self.log("ERROR:  Problem in change captions to italic")
 
-		try:
-			# Change <h1> to <h3> - used in editorial blogs
-			masthead = soup.find("h1")
-			if masthead:
-				# Nuke the href
-				if masthead.a:
-					del(masthead.a['href'])
-				tag = Tag(soup, "h3")
-				tag.insert(0, self.fixChars(masthead.contents[0]))
-				masthead.replaceWith(tag)
-		except:
-			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+        try:
+                # Change <nyt_headline> to <h2>
+                h1 = soup.find('h1')
+                blogheadline = str(h1) #added for dealbook
+                if h1:
+                        headline = h1.find("nyt_headline")
+                        if headline:
+                                tag = Tag(soup, "h2")
+                                tag['class'] = "headline"
+                                tag.insert(0, self.fixChars(headline.contents[0]))
+                                h1.replaceWith(tag)
+                        elif blogheadline.find('entry-title'):#added for dealbook
+                                tag = Tag(soup, "h2")#added for dealbook
+                                tag['class'] = "headline"#added for dealbook
+                                tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
+                                h1.replaceWith(tag)#added for dealbook
 
-		try:
-			# Change <span class="bold"> to <b>
-			for subhead in soup.findAll(True, {'class':'bold'}) :
-				if subhead.contents:
-					bTag = Tag(soup, "b")
-					bTag.insert(0, subhead.contents[0])
-					subhead.replaceWith(bTag)
-		except:
-			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+                else:
+                        # Blog entry - replace headline, remove <hr> tags  - BCC I think this is no longer functional 1-18-2011
+                        headline = soup.find('title')
+                        if headline:
+                                tag = Tag(soup, "h2")
+                                tag['class'] = "headline"
+                                tag.insert(0, self.fixChars(headline.renderContents()))
+                                soup.insert(0, tag)
+                                hrs = soup.findAll('hr')
+                                for hr in hrs:
+                                        hr.extract()
+        except:
+                self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
 
-		try:
-			divTag = soup.find('div',attrs={'id':'articleBody'})
-			if divTag:
-				divTag['class'] = divTag['id']
-		except:
-			self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
+        try:
+                #if this is from a blog (dealbook), fix the byline format
+                bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
+                if bylineauthor:
+                    tag = Tag(soup, "h6")
+                    tag['class'] = "byline"
+                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
+                    bylineauthor.replaceWith(tag)
+        except:
+            self.log("ERROR:  fixing byline author format")
 
-		try:
-			# Add class="authorId" to <div> so we can format with CSS
-			divTag = soup.find('div',attrs={'id':'authorId'})
-			if divTag and divTag.contents[0]:
-				tag = Tag(soup, "p")
-				tag['class'] = "authorId"
-				tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
-								 use_alt=False)))
-				divTag.replaceWith(tag)
-		except:
-			self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
+        try:
+                #if this is a blog (dealbook) fix the credit style for the pictures
+                blogcredit = soup.find('div',attrs={'class':'credit'})
+                if blogcredit:
+                    tag = Tag(soup, "h6")
+                    tag['class'] = "credit"
+                    tag.insert(0, self.fixChars(blogcredit.renderContents()))
+                    blogcredit.replaceWith(tag)
+        except:
+            self.log("ERROR:  fixing credit format")
 
-		return soup
+
+        try:
+                # Change <h1> to <h3> - used in editorial blogs
+                masthead = soup.find("h1")
+                if masthead:
+                        # Nuke the href
+                        if masthead.a:
+                                del(masthead.a['href'])
+                        tag = Tag(soup, "h3")
+                        tag.insert(0, self.fixChars(masthead.contents[0]))
+                        masthead.replaceWith(tag)
+        except:
+                self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+
+        try:
+                # Change <span class="bold"> to <b>
+                for subhead in soup.findAll(True, {'class':'bold'}) :
+                        if subhead.contents:
+                                bTag = Tag(soup, "b")
+                                bTag.insert(0, subhead.contents[0])
+                                subhead.replaceWith(bTag)
+        except:
+                self.log('ERROR:  Problem in Change <span class="bold"> to <b>')
+        try:
+                #remove the <strong> update tag
+                blogupdated = soup.find('span', {'class':'update'})
+                if blogupdated:
+                    blogupdated.replaceWith("")
+        except:
+                self.log("ERROR:  Removing strong tag")
+
+        try:
+                divTag = soup.find('div',attrs={'id':'articleBody'})
+                if divTag:
+                        divTag['class'] = divTag['id']
+        except:
+                self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
+
+        try:
+                # Add class="authorId" to <div> so we can format with CSS
+                divTag = soup.find('div',attrs={'id':'authorId'})
+                if divTag and divTag.contents[0]:
+                        tag = Tag(soup, "p")
+                        tag['class'] = "authorId"
+                        tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+                                                         use_alt=False)))
+                        divTag.replaceWith(tag)
+        except:
+                self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
+
+        return soup
     def populate_article_metadata(self, article, soup, first):
         shortparagraph = ""
         try:

From 19d7f6d39fcc235105896203ca37b750ca9350cb Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 14:02:27 -0700
Subject: [PATCH 116/118] ...

---
 src/calibre/ebooks/html/input.py | 4 ++--
 src/calibre/utils/date.py        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 1f07f4ca41..ac16e459e8 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -21,7 +21,7 @@ from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
 from calibre.constants import islinux, isfreebsd, iswindows
-from calibre import unicode_path
+from calibre import unicode_path, as_unicode
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
 from calibre.ebooks.conversion.utils import PreProcessor
@@ -112,7 +112,7 @@ class HTMLFile(object):
             with open(self.path, 'rb') as f:
                 src = f.read()
         except IOError, err:
-            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
+            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
             if level == 0:
                 raise IOError(msg)
             raise IgnoreFile(msg, err.errno)
diff --git a/src/calibre/utils/date.py b/src/calibre/utils/date.py
index bc16ebb0b6..f67f51ffc6 100644
--- a/src/calibre/utils/date.py
+++ b/src/calibre/utils/date.py
@@ -52,7 +52,7 @@ def is_date_undefined(qt_or_dt):
         return True
     if hasattr(d, 'toString'):
         d = datetime(d.year(), d.month(), d.day(), tzinfo=utc_tz)
-    return d.year == UNDEFINED_DATE.year and \
+    return d.year <= UNDEFINED_DATE.year and \
             d.month == UNDEFINED_DATE.month and \
             d.day == UNDEFINED_DATE.day
 

From 78f599d5993c98e7f7ff71b98cf2c74db78fa0bd Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 14:24:52 -0700
Subject: [PATCH 117/118] Updated calibre portable bat file

---
 resources/calibre-portable.bat | 85 +++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 27 deletions(-)

diff --git a/resources/calibre-portable.bat b/resources/calibre-portable.bat
index fb3444e34e..473cdc4236 100644
--- a/resources/calibre-portable.bat
+++ b/resources/calibre-portable.bat
@@ -1,6 +1,4 @@
 @echo OFF
-REM			CalibreRun.bat
-REM			~~~~~~~~~~~~~~
 REM Batch File to start a Calibre configuration on Windows
 REM giving explicit control of the location of:
 REM  - Calibe Program Files
@@ -24,7 +22,10 @@ REM -------------------------------------
 REM Set up Calibre Config folder
 REM -------------------------------------
 
-If EXIST CalibreConfig SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
+IF EXIST CalibreConfig (
+	SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
+	ECHO CONFIG=%cd%\CalibreConfig
+)
 
 
 REM --------------------------------------------------------------
@@ -38,24 +39,53 @@ REM drive letter of the USB stick.
 REM Comment out any of the following that are not to be used
 REM --------------------------------------------------------------
 
-SET CALIBRE_LIBRARY_DIRECTORY=U:\eBOOKS\CalibreLibrary
-IF EXIST CalibreLibrary SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
-IF EXIST CalibreBooks SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
+IF EXIST U:\eBooks\CalibreLibrary (
+	SET CALIBRE_LIBRARY_DIRECTORY=U:\eBOOKS\CalibreLibrary
+	ECHO LIBRARY=U:\eBOOKS\CalibreLibrary
+)
+IF EXIST CalibreLibrary (
+	SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
+	ECHO LIBRARY=%cd%\CalibreLibrary
+)
+IF EXIST CalibreBooks (
+	SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
+	ECHO LIBRARY=%cd%\CalibreBooks
+)
 
 
 REM --------------------------------------------------------------
-REM Specify Location of metadata database  (optional)
+REM Specify Location of metadata database (optional)
 REM
 REM Location where the metadata.db file is located.  If not set
 REM the same location as Books files will be assumed.  This.
 REM options is used to get better performance when the Library is
 REM on a (slow) network drive.  Putting the metadata.db file 
-REM locally gives a big performance improvement.
+REM locally gives a big performance improvement.
+REM
+REM NOTE.  If you use this option, then the ability to switch
+REM        libraries within Calibre will be disabled.  Therefore
+REM        you do not want to set it if the metadata.db file
+REM        is at the same location as the book files.
 REM --------------------------------------------------------------
 
-IF EXIST CalibreBooks SET SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
-IF EXIST CalibreMetadata SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
-
+IF EXIST CalibreBooks (
+	IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreBooks" (
+		SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
+		ECHO DATABASE=%cd%\CalibreBooks\metadata.db
+		ECHO '
+		ECHO ***CAUTION*** Library Switching will be disabled 
+		ECHO '
+	)
+)
+IF EXIST CalibreMetadata (
+	IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreMetadata" (
+		SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
+		ECHO DATABASE=%cd%\CalibreMetadata\metadata.db
+		ECHO '
+		ECHO ***CAUTION*** Library Switching will be disabled 
+		ECHO '
+	)
+)
 
 REM --------------------------------------------------------------
 REM Specify Location of source (optional)
@@ -63,13 +93,20 @@ REM
 REM It is easy to run Calibre from source
 REM Just set the environment variable to where the source is located
 REM When running from source the GUI will have a '*' after the version.
+REM number that is displayed at the bottom of the Calibre main screen.
 REM --------------------------------------------------------------
 
-IF EXIST Calibre\src SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
-
+IF EXIST Calibre\src (
+	SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
+	ECHO SOURCE=%cd%\Calibre\src
+)
+IF EXIST D:\Calibre\Calibre\src (
+	SET CALIBRE_DEVELOP_FROM=D:\Calibre\Calibre\src
+	ECHO SOURCE=D:\Calibre\Calibre\src
+)
 
 REM --------------------------------------------------------------
-REM Specify Location of calibre binaries (optinal)
+REM Specify Location of calibre binaries (optional)
 REM
 REM To avoid needing Calibre to be set in the search path, ensure
 REM that Calibre Program Files is current directory when starting.
@@ -78,21 +115,15 @@ REM This folder can be populated by cpying the Calibre2 folder from
 REM an existing isntallation or by isntalling direct to here.
 REM --------------------------------------------------------------
 
-IF EXIST Calibre2 CD Calibre2
-
-
-REM --------------------------------------------
-REM Display settings that will be used
-REM --------------------------------------------
-
-echo PROGRAMS=%cd%
-echo SOURCE=%CALIBRE_DEVELOP_FROM%
-echo CONFIG=%CALIBRE_CONFIG_DIRECTORY%
-echo LIBRARY=%CALIBRE_LIBRARY_DIRECTORY%
-echo DATABASE=%CALIBRE_OVERRIDE_DATABASE_PATH%
+IF EXIST Calibre2 (
+	CD Calibre2
+	ECHO PROGRAMS=%cd%\Calibre2
+)
 
+REM ----------------------------------------------------------
 REM  The following gives a chance to check the settings before
 REM  starting Calibre.  It can be commented out if not wanted.
+REM ----------------------------------------------------------
 
 echo "Press CTRL-C if you do not want to continue"
 pause
@@ -111,4 +142,4 @@ REM Use with /WAIT to wait until Calibre completes to run a task on exit
 REM --------------------------------------------------------
 
 echo "Starting up Calibre"
-START /belownormal Calibre --with-library %CALIBRE_LIBRARY_DIRECTORY%
+START /belownormal Calibre --with-library "%CALIBRE_LIBRARY_DIRECTORY%"

From f56b7453b169cc4a8299bb6fc445c168e3dfdb5c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Jan 2011 14:41:49 -0700
Subject: [PATCH 118/118] Fix call to create_oebbook in oeb.iterator

---
 src/calibre/ebooks/conversion/plumber.py | 32 ++++++++++++------------
 src/calibre/ebooks/oeb/iterator.py       |  4 +--
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 6fdf7ddc68..04ee892c19 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -483,29 +483,29 @@ OptionRecommendation(name='pubdate',
 OptionRecommendation(name='timestamp',
     recommended_value=None, level=OptionRecommendation.LOW,
     help=_('Set the book timestamp (used by the date column in calibre).')),
-    
+
 OptionRecommendation(name='enable_heuristics',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Enable heurisic processing. This option must be set for any '
+    help=_('Enable heuristic processing. This option must be set for any '
            'heuristic processing to take place.')),
 
 OptionRecommendation(name='markup_chapter_headings',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Detect unformatted chapter headings and sub headings. Change ' 
+    help=_('Detect unformatted chapter headings and sub headings. Change '
            'them to h2 and h3 tags.  This setting will not create a TOC, '
            'but can be used in conjunction with structure detection to create '
            'one.')),
-           
+
 OptionRecommendation(name='italicize_common_cases',
     recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Look for common words and patterns that denote '
            'italics and italicize them.')),
-           
+
 OptionRecommendation(name='fix_indents',
     recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Turn indentation created from multiple non-breaking space entities '
            'into CSS indents.')),
-           
+
 OptionRecommendation(name='html_unwrap_factor',
     recommended_value=0.40, level=OptionRecommendation.LOW,
     help=_('Scale used to determine the length at which a line should '
@@ -513,31 +513,31 @@ OptionRecommendation(name='html_unwrap_factor',
             'default is 0.4, just below the median line length.  If only a '
             'few lines in the document require unwrapping this value should '
             'be reduced')),
-            
+
 OptionRecommendation(name='unwrap_lines',
     recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Unwrap lines using punctuation and other formatting clues.')),
-    
+
 OptionRecommendation(name='delete_blank_paragraphs',
     recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Remove empty paragraphs from the document when they exist between '
            'every other paragraph')),
-    
+
 OptionRecommendation(name='format_scene_breaks',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('left aligned scene break markers are center aligned. '
+    help=_('Left aligned scene break markers are center aligned. '
            'Replace soft scene breaks that use multiple blank lines with'
            'horizontal rules.')),
 
 OptionRecommendation(name='dehyphenate',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Analyses hyphenated words throughout the document.  The '
+    help=_('Analyze hyphenated words throughout the document.  The '
            'document itself is used as a dictionary to determine whether hyphens '
            'should be retained or removed.')),
 
 OptionRecommendation(name='renumber_headings',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Looks for occurences of sequential <h1> or <h2> tags. '
+    help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
            'The tags are renumbered to prevent splitting in the middle '
            'of chapter headings.')),
 
@@ -545,10 +545,10 @@ OptionRecommendation(name='sr1_search',
     recommended_value='', level=OptionRecommendation.LOW,
     help=_('Search pattern (regular expression) to be replaced with '
            'sr1-replace.')),
-    
+
 OptionRecommendation(name='sr1_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters to replace the text found with sr1-search.')),
+    help=_('Replacement to replace the text found with sr1-search.')),
 
 OptionRecommendation(name='sr2_search',
     recommended_value='', level=OptionRecommendation.LOW,
@@ -557,7 +557,7 @@ OptionRecommendation(name='sr2_search',
 
 OptionRecommendation(name='sr2_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters to replace the text found with sr2-search.')),
+    help=_('Replacement to replace the text found with sr2-search.')),
 
 OptionRecommendation(name='sr3_search',
     recommended_value='', level=OptionRecommendation.LOW,
@@ -566,7 +566,7 @@ OptionRecommendation(name='sr3_search',
 
 OptionRecommendation(name='sr3_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters to replace the text found with sr3-search.')),
+    help=_('Replacement to replace the text found with sr3-search.')),
 ]
         # }}}
 
diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py
index 08b4369078..299c77af10 100644
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@@ -199,8 +199,8 @@ class EbookIterator(object):
                     not hasattr(self.pathtoopf, 'manifest'):
                 if hasattr(self.pathtoopf, 'manifest'):
                     self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
-                self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
-                        plumber.input_plugin)
+                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
+                        plumber.opts)
 
         if hasattr(self.pathtoopf, 'manifest'):
             self.pathtoopf = write_oebbook(self.pathtoopf, self.base)