diff --git a/resources/recipes/ajc.recipe b/resources/recipes/ajc.recipe
index ccd0efebdd..ea989b4b4c 100644
--- a/resources/recipes/ajc.recipe
+++ b/resources/recipes/ajc.recipe
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 __license__   = 'GPL v3'
-__author__    = 'Tony Stegall'
+__author__    = 'Tony Stegall'
 __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
 __version__   = '1.03'
 __date__      = '27, September 2010'
@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'

 import datetime

+from calibre.web.feeds.news import BasicNewsRecipe
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     now = datetime.datetime.now()
     title = 'The AJC'
@@ -20,39 +22,39 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     oldest_article = 1
     max_articles_per_feed = 100
     no_stylesheets = True
-
+
     masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
     extra_css = '''
                 h1.articleHeadline{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                 h2.articleSubheadline{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-
+
                 p.byline{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
                 p.organization{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
-
-
+
+
                 p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                 '''
-
-
+
+
     keep_only_tags = [
                        dict(name='div', attrs={'class':['cxArticleHeader']})
                       ,dict(attrs={'id':['cxArticleText']})
                      ]
-
-
+
+
     remove_tags = [
                      dict(name='div' , attrs={'class':'cxArticleList'    })
                     ,dict(name='div' , attrs={'class':'cxFeedTease'      })
                     ,dict(name='div' , attrs={'class':'cxElementEnlarge' })
                     ,dict(name='div' , attrs={'id':'cxArticleTools'      })
                   ]
-
-
-
+
+
+
     feeds = [
               ('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
               # -------------------------------------------------------------------
-              # Here are the different area feeds. Choose which ever one you wish to
+              # Here are the different area feeds. Choose whichever one you wish to
               # read by simply removing the pound sign from it. I currently have it
               # set to only get the Cobb area
               # --------------------------------------------------------------------
@@ -70,7 +72,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
              ('Opinions', 'http://www.ajc.com/section-rss.do?source=opinion'),
              ('Ga Politics', 'http://www.ajc.com/section-rss.do?source=georgia-politics-elections'),
              # ------------------------------------------------------------------------
-             # Here are the different sports feeds. I only follow the Falcons, and Highschool
+             # Here are the different sports feeds. I only follow the Falcons and Highschool
              # but again
              # You can enable which ever team you like by removing the pound sign
              # ------------------------------------------------------------------------
@@ -85,25 +87,25 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
              ('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
            ]
-
+
     def postprocess_html(self, soup, first):
         for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
             credit_tag.extract()
-
+
         return soup
-
+
     #def print_version(self, url):
        # return url.partition('?')[0] +'?printArticle=y'
-
-
-
-
-
-
+
+
+
+
+
+
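
An aside on the ajc.recipe hunk: the only functional change is the new BasicNewsRecipe import; the rest is whitespace cleanup. Recipes are ordinary Python modules, so the base class must be in scope before the class statement runs. A minimal sketch (recipe name and feed URL invented):

    from calibre.web.feeds.news import BasicNewsRecipe

    class SampleRecipe(BasicNewsRecipe):   # NameError here without the import
        title = 'Sample'
        feeds = [('News', 'http://example.com/rss.xml')]
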
diff --git a/resources/recipes/boortz.recipe b/resources/recipes/boortz.recipe
index dfb624c4bc..b281798ac8 100644
--- a/resources/recipes/boortz.recipe
+++ b/resources/recipes/boortz.recipe
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 __license__   = 'GPL v3'
-__author__    = 'Tony Stegall'
+__author__    = 'Tony Stegall'
 __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
 __version__   = '1.04'
 __date__      = '27, September 2010'
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'

 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Nealz Nuze'
     language = 'en'
@@ -18,7 +18,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     category = 'news, politics, USA, talkshow'
     oldest_article = 1
     max_articles_per_feed = 100
-
+
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = True
@@ -26,5 +26,5 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     conversion_options = {'linearize_tables' : True}
     feeds = [
              ('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml')
-
+
            ]
diff --git a/resources/recipes/popscience.recipe b/resources/recipes/popscience.recipe
index 2bef7e4807..5f66d048a6 100644
--- a/resources/recipes/popscience.recipe
+++ b/resources/recipes/popscience.recipe
@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+import re

 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Popular Science'
@@ -13,35 +13,35 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = True
-
+
     masthead_url = 'http://www.raytheon.com/newsroom/rtnwcm/groups/Public/documents/masthead/rtn08_popscidec_masthead.jpg'
-
-
+
+
     feeds = [
-
+
              ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
              ('Cars', 'http://www.popsci.com/full-feed/cars'),
              ('Science', 'http://www.popsci.com/full-feed/science'),
              ('Technology', 'http://www.popsci.com/full-feed/technology'),
              ('DIY', 'http://www.popsci.com/full-feed/diy'),
-
+
            ]
-
-    #The following will get read of the Gallery: links when found
-
+
+    #The following will get rid of the Gallery: links when found
+
     def preprocess_html(self, soup) :
        print 'SOUP IS: ', soup
        weblinks = soup.findAll(['head','h2'])
        if weblinks is not None:
            for link in weblinks:
                if re.search('(Gallery)(:)',str(link)):
-
+
                    link.parent.extract()
        return soup
-    #-----------------------------------------------------------------
-
-
+    #-----------------------------------------------------------------
+
+
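
The popscience.recipe change swaps the unused BeautifulSoup import for the plain re module that preprocess_html() actually needs. A minimal sketch of what the Gallery filter does, with invented markup (each teaser sits in its own parent block, which is what makes link.parent.extract() safe):

    import re
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<div><h2>Gallery: Ten Gadgets</h2></div>'
                         '<div><h2>Tech News</h2></div>')
    for link in soup.findAll(['head', 'h2']):
        if re.search('(Gallery)(:)', str(link)):
            link.parent.extract()   # removes the whole block holding the teaser
    print soup                      # -> <div><h2>Tech News</h2></div>
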
diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py
index 2b5eb5011e..0310f09242 100644
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -1,3 +1,4 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -251,6 +252,9 @@ class OutputProfile(Plugin):
     #: The character used to represent a star in ratings
     ratings_char = u'*'

+    #: Unsupported unicode characters to be replaced during preprocessing
+    unsupported_unicode_chars = []
+
     @classmethod
     def tags_to_string(cls, tags):
         return escape(', '.join(tags))
@@ -422,6 +426,8 @@ class SonyReaderOutput(OutputProfile):
     dpi               = 168.451
     fbase             = 12
     fsizes            = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
+    unsupported_unicode_chars = [u'\u201f', u'\u201b']
+

 class KoboReaderOutput(OutputProfile):
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 92c2fe5954..bb5c26a50c 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,49 +62,104 @@ def wrap_lines(match):
     else:
         return ital+' '

-def line_length(format, raw, percent):
+class DocAnalysis(object):
     '''
-    raw is the raw text to find the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
-    how far in the list of line lengths to use. The list of line lengths is
-    ordered smallest to larged and does not include duplicates. 0.5 is the
-    median value.
+    Provides various text analysis functions to determine how the document is structured.
+    format is the type of document analysis will be done against.
+    raw is the raw text to determine the line length to use for wrapping.
+    Blank lines are excluded from analysis
     '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    elif format == 'spanned_html':
-        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
-    lines = linere.findall(raw)
-    lengths = []
-    for line in lines:
-        if len(line) > 0:
-            lengths.append(len(line))

+    def __init__(self, format='html', raw=''):
+        raw = raw.replace('&nbsp;', ' ')
+        if format == 'html':
+            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
+        elif format == 'spanned_html':
+            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        self.lines = linere.findall(raw)

-    if not lengths:
-        return 0
+    def line_length(self, percent):
+        '''
+        Analyses the document to find the median line length.
+        percentage is a decimal number, 0 - 1 which is used to determine
+        how far in the list of line lengths to use. The list of line lengths is
+        ordered smallest to largest and does not include duplicates. 0.5 is the
+        median value.
+        '''
+        lengths = []
+        for line in self.lines:
+            if len(line) > 0:
+                lengths.append(len(line))

-    lengths = list(set(lengths))
-    total = sum(lengths)
-    avg = total / len(lengths)
-    max_line = avg * 2
+        if not lengths:
+            return 0

-    lengths = sorted(lengths)
-    for i in range(len(lengths) - 1, -1, -1):
-        if lengths[i] > max_line:
-            del lengths[i]
+        lengths = list(set(lengths))
+        total = sum(lengths)
+        avg = total / len(lengths)
+        max_line = avg * 2

-    if percent > 1:
-        percent = 1
-    if percent < 0:
-        percent = 0
+        lengths = sorted(lengths)
+        for i in range(len(lengths) - 1, -1, -1):
+            if lengths[i] > max_line:
+                del lengths[i]

-    index = int(len(lengths) * percent) - 1
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0

-    return lengths[index]
+        index = int(len(lengths) * percent) - 1
+
+        return lengths[index]
+
+    def line_histogram(self, percent):
+        '''
+        Creates a broad histogram of the document to determine whether it incorporates hard
+        line breaks. Lines are sorted into 20 'buckets' based on length.
+        percent is the percentage of lines that should be in a single bucket to return true
+        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+        '''
+        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength=1900 # Discard larger than this to stay in range
+        buckets=20 # Each line is divided into a bucket based on length
+
+        #print "there are "+str(len(lines))+" lines"
+        #max = 0
+        #for line in self.lines:
+        #    l = len(line)
+        #    if l > max:
+        #        max = l
+        #print "max line found is "+str(max)
+        # Build the line length histogram
+        hRaw = [ 0 for i in range(0,buckets) ]
+        for line in self.lines:
+            l = len(line)
+            if l > minLineLength and l < maxLineLength:
+                l = int(l/100)
+                #print "adding "+str(l)
+                hRaw[l]+=1
+
+        # Normalize the histogram into percents
+        totalLines = len(self.lines)
+        h = [ float(count)/totalLines for count in hRaw ]
+        #print "\nhRaw histogram lengths are: "+str(hRaw)
+        #print "            percents are: "+str(h)+"\n"
+
+        # Find the biggest bucket
+        maxValue = 0
+        for i in range(0,len(h)):
+            if h[i] > maxValue:
+                maxValue = h[i]
+
+        if maxValue < percent:
+            #print "Line lengths are too variable. Not unwrapping."
+            return False
+        else:
+            #print str(maxValue)+" of the lines were in one bucket"
+            return True

 class Dehyphenator(object):
     '''
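
How the new DocAnalysis class is meant to be driven — a small sketch (sample text invented; the import assumes the patched preprocess.py):

    from calibre.ebooks.conversion.preprocess import DocAnalysis

    # Three similar-length lines, the shape pdftohtml output tends to have
    raw = ('<p>It was a dark and stormy night and the rain fell in torrents,</p>'
           '<p>except at occasional intervals, when it was checked by a gust</p>'
           '<p>of wind which swept up the streets rattling along the rooftops.</p>')
    d = DocAnalysis('html', raw)
    print d.line_length(0.4)    # unwrap threshold: the 40th-percentile line length
    print d.line_histogram(.50) # True here: nearly all lines land in one bucket

When line_histogram() returns False the line lengths are too variable to be hard line breaks, and the caller skips unwrapping entirely (see the utils.py hunk further down).
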
@@ -117,42 +172,62 @@ class Dehyphenator(object):

     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
-        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
+        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)

     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
+        try:
+            wraptags = match.group('wraptags')
+        except:
+            wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
-        match = booklookup.search(self.html)
-        if match:
-            #print "returned dehyphenated word: " + str(dehyphenated)
-            return dehyphenated
-        else:
-            #print "returned hyphenated word: " + str(hyphenated)
+        try:
+            searchresult = self.html.find(str.lower(lookupword))
+        except:
             return hyphenated
+        if self.format == 'html_cleanup':
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            elif self.html.find(hyphenated) != -1:
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                return hyphenated
+            else:
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                return firsthalf+u'\u2014'+wraptags+secondhalf
+
+        else:
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            else:
+                #print "          returned hyphenated word: " + str(hyphenated)
+                return hyphenated

     def __call__(self, html, format, length=1):
         self.html = html
+        self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart>\w+)\b[^<]*<')
+            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart>\w+)\b[^<]*<')
+        elif format == 'html_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         html = intextmatch.sub(self.dehyphenate, html)
         return html
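
What the reworked Dehyphenator does with a hyphenated line break — a minimal sketch (sample string invented; 'html' mode, length=10 guards against matches too close to a line start):

    from calibre.ebooks.conversion.preprocess import Dehyphenator

    html = '<p>He was quite artis-</p><p>tic, and artistic people are rare.</p>'
    dehyphenator = Dehyphenator()
    print dehyphenator(html, 'html', 10)
    # -> '<p>He was quite artistic, and artistic people are rare.</p>'
    # 'artis-tic' is rejoined because the suffix-stripped lookup word is
    # found elsewhere in the text; otherwise the hyphenated form is kept.
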

-
 class CSSPreProcessor(object):

     PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')

@@ -286,7 +361,7 @@ class HTMLPreProcessor(object):
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

                  # Detect Chapters to match default XPATH in GUI
-                 (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                 (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                  # Cover the case where every letter in a chapter title is separated by a space
                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),

@@ -374,10 +449,8 @@ class HTMLPreProcessor(object):
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()

-        # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
+        # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
-            # unwrap em/en dashes
-            end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens
             end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
@@ -391,12 +464,15 @@ class HTMLPreProcessor(object):
             length = -1
             if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-                length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+                docanalysis = DocAnalysis('pdf', html)
+                length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
                 if length:
-                    # print "The pdf line length returned is " + str(length)
+                    #print "The pdf line length returned is " + str(length)
+                    # unwrap em/en dashes
+                    end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                     end_rules.append(
                         # Un wrap using punctuation
-                        (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                        (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                     )

             for rule in self.PREPROCESS + start_rules:
@@ -454,6 +530,14 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)

+        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+        if unsupported_unicode_chars:
+            from calibre.ebooks.unidecode.unidecoder import Unidecoder
+            unidecoder = Unidecoder()
+            for char in unsupported_unicode_chars:
+                asciichar = unidecoder.decode(char)
+                html = html.replace(char, asciichar)
+
         return html

     def smarten_punctuation(self, html):
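
The new end-of-pipeline step replaces code points a device renders badly; Sony readers get U+201F and U+201B mapped through calibre's bundled Unidecoder, exactly as the hunk above does it. A quick sketch (sample string invented):

    from calibre.ebooks.unidecode.unidecoder import Unidecoder

    unidecoder = Unidecoder()
    html = u'\u201fHello\u201d, she said.'
    for char in [u'\u201f', u'\u201b']:   # SonyReaderOutput's new list
        html = html.replace(char, unidecoder.decode(char))
    print html   # U+201F should come back as a plain ASCII quote
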
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 6a5eaa4a34..5f5c12a703 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log

 class PreProcessor(object):
@@ -77,13 +77,18 @@ class PreProcessor(object):

     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt procesor to mark up if so
+            # check if content is in pre tags, use txt processor to mark up if so
             pre = re.compile(r'<pre>', re.IGNORECASE)
             if len(pre.findall(html)) == 1:
                 self.log("Running Text Processing")
@@ -113,47 +118,77 @@ class PreProcessor(object):
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span, bold, & italics tags
-        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)

-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+        # paragraph spacing then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
+        blanks_between_paragraphs = False
         if len(lines) > 1:
             self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts, 'remove_paragraph_spacing', False):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+            elif float(len(blanklines)) / float(len(lines)) > 0.40:
+                blanks_between_paragraphs = True
+                #print "blanks between paragraphs is marked True"
+            else:
+                blanks_between_paragraphs = False
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
+        #
+        # Build the Regular Expressions in pieces
+        lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        chapter_header_close = ")\s*"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        title_header_open = "(?P<title>"
+        title_header_close = ")\s*"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
+        opt_title_close = ")?"
+
+        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+
+        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+        #print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
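
The string-assembly above trades one unreadable regex for named pieces that can be recombined per pass. A reduced sketch of the same idea (one wrapper level instead of the three used in the patch; sample markup invented):

    import re

    lookahead  = r"(?=<(p|div))"
    line_open  = r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
    chap_open  = r"(?P<chap>"
    chap_close = r")\s*"
    line_close = r"(</(?P=inner1)>)?\s*</(?P=outer)>\s*"
    typical    = r".?(Chapter|Prologue|Epilogue)\s*([\d\w-]+\s*){0,4}"

    marker = lookahead + line_open + chap_open + typical + chap_close + line_close
    m = re.search(marker, '<p><b>Chapter 12</b></p>', re.IGNORECASE)
    print m.group('chap')   # -> 'Chapter 12'

Swapping the typical piece for the numeric or uppercase alternative is all the later, more aggressive passes change.
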
-        ###### Unwrap lines ######
-        #
-        self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
         # that lines can be un-wrapped across page boundaries
@@ -168,25 +203,40 @@ class PreProcessor(object):
             format = 'html'
         else:
             format = 'html'
-
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+str(hardbreaks))
         # Calculate Length
-        length = line_length(format, html, getattr(self.extra_opts,
-            'html_unwrap_factor', 0.4))
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
         self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
-        max_length = length * 1.4
-        min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
-        #
-        # Unwrap em/en dashes, delete soft-hyphens
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
-        html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
-        # Dehyphenate
-        dehyphenator = Dehyphenator()
-        html = dehyphenator(html,'html', length)
+        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+        if hardbreaks or unwrap_factor < 0.4:
+            self.log("Unwrapping required, unwrapping Lines")
+            # Unwrap em/en dashes
+            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+            # Dehyphenate
+            self.log("Unwrapping/Removing hyphens")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html', length)
+            self.log("Done dehyphenating")
+            # Unwrap lines using punctuation and line length
+            unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            html = unwrap.sub(' ', html)
+            #check any remaining hyphens, but only unwrap if there is a match
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+        else:
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Cleaning up hyphenation")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            self.log("Done dehyphenating")

-        # Unwrap lines using punctation and line length
-        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        html = unwrap.sub(' ', html)
+        # delete soft hyphens
+        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < 10: