diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe
index cda9bf83d2..b7f9cd3c6c 100644
--- a/resources/recipes/infobae.recipe
+++ b/resources/recipes/infobae.recipe
@@ -1,12 +1,8 @@
-#!/usr/bin/env  python
-
 __license__   = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 infobae.com
 '''
-import re
-import urllib, urlparse
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
@@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets        = True
     use_embedded_content  = False
-    language = 'es'
-    lang = 'es-AR'
-
+    language              = 'es'
     encoding              = 'cp1252'
-    cover_url             = 'http://www.infobae.com/imgs/header/header.gif'
+    masthead_url          = 'http://www.infobae.com/imgs/header/header.gif'
     remove_javascript     = True
-    preprocess_regexps = [(re.compile(
-        r'<meta name="Description" content="[^"]+">'), lambda m:'')]
-
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-
-    extra_css = '''
-                    .col-center{font-family:Arial,Helvetica,sans-serif;}
-                    h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
-                    .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
-                '''
-
-    keep_only_tags = [dict(name='div', attrs={'class':['content']})]
-
-
-    remove_tags = [
-               dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
-               dict(name='a', attrs={'name' : 'comentario',}),
-               dict(name='iframe'),
-               dict(name='img', alt = "Ver galerias de imagenes"),
-
-                                 ]
-
+    remove_empty_feeds    = True
+    extra_css             = '''
+                              body{font-family:Arial,Helvetica,sans-serif;}
+                              .popUpTitulo{color:#0D4261; font-size: xx-large}
+                            '''
+    
+    conversion_options = {
+                          'comment'          : description
+                        , 'tags'             : category
+                        , 'publisher'        : publisher
+                        , 'language'         : language
+                        , 'linearize_tables' : True
+                        }
+    
 
     feeds = [
               (u'Noticias'  , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml'       )
@@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe):
              ,(u'Deportes'  , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml'  )
             ]
 
-#    def print_version(self, url):
-#        main, sep, article_part = url.partition('contenidos/')
-#        article_id, rsep, rrest = article_part.partition('-')
-#        return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
-
-    def get_article_url(self, article):
-        ans = article.get('link').encode('utf-8')
-        parts = list(urlparse.urlparse(ans))
-        parts[2] = urllib.quote(parts[2])
-        ans = urlparse.urlunparse(parts)
-        return ans.decode('utf-8')
-
-
-    def preprocess_html(self, soup):
-
-        for tag in soup.head.findAll('strong'):
-            tag.extract()
-        for tag in soup.findAll('meta'):
-            del tag['content']
-            tag.extract()
-
-        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
-        soup.head.insert(0,mtag)
-        for item in soup.findAll(style=True):
-            del item['style']
-
-        return soup
+    def print_version(self, url):
+        # The article id is whatever precedes the first '-' in the last path segment
+        article_id = url.rsplit('/', 1)[-1].split('-', 1)[0]
+        return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
 
     def postprocess_html(self, soup, first):
-
         for tag in soup.findAll(name='strong'):
              tag.name = 'b'
-
         return soup
 
 
diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe
index 13ff42b277..58b782415b 100644
--- a/resources/recipes/nspm.recipe
+++ b/resources/recipes/nspm.recipe
@@ -6,6 +6,7 @@ nspm.rs
 
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import NavigableString
 
 class Nspm(BasicNewsRecipe):
     title                 = 'Nova srpska politicka misao'
@@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
     encoding              = 'utf-8'
     language              = 'sr'
     delay                 = 2
+    remove_empty_feeds    = True
     publication_type      = 'magazine'
     masthead_url          = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
     extra_css             = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
                            dict(name=['link','object','embed','script','meta','base','iframe'])
                           ,dict(attrs={'class':'buttonheading'})
                          ]
-    remove_tags_after = dict(attrs={'class':'article_separator'})
-    remove_attributes = ['width','height']
+    remove_tags_before = dict(attrs={'class':'contentheading'})
+    remove_tags_after  = dict(attrs={'class':'article_separator'})
+    remove_attributes  = ['width','height']
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -67,4 +70,15 @@ class Nspm(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.body.findAll(style=True):
             del item['style']
+        # Replace the first link inside each <h1> with its text, moved to the
+        # front of the heading.
+        for item in soup.body.findAll('h1'):
+            link = item.a
+            # Dereferencing a missing link raises AttributeError, and a link
+            # whose .string is None has no text to put in its place.
+            if link is None or link.string is None:
+                continue
+            nh = NavigableString(link.string)
+            link.extract()
+            item.insert(0,nh)
         return self.adeify_images(soup)
diff --git a/resources/recipes/xkcd.recipe b/resources/recipes/xkcd.recipe
index 312027004e..ad0d420deb 100644
--- a/resources/recipes/xkcd.recipe
+++ b/resources/recipes/xkcd.recipe
@@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe):
         (re.compile(r'(<img.*title=")([^"]+)(".*>)'),
          lambda m: '%s%s<p>%s</p>' % (m.group(1), m.group(3), m.group(2)))
     ]
-    
+
     def parse_index(self):
         INDEX = 'http://xkcd.com/archive/'
 
-        soup = self.index_to_soup(INDEX) 
+        soup = self.index_to_soup(INDEX)
         articles = []
         for item in soup.findAll('a', title=True):
             articles.append({
                 'date': item['title'],
                 'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
                 'url': 'http://xkcd.com' + item['href'],
-                'title': self.tag_to_string(item).encode('UTF-8'),
+                'title': self.tag_to_string(item),
                 'description': '',
                 'content': '',
             })
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 256bcce6fc..7742a20a21 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -75,6 +75,8 @@ def line_length(format, raw, percent):
         linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
     elif format == 'pdf':
         linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+    elif format == 'spanned_html':
+        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
     lines = linere.findall(raw)
 
     lengths = []
@@ -223,34 +225,33 @@ class HTMLPreProcessor(object):
                   # ˙
                   (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
                   (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
-                  
+
+                  # If pdf printed from a browser then the header/footer has a reliable pattern
+                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+
+                  # Center separator lines
+                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
 
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                   # Remove <hr> tags
-                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
-                  # Replace <br><br> with <p>
-                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
-
-                  # Remove hyphenation
-                  (re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
+                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'),
 
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
-
+                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                  # Cover the case where every letter in a chapter title is separated by a space
+                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
+                  
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
                   # Clean up spaces
                   (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
-                  # Connect paragraphs split by -
-                  (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
                   # Add space before and after italics
                   (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
-                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
+                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),                            
                  ]
 
     # Fix Book Designer markup
@@ -327,13 +328,30 @@ class HTMLPreProcessor(object):
                 import traceback
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()
-
+      
+        # unwrap hyphenation - moved here so it's executed after header/footer removal
+        if is_pdftohtml:
+            # unwrap visible dashes and hyphens - don't delete them, they are often hyphens
+            # for compound words, formatting, etc
+            end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
+            # unwrap/delete soft hyphens
+            end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            # unwrap/delete soft hyphens with formatting
+            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+        
+        # Make the more aggressive chapter marking regex optional with the preprocess option to 
+        # reduce false positives and move after header/footer removal
+        if getattr(self.extra_opts, 'preprocess_html', None):
+            if is_pdftohtml:
+                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
+                
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
             if length:
+                # print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
new file mode 100644
index 0000000000..5301f70a16
--- /dev/null
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+from calibre.ebooks.conversion.preprocess import line_length
+from calibre.utils.logging import default_log
+
+class PreProcessor(object):
+    '''
+    Heuristic clean up of HTML: replaces non-breaking space indents, marks
+    up chapter headings and unwraps hard wrapped lines. Instances are
+    callable with the HTML to process and return the modified HTML.
+    '''
+
+    def __init__(self, log=None):
+        # Fall back to the default log when the caller does not supply one
+        self.log = default_log if log is None else log
+        # Number of headings/section markers found or inserted so far
+        self.html_preprocess_sections = 0
+        # Number of nbsp indents replaced by insert_indent
+        self.found_indents = 0
+
+    def chapter_head(self, match):
+        '''
+        Substitution callback: wrap the ``chap`` group in <h2> and, when it
+        matched, the ``title`` group in <h3>.
+        '''
+        chap = match.group('chap')
+        title = match.group('title')
+        self.html_preprocess_sections += 1
+        # Format with % instead of str(): the groups are unicode when the
+        # input is, and str() raises UnicodeEncodeError on non-ASCII text
+        # under Python 2.
+        if not title:
+            self.log("found %s chapters. - %s" % (self.html_preprocess_sections, chap))
+            return '<h2>'+chap+'</h2>\n'
+        self.log("found %s chapters & titles. - %s, %s" % (self.html_preprocess_sections, chap, title))
+        return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+
+    def chapter_break(self, match):
+        '''
+        Substitution callback: re-emit the opening tag captured in ``styles``
+        with a page-break-before style, followed by the ``section`` group.
+        '''
+        chap = match.group('section')
+        styles = match.group('styles')
+        self.html_preprocess_sections += 1
+        # % formatting avoids str() failing on non-ASCII unicode (Python 2)
+        self.log("marked %s section markers based on punctuation. - %s" % (self.html_preprocess_sections, chap))
+        return '<'+styles+' style="page-break-before:always">'+chap
+
+    def insert_indent(self, match):
+        '''
+        Substitution callback: emit a <p> with a text-indent style, keeping
+        the attributes captured in ``formatting`` and any opening spans
+        captured in ``span``.
+        '''
+        pstyle = match.group('formatting')
+        span = match.group('span')
+        self.found_indents += 1
+        if pstyle:
+            opening = '<p '+pstyle+' style="text-indent:3%">'
+        else:
+            opening = '<p style="text-indent:3%">'
+        # ``span`` is an optional group: None when it did not take part
+        return opening + (span or '')
+
+    def no_markup(self, raw, percent):
+        '''
+        Return True when raw contains fewer </p> tags than percent times the
+        number of line endings, i.e. too few line endings are marked up.
+        percent is a fraction and is clamped to the range 0-1.
+        '''
+        tot_htm_ends = len(re.findall('</p>', raw))
+        # \r\n must be tried first, otherwise \r matches on its own and a
+        # Windows line ending is counted twice
+        tot_ln_fds = len(re.findall('\r\n|\n|\r', raw))
+        self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
+
+        # Clamp percent to the range 0-1
+        percent = max(0, min(1, percent))
+
+        min_lns = tot_ln_fds * percent
+        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
+        # Always return a bool instead of True or an implicit None
+        return min_lns > tot_htm_ends
+
+    def __call__(self, html):
+        '''
+        Run the clean up passes over html and return the result: nbsp
+        indents, blank paragraphs, paragraph markup, chapter detection, line
+        unwrapping and demotion of doubled headings.
+        '''
+        self.log("*********  Preprocessing HTML  *********")
+        # Replace series of non-breaking spaces with text-indent
+        txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+        html = txtindent.sub(self.insert_indent, html)
+        if self.found_indents > 1:
+            self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+        # remove remaining non-breaking spaces
+        html = re.sub(ur'\u00a0', ' ', html)
+        # Get rid of empty <o:p> tags to simplify other processing
+        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+        # Get rid of empty span tags
+        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
+
+        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        blanklines = blankreg.findall(html)
+        lines = linereg.findall(html)
+        if len(lines) > 1:
+            blank_ratio = float(len(blanklines)) / float(len(lines))
+            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(blank_ratio) + " percent blank")
+            if blank_ratio > 0.40:
+                self.log("deleting blank lines")
+                html = blankreg.sub('', html)
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+
+        # some lit files don't have any <p> tags or equivalent (generally just plain text between
+        # <pre> tags), check and  mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+            self.log("not enough paragraph markers, adding now")
+            add_markup = re.compile('(?<!>)(\n)')
+            html = add_markup.sub('</p>\n<p>', html)
+
+        # detect chapters/sections to match xpath or splitting logic
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
+        #
+        # Start with most typical chapter headings, get more aggressive until one works
+        if self.html_preprocess_sections < 10:
+            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+            html = chapdetect.sub(self.chapter_head, html)
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)
+
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)
+
+        # Unwrap lines
+        #
+        self.log("Unwrapping Lines")
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+        paras = len(paras_reg.findall(html))
+        spans = len(spans_reg.findall(html))
+        if spans > 1 and float(paras) / float(spans) < 0.75:
+            wrap_format = 'spanned_html'
+        else:
+            wrap_format = 'html'
+
+        # Calculate Length
+        length = line_length(wrap_format, html, 0.4)
+        self.log("*** Median line length is " + str(length) + ",calculated with " + wrap_format + " format ***")
+        #
+        # Unwrap and/or delete soft-hyphens, hyphens. The soft hyphen is spelled
+        # \u00ad because the character itself is invisible in most editors.
+        # NOTE(review): "[[a-z\d]" also accepts a literal "[", the doubled bracket
+        # looks like a typo for "[a-z\d]" - confirm before changing
+        html = re.sub(u'\u00ad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+        html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+
+        # Unwrap lines using punctuation: join across a closing/opening tag pair when
+        # the preceding text is at least the computed length and ends in a lowercase
+        # letter, comma, semicolon, colon, I or A
+        unwrap = re.compile(r"(?<=.{%i}[a-z,;:IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        html = unwrap.sub(' ', html)
+
+        # If still no sections after unwrapping mark split points on lines with no punctuation
+        if self.html_preprocess_sections < 10:
+            self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
+            # NOTE(review): "#-*" inside the character class is a range from # to *,
+            # not the three literal characters - confirm which was intended
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_break, html)
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+
+        return html
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index d57bfddd3e..084d48e54b 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
         return (None, raw)
 
 	def preprocess_html(self, html):
-        if not hasattr(self, 'log'):
-            from calibre.utils.logging import default_log
-            self.log = default_log
-		self.log("*********  Preprocessing HTML  *********")
-		# Detect Chapters to match the xpath in the GUI
-		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-		# Unwrap lines using punctation if the median length of all lines is less than 150
-		#
-		# Insert extra line feeds so the line length regex functions properly
-		html = re.sub(r"</p>", "</p>\n", html)
-		length = line_length('html', html, 0.4)
-		self.log.debug("*** Median length is " + str(length) + " ***")
-		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-		if length < 150:
-			html = unwrap.sub(' ', html)
-        return html
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)
+
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 9bf20fb1d4..65f5c607a2 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -6,10 +6,9 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re
-
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
+
 
 class LITInput(InputFormatPlugin):
 
@@ -55,18 +54,6 @@ class LITInput(InputFormatPlugin):
 
 
 	def preprocess_html(self, html):
-		self.log("*********  Preprocessing HTML  *********")
-		# Detect Chapters to match the xpath in the GUI
-		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-		# Unwrap lines using punctation if the median length of all lines is less than 150
-		#
-		# Insert extra line feeds so the line length regex functions properly
-		html = re.sub(r"</p>", "</p>\n", html)
-		length = line_length('html', html, 0.4)
-		self.log("*** Median length is " + str(length) + " ***")
-		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-		if length < 150:
-			html = unwrap.sub(' ', html)
-        return html
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)
 
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 487e70c04f..b8dc7a9560 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -3,6 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import re
 from calibre.customize.conversion import InputFormatPlugin
 
 class MOBIInput(InputFormatPlugin):
@@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
                     include_meta_content_type=False))
                 accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
         return mr.created_opf_path
+
+    def preprocess_html(self, html):
+        # Look for a first or second level heading that is followed by another
+        # one with only whitespace and tags that are not heading openers in
+        # between. The second heading is demoted to h3 to prevent splitting
+        # between chapter headings and titles, images, etc
+        pattern = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        return pattern.sub(r'\g<firsthead><h3\g<secondhead></h3>', html)
+
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
         self.options = options
         setattr(self.options, 'new_pdf_engine', False)
         setattr(self.options, 'no_images', False)
-        setattr(self.options, 'unwrap_factor', 0.5)
+        setattr(self.options, 'unwrap_factor', 0.45)
 
     def extract_content(self, output_dir):
         self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 64a089281e..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin):
     options = set([
         OptionRecommendation(name='no_images', recommended_value=False,
             help=_('Do not extract images from the document')),
-        OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
+        OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
             help=_('Scale used to determine the length at which a line should '
             'be unwrapped. Valid values are a decimal between 0 and 1. The '
-            'default is 0.5, this is the median line length.')),
+            'default is 0.45, just below the median line length.')),
         OptionRecommendation(name='new_pdf_engine', recommended_value=False,
             help=_('Use the new PDF conversion engine.'))
     ])
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index adda8794ca..000c603c1c 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,7 +7,7 @@ import os, glob, re, textwrap
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class InlineClass(etree.XSLTExtension):
 
@@ -229,16 +229,8 @@ class RTFInput(InputFormatPlugin):
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
             if self.options.preprocess_html:
-                self.log("*********  Preprocessing HTML  *********")
-                # Detect Chapters to match the xpath in the GUI
-                chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE)
-                res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
-                # Unwrap lines using punctation if the median length of all lines is less than 150
-                length = line_length('html', res, 0.4)
-                self.log("*** Median length is " + str(length) + " ***")
-                unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE)
-                if length < 150:
-                    res = unwrap.sub(' ', res)
+                preprocessor = PreProcessor(log=getattr(self, 'log', None))
+                res = preprocessor(res)
             f.write(res)
         self.write_inline_css(inline_class)
         stream.seek(0)
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 626c68ea63..b2ee421922 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -46,7 +46,7 @@
       <double>0.010000000000000</double>
      </property>
      <property name="value">
-      <double>0.500000000000000</double>
+      <double>0.450000000000000</double>
      </property>
     </widget>
    </item>
diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index a70cf8b664..8aef350498 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -165,7 +165,9 @@ class Feed(object):
             if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                 self.articles.append(article)
             else:
-                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+                t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple())
+                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
+                        (title, t, self.title))
             d = item.get('date', '')
             article.formatted_date = d