Merge with ldolse preprocessdev
commit d20387ea74

BIN  resources/images/news/latimes.png (new binary file, 358 B; binary file not shown)
@@ -1,25 +1,25 @@
-# -*- coding: utf-8
 __license__ = 'GPL v3'
 __author__ = 'Luis Hernandez'
 __copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
-description = 'Periódico gratuito en español - v0.8 - 27 Jan 2011'
+__version__ = 'v0.85'
+__date__ = '31 January 2011'
 
 '''
 www.20minutos.es
 '''
 
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class AdvancedUserRecipe1294946868(BasicNewsRecipe):
 
-    title = u'20 Minutos'
+    title = u'20 Minutos new'
     publisher = u'Grupo 20 Minutos'
 
-    __author__ = 'Luis Hernández'
+    __author__ = 'Luis Hernandez'
-    description = 'Periódico gratuito en español'
+    description = 'Free spanish newspaper'
     cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'
 
-    oldest_article = 5
+    oldest_article = 2
     max_articles_per_feed = 100
 
     remove_javascript = True
@@ -29,6 +29,7 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
     encoding = 'ISO-8859-1'
     language = 'es'
     timefmt = '[%a, %d %b, %Y]'
+    remove_empty_feeds = True
 
     keep_only_tags = [
        dict(name='div', attrs={'id':['content','vinetas',]})
@@ -43,13 +44,21 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
     remove_tags = [
        dict(name='ol', attrs={'class':['navigation',]})
        ,dict(name='span', attrs={'class':['action']})
-       ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
+       ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','photo-gallery side-art-block','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
        ,dict(name='div', attrs={'id':['twitter-destacados','eco-tabs','inner','vineta_calendario','vinetistas clearfix','otras_vinetas estirar','MIN1','main','SUP1','INT']})
        ,dict(name='ul', attrs={'class':['article-user-actions','stripped-list']})
        ,dict(name='ul', attrs={'id':['site-links']})
        ,dict(name='li', attrs={'class':['puntuacion','enviar','compartir']})
     ]
 
+    extra_css = """
+        p{text-align: justify; font-size: 100%}
+        body{ text-align: left; font-size:100% }
+        h3{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
+        """
 
+    preprocess_regexps = [(re.compile(r'<a href="http://estaticos.*?[0-999]px;" target="_blank">', re.DOTALL), lambda m: '')]
 
     feeds = [
        (u'Portada' , u'http://www.20minutos.es/rss/')
        ,(u'Nacional' , u'http://www.20minutos.es/rss/nacional/')
@@ -65,6 +74,6 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
        ,(u'Empleo' , u'http://www.20minutos.es/rss/empleo/')
        ,(u'Cine' , u'http://www.20minutos.es/rss/cine/')
        ,(u'Musica' , u'http://www.20minutos.es/rss/musica/')
        ,(u'Vinetas' , u'http://www.20minutos.es/rss/vinetas/')
        ,(u'Comunidad20' , u'http://www.20minutos.es/rss/zona20/')
     ]
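
A note on the preprocess_regexps entry added above: BasicNewsRecipe applies each (regex, function) pair to the downloaded article HTML in order, replacing every match with the function's return value, so the lambda here simply deletes the matched anchor tags. A minimal sketch of that mechanism, detached from the recipe machinery:

    import re

    # Each recipe entry is a (compiled pattern, replacement function) pair,
    # exactly as declared in the recipe above.
    preprocess_regexps = [
        (re.compile(r'<a href="http://estaticos.*?[0-999]px;" target="_blank">', re.DOTALL),
         lambda match: ''),
    ]

    def apply_preprocess(html, regexps):
        # Simplified form of what calibre does to the raw HTML before parsing.
        for pattern, func in regexps:
            html = pattern.sub(func, html)
        return html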
resources/recipes/cinco_dias.recipe: new file (71 lines)

@@ -0,0 +1,71 @@
+__license__ = 'GPL v3'
+__author__ = 'Luis Hernandez'
+__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
+__version__ = 'v1.2'
+__date__ = '31 January 2011'
+
+'''
+http://www.cincodias.com/
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294946868(BasicNewsRecipe):
+
+    title = u'Cinco Dias'
+    publisher = u'Grupo Prisa'
+
+    __author__ = 'Luis Hernandez'
+    description = 'Spanish web about money and business, free edition'
+
+    cover_url = 'http://www.prisa.com/images/logos/logo_cinco_dias.gif'
+    oldest_article = 2
+    max_articles_per_feed = 100
+
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+
+    language = 'es'
+    remove_empty_feeds = True
+    encoding = 'ISO-8859-1'
+    timefmt = '[%a, %d %b, %Y]'
+
+    keep_only_tags = [
+       dict(name='div', attrs={'class':['cab_articulo cab_noticia','pos_3','txt_noticia','mod_despiece']})
+       ,dict(name='p', attrs={'class':['cintillo']})
+    ]
+
+    remove_tags_before = dict(name='div' , attrs={'class':['publi_h']})
+    remove_tags_after = dict(name='div' , attrs={'class':['tab_util util_estadisticas']})
+
+    remove_tags = [
+       dict(name='div', attrs={'class':['util-1','util-2','util-3','inner estirar','inner1','inner2','inner3','cont','tab_util util_estadisticas','tab_util util_enviar','mod_list_inf','mod_similares','mod_divisas','mod_sectores','mod_termometro','mod post','mod_img','mod_txt','nivel estirar','barra estirar','info_brujula btnBrujula','utilidad_brujula estirar']})
+       ,dict(name='li', attrs={'class':['lnk-fcbook','lnk-retweet','lnk-meneame','desplegable','comentarios','list-options','estirar']})
+       ,dict(name='ul', attrs={'class':['lista-izquierda','list-options','estirar']})
+       ,dict(name='p', attrs={'class':['autor']})
+    ]
+
+    extra_css = """
+        p{text-align: justify; font-size: 100%}
+        body{ text-align: left; font-size:100% }
+        h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
+        h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
+        """
+
+    feeds = [
+       (u'Ultima Hora' , u'http://www.cincodias.com/rss/feed.html?feedId=17029')
+       ,(u'Empresas' , u'http://www.cincodias.com/rss/feed.html?feedId=19')
+       ,(u'Mercados' , u'http://www.cincodias.com/rss/feed.html?feedId=20')
+       ,(u'Economia' , u'http://www.cincodias.com/rss/feed.html?feedId=21')
+       ,(u'Tecnorama' , u'http://www.cincodias.com/rss/feed.html?feedId=17230')
+       ,(u'Tecnologia' , u'http://www.cincodias.com/rss/feed.html?feedId=17106')
+       ,(u'Finanzas Personales' , u'http://www.cincodias.com/rss/feed.html?feedId=22')
+       ,(u'Fiscalidad' , u'http://www.cincodias.com/rss/feed.html?feedId=17107')
+       ,(u'Vivienda' , u'http://www.cincodias.com/rss/feed.html?feedId=17108')
+       ,(u'Tendencias' , u'http://www.cincodias.com/rss/feed.html?feedId=17109')
+       ,(u'Empleo' , u'http://www.cincodias.com/rss/feed.html?feedId=17110')
+       ,(u'IBEX 35' , u'http://www.cincodias.com/rss/feed.html?feedId=17125')
+       ,(u'Sectores' , u'http://www.cincodias.com/rss/feed.html?feedId=17126')
+       ,(u'Opinion' , u'http://www.cincodias.com/rss/feed.html?feedId=17105')
+    ]
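
The new recipe leans on calibre's declarative filtering: remove_tags_before/remove_tags_after trim the page to the article region, keep_only_tags keeps the matched containers, and remove_tags prunes unwanted elements inside them. Roughly, in BeautifulSoup terms (illustrative sketch only; the real logic lives inside BasicNewsRecipe):

    def filter_article(soup, keep_only_tags, remove_tags):
        # Keep the declared containers, then prune the declared junk inside them.
        kept = [tag for spec in keep_only_tags for tag in soup.findAll(**spec)]
        for root in kept:
            for spec in remove_tags:
                for tag in root.findAll(**spec):
                    tag.extract()
        return kept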
@@ -143,7 +143,7 @@ def add_pipeline_options(parser, plumber):
              ' patterns. Disabled by default. Use %s to enable. '
              ' Individual actions can be disabled with the %s options.')
              % ('--enable-heuristics', '--disable-*'),
              ['enable_heuristics', 'replace_scene_breaks'] + HEURISTIC_OPTIONS
              ),
 
     'SEARCH AND REPLACE' : (
@@ -532,7 +532,7 @@ OptionRecommendation(name='format_scene_breaks',
             'horizontal rules.')),
 
 OptionRecommendation(name='replace_scene_breaks',
-        recommended_value='', level=OptionRecommendation.LOW,
+        recommended_value=None, level=OptionRecommendation.LOW,
         help=_('Replace scene breaks with the specified text.')),
 
 OptionRecommendation(name='dehyphenate',
@@ -26,9 +26,14 @@ class HeuristicProcessor(object):
         self.blanks_deleted = False
         self.blanks_between_paragraphs = False
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|spacer)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
+        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
+        self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
+        self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
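
The blankreg change above means paragraphs already classed softbreak or whitespace are no longer treated as plain blanks. A standalone check of the pattern (copied from the hunk):

    import re

    blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)

    html = '<p></p><p class="softbreak"></p><p class="x"> </p>'
    # Matches the first and third empty paragraphs; the softbreak is skipped.
    print([m.group('openline') for m in blankreg.finditer(html)])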
@@ -187,19 +192,17 @@ class HeuristicProcessor(object):
 
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
-        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_line_open = self.line_open
         title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
         title_header_open = r"(?P<title>"
         chapter_header_close = ")\s*"
         title_header_close = ")"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        chapter_line_close = self.line_close
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
 
         is_pdftohtml = self.is_pdftohtml(html)
         if is_pdftohtml:
-            chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
-            chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
             title_line_open = "<(?P<outer2>p)[^>]*>\s*"
             title_line_close = "\s*</(?P=outer2)>"
 
@@ -374,13 +377,15 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Delete self closing paragraph tags
-        html = re.sub('<p\s?/>', '', html)
+        # Re-open self closing paragraph tags
+        html = re.sub('<p[^>/]*/>', '<p> </p>', html)
         # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
+        # Empty heading tags
+        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
         self.deleted_nbsps = True
         return html
 
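
The self-closing paragraph change is a behavioural fix, not just a cleanup tweak: <p/> tags used to be deleted, which hid intentional blank lines from the later blank-line heuristics; now they are expanded into real empty paragraphs. A standalone sketch (the replacement string in the hunk carries a non-breaking space between the tags):

    import re

    html = 'Before<p/>After<p class="x"/>End'
    # Both self-closing forms become open/close pairs holding a nbsp.
    print(re.sub('<p[^>/]*/>', u'<p>\u00a0</p>', html))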
@@ -418,33 +423,98 @@ class HeuristicProcessor(object):
             if getattr(self.extra_opts, option, False):
                 return True
         return False
 
-    def detect_blank_formatting(self, html):
-        blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){1,}(?=\s*<h\d)', re.IGNORECASE)
-        blanks_after_headings = re.compile(r'(?<=</h\d>)(\s*<p[^>]*>\s*</p>){1,}', re.IGNORECASE)
 
-        def markup_spacers(match):
-            blanks = match.group(0)
-            blanks = self.blankreg.sub('\n<p class="spacer"> </p>', blanks)
-            return blanks
-        html = blanks_before_headings.sub(markup_spacers, html)
-        html = blanks_after_headings.sub(markup_spacers, html)
+    def merge_blanks(self, html, blanks_count=None):
+        base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
+        em_per_line = 1.5 # Add another 1.5 em for each additional blank
+
+        def merge_matches(match):
+            to_merge = match.group(0)
+            lines = float(len(self.single_blank.findall(to_merge))) - 1.
+            em = base_em + (em_per_line * lines)
+            if to_merge.find('whitespace') != -1:
+                newline = self.any_multi_blank.sub('\n<p class="whitespace'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
+            else:
+                newline = self.any_multi_blank.sub('\n<p class="softbreak'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
+            return newline
+
+        html = self.any_multi_blank.sub(merge_matches, html)
+        return html
+
+    def detect_whitespace(self, html):
+        blanks_around_headings = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
+        blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
+
+        def merge_header_whitespace(match):
+            initblanks = match.group('initparas')
+            endblanks = match.group('endparas')
+            heading = match.group('heading')
+            top_margin = ''
+            bottom_margin = ''
+            if initblanks is not None:
+                top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+            if endblanks is not None:
+                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
+
+            if initblanks == None and endblanks == None:
+                return heading
+            else:
+                heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
+                return heading
+
+        html = blanks_around_headings.sub(merge_header_whitespace, html)
+
+        def markup_whitespaces(match):
+            blanks = match.group(0)
+            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
+            return blanks
+
+        html = blanks_n_nopunct.sub(markup_whitespaces, html)
         if self.html_preprocess_sections > self.min_chapters:
-            html = re.sub('(?si)^.*?(?=<h\d)', markup_spacers, html)
+            html = re.sub('(?si)^.*?(?=<h\d)', markup_whitespaces, html)
 
         return html
 
     def detect_soft_breaks(self, html):
         if not self.blanks_deleted and self.blanks_between_paragraphs:
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
         else:
-            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
         return html
 
+    def markup_user_break(self, replacement_break):
+        '''
+        Takes a string the user supplies and wraps it in markup that will be centered with
+        appropriate margins. <hr> and <img> tags are allowed. If the user specifies
+        a style with width attributes in the <hr> tag then the appropriate margins are
+        applied to wrapping divs. This is because many ebook devices don't support margin:auto.
+        All other html is converted to text.
+        '''
+        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em">'
+        if re.findall('(<|>)', replacement_break):
+            if re.match('^<hr', replacement_break):
+                if replacement_break.find('width') != -1:
+                    width = int(re.sub('.*?width(:|=)(?P<wnum>\d+).*', '\g<wnum>', replacement_break))
+                    divpercent = (100 - width) / 2
+                    hr_open = re.sub('45', str(divpercent), hr_open)
+                    scene_break = hr_open+replacement_break+'</div>'
+                else:
+                    scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
+            elif re.match('^<img', replacement_break):
+                scene_break = self.scene_break_open+replacement_break+'</p>'
+            else:
+                replacement_break = html2text(replacement_break)
+                replacement_break = re.sub('\s', ' ', replacement_break)
+                scene_break = self.scene_break_open+replacement_break+'</p>'
+        else:
+            replacement_break = re.sub('\s', ' ', replacement_break)
+            scene_break = self.scene_break_open+replacement_break+'</p>'
+
+        return scene_break
+
     def __call__(self, html):
         self.log.debug("********* Heuristic processing HTML *********")
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
         try:
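
The margin arithmetic in merge_blanks encodes the size of the original blank run: a sequence of n empty paragraphs collapses to a single spacer paragraph with em = base_em + em_per_line * (n - 1), and the computed value also lands in the class name (em * 10). A worked check of the values from the code above:

    for n in (2, 3, 4):
        em = .5 + 1.5 * (n - 1)
        print(n, em, 'softbreak%d' % int(em * 10))
    # 2 -> 2.0em (softbreak20), 3 -> 3.5em (softbreak35), 4 -> 5.0em (softbreak50)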
@@ -458,7 +528,7 @@ class HeuristicProcessor(object):
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
+        #self.dump(html, 'after_arrange_line_endings')
         if self.cleanup_required():
             ###### Check Markup ######
             #
@@ -478,6 +548,11 @@ class HeuristicProcessor(object):
             # fix indents must run before this step, as it removes non-breaking spaces
             html = self.cleanup_markup(html)
 
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+            self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
+
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
@@ -489,6 +564,7 @@ class HeuristicProcessor(object):
 
         if getattr(self.extra_opts, 'markup_chapter_headings', False):
             html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
+        #self.dump(html, 'after_chapter_markup')
 
         if getattr(self.extra_opts, 'italicize_common_cases', False):
             html = self.markup_italicis(html)
@@ -498,7 +574,7 @@ class HeuristicProcessor(object):
         if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
             self.blanks_deleted = True
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
             html = self.blankreg.sub('', html)
 
         # Determine line ending type
@@ -539,7 +615,7 @@ class HeuristicProcessor(object):
         if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
             self.log.debug("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
         if getattr(self.extra_opts, 'renumber_headings', False):
@@ -549,14 +625,32 @@ class HeuristicProcessor(object):
             doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
             html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
+        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks
+        # and style them with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
+        # Multiple sequential blank paragraphs are merged with appropriate margins.
+        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
         if getattr(self.extra_opts, 'format_scene_breaks', False):
-            html = self.detect_blank_formatting(html)
+            html = self.detect_whitespace(html)
             html = self.detect_soft_breaks(html)
-            # Center separator lines
-            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
-            #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
+            blanks_count = len(self.any_multi_blank.findall(html))
+            if blanks_count >= 1:
+                html = self.merge_blanks(html, blanks_count)
+            scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
+            scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+            # If the user has enabled scene break replacement, then either softbreaks
+            # or 'hard' scene breaks are replaced, depending on which is in use.
+            # Otherwise separator lines are centered, use a bit larger margin in this case.
+            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
+            if replacement_break is not None:
+                replacement_break = self.markup_user_break(replacement_break)
+                if len(scene_break.findall(html)) >= 1:
+                    html = scene_break.sub(replacement_break, html)
+                else:
+                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
+            else:
+                html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
 
         if self.deleted_nbsps:
-            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            # put back non-breaking spaces in empty paragraphs so they render correctly
             html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
         return html
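
In markup_user_break, the width handling replaces margin:auto (poorly supported on many e-book devices) with explicit side margins: for a rule of width w percent, each side margin is (100 - w) / 2. A standalone approximation of that calculation (hypothetical helper name, mirroring the logic above):

    import re

    def center_margins(replacement_break, default=45):
        # Mirrors the width handling in markup_user_break above.
        if replacement_break.find('width') != -1:
            width = int(re.sub(r'.*?width(:|=)(?P<wnum>\d+).*', r'\g<wnum>', replacement_break))
            return (100 - width) // 2
        return default

    print(center_margins('<hr style="width:30%">'))  # 35
    print(center_margins('<hr width=20>'))           # 40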
src/calibre/ebooks/metadata/sources/base.py: new file (61 lines)

@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.customize import Plugin
+
+class Source(Plugin):
+
+    type = _('Metadata source')
+    author = 'Kovid Goyal'
+
+    supported_platforms = ['windows', 'osx', 'linux']
+
+    result_of_identify_is_complete = True
+
+    def get_author_tokens(self, authors):
+        'Take a list of authors and return a list of tokens useful for an '
+        'AND search query'
+        # Leave ' in there for Irish names
+        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+        for au in authors:
+            for tok in au.split():
+                yield pat.sub('', tok)
+
+    def split_jobs(self, jobs, num):
+        'Split a list of jobs into at most num groups, as evenly as possible'
+        groups = [[] for i in range(num)]
+        jobs = list(jobs)
+        while jobs:
+            for gr in groups:
+                try:
+                    job = jobs.pop()
+                except IndexError:
+                    break
+                gr.append(job)
+        return [g for g in groups if g]
+
+    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+        '''
+        Identify a book by its title/author/isbn/etc.
+
+        :param log: A log object, use it to output debugging information/errors
+        :param result_queue: A result Queue, results should be put into it.
+                             Each result is a Metadata object
+        :param abort: If abort.is_set() returns True, abort further processing
+                      and return as soon as possible
+        :param title: The title of the book, can be None
+        :param authors: A list of authors of the book, can be None
+        :param identifiers: A dictionary of other identifiers, most commonly
+                            {'isbn':'1234...'}
+        :return: None if no errors occurred, otherwise a unicode representation
+                 of the error suitable for showing to the user
+        '''
+        return None
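
split_jobs deals jobs round-robin from the end of the list, so group sizes differ by at most one. A standalone copy of its logic, for illustration:

    def split_jobs(jobs, num):
        # Copy of Source.split_jobs, detached from the plugin class.
        groups = [[] for i in range(num)]
        jobs = list(jobs)
        while jobs:
            for gr in groups:
                try:
                    job = jobs.pop()
                except IndexError:
                    break
                gr.append(job)
        return [g for g in groups if g]

    print(split_jobs(range(7), 5))
    # [[6, 1], [5, 0], [4], [3], [2]]

get_author_tokens strips most punctuation but deliberately keeps apostrophes, so a name like O'Brien survives as a single search token.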
src/calibre/ebooks/metadata/sources/google.py: new file (215 lines)

@@ -0,0 +1,215 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import time
+from urllib import urlencode
+from functools import partial
+from threading import Thread
+
+from lxml import etree
+
+from calibre.ebooks.metadata.sources import Source
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import parse_date, utcnow
+from calibre import browser, as_unicode
+
+NAMESPACES = {
+    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+    'atom' : 'http://www.w3.org/2005/Atom',
+    'dc': 'http://purl.org/dc/terms'
+}
+XPath = partial(etree.XPath, namespaces=NAMESPACES)
+
+total_results = XPath('//openSearch:totalResults')
+start_index = XPath('//openSearch:startIndex')
+items_per_page = XPath('//openSearch:itemsPerPage')
+entry = XPath('//atom:entry')
+entry_id = XPath('descendant::atom:id')
+creator = XPath('descendant::dc:creator')
+identifier = XPath('descendant::dc:identifier')
+title = XPath('descendant::dc:title')
+date = XPath('descendant::dc:date')
+publisher = XPath('descendant::dc:publisher')
+subject = XPath('descendant::dc:subject')
+description = XPath('descendant::dc:description')
+language = XPath('descendant::dc:language')
+
+
+def to_metadata(browser, log, entry_):
+
+    def get_text(extra, x):
+        try:
+            ans = x(extra)
+            if ans:
+                ans = ans[0].text
+                if ans and ans.strip():
+                    return ans.strip()
+        except:
+            log.exception('Programming error:')
+        return None
+
+    id_url = entry_id(entry_)[0].text
+    title_ = ': '.join([x.text for x in title(entry_)]).strip()
+    authors = [x.text.strip() for x in creator(entry_) if x.text]
+    if not authors:
+        authors = [_('Unknown')]
+    if not id_url or not title_:
+        # Silently discard this entry
+        return None
+
+    mi = Metadata(title_, authors)
+    try:
+        raw = browser.open(id_url).read()
+        feed = etree.fromstring(raw)
+        extra = entry(feed)[0]
+    except:
+        log.exception('Failed to get additional details for', mi.title)
+        return mi
+
+    mi.comments = get_text(extra, description)
+    #mi.language = get_text(extra, language)
+    mi.publisher = get_text(extra, publisher)
+
+    # Author sort
+    for x in creator(extra):
+        for key, val in x.attrib.items():
+            if key.endswith('file-as') and val and val.strip():
+                mi.author_sort = val
+                break
+    # ISBN
+    isbns = []
+    for x in identifier(extra):
+        t = str(x.text).strip()
+        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
+            if t[:5].upper() == 'ISBN:':
+                isbns.append(t[5:])
+    if isbns:
+        mi.isbn = sorted(isbns, key=len)[-1]
+
+    # Tags
+    try:
+        btags = [x.text for x in subject(extra) if x.text]
+        tags = []
+        for t in btags:
+            tags.extend([y.strip() for y in t.split('/')])
+        tags = list(sorted(list(set(tags))))
+    except:
+        log.exception('Failed to parse tags:')
+        tags = []
+    if tags:
+        mi.tags = [x.replace(',', ';') for x in tags]
+
+    # pubdate
+    pubdate = get_text(extra, date)
+    if pubdate:
+        try:
+            default = utcnow().replace(day=15)
+            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
+        except:
+            log.exception('Failed to parse pubdate')
+
+    return mi
+
+
+class Worker(Thread):
+
+    def __init__(self, log, entries, abort, result_queue):
+        self.browser, self.log, self.entries = browser(), log, entries
+        self.abort, self.result_queue = abort, result_queue
+        Thread.__init__(self)
+        self.daemon = True
+
+    def run(self):
+        for i in self.entries:
+            try:
+                ans = to_metadata(self.browser, self.log, i)
+                if ans is not None:
+                    self.result_queue.put(ans)
+            except:
+                self.log.exception(
+                    'Failed to get metadata for identify entry:',
+                    etree.tostring(i))
+            if self.abort.is_set():
+                break
+
+
+class GoogleBooks(Source):
+
+    name = 'Google Books'
+
+    def create_query(self, log, title=None, authors=None, identifiers={},
+            start_index=1):
+        BASE_URL = 'http://books.google.com/books/feeds/volumes?'
+        isbn = identifiers.get('isbn', None)
+        q = ''
+        if isbn is not None:
+            q += 'isbn:'+isbn
+        elif title or authors:
+            def build_term(prefix, parts):
+                return ' '.join('in'+prefix + ':' + x for x in parts)
+            if title is not None:
+                q += build_term('title', title.split())
+            if authors:
+                q += ('+' if q else '')+build_term('author',
+                        self.get_author_tokens(authors))
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        if not q:
+            return None
+        return BASE_URL+urlencode({
+            'q':q,
+            'max-results':20,
+            'start-index':start_index,
+            'min-viewability':'none',
+        })
+
+    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+        query = self.create_query(log, title=title, authors=authors,
+                identifiers=identifiers)
+        try:
+            raw = browser().open_novisit(query).read()
+        except Exception, e:
+            log.exception('Failed to make identify query: %r'%query)
+            return as_unicode(e)
+
+        try:
+            parser = etree.XMLParser(recover=True, no_network=True)
+            feed = etree.fromstring(raw, parser=parser)
+            entries = entry(feed)
+        except Exception, e:
+            log.exception('Failed to parse identify results')
+            return as_unicode(e)
+
+        groups = self.split_jobs(entries, 5) # At most 5 threads
+        if not groups:
+            return
+        workers = [Worker(log, entries, abort, result_queue) for entries in
+                groups]
+
+        if abort.is_set():
+            return
+
+        for worker in workers: worker.start()
+
+        has_alive_worker = True
+        while has_alive_worker and not abort.is_set():
+            has_alive_worker = False
+            for worker in workers:
+                if worker.is_alive():
+                    has_alive_worker = True
+            time.sleep(0.1)
+
+        return None
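
To make create_query concrete: for title 'Great Expectations' and authors ['Charles Dickens'], the query term becomes 'intitle:Great intitle:Expectations+inauthor:Charles inauthor:Dickens', which is then urlencoded onto the feed URL. A sketch (Python 2, matching the module above; parameter order in the output depends on dict iteration):

    from urllib import urlencode

    q = 'intitle:Great intitle:Expectations+inauthor:Charles inauthor:Dickens'
    print('http://books.google.com/books/feeds/volumes?' + urlencode({
        'q': q,
        'max-results': 20,
        'start-index': 1,
        'min-viewability': 'none',
    }))
    # An ISBN lookup instead sets q to a term of the form 'isbn:<digits>'.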
@@ -258,7 +258,6 @@ class Config(ResizableDialog, Ui_Dialog):
         if not w.pre_commit_check():
             return
         x = w.commit(save_defaults=False)
-        print x
         recs.update(x)
         self.opf_file, self.cover_file = self.mw.opf_file, self.mw.cover_file
         self._recommendations = recs
@@ -429,10 +429,12 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                 old_extensions.add(ext)
         for ext in new_extensions:
             self.db.add_format(self.row, ext, open(paths[ext], 'rb'), notify=False)
-        db_extensions = set([f.lower() for f in self.db.formats(self.row).split(',')])
+        dbfmts = self.db.formats(self.row)
+        db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts
+            else [])])
         extensions = new_extensions.union(old_extensions)
         for ext in db_extensions:
-            if ext not in extensions:
+            if ext not in extensions and ext in self.original_formats:
                 self.db.remove_format(self.row, ext, notify=False)
 
     def show_format(self, item, *args):
@@ -576,6 +578,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         self.orig_date = qt_to_dt(self.date.date())
 
         exts = self.db.formats(row)
+        self.original_formats = []
         if exts:
             exts = exts.split(',')
             for ext in exts:
@@ -586,6 +589,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                 if size is None:
                     continue
                 Format(self.formats, ext, size, timestamp=timestamp)
+                self.original_formats.append(ext.lower())
 
 
         self.initialize_combos()
@@ -472,6 +472,7 @@ class FormatsManager(QWidget): # {{{
     def initialize(self, db, id_):
         self.changed = False
         exts = db.formats(id_, index_is_id=True)
+        self.original_val = set([])
         if exts:
             exts = exts.split(',')
             for ext in exts:
@@ -482,6 +483,7 @@ class FormatsManager(QWidget): # {{{
                 if size is None:
                     continue
                 Format(self.formats, ext, size, timestamp=timestamp)
+                self.original_val.add(ext.lower())
 
     def commit(self, db, id_):
         if not self.changed:
@@ -500,11 +502,12 @@ class FormatsManager(QWidget): # {{{
         for ext in new_extensions:
             db.add_format(id_, ext, open(paths[ext], 'rb'), notify=False,
                     index_is_id=True)
-        db_extensions = set([f.lower() for f in db.formats(id_,
-            index_is_id=True).split(',')])
+        dbfmts = db.formats(id_, index_is_id=True)
+        db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts
+            else [])])
         extensions = new_extensions.union(old_extensions)
         for ext in db_extensions:
-            if ext not in extensions:
+            if ext not in extensions and ext in self.original_val:
                 db.remove_format(id_, ext, notify=False, index_is_id=True)
 
         self.changed = False
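
Both format-handling hunks fix the same crash: db.formats() returns None for a book with no formats, and the old code called .split(',') on it directly, raising AttributeError. The new guard tolerates both cases:

    # db.formats() yields a comma separated string, or None if no formats.
    for dbfmts in ('EPUB,MOBI', None):
        db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts else [])])
        print(db_extensions)
    # prints the lowercased extension set, then an empty set

The added original_formats/original_val bookkeeping also means only formats that were present when the dialog opened can be deleted on commit.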
@@ -311,10 +311,15 @@ remove all non-breaking-space entities, or may include false positive matches re
 
 :guilabel:`Ensure scene breaks are consistently formatted`
     With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.
-    It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
-    page width. Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and
-    thus become difficult to distinguish.
+    'Soft' scene break markers, i.e. scene breaks defined only by extra white space, are styled to ensure that they
+    will not be displayed in conjunction with page breaks.
+
+:guilabel:`Replace scene breaks`
+    If this option is configured then |app| will replace the scene break markers it finds with the replacement text specified by the
+    user. In general you should avoid using HTML tags: |app| will discard any tags and use pre-defined markup. <hr />
+    tags, i.e. horizontal rules, are an exception. These can optionally be specified with styles; if you choose to add your own
+    style, be sure to include the 'width' setting, otherwise the style information will be discarded.
 
 :guilabel:`Remove unnecessary hyphens`
     |app| will analyze all hyphenated content in the document when this option is enabled. The document itself is used
     as a dictionary for analysis. This allows |app| to accurately remove hyphens for any words in the document in any language,
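
For example, setting :guilabel:`Replace scene breaks` to ``<hr style="width:30%" />`` keeps the 30% rule, and (per the ``markup_user_break`` code earlier in this commit) |app| wraps it in a div with ``margin-left: 35%; margin-right: 35%`` so it stays centered on devices that ignore ``margin:auto``.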
@@ -628,7 +633,7 @@ between 0 and 1. The default is 0.45, just under the median line length. Lower t
 text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
 
 Also, they often have headers and footers as part of the document that will become included with the text.
-Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
+Use the Search and Replace panel to remove headers and footers to mitigate this issue. If the headers and footers are not
 removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read
 :ref:`regexptutorial`.
 
@@ -391,6 +391,8 @@ Take your pick:
     * A tribute to the SONY Librie which was the first e-ink based e-book reader
     * My wife chose it ;-)
 
+|app| is pronounced as cal-i-ber *not* ca-libre. If you're wondering, |app| is the British/commonwealth spelling for caliber. Being Indian, that's the natural spelling for me.
+
 Why does |app| show only some of my fonts on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 |app| embeds fonts in ebook files it creates. E-book files support embedding only TrueType (.ttf) fonts. Most fonts on OS X systems are in .dfont format, thus they cannot be embedded. |app| shows only TrueType fonts found on your system. You can obtain many TrueType fonts on the web. Simply download the .ttf files and add them to the Library/Fonts directory in your home directory.