From 8c9c5d35e479ef3267e95b62c08c96e4a4588603 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 7 Feb 2011 01:50:17 +0800
Subject: [PATCH 1/3] first pass at abbyy processor

---
 src/calibre/ebooks/conversion/utils.py | 109 +++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index c0c2ee8978..e32928fd95 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,6 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj
 
+
 class HeuristicProcessor(object):
 
     def __init__(self, extra_opts=None, log=None):
@@ -38,6 +39,9 @@ class HeuristicProcessor(object):
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
+    def is_abbyy(self, src):
+        return '<meta name="generator" content="ABBYY FineReader' in src[:1000]
+
+    def abbyy_processor(self, html):
+        abbyy_line = re.compile('((?P<linestart><p\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
+        empty_paragraph = '\n<p> </p>\n'
+        previous_line_bottom_margin = False
+        self.in_blockquote = False
+        self.previous_was_paragraph = False
+        print "detected ABBYY content, running through processor"
+        html = re.sub('</?a[^>]*>', '', html)
+
+        def check_paragraph(content):
+            content = re.sub('\s*</?span[^>]*>\s*', '', content)
+            if re.match('.*[\"\'.!?:]$', content):
+                #print "detected this as a paragraph"
+                return True
+            else:
+                return False
+
+        def convert_styles(match):
+            #print "raw styles are: "+match.group('styles')
+            content = match.group('content')
+            #print "raw content is: "+match.group('content')
+            image = match.group('image')
+
+            is_paragraph = False
+            text_align = ''
+            text_indent = ''
+            paragraph_before = ''
+            paragraph_after = ''
+            blockquote_open = '\n<blockquote>\n'
+            blockquote_close = '</blockquote>\n'
+            indented_text = 'text-indent:3%;'
+            blockquote_open_loop = ''
+            blockquote_close_loop = ''
+            debugabby = False
+
+            if image:
+                debugabby = True
+                if self.in_blockquote:
+                    self.in_blockquote = False
+                    blockquote_close_loop = blockquote_close
+                self.previous_was_paragraph = False
+                return blockquote_close_loop+'\n'+image+'\n'
+            else:
+                styles = match.group('styles').split(';')
+                is_paragraph = check_paragraph(content)
+                #print "styles for this line are: "+str(styles)
+                split_styles = []
+                for style in styles:
+                    #print "style is: "+str(style)
+                    newstyle = style.split(':')
+                    #print "newstyle is: "+str(newstyle)
+                    split_styles.append(newstyle)
+                styles = split_styles
+                for style, setting in styles:
+                    if style == 'text-align' and setting != 'left':
+                        text_align = style+':'+setting+';'
+                    if style == 'text-indent':
+                        setting = int(re.sub('\s*pt\s*', '', setting))
+                        if 9 < setting < 14:
+                            text_indent = indented_text
+                        else:
+                            text_indent = style+':'+str(setting)+'pt;'
+                    if style == 'padding':
+                        setting = re.sub('pt', '', setting).split(' ')
+                        if int(setting[1]) < 16 and int(setting[3]) < 16:
+                            if self.in_blockquote:
+                                debugabby = True
+                                if is_paragraph:
+                                    self.in_blockquote = False
+                                    blockquote_close_loop = blockquote_close
+                            if int(setting[3]) > 8 and text_indent == '':
+                                text_indent = indented_text
+                            if int(setting[0]) > 5:
+                                paragraph_before = empty_paragraph
+                            if int(setting[2]) > 5:
+                                paragraph_after = empty_paragraph
+                        elif not self.in_blockquote and self.previous_was_paragraph:
+                            debugabby = True
+                            self.in_blockquote = True
+                            blockquote_open_loop = blockquote_open
+                        if debugabby:
+                            print '\n\n******\n'
+                            print 'padding top is: '+str(setting[0])
+                            print 'padding right is: '+str(setting[1])
+                            print 'padding bottom is: '+str(setting[2])
+                            print 'padding left is: '+str(setting[3])
+
+                #print "text-align is: "+str(text_align)
+                print "\n***\nline is:\n "+str(match.group(0))+'\n'
+                if debugabby:
+                    #print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
+                    print "styles for this line were: "+str(styles)
+                    print 'newline is: \n'+blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_align+text_indent+'" >'+content+'</p>'+paragraph_after+'\n\n\n\n\n'
+                print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
+                self.previous_was_paragraph = is_paragraph
+                print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
+                return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_align+text_indent+'" >'+content+'</p>'+paragraph_after
+
+        html = abbyy_line.sub(convert_styles, html)
+        return html
+
     def __call__(self, html):
         self.log.debug("********* Heuristic processing HTML *********")
@@ -530,6 +635,10 @@ class HeuristicProcessor(object):
             self.log.warn("flow is too short, not running heuristics")
             return html
 
+        is_abbyy = self.is_abbyy(html)
+        if is_abbyy:
+            html = self.abbyy_processor(html)
+
         # Arrange line feeds and <br> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
         #self.dump(html, 'after_arrange_line_endings')

From c30e5bcaee6cc469d93977edf558d0435ac60e8a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 8 Feb 2011 09:07:30 -0700
Subject: [PATCH 2/3] Update various French Belgian recipes

---
 resources/recipes/cinebel_be.recipe | 15 +++++++++++----
 resources/recipes/dhnet_be.recipe   | 12 ++++++++++--
 resources/recipes/lalibre_be.recipe | 21 ++++++++++++++-------
 resources/recipes/lameuse_be.recipe | 11 ++++++++---
 resources/recipes/lavenir_be.recipe | 12 +++++++++---
 resources/recipes/lesoir_be.recipe  |  5 +++--
 6 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/resources/recipes/cinebel_be.recipe b/resources/recipes/cinebel_be.recipe
index ec76bfc894..024050eb67 100644
--- a/resources/recipes/cinebel_be.recipe
+++ b/resources/recipes/cinebel_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 cinebel.be
 '''
@@ -14,14 +14,14 @@ class Cinebel(BasicNewsRecipe):
     description = u'Cinema news from Belgium in French'
     publisher = u'cinebel.be'
     category = 'news, cinema, movie, Belgium'
-    oldest_article = 3
-    encoding = 'utf8'
-    language = 'fr_BE'
+    oldest_article = 15
+    language = 'fr'
 
     max_articles_per_feed = 20
     no_stylesheets = True
     use_embedded_content = False
     timefmt = ' [%d %b %Y]'
+    filterDuplicates = True
 
     keep_only_tags = [
         dict(name = 'span', attrs = {'class': 'movieMainTitle'})
@@ -35,6 +35,13 @@
           ,(u'Top 10' , u'http://www.cinebel.be/Servlets/RssServlet?languageCode=fr&rssType=2' )
          ]
 
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.has_key('href'):
+                tstr = "Site officiel: " + alink['href']
+                alink.replaceWith(tstr)
+        return soup
+
     def get_cover_url(self):
         cover_url = 'http://www.cinebel.be/portal/resources/common/logo_index.gif'
         return cover_url
diff --git a/resources/recipes/dhnet_be.recipe b/resources/recipes/dhnet_be.recipe
index ef4d1736e3..d55470a765 100644
--- a/resources/recipes/dhnet_be.recipe
+++ b/resources/recipes/dhnet_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 dhnet.be
 '''
@@ -16,7 +16,8 @@ class DHNetBe(BasicNewsRecipe):
     publisher = u'dhnet.be'
     category = 'news, Belgium'
     oldest_article = 3
-    language = 'fr_BE'
+    language = 'fr'
+    masthead_url = 'http://www.dhnet.be/images/homepage_logo_dh.gif'
 
     max_articles_per_feed = 20
     no_stylesheets = True
@@ -34,6 +35,13 @@
           ,(u'La Une Info' , u'http://www.dhnet.be/rss/dhinfos/' )
          ]
 
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
+
     def get_cover_url(self):
         cover_url = strftime('http://pdf-online.dhnet.be/pdfonline/image/%Y%m%d/dh_%Y%m%d_nam_infoge_001.pdf.L.jpg')
         return cover_url
diff --git a/resources/recipes/lalibre_be.recipe b/resources/recipes/lalibre_be.recipe
index 53e346bf12..a6356be828 100644
--- a/resources/recipes/lalibre_be.recipe
+++ b/resources/recipes/lalibre_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 lalibre.be
 '''
@@ -16,18 +16,18 @@ class LaLibre(BasicNewsRecipe):
     publisher = u'lalibre.be'
     category = 'news, Belgium'
     oldest_article = 3
-    language = 'fr_BE'
+    language = 'fr'
+    masthead_url = 'http://www.lalibre.be/img/logoLaLibre.gif'
 
     max_articles_per_feed = 20
     no_stylesheets = True
     use_embedded_content = False
     timefmt = ' [%d %b %Y]'
 
-    keep_only_tags = [
-        dict(name = 'div', attrs = {'id': 'articleHat'})
-       ,dict(name = 'p', attrs = {'id': 'publicationDate'})
-       ,dict(name = 'div', attrs = {'id': 'articleText'})
-       ]
+    remove_tags_before = dict(name = 'div', attrs = {'class': 'extraMainContent'})
+    remove_tags_after = dict(name = 'div', attrs = {'id': 'articleText'})
+
+    remove_tags = [dict(name = 'div', attrs = {'id': 'strongArticleLinks'})]
 
     feeds = [
          (u'L\'actu' , u'http://www.lalibre.be/rss/?section=10' )
@@ -38,6 +38,13 @@
          ,(u'Societe' , u'http://www.lalibre.be/rss/?section=12' )
         ]
 
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
+
     def get_cover_url(self):
         cover_url = strftime('http://pdf-online.lalibre.be/pdfonline/image/%Y%m%d/llb_%Y%m%d_nam_libre_001.pdf.L.jpg')
         return cover_url
diff --git a/resources/recipes/lameuse_be.recipe b/resources/recipes/lameuse_be.recipe
index 03b7f84a5f..7166d01103 100644
--- a/resources/recipes/lameuse_be.recipe
+++ b/resources/recipes/lameuse_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 lameuse.be
 '''
@@ -16,8 +16,8 @@ class LaMeuse(BasicNewsRecipe):
     publisher = u'lameuse.be'
     category = 'news, Belgium'
     oldest_article = 3
-    encoding = 'utf8'
-    language = 'fr_BE'
+    language = 'fr'
+    masthead_url = 'http://www.lameuse.be/images/SPV3/logo_header_LM.gif'
 
     max_articles_per_feed = 20
     no_stylesheets = True
@@ -32,6 +32,11 @@ class LaMeuse(BasicNewsRecipe):
         dict(name = 'div', attrs = {'class': 'sb-group'})
        ,dict(name = 'div', attrs = {'id': 'share'})
        ,dict(name = 'div', attrs = {'id': 'commentaires'})
+       ,dict(name = 'ul', attrs = {'class': 'right liensutiles'})
+       ,dict(name = 'ul', attrs = {'class': 'bas liensutiles'})
+       ,dict(name = 'p', attrs = {'class': 'ariane'})
+       ,dict(name = 'div', attrs = {'class': 'inner-bloc'})
+       ,dict(name = 'div', attrs = {'class': 'block-01'})
     ]
 
     feeds = [
diff --git a/resources/recipes/lavenir_be.recipe b/resources/recipes/lavenir_be.recipe
index 68be449ae5..4c2c8a00a2 100644
--- a/resources/recipes/lavenir_be.recipe
+++ b/resources/recipes/lavenir_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 lavenir.net
 '''
@@ -15,8 +15,7 @@ class LAvenir(BasicNewsRecipe):
     publisher = u'lavenir.net'
     category = 'news, Belgium'
     oldest_article = 3
-    encoding = 'utf8'
-    language = 'fr_BE'
+    language = 'fr'
 
     max_articles_per_feed = 20
     no_stylesheets = True
@@ -35,6 +34,13 @@
          ,(u'Societe' , u'http://www.lavenir.net/rss.aspx?foto=1&intro=1&section=info&info=12e1a2f4-7e03-4cf1-afec-016869072317' )
         ]
 
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
+
     def get_cover_url(self):
         cover_url = 'http://www.lavenir.net/extra/Static/journal/Pdf/1/UNE_Nationale.PDF'
         return cover_url
diff --git a/resources/recipes/lesoir_be.recipe b/resources/recipes/lesoir_be.recipe
index 6b6891c3b8..64fd2fa65c 100644
--- a/resources/recipes/lesoir_be.recipe
+++ b/resources/recipes/lesoir_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 lesoir.be
 '''
@@ -16,7 +16,8 @@ class LeSoirBe(BasicNewsRecipe):
     publisher = u'lesoir.be'
     category = 'news, Belgium'
     oldest_article = 3
-    language = 'fr_BE'
+    language = 'fr'
+    masthead_url = 'http://pdf.lesoir.be/pdf/images/SOIR//logo.gif'
 
     max_articles_per_feed = 20
     no_stylesheets = True

From 326ebb9bcbececee9cd37797afa9a899df5f63b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 8 Feb 2011 11:39:14 -0700
Subject: [PATCH 3/3] Turn search as you type off by default

---
 src/calibre/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py
index 88197d423d..a2ceaced68 100644
--- a/src/calibre/utils/config.py
+++ b/src/calibre/utils/config.py
@@ -733,7 +733,7 @@ def _prefs():
              'prefixes, as for example, Red instead of title:Red, '
              'limit the columns searched to those named below.'))
     c.add_opt('limit_search_columns_to',
-            default=['title', 'authors', 'tags', 'series'],
+            default=['title', 'authors', 'tags', 'series', 'publisher'],
             help=_('Choose columns to be searched when not using prefixes, '
                    'as for example, when searching for Redd instead of '
                    'title:Red. Enter a list of search/lookup names '