From 8c9c5d35e479ef3267e95b62c08c96e4a4588603 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 7 Feb 2011 01:50:17 +0800
Subject: [PATCH 1/3] first pass at abbyy processor
---
src/calibre/ebooks/conversion/utils.py | 109 +++++++++++++++++++++++++
1 file changed, 109 insertions(+)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index c0c2ee8978..e32928fd95 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,6 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
+
class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
@@ -38,6 +39,9 @@ class HeuristicProcessor(object):
def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+ def is_abbyy(self, src):
+ return '<meta name="generator" content="ABBYY FineReader' in src[:1000]
+
+ def abbyy_processor(self, html):
+ abbyy_line = re.compile('((?P<linestart><p\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
+ empty_paragraph = '\n<p> </p>\n'
+ previous_line_bottom_margin = False
+ self.in_blockquote = False
+ self.previous_was_paragraph = False
+ print "detected ABBYY content, running through processor"
+ html = re.sub('</?a[^>]*>', '', html)
+
+ def check_paragraph(content):
+ content = re.sub('\s*</?span[^>]*>\s*', '', content)
+ if re.match('.*[\"\'.!?:]$', content):
+ #print "detected this as a paragraph"
+ return True
+ else:
+ return False
+
+ def convert_styles(match):
+ #print "raw styles are: "+match.group('styles')
+ content = match.group('content')
+ #print "raw content is: "+match.group('content')
+ image = match.group('image')
+
+ is_paragraph = False
+ text_align = ''
+ text_indent = ''
+ paragraph_before = ''
+ paragraph_after = ''
+ blockquote_open = '\n<blockquote>\n'
+ blockquote_close = '</blockquote>\n'
+ indented_text = 'text-indent:3%;'
+ blockquote_open_loop = ''
+ blockquote_close_loop = ''
+ debugabby = False
+
+ if image:
+ debugabby = True
+ if self.in_blockquote:
+ self.in_blockquote = False
+ blockquote_close_loop = blockquote_close
+ self.previous_was_paragraph = False
+ return blockquote_close_loop+'\n'+image+'\n'
+ else:
+ styles = match.group('styles').split(';')
+ is_paragraph = check_paragraph(content)
+ #print "styles for this line are: "+str(styles)
+ split_styles = []
+ for style in styles:
+ #print "style is: "+str(style)
+ newstyle = style.split(':')
+ #print "newstyle is: "+str(newstyle)
+ split_styles.append(newstyle)
+ styles = split_styles
+ for style, setting in styles:
+ if style == 'text-align' and setting != 'left':
+ text_align = style+':'+setting+';'
+ if style == 'text-indent':
+ setting = int(re.sub('\s*pt\s*', '', setting))
+ if 9 < setting < 14:
+ text_indent = indented_text
+ else:
+ text_indent = style+':'+str(setting)+'pt;'
+ if style == 'padding':
+ setting = re.sub('pt', '', setting).split(' ')
+ if int(setting[1]) < 16 and int(setting[3]) < 16:
+ if self.in_blockquote:
+ debugabby = True
+ if is_paragraph:
+ self.in_blockquote = False
+ blockquote_close_loop = blockquote_close
+ if int(setting[3]) > 8 and text_indent == '':
+ text_indent = indented_text
+ if int(setting[0]) > 5:
+ paragraph_before = empty_paragraph
+ if int(setting[2]) > 5:
+ paragraph_after = empty_paragraph
+ elif not self.in_blockquote and self.previous_was_paragraph:
+ debugabby = True
+ self.in_blockquote = True
+ blockquote_open_loop = blockquote_open
+ if debugabby:
+ print '\n\n******\n'
+ print 'padding top is: '+str(setting[0])
+ print 'padding right is: '+str(setting[1])
+ print 'padding bottom is: '+str(setting[2])
+ print 'padding left is: '+str(setting[3])
+
+ #print "text-align is: "+str(text_align)
+ print "\n***\nline is:\n "+str(match.group(0))+'\n'
+ if debugabby:
+ #print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
+ print "styles for this line were: "+str(styles)
+ print 'newline is: \n'+blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_align+text_indent+'">'+content+'</p>'+paragraph_after+'\n\n\n\n\n'
+ print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
+ self.previous_was_paragraph = is_paragraph
+ print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
+ return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_align+text_indent+'">'+content+'</p>'+paragraph_after
+
+ html = abbyy_line.sub(convert_styles, html)
+ return html
+
def __call__(self, html):
self.log.debug("********* Heuristic processing HTML *********")
@@ -530,6 +635,10 @@ class HeuristicProcessor(object):
self.log.warn("flow is too short, not running heuristics")
return html
+ is_abbyy = self.is_abbyy(html)
+ if is_abbyy:
+ html = self.abbyy_processor(html)
+
# Arrange line feeds and tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html)
#self.dump(html, 'after_arrange_line_endings')
From c30e5bcaee6cc469d93977edf558d0435ac60e8a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 8 Feb 2011 09:07:30 -0700
Subject: [PATCH 2/3] Update various French Belgian recipes
---
resources/recipes/cinebel_be.recipe | 15 +++++++++++----
resources/recipes/dhnet_be.recipe | 12 ++++++++++--
resources/recipes/lalibre_be.recipe | 21 ++++++++++++++-------
resources/recipes/lameuse_be.recipe | 11 ++++++++---
resources/recipes/lavenir_be.recipe | 12 +++++++++---
resources/recipes/lesoir_be.recipe | 5 +++--
6 files changed, 55 insertions(+), 21 deletions(-)
diff --git a/resources/recipes/cinebel_be.recipe b/resources/recipes/cinebel_be.recipe
index ec76bfc894..024050eb67 100644
--- a/resources/recipes/cinebel_be.recipe
+++ b/resources/recipes/cinebel_be.recipe
@@ -1,7 +1,7 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
'''
cinebel.be
'''
@@ -14,14 +14,14 @@ class Cinebel(BasicNewsRecipe):
description = u'Cinema news from Belgium in French'
publisher = u'cinebel.be'
category = 'news, cinema, movie, Belgium'
- oldest_article = 3
- encoding = 'utf8'
- language = 'fr_BE'
+ oldest_article = 15
+ language = 'fr'
max_articles_per_feed = 20
no_stylesheets = True
use_embedded_content = False
timefmt = ' [%d %b %Y]'
+ filterDuplicates = True
keep_only_tags = [
dict(name = 'span', attrs = {'class': 'movieMainTitle'})
@@ -35,6 +35,13 @@ class Cinebel(BasicNewsRecipe):
,(u'Top 10' , u'http://www.cinebel.be/Servlets/RssServlet?languageCode=fr&rssType=2' )
]
+ def preprocess_html(self, soup):
+ for alink in soup.findAll('a'):
+ if alink.has_key('href'):
+ tstr = "Site officiel: " + alink['href']
+ alink.replaceWith(tstr)
+ return soup
+
def get_cover_url(self):
cover_url = 'http://www.cinebel.be/portal/resources/common/logo_index.gif'
return cover_url
diff --git a/resources/recipes/dhnet_be.recipe b/resources/recipes/dhnet_be.recipe
index ef4d1736e3..d55470a765 100644
--- a/resources/recipes/dhnet_be.recipe
+++ b/resources/recipes/dhnet_be.recipe
@@ -1,7 +1,7 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
'''
dhnet.be
'''
@@ -16,7 +16,8 @@ class DHNetBe(BasicNewsRecipe):
publisher = u'dhnet.be'
category = 'news, Belgium'
oldest_article = 3
- language = 'fr_BE'
+ language = 'fr'
+ masthead_url = 'http://www.dhnet.be/images/homepage_logo_dh.gif'
max_articles_per_feed = 20
no_stylesheets = True
@@ -34,6 +35,13 @@ class DHNetBe(BasicNewsRecipe):
,(u'La Une Info' , u'http://www.dhnet.be/rss/dhinfos/' )
]
+ def preprocess_html(self, soup):
+ for alink in soup.findAll('a'):
+ if alink.string is not None:
+ tstr = alink.string
+ alink.replaceWith(tstr)
+ return soup
+
def get_cover_url(self):
cover_url = strftime('http://pdf-online.dhnet.be/pdfonline/image/%Y%m%d/dh_%Y%m%d_nam_infoge_001.pdf.L.jpg')
return cover_url
diff --git a/resources/recipes/lalibre_be.recipe b/resources/recipes/lalibre_be.recipe
index 53e346bf12..a6356be828 100644
--- a/resources/recipes/lalibre_be.recipe
+++ b/resources/recipes/lalibre_be.recipe
@@ -1,7 +1,7 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
'''
lalibre.be
'''
@@ -16,18 +16,18 @@ class LaLibre(BasicNewsRecipe):
publisher = u'lalibre.be'
category = 'news, Belgium'
oldest_article = 3
- language = 'fr_BE'
+ language = 'fr'
+ masthead_url = 'http://www.lalibre.be/img/logoLaLibre.gif'
max_articles_per_feed = 20
no_stylesheets = True
use_embedded_content = False
timefmt = ' [%d %b %Y]'
- keep_only_tags = [
- dict(name = 'div', attrs = {'id': 'articleHat'})
- ,dict(name = 'p', attrs = {'id': 'publicationDate'})
- ,dict(name = 'div', attrs = {'id': 'articleText'})
- ]
+ remove_tags_before = dict(name = 'div', attrs = {'class': 'extraMainContent'})
+ remove_tags_after = dict(name = 'div', attrs = {'id': 'articleText'})
+
+ remove_tags = [dict(name = 'div', attrs = {'id': 'strongArticleLinks'})]
feeds = [
(u'L\'actu' , u'http://www.lalibre.be/rss/?section=10' )
@@ -38,6 +38,13 @@ class LaLibre(BasicNewsRecipe):
,(u'Societe' , u'http://www.lalibre.be/rss/?section=12' )
]
+ def preprocess_html(self, soup):
+ for alink in soup.findAll('a'):
+ if alink.string is not None:
+ tstr = alink.string
+ alink.replaceWith(tstr)
+ return soup
+
def get_cover_url(self):
cover_url = strftime('http://pdf-online.lalibre.be/pdfonline/image/%Y%m%d/llb_%Y%m%d_nam_libre_001.pdf.L.jpg')
return cover_url
diff --git a/resources/recipes/lameuse_be.recipe b/resources/recipes/lameuse_be.recipe
index 03b7f84a5f..7166d01103 100644
--- a/resources/recipes/lameuse_be.recipe
+++ b/resources/recipes/lameuse_be.recipe
@@ -1,7 +1,7 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
'''
lameuse.be
'''
@@ -16,8 +16,8 @@ class LaMeuse(BasicNewsRecipe):
publisher = u'lameuse.be'
category = 'news, Belgium'
oldest_article = 3
- encoding = 'utf8'
- language = 'fr_BE'
+ language = 'fr'
+ masthead_url = 'http://www.lameuse.be/images/SPV3/logo_header_LM.gif'
max_articles_per_feed = 20
no_stylesheets = True
@@ -32,6 +32,11 @@ class LaMeuse(BasicNewsRecipe):
dict(name = 'div', attrs = {'class': 'sb-group'})
,dict(name = 'div', attrs = {'id': 'share'})
,dict(name = 'div', attrs = {'id': 'commentaires'})
+ ,dict(name = 'ul', attrs = {'class': 'right liensutiles'})
+ ,dict(name = 'ul', attrs = {'class': 'bas liensutiles'})
+ ,dict(name = 'p', attrs = {'class': 'ariane'})
+ ,dict(name = 'div', attrs = {'class': 'inner-bloc'})
+ ,dict(name = 'div', attrs = {'class': 'block-01'})
]
feeds = [
diff --git a/resources/recipes/lavenir_be.recipe b/resources/recipes/lavenir_be.recipe
index 68be449ae5..4c2c8a00a2 100644
--- a/resources/recipes/lavenir_be.recipe
+++ b/resources/recipes/lavenir_be.recipe
@@ -1,7 +1,7 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
'''
lavenir.net
'''
@@ -15,8 +15,7 @@ class LAvenir(BasicNewsRecipe):
publisher = u'lavenir.net'
category = 'news, Belgium'
oldest_article = 3
- encoding = 'utf8'
- language = 'fr_BE'
+ language = 'fr'
max_articles_per_feed = 20
no_stylesheets = True
@@ -35,6 +34,13 @@ class LAvenir(BasicNewsRecipe):
,(u'Societe' , u'http://www.lavenir.net/rss.aspx?foto=1&intro=1§ion=info&info=12e1a2f4-7e03-4cf1-afec-016869072317' )
]
+ def preprocess_html(self, soup):
+ for alink in soup.findAll('a'):
+ if alink.string is not None:
+ tstr = alink.string
+ alink.replaceWith(tstr)
+ return soup
+
def get_cover_url(self):
cover_url = 'http://www.lavenir.net/extra/Static/journal/Pdf/1/UNE_Nationale.PDF'
return cover_url
diff --git a/resources/recipes/lesoir_be.recipe b/resources/recipes/lesoir_be.recipe
index 6b6891c3b8..64fd2fa65c 100644
--- a/resources/recipes/lesoir_be.recipe
+++ b/resources/recipes/lesoir_be.recipe
@@ -1,7 +1,7 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
'''
lesoir.be
'''
@@ -16,7 +16,8 @@ class LeSoirBe(BasicNewsRecipe):
publisher = u'lesoir.be'
category = 'news, Belgium'
oldest_article = 3
- language = 'fr_BE'
+ language = 'fr'
+ masthead_url = 'http://pdf.lesoir.be/pdf/images/SOIR//logo.gif'
max_articles_per_feed = 20
no_stylesheets = True
From 326ebb9bcbececee9cd37797afa9a899df5f63b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 8 Feb 2011 11:39:14 -0700
Subject: [PATCH 3/3] Turn search as you type off by default
---
src/calibre/utils/config.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py
index 88197d423d..a2ceaced68 100644
--- a/src/calibre/utils/config.py
+++ b/src/calibre/utils/config.py
@@ -733,7 +733,7 @@ def _prefs():
'prefixes, as for example, Red instead of title:Red, '
'limit the columns searched to those named below.'))
c.add_opt('limit_search_columns_to',
- default=['title', 'authors', 'tags', 'series'],
+ default=['title', 'authors', 'tags', 'series', 'publisher'],
help=_('Choose columns to be searched when not using prefixes, '
'as for example, when searching for Redd instead of '
'title:Red. Enter a list of search/lookup names '