From 8c9c5d35e479ef3267e95b62c08c96e4a4588603 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 7 Feb 2011 01:50:17 +0800
Subject: [PATCH 1/3] first pass at abbyy processor

---
 src/calibre/ebooks/conversion/utils.py | 109 +++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index c0c2ee8978..e32928fd95 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,6 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj
 
+
 class HeuristicProcessor(object):
 
     def __init__(self, extra_opts=None, log=None):
@@ -38,6 +39,9 @@ class HeuristicProcessor(object):
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
+    def is_abbyy(self, src):
+        return '<meta name="generator" content="ABBYY FineReader' in src[:1000]
+
+    def abbyy_processor(self, html):
+        abbyy_line = re.compile('((?P<linestart><p\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
+        empty_paragraph = '\n<p> </p>\n'
+        previous_line_bottom_margin = False
+        self.in_blockquote = False
+        self.previous_was_paragraph = False
+        print "detected ABBYY content, running through processor"
+        html = re.sub('</?a[^>]*>', '', html)
+
+        def check_paragraph(content):
+            content = re.sub('\s*</?span[^>]*>\s*', '', content)
+            if re.match('.*[\"\'.!?:]$', content):
+                #print "detected this as a paragraph"
+                return True
+            else:
+                return False
+
+        def convert_styles(match):
+            #print "raw styles are: "+match.group('styles')
+            content = match.group('content')
+            #print "raw content is: "+match.group('content')
+            image = match.group('image')
+
+            is_paragraph = False
+            text_align = ''
+            text_indent = ''
+            paragraph_before = ''
+            paragraph_after = ''
+            blockquote_open = '\n<blockquote>\n'
+            blockquote_close = '</blockquote>\n'
+            indented_text = 'text-indent:3%;'
+            blockquote_open_loop = ''
+            blockquote_close_loop = ''
+            debugabby = False
+
+            if image:
+                debugabby = True
+                if self.in_blockquote:
+                    self.in_blockquote = False
+                    blockquote_close_loop = blockquote_close
+                self.previous_was_paragraph = False
+                return blockquote_close_loop+'\n'+image+'\n'
+            else:
+                styles = match.group('styles').split(';')
+                is_paragraph = check_paragraph(content)
+                #print "styles for this line are: "+str(styles)
+                split_styles = []
+                for style in styles:
+                    #print "style is: "+str(style)
+                    newstyle = style.split(':')
+                    #print "newstyle is: "+str(newstyle)
+                    split_styles.append(newstyle)
+                styles = split_styles
+                for style, setting in styles:
+                    if style == 'text-align' and setting != 'left':
+                        text_align = style+':'+setting+';'
+                    if style == 'text-indent':
+                        setting = int(re.sub('\s*pt\s*', '', setting))
+                        if 9 < setting < 14:
+                            text_indent = indented_text
+                        else:
+                            text_indent = style+':'+str(setting)+'pt;'
+                    if style == 'padding':
+                        setting = re.sub('pt', '', setting).split(' ')
+                        if int(setting[1]) < 16 and int(setting[3]) < 16:
+                            if self.in_blockquote:
+                                debugabby = True
+                                if is_paragraph:
+                                    self.in_blockquote = False
+                                    blockquote_close_loop = blockquote_close
+                            if int(setting[3]) > 8 and text_indent == '':
+                                text_indent = indented_text
+                            if int(setting[0]) > 5:
+                                paragraph_before = empty_paragraph
+                            if int(setting[2]) > 5:
+                                paragraph_after = empty_paragraph
+                        elif not self.in_blockquote and self.previous_was_paragraph:
+                            debugabby = True
+                            self.in_blockquote = True
+                            blockquote_open_loop = blockquote_open
+                        if debugabby:
+                            print '\n\n******\n'
+                            print 'padding top is: '+str(setting[0])
+                            print 'padding right is: '+str(setting[1])
+                            print 'padding bottom is: '+str(setting[2])
+                            print 'padding left is: '+str(setting[3])
+
+                #print "text-align is: "+str(text_align)
+                print "\n***\nline is:\n "+str(match.group(0))+'\n'
+                if debugabby:
+                    #print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
+                    print "styles for this line were: "+str(styles)
+                    print 'newline is: \n'+blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_align+text_indent+'" >'+content+'</p>'+paragraph_after+'\n\n\n\n\n'
+                print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
+                self.previous_was_paragraph = is_paragraph
+                print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
+                return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_align+text_indent+'" >'+content+'</p>'+paragraph_after
+
+        html = abbyy_line.sub(convert_styles, html)
+        return html
+
     def __call__(self, html):
         self.log.debug("********* Heuristic processing HTML *********")
@@ -530,6 +635,10 @@ class HeuristicProcessor(object):
             self.log.warn("flow is too short, not running heuristics")
             return html
 
+        is_abbyy = self.is_abbyy(html)
+        if is_abbyy:
+            html = self.abbyy_processor(html)
+
         # Arrange line feeds and <br> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
         #self.dump(html, 'after_arrange_line_endings')

From c30e5bcaee6cc469d93977edf558d0435ac60e8a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 8 Feb 2011 09:07:30 -0700
Subject: [PATCH 2/3] Update various French Belgian recipes

---
 resources/recipes/cinebel_be.recipe | 15 +++++++++++----
 resources/recipes/dhnet_be.recipe   | 12 ++++++++++--
 resources/recipes/lalibre_be.recipe | 21 ++++++++++++++-------
 resources/recipes/lameuse_be.recipe | 11 ++++++++---
 resources/recipes/lavenir_be.recipe | 12 +++++++++---
 resources/recipes/lesoir_be.recipe  |  5 +++--
 6 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/resources/recipes/cinebel_be.recipe b/resources/recipes/cinebel_be.recipe
index ec76bfc894..024050eb67 100644
--- a/resources/recipes/cinebel_be.recipe
+++ b/resources/recipes/cinebel_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 cinebel.be
 '''
@@ -14,14 +14,14 @@ class Cinebel(BasicNewsRecipe):
     description = u'Cinema news from Belgium in French'
     publisher = u'cinebel.be'
     category = 'news, cinema, movie, Belgium'
-    oldest_article = 3
-    encoding = 'utf8'
-    language = 'fr_BE'
+    oldest_article = 15
+    language = 'fr'
 
     max_articles_per_feed = 20
     no_stylesheets = True
     use_embedded_content = False
     timefmt = ' [%d %b %Y]'
+    filterDuplicates = True
 
     keep_only_tags = [
         dict(name = 'span', attrs = {'class': 'movieMainTitle'})
@@ -35,6 +35,13 @@
           ,(u'Top 10' , u'http://www.cinebel.be/Servlets/RssServlet?languageCode=fr&rssType=2' )
          ]
 
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.has_key('href'):
+                tstr = "Site officiel: " + alink['href']
+                alink.replaceWith(tstr)
+        return soup
+
     def get_cover_url(self):
         cover_url = 'http://www.cinebel.be/portal/resources/common/logo_index.gif'
         return cover_url
diff --git a/resources/recipes/dhnet_be.recipe b/resources/recipes/dhnet_be.recipe
index ef4d1736e3..d55470a765 100644
--- a/resources/recipes/dhnet_be.recipe
+++ b/resources/recipes/dhnet_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 dhnet.be
 '''
@@ -16,7 +16,8 @@ class DHNetBe(BasicNewsRecipe):
     publisher = u'dhnet.be'
     category = 'news, Belgium'
     oldest_article = 3
-    language = 'fr_BE'
+    language = 'fr'
+    masthead_url = 'http://www.dhnet.be/images/homepage_logo_dh.gif'
 
     max_articles_per_feed = 20
     no_stylesheets = True
@@ -34,6 +35,13 @@
           ,(u'La Une Info' , u'http://www.dhnet.be/rss/dhinfos/' )
          ]
 
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
+
     def get_cover_url(self):
         cover_url = strftime('http://pdf-online.dhnet.be/pdfonline/image/%Y%m%d/dh_%Y%m%d_nam_infoge_001.pdf.L.jpg')
         return cover_url
diff --git a/resources/recipes/lalibre_be.recipe b/resources/recipes/lalibre_be.recipe
index 53e346bf12..a6356be828 100644
--- a/resources/recipes/lalibre_be.recipe
+++ b/resources/recipes/lalibre_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 lalibre.be
 '''
@@ -16,18 +16,18 @@ class LaLibre(BasicNewsRecipe):
     publisher = u'lalibre.be'
     category = 'news, Belgium'
     oldest_article = 3
-    language = 'fr_BE'
+    language = 'fr'
+    masthead_url = 'http://www.lalibre.be/img/logoLaLibre.gif'
 
     max_articles_per_feed = 20
     no_stylesheets = True
     use_embedded_content = False
     timefmt = ' [%d %b %Y]'
 
-    keep_only_tags = [
-        dict(name = 'div', attrs = {'id': 'articleHat'})
-       ,dict(name = 'p', attrs = {'id': 'publicationDate'})
-       ,dict(name = 'div', attrs = {'id': 'articleText'})
-       ]
+    remove_tags_before = dict(name = 'div', attrs = {'class': 'extraMainContent'})
+    remove_tags_after = dict(name = 'div', attrs = {'id': 'articleText'})
+
+    remove_tags = [dict(name = 'div', attrs = {'id': 'strongArticleLinks'})]
 
     feeds = [
          (u'L\'actu' , u'http://www.lalibre.be/rss/?section=10' )
@@ -38,6 +38,13 @@
          ,(u'Societe' , u'http://www.lalibre.be/rss/?section=12' )
         ]
 
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
+
     def get_cover_url(self):
         cover_url = strftime('http://pdf-online.lalibre.be/pdfonline/image/%Y%m%d/llb_%Y%m%d_nam_libre_001.pdf.L.jpg')
         return cover_url
diff --git a/resources/recipes/lameuse_be.recipe b/resources/recipes/lameuse_be.recipe
index 03b7f84a5f..7166d01103 100644
--- a/resources/recipes/lameuse_be.recipe
+++ b/resources/recipes/lameuse_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 lameuse.be
 '''
@@ -16,8 +16,8 @@ class LaMeuse(BasicNewsRecipe):
     publisher = u'lameuse.be'
     category = 'news, Belgium'
     oldest_article = 3
-    encoding = 'utf8'
-    language = 'fr_BE'
+    language = 'fr'
+    masthead_url = 'http://www.lameuse.be/images/SPV3/logo_header_LM.gif'
 
     max_articles_per_feed = 20
     no_stylesheets = True
@@ -32,6 +32,11 @@ class LaMeuse(BasicNewsRecipe):
         dict(name = 'div', attrs = {'class': 'sb-group'})
        ,dict(name = 'div', attrs = {'id': 'share'})
        ,dict(name = 'div', attrs = {'id': 'commentaires'})
+       ,dict(name = 'ul', attrs = {'class': 'right liensutiles'})
+       ,dict(name = 'ul', attrs = {'class': 'bas liensutiles'})
+       ,dict(name = 'p', attrs = {'class': 'ariane'})
+       ,dict(name = 'div', attrs = {'class': 'inner-bloc'})
+       ,dict(name = 'div', attrs = {'class': 'block-01'})
     ]
 
     feeds = [
diff --git a/resources/recipes/lavenir_be.recipe b/resources/recipes/lavenir_be.recipe
index 68be449ae5..4c2c8a00a2 100644
--- a/resources/recipes/lavenir_be.recipe
+++ b/resources/recipes/lavenir_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 lavenir.net
 '''
@@ -15,8 +15,7 @@ class LAvenir(BasicNewsRecipe):
     publisher = u'lavenir.net'
     category = 'news, Belgium'
     oldest_article = 3
-    encoding = 'utf8'
-    language = 'fr_BE'
+    language = 'fr'
 
     max_articles_per_feed = 20
     no_stylesheets = True
@@ -35,6 +34,13 @@
          ,(u'Societe' , u'http://www.lavenir.net/rss.aspx?foto=1&intro=1&section=info&info=12e1a2f4-7e03-4cf1-afec-016869072317' )
         ]
 
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
+
     def get_cover_url(self):
         cover_url = 'http://www.lavenir.net/extra/Static/journal/Pdf/1/UNE_Nationale.PDF'
         return cover_url
diff --git a/resources/recipes/lesoir_be.recipe b/resources/recipes/lesoir_be.recipe
index 6b6891c3b8..64fd2fa65c 100644
--- a/resources/recipes/lesoir_be.recipe
+++ b/resources/recipes/lesoir_be.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2008, Lionel Bergeret '
+__copyright__ = '2008-2011, Lionel Bergeret '
 '''
 lesoir.be
 '''
@@ -16,7 +16,8 @@ class LeSoirBe(BasicNewsRecipe):
     publisher = u'lesoir.be'
     category = 'news, Belgium'
     oldest_article = 3
-    language = 'fr_BE'
+    language = 'fr'
+    masthead_url = 'http://pdf.lesoir.be/pdf/images/SOIR//logo.gif'
 
     max_articles_per_feed = 20
     no_stylesheets = True

From 326ebb9bcbececee9cd37797afa9a899df5f63b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 8 Feb 2011 11:39:14 -0700
Subject: [PATCH 3/3] Turn search as you type off by default

---
 src/calibre/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py
index 88197d423d..a2ceaced68 100644
--- a/src/calibre/utils/config.py
+++ b/src/calibre/utils/config.py
@@ -733,7 +733,7 @@ def _prefs():
              'prefixes, as for example, Red instead of title:Red, '
              'limit the columns searched to those named below.'))
     c.add_opt('limit_search_columns_to',
-            default=['title', 'authors', 'tags', 'series'],
+            default=['title', 'authors', 'tags', 'series', 'publisher'],
             help=_('Choose columns to be searched when not using prefixes, '
                    'as for example, when searching for Redd instead of '
                    'title:Red. Enter a list of search/lookup names '