first pass at abbyy processor

2025-12-29 16:20:20 -05:00 · 2011-02-07 01:50:17 +08:00 · 2011-02-07 01:50:17 +08:00 · 8c9c5d35e4
commit 8c9c5d35e4
parent 9b1ae4ba97
1 changed files with 109 additions and 0 deletions
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -11,6 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj

+
 class HeuristicProcessor(object):

    def __init__(self, extra_opts=None, log=None):
@ -38,6 +39,9 @@ class HeuristicProcessor(object):
    def is_pdftohtml(self, src):
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

+    def is_abbyy(self, src):
+        return '<meta name="generator" content="ABBYY FineReader' in src[:1000]
+
    def chapter_head(self, match):
        from calibre.utils.html2text import html2text
        chap = match.group('chap')
@ -516,6 +520,107 @@ class HeuristicProcessor(object):

        return scene_break

+    def abbyy_processor(self, html):
+        abbyy_line = re.compile('((?P<linestart><p\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
+        empty_paragraph = '\n<p> </p>\n'
+        previous_line_bottom_margin = False
+        self.in_blockquote = False
+        self.previous_was_paragraph = False
+        print "detected ABBYY content, running through processor"
+        html = re.sub('</?a[^>]*>', '', html)
+        
+        def check_paragraph(content):
+            content = re.sub('\s*</?span[^>]*>\s*', '', content)
+            if re.match('.*[\"\'.!?:]$', content):
+                #print "detected this as a paragraph"
+                return True
+            else:
+                return False
+        
+        def convert_styles(match):
+            #print "raw styles are: "+match.group('styles')
+            content = match.group('content')
+            #print "raw content is: "+match.group('content')
+            image = match.group('image')
+  
+            is_paragraph = False
+            text_align = ''
+            text_indent = ''
+            paragraph_before = ''
+            paragraph_after = ''
+            blockquote_open = '\n<blockquote>\n'
+            blockquote_close = '</blockquote>\n'
+            indented_text = 'text-indent:3%;'
+            blockquote_open_loop = ''
+            blockquote_close_loop = ''
+            debugabby = False
+            
+            if image:
+                debugabby = True
+                if self.in_blockquote:
+                    self.in_blockquote = False
+                    blockquote_close_loop = blockquote_close
+                self.previous_was_paragraph = False
+                return blockquote_close_loop+'\n'+image+'\n'
+            else:
+                styles = match.group('styles').split(';')
+                is_paragraph = check_paragraph(content)
+                #print "styles for this line are: "+str(styles)
+                split_styles = []
+                for style in styles:
+                   #print "style is: "+str(style)
+                   newstyle = style.split(':')
+                   #print "newstyle is: "+str(newstyle)
+                   split_styles.append(newstyle)
+                styles = split_styles          
+                for style, setting in styles:
+                    if style == 'text-align' and setting != 'left':
+                        text_align = style+':'+setting+';'
+                    if style == 'text-indent':
+                        setting = int(re.sub('\s*pt\s*', '', setting))
+                        if 9 < setting < 14:
+                            text_indent = indented_text
+                        else:
+                            text_indent = style+':'+str(setting)+'pt;'
+                    if style == 'padding':
+                        setting = re.sub('pt', '', setting).split(' ')
+                        if int(setting[1]) < 16 and int(setting[3]) < 16:
+                            if self.in_blockquote:
+                                debugabby = True
+                                if is_paragraph:
+                                    self.in_blockquote = False
+                                    blockquote_close_loop = blockquote_close
+                            if int(setting[3]) > 8 and text_indent == '':
+                                text_indent = indented_text
+                            if int(setting[0]) > 5:
+                                paragraph_before = empty_paragraph
+                            if int(setting[2]) > 5:
+                                paragraph_after = empty_paragraph
+                        elif not self.in_blockquote and self.previous_was_paragraph:
+                            debugabby = True
+                            self.in_blockquote = True
+                            blockquote_open_loop = blockquote_open
+                        if debugabby:
+                            print '\n\n******\n'
+                            print 'padding top is: '+str(setting[0])
+                            print 'padding right is: '+str(setting[1])
+                            print 'padding bottom is: '+str(setting[2])
+                            print 'padding left is: '+str(setting[3])
+
+                #print "text-align is: "+str(text_align)
+                print "\n***\nline is:\n     "+str(match.group(0))+'\n'
+                if debugabby:
+                    #print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
+                    print "styles for this line were: "+str(styles)
+                    print 'newline is: \n'+blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after+'\n\n\n\n\n'
+                print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
+                self.previous_was_paragraph = is_paragraph
+                print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
+                return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after
+        
+        html = abbyy_line.sub(convert_styles, html)
+        return html
+        

    def __call__(self, html):
        self.log.debug("*********  Heuristic processing HTML  *********")
@ -530,6 +635,10 @@ class HeuristicProcessor(object):
            self.log.warn("flow is too short, not running heuristics")
            return html

+        is_abbyy = self.is_abbyy(html)
+        if is_abbyy:
+            html = self.abbyy_processor(html)
+
        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = self.arrange_htm_line_endings(html)
        #self.dump(html, 'after_arrange_line_endings')