diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index d075390e8e..c0612a7b9c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,6 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
+
class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
@@ -40,6 +41,9 @@ class HeuristicProcessor(object):
def is_pdftohtml(self, src):
return '' in src[:1000]
+ def is_abbyy(self, src):
+ return '[^\"]*?);?">)(?P
\n' + self.in_blockquote = False + self.previous_was_paragraph = False + html = re.sub('?a[^>]*>', '', html) + + def check_paragraph(content): + content = re.sub('\s*?span[^>]*>\s*', '', content) + if re.match('.*[\"\'.!?:]$', content): + #print "detected this as a paragraph" + return True + else: + return False + + def convert_styles(match): + #print "raw styles are: "+match.group('styles') + content = match.group('content') + #print "raw content is: "+match.group('content') + image = match.group('image') + + is_paragraph = False + text_align = '' + text_indent = '' + paragraph_before = '' + paragraph_after = '' + blockquote_open = '\n
\n' + blockquote_close = '\n' + indented_text = 'text-indent:3%;' + blockquote_open_loop = '' + blockquote_close_loop = '' + debugabby = False + + if image: + debugabby = True + if self.in_blockquote: + self.in_blockquote = False + blockquote_close_loop = blockquote_close + self.previous_was_paragraph = False + return blockquote_close_loop+'\n'+image+'\n' + else: + styles = match.group('styles').split(';') + is_paragraph = check_paragraph(content) + #print "styles for this line are: "+str(styles) + split_styles = [] + for style in styles: + #print "style is: "+str(style) + newstyle = style.split(':') + #print "newstyle is: "+str(newstyle) + split_styles.append(newstyle) + styles = split_styles + for style, setting in styles: + if style == 'text-align' and setting != 'left': + text_align = style+':'+setting+';' + if style == 'text-indent': + setting = int(re.sub('\s*pt\s*', '', setting)) + if 9 < setting < 14: + text_indent = indented_text + else: + text_indent = style+':'+str(setting)+'pt;' + if style == 'padding': + setting = re.sub('pt', '', setting).split(' ') + if int(setting[1]) < 16 and int(setting[3]) < 16: + if self.in_blockquote: + debugabby = True + if is_paragraph: + self.in_blockquote = False + blockquote_close_loop = blockquote_close + if int(setting[3]) > 8 and text_indent == '': + text_indent = indented_text + if int(setting[0]) > 5: + paragraph_before = empty_paragraph + if int(setting[2]) > 5: + paragraph_after = empty_paragraph + elif not self.in_blockquote and self.previous_was_paragraph: + debugabby = True + self.in_blockquote = True + blockquote_open_loop = blockquote_open + if debugabby: + self.log.debug('\n\n******\n') + self.log.debug('padding top is: '+str(setting[0])) + self.log.debug('padding right is:' + +str(setting[1])) + self.log.debug('padding bottom is: ' + + str(setting[2])) + self.log.debug('padding left is: ' + +str(setting[3])) + + #print "text-align is: "+str(text_align) + #print "\n***\nline is:\n "+str(match.group(0))+'\n' + if debugabby: + #print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph) + self.log.debug("styles for this line were:", styles) + self.log.debug('newline is:') + self.log.debug(blockquote_open_loop+blockquote_close_loop+ + paragraph_before+'
'+content+'
'+paragraph_after+'\n\n\n\n\n') + #print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph) + self.previous_was_paragraph = is_paragraph + #print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n" + return blockquote_open_loop+blockquote_close_loop+paragraph_before+''+content+'
'+paragraph_after + + html = abbyy_line.sub(convert_styles, html) + return html + def __call__(self, html): self.log.debug("********* Heuristic processing HTML *********") @@ -532,6 +641,10 @@ class HeuristicProcessor(object): self.log.warn("flow is too short, not running heuristics") return html + is_abbyy = self.is_abbyy(html) + if is_abbyy: + html = self.abbyy_processor(html) + # Arrange line feeds and tags so the line_length and no_markup functions work correctly html = self.arrange_htm_line_endings(html) #self.dump(html, 'after_arrange_line_endings')