First pass at ABBYY processor

This commit is contained in:
ldolse 2011-02-07 01:50:17 +08:00
parent 9b1ae4ba97
commit 8c9c5d35e4

View File

@ -11,6 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj from calibre.utils.wordcount import get_wordcount_obj
class HeuristicProcessor(object): class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None): def __init__(self, extra_opts=None, log=None):
@ -38,6 +39,9 @@ class HeuristicProcessor(object):
def is_pdftohtml(self, src): def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
def is_abbyy(self, src):
    """Return True when the ABBYY FineReader generator tag opens *src*.

    Only the first 1000 characters of *src* are searched for the start of
    the ``<meta name="generator" ...>`` tag.
    """
    marker = '<meta name="generator" content="ABBYY FineReader'
    head = src[:1000]
    return head.find(marker) != -1
def chapter_head(self, match): def chapter_head(self, match):
from calibre.utils.html2text import html2text from calibre.utils.html2text import html2text
chap = match.group('chap') chap = match.group('chap')
@ -516,6 +520,107 @@ class HeuristicProcessor(object):
return scene_break return scene_break
def abbyy_processor(self, html):
abbyy_line = re.compile('((?P<linestart><p\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
empty_paragraph = '\n<p> </p>\n'
previous_line_bottom_margin = False
self.in_blockquote = False
self.previous_was_paragraph = False
print "detected ABBYY content, running through processor"
html = re.sub('</?a[^>]*>', '', html)
def check_paragraph(content):
content = re.sub('\s*</?span[^>]*>\s*', '', content)
if re.match('.*[\"\'.!?:]$', content):
#print "detected this as a paragraph"
return True
else:
return False
def convert_styles(match):
#print "raw styles are: "+match.group('styles')
content = match.group('content')
#print "raw content is: "+match.group('content')
image = match.group('image')
is_paragraph = False
text_align = ''
text_indent = ''
paragraph_before = ''
paragraph_after = ''
blockquote_open = '\n<blockquote>\n'
blockquote_close = '</blockquote>\n'
indented_text = 'text-indent:3%;'
blockquote_open_loop = ''
blockquote_close_loop = ''
debugabby = False
if image:
debugabby = True
if self.in_blockquote:
self.in_blockquote = False
blockquote_close_loop = blockquote_close
self.previous_was_paragraph = False
return blockquote_close_loop+'\n'+image+'\n'
else:
styles = match.group('styles').split(';')
is_paragraph = check_paragraph(content)
#print "styles for this line are: "+str(styles)
split_styles = []
for style in styles:
#print "style is: "+str(style)
newstyle = style.split(':')
#print "newstyle is: "+str(newstyle)
split_styles.append(newstyle)
styles = split_styles
for style, setting in styles:
if style == 'text-align' and setting != 'left':
text_align = style+':'+setting+';'
if style == 'text-indent':
setting = int(re.sub('\s*pt\s*', '', setting))
if 9 < setting < 14:
text_indent = indented_text
else:
text_indent = style+':'+str(setting)+'pt;'
if style == 'padding':
setting = re.sub('pt', '', setting).split(' ')
if int(setting[1]) < 16 and int(setting[3]) < 16:
if self.in_blockquote:
debugabby = True
if is_paragraph:
self.in_blockquote = False
blockquote_close_loop = blockquote_close
if int(setting[3]) > 8 and text_indent == '':
text_indent = indented_text
if int(setting[0]) > 5:
paragraph_before = empty_paragraph
if int(setting[2]) > 5:
paragraph_after = empty_paragraph
elif not self.in_blockquote and self.previous_was_paragraph:
debugabby = True
self.in_blockquote = True
blockquote_open_loop = blockquote_open
if debugabby:
print '\n\n******\n'
print 'padding top is: '+str(setting[0])
print 'padding right is: '+str(setting[1])
print 'padding bottom is: '+str(setting[2])
print 'padding left is: '+str(setting[3])
#print "text-align is: "+str(text_align)
print "\n***\nline is:\n "+str(match.group(0))+'\n'
if debugabby:
#print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
print "styles for this line were: "+str(styles)
print 'newline is: \n'+blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after+'\n\n\n\n\n'
print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
self.previous_was_paragraph = is_paragraph
print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after
html = abbyy_line.sub(convert_styles, html)
return html
def __call__(self, html): def __call__(self, html):
self.log.debug("********* Heuristic processing HTML *********") self.log.debug("********* Heuristic processing HTML *********")
@ -530,6 +635,10 @@ class HeuristicProcessor(object):
self.log.warn("flow is too short, not running heuristics") self.log.warn("flow is too short, not running heuristics")
return html return html
is_abbyy = self.is_abbyy(html)
if is_abbyy:
html = self.abbyy_processor(html)
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html) html = self.arrange_htm_line_endings(html)
#self.dump(html, 'after_arrange_line_endings') #self.dump(html, 'after_arrange_line_endings')