mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
first pass at abbyy processor
This commit is contained in:
parent
9b1ae4ba97
commit
8c9c5d35e4
@ -11,6 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.utils.wordcount import get_wordcount_obj
|
||||
|
||||
|
||||
class HeuristicProcessor(object):
|
||||
|
||||
def __init__(self, extra_opts=None, log=None):
|
||||
@ -38,6 +39,9 @@ class HeuristicProcessor(object):
|
||||
def is_pdftohtml(self, src):
    """Tell whether *src* carries the calibre pdftohtml marker comment.

    Only the first 1000 characters of *src* are inspected.
    """
    marker = '<!-- created by calibre\'s pdftohtml -->'
    head = src[:1000]
    return head.find(marker) >= 0
|
||||
|
||||
def is_abbyy(self, src):
    """Tell whether *src* carries the ABBYY FineReader generator meta tag.

    Only the first 1000 characters of *src* are inspected.
    """
    marker = '<meta name="generator" content="ABBYY FineReader'
    head = src[:1000]
    return head.find(marker) >= 0
|
||||
|
||||
def chapter_head(self, match):
|
||||
from calibre.utils.html2text import html2text
|
||||
chap = match.group('chap')
|
||||
@ -516,6 +520,107 @@ class HeuristicProcessor(object):
|
||||
|
||||
return scene_break
|
||||
|
||||
def abbyy_processor(self, html):
    """Rewrite ABBYY FineReader paragraph markup into simpler HTML.

    Every ``<p style="...">...</p>`` line and every ``<img>`` tag in
    *html* is visited in document order.  For paragraphs the inline style
    is reduced to ``text-indent`` / ``text-align``, padding above or below
    is turned into empty spacer paragraphs, and runs of lines with large
    side padding are wrapped in ``<blockquote>``.  Anchor tags are removed
    before processing.

    Side effects: resets and then updates ``self.in_blockquote`` and
    ``self.previous_was_paragraph``; writes progress to ``self.log.debug``.

    :param html: the document markup as a string
    :return: the rewritten markup
    """
    abbyy_line = re.compile(
        r'((?P<linestart><p\sstyle="(?P<styles>[^"]*?);?">)'
        r'(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))',
        re.IGNORECASE)
    # Require whitespace or '>' right after the tag name so that only real
    # anchors are removed, not <abbr>, <address>, <article>, ...
    anchor_tag = re.compile(r'</?a(?:\s[^>]*)?>')
    span_tag = re.compile(r'\s*</?span[^>]*>\s*')
    sentence_end = re.compile(r'.*["\'.!?:]$')
    # A plain number, optionally followed by the "pt" unit.
    point_value = re.compile(r'\s*([-+]?(?:\d+\.?\d*|\.\d+))\s*(?:pt)?\s*$')

    empty_paragraph = '\n<p> </p>\n'
    blockquote_open = '\n<blockquote>\n'
    blockquote_close = '</blockquote>\n'
    indented_text = 'text-indent:3%;'

    self.in_blockquote = False
    self.previous_was_paragraph = False
    self.log.debug("detected ABBYY content, running through processor")
    html = anchor_tag.sub('', html)

    def points(value):
        # Numeric value of a length given in points, or None when the text
        # is not a plain (optionally "pt"-suffixed) number.
        found = point_value.match(value)
        return float(found.group(1)) if found else None

    def format_points(value):
        # Whole numbers are written without a trailing ".0".
        if value == int(value):
            value = int(value)
        return str(value) + 'pt;'

    def expand_padding(setting):
        # Return [top, right, bottom, left] following the CSS shorthand
        # rules for one to four values, or None if it cannot be parsed.
        values = [points(part) for part in setting.split()]
        if None in values or not 1 <= len(values) <= 4:
            return None
        if len(values) == 1:
            values = values * 4
        elif len(values) == 2:
            values = values * 2
        elif len(values) == 3:
            values = values + [values[1]]
        return values

    def check_paragraph(content):
        # A line counts as a finished paragraph when, ignoring <span>
        # markup, it ends in closing punctuation or a quote mark.
        content = span_tag.sub('', content)
        return bool(sentence_end.match(content))

    def convert_styles(match):
        image = match.group('image')
        if image:
            # An image always terminates a running blockquote.
            closing = ''
            if self.in_blockquote:
                self.in_blockquote = False
                closing = blockquote_close
            self.previous_was_paragraph = False
            return closing + '\n' + image + '\n'

        content = match.group('content')
        is_paragraph = check_paragraph(content)
        text_align = ''
        text_indent = ''
        paragraph_before = ''
        paragraph_after = ''
        opening = ''
        closing = ''

        for declaration in match.group('styles').split(';'):
            # partition() tolerates empty declarations and values that
            # themselves contain a colon.
            style, separator, setting = declaration.partition(':')
            if not separator:
                continue
            style = style.strip()
            setting = setting.strip()

            if style == 'text-align' and setting != 'left':
                text_align = style + ':' + setting + ';'
            elif style == 'text-indent':
                indent = points(setting)
                if indent is None:
                    # Not a point value: keep the declaration untouched.
                    text_indent = style + ':' + setting + ';'
                elif 9 < indent < 14:
                    text_indent = indented_text
                else:
                    text_indent = style + ':' + format_points(indent)
            elif style == 'padding':
                padding = expand_padding(setting)
                if padding is None:
                    continue
                top, right, bottom, left = padding
                if right < 16 and left < 16:
                    # Ordinary body line.
                    if self.in_blockquote and is_paragraph:
                        self.in_blockquote = False
                        closing = blockquote_close
                        self.log.debug("ABBYY: closing blockquote before: "
                                       + match.group(0))
                    if left > 8 and text_indent == '':
                        text_indent = indented_text
                    if top > 5:
                        paragraph_before = empty_paragraph
                    if bottom > 5:
                        paragraph_after = empty_paragraph
                elif not self.in_blockquote and self.previous_was_paragraph:
                    # Heavily padded line right after a finished paragraph.
                    self.in_blockquote = True
                    opening = blockquote_open
                    self.log.debug("ABBYY: opening blockquote before: "
                                   + match.group(0))

        self.previous_was_paragraph = is_paragraph
        return (opening + closing + paragraph_before + '<p style="'
                + text_indent + text_align + '">' + content + '</p>'
                + paragraph_after)

    # Rebuild the document by hand (equivalent to abbyy_line.sub) so that a
    # blockquote still open after the last line can be closed right there.
    pieces = []
    position = 0
    for match in abbyy_line.finditer(html):
        pieces.append(html[position:match.start()])
        pieces.append(convert_styles(match))
        position = match.end()
    if self.in_blockquote:
        self.in_blockquote = False
        pieces.append(blockquote_close)
    pieces.append(html[position:])
    return ''.join(pieces)
|
||||
|
||||
|
||||
def __call__(self, html):
|
||||
self.log.debug("********* Heuristic processing HTML *********")
|
||||
@ -530,6 +635,10 @@ class HeuristicProcessor(object):
|
||||
self.log.warn("flow is too short, not running heuristics")
|
||||
return html
|
||||
|
||||
is_abbyy = self.is_abbyy(html)
|
||||
if is_abbyy:
|
||||
html = self.abbyy_processor(html)
|
||||
|
||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||
html = self.arrange_htm_line_endings(html)
|
||||
#self.dump(html, 'after_arrange_line_endings')
|
||||
|
Loading…
x
Reference in New Issue
Block a user