Mirror of https://github.com/kovidgoyal/calibre.git
Fix unicode issues in new preprocess code
This commit is contained in:
parent 47f93f4c9c
commit 54ee947072
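
The diff below swaps str() for unicode() wherever match groups and counters are concatenated into log messages and lookups. As a minimal Python 2 sketch of the failure mode being fixed (illustrative only, not part of the commit; the sample values are made up): str() on a unicode object implicitly encodes to ASCII and raises UnicodeEncodeError as soon as the text contains a non-ASCII character, while unicode() leaves it intact.

    # -*- coding: utf-8 -*-
    # Python 2 sketch: why str() breaks on non-ASCII input and unicode() does not.
    firsthalf, secondhalf = u'caf\xe9', u'teria'   # hypothetical word halves

    try:
        hyphenated = str(firsthalf) + "-" + str(secondhalf)          # old code path
    except UnicodeEncodeError:
        # str() tried to encode u'\xe9' to ASCII and failed
        hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)  # new code path

    print hyphenated.encode('utf-8')  # prints: café-teria
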
@@ -184,14 +184,14 @@ class Dehyphenator(object):
             wraptags = match.group('wraptags')
         except:
             wraptags = ''
-        hyphenated = str(firsthalf) + "-" + str(secondhalf)
-        dehyphenated = str(firsthalf) + str(secondhalf)
+        hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
+        dehyphenated = unicode(firsthalf) + unicode(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         try:
-            searchresult = self.html.find(str.lower(lookupword))
+            searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
         if self.format == 'html_cleanup':
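
The str.lower(lookupword) to lookupword.lower() change in the hunk above addresses a related Python 2 issue: str.lower is an unbound method of str, so passing it a unicode object raises TypeError, whereas the bound .lower() works for both str and unicode. A small illustrative sketch (the sample word is hypothetical, not from the commit):

    # -*- coding: utf-8 -*-
    # Python 2 sketch: unbound str.lower() rejects unicode, the bound method works.
    lookupword = u'Caf\xe9'              # hypothetical lookup word

    try:
        result = str.lower(lookupword)   # old code path: TypeError on unicode input
    except TypeError:
        result = lookupword.lower()      # new code path: works for str and unicode

    print result.encode('utf-8')         # prints: café
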
@@ -22,18 +22,21 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+            self.log("found " + unicode(self.html_preprocess_sections) +
+                    " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+            self.log("found " + unicode(self.html_preprocess_sections) +
+                    " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'

     def chapter_break(self, match):
         chap = match.group('section')
         styles = match.group('styles')
         self.html_preprocess_sections = self.html_preprocess_sections + 1
-        self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+        self.log("marked " + unicode(self.html_preprocess_sections) +
+                " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap

     def insert_indent(self, match):
@@ -63,7 +66,8 @@ class PreProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
+        self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+                unicode(tot_htm_ends) + " marked up endings")

         if percent > 1:
             percent = 1
@@ -71,7 +75,7 @@ class PreProcessor(object):
             percent = 0

         min_lns = tot_ln_fds * percent
-        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
+        self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True

@@ -112,7 +116,7 @@ class PreProcessor(object):
         txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
         html = txtindent.sub(self.insert_indent, html)
         if self.found_indents > 1:
-            self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+            self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
         # remove remaining non-breaking spaces
         html = re.sub(ur'\u00a0', ' ', html)
         # Get rid of empty <o:p> tags to simplify other processing
@@ -131,7 +135,8 @@ class PreProcessor(object):
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
         if len(lines) > 1:
-            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
+                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
                     'remove_paragraph_spacing', False):
                 self.log("deleting blank lines")
@@ -170,20 +175,20 @@ class PreProcessor(object):
         #print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
+        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
             chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)

         if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
@@ -207,11 +212,11 @@ class PreProcessor(object):
         # more of the lines break in the same region of the document then unwrapping is required
         docanalysis = DocAnalysis(format, html)
         hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+str(hardbreaks))
+        self.log("Hard line breaks check returned "+unicode(hardbreaks))
         # Calculate Length
         unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
         length = docanalysis.line_length(unwrap_factor)
-        self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
+        self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***")
         # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
         if hardbreaks or unwrap_factor < 0.4:
             self.log("Unwrapping required, unwrapping Lines")
@@ -240,7 +245,8 @@ class PreProcessor(object):

         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < 10:
-            self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
+            self.log("Looking for more split points based on punctuation,"
+                    " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another