mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fixed a UTF-8 decoding error which occurs on bad input while removing hyphens, cleaned up indents, regex tweaks
This commit is contained in:
parent
f5431765f4
commit
b7f6d820a7
@ -75,7 +75,7 @@ class DocAnalysis(object):
|
||||
if format == 'html':
|
||||
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
||||
elif format == 'pdf':
|
||||
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
||||
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||
elif format == 'spanned_html':
|
||||
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||
self.lines = linere.findall(raw)
|
||||
@ -191,8 +191,11 @@ class Dehyphenator(object):
|
||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||
if self.prefixes.match(firsthalf) is None:
|
||||
lookupword = self.removeprefix.sub('', lookupword)
|
||||
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
||||
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
|
||||
print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
||||
try:
|
||||
searchresult = self.html.find(str.lower(lookupword))
|
||||
except:
|
||||
return hyphenated
|
||||
if self.format == 'html_cleanup':
|
||||
if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
|
||||
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
||||
|
@ -145,7 +145,7 @@ class PreProcessor(object):
|
||||
#
|
||||
# Build the Regular Expressions in pieces
|
||||
lookahead = "(?=<(p|div))"
|
||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>span|[ibu])[^>]*>)?\s*(<(?P<inner2>span|[ibu])[^>]*>)?\s*(<(?P<inner3>span|[ibu])[^>]*>)?\s*"
|
||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||
chapter_header_open = r"(?P<chap>"
|
||||
chapter_header_close = ")\s*"
|
||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
|
||||
@ -154,7 +154,7 @@ class PreProcessor(object):
|
||||
else:
|
||||
blank_lines = ""
|
||||
opt_title_open = "("
|
||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>span|[ibu])[^>]*>)?\s*(<(?P<inner5>span|[ibu])[^>]*>)?\s*(<(?P<inner6>span|[ibu])[^>]*>)?\s*"
|
||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
||||
title_header_open = "(?P<title>"
|
||||
title_header_close = ")\s*"
|
||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
|
||||
|
Loading…
x
Reference in New Issue
Block a user