mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fixed a UTF-8 decoding error that occurs on bad input while removing hyphens; cleaned up indentation; tweaked regexes
This commit is contained in:
parent
f5431765f4
commit
b7f6d820a7
@ -75,7 +75,7 @@ class DocAnalysis(object):
|
|||||||
if format == 'html':
|
if format == 'html':
|
||||||
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
||||||
elif format == 'pdf':
|
elif format == 'pdf':
|
||||||
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||||
elif format == 'spanned_html':
|
elif format == 'spanned_html':
|
||||||
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||||
self.lines = linere.findall(raw)
|
self.lines = linere.findall(raw)
|
||||||
@ -191,18 +191,21 @@ class Dehyphenator(object):
|
|||||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||||
if self.prefixes.match(firsthalf) is None:
|
if self.prefixes.match(firsthalf) is None:
|
||||||
lookupword = self.removeprefix.sub('', lookupword)
|
lookupword = self.removeprefix.sub('', lookupword)
|
||||||
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
||||||
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
|
try:
|
||||||
|
searchresult = self.html.find(str.lower(lookupword))
|
||||||
|
except:
|
||||||
|
return hyphenated
|
||||||
if self.format == 'html_cleanup':
|
if self.format == 'html_cleanup':
|
||||||
if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
|
if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
|
||||||
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
||||||
return dehyphenated
|
return dehyphenated
|
||||||
elif self.html.find(hyphenated) != -1:
|
elif self.html.find(hyphenated) != -1:
|
||||||
#print "Cleanup:returned hyphenated word: " + str(hyphenated)
|
#print "Cleanup:returned hyphenated word: " + str(hyphenated)
|
||||||
return hyphenated
|
return hyphenated
|
||||||
else:
|
else:
|
||||||
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
|
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
|
||||||
return firsthalf+u'\u2014'+wraptags+secondhalf
|
return firsthalf+u'\u2014'+wraptags+secondhalf
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
|
if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
|
||||||
|
@ -145,7 +145,7 @@ class PreProcessor(object):
|
|||||||
#
|
#
|
||||||
# Build the Regular Expressions in pieces
|
# Build the Regular Expressions in pieces
|
||||||
lookahead = "(?=<(p|div))"
|
lookahead = "(?=<(p|div))"
|
||||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>span|[ibu])[^>]*>)?\s*(<(?P<inner2>span|[ibu])[^>]*>)?\s*(<(?P<inner3>span|[ibu])[^>]*>)?\s*"
|
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||||
chapter_header_open = r"(?P<chap>"
|
chapter_header_open = r"(?P<chap>"
|
||||||
chapter_header_close = ")\s*"
|
chapter_header_close = ")\s*"
|
||||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
|
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
|
||||||
@ -154,7 +154,7 @@ class PreProcessor(object):
|
|||||||
else:
|
else:
|
||||||
blank_lines = ""
|
blank_lines = ""
|
||||||
opt_title_open = "("
|
opt_title_open = "("
|
||||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>span|[ibu])[^>]*>)?\s*(<(?P<inner5>span|[ibu])[^>]*>)?\s*(<(?P<inner6>span|[ibu])[^>]*>)?\s*"
|
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
||||||
title_header_open = "(?P<title>"
|
title_header_open = "(?P<title>"
|
||||||
title_header_close = ")\s*"
|
title_header_close = ")\s*"
|
||||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
|
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user