mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fix use of UTF-8 raw string
This commit is contained in:
parent
cc3082135b
commit
ffe8fe5fd2
@ -144,7 +144,6 @@ class HTMLPreProcessor(object):
|
|||||||
# Fix pdftohtml markup
|
# Fix pdftohtml markup
|
||||||
PDFTOHTML = [
|
PDFTOHTML = [
|
||||||
# Fix umlauts
|
# Fix umlauts
|
||||||
# ¨
|
|
||||||
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
||||||
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
||||||
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
|
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
|
||||||
|
@ -176,8 +176,8 @@ class PreProcessor(object):
|
|||||||
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
|
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
|
||||||
#
|
#
|
||||||
# Unwrap and/or delete soft-hyphens, hyphens
|
# Unwrap and/or delete soft-hyphens, hyphens
|
||||||
html = re.sub(u'\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||||
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
|
html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
|
||||||
|
|
||||||
# Unwrap lines using punctation and line length
|
# Unwrap lines using punctation and line length
|
||||||
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||||
@ -195,6 +195,6 @@ class PreProcessor(object):
|
|||||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||||
|
|
||||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
||||||
html = blankreg.sub('\n'+'\g<openline>'+' '+'\g<closeline>', html)
|
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
Loading…
x
Reference in New Issue
Block a user