diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 54639df93c..da20af6e8a 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -379,7 +379,7 @@ class HTMLPreProcessor(object):
(re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
\n
' + match.group(1) + '
'), + (re.compile(u'\n
' + match.group(1) + '
'), # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), @@ -483,7 +483,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 4d017b7df4..3e809c39e3 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -207,7 +207,7 @@ class HeuristicProcessor(object):
n_lookahead_open = "\s+(?!"
n_lookahead_close = ")"
- default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\'’\"-]+\s{0,3}){1,5}?([ibu][^>]*>)?(?=<)"
+ default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?([ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?([ibu][^>]*>)?(?=<)"
analysis_result = []
@@ -300,7 +300,7 @@ class HeuristicProcessor(object):
supports a range of html markup and text files
'''
# define the pieces of the regex
- lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*((span|[iubp]|div)>)?"
@@ -526,7 +526,7 @@ class HeuristicProcessor(object):
if getattr(self.extra_opts, 'format_scene_breaks', False):
# Center separator lines
- html = re.sub(u'<(?P ' + '\g ' + '\g ]*>\s*