font|span|[ibu])[^>]*>)?\s*")
self.line_close = "((?P=inner3)>)?\\s*((?P=inner2)>)?\\s*((?P=inner1)>)?\\s*(?P=outer)>"
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*(p|div)>)', re.IGNORECASE)
self.scene_break_open = ''
- self.common_in_text_endings = u'[\"\'—’”,\\.!\\?\\…\\)„\\w]'
- self.common_in_text_beginnings = u'[\\w\'\"“‘‛]'
+ self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
+ self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
# Return True when the marker string occurs within the first 1000 characters
# of src; text beyond that prefix is never inspected.
# NOTE(review): the marker literal below is empty in this copy. An empty
# string is a substring of every str, so as written this test cannot return
# False for str input. The name suggests it is meant to detect pdftohtml
# output, and the marker text was presumably lost when this diff was
# extracted (markup appears stripped from other literals here too) --
# TODO confirm against the upstream file and restore the marker.
def is_pdftohtml(self, src):
return '' in src[:1000]
@@ -215,7 +217,8 @@ class HeuristicProcessor(object):
# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
chapter_line_open = self.line_open
- title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" # noqa
+ title_line_open = (r"<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?"
+ r"\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*")
chapter_header_open = r"(?P"
title_header_open = r"(?P"
chapter_header_close = ")\\s*"
@@ -243,7 +246,9 @@ class HeuristicProcessor(object):
analysis_result = []
chapter_types = [
- [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], # noqa
+ [(
+ r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
+ r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
# Highest frequency headings which include titles
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
[r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*",
@@ -509,11 +514,14 @@ class HeuristicProcessor(object):
def detect_whitespace(self, html):
blanks_around_headings = re.compile(
- r'(?P(<(p|div)[^>]*>\s*(p|div)>\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
+ r'(?P(<(p|div)[^>]*>\s*(p|div)>\s*){1,}\s*)?'
+ r'(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_around_scene_breaks = re.compile(
- r'(?P(<(p|div)[^>]*>\s*(p|div)>\s*){1,}\s*)?(?P]*>.*?
)(?P\s*(<(p|div)[^>]*>\s*(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
+ r'(?P(<(p|div)[^>]*>\s*(p|div)>\s*){1,}\s*)?'
+ r'(?P]*>.*?
)(?P\s*(<(p|div)[^>]*>\s*(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_n_nopunct = re.compile(
- r'(?P(]*>\s*
\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W]((span|[ibu]|em|strong|font)>\s*)*
(?P\s*(]*>\s*
\s*){1,})?', re.IGNORECASE|re.DOTALL) # noqa
+ r'(?P(]*>\s*
\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
+ r'.{1,100}?[^\W]((span|[ibu]|em|strong|font)>\s*)*
(?P\s*(]*>\s*
\s*){1,})?', re.IGNORECASE|re.DOTALL)
def merge_header_whitespace(match):
initblanks = match.group('initparas')
@@ -826,7 +834,10 @@ class HeuristicProcessor(object):
self.log.debug("Looking for more split points based on punctuation,"
" currently have " + unicode_type(self.html_preprocess_sections))
chapdetect3 = re.compile(
- r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE) # noqa
+ r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(?!([\W]+\s*)+)'
+ r'(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*'
+ r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*'
+ r'()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
if getattr(self.extra_opts, 'renumber_headings', False):