From 267eebb9aa489cc443e57e90a9353730345af0c3 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 21 Nov 2010 15:38:18 +0800
Subject: [PATCH] adjusted preprocessing regexes for hyphen removal and chapter
marking
---
src/calibre/ebooks/conversion/preprocess.py | 2 +-
src/calibre/ebooks/conversion/utils.py | 10 +++++-----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ef092f7954..3ff816b3bf 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -220,7 +220,7 @@ class Dehyphenator(object):
self.html = html
self.format = format
if format == 'html':
- intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*([iubp]>\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*
\s*)?(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
elif format == 'individual_words':
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 20689c6950..feb74324e8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -107,7 +107,7 @@ class PreProcessor(object):
# Arrange line feeds and
tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*
", "\n", html)
html = re.sub(r"\s*\s*", "\n
", html)
-
+
###### Check Markup ######
#
# some lit files don't have any
tags or equivalent (generally just plain text between
@@ -191,10 +191,10 @@ class PreProcessor(object):
n_lookahead_close = ")"
default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
- typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
- numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
- uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
- numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
+ typical_chapters = r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+ numeric_chapters = r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*"
+ uppercase_chapters = r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+ numeric_titles = r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
emphasized_lines = r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*"
full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close