From 0ff1622bd45c14d1d8a5b86ea465e4fefdf65100 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 21 Aug 2010 12:08:09 +1000 Subject: [PATCH] Added preprocess_html to lit input, tweaked pdf preprocess chapter detection --- src/calibre/__init__.py | 2 -- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/lit/input.py | 11 +++++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 16aaab73dd..8d3a444972 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -549,5 +549,3 @@ main() ipshell = IPShellEmbed(user_ns=user_ns) ipshell() sys.argv = old_argv - - diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3a18d38b22..a0dfb5ea2b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -206,7 +206,7 @@ class HTMLPreProcessor(object): (re.compile(ur'\u00a0'), lambda match : ' '), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(||)?(Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+)?(||)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), # Have paragraphs show better diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 89873196c9..47f55686c3 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -6,8 +6,12 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import re + from calibre.customize.conversion import InputFormatPlugin +from calibre import entity_to_unicode + class LITInput(InputFormatPlugin): name = 'LIT Input' @@ -48,4 +52,11 @@ class LITInput(InputFormatPlugin): for elem in body: ne = copy.deepcopy(elem) pre.append(ne) + + + def preprocess_html(self, html): + chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) + html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) + html = re.sub(r"(?<=.{65}[a-z,\IA])\s*</(span|p|div)>\s*(</(p|span|div)>\s*<p[^>]*>(\s*<(p|span|div)>\s*</(p|span|div)[^>]*>)?\s*(</(p|span|div)>\s*<p[^>]*>)?)?\s*<(span|div|p)[^>]*>", " ", html) + return html