diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 632a7a3291..b105a6c042 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -23,6 +23,14 @@ def sanitize_head(match): x = _span_pat.sub('', x) return '\n'+x+'\n' +def chap_head(match): + chap = match.group('chap') + title = match.group('title') + if not title: + return '

'+chap+'


' + else: + return '

'+chap+'
'+title+'


' + class CSSPreProcessor(object): @@ -54,8 +62,9 @@ class HTMLPreProcessor(object): (re.compile(r'', re.IGNORECASE), lambda match: '
'), # Remove page numbers (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), - # Remove
and replace

with

+ # Replace

with

(re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), + # Remove
(re.compile(r'(.*)', re.IGNORECASE), lambda match: match.group() if \ re.match('<', match.group(1).lstrip()) or \ @@ -69,15 +78,22 @@ class HTMLPreProcessor(object): # Remove non breaking spaces (re.compile(ur'\u00a0'), lambda match : ' '), + # Detect Chapters to match default XPATH in GUI + (re.compile(r'(]*>)?(]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), + (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Un wrap lines - (re.compile(r'(?<=\w)\s*</(i|b|u)>\s*<p.*?>\s*<(i|b|u)>\s*(?=\w)'), lambda match: ' '), - (re.compile(r'(?<=\w)\s*<p.*?>\s*(?=\w)', re.UNICODE), lambda match: ' '), + (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '), + # Clean up spaces - (re.compile(u'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '), - ] + (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), + # Add space before and after italics + (re.compile(r'(?<!“)<i>'), lambda match: ' <i>'), + (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), + ] # Fix Book Designer markup BOOK_DESIGNER = [ diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 4d8516f6c3..a5ee619937 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -98,7 +98,7 @@ def get_cover(stream): data = cStringIO.StringIO() try: - StreamReadWrapper(stream) as stream: + with StreamReadWrapper(stream) as stream: pdf = PdfFileReader(stream) output = PdfFileWriter()