From cfaa113f9557b9359208409a538302d9ec0af1d4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 15 Jan 2011 09:05:08 -0500 Subject: [PATCH] Move italic marking to preprocessor. Have TXT input use the preprocessor for heuristics. Change preprocessor getattr to default to False otherwise every option set to off will run. --- src/calibre/ebooks/conversion/utils.py | 46 +++++++++++++--- src/calibre/ebooks/txt/heuristicprocessor.py | 58 -------------------- src/calibre/ebooks/txt/input.py | 13 +++-- src/calibre/ebooks/txt/processor.py | 5 -- 4 files changed, 48 insertions(+), 74 deletions(-) delete mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2a88d371cc..56d4339d8c 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -128,6 +128,36 @@ class PreProcessor(object): wordcount = get_wordcount_obj(word_count_text) return wordcount.words + def markup_italicis(self, html): + ITALICIZE_WORDS = [ + 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.', + 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.', + 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.', + 'Mlle.', 'Mons.', 'PS.', 'PPS.', + ] + + ITALICIZE_STYLE_PATS = [ + r'(?msu)_(?P.+?)_', + r'(?msu)/(?P[^<>]+?)/', + r'(?msu)~~(?P.+?)~~', + r'(?msu)\*(?P.+?)\*', + r'(?msu)~(?P.+?)~', + r'(?msu)_/(?P[^<>]+?)/_', + r'(?msu)_\*(?P.+?)\*_', + r'(?msu)\*/(?P[^<>]+?)/\*', + r'(?msu)_\*/(?P[^<>]+?)/\*_', + r'(?msu)/:(?P[^<>]+?):/', + r'(?msu)\|:(?P.+?):\|', + ] + + for word in ITALICIZE_WORDS: + html = html.replace(word, '%s' % word) + + for pat in ITALICIZE_STYLE_PATS: + html = re.sub(pat, lambda mo: '%s' % mo.group('words'), html) + + return html + def markup_chapters(self, html, wordcount, blanks_between_paragraphs): ''' Searches for common chapter headings throughout the document @@ -360,7 +390,7 @@ class PreProcessor(object): html = self.markup_pre(html) # Replace series of non-breaking spaces with text-indent - if getattr(self.extra_opts, 'fix_indents', True): + if getattr(self.extra_opts, 'fix_indents', False): html = self.fix_nbsp_indents(html) if self.cleanup_required(): @@ -375,19 +405,21 @@ class PreProcessor(object): #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic - if getattr(self.extra_opts, 'markup_chapter_headings', True): + if getattr(self.extra_opts, 'markup_chapter_headings', False): html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs) + if getattr(self.extra_opts, 'italicize_common_cases', False): + html = self.markup_italicis(html) + # If more than 40% of the lines are empty paragraphs and the user has enabled delete # blank paragraphs then delete blank lines to clean up spacing - if blanks_between_paragraphs and getattr(self.extra_opts, - 'delete_blank_paragraphs', False): + if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log("deleting blank lines") html = self.multi_blank.sub('\n

', html) html = self.blankreg.sub('', html) ###### Unwrap lines ###### - if getattr(self.extra_opts, 'unwrap_lines', True): + if getattr(self.extra_opts, 'unwrap_lines', False): # Determine line ending type # Some OCR sourced files have line breaks in the html using a combination of span & p tags # span are used for hard line breaks, p for new paragraphs. Determine which is used so @@ -416,7 +448,7 @@ class PreProcessor(object): dehyphenator = Dehyphenator() html = dehyphenator(html,'html_cleanup', length) - if getattr(self.extra_opts, 'dehyphenate', True): + if getattr(self.extra_opts, 'dehyphenate', False): # dehyphenate in cleanup mode to fix anything previous conversions/editing missed self.log("Fixing hyphenated content") dehyphenator = Dehyphenator() @@ -435,7 +467,7 @@ class PreProcessor(object): doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE) html = doubleheading.sub('\g'+'\n'+'', html) - if getattr(self.extra_opts, 'format_scene_breaks', True): + if getattr(self.extra_opts, 'format_scene_breaks', False): # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py deleted file mode 100644 index b9d18fd23a..0000000000 --- a/src/calibre/ebooks/txt/heuristicprocessor.py +++ /dev/null @@ -1,58 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' -__docformat__ = 'restructuredtext en' - -import re - -from calibre import prepare_string_for_xml - -class TXTHeuristicProcessor(object): - - def __init__(self): - self.ITALICIZE_WORDS = [ - 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.', - 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.', - 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.', - 'Mlle.', 'Mons.', 'PS.', 'PPS.', - ] - self.ITALICIZE_STYLE_PATS = [ - r'(?msu)_(?P.+?)_', - r'(?msu)/(?P[^<>]+?)/', - r'(?msu)~~(?P.+?)~~', - r'(?msu)\*(?P.+?)\*', - r'(?msu)~(?P.+?)~', - r'(?msu)_/(?P[^<>]+?)/_', - r'(?msu)_\*(?P.+?)\*_', - r'(?msu)\*/(?P[^<>]+?)/\*', - r'(?msu)_\*/(?P[^<>]+?)/\*_', - r'(?msu)/:(?P[^<>]+?):/', - r'(?msu)\|:(?P.+?):\|', - ] - - def process_paragraph(self, paragraph): - for word in self.ITALICIZE_WORDS: - paragraph = paragraph.replace(word, '%s' % word) - for pat in self.ITALICIZE_STYLE_PATS: - paragraph = re.sub(pat, lambda mo: '%s' % mo.group('words'), paragraph) - return paragraph - - def convert(self, txt, title='', epub_split_size_kb=0): - from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE - txt = clean_txt(txt) - txt = split_txt(txt, epub_split_size_kb) - - processed = [] - for line in txt.split('\n\n'): - processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) - - txt = u'\n'.join(processed) - txt = re.sub('[ ]{2,}', ' ', txt) - html = HTML_TEMPLATE % (title, txt) - - from calibre.ebooks.conversion.utils import PreProcessor - pp = PreProcessor() - html = pp.markup_chapters(html, pp.get_word_count(html), False) - - return html diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 0b0bd6d570..5cffbafe21 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - convert_heuristic, normalize_line_endings, convert_textile + normalize_line_endings, convert_textile from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -126,11 +126,16 @@ class TXTInput(InputFormatPlugin): txt = preprocessor.punctuation_unwrap(length, txt, 'txt') flow_size = getattr(options, 'flow_size', 0) + html = convert_basic(txt, epub_split_size_kb=flow_size) if options.formatting_type == 'heuristic': - html = convert_heuristic(txt, epub_split_size_kb=flow_size) - else: - html = convert_basic(txt, epub_split_size_kb=flow_size) + setattr(options, 'enable_heuristics', True) + setattr(options, 'markup_chapter_headings', True) + setattr(options, 'italicize_common_cases', True) + setattr(options, 'fix_indents', True) + setattr(options, 'delete_blank_paragraphs', True) + setattr(options, 'format_scene_breaks', True) + setattr(options, 'dehyphenate', True) # Dehyphenate in cleanup mode for missed txt and markdown conversion dehyphenator = Dehyphenator() diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index e1979063c0..9fd8af0d70 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -12,7 +12,6 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor from calibre.ebooks.conversion.preprocess import DocAnalysis from calibre.utils.cleantext import clean_ascii_chars @@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0): return HTML_TEMPLATE % (title, u'\n'.join(lines)) -def convert_heuristic(txt, title='', epub_split_size_kb=0): - tp = TXTHeuristicProcessor() - return tp.convert(txt, title, epub_split_size_kb) - def convert_markdown(txt, title='', disable_toc=False): from calibre.ebooks.markdown import markdown md = markdown.Markdown(