# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2011, John Schember '
__docformat__ = 'restructuredtext en'

import re


class TXTHeuristicProcessor(object):
    """
    Convert plain text to HTML, italicizing common abbreviations and text
    wrapped in plain-text emphasis delimiters (``_x_``, ``*x*``, ``/x/``,
    ``~x~`` and their combinations).

    ``ITALICIZE_WORDS`` and ``ITALICIZE_STYLE_PATS`` are read once, when the
    instance is constructed, to build the compiled patterns used by
    :meth:`process_paragraph`.
    """

    def __init__(self):
        # Abbreviations and phrases that are conventionally set in italics.
        # 'et cetra' is a misspelling kept for backward compatibility; the
        # correct 'et cetera' is matched as well.
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'et cetra',
            'n.b.', 'N.b.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.',
            'Mdme.', 'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        # Order matters: compound delimiters must be tried before the simple
        # ones they contain, otherwise e.g. the plain /x/ rule consumes the
        # slashes of /:x:/ and leaves the colons behind, and _*/x/*_ ends up
        # wrapped in three nested <i> elements.
        # The slash based rules exclude < and > so they can never match
        # across the '/' of a previously inserted </i> tag.
        # NOTE(review): the simple rules have no word-boundary guards, so
        # text such as 'and/or ... either/or' or 'snake_case_name' is still
        # italicized; tighten the patterns if that proves too aggressive.
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
        ]

        self._italicize_words_re = self._compile_words(self.ITALICIZE_WORDS)
        self._italicize_style_res = [
            re.compile(pat) for pat in self.ITALICIZE_STYLE_PATS
        ]

    @staticmethod
    def _compile_words(words):
        """
        Build one regex matching any entry of *words* as a whole token.

        :param words: iterable of literal strings to match.
        :return: a compiled pattern, or ``None`` when *words* is empty (an
                 empty alternation would match at every position).
        """
        if not words:
            return None
        # Longest first so that a longer entry always wins over an entry
        # that happens to be its prefix.
        ordered = sorted(words, key=len, reverse=True)
        alternation = '|'.join(re.escape(word) for word in ordered)
        # The look-arounds stop matches inside larger words: 'eg.' must not
        # match in 'leg.', 'PS.' must not match in 'PPS.', and 'nota bene'
        # must not match in 'nota benefit'.
        return re.compile(r'(?<!\w)(?:%s)(?!\w)' % alternation, re.UNICODE)

    def process_paragraph(self, paragraph):
        """
        Add ``<i>`` markup to a single paragraph.

        :param paragraph: paragraph text. :meth:`convert` passes text that
                          has already been escaped for XML, so the only
                          literal ``<``/``>`` characters are the tags this
                          method inserts itself.
        :return: the paragraph with known abbreviations and delimiter
                 wrapped text enclosed in ``<i>...</i>``; the delimiters
                 themselves are removed.
        """
        if self._italicize_words_re is not None:
            paragraph = self._italicize_words_re.sub(r'<i>\g<0></i>', paragraph)
        for style_re in self._italicize_style_res:
            paragraph = style_re.sub(r'<i>\g<words></i>', paragraph)
        return paragraph

    def convert(self, txt, title='', epub_split_size_kb=0):
        """
        Convert *txt* to an HTML document using the heuristic markup rules.

        :param txt: text to convert; it is passed through ``clean_txt`` and
                    ``split_txt`` and then split into paragraphs on blank
                    lines.
        :param title: value substituted into ``HTML_TEMPLATE`` as the title.
                      It is inserted as given, without escaping.
        :param epub_split_size_kb: forwarded unchanged to ``split_txt``.
        :return: the HTML produced by ``PreProcessor.markup_chapters``.
        """
        # Imported here rather than at module level: processor.py imports
        # this module at import time, so a top-level import of processor
        # would be circular.
        from calibre import prepare_string_for_xml
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        from calibre.ebooks.conversion.utils import PreProcessor

        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)

        processed = []
        for chunk in txt.split('\n\n'):
            # Skip blank chunks so no empty <p></p> elements are emitted.
            if not chunk.strip():
                continue
            # Escape first, then add markup, so the inserted tags survive.
            escaped = prepare_string_for_xml(chunk.replace('\n', ' '))
            processed.append(u'<p>%s</p>' % self.process_paragraph(escaped))

        txt = u'\n'.join(processed)
        # Collapse runs of spaces left over from joining hard-wrapped lines.
        txt = re.sub('[ ]{2,}', ' ', txt)
        html = HTML_TEMPLATE % (title, txt)

        # presumably marks up chapter headings; confirm against PreProcessor
        pp = PreProcessor()
        html = pp.markup_chapters(html, pp.get_word_count(html), False)

        return html
'* auto: Try to auto detect the document formatting.\n' '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' @@ -90,7 +92,7 @@ class TXTInput(InputFormatPlugin): # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. - if options.paragraph_type == 'single' or 'unformatted': + if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted': txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) @@ -106,7 +108,12 @@ class TXTInput(InputFormatPlugin): txt = preprocessor.punctuation_unwrap(length, txt, 'txt') flow_size = getattr(options, 'flow_size', 0) - html = convert_basic(txt, epub_split_size_kb=flow_size) + + if options.formatting_type == 'heuristic': + html = convert_heuristic(txt, epub_split_size_kb=flow_size) + else: + html = convert_basic(txt, epub_split_size_kb=flow_size) + from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index c6cf1078cd..9dc29e45dd 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -9,6 +9,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor from calibre.ebooks.conversion.preprocess import DocAnalysis __license__ = 'GPL v3' @@ -17,7 +18,7 @@ __docformat__ = 'restructuredtext en' HTML_TEMPLATE = u'%s\n%s\n' -def convert_basic(txt, title='', epub_split_size_kb=0): +def clean_txt(txt): if isbytestring(txt): txt = txt.decode('utf-8', 'replace') # Strip whitespace from the beginning and end of the line. 
def convert_heuristic(txt, title='', epub_split_size_kb=0):
    """
    Convert *txt* to HTML with a fresh ``TXTHeuristicProcessor``.

    :param txt: text to convert.
    :param title: title forwarded to the processor's ``convert``.
    :param epub_split_size_kb: split size forwarded to the processor's
                               ``convert``.
    :return: whatever ``TXTHeuristicProcessor.convert`` returns.
    """
    processor = TXTHeuristicProcessor()
    html = processor.convert(txt, title, epub_split_size_kb)
    return html