From 843e1f2068cf1707f7f002be7c05c37282e9fa36 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 13:17:32 -0500 Subject: [PATCH 1/4] TXT Input: Basic heuristic processor. --- src/calibre/ebooks/txt/heuristicprocessor.py | 88 ++++++++++++++++++++ src/calibre/ebooks/txt/input.py | 12 ++- src/calibre/ebooks/txt/processor.py | 23 ++++- 3 files changed, 116 insertions(+), 7 deletions(-) create mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py new file mode 100644 index 0000000000..cbfa33a96a --- /dev/null +++ b/src/calibre/ebooks/txt/heuristicprocessor.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +import re +import string + +from calibre import prepare_string_for_xml +from calibre.ebooks.unidecode.unidecoder import Unidecoder + +class TXTHeuristicProcessor(object): + + def __init__(self): + self.ITALICIZE_WORDS = [ + 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.', + 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.', + 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.', + 'Mlle.', 'Mons.', 'PS.', 'PPS.', + ] + self.ITALICIZE_STYLE_PATS = [ + r'(?msu)_(?P.+?)_', + r'(?msu)/(?P.+?)/', + r'(?msu)~~(?P.+?)~~', + r'(?msu)\*(?P.+?)\*', + r'(?msu)~(?P.+?)~', + r'(?msu)_/(?P.+?)/_', + r'(?msu)_\*(?P.+?)\*_', + r'(?msu)\*/(?P.+?)/\*', + r'(?msu)_\*/(?P.+?)/\*_', + r'(?msu)/:(?P.+?):/', + r'(?msu)\|:(?P.+?):\|', + ] + + def del_maketrans(self, deletechars): + return dict([(ord(x), u'') for x in deletechars]) + + def is_heading(self, line): + if not line: + return False + if len(line) > 40: + return False + + line = Unidecoder().decode(line) + + # punctuation. + if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')): + return False + + # All upper case. + #if line.isupper(): + # return True + # Roman numerals. + #if not line.translate(self.del_maketrans('IVXYCivxyc ')): + # return True + + return True + + def process_paragraph(self, paragraph): + for word in self.ITALICIZE_WORDS: + paragraph = paragraph.replace(word, '%s' % word) + for pat in self.ITALICIZE_STYLE_PATS: + paragraph = re.sub(pat, lambda mo: '%s' % mo.group('words'), paragraph) + return paragraph + + def convert(self, txt, title='', epub_split_size_kb=0): + from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE + txt = clean_txt(txt) + txt = split_txt(txt, epub_split_size_kb) + + processed = [] + last_was_heading = False + for line in txt.split('\n\n'): + if self.is_heading(line): + if not last_was_heading: + processed.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) + else: + processed.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) + last_was_heading = True + else: + processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) + last_was_heading = False + + txt = u'\n'.join(processed) + txt = re.sub('[ ]{2,}', ' ', txt) + + return HTML_TEMPLATE % (title, txt) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 47e92a45a9..fd805f8ce8 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces, detect_paragraph_type, detect_formatting_type + preserve_spaces, detect_paragraph_type, detect_formatting_type, \ + convert_heuristic from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin): '* print: Assume every line starting with 2+ spaces or a tab ' 'starts a paragraph.')), OptionRecommendation(name='formatting_type', recommended_value='auto', - choices=['auto', 'none', 'markdown'], + choices=['auto', 'none', 'heuristic', 'markdown'], help=_('Formatting used within the document.' '* auto: Try to auto detect the document formatting.\n' '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' @@ -96,7 +97,12 @@ class TXTInput(InputFormatPlugin): txt = separate_paragraphs_print_formatted(txt) flow_size = getattr(options, 'flow_size', 0) - html = convert_basic(txt, epub_split_size_kb=flow_size) + + if options.formatting_type == 'heuristic': + html = convert_heuristic(txt, epub_split_size_kb=flow_size) + else: + html = convert_basic(txt, epub_split_size_kb=flow_size) + from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index f6d628e7c5..79eee79c29 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -9,6 +9,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' @@ -16,7 +17,7 @@ __docformat__ = 'restructuredtext en' HTML_TEMPLATE = u'%s\n%s\n' -def convert_basic(txt, title='', epub_split_size_kb=0): +def clean_txt(txt): if isbytestring(txt): txt = txt.decode('utf-8', 'replace') # Strip whitespace from the beginning and end of the line. Also replace @@ -35,6 +36,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0): chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) illegal_chars = re.compile(u'|'.join(map(unichr, chars))) txt = illegal_chars.sub('', txt) + + return txt + +def split_txt(txt, epub_split_size_kb=0): #Takes care if there is no point to split if epub_split_size_kb > 0: if isinstance(txt, unicode): @@ -49,6 +54,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0): if isbytestring(txt): txt = txt.decode('utf-8') + return txt + +def convert_basic(txt, title='', epub_split_size_kb=0): + txt = clean_txt(txt) + txt = split_txt(txt, epub_split_size_kb) + lines = [] # Split into paragraphs based on having a blank line between text. for line in txt.split('\n\n'): @@ -57,6 +68,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0): return HTML_TEMPLATE % (title, u'\n'.join(lines)) +def convert_heuristic(txt, title='', epub_split_size_kb=0): + tp = TXTHeuristicProcessor() + return tp.convert(txt, title, epub_split_size_kb) + def convert_markdown(txt, title='', disable_toc=False): md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], @@ -111,12 +126,12 @@ def detect_paragraph_type(txt): # Check for print tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .25: + if tab_line_count / float(txt_line_count) >= .15: return 'print' # Check for block empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .25: + if empty_line_count / float(txt_line_count) >= .15: return 'block' # Nothing else matched to assume single. @@ -143,4 +158,4 @@ def detect_formatting_type(txt): if txt.count('\\'+c) > 10: return 'markdown' - return 'none' + return 'heuristic' From f593b2163154bcd61e21b0e06f8cf0e29514af86 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 13:53:32 -0500 Subject: [PATCH 2/4] TXT Input: Tweak Heuristic italicizing. --- src/calibre/ebooks/txt/heuristicprocessor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py index cbfa33a96a..b0bbd49961 100644 --- a/src/calibre/ebooks/txt/heuristicprocessor.py +++ b/src/calibre/ebooks/txt/heuristicprocessor.py @@ -21,15 +21,15 @@ class TXTHeuristicProcessor(object): ] self.ITALICIZE_STYLE_PATS = [ r'(?msu)_(?P.+?)_', - r'(?msu)/(?P.+?)/', + r'(?msu)/(?P[^<>]+?)/', r'(?msu)~~(?P.+?)~~', r'(?msu)\*(?P.+?)\*', r'(?msu)~(?P.+?)~', - r'(?msu)_/(?P.+?)/_', + r'(?msu)_/(?P[^<>]+?)/_', r'(?msu)_\*(?P.+?)\*_', - r'(?msu)\*/(?P.+?)/\*', - r'(?msu)_\*/(?P.+?)/\*_', - r'(?msu)/:(?P.+?):/', + r'(?msu)\*/(?P[^<>]+?)/\*', + r'(?msu)_\*/(?P[^<>]+?)/\*_', + r'(?msu)/:(?P[^<>]+?):/', r'(?msu)\|:(?P.+?):\|', ] @@ -84,5 +84,6 @@ class TXTHeuristicProcessor(object): txt = u'\n'.join(processed) txt = re.sub('[ ]{2,}', ' ', txt) + print txt return HTML_TEMPLATE % (title, txt) From c8f18ff02e32f56220f83872f4def00cca58e73d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 15:49:10 -0500 Subject: [PATCH 3/4] TXT Input: Heuristic processor, use PreProcessor to mark chapter headings. --- src/calibre/ebooks/txt/heuristicprocessor.py | 43 ++++---------------- src/calibre/ebooks/txt/processor.py | 3 -- 2 files changed, 7 insertions(+), 39 deletions(-) diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py index b0bbd49961..c4489badc5 100644 --- a/src/calibre/ebooks/txt/heuristicprocessor.py +++ b/src/calibre/ebooks/txt/heuristicprocessor.py @@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object): r'(?msu)\|:(?P.+?):\|', ] - def del_maketrans(self, deletechars): - return dict([(ord(x), u'') for x in deletechars]) - - def is_heading(self, line): - if not line: - return False - if len(line) > 40: - return False - - line = Unidecoder().decode(line) - - # punctuation. - if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')): - return False - - # All upper case. - #if line.isupper(): - # return True - # Roman numerals. - #if not line.translate(self.del_maketrans('IVXYCivxyc ')): - # return True - - return True - def process_paragraph(self, paragraph): for word in self.ITALICIZE_WORDS: paragraph = paragraph.replace(word, '%s' % word) @@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object): txt = split_txt(txt, epub_split_size_kb) processed = [] - last_was_heading = False for line in txt.split('\n\n'): - if self.is_heading(line): - if not last_was_heading: - processed.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) - else: - processed.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) - last_was_heading = True - else: - processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) - last_was_heading = False + processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) txt = u'\n'.join(processed) txt = re.sub('[ ]{2,}', ' ', txt) - print txt + html = HTML_TEMPLATE % (title, txt) + + from calibre.ebooks.conversion.utils import PreProcessor + pp = PreProcessor() + html = pp.markup_chapters(html, pp.get_word_count(html), False) - return HTML_TEMPLATE % (title, txt) + return html diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 1e67caccc6..9dc29e45dd 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -9,11 +9,8 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator -<<<<<<< TREE from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor -======= from calibre.ebooks.conversion.preprocess import DocAnalysis ->>>>>>> MERGE-SOURCE __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' From bd14205637cbf71fe4aad655de50f4f0fea98a60 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 15:53:51 -0500 Subject: [PATCH 4/4] ... --- src/calibre/ebooks/txt/heuristicprocessor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py index c4489badc5..c4c6a56123 100644 --- a/src/calibre/ebooks/txt/heuristicprocessor.py +++ b/src/calibre/ebooks/txt/heuristicprocessor.py @@ -5,7 +5,6 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' import re -import string from calibre import prepare_string_for_xml from calibre.ebooks.unidecode.unidecoder import Unidecoder @@ -48,7 +47,7 @@ class TXTHeuristicProcessor(object): processed = [] for line in txt.split('\n\n'): processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) - + txt = u'\n'.join(processed) txt = re.sub('[ ]{2,}', ' ', txt) html = HTML_TEMPLATE % (title, txt)