mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Basic heuristic processor.
This commit is contained in:
parent
435ea9a2df
commit
843e1f2068
88
src/calibre/ebooks/txt/heuristicprocessor.py
Normal file
88
src/calibre/ebooks/txt/heuristicprocessor.py
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
import string
|
||||||
|
|
||||||
|
from calibre import prepare_string_for_xml
|
||||||
|
from calibre.ebooks.unidecode.unidecoder import Unidecoder
|
||||||
|
|
||||||
|
class TXTHeuristicProcessor(object):
    '''
    Turn plain text into simple HTML using a few heuristics.

    Blocks of text separated by a blank line are emitted either as headings
    (short lines made up only of letters, digits, spaces, ``:`` and ``-``)
    or as paragraphs. Inside paragraphs a fixed list of abbreviations and
    any text wrapped in common plain-text emphasis markers
    (``_x_``, ``/x/``, ``*x*``, ``~x~`` and combinations) is wrapped in
    ``<i>`` tags.
    '''

    def __init__(self):
        # Abbreviations and foreign phrases that are always italicized.
        # 'et cetra' is a historical misspelling that is kept so existing
        # behaviour does not change; the correct spelling is listed as well.
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'et cetera',
            'n.b.', 'N.b.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.',
            'Mdme.', 'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        # Emphasis markers. Order matters: when two patterns match at the
        # same position the one listed first wins, so compound delimiters
        # must come before the single character delimiters they contain.
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_\*/(?P<words>.+?)/\*_',
            r'(?msu)_/(?P<words>.+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>.+?)/\*',
            r'(?msu)/:(?P<words>.+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>.+?)/',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
        ]
        self._italic_matchers = self._build_italic_matchers()
        # Characters a heading may contain. string.ascii_letters is used
        # instead of the locale dependent string.letters because the line
        # is transliterated to ASCII before it is checked.
        self._heading_table = self.del_maketrans(
            string.ascii_letters + string.digits + ' :-')

    def _build_italic_matchers(self):
        '''
        Compile the word list and the style patterns into regex objects.

        Every returned regex exposes the text to italicize through the
        named group ``words``.
        '''
        # Longest alternative first so that 'PPS.' is preferred over 'PS.'.
        # The look-behind stops matches that start in the middle of a word
        # ('eg.' inside 'leg.', 'ie.' inside 'die.').
        words = sorted(self.ITALICIZE_WORDS, key=len, reverse=True)
        word_pat = r'(?<!\w)(?P<words>%s)' % '|'.join(
            re.escape(word) for word in words)
        matchers = [re.compile(word_pat, re.UNICODE)]
        matchers.extend(re.compile(pat) for pat in self.ITALICIZE_STYLE_PATS)
        return matchers

    def del_maketrans(self, deletechars):
        '''
        Build a table for ``unicode.translate`` that deletes characters.

        :param deletechars: Iterable of single characters to remove.
        :return: dict mapping each character's ordinal to the empty string.
        '''
        return dict((ord(x), u'') for x in deletechars)

    def is_heading(self, line):
        '''
        Decide whether a block of text looks like a heading.

        A heading is at most 40 characters long and, once transliterated to
        ASCII, contains nothing but letters, digits, spaces, ``:`` and ``-``
        with at least one letter or digit.

        :param line: The raw text block (unescaped).
        :return: True if the block should be rendered as a heading.
        '''
        if not line:
            return False
        if len(line) > 40:
            return False

        line = Unidecoder().decode(line)

        # Anything left after removing the allowed characters is punctuation
        # (or a line break), which a heading must not contain.
        if line.translate(self._heading_table):
            return False

        # Separator lines such as '---' or '- - -' contain only allowed
        # characters but are not headings.
        if not line.strip(' :-'):
            return False

        # NOTE: all upper case and roman numeral checks were written but
        # left disabled by the original author; every line that gets this
        # far is treated as a heading.
        return True

    def process_paragraph(self, paragraph):
        '''
        Wrap known abbreviations and marked-up text in ``<i>`` tags.

        The paragraph is scanned once from left to right. At every step the
        earliest match of any pattern is taken (ties go to the pattern
        listed first) and the scan resumes after it, so markup generated
        for one match is never re-examined by another pattern.

        :param paragraph: Paragraph text, already escaped for XML.
        :return: The paragraph with ``<i>...</i>`` inserted.
        '''
        pieces = []
        pos = 0
        while True:
            best = None
            for matcher in self._italic_matchers:
                mo = matcher.search(paragraph, pos)
                if mo is not None and (best is None or
                        mo.start() < best.start()):
                    best = mo
            if best is None:
                break
            pieces.append(paragraph[pos:best.start()])
            pieces.append('<i>%s</i>' % best.group('words'))
            # Matches are never empty, so this always advances.
            pos = best.end()
        pieces.append(paragraph[pos:])
        return ''.join(pieces)

    def convert(self, txt, title='', epub_split_size_kb=0):
        '''
        Convert text to an HTML document.

        :param txt: The text to convert.
        :param title: Inserted into the ``<title>`` of the HTML template.
        :param epub_split_size_kb: Passed through to ``split_txt``.
        :return: The HTML document as a string.
        '''
        # Imported here because the processor module imports this module.
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)

        processed = []
        last_was_heading = False
        for block in txt.split('\n\n'):
            escaped = prepare_string_for_xml(block.replace('\n', ' '))
            if self.is_heading(block):
                # The first heading in a run is the main heading, headings
                # directly following it are sub headings.
                tag = u'h2' if last_was_heading else u'h1'
                processed.append(u'<%s>%s</%s>' % (tag, escaped, tag))
                last_was_heading = True
            else:
                processed.append(u'<p>%s</p>' % self.process_paragraph(escaped))
                last_was_heading = False

        txt = u'\n'.join(processed)
        # Collapse runs of spaces left over from joining wrapped lines.
        txt = re.sub('[ ]{2,}', ' ', txt)

        return HTML_TEMPLATE % (title, txt)
|
@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
|||||||
from calibre.ebooks.chardet import detect
|
from calibre.ebooks.chardet import detect
|
||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
|
convert_heuristic
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'starts a paragraph.')),
|
'starts a paragraph.')),
|
||||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||||
choices=['auto', 'none', 'markdown'],
|
choices=['auto', 'none', 'heuristic', 'markdown'],
|
||||||
help=_('Formatting used within the document.'
|
help=_('Formatting used within the document.'
|
||||||
'* auto: Try to auto detect the document formatting.\n'
|
'* auto: Try to auto detect the document formatting.\n'
|
||||||
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
||||||
@ -96,7 +97,12 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = separate_paragraphs_print_formatted(txt)
|
txt = separate_paragraphs_print_formatted(txt)
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
|
||||||
|
if options.formatting_type == 'heuristic':
|
||||||
|
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
|
||||||
|
else:
|
||||||
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
|
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
html_input = plugin_for_input_format('html')
|
html_input = plugin_for_input_format('html')
|
||||||
|
@ -9,6 +9,7 @@ import os, re
|
|||||||
from calibre import prepare_string_for_xml, isbytestring
|
from calibre import prepare_string_for_xml, isbytestring
|
||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
|
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
@ -16,7 +17,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
||||||
|
|
||||||
def convert_basic(txt, title='', epub_split_size_kb=0):
|
def clean_txt(txt):
|
||||||
if isbytestring(txt):
|
if isbytestring(txt):
|
||||||
txt = txt.decode('utf-8', 'replace')
|
txt = txt.decode('utf-8', 'replace')
|
||||||
# Strip whitespace from the beginning and end of the line. Also replace
|
# Strip whitespace from the beginning and end of the line. Also replace
|
||||||
@ -35,6 +36,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
|
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
|
||||||
illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
|
illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
|
||||||
txt = illegal_chars.sub('', txt)
|
txt = illegal_chars.sub('', txt)
|
||||||
|
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def split_txt(txt, epub_split_size_kb=0):
|
||||||
#Takes care if there is no point to split
|
#Takes care if there is no point to split
|
||||||
if epub_split_size_kb > 0:
|
if epub_split_size_kb > 0:
|
||||||
if isinstance(txt, unicode):
|
if isinstance(txt, unicode):
|
||||||
@ -49,6 +54,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
if isbytestring(txt):
|
if isbytestring(txt):
|
||||||
txt = txt.decode('utf-8')
|
txt = txt.decode('utf-8')
|
||||||
|
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def convert_basic(txt, title='', epub_split_size_kb=0):
|
||||||
|
txt = clean_txt(txt)
|
||||||
|
txt = split_txt(txt, epub_split_size_kb)
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
# Split into paragraphs based on having a blank line between text.
|
# Split into paragraphs based on having a blank line between text.
|
||||||
for line in txt.split('\n\n'):
|
for line in txt.split('\n\n'):
|
||||||
@ -57,6 +68,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
|
|
||||||
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
||||||
|
|
||||||
|
def convert_heuristic(txt, title='', epub_split_size_kb=0):
    '''
    Convert text to HTML with a fresh TXTHeuristicProcessor.

    All three arguments are handed unchanged to
    TXTHeuristicProcessor.convert and its result is returned.
    '''
    return TXTHeuristicProcessor().convert(txt, title, epub_split_size_kb)
|
||||||
|
|
||||||
def convert_markdown(txt, title='', disable_toc=False):
|
def convert_markdown(txt, title='', disable_toc=False):
|
||||||
md = markdown.Markdown(
|
md = markdown.Markdown(
|
||||||
extensions=['footnotes', 'tables', 'toc'],
|
extensions=['footnotes', 'tables', 'toc'],
|
||||||
@ -111,12 +126,12 @@ def detect_paragraph_type(txt):
|
|||||||
|
|
||||||
# Check for print
|
# Check for print
|
||||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||||
if tab_line_count / float(txt_line_count) >= .25:
|
if tab_line_count / float(txt_line_count) >= .15:
|
||||||
return 'print'
|
return 'print'
|
||||||
|
|
||||||
# Check for block
|
# Check for block
|
||||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||||
if empty_line_count / float(txt_line_count) >= .25:
|
if empty_line_count / float(txt_line_count) >= .15:
|
||||||
return 'block'
|
return 'block'
|
||||||
|
|
||||||
# Nothing else matched to assume single.
|
# Nothing else matched to assume single.
|
||||||
@ -143,4 +158,4 @@ def detect_formatting_type(txt):
|
|||||||
if txt.count('\\'+c) > 10:
|
if txt.count('\\'+c) > 10:
|
||||||
return 'markdown'
|
return 'markdown'
|
||||||
|
|
||||||
return 'none'
|
return 'heuristic'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user