mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Move italic marking to the preprocessor. Have TXT input use the preprocessor for heuristics. Change the preprocessor's getattr calls to default to False; otherwise every option that is set to off would still run.
This commit is contained in:
parent
fabd4f5fdf
commit
cfaa113f95
@ -128,6 +128,36 @@ class PreProcessor(object):
|
|||||||
wordcount = get_wordcount_obj(word_count_text)
|
wordcount = get_wordcount_obj(word_count_text)
|
||||||
return wordcount.words
|
return wordcount.words
|
||||||
|
|
||||||
|
def markup_italicis(self, html):
    '''
    Wrap common abbreviations and plain-text emphasis markers in <i> tags.

    Two passes are made over ``html``:

    1. Every entry of ITALICIZE_WORDS that stands on its own (not glued
       to a preceding or following word character) is wrapped in
       <i>...</i>.
    2. Text enclosed in one of the ITALICIZE_STYLE_PATS marker pairs
       (underscores, slashes, asterisks, tildes and combinations of
       them) is replaced by <i>text</i>, dropping the markers.

    :param html: the markup to process
    :return: the markup with italic tags inserted
    '''
    ITALICIZE_WORDS = [
        'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
        'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'et cetera',
        'n.b.', 'N.b.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.',
        'Mdme.', 'Mlle.', 'Mons.', 'PS.', 'PPS.',
    ]

    # Compound markers come first: if the single-character patterns ran
    # first they would consume the outer marker and leave nested <i>
    # tags behind instead of one clean pair.
    # NOTE(review): the single-slash pattern matches any two slashes with
    # no angle bracket between them, which can include URLs inside
    # attribute values -- confirm callers only pass text-derived markup.
    ITALICIZE_STYLE_PATS = [
        r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
        r'(?msu)_/(?P<words>[^<>]+?)/_',
        r'(?msu)_\*(?P<words>.+?)\*_',
        r'(?msu)\*/(?P<words>[^<>]+?)/\*',
        r'(?msu)/:(?P<words>[^<>]+?):/',
        r'(?msu)\|:(?P<words>.+?):\|',
        r'(?msu)~~(?P<words>.+?)~~',
        r'(?msu)_(?P<words>.+?)_',
        r'(?msu)/(?P<words>[^<>]+?)/',
        r'(?msu)\*(?P<words>.+?)\*',
        r'(?msu)~(?P<words>.+?)~',
    ]

    # One alternation, longest entry first, applied in a single pass so
    # that no occurrence is wrapped twice. The lookarounds stop matches
    # inside longer words ("movie." must not become "mov<i>ie.</i>").
    alternatives = '|'.join(
        re.escape(word)
        for word in sorted(ITALICIZE_WORDS, key=len, reverse=True))
    word_pat = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)',
            re.UNICODE)
    html = word_pat.sub(lambda mo: '<i>%s</i>' % mo.group(0), html)

    for pat in ITALICIZE_STYLE_PATS:
        html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)

    return html
|
||||||
|
|
||||||
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
|
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
|
||||||
'''
|
'''
|
||||||
Searches for common chapter headings throughout the document
|
Searches for common chapter headings throughout the document
|
||||||
@ -360,7 +390,7 @@ class PreProcessor(object):
|
|||||||
html = self.markup_pre(html)
|
html = self.markup_pre(html)
|
||||||
|
|
||||||
# Replace series of non-breaking spaces with text-indent
|
# Replace series of non-breaking spaces with text-indent
|
||||||
if getattr(self.extra_opts, 'fix_indents', True):
|
if getattr(self.extra_opts, 'fix_indents', False):
|
||||||
html = self.fix_nbsp_indents(html)
|
html = self.fix_nbsp_indents(html)
|
||||||
|
|
||||||
if self.cleanup_required():
|
if self.cleanup_required():
|
||||||
@ -375,19 +405,21 @@ class PreProcessor(object):
|
|||||||
#self.dump(html, 'before_chapter_markup')
|
#self.dump(html, 'before_chapter_markup')
|
||||||
# detect chapters/sections to match xpath or splitting logic
|
# detect chapters/sections to match xpath or splitting logic
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'markup_chapter_headings', True):
|
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
||||||
|
|
||||||
|
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
||||||
|
html = self.markup_italicis(html)
|
||||||
|
|
||||||
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
||||||
# blank paragraphs then delete blank lines to clean up spacing
|
# blank paragraphs then delete blank lines to clean up spacing
|
||||||
if blanks_between_paragraphs and getattr(self.extra_opts,
|
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
||||||
'delete_blank_paragraphs', False):
|
|
||||||
self.log("deleting blank lines")
|
self.log("deleting blank lines")
|
||||||
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||||
html = self.blankreg.sub('', html)
|
html = self.blankreg.sub('', html)
|
||||||
|
|
||||||
###### Unwrap lines ######
|
###### Unwrap lines ######
|
||||||
if getattr(self.extra_opts, 'unwrap_lines', True):
|
if getattr(self.extra_opts, 'unwrap_lines', False):
|
||||||
# Determine line ending type
|
# Determine line ending type
|
||||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
||||||
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
||||||
@ -416,7 +448,7 @@ class PreProcessor(object):
|
|||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
html = dehyphenator(html,'html_cleanup', length)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'dehyphenate', True):
|
if getattr(self.extra_opts, 'dehyphenate', False):
|
||||||
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
|
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
|
||||||
self.log("Fixing hyphenated content")
|
self.log("Fixing hyphenated content")
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
@ -435,7 +467,7 @@ class PreProcessor(object):
|
|||||||
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'format_scene_breaks', True):
|
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||||
# Center separator lines
|
# Center separator lines
|
||||||
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
||||||
|
|
||||||
|
@ -1,58 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
|
||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from calibre import prepare_string_for_xml
|
|
||||||
|
|
||||||
class TXTHeuristicProcessor(object):
    '''
    Convert plain text to HTML while italicizing common abbreviations and
    text wrapped in plain-text emphasis markers.
    '''

    def __init__(self):
        # Abbreviations and phrases that are wrapped in <i> tags when
        # they appear as stand-alone words.
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'et cetera',
            'n.b.', 'N.b.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.',
            'Mdme.', 'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        # Marker pairs whose enclosed text is italicized. Compound
        # markers come first: if the single-character patterns ran first
        # they would consume the outer marker and leave nested <i> tags.
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
        ]

    def process_paragraph(self, paragraph):
        '''
        Return ``paragraph`` with italic markup applied.

        Entries of ITALICIZE_WORDS are wrapped in <i> tags when they are
        not glued to a neighbouring word character; text enclosed by an
        ITALICIZE_STYLE_PATS marker pair is replaced by <i>text</i>.
        '''
        if self.ITALICIZE_WORDS:
            # One alternation, longest entry first, applied in a single
            # pass so no occurrence is wrapped twice. The lookarounds
            # stop matches inside longer words such as "movie.".
            alternatives = '|'.join(
                re.escape(word) for word in
                sorted(self.ITALICIZE_WORDS, key=len, reverse=True))
            word_pat = re.compile(
                r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.UNICODE)
            paragraph = word_pat.sub(
                lambda mo: '<i>%s</i>' % mo.group(0), paragraph)

        for pat in self.ITALICIZE_STYLE_PATS:
            paragraph = re.sub(pat,
                lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
        return paragraph

    def convert(self, txt, title='', epub_split_size_kb=0):
        '''
        Build an HTML document from ``txt``.

        The text is passed through clean_txt and split_txt, split into
        paragraphs on blank lines, XML-escaped, italicized with
        process_paragraph and inserted into HTML_TEMPLATE together with
        ``title``. Chapter headings are then marked up with the
        conversion PreProcessor.

        :param txt: the text to convert
        :param title: value substituted into the template's title slot
        :param epub_split_size_kb: forwarded unchanged to split_txt
        :return: the resulting HTML
        '''
        # Imported here rather than at module level, as in the original.
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)

        processed = []
        for line in txt.split('\n\n'):
            # Escape before adding markup so the inserted <i> tags are
            # not themselves escaped.
            escaped = prepare_string_for_xml(line.replace('\n', ' '))
            processed.append(u'<p>%s</p>' % self.process_paragraph(escaped))

        txt = u'\n'.join(processed)
        txt = re.sub('[ ]{2,}', ' ', txt)
        html = HTML_TEMPLATE % (title, txt)

        from calibre.ebooks.conversion.utils import PreProcessor
        pp = PreProcessor()
        html = pp.markup_chapters(html, pp.get_word_count(html), False)

        return html
|
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
|||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
convert_heuristic, normalize_line_endings, convert_textile
|
normalize_line_endings, convert_textile
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -126,11 +126,16 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
if options.formatting_type == 'heuristic':
|
if options.formatting_type == 'heuristic':
|
||||||
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
|
setattr(options, 'enable_heuristics', True)
|
||||||
else:
|
setattr(options, 'markup_chapter_headings', True)
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
setattr(options, 'italicize_common_cases', True)
|
||||||
|
setattr(options, 'fix_indents', True)
|
||||||
|
setattr(options, 'delete_blank_paragraphs', True)
|
||||||
|
setattr(options, 'format_scene_breaks', True)
|
||||||
|
setattr(options, 'dehyphenate', True)
|
||||||
|
|
||||||
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
|
@ -12,7 +12,6 @@ import os, re
|
|||||||
|
|
||||||
from calibre import prepare_string_for_xml, isbytestring
|
from calibre import prepare_string_for_xml, isbytestring
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
|
|
||||||
@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
|
|
||||||
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
||||||
|
|
||||||
def convert_heuristic(txt, title='', epub_split_size_kb=0):
    '''
    Convert ``txt`` to HTML by delegating to a fresh
    TXTHeuristicProcessor, forwarding ``title`` and
    ``epub_split_size_kb`` unchanged.
    '''
    return TXTHeuristicProcessor().convert(txt, title, epub_split_size_kb)
|
|
||||||
|
|
||||||
def convert_markdown(txt, title='', disable_toc=False):
|
def convert_markdown(txt, title='', disable_toc=False):
|
||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
md = markdown.Markdown(
|
md = markdown.Markdown(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user