mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
TXT Input: New paragraph-type option (off) to disable modifying the paragraph structure. TXT Input: Rename none formatting-type to plain to correspond to the output option.
This commit is contained in:
commit
b000d470ed
@ -4,13 +4,14 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import glob
|
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, walk, xml_entity_to_unicode, guess_type
|
||||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||||
from calibre.ebooks.chardet import detect
|
from calibre.ebooks.chardet import detect
|
||||||
|
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
@ -28,20 +29,23 @@ class TXTInput(InputFormatPlugin):
|
|||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||||
choices=['auto', 'block', 'single', 'print', 'unformatted'],
|
choices=['auto', 'block', 'single', 'print', 'unformatted', 'off'],
|
||||||
help=_('Paragraph structure.\n'
|
help=_('Paragraph structure.\n'
|
||||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
|
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\', \'off\']\n'
|
||||||
'* auto: Try to auto detect paragraph type.\n'
|
'* auto: Try to auto detect paragraph type.\n'
|
||||||
'* block: Treat a blank line as a paragraph break.\n'
|
'* block: Treat a blank line as a paragraph break.\n'
|
||||||
'* single: Assume every line is a paragraph.\n'
|
'* single: Assume every line is a paragraph.\n'
|
||||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'starts a paragraph.'
|
'starts a paragraph.\n'
|
||||||
'* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
|
'* unformatted: Most lines have hard line breaks, few/no blank lines or indents. '
|
||||||
|
'Tries to determine structure and reformat the differentiate elements.\n'
|
||||||
|
'* off: Don\'t modify the paragraph structure. This is useful when combined with '
|
||||||
|
'Markdown or Textile formatting to ensure no formatting is lost.')),
|
||||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||||
choices=['auto', 'none', 'heuristic', 'textile', 'markdown'],
|
choices=['auto', 'plain', 'heuristic', 'textile', 'markdown'],
|
||||||
help=_('Formatting used within the document.'
|
help=_('Formatting used within the document.'
|
||||||
'* auto: Automatically decide which formatting processor to use.\n'
|
'* auto: Automatically decide which formatting processor to use.\n'
|
||||||
'* none: Do not process the document formatting. Everything is a '
|
'* plain: Do not process the document formatting. Everything is a '
|
||||||
'paragraph and no styling is applied.\n'
|
'paragraph and no styling is applied.\n'
|
||||||
'* heuristic: Process using heuristics to determine formatting such '
|
'* heuristic: Process using heuristics to determine formatting such '
|
||||||
'as chapter headings and italic text.\n'
|
'as chapter headings and italic text.\n'
|
||||||
@ -64,6 +68,8 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = ''
|
txt = ''
|
||||||
log.debug('Reading text from file...')
|
log.debug('Reading text from file...')
|
||||||
length = 0
|
length = 0
|
||||||
|
# [(u'path', mime),]
|
||||||
|
images = []
|
||||||
|
|
||||||
# Extract content from zip archive.
|
# Extract content from zip archive.
|
||||||
if file_ext == 'txtz':
|
if file_ext == 'txtz':
|
||||||
@ -72,10 +78,20 @@ class TXTInput(InputFormatPlugin):
|
|||||||
zf = ZipFile(stream)
|
zf = ZipFile(stream)
|
||||||
zf.extractall(tdir)
|
zf.extractall(tdir)
|
||||||
|
|
||||||
txts = glob.glob(os.path.join(tdir, '*.txt'))
|
for x in walk(tdir):
|
||||||
for t in txts:
|
if not os.path.isfile(x):
|
||||||
with open(t, 'rb') as tf:
|
continue
|
||||||
txt += tf.read()
|
if os.path.splitext(x)[1].lower() == '.txt':
|
||||||
|
with open(x, 'rb') as tf:
|
||||||
|
txt += tf.read() + '\n\n'
|
||||||
|
mt = guess_type(x)[0]
|
||||||
|
if mt in OEB_IMAGES:
|
||||||
|
path = os.path.relpath(x, tdir)
|
||||||
|
dir = os.path.join(os.getcwd(), os.path.dirname(path))
|
||||||
|
if not os.path.exists(dir):
|
||||||
|
os.makedirs(dir)
|
||||||
|
shutil.copy(x, os.path.join(os.getcwd(), path))
|
||||||
|
images.append((path, mt))
|
||||||
else:
|
else:
|
||||||
txt = stream.read()
|
txt = stream.read()
|
||||||
|
|
||||||
@ -134,7 +150,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
txt = separate_paragraphs_single_line(txt)
|
txt = separate_paragraphs_single_line(txt)
|
||||||
else:
|
elif options.paragraph_type == 'block':
|
||||||
txt = separate_hard_scene_breaks(txt)
|
txt = separate_hard_scene_breaks(txt)
|
||||||
txt = block_to_single_line(txt)
|
txt = block_to_single_line(txt)
|
||||||
|
|
||||||
@ -190,16 +206,20 @@ class TXTInput(InputFormatPlugin):
|
|||||||
htmlfile.write(html.encode('utf-8'))
|
htmlfile.write(html.encode('utf-8'))
|
||||||
odi = options.debug_pipeline
|
odi = options.debug_pipeline
|
||||||
options.debug_pipeline = None
|
options.debug_pipeline = None
|
||||||
# Generate oeb from htl conversion.
|
# Generate oeb from html conversion.
|
||||||
oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
|
oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
|
||||||
{})
|
{})
|
||||||
|
# Add images from the txtz archive to oeb.
|
||||||
|
for image, mime in images:
|
||||||
|
id, href = oeb.manifest.generate(id='image', href=image)
|
||||||
|
oeb.manifest.add(id, href, mime)
|
||||||
options.debug_pipeline = odi
|
options.debug_pipeline = odi
|
||||||
os.remove(htmlfile.name)
|
os.remove(htmlfile.name)
|
||||||
|
|
||||||
# Set metadata from file.
|
# Set metadata from file.
|
||||||
from calibre.customize.ui import get_file_type_metadata
|
from calibre.customize.ui import get_file_type_metadata
|
||||||
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
|
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
|
||||||
mi = get_file_type_metadata(stream, file_ext)
|
mi = get_file_type_metadata(stream, file_ext)
|
||||||
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
|
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
|
||||||
|
|
||||||
return oeb
|
return oeb
|
||||||
|
@ -126,7 +126,7 @@ def separate_hard_scene_breaks(txt):
|
|||||||
return '\n%s\n' % line
|
return '\n%s\n' % line
|
||||||
else:
|
else:
|
||||||
return line
|
return line
|
||||||
txt = re.sub(u'(?miu)^[ \t-=~\/]+$', lambda mo: sep_break(mo.group()), txt)
|
txt = re.sub(u'(?miu)^[ \t-=~\/_]+$', lambda mo: sep_break(mo.group()), txt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def block_to_single_line(txt):
|
def block_to_single_line(txt):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user