TXT Input: Attempt to detect the input encoding when not specified. Auto detect paragraph structure and formatting markup. FB2 Output: Insert covers. Fixes #8172 (another fb2 problem in 0.7.37)

This commit is contained in:
Kovid Goyal 2011-01-06 14:32:31 -07:00
commit 91ba0d2df4
26 changed files with 388 additions and 262 deletions

View File

@ -13,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class NewJournalOfPhysics(BasicNewsRecipe): class NewJournalOfPhysics(BasicNewsRecipe):
title = u'New Journal of Physics' title = u'New Journal of Physics'
__author__ = u'Chema Cortés' __author__ = u'Chema Cort\xe9s'
description = u'The open-access journal for physics' description = u'The open-access journal for physics'
publisher = u'IOP (Institute of Physics)' publisher = u'IOP (Institute of Physics)'
category = 'physics, journal, science' category = 'physics, journal, science'

View File

@ -16,6 +16,7 @@ import uuid
from lxml import etree from lxml import etree
from calibre import guess_type
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__ from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
@ -161,6 +162,23 @@ class FB2MLizer(object):
text.append('<section>') text.append('<section>')
self.section_level += 1 self.section_level += 1
# Insert the title page / cover into the spine if it is not already referenced.
title_name = u''
if 'titlepage' in self.oeb_book.guide:
title_name = 'titlepage'
elif 'cover' in self.oeb_book.guide:
title_name = 'cover'
if title_name:
title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
self.oeb_book.spine.insert(0, title_item, True)
# Create xhtml page to reference cover image so it can be used.
if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
id = unicode(self.oeb_book.metadata.cover[0])
cover_item = self.oeb_book.manifest.ids[id]
if cover_item.media_type in OEB_RASTER_IMAGES:
self.insert_image_cover(cover_item.href)
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href) self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
@ -185,6 +203,17 @@ class FB2MLizer(object):
return ''.join(text) + '</body>' return ''.join(text) + '</body>'
def insert_image_cover(self, image_href):
    '''
    Wrap a cover image in a generated XHTML page and put that page first
    in the spine.

    image_href: href of the cover image; it is written verbatim into the
                src attribute of an <img> element.

    The generated page is added to the manifest under an id/href derived
    from 'fb2_cover' / 'fb2_cover.xhtml' and inserted at spine position 0.
    '''
    from calibre.ebooks.oeb.base import RECOVER_PARSER

    # NOTE(review): image_href is interpolated without XML escaping; the
    # recovering parser is relied on to cope with any odd characters.
    markup = u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, image_href)
    try:
        root = etree.fromstring(markup, parser=RECOVER_PARSER)
    except Exception:
        # Best effort fallback kept from the original code. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        # NOTE(review): what parsing an empty string yields here depends
        # on lxml's recover mode -- confirm the result is usable as data.
        root = etree.fromstring(u'', parser=RECOVER_PARSER)

    # item_id rather than id, so the builtin is not shadowed.
    item_id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml')
    item = self.oeb_book.manifest.add(item_id, href, guess_type(href)[0], data=root)
    # Third argument presumably marks the item as part of the linear
    # reading order -- verify against the spine API.
    self.oeb_book.spine.insert(0, item, True)
def fb2mlize_images(self): def fb2mlize_images(self):
''' '''
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.

View File

@ -19,16 +19,27 @@ class PDBInput(InputFormatPlugin):
file_types = set(['pdb']) file_types = set(['pdb'])
options = set([ options = set([
OptionRecommendation(name='single_line_paras', recommended_value=False, OptionRecommendation(name='paragraph_type', recommended_value='auto',
help=_('Normally calibre treats blank lines as paragraph markers. ' choices=['auto', 'block', 'single', 'print'],
'With this option it will assume that every line represents ' help=_('Paragraph structure.\n'
'a paragraph instead.')), 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
OptionRecommendation(name='print_formatted_paras', recommended_value=False, '* auto: Try to auto detect paragraph type.\n'
help=_('Normally calibre treats blank lines as paragraph markers. ' '* block: Treat a blank line as a paragraph break.\n'
'With this option it will assume that every line starting with ' '* single: Assume every line is a paragraph.\n'
'an indent (either a tab or 2+ spaces) represents a paragraph. ' '* print: Assume every line starting with 2+ spaces or a tab '
'Paragraphs end when the next line that starts with an indent ' 'starts a paragraph.')),
'is reached.')), OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')),
]) ])
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,

View File

@ -22,7 +22,7 @@ class PDBOutput(OutputFormatPlugin):
short_switch='f', choices=FORMAT_WRITERS.keys(), short_switch='f', choices=FORMAT_WRITERS.keys(),
help=(_('Format to use inside the pdb container. Choices are:')+\ help=(_('Format to use inside the pdb container. Choices are:')+\
' %s' % FORMAT_WRITERS.keys())), ' %s' % FORMAT_WRITERS.keys())),
OptionRecommendation(name='output_encoding', recommended_value='cp1252', OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
level=OptionRecommendation.LOW, level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \ help=_('Specify the character encoding of the output document. ' \
'The default is cp1252. Note: This option is not honored by all ' \ 'The default is cp1252. Note: This option is not honored by all ' \

View File

@ -8,12 +8,11 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os
import struct import struct
from cStringIO import StringIO
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted
class HeaderRecord(object): class HeaderRecord(object):
''' '''
@ -33,9 +32,7 @@ class Reader(FormatReader):
def __init__(self, header, stream, log, options): def __init__(self, header, stream, log, options):
self.stream = stream self.stream = stream
self.log = log self.log = log
self.encoding = options.input_encoding self.options = options
self.single_line_paras = options.single_line_paras
self.print_formatted_paras = options.print_formatted_paras
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
@ -48,34 +45,29 @@ class Reader(FormatReader):
def decompress_text(self, number): def decompress_text(self, number):
if self.header_record.compression == 1: if self.header_record.compression == 1:
return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) return self.section_data(number)
if self.header_record.compression == 2 or self.header_record.compression == 258: if self.header_record.compression == 2 or self.header_record.compression == 258:
from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.compression.palmdoc import decompress_doc
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') return decompress_doc(self.section_data(number))
return '' return ''
def extract_content(self, output_dir): def extract_content(self, output_dir):
txt = '' raw_txt = ''
self.log.info('Decompressing text...') self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1): for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i) self.log.debug('\tDecompressing text section %i' % i)
txt += self.decompress_text(i) raw_txt += self.decompress_text(i)
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
if self.single_line_paras: stream = StringIO(raw_txt)
txt = separate_paragraphs_single_line(txt)
if self.print_formatted_paras:
txt = separate_paragraphs_print_formatted(txt)
html = convert_basic(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata from calibre.customize.ui import plugin_for_input_format
mi = get_metadata(self.stream, 'pdb')
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf') txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options:
if not hasattr(self.options, option.option.name):
setattr(self.options, option.name, option.recommended_value)
stream.seek(0)
return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

View File

@ -50,7 +50,8 @@ class Writer(FormatWriter):
txt = writer.extract_content(oeb_book, self.opts) txt = writer.extract_content(oeb_book, self.opts)
self.log.debug('\tReplacing newlines with selected type...') self.log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') txt = specified_newlines(TxtNewlines('windows').newline,
txt).encode(self.opts.pdb_output_encoding, 'replace')
txt_length = len(txt) txt_length = len(txt)

View File

@ -8,12 +8,13 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, struct, zlib import struct
import zlib
from cStringIO import StringIO
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ztxt import zTXTError from calibre.ebooks.pdb.ztxt import zTXTError
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted
SUPPORTED_VERSION = (1, 40) SUPPORTED_VERSION = (1, 40)
@ -38,9 +39,7 @@ class Reader(FormatReader):
def __init__(self, header, stream, log, options): def __init__(self, header, stream, log, options):
self.stream = stream self.stream = stream
self.log = log self.log = log
self.encoding = options.input_encoding self.options = options
self.single_line_paras = options.single_line_paras
self.print_formatted_paras = options.print_formatted_paras
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
@ -68,30 +67,25 @@ class Reader(FormatReader):
def decompress_text(self, number): def decompress_text(self, number):
if number == 1: if number == 1:
self.uncompressor = zlib.decompressobj() self.uncompressor = zlib.decompressobj()
return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') return self.uncompressor.decompress(self.section_data(number))
def extract_content(self, output_dir): def extract_content(self, output_dir):
txt = '' raw_txt = ''
self.log.info('Decompressing text...') self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1): for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i) self.log.debug('\tDecompressing text section %i' % i)
txt += self.decompress_text(i) raw_txt += self.decompress_text(i)
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
if self.single_line_paras: stream = StringIO(raw_txt)
txt = separate_paragraphs_single_line(txt)
if self.print_formatted_paras:
txt = separate_paragraphs_print_formatted(txt)
html = convert_basic(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata from calibre.customize.ui import plugin_for_input_format
mi = get_metadata(self.stream, 'pdb')
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf') txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options:
if not hasattr(self.options, option.option.name):
setattr(self.options, option.name, option.recommended_value)
stream.seek(0)
return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

View File

@ -54,7 +54,8 @@ class Writer(FormatWriter):
txt = writer.extract_content(oeb_book, self.opts) txt = writer.extract_content(oeb_book, self.opts)
self.log.debug('\tReplacing newlines with selected type...') self.log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') txt = specified_newlines(TxtNewlines('windows').newline,
txt).encode(self.opts.pdb_output_encoding, 'replace')
txt_length = len(txt) txt_length = len(txt)

View File

@ -28,7 +28,7 @@ class PMLOutput(OutputFormatPlugin):
file_type = 'pmlz' file_type = 'pmlz'
options = set([ options = set([
OptionRecommendation(name='output_encoding', recommended_value='cp1252', OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
level=OptionRecommendation.LOW, level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \ help=_('Specify the character encoding of the output document. ' \
'The default is cp1252.')), 'The default is cp1252.')),
@ -48,7 +48,7 @@ class PMLOutput(OutputFormatPlugin):
pmlmlizer = PMLMLizer(log) pmlmlizer = PMLMLizer(log)
pml = unicode(pmlmlizer.extract_content(oeb_book, opts)) pml = unicode(pmlmlizer.extract_content(oeb_book, opts))
with open(os.path.join(tdir, 'index.pml'), 'wb') as out: with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
out.write(pml.encode(opts.output_encoding, 'replace')) out.write(pml.encode(opts.pml_output_encoding, 'replace'))
self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir, opts) self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir, opts)

View File

@ -4,11 +4,9 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os from cStringIO import StringIO
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted
from calibre.ebooks.compression.tcr import decompress from calibre.ebooks.compression.tcr import decompress
class TCRInput(InputFormatPlugin): class TCRInput(InputFormatPlugin):
@ -19,36 +17,43 @@ class TCRInput(InputFormatPlugin):
file_types = set(['tcr']) file_types = set(['tcr'])
options = set([ options = set([
OptionRecommendation(name='single_line_paras', recommended_value=False, OptionRecommendation(name='paragraph_type', recommended_value='auto',
help=_('Normally calibre treats blank lines as paragraph markers. ' choices=['auto', 'block', 'single', 'print'],
'With this option it will assume that every line represents ' help=_('Paragraph structure.\n'
'a paragraph instead.')), 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
OptionRecommendation(name='print_formatted_paras', recommended_value=False, '* auto: Try to auto detect paragraph type.\n'
help=_('Normally calibre treats blank lines as paragraph markers. ' '* block: Treat a blank line as a paragraph break.\n'
'With this option it will assume that every line starting with ' '* single: Assume every line is a paragraph.\n'
'an indent (either a tab or 2+ spaces) represents a paragraph. ' '* print: Assume every line starting with 2+ spaces or a tab '
'Paragraphs end when the next line that starts with an indent ' 'starts a paragraph.')),
'is reached.')), OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')),
]) ])
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
log.info('Decompressing text...') log.info('Decompressing text...')
ienc = options.input_encoding if options.input_encoding else 'utf-8' raw_txt = decompress(stream)
txt = decompress(stream).decode(ienc, 'replace')
log.info('Converting text to OEB...') log.info('Converting text to OEB...')
if options.single_line_paras: stream = StringIO(raw_txt)
txt = separate_paragraphs_single_line(txt)
if options.print_formatted_paras:
txt = separate_paragraphs_print_formatted(txt)
html = convert_basic(txt)
with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata from calibre.customize.ui import plugin_for_input_format
mi = get_metadata(stream, 'tcr')
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi)
return os.path.join(os.getcwd(), 'metadata.opf') txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options:
if not hasattr(options, option.option.name):
setattr(options, option.name, option.recommended_value)
stream.seek(0)
return txt_plugin.convert(stream, options,
'txt', log, accelerators)

View File

@ -18,7 +18,7 @@ class TCROutput(OutputFormatPlugin):
file_type = 'tcr' file_type = 'tcr'
options = set([ options = set([
OptionRecommendation(name='output_encoding', recommended_value='utf-8', OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW, level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \ help=_('Specify the character encoding of the output document. ' \
'The default is utf-8.')), 'The default is utf-8.')),
@ -40,7 +40,7 @@ class TCROutput(OutputFormatPlugin):
setattr(opts, 'indent_paras', False) setattr(opts, 'indent_paras', False)
writer = TXTMLizer(log) writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace') txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')
log.info('Compressing text...') log.info('Compressing text...')
txt = compress(txt) txt = compress(txt)

View File

@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces preserve_spaces, detect_paragraph_type, detect_formatting_type
from calibre import _ent_pat, xml_entity_to_unicode from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -20,45 +21,57 @@ class TXTInput(InputFormatPlugin):
file_types = set(['txt']) file_types = set(['txt'])
options = set([ options = set([
OptionRecommendation(name='single_line_paras', recommended_value=False, OptionRecommendation(name='paragraph_type', recommended_value='auto',
help=_('Normally calibre treats blank lines as paragraph markers. ' choices=['auto', 'block', 'single', 'print'],
'With this option it will assume that every line represents ' help=_('Paragraph structure.\n'
'a paragraph instead.')), 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
OptionRecommendation(name='print_formatted_paras', recommended_value=False, '* auto: Try to auto detect paragraph type.\n'
help=_('Normally calibre treats blank lines as paragraph markers. ' '* block: Treat a blank line as a paragraph break.\n'
'With this option it will assume that every line starting with ' '* single: Assume every line is a paragraph.\n'
'an indent (either a tab or 2+ spaces) represents a paragraph. ' '* print: Assume every line starting with 2+ spaces or a tab '
'Paragraphs end when the next line that starts with an indent ' 'starts a paragraph.')),
'is reached.')), OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False, OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. ' help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')), 'With this option all spaces will be displayed.')),
OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown pre-processor. To '
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False, OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')), help=_('Do not insert a Table of Contents into the output text.')),
]) ])
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
ienc = stream.encoding if stream.encoding else 'utf-8' log.debug('Reading text from file...')
txt = stream.read()
# Get the encoding of the document.
if options.input_encoding: if options.input_encoding:
ienc = options.input_encoding ienc = options.input_encoding
log.debug('Reading text from file...') log.debug('Using user specified input encoding of %s' % ienc)
txt = stream.read().decode(ienc, 'replace') else:
det_encoding = detect(txt)
ienc = det_encoding['encoding']
log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100))
if not ienc:
ienc = 'utf-8'
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
txt = txt.decode(ienc, 'replace')
# Adjust paragraph formatting as requested txt = _ent_pat.sub(xml_entity_to_unicode, txt)
if options.single_line_paras: # Preserve spaces will replace multiple spaces to a space
txt = separate_paragraphs_single_line(txt) # followed by the &nbsp; entity.
if options.print_formatted_paras:
txt = separate_paragraphs_print_formatted(txt)
if options.preserve_spaces: if options.preserve_spaces:
txt = preserve_spaces(txt) txt = preserve_spaces(txt)
txt = _ent_pat.sub(xml_entity_to_unicode, txt) if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
if options.markdown: if options.formatting_type == 'markdown':
log.debug('Running text though markdown conversion...') log.debug('Running text though markdown conversion...')
try: try:
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
@ -66,6 +79,22 @@ class TXTInput(InputFormatPlugin):
raise ValueError('This txt file has malformed markup, it cannot be' raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
else: else:
# Determine the paragraph type of the document.
if options.paragraph_type == 'auto':
options.paragraph_type = detect_paragraph_type(txt)
if options.paragraph_type == 'unknown':
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# We don't check for block because the processor assumes block.
# single and print are transformed to block for processing.
if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
flow_size = getattr(options, 'flow_size', 0) flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size) html = convert_basic(txt, epub_split_size_kb=flow_size)
@ -85,11 +114,10 @@ class TXTInput(InputFormatPlugin):
htmlfile = open(fname, 'wb') htmlfile = open(fname, 'wb')
with htmlfile: with htmlfile:
htmlfile.write(html.encode('utf-8')) htmlfile.write(html.encode('utf-8'))
cwd = os.getcwdu()
odi = options.debug_pipeline odi = options.debug_pipeline
options.debug_pipeline = None options.debug_pipeline = None
oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log, oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
{}, cwd) {})
options.debug_pipeline = odi options.debug_pipeline = odi
os.remove(htmlfile.name) os.remove(htmlfile.name)
return oeb return oeb

View File

@ -26,7 +26,7 @@ class TXTOutput(OutputFormatPlugin):
'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
'For Mac OS X use \'unix\'. \'system\' will default to the newline ' 'For Mac OS X use \'unix\'. \'system\' will default to the newline '
'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys())), 'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys())),
OptionRecommendation(name='output_encoding', recommended_value='utf-8', OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW, level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \ help=_('Specify the character encoding of the output document. ' \
'The default is utf-8.')), 'The default is utf-8.')),
@ -81,7 +81,7 @@ class TXTOutput(OutputFormatPlugin):
out_stream.seek(0) out_stream.seek(0)
out_stream.truncate() out_stream.truncate()
out_stream.write(txt.encode(opts.output_encoding, 'replace')) out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
if close: if close:
out_stream.close() out_stream.close()

View File

@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
if isbytestring(txt): if isbytestring(txt):
txt = txt.decode('utf-8') txt = txt.decode('utf-8')
lines = [] lines = []
# Split into paragraphs based on having a blank line between text. # Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'): for line in txt.split('\n\n'):
@ -94,3 +93,54 @@ def split_string_separator(txt, size) :
xrange(0, len(txt), size)]) xrange(0, len(txt), size)])
return txt return txt
def detect_paragraph_type(txt):
    '''
    Tries to determine the paragraph structure of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
    unknown: The text contains no non-blank lines, so nothing can
             be inferred from it.

    returns block, single, print or unknown
    '''
    # Normalise Windows and old Mac line endings so the line anchored
    # patterns below only have to deal with a single newline style.
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')

    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))
    if not txt_line_count:
        # Empty or whitespace only input: avoid dividing by zero and let
        # the caller pick its own default.
        return 'unknown'
    total_lines = float(txt_line_count)

    # Check for print: at least a quarter of the lines start indented.
    tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
    if tab_line_count / total_lines >= .25:
        return 'print'

    # Check for block: at least one blank line per four lines of text.
    empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
    if empty_line_count / total_lines >= .25:
        return 'block'

    # Nothing else matched, so assume single.
    return 'single'
def detect_formatting_type(txt):
    '''
    Tries to determine whether the document uses markdown markup.

    The text is reported as markdown when any single kind of markup
    listed below occurs at least 5 times, or when any one backslash
    escaped markdown character occurs more than 10 times.

    returns markdown or none
    '''
    # One pattern per kind of markup. Raw strings keep the regex escapes
    # (\[, \], \() from being read as string escapes.
    markup_patterns = (
        # Headings
        r'(?mu)^#+',
        r'(?mu)^=+$',
        r'(?mu)^-+$',
        # Images
        r'(?u)!\[.*?\]\(.+?\)',
        # Links
        r'(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)',
    )
    for pattern in markup_patterns:
        if len(re.findall(pattern, txt)) >= 5:
            return 'markdown'

    # Escaped characters
    md_escaped_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
    for c in md_escaped_characters:
        if txt.count('\\' + c) > 10:
            return 'markdown'

    return 'none'

View File

@ -192,6 +192,11 @@ class Widget(QWidget):
if not val: val = '' if not val: val = ''
getattr(g, 'setPlainText', g.setText)(val) getattr(g, 'setPlainText', g.setText)(val)
getattr(g, 'setCursorPosition', lambda x: x)(0) getattr(g, 'setCursorPosition', lambda x: x)(0)
elif isinstance(g, EncodingComboBox):
if val:
g.setEditText(val)
else:
g.setCurrentIndex(0)
elif isinstance(g, QComboBox) and val: elif isinstance(g, QComboBox) and val:
idx = g.findText(val, Qt.MatchFixedString) idx = g.findText(val, Qt.MatchFixedString)
if idx < 0: if idx < 0:
@ -202,8 +207,6 @@ class Widget(QWidget):
g.setCheckState(Qt.Checked if bool(val) else Qt.Unchecked) g.setCheckState(Qt.Checked if bool(val) else Qt.Unchecked)
elif isinstance(g, (XPathEdit, RegexEdit)): elif isinstance(g, (XPathEdit, RegexEdit)):
g.edit.setText(val if val else '') g.edit.setText(val if val else '')
elif isinstance(g, EncodingComboBox):
g.setEditText(val if val else '')
else: else:
raise Exception('Can\'t set value %s in %s'%(repr(val), raise Exception('Can\'t set value %s in %s'%(repr(val),
unicode(g.objectName()))) unicode(g.objectName())))

View File

@ -1,10 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from calibre.gui2.convert.pdb_input_ui import Ui_Form from calibre.gui2.convert.txt_input_ui import Ui_Form
from calibre.gui2.convert import Widget from calibre.gui2.convert import Widget
class PluginWidget(Widget, Ui_Form): class PluginWidget(Widget, Ui_Form):
@ -12,10 +12,14 @@ class PluginWidget(Widget, Ui_Form):
TITLE = _('PDB Input') TITLE = _('PDB Input')
HELP = _('Options specific to')+' PDB '+_('input') HELP = _('Options specific to')+' PDB '+_('input')
COMMIT_NAME = 'pdb_input' COMMIT_NAME = 'pdb_input'
ICON = I('mimetypes/unknown.png') ICON = I('mimetypes/txt.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, Widget.__init__(self, parent,
['single_line_paras', 'print_formatted_paras']) ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -1,48 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Qt Designer form: a QWidget named "Form" holding two checkboxes
     (opt_single_line_paras, opt_print_formatted_paras) and a vertical
     spacer, all placed in column 0 of a QGridLayout. -->
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<!-- Geometry recorded for the form: 400x300 at the origin. -->
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>400</width>
<height>300</height>
</rect>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<!-- Items appear out of visual order; the row/column attributes on each
     <item> decide placement, not the order in this file. -->
<layout class="QGridLayout" name="gridLayout">
<!-- Row 2: vertical spacer placed after both checkboxes. -->
<item row="2" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>213</height>
</size>
</property>
</spacer>
</item>
<!-- Row 0: checkbox; the "&amp;" in the label marks the following letter
     as the keyboard mnemonic. NOTE(review): the opt_ prefix presumably ties
     the widget to the option of the same name; confirm in the Python
     widget class that loads this form. -->
<item row="0" column="0">
<widget class="QCheckBox" name="opt_single_line_paras">
<property name="text">
<string>Treat each &amp;line as a paragraph</string>
</property>
</widget>
</item>
<!-- Row 1: second checkbox; its label defines no mnemonic. -->
<item row="1" column="0">
<widget class="QCheckBox" name="opt_print_formatted_paras">
<property name="text">
<string>Assume print formatting</string>
</property>
</widget>
</item>
</layout>
</widget>
<!-- No resource files or signal/slot connections are declared. -->
<resources/>
<connections/>
</ui>

View File

@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form):
ICON = I('mimetypes/unknown.png') ICON = I('mimetypes/unknown.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, ['format', 'inline_toc', 'output_encoding']) Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -55,10 +55,21 @@
</widget> </widget>
</item> </item>
<item row="1" column="1"> <item row="1" column="1">
<widget class="QLineEdit" name="opt_output_encoding"/> <widget class="EncodingComboBox" name="opt_pdb_output_encoding">
<property name="editable">
<bool>true</bool>
</property>
</widget>
</item> </item>
</layout> </layout>
</widget> </widget>
<customwidgets>
<customwidget>
<class>EncodingComboBox</class>
<extends>QComboBox</extends>
<header>widgets.h</header>
</customwidget>
</customwidgets>
<resources/> <resources/>
<connections/> <connections/>
</ui> </ui>

View File

@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, ['inline_toc', 'full_image_depth', Widget.__init__(self, parent, ['inline_toc', 'full_image_depth',
'output_encoding']) 'pml_output_encoding'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -14,7 +14,7 @@
<string>Form</string> <string>Form</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <layout class="QGridLayout" name="gridLayout">
<item row="3" column="0"> <item row="4" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -27,32 +27,47 @@
</property> </property>
</spacer> </spacer>
</item> </item>
<item row="1" column="0"> <item row="2" column="0">
<widget class="QCheckBox" name="opt_inline_toc"> <widget class="QCheckBox" name="opt_inline_toc">
<property name="text"> <property name="text">
<string>&amp;Inline TOC</string> <string>&amp;Inline TOC</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="2" column="0"> <item row="3" column="0">
<widget class="QCheckBox" name="opt_full_image_depth"> <widget class="QCheckBox" name="opt_full_image_depth">
<property name="text"> <property name="text">
<string>Do not reduce image size and depth</string> <string>Do not reduce image size and depth</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="0" column="0"> <item row="1" column="0">
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QLabel" name="label"> <widget class="QLabel" name="label">
<property name="text"> <property name="text">
<string>Output Encoding:</string> <string>Output Encoding:</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="0" column="1"> <item>
<widget class="QLineEdit" name="opt_output_encoding"/> <widget class="EncodingComboBox" name="opt_pml_output_encoding">
<property name="editable">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</item> </item>
</layout> </layout>
</widget> </widget>
<customwidgets>
<customwidget>
<class>EncodingComboBox</class>
<extends>QComboBox</extends>
<header>widgets.h</header>
</customwidget>
</customwidgets>
<resources/> <resources/>
<connections/> <connections/>
</ui> </ui>

View File

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.gui2.convert.txt_input_ui import Ui_Form
from calibre.gui2.convert import Widget
class PluginWidget(Widget, Ui_Form):
    """Options panel for TCR input, built on the TXT input form (Ui_Form).

    The class attributes supply the panel's title, help text, commit name
    and icon. ``_`` and ``I`` are not defined in this module; they are
    assumed to be provided globally by the application -- TODO confirm.
    """

    TITLE = _('TCR Input')
    HELP = _('Options specific to')+' TCR '+_('input')
    COMMIT_NAME = 'tcr_input'
    ICON = I('mimetypes/txt.png')

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        """Set up the panel and fill its two choice combo boxes.

        ``get_option(name)`` must return an object whose ``.option.choices``
        is iterable; each choice is appended to the matching ``opt_<name>``
        combo box before ``initialize_options`` is called.
        """
        option_names = ['paragraph_type', 'formatting_type',
                        'markdown_disable_toc', 'preserve_spaces']
        Widget.__init__(self, parent, option_names)
        self.db, self.book_id = db, book_id
        # Only these two options are choice lists backed by combo boxes;
        # the combo box must be populated before initialize_options runs.
        for name in ('paragraph_type', 'formatting_type'):
            for choice in get_option(name).option.choices:
                getattr(self, 'opt_' + name).addItem(choice)
        self.initialize_options(get_option, get_help, db, book_id)

View File

@ -16,7 +16,10 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, Widget.__init__(self, parent,
['single_line_paras', 'print_formatted_paras', 'markdown', ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -6,7 +6,7 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>470</width> <width>518</width>
<height>300</height> <height>300</height>
</rect> </rect>
</property> </property>
@ -15,47 +15,23 @@
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <layout class="QGridLayout" name="gridLayout">
<item row="0" column="0"> <item row="0" column="0">
<widget class="QCheckBox" name="opt_single_line_paras"> <widget class="QLabel" name="label_2">
<property name="text"> <property name="text">
<string>Treat each &amp;line as a paragraph</string> <string>Paragraph style:</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="1" column="0"> <item row="0" column="1">
<widget class="QCheckBox" name="opt_print_formatted_paras"> <widget class="QComboBox" name="opt_paragraph_type"/>
</item>
<item row="5" column="0" colspan="2">
<widget class="QCheckBox" name="opt_preserve_spaces">
<property name="text"> <property name="text">
<string>Assume print formatting</string> <string>Preserve &amp;spaces</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="2" column="0"> <item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_markdown">
<property name="text">
<string>Process using markdown</string>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QCheckBox" name="opt_markdown_disable_toc">
<property name="text">
<string>Do not insert Table of Contents into output text when using markdown</string>
</property>
</widget>
</item>
<item row="6" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -68,32 +44,47 @@
</property> </property>
</spacer> </spacer>
</item> </item>
<item row="5" column="0"> <item row="1" column="1">
<widget class="QCheckBox" name="opt_preserve_spaces"> <widget class="QComboBox" name="opt_formatting_type"/>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_3">
<property name="text"> <property name="text">
<string>Preserve &amp;spaces</string> <string>Formatting style:</string>
</property>
</widget>
</item>
<item row="2" column="0" rowspan="2" colspan="2">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Markdown Options</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_markdown_disable_toc">
<property name="text">
<string>Do not insert Table of Contents into output text when using markdown</string>
</property> </property>
</widget> </widget>
</item> </item>
</layout> </layout>
</widget> </widget>
</item>
</layout>
</widget>
<resources/> <resources/>
<connections> <connections/>
<connection>
<sender>opt_markdown</sender>
<signal>toggled(bool)</signal>
<receiver>opt_markdown_disable_toc</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>76</x>
<y>80</y>
</hint>
<hint type="destinationlabel">
<x>418</x>
<y>105</y>
</hint>
</hints>
</connection>
</connections>
</ui> </ui>

View File

@ -22,7 +22,7 @@ class PluginWidget(Widget, Ui_Form):
Widget.__init__(self, parent, Widget.__init__(self, parent,
['newline', 'max_line_length', 'force_max_line_length', ['newline', 'max_line_length', 'force_max_line_length',
'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references', 'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
'output_encoding']) 'txt_output_encoding'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -96,10 +96,21 @@
</widget> </widget>
</item> </item>
<item row="2" column="1"> <item row="2" column="1">
<widget class="QLineEdit" name="opt_output_encoding"/> <widget class="EncodingComboBox" name="opt_txt_output_encoding">
<property name="editable">
<bool>true</bool>
</property>
</widget>
</item> </item>
</layout> </layout>
</widget> </widget>
<customwidgets>
<customwidget>
<class>EncodingComboBox</class>
<extends>QComboBox</extends>
<header>widgets.h</header>
</customwidget>
</customwidgets>
<resources/> <resources/>
<connections/> <connections/>
</ui> </ui>