diff --git a/resources/recipes/njp.recipe b/resources/recipes/njp.recipe index f2a427072b..ed202512f2 100644 --- a/resources/recipes/njp.recipe +++ b/resources/recipes/njp.recipe @@ -13,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class NewJournalOfPhysics(BasicNewsRecipe): title = u'New Journal of Physics' - __author__ = u'Chema Cortés' + __author__ = u'Chema Cort\xe9s' description = u'The open-access journal for physics' publisher = u'IOP (Institute of Physics)' category = 'physics, journal, science' diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index f9ce9befb4..f6deab677a 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -16,6 +16,7 @@ import uuid from lxml import etree +from calibre import guess_type from calibre import prepare_string_for_xml from calibre.constants import __appname__, __version__ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace @@ -161,6 +162,23 @@ class FB2MLizer(object): text.append('
') self.section_level += 1 + # Insert the title page / cover into the spine if it is not already referenced. + title_name = u'' + if 'titlepage' in self.oeb_book.guide: + title_name = 'titlepage' + elif 'cover' in self.oeb_book.guide: + title_name = 'cover' + if title_name: + title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href] + if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': + self.oeb_book.spine.insert(0, title_item, True) + # Create xhtml page to reference cover image so it can be used. + if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: + id = unicode(self.oeb_book.metadata.cover[0]) + cover_item = self.oeb_book.manifest.ids[id] + if cover_item.media_type in OEB_RASTER_IMAGES: + self.insert_image_cover(cover_item.href) + for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) @@ -185,6 +203,17 @@ class FB2MLizer(object): return ''.join(text) + '' + def insert_image_cover(self, image_href): + from calibre.ebooks.oeb.base import RECOVER_PARSER + try: + root = etree.fromstring(u'' % (XHTML_NS, image_href), parser=RECOVER_PARSER) + except: + root = etree.fromstring(u'', parser=RECOVER_PARSER) + + id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml') + item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root) + self.oeb_book.spine.insert(0, item, True) + def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. 
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 6850c48b16..3688abff3f 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -19,16 +19,27 @@ class PDBInput(InputFormatPlugin): file_types = set(['pdb']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. ' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph type.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + OptionRecommendation(name='preserve_spaces', recommended_value=False, + help=_('Normally extra spaces are condensed into a single space. 
' + 'With this option all spaces will be displayed.')), + OptionRecommendation(name="markdown_disable_toc", recommended_value=False, + help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/ebooks/pdb/output.py b/src/calibre/ebooks/pdb/output.py index 4e76a2d298..7bca4e5c5d 100644 --- a/src/calibre/ebooks/pdb/output.py +++ b/src/calibre/ebooks/pdb/output.py @@ -22,7 +22,7 @@ class PDBOutput(OutputFormatPlugin): short_switch='f', choices=FORMAT_WRITERS.keys(), help=(_('Format to use inside the pdb container. Choices are:')+\ ' %s' % FORMAT_WRITERS.keys())), - OptionRecommendation(name='output_encoding', recommended_value='cp1252', + OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252', level=OptionRecommendation.LOW, help=_('Specify the character encoding of the output document. ' \ 'The default is cp1252. Note: This option is not honored by all ' \ diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index 52b8d1361f..945e31559a 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -8,12 +8,11 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os import struct +from cStringIO import StringIO + from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted class HeaderRecord(object): ''' @@ -33,9 +32,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -48,34 +45,29 @@ 
class Reader(FormatReader): def decompress_text(self, number): if self.header_record.compression == 1: - return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) + return self.section_data(number) if self.header_record.compression == 2 or self.header_record.compression == 258: from calibre.ebooks.compression.palmdoc import decompress_doc - return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return decompress_doc(self.section_data(number)) return '' def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) + raw_txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) + stream = StringIO(raw_txt) - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) + from calibre.customize.ui import plugin_for_input_format - return os.path.join(output_dir, 'metadata.opf') + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.name, option.recommended_value) + stream.seek(0) + return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 3f4a92fbed..5e9b77d75c 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py 
+++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -50,7 +50,8 @@ class Writer(FormatWriter): txt = writer.extract_content(oeb_book, self.opts) self.log.debug('\tReplacing newlines with selected type...') - txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') + txt = specified_newlines(TxtNewlines('windows').newline, + txt).encode(self.opts.pdb_output_encoding, 'replace') txt_length = len(txt) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 5cac283264..6e7f5dd923 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -8,12 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, struct, zlib +import struct +import zlib + +from cStringIO import StringIO from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted SUPPORTED_VERSION = (1, 40) @@ -38,9 +39,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -68,30 +67,25 @@ class Reader(FormatReader): def decompress_text(self, number): if number == 1: self.uncompressor = zlib.decompressobj() - return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return self.uncompressor.decompress(self.section_data(number)) def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, 
self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) + raw_txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) + stream = StringIO(raw_txt) - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) + from calibre.customize.ui import plugin_for_input_format - return os.path.join(output_dir, 'metadata.opf') + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.name, option.recommended_value) + stream.seek(0) + return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index ee4c5752c3..7c9056fe69 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -22,12 +22,12 @@ class Writer(FormatWriter): def __init__(self, opts, log): self.opts = opts self.log = log - + def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') txt_records, txt_length = self._generate_text(oeb_book) - + crc32 = 0 section_lengths = [] compressor = zlib.compressobj(9) @@ -41,32 +41,33 @@ class Writer(FormatWriter): header_record = self._header_record(txt_length, len(txt_records), crc32) section_lengths.insert(0, len(header_record)) - + out_stream.seek(0) hb = 
PdbHeaderBuilder('zTXTGPlm', title) hb.build_header(section_lengths, out_stream) for record in [header_record]+txt_records: out_stream.write(record) - + def _generate_text(self, oeb_book): writer = TXTMLizer(self.log) txt = writer.extract_content(oeb_book, self.opts) self.log.debug('\tReplacing newlines with selected type...') - txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') + txt = specified_newlines(TxtNewlines('windows').newline, + txt).encode(self.opts.pdb_output_encoding, 'replace') txt_length = len(txt) - + txt_records = [] for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1): txt_records.append(txt[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) - + return txt_records, txt_length - + def _header_record(self, txt_length, record_count, crc32): record = '' - + record += struct.pack('>H', 0x012c) # [0:2], version. 0x012c = 1.44 record += struct.pack('>H', record_count) # [2:4], Number of PDB records used for the text of the book. record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book. @@ -79,6 +80,6 @@ class Writer(FormatWriter): record += struct.pack('>B', 0) # [19:20], Reserved. record += struct.pack('>L', crc32) # [20:24], crc32 record += struct.pack('>LL', 0, 0) # [24:32], padding - + return record - + diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py index 7e3729aa4a..58dc9a2138 100644 --- a/src/calibre/ebooks/pml/output.py +++ b/src/calibre/ebooks/pml/output.py @@ -28,7 +28,7 @@ class PMLOutput(OutputFormatPlugin): file_type = 'pmlz' options = set([ - OptionRecommendation(name='output_encoding', recommended_value='cp1252', + OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252', level=OptionRecommendation.LOW, help=_('Specify the character encoding of the output document. 
' \ 'The default is cp1252.')), @@ -48,7 +48,7 @@ class PMLOutput(OutputFormatPlugin): pmlmlizer = PMLMLizer(log) pml = unicode(pmlmlizer.extract_content(oeb_book, opts)) with open(os.path.join(tdir, 'index.pml'), 'wb') as out: - out.write(pml.encode(opts.output_encoding, 'replace')) + out.write(pml.encode(opts.pml_output_encoding, 'replace')) self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir, opts) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 67fa6ac66e..c1dcef235d 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -4,11 +4,9 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os +from cStringIO import StringIO from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted from calibre.ebooks.compression.tcr import decompress class TCRInput(InputFormatPlugin): @@ -19,36 +17,43 @@ class TCRInput(InputFormatPlugin): file_types = set(['tcr']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. 
' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph type.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + OptionRecommendation(name='preserve_spaces', recommended_value=False, + help=_('Normally extra spaces are condensed into a single space. 
' + 'With this option all spaces will be displayed.')), + OptionRecommendation(name="markdown_disable_toc", recommended_value=False, + help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, accelerators): log.info('Decompressing text...') - ienc = options.input_encoding if options.input_encoding else 'utf-8' - txt = decompress(stream).decode(ienc, 'replace') + raw_txt = decompress(stream) log.info('Converting text to OEB...') - if options.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) + stream = StringIO(raw_txt) - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(stream, 'tcr') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi) + from calibre.customize.ui import plugin_for_input_format - return os.path.join(os.getcwd(), 'metadata.opf') + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(options, option.option.name): + setattr(options, option.name, option.recommended_value) + + stream.seek(0) + return txt_plugin.convert(stream, options, + 'txt', log, accelerators) diff --git a/src/calibre/ebooks/tcr/output.py b/src/calibre/ebooks/tcr/output.py index 3ca82730cc..97c9cae26c 100644 --- a/src/calibre/ebooks/tcr/output.py +++ b/src/calibre/ebooks/tcr/output.py @@ -18,7 +18,7 @@ class TCROutput(OutputFormatPlugin): file_type = 'tcr' options = set([ - OptionRecommendation(name='output_encoding', recommended_value='utf-8', + OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8', level=OptionRecommendation.LOW, help=_('Specify the character encoding of the output document. 
' \ 'The default is utf-8.')), @@ -40,7 +40,7 @@ class TCROutput(OutputFormatPlugin): setattr(opts, 'indent_paras', False) writer = TXTMLizer(log) - txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace') + txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace') log.info('Compressing text...') txt = compress(txt) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 44b98304ea..47e92a45a9 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces + preserve_spaces, detect_paragraph_type, detect_formatting_type from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -20,45 +21,57 @@ class TXTInput(InputFormatPlugin): file_types = set(['txt']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. 
' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph type.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, accelerators): - ienc = stream.encoding if stream.encoding else 'utf-8' + log.debug('Reading text from file...') + + txt = stream.read() + # Get the encoding of the document. 
if options.input_encoding: ienc = options.input_encoding - log.debug('Reading text from file...') - txt = stream.read().decode(ienc, 'replace') - - # Adjust paragraph formatting as requested - if options.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - if options.preserve_spaces: - txt = preserve_spaces(txt) + log.debug('Using user specified input encoding of %s' % ienc) + else: + det_encoding = detect(txt) + ienc = det_encoding['encoding'] + log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100)) + if not ienc: + ienc = 'utf-8' + log.debug('No input encoding specified and could not auto detect using %s' % ienc) + txt = txt.decode(ienc, 'replace') txt = _ent_pat.sub(xml_entity_to_unicode, txt) + # Preserve spaces will replace multiple spaces to a space + # followed by the &nbsp; entity. + if options.preserve_spaces: + txt = preserve_spaces(txt) + + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) - if options.markdown: + if options.formatting_type == 'markdown': log.debug('Running text though markdown conversion...') try: html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) @@ -66,6 +79,22 @@ class TXTInput(InputFormatPlugin): raise ValueError('This txt file has malformed markup, it cannot be' ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') else: + # Determine the paragraph type of the document. + if options.paragraph_type == 'auto': + options.paragraph_type = detect_paragraph_type(txt) + if options.paragraph_type == 'unknown': + log.debug('Could not reliably determine paragraph type using block') + options.paragraph_type = 'block' + else: + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # We don't check for block because the processor assumes block. 
+ # single and print at transformed to block for processing. + if options.paragraph_type == 'single': + txt = separate_paragraphs_single_line(txt) + elif options.paragraph_type == 'print': + txt = separate_paragraphs_print_formatted(txt) + flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) @@ -85,11 +114,10 @@ class TXTInput(InputFormatPlugin): htmlfile = open(fname, 'wb') with htmlfile: htmlfile.write(html.encode('utf-8')) - cwd = os.getcwdu() odi = options.debug_pipeline options.debug_pipeline = None - oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log, - {}, cwd) + oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, + {}) options.debug_pipeline = odi os.remove(htmlfile.name) return oeb diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 0e077672d8..4d0d176fe4 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -26,7 +26,7 @@ class TXTOutput(OutputFormatPlugin): 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' 'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys())), - OptionRecommendation(name='output_encoding', recommended_value='utf-8', + OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8', level=OptionRecommendation.LOW, help=_('Specify the character encoding of the output document. 
' \ 'The default is utf-8.')), @@ -64,7 +64,7 @@ class TXTOutput(OutputFormatPlugin): writer = MarkdownMLizer(log) else: writer = TXTMLizer(log) - + txt = writer.extract_content(oeb_book, opts) log.debug('\tReplacing newlines with selected type...') @@ -81,7 +81,7 @@ class TXTOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - out_stream.write(txt.encode(opts.output_encoding, 'replace')) + out_stream.write(txt.encode(opts.txt_output_encoding, 'replace')) if close: out_stream.close() diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index dac1e34df7..f6d628e7c5 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0): if isbytestring(txt): txt = txt.decode('utf-8') - lines = [] # Split into paragraphs based on having a blank line between text. for line in txt.split('\n\n'): @@ -94,3 +93,54 @@ def split_string_separator(txt, size) : xrange(0, len(txt), size)]) return txt +def detect_paragraph_type(txt): + ''' + Tries to determine the formatting of the document. + + block: Paragraphs are separated by a blank line. + single: Each line is a paragraph. + print: Each paragraph starts with a 2+ spaces or a tab + and ends when a new paragraph is reached. + markdown: Markdown formatting is in the document. + + returns block, single, print, markdown + ''' + txt = txt.replace('\r\n', '\n') + txt = txt.replace('\r', '\n') + txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) + + # Check for print + tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + if tab_line_count / float(txt_line_count) >= .25: + return 'print' + + # Check for block + empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + if empty_line_count / float(txt_line_count) >= .25: + return 'block' + + # Nothing else matched to assume single. 
+ return 'single' + +def detect_formatting_type(txt): + # Check for markdown + # Headings + if len(re.findall('(?mu)^#+', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^=+$', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^-+$', txt)) >= 5: + return 'markdown' + # Images + if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: + return 'markdown' + # Links + if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
+        return 'markdown'
+    # Escaped characters
+    md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
+    for c in md_escapted_characters:
+        if txt.count('\\'+c) > 10:
+            return 'markdown'
+    
+    return 'none'
diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py
index 1557ce8939..e5f72099fe 100644
--- a/src/calibre/gui2/convert/__init__.py
+++ b/src/calibre/gui2/convert/__init__.py
@@ -192,6 +192,11 @@ class Widget(QWidget):
             if not val: val = ''
             getattr(g, 'setPlainText', g.setText)(val)
             getattr(g, 'setCursorPosition', lambda x: x)(0)
+        elif isinstance(g, EncodingComboBox):
+            if val:
+                g.setEditText(val)
+            else:
+                g.setCurrentIndex(0)
         elif isinstance(g, QComboBox) and val:
             idx = g.findText(val, Qt.MatchFixedString)
             if idx < 0:
@@ -202,8 +207,6 @@ class Widget(QWidget):
             g.setCheckState(Qt.Checked if bool(val) else Qt.Unchecked)
         elif isinstance(g, (XPathEdit, RegexEdit)):
             g.edit.setText(val if val else '')
-        elif isinstance(g, EncodingComboBox):
-            g.setEditText(val if val else '')
         else:
             raise Exception('Can\'t set value %s in %s'%(repr(val),
                 unicode(g.objectName())))
diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py
index 4510cf81ba..16ff1ff236 100644
--- a/src/calibre/gui2/convert/pdb_input.py
+++ b/src/calibre/gui2/convert/pdb_input.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 
 __license__ = 'GPL 3'
-__copyright__ = '2009, John Schember '
+__copyright__ = '2011, John Schember '
 __docformat__ = 'restructuredtext en'
 
-from calibre.gui2.convert.pdb_input_ui import Ui_Form
+from calibre.gui2.convert.txt_input_ui import Ui_Form
 from calibre.gui2.convert import Widget
 
 class PluginWidget(Widget, Ui_Form):
@@ -12,10 +12,14 @@ class PluginWidget(Widget, Ui_Form):
     TITLE = _('PDB Input')
     HELP = _('Options specific to')+' PDB '+_('input')
     COMMIT_NAME = 'pdb_input'
-    ICON = I('mimetypes/unknown.png')
+    ICON = I('mimetypes/txt.png')
 
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent,
-            ['single_line_paras', 'print_formatted_paras'])
+            ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
         self.db, self.book_id = db, book_id
+        for x in get_option('paragraph_type').option.choices:
+            self.opt_paragraph_type.addItem(x)
+        for x in get_option('formatting_type').option.choices:
+            self.opt_formatting_type.addItem(x)
         self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/pdb_input.ui b/src/calibre/gui2/convert/pdb_input.ui
deleted file mode 100644
index 2b632b1a33..0000000000
--- a/src/calibre/gui2/convert/pdb_input.ui
+++ /dev/null
@@ -1,48 +0,0 @@
-
-
- Form
- 
-  
-   
-    0
-    0
-    400
-    300
-   
-  
-  
-   Form
-  
-  
-   
-    
-     
-      Qt::Vertical
-     
-     
-      
-       20
-       213
-      
-     
-    
-   
-   
-    
-     
-      Treat each &line as a paragraph
-     
-    
-   
-   
-    
-     
-      Assume print formatting
-     
-    
-   
-  
- 
- 
- 
-
diff --git a/src/calibre/gui2/convert/pdb_output.py b/src/calibre/gui2/convert/pdb_output.py
index 51c202cb03..ec6b7abb08 100644
--- a/src/calibre/gui2/convert/pdb_output.py
+++ b/src/calibre/gui2/convert/pdb_output.py
@@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form):
     ICON = I('mimetypes/unknown.png')
 
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
-        Widget.__init__(self, parent, ['format', 'inline_toc', 'output_encoding'])
+        Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
 
diff --git a/src/calibre/gui2/convert/pdb_output.ui b/src/calibre/gui2/convert/pdb_output.ui
index 17bdc0a984..fcca83cc2e 100644
--- a/src/calibre/gui2/convert/pdb_output.ui
+++ b/src/calibre/gui2/convert/pdb_output.ui
@@ -55,10 +55,21 @@
     
    
    
-    
+    
+     
+      true
+     
+    
    
   
  
+ 
+  
+   EncodingComboBox
+   QComboBox
+   
widgets.h
+
+
diff --git a/src/calibre/gui2/convert/pml_output.py b/src/calibre/gui2/convert/pml_output.py index f7905194ca..56197ecde0 100644 --- a/src/calibre/gui2/convert/pml_output.py +++ b/src/calibre/gui2/convert/pml_output.py @@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, ['inline_toc', 'full_image_depth', - 'output_encoding']) + 'pml_output_encoding']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/pmlz_output.ui b/src/calibre/gui2/convert/pmlz_output.ui index 9754752c8a..162cfbb831 100644 --- a/src/calibre/gui2/convert/pmlz_output.ui +++ b/src/calibre/gui2/convert/pmlz_output.ui @@ -14,7 +14,7 @@ Form - + Qt::Vertical @@ -27,32 +27,47 @@ - + &Inline TOC - + Do not reduce image size and depth - - - - Output Encoding: - - - - - + + + + + + Output Encoding: + + + + + + + true + + + + + + + EncodingComboBox + QComboBox +
widgets.h
+
+
diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py new file mode 100644 index 0000000000..366643ad5b --- /dev/null +++ b/src/calibre/gui2/convert/tcr_input.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +from calibre.gui2.convert.txt_input_ui import Ui_Form +from calibre.gui2.convert import Widget + +class PluginWidget(Widget, Ui_Form): + + TITLE = _('TCR Input') + HELP = _('Options specific to')+' TCR '+_('input') + COMMIT_NAME = 'tcr_input' + ICON = I('mimetypes/txt.png') + + def __init__(self, parent, get_option, get_help, db=None, book_id=None): + Widget.__init__(self, parent, + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) + self.db, self.book_id = db, book_id + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) + self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 31019251e2..62672cc0f9 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -16,7 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['single_line_paras', 'print_formatted_paras', 'markdown', - 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git 
a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index 186783c277..6cbd68135f 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -6,7 +6,7 @@ 0 0 - 470 + 518 300 @@ -15,47 +15,23 @@ - + - Treat each &line as a paragraph + Paragraph style: - - + + + + + - Assume print formatting + Preserve &spaces - - - - Process using markdown - - - - - - - <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>. - - - true - - - true - - - - - - - Do not insert Table of Contents into output text when using markdown - - - - + Qt::Vertical @@ -68,32 +44,47 @@ - - + + + + + - Preserve &spaces + Formatting style: + + + + Markdown Options + + + + + + <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>. 
+ + + true + + + true + + + + + + + Do not insert Table of Contents into output text when using markdown + + + + + + - - - opt_markdown - toggled(bool) - opt_markdown_disable_toc - setEnabled(bool) - - - 76 - 80 - - - 418 - 105 - - - - + diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 9f30e0d83f..9a228bd4cf 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -22,7 +22,7 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', 'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references', - 'output_encoding']) + 'txt_output_encoding']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 6290a096c8..57fe702db7 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -96,10 +96,21 @@ - + + + true + + + + + EncodingComboBox + QComboBox +
widgets.h
+
+