diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3ff816b3bf..9a27274dd8 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -51,16 +51,16 @@ def chap_head(match): chap = match.group('chap') title = match.group('title') if not title: - return '
tags or equivalent (generally just plain text between #
tags), check and mark up line endings if required before proceeding if self.no_markup(html, 0.1): - self.log("not enough paragraph markers, adding now") - # check if content is in pre tags, use txt processor to mark up if so - pre = re.compile(r'', re.IGNORECASE) - if len(pre.findall(html)) == 1: - self.log("Running Text Processing") - from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ - separate_paragraphs_single_line - outerhtml = re.compile(r'.*?(?<=)(?P).*', re.IGNORECASE|re.DOTALL) - html = outerhtml.sub('\g.*)(?= ', html) - html = separate_paragraphs_single_line(html) - html = preserve_spaces(html) - html = convert_basic(html, epub_split_size_kb=0) - else: - # Add markup naively - # TODO - find out if there are cases where there are more than one tag or - # other types of unmarked html and handle them in some better fashion - add_markup = re.compile('(?)(\n)') - html = add_markup.sub('\n', html) + self.log("not enough paragraph markers, adding now") + # check if content is in pre tags, use txt processor to mark up if so + pre = re.compile(r'
', re.IGNORECASE) + if len(pre.findall(html)) == 1: + self.log("Running Text Processing") + from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ + separate_paragraphs_single_line + outerhtml = re.compile(r'.*?(?<=)(?P).*', re.IGNORECASE|re.DOTALL) + html = outerhtml.sub('\g.*)(?= ', html) + html = separate_paragraphs_single_line(html) + html = preserve_spaces(html) + html = convert_basic(html, epub_split_size_kb=0) + else: + # Add markup naively + # TODO - find out if there are cases where there are more than one tag or + # other types of unmarked html and handle them in some better fashion + add_markup = re.compile('(?)(\n)') + html = add_markup.sub('\n', html) ###### Mark Indents/Cleanup ###### # @@ -164,8 +164,8 @@ class PreProcessor(object): self.log("deleting blank lines") html = blankreg.sub('', html) elif float(len(blanklines)) / float(len(lines)) > 0.40: - blanks_between_paragraphs = True - #print "blanks between paragraphs is marked True" + blanks_between_paragraphs = True + #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False #self.dump(html, 'before_chapter_markup') diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index f6deab677a..4dd6e7c7ae 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -173,7 +173,7 @@ class FB2MLizer(object): if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': self.oeb_book.spine.insert(0, title_item, True) # Create xhtml page to reference cover image so it can be used. - if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: + if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: id = unicode(self.oeb_book.metadata.cover[0]) cover_item = self.oeb_book.manifest.ids[id] if cover_item.media_type in OEB_RASTER_IMAGES: diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 3688abff3f..1b665bf94e 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' import os -from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader from calibre.ebooks.conversion.utils import PreProcessor @@ -18,30 +18,6 @@ class PDBInput(InputFormatPlugin): description = 'Convert PDB to HTML' file_types = set(['pdb']) - options = set([ - OptionRecommendation(name='paragraph_type', recommended_value='auto', - choices=['auto', 'block', 'single', 'print'], - help=_('Paragraph structure.\n' - 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph type.\n' - '* block: Treat a blank line as a paragraph break.\n' - '* single: Assume every line is a paragraph.\n' - '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.')), - OptionRecommendation(name='formatting_type', recommended_value='auto', - choices=['auto', 'none', 'markdown'], - help=_('Formatting used within the document.' - '* auto: Try to auto detect the document formatting.\n' - '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' - '* markdown: Run the input though the markdown pre-processor. ' - 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), - OptionRecommendation(name='preserve_spaces', recommended_value=False, - help=_('Normally extra spaces are condensed into a single space. ' - 'With this option all spaces will be displayed.')), - OptionRecommendation(name="markdown_disable_toc", recommended_value=False, - help=_('Do not insert a Table of Contents into the output text.')), - ]) - def convert(self, stream, options, file_ext, log, accelerators): header = PdbHeaderReader(stream) @@ -60,4 +36,4 @@ class PDBInput(InputFormatPlugin): def preprocess_html(self, options, html): self.options = options preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) - return preprocessor(html) \ No newline at end of file + return preprocessor(html) diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index c151551866..30b0c4c57c 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -19,9 +19,6 @@ class Reader(FormatReader): self.stream = stream self.log = log self.options = options - setattr(self.options, 'new_pdf_engine', False) - setattr(self.options, 'no_images', False) - setattr(self.options, 'unwrap_factor', 0.45) def extract_content(self, output_dir): self.log.info('Extracting PDF...') @@ -31,7 +28,12 @@ class Reader(FormatReader): for x in xrange(self.header.section_count()): pdf.write(self.header.section_data(x)) - from calibre.customize.ui import plugin_for_input_format - pdf.seek(0) - return plugin_for_input_format('pdf').convert(pdf, self.options, - 'pdf', self.log, []) + from calibre.customize.ui import plugin_for_input_format + + pdf_plugin = plugin_for_input_format('pdf') + for option in pdf_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.name, option.recommended_value) + + pdf.seek(0) + return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {}) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index c1dcef235d..aac72da7a8 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' from cStringIO import StringIO -from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.compression.tcr import decompress class TCRInput(InputFormatPlugin): @@ -16,30 +16,6 @@ class TCRInput(InputFormatPlugin): description = 'Convert TCR files to HTML' file_types = set(['tcr']) - options = set([ - OptionRecommendation(name='paragraph_type', recommended_value='auto', - choices=['auto', 'block', 'single', 'print'], - help=_('Paragraph structure.\n' - 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph type.\n' - '* block: Treat a blank line as a paragraph break.\n' - '* single: Assume every line is a paragraph.\n' - '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.')), - OptionRecommendation(name='formatting_type', recommended_value='auto', - choices=['auto', 'none', 'markdown'], - help=_('Formatting used within the document.' - '* auto: Try to auto detect the document formatting.\n' - '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' - '* markdown: Run the input though the markdown pre-processor. ' - 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), - OptionRecommendation(name='preserve_spaces', recommended_value=False, - help=_('Normally extra spaces are condensed into a single space. ' - 'With this option all spaces will be displayed.')), - OptionRecommendation(name="markdown_disable_toc", recommended_value=False, - help=_('Do not insert a Table of Contents into the output text.')), - ]) - def convert(self, stream, options, file_ext, log, accelerators): log.info('Decompressing text...') raw_txt = decompress(stream) diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py deleted file mode 100644 index 16ff1ff236..0000000000 --- a/src/calibre/gui2/convert/pdb_input.py +++ /dev/null @@ -1,25 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL 3' -__copyright__ = '2011, John Schember
' -__docformat__ = 'restructuredtext en' - -from calibre.gui2.convert.txt_input_ui import Ui_Form -from calibre.gui2.convert import Widget - -class PluginWidget(Widget, Ui_Form): - - TITLE = _('PDB Input') - HELP = _('Options specific to')+' PDB '+_('input') - COMMIT_NAME = 'pdb_input' - ICON = I('mimetypes/txt.png') - - def __init__(self, parent, get_option, get_help, db=None, book_id=None): - Widget.__init__(self, parent, - ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) - self.db, self.book_id = db, book_id - for x in get_option('paragraph_type').option.choices: - self.opt_paragraph_type.addItem(x) - for x in get_option('formatting_type').option.choices: - self.opt_formatting_type.addItem(x) - self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py deleted file mode 100644 index 366643ad5b..0000000000 --- a/src/calibre/gui2/convert/tcr_input.py +++ /dev/null @@ -1,25 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' -__docformat__ = 'restructuredtext en' - -from calibre.gui2.convert.txt_input_ui import Ui_Form -from calibre.gui2.convert import Widget - -class PluginWidget(Widget, Ui_Form): - - TITLE = _('TCR Input') - HELP = _('Options specific to')+' TCR '+_('input') - COMMIT_NAME = 'tcr_input' - ICON = I('mimetypes/txt.png') - - def __init__(self, parent, get_option, get_help, db=None, book_id=None): - Widget.__init__(self, parent, - ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) - self.db, self.book_id = db, book_id - for x in get_option('paragraph_type').option.choices: - self.opt_paragraph_type.addItem(x) - for x in get_option('formatting_type').option.choices: - self.opt_formatting_type.addItem(x) - self.initialize_options(get_option, get_help, db, book_id)