From 38a82b049dade612732287cd15e9716b56b5f995 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 1 Jan 2011 21:39:57 -0500
Subject: [PATCH 01/10] GUI: Editable combo box with most common character
encodings instead of fully free form text entry. This still allows users to
specify encodings that are not part of the common list.
---
src/calibre/gui2/convert/__init__.py | 4 ++-
src/calibre/gui2/convert/look_and_feel.ui | 22 +++++++++----
src/calibre/gui2/convert/pdb_output.ui | 13 +++++++-
src/calibre/gui2/convert/pmlz_output.ui | 39 ++++++++++++++++-------
src/calibre/gui2/convert/txt_output.ui | 13 +++++++-
src/calibre/gui2/widgets.py | 26 +++++++++++++++
6 files changed, 95 insertions(+), 22 deletions(-)
diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py
index c1efe5b9af..6b977afc19 100644
--- a/src/calibre/gui2/convert/__init__.py
+++ b/src/calibre/gui2/convert/__init__.py
@@ -191,7 +191,9 @@ class Widget(QWidget):
if not val: val = ''
getattr(g, 'setPlainText', g.setText)(val)
getattr(g, 'setCursorPosition', lambda x: x)(0)
- elif isinstance(g, QComboBox) and val:
+ elif isinstance(g, QComboBox):
+ if not val:
+ val = ''
idx = g.findText(val, Qt.MatchFixedString)
if idx < 0:
g.addItem(val)
diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui
index 367233e2c0..cd0426ac53 100644
--- a/src/calibre/gui2/convert/look_and_feel.ui
+++ b/src/calibre/gui2/convert/look_and_feel.ui
@@ -84,7 +84,7 @@
...
-
+
:/images/wizard.png:/images/wizard.png
@@ -122,14 +122,8 @@
Input character &encoding:
-
- opt_input_encoding
-
- -
-
-
-
@@ -244,8 +238,22 @@
+ -
+
+
+ true
+
+
+
+
+
+ EncodingComboBox
+ QComboBox
+
+
+
diff --git a/src/calibre/gui2/convert/pdb_output.ui b/src/calibre/gui2/convert/pdb_output.ui
index 17bdc0a984..a571a0035b 100644
--- a/src/calibre/gui2/convert/pdb_output.ui
+++ b/src/calibre/gui2/convert/pdb_output.ui
@@ -55,10 +55,21 @@
-
-
+
+
+ true
+
+
+
+
+ EncodingComboBox
+ QComboBox
+
+
+
diff --git a/src/calibre/gui2/convert/pmlz_output.ui b/src/calibre/gui2/convert/pmlz_output.ui
index 9754752c8a..bd70cf1039 100644
--- a/src/calibre/gui2/convert/pmlz_output.ui
+++ b/src/calibre/gui2/convert/pmlz_output.ui
@@ -14,7 +14,7 @@
Form
- -
+
-
Qt::Vertical
@@ -27,32 +27,47 @@
- -
+
-
&Inline TOC
- -
+
-
Do not reduce image size and depth
- -
-
-
- Output Encoding:
-
-
-
- -
-
+
-
+
+
-
+
+
+ Output Encoding:
+
+
+
+ -
+
+
+ true
+
+
+
+
+
+
+ EncodingComboBox
+ QComboBox
+
+
+
diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui
index 6290a096c8..3a2516b98e 100644
--- a/src/calibre/gui2/convert/txt_output.ui
+++ b/src/calibre/gui2/convert/txt_output.ui
@@ -96,10 +96,21 @@
-
-
+
+
+ true
+
+
+
+
+ EncodingComboBox
+ QComboBox
+
+
+
diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
index bc3c23876f..cab2e2d4df 100644
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@@ -616,6 +616,32 @@ class ComboBoxWithHelp(QComboBox):
QComboBox.hidePopup(self)
self.set_state()
+
+class EncodingComboBox(QComboBox):
+ '''
+ A combobox that holds text encodings supported
+ by Python. This is only populated with the most
+ common and standard encodings. There is no good
+ way to programmatically list all supported encodings;
+ using encodings.aliases.aliases.keys() will
+ not work.
+ '''
+
+ ENCODINGS = ['', 'ascii', 'big5', 'cp1250', 'cp1251', 'cp1252', 'cp1253',
+ 'cp1254', 'cp1255', 'cp1256', 'euc_jp', 'euc_kr', 'gb2312', 'gb18030',
+ 'hz', 'iso2022_jp', 'iso2022_kr', 'iso8859_5', 'latin_1', 'shift_jis',
+ 'utf_8',
+ ]
+
+ def __init__(self, parent=None):
+ QComboBox.__init__(self, parent)
+ self.setEditable(True)
+ self.setLineEdit(EnLineEdit(self))
+
+ for item in self.ENCODINGS:
+ self.addItem(item)
+
+
class PythonHighlighter(QSyntaxHighlighter):
Rules = []
From 47aeaf10b67498bb8c8c4399abe0ab60f2d0401b Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 1 Jan 2011 23:03:58 -0500
Subject: [PATCH 02/10] TXT Input: Attempt to detect the input encoding when
not specified. TCR, PDB Input: Use TXT Input conversion plugin for conversion,
adds encoding detection and allows for all of TXT Input options to be used
(eReader PDB ignores options that do not apply to it).
---
src/calibre/ebooks/pdb/input.py | 14 +++++++--
src/calibre/ebooks/pdb/palmdoc/reader.py | 37 +++++++---------------
src/calibre/ebooks/pdb/ztxt/reader.py | 40 +++++++++---------------
src/calibre/ebooks/tcr/input.py | 35 +++++++++------------
src/calibre/ebooks/txt/input.py | 20 ++++++++----
5 files changed, 67 insertions(+), 79 deletions(-)
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index 6850c48b16..9edf381f1e 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -22,13 +22,23 @@ class PDBInput(InputFormatPlugin):
OptionRecommendation(name='single_line_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents '
- 'a paragraph instead.')),
+ 'a paragraph instead. This option is ignored by eReader format.')),
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line starting with '
'an indent (either a tab or 2+ spaces) represents a paragraph. '
'Paragraphs end when the next line that starts with an indent '
- 'is reached.')),
+ 'is reached. This option is ignored by eReader format.')),
+ OptionRecommendation(name='preserve_spaces', recommended_value=False,
+ help=_('Normally extra spaces are condensed into a single space. '
+ 'With this option all spaces will be displayed. This option '
+ 'is ignored by eReader format.')),
+ OptionRecommendation(name='markdown', recommended_value=False,
+ help=_('Run the text input through the markdown pre-processor. To '
+ 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
+ OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
+ help=_('Do not insert a Table of Contents into the output text. '
+ 'This option is ignored by eReader format.')),
])
def convert(self, stream, options, file_ext, log,
diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py
index 52b8d1361f..f1f00ea8e3 100644
--- a/src/calibre/ebooks/pdb/palmdoc/reader.py
+++ b/src/calibre/ebooks/pdb/palmdoc/reader.py
@@ -11,9 +11,9 @@ __docformat__ = 'restructuredtext en'
import os
import struct
+from cStringIO import StringIO
+
from calibre.ebooks.pdb.formatreader import FormatReader
-from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
- separate_paragraphs_single_line, separate_paragraphs_print_formatted
class HeaderRecord(object):
'''
@@ -33,9 +33,7 @@ class Reader(FormatReader):
def __init__(self, header, stream, log, options):
self.stream = stream
self.log = log
- self.encoding = options.input_encoding
- self.single_line_paras = options.single_line_paras
- self.print_formatted_paras = options.print_formatted_paras
+ self.options = options
self.sections = []
for i in range(header.num_sections):
@@ -48,34 +46,23 @@ class Reader(FormatReader):
def decompress_text(self, number):
if self.header_record.compression == 1:
- return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding)
+ return self.section_data(number)
if self.header_record.compression == 2 or self.header_record.compression == 258:
from calibre.ebooks.compression.palmdoc import decompress_doc
- return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
+ return decompress_doc(self.section_data(number))
return ''
def extract_content(self, output_dir):
- txt = ''
+ raw_txt = ''
self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i)
- txt += self.decompress_text(i)
+ raw_txt += self.decompress_text(i)
self.log.info('Converting text to OEB...')
- if self.single_line_paras:
- txt = separate_paragraphs_single_line(txt)
- if self.print_formatted_paras:
- txt = separate_paragraphs_print_formatted(txt)
- html = convert_basic(txt)
- with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
- index.write(html.encode('utf-8'))
-
- from calibre.ebooks.metadata.meta import get_metadata
- mi = get_metadata(self.stream, 'pdb')
- manifest = [('index.html', None)]
- spine = ['index.html']
- opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
-
- return os.path.join(output_dir, 'metadata.opf')
-
+ stream = StringIO(raw_txt)
+ from calibre.customize.ui import plugin_for_input_format
+ stream.seek(0)
+ return plugin_for_input_format('txt').convert(stream, self.options,
+ 'txt', self.log, {})
diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py
index 5cac283264..7e51dae1fd 100644
--- a/src/calibre/ebooks/pdb/ztxt/reader.py
+++ b/src/calibre/ebooks/pdb/ztxt/reader.py
@@ -8,12 +8,13 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember '
__docformat__ = 'restructuredtext en'
-import os, struct, zlib
+import struct
+import zlib
+
+from cStringIO import StringIO
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ztxt import zTXTError
-from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
- separate_paragraphs_single_line, separate_paragraphs_print_formatted
SUPPORTED_VERSION = (1, 40)
@@ -38,9 +39,7 @@ class Reader(FormatReader):
def __init__(self, header, stream, log, options):
self.stream = stream
self.log = log
- self.encoding = options.input_encoding
- self.single_line_paras = options.single_line_paras
- self.print_formatted_paras = options.print_formatted_paras
+ self.options = options
self.sections = []
for i in range(header.num_sections):
@@ -68,30 +67,19 @@ class Reader(FormatReader):
def decompress_text(self, number):
if number == 1:
self.uncompressor = zlib.decompressobj()
- return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
+ return self.uncompressor.decompress(self.section_data(number))
def extract_content(self, output_dir):
- txt = ''
+ raw_txt = ''
self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i)
- txt += self.decompress_text(i)
-
+ raw_txt += self.decompress_text(i)
+
self.log.info('Converting text to OEB...')
- if self.single_line_paras:
- txt = separate_paragraphs_single_line(txt)
- if self.print_formatted_paras:
- txt = separate_paragraphs_print_formatted(txt)
- html = convert_basic(txt)
- with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
- index.write(html.encode('utf-8'))
-
- from calibre.ebooks.metadata.meta import get_metadata
- mi = get_metadata(self.stream, 'pdb')
- manifest = [('index.html', None)]
- spine = ['index.html']
- opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
-
- return os.path.join(output_dir, 'metadata.opf')
-
+ stream = StringIO(raw_txt)
+ from calibre.customize.ui import plugin_for_input_format
+ stream.seek(0)
+ return plugin_for_input_format('txt').convert(stream, self.options,
+ 'txt', self.log, {})
diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py
index 67fa6ac66e..47154988a0 100644
--- a/src/calibre/ebooks/tcr/input.py
+++ b/src/calibre/ebooks/tcr/input.py
@@ -4,11 +4,9 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember '
__docformat__ = 'restructuredtext en'
-import os
+from cStringIO import StringIO
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
-from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
- separate_paragraphs_single_line, separate_paragraphs_print_formatted
from calibre.ebooks.compression.tcr import decompress
class TCRInput(InputFormatPlugin):
@@ -29,26 +27,23 @@ class TCRInput(InputFormatPlugin):
'an indent (either a tab or 2+ spaces) represents a paragraph. '
'Paragraphs end when the next line that starts with an indent '
'is reached.')),
+ OptionRecommendation(name='preserve_spaces', recommended_value=False,
+ help=_('Normally extra spaces are condensed into a single space. '
+ 'With this option all spaces will be displayed.')),
+ OptionRecommendation(name='markdown', recommended_value=False,
+ help=_('Run the text input through the markdown pre-processor. To '
+ 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
+ OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
+ help=_('Do not insert a Table of Contents into the output text.')),
])
def convert(self, stream, options, file_ext, log, accelerators):
log.info('Decompressing text...')
- ienc = options.input_encoding if options.input_encoding else 'utf-8'
- txt = decompress(stream).decode(ienc, 'replace')
+ raw_txt = decompress(stream)
log.info('Converting text to OEB...')
- if options.single_line_paras:
- txt = separate_paragraphs_single_line(txt)
- if options.print_formatted_paras:
- txt = separate_paragraphs_print_formatted(txt)
- html = convert_basic(txt)
- with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index:
- index.write(html.encode('utf-8'))
-
- from calibre.ebooks.metadata.meta import get_metadata
- mi = get_metadata(stream, 'tcr')
- manifest = [('index.html', None)]
- spine = ['index.html']
- opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi)
-
- return os.path.join(os.getcwd(), 'metadata.opf')
+ stream = StringIO(raw_txt)
+ from calibre.customize.ui import plugin_for_input_format
+ stream.seek(0)
+ return plugin_for_input_format('txt').convert(stream, options,
+ 'txt', log, accelerators)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 44b98304ea..1a732535b3 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces
@@ -42,11 +43,19 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
- ienc = stream.encoding if stream.encoding else 'utf-8'
+ log.debug('Reading text from file...')
+
+ txt = stream.read()
if options.input_encoding:
ienc = options.input_encoding
- log.debug('Reading text from file...')
- txt = stream.read().decode(ienc, 'replace')
+ log.debug('Using user specified input encoding of %s' % ienc)
+ else:
+ ienc = detect(txt)['encoding']
+ log.debug('Detected input encoding as %s' % ienc)
+ if not ienc:
+ ienc = 'utf-8'
+ log.debug('No input encoding specified and could not auto detect using %s' % ienc)
+ txt = txt.decode(ienc, 'replace')
# Adjust paragraph formatting as requested
if options.single_line_paras:
@@ -85,11 +94,10 @@ class TXTInput(InputFormatPlugin):
htmlfile = open(fname, 'wb')
with htmlfile:
htmlfile.write(html.encode('utf-8'))
- cwd = os.getcwdu()
odi = options.debug_pipeline
options.debug_pipeline = None
- oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log,
- {}, cwd)
+ oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
+ {})
options.debug_pipeline = odi
os.remove(htmlfile.name)
return oeb
From 089d3679420b087c09dce06b3ea80ac1faf194c0 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sun, 2 Jan 2011 09:59:41 -0500
Subject: [PATCH 03/10] PDF Output: Change call to get_printer to correct
get_pdf_printer.
---
src/calibre/ebooks/pdf/writer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py
index 4ff10290c9..8938dd66c1 100644
--- a/src/calibre/ebooks/pdf/writer.py
+++ b/src/calibre/ebooks/pdf/writer.py
@@ -175,7 +175,7 @@ class PDFWriter(QObject): # {{{
if self.cover_data is None:
return
item_path = os.path.join(self.tmp_path, 'cover.pdf')
- printer = self.get_printer()
+ printer = self.get_pdf_printer()
printer.setOutputFileName(item_path)
self.combine_queue.insert(0, item_path)
p = QPixmap()
From d9195c0632ac823e0e581e417596d1d2039aef9d Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sun, 2 Jan 2011 17:32:16 -0500
Subject: [PATCH 04/10] TXT Input: Add confidence of detected encoding to debug
log.
---
src/calibre/ebooks/txt/input.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 1a732535b3..5e406216d6 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -50,8 +50,9 @@ class TXTInput(InputFormatPlugin):
ienc = options.input_encoding
log.debug('Using user specified input encoding of %s' % ienc)
else:
- ienc = detect(txt)['encoding']
- log.debug('Detected input encoding as %s' % ienc)
+ det_encoding = detect(txt)
+ ienc = det_encoding['encoding']
+ log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100))
if not ienc:
ienc = 'utf-8'
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
From 9ec91639197e2e1dec38525984787b317c0296c9 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sun, 2 Jan 2011 19:05:35 -0500
Subject: [PATCH 05/10] TXT Input: Auto detect paragraph structure.
---
src/calibre/ebooks/pdb/input.py | 30 ++++++++---------
src/calibre/ebooks/tcr/input.py | 24 +++++++-------
src/calibre/ebooks/txt/input.py | 51 ++++++++++++++++++-----------
src/calibre/ebooks/txt/processor.py | 50 +++++++++++++++++++++++++++-
4 files changed, 104 insertions(+), 51 deletions(-)
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index 9edf381f1e..b8b4b93ca1 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin):
file_types = set(['pdb'])
options = set([
- OptionRecommendation(name='single_line_paras', recommended_value=False,
- help=_('Normally calibre treats blank lines as paragraph markers. '
- 'With this option it will assume that every line represents '
- 'a paragraph instead. This option is ignored by eReader format.')),
- OptionRecommendation(name='print_formatted_paras', recommended_value=False,
- help=_('Normally calibre treats blank lines as paragraph markers. '
- 'With this option it will assume that every line starting with '
- 'an indent (either a tab or 2+ spaces) represents a paragraph. '
- 'Paragraphs end when the next line that starts with an indent '
- 'is reached. This option is ignored by eReader format.')),
+ OptionRecommendation(name='paragraph_format', recommended_value='auto',
+ choices=['auto', 'block', 'single', 'print', 'markdown'],
+ help=_('How calibre splits text into paragraphs.\n'
+ 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+ '* auto: Try to auto detect paragraph format.\n'
+ '* block: Treat a blank line as a paragraph break.\n'
+ '* single: Assume every line is a paragraph.\n'
+ '* print: Assume every line starting with 2+ spaces or a tab '
+ 'starts a paragraph.\n'
+ '* markdown: Run the input though the markdown pre-processor. '
+ 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
- 'With this option all spaces will be displayed. This option '
- 'is ignored by eReader format.')),
- OptionRecommendation(name='markdown', recommended_value=False,
- help=_('Run the text input through the markdown pre-processor. To '
- 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
+ 'With this option all spaces will be displayed.')),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
- help=_('Do not insert a Table of Contents into the output text. '
- 'This option is ignored by eReader format.')),
+ help=_('Do not insert a Table of Contents into the output text.')),
])
def convert(self, stream, options, file_ext, log,
diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py
index 47154988a0..47fe7e7337 100644
--- a/src/calibre/ebooks/tcr/input.py
+++ b/src/calibre/ebooks/tcr/input.py
@@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin):
file_types = set(['tcr'])
options = set([
- OptionRecommendation(name='single_line_paras', recommended_value=False,
- help=_('Normally calibre treats blank lines as paragraph markers. '
- 'With this option it will assume that every line represents '
- 'a paragraph instead.')),
- OptionRecommendation(name='print_formatted_paras', recommended_value=False,
- help=_('Normally calibre treats blank lines as paragraph markers. '
- 'With this option it will assume that every line starting with '
- 'an indent (either a tab or 2+ spaces) represents a paragraph. '
- 'Paragraphs end when the next line that starts with an indent '
- 'is reached.')),
+ OptionRecommendation(name='paragraph_format', recommended_value='auto',
+ choices=['auto', 'block', 'single', 'print', 'markdown'],
+ help=_('How calibre splits text into paragraphs.\n'
+ 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+ '* auto: Try to auto detect paragraph format.\n'
+ '* block: Treat a blank line as a paragraph break.\n'
+ '* single: Assume every line is a paragraph.\n'
+ '* print: Assume every line starting with 2+ spaces or a tab '
+ 'starts a paragraph.\n'
+ '* markdown: Run the input though the markdown pre-processor. '
+ 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
- OptionRecommendation(name='markdown', recommended_value=False,
- help=_('Run the text input through the markdown pre-processor. To '
- 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')),
])
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5e406216d6..e68c47e9b3 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
- preserve_spaces
+ preserve_spaces, detect_paragraph_formatting
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin):
file_types = set(['txt'])
options = set([
- OptionRecommendation(name='single_line_paras', recommended_value=False,
- help=_('Normally calibre treats blank lines as paragraph markers. '
- 'With this option it will assume that every line represents '
- 'a paragraph instead.')),
- OptionRecommendation(name='print_formatted_paras', recommended_value=False,
- help=_('Normally calibre treats blank lines as paragraph markers. '
- 'With this option it will assume that every line starting with '
- 'an indent (either a tab or 2+ spaces) represents a paragraph. '
- 'Paragraphs end when the next line that starts with an indent '
- 'is reached.')),
+ OptionRecommendation(name='paragraph_format', recommended_value='auto',
+ choices=['auto', 'block', 'single', 'print', 'markdown'],
+ help=_('How calibre splits text into paragraphs.\n'
+ 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+ '* auto: Try to auto detect paragraph format.\n'
+ '* block: Treat a blank line as a paragraph break.\n'
+ '* single: Assume every line is a paragraph.\n'
+ '* print: Assume every line starting with 2+ spaces or a tab '
+ 'starts a paragraph.\n'
+ '* markdown: Run the input though the markdown pre-processor. '
+ 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
- OptionRecommendation(name='markdown', recommended_value=False,
- help=_('Run the text input through the markdown pre-processor. To '
- 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')),
])
@@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin):
log.debug('Reading text from file...')
txt = stream.read()
+ # Get the encoding of the document.
if options.input_encoding:
ienc = options.input_encoding
log.debug('Using user specified input encoding of %s' % ienc)
@@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin):
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
txt = txt.decode(ienc, 'replace')
- # Adjust paragraph formatting as requested
- if options.single_line_paras:
+ # Determine the formatting of the document.
+ if options.paragraph_format == 'auto':
+ options.paragraph_format = detect_paragraph_formatting(txt)
+ if options.paragraph_format == 'unknown':
+ log.debug('Could not reliably determine paragraph format using block format')
+ options.paragraph_format = 'block'
+ else:
+ log.debug('Auto detected paragraph format as %s' % options.paragraph_format)
+
+ # We don't check for block because the processor assumes block.
+ # single and print are transformed to block for processing.
+ if options.paragraph_format == 'single':
txt = separate_paragraphs_single_line(txt)
- if options.print_formatted_paras:
+ elif options.paragraph_format == 'print':
txt = separate_paragraphs_print_formatted(txt)
+
+ txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+ # Preserve spaces will replace multiple spaces with a space
+ # followed by the entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
- txt = _ent_pat.sub(xml_entity_to_unicode, txt)
-
- if options.markdown:
+ if options.paragraph_format == 'markdown':
log.debug('Running text though markdown conversion...')
try:
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index dac1e34df7..e1014b0c7b 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
if isbytestring(txt):
txt = txt.decode('utf-8')
-
lines = []
# Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'):
@@ -94,3 +93,52 @@ def split_string_separator(txt, size) :
xrange(0, len(txt), size)])
return txt
+def detect_paragraph_formatting(txt):
+ '''
+ Tries to determine the formatting of the document.
+
+ block: Paragraphs are separated by a blank line.
+ single: Each line is a paragraph.
+ print: Each paragraph starts with 2+ spaces or a tab
+ and ends when a new paragraph is reached.
+ markdown: Markdown formatting is in the document.
+
+ returns block, single, print, markdown
+ '''
+ txt = txt.replace('\r\n', '\n')
+ txt = txt.replace('\r', '\n')
+ txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
+
+ # Check for markdown
+ # Headings
+ if len(re.findall('(?mu)^#+', txt)) >= 5:
+ return 'markdown'
+ if len(re.findall('(?mu)^=+$', txt)) >= 5:
+ return 'markdown'
+ if len(re.findall('(?mu)^-+$', txt)) >= 5:
+ return 'markdown'
+ # Images
+ if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5:
+ return 'markdown'
+ # Links
+ if len(re.findall('(?u)(^|(?P[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
+ return 'markdown'
+ # Escaped characters
+ md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
+ for c in md_escapted_characters:
+ if txt.count('\\'+c) > 10:
+ return 'markdown'
+
+ # Check for print
+ tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
+ if tab_line_count / float(txt_line_count) >= .25:
+ return 'print'
+
+ # Check for block
+ empty_line_count = len(re.findall('(?mu)^\s*$', txt))
+ if empty_line_count / float(txt_line_count) >= .25:
+ return 'block'
+
+ # Nothing else matched, so assume single.
+ return 'single'
+
From 521e41973aa09d00bf3a495507b03a21e4257165 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sun, 2 Jan 2011 19:18:52 -0500
Subject: [PATCH 06/10] GUI: TXT, TCR, PDB Inputs gui conversion options
updated.
---
src/calibre/gui2/convert/pdb_input.py | 10 +++--
src/calibre/gui2/convert/pdb_input.ui | 48 ---------------------
src/calibre/gui2/convert/tcr_input.py | 23 ++++++++++
src/calibre/gui2/convert/txt_input.py | 5 ++-
src/calibre/gui2/convert/txt_input.ui | 60 +++++++--------------------
5 files changed, 48 insertions(+), 98 deletions(-)
delete mode 100644 src/calibre/gui2/convert/pdb_input.ui
create mode 100644 src/calibre/gui2/convert/tcr_input.py
diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py
index 4510cf81ba..655f4025a7 100644
--- a/src/calibre/gui2/convert/pdb_input.py
+++ b/src/calibre/gui2/convert/pdb_input.py
@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
-__copyright__ = '2009, John Schember '
+__copyright__ = '2011, John Schember '
__docformat__ = 'restructuredtext en'
-from calibre.gui2.convert.pdb_input_ui import Ui_Form
+from calibre.gui2.convert.txt_input_ui import Ui_Form
from calibre.gui2.convert import Widget
class PluginWidget(Widget, Ui_Form):
@@ -12,10 +12,12 @@ class PluginWidget(Widget, Ui_Form):
TITLE = _('PDB Input')
HELP = _('Options specific to')+' PDB '+_('input')
COMMIT_NAME = 'pdb_input'
- ICON = I('mimetypes/unknown.png')
+ ICON = I('mimetypes/txt.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
- ['single_line_paras', 'print_formatted_paras'])
+ ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id
+ for x in get_option('paragraph_format').option.choices:
+ self.opt_paragraph_format.addItem(x)
self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/pdb_input.ui b/src/calibre/gui2/convert/pdb_input.ui
deleted file mode 100644
index 2b632b1a33..0000000000
--- a/src/calibre/gui2/convert/pdb_input.ui
+++ /dev/null
@@ -1,48 +0,0 @@
-
-
- Form
-
-
-
- 0
- 0
- 400
- 300
-
-
-
- Form
-
-
- -
-
-
- Qt::Vertical
-
-
-
- 20
- 213
-
-
-
-
- -
-
-
- Treat each &line as a paragraph
-
-
-
- -
-
-
- Assume print formatting
-
-
-
-
-
-
-
-
diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py
new file mode 100644
index 0000000000..2aa877ce4d
--- /dev/null
+++ b/src/calibre/gui2/convert/tcr_input.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember '
+__docformat__ = 'restructuredtext en'
+
+from calibre.gui2.convert.txt_input_ui import Ui_Form
+from calibre.gui2.convert import Widget
+
+class PluginWidget(Widget, Ui_Form):
+
+ TITLE = _('TCR Input')
+ HELP = _('Options specific to')+' TCR '+_('input')
+ COMMIT_NAME = 'tcr_input'
+ ICON = I('mimetypes/txt.png')
+
+ def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+ Widget.__init__(self, parent,
+ ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
+ self.db, self.book_id = db, book_id
+ for x in get_option('paragraph_format').option.choices:
+ self.opt_paragraph_format.addItem(x)
+ self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py
index 31019251e2..99d04fe2f4 100644
--- a/src/calibre/gui2/convert/txt_input.py
+++ b/src/calibre/gui2/convert/txt_input.py
@@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
- ['single_line_paras', 'print_formatted_paras', 'markdown',
- 'markdown_disable_toc', 'preserve_spaces'])
+ ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id
+ for x in get_option('paragraph_format').option.choices:
+ self.opt_paragraph_format.addItem(x)
self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui
index 186783c277..b45297fdf2 100644
--- a/src/calibre/gui2/convert/txt_input.ui
+++ b/src/calibre/gui2/convert/txt_input.ui
@@ -6,7 +6,7 @@
0
0
- 470
+ 488
300
@@ -15,27 +15,16 @@
-
-
+
- Treat each &line as a paragraph
+ Document structure detection
- -
-
-
- Assume print formatting
-
-
+
-
+
- -
-
-
- Process using markdown
-
-
-
- -
+
-
<p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.
@@ -48,14 +37,21 @@
- -
+
-
Do not insert Table of Contents into output text when using markdown
- -
+
-
+
+
+ Preserve &spaces
+
+
+
+ -
Qt::Vertical
@@ -68,32 +64,8 @@
- -
-
-
- Preserve &spaces
-
-
-
-
-
- opt_markdown
- toggled(bool)
- opt_markdown_disable_toc
- setEnabled(bool)
-
-
- 76
- 80
-
-
- 418
- 105
-
-
-
-
+
From 2427c5bdd01d9c94abd3e887dd9d1cfcc3e2f5fc Mon Sep 17 00:00:00 2001
From: John Schember
Date: Mon, 3 Jan 2011 20:53:41 -0500
Subject: [PATCH 07/10] FB2 Output: Fix bug #8172, Include cover page in output
when it is not referenced in the oeb spine.
---
src/calibre/ebooks/fb2/fb2ml.py | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index f9ce9befb4..8d23a5f0b2 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -161,6 +161,17 @@ class FB2MLizer(object):
text.append('')
self.section_level += 1
+ # Insert the title page / cover into the spine if it is not already referenced.
+ title_name = u''
+ if 'titlepage' in self.oeb_book.guide:
+ title_name = 'titlepage'
+ elif 'cover' in self.oeb_book.guide:
+ title_name = 'cover'
+ if title_name:
+ title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
+ if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
+ self.oeb_book.spine.insert(0, title_item, True)
+
for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
From d23ce51b98629014b0d4ba899b89d74d9ba51812 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Wed, 5 Jan 2011 18:30:50 -0500
Subject: [PATCH 08/10] FB2 Output: Insert image based covers into document.
---
src/calibre/ebooks/fb2/fb2ml.py | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index 8d23a5f0b2..f6deab677a 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -16,6 +16,7 @@ import uuid
from lxml import etree
+from calibre import guess_type
from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
@@ -171,6 +172,12 @@ class FB2MLizer(object):
title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
self.oeb_book.spine.insert(0, title_item, True)
+ # Create xhtml page to reference cover image so it can be used.
+ if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
+ id = unicode(self.oeb_book.metadata.cover[0])
+ cover_item = self.oeb_book.manifest.ids[id]
+ if cover_item.media_type in OEB_RASTER_IMAGES:
+ self.insert_image_cover(cover_item.href)
for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
@@ -196,6 +203,17 @@ class FB2MLizer(object):
return ''.join(text) + '
