diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e0367515bc..32c512fe39 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -325,6 +325,17 @@ class TXTMetadataReader(MetadataReaderPlugin): from calibre.ebooks.metadata.txt import get_metadata return get_metadata(stream) +class TXTZMetadataReader(MetadataReaderPlugin): + + name = 'Read TXTZ metadata' + file_types = set(['txtz']) + description = _('Read metadata from %s files') % 'TXTZ' + author = 'John Schember' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.txtz import get_metadata + return get_metadata(stream) + class ZipMetadataReader(MetadataReaderPlugin): name = 'Read ZIP metadata' @@ -412,6 +423,17 @@ class TOPAZMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.metadata.topaz import set_metadata set_metadata(stream, mi) +class TXTZMetadataWriter(MetadataWriterPlugin): + + name = 'Set TXTZ metadata' + file_types = set(['txtz']) + description = _('Set metadata from %s files') % 'TXTZ' + author = 'John Schember' + + def set_metadata(self, stream, mi, type): + from calibre.ebooks.metadata.txtz import set_metadata + set_metadata(stream, mi) + # }}} from calibre.ebooks.comic.input import ComicInput @@ -446,6 +468,7 @@ from calibre.ebooks.rb.output import RBOutput from calibre.ebooks.rtf.output import RTFOutput from calibre.ebooks.tcr.output import TCROutput from calibre.ebooks.txt.output import TXTOutput +from calibre.ebooks.txt.output import TXTZOutput from calibre.ebooks.html.output import HTMLOutput from calibre.ebooks.snb.output import SNBOutput @@ -531,6 +554,7 @@ plugins += [ RTFOutput, TCROutput, TXTOutput, + TXTZOutput, HTMLOutput, SNBOutput, ] diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index 4dc97f43ed..49604ae682 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -25,7 +25,7 @@ class DRMError(ValueError): class 
ParserError(ValueError): pass -BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm', +BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'htm', 'xhtm', 'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc', 'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip', 'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb'] diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py index 79713774e3..70d3c72ae0 100644 --- a/src/calibre/ebooks/metadata/txt.py +++ b/src/calibre/ebooks/metadata/txt.py @@ -1,16 +1,20 @@ -'''Read meta information from TXT files''' - -from __future__ import with_statement +# -*- coding: utf-8 -*- __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' +''' +Read meta information from TXT files +''' + import re from calibre.ebooks.metadata import MetaInformation def get_metadata(stream, extract_cover=True): - """ Return metadata as a L{MetaInfo} object """ + ''' + Return metadata as a L{MetaInfo} object + ''' mi = MetaInformation(_('Unknown'), [_('Unknown')]) stream.seek(0) diff --git a/src/calibre/ebooks/metadata/txtz.py b/src/calibre/ebooks/metadata/txtz.py new file mode 100644 index 0000000000..ae6efb4838 --- /dev/null +++ b/src/calibre/ebooks/metadata/txtz.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2011, John Schember ' + +''' +Read meta information from TXT files +''' + +import os + +from cStringIO import StringIO + +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf +from calibre.ptempfile import TemporaryDirectory +from calibre.utils.zipfile import ZipFile, safe_replace + +def get_metadata(stream, extract_cover=True): + ''' + Return metadata as a L{MetaInfo} object + ''' + mi = MetaInformation(_('Unknown'), [_('Unknown')]) + stream.seek(0) + + with TemporaryDirectory('_untxtz_mdata') as tdir: + try: + zf = ZipFile(stream) + 
zf.extract('metadata.opf', tdir) + with open(os.path.join(tdir, 'metadata.opf'), 'rb') as opff: + mi = OPF(opff).to_book_metadata() + except: + return mi + return mi + +def set_metadata(stream, mi): + opf = StringIO(metadata_to_opf(mi)) + safe_replace(stream, 'metadata.opf', opf) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e240205222..8ab1524b02 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -4,23 +4,27 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' +import glob import os +from calibre import _ent_pat, xml_entity_to_unicode from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - normalize_line_endings, convert_textile, remove_indents, block_to_single_line -from calibre import _ent_pat, xml_entity_to_unicode + normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \ + separate_hard_scene_breaks +from calibre.ptempfile import TemporaryDirectory +from calibre.utils.zipfile import ZipFile class TXTInput(InputFormatPlugin): name = 'TXT Input' author = 'John Schember' description = 'Convert TXT files to HTML' - file_types = set(['txt']) + file_types = set(['txt', 'txtz']) options = set([ OptionRecommendation(name='paragraph_type', recommended_value='auto', @@ -57,10 +61,23 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): self.log = log + txt = '' log.debug('Reading text from file...') length = 0 - txt = stream.read() + # Extract content from zip archive. 
+ if file_ext == 'txtz': + log.debug('De-compressing content to temporary directory...') + with TemporaryDirectory('_untxtz') as tdir: + zf = ZipFile(stream) + zf.extractall(tdir) + + txts = glob.glob(os.path.join(tdir, '*.txt')) + for t in txts: + with open(t, 'rb') as tf: + txt += tf.read() + else: + txt = stream.read() # Get the encoding of the document. if options.input_encoding: @@ -98,6 +115,7 @@ class TXTInput(InputFormatPlugin): if options.formatting_type == 'heuristic': setattr(options, 'enable_heuristics', True) setattr(options, 'unwrap_lines', False) + setattr(options, 'smarten_punctuation', True) # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. @@ -105,6 +123,7 @@ class TXTInput(InputFormatPlugin): if options.paragraph_type == 'single': txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': + txt = separate_hard_scene_breaks(txt) txt = separate_paragraphs_print_formatted(txt) txt = block_to_single_line(txt) elif options.paragraph_type == 'unformatted': @@ -116,6 +135,7 @@ class TXTInput(InputFormatPlugin): txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = separate_paragraphs_single_line(txt) else: + txt = separate_hard_scene_breaks(txt) txt = block_to_single_line(txt) if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): @@ -175,4 +195,11 @@ class TXTInput(InputFormatPlugin): {}) options.debug_pipeline = odi os.remove(htmlfile.name) + + # Set metadata from file. 
+ from calibre.customize.ui import get_file_type_metadata + from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata + mi = get_file_type_metadata(stream, file_ext) + meta_info_to_oeb_metadata(mi, oeb.metadata, log) + return oeb diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 116561f355..c179378049 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -35,11 +35,9 @@ class MarkdownMLizer(object): html = unicode(etree.tostring(item.data, encoding=unicode)) if not self.opts.keep_links: - html = re.sub(r'<\s*a[^>]*>', '', html) - html = re.sub(r'<\s*/\s*a\s*>', '', html) + html = re.sub(r'<\s*/*\s*a[^>]*>', '', html) if not self.opts.keep_image_references: - html = re.sub(r'<\s*img[^>]*>', '', html) - html = re.sub(r'<\s*img\s*>', '', html) + html = re.sub(r'<\s*img[^>]*>', '', html)\ text = html2text(html) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index b73a6e8908..d021cbbba6 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -5,11 +5,18 @@ __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' import os +import shutil + +from lxml import etree from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation +from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.txt.txtml import TXTMLizer from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines +from calibre.ptempfile import TemporaryDirectory, TemporaryFile +from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.zipfile import ZipFile class TXTOutput(OutputFormatPlugin): @@ -73,6 +80,7 @@ class TXTOutput(OutputFormatPlugin): writer = TXTMLizer(log) txt = writer.extract_content(oeb_book, opts) + txt = clean_ascii_chars(txt) log.debug('\tReplacing newlines with selected type...') txt = specified_newlines(TxtNewlines(opts.newline).newline, txt) @@ 
-93,3 +101,32 @@ class TXTOutput(OutputFormatPlugin): if close: out_stream.close() + +class TXTZOutput(TXTOutput): + + name = 'TXTZ Output' + author = 'John Schember' + file_type = 'txtz' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + with TemporaryDirectory('_txtz_output') as tdir: + # TXT + with TemporaryFile('index.txt') as tf: + TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log) + shutil.copy(tf, os.path.join(tdir, 'index.txt')) + + # Images + for item in oeb_book.manifest: + if item.media_type in OEB_IMAGES: + path = os.path.join(tdir, os.path.dirname(item.href)) + if not os.path.exists(path): + os.makedirs(path) + with open(os.path.join(tdir, item.href), 'wb') as imgf: + imgf.write(item.data) + + # Metadata + with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf: + mdataf.write(etree.tostring(oeb_book.metadata.to_opf1())) + + txtz = ZipFile(output_path, 'w') + txtz.add_dir(tdir) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index f7b6cce234..55213381c9 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -29,8 +29,7 @@ def clean_txt(txt): txt = '\n'.join([line.rstrip() for line in txt.splitlines()]) # Replace whitespace at the beginning of the line with   - txt = re.sub('(?m)(?P^[ ]+)(?=.)', lambda mo: ' ' * mo.groups('space').count(' '), txt) - txt = re.sub('(?m)(?P^[\t]+)(?=.)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt) + txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt) # Condense redundant spaces txt = re.sub('[ ]{2,}', ' ', txt) @@ -121,6 +120,15 @@ def separate_paragraphs_print_formatted(txt): txt = re.sub(u'(?miu)^(?P\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt) return txt +def separate_hard_scene_breaks(txt): + def sep_break(line): + if len(line.strip()) > 0: + return '\n%s\n' % line + else: + return line + txt = re.sub(u'(?miu)^[ \t-=~\/]+$', lambda mo: sep_break(mo.group()), txt) 
+ return txt + def block_to_single_line(txt): txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt) return txt @@ -221,9 +229,9 @@ def detect_formatting_type(txt): markdown_count += len(re.findall('(?mu)^=+$', txt)) markdown_count += len(re.findall('(?mu)^-+$', txt)) # Images - markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) + markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt)) # Links - markdown_count += len(re.findall('(?u)(^|(?P
[^!]))\[.*?\]\([^)]+\)', txt))
+    markdown_count += len(re.findall('(?u)(?:^|[^!])\[.*?\](?:\[|\()', txt))
 
     # Check for textile
     # Headings
@@ -231,9 +239,9 @@ def detect_formatting_type(txt):
     # Block quote.
     textile_count += len(re.findall(r'(?mu)^bq\.', txt))
     # Images
-    textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt))
+    textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
     # Links
-    textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
+    textile_count += len(re.findall(r'"[^"]*":\S+', txt))
 
     # Decide if either markdown or textile is used in the text
     # based on the number of unique formatting elements found.
diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py
index 94834d8e79..284e4846d9 100644
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@@ -36,13 +36,12 @@ class TextileMLizer(object):
             html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
 
             if not self.opts.keep_links:
-                html = re.sub(r'<\s*a[^>]*>', '', html)
-                html = re.sub(r'<\s*/\s*a\s*>', '', html)
+                html = re.sub(r'<\s*/?\s*a\b[^>]*>', '', html)
             if not self.opts.keep_image_references:
                 html = re.sub(r'<\s*img[^>]*>', '', html)
-                html = re.sub(r'<\s*img\s*>', '', html)
 
             text = html2textile(html)
+            text = text.replace('%', '')
 
             # Ensure the section ends with at least two new line characters.
             # This is to prevent the last paragraph from a section being
diff --git a/src/calibre/gui2/add.py b/src/calibre/gui2/add.py
index 086f40feee..f40cf0ff75 100644
--- a/src/calibre/gui2/add.py
+++ b/src/calibre/gui2/add.py
@@ -183,32 +183,33 @@ class DBAdder(QObject): # {{{
                 identical_book_list = self.db.find_identical_books(mi)
                 if identical_book_list:  # books with same author and nearly same title exist in db
                     self.merged_books.add(mi.title)
-                    a_new_record_has_been_created = False
-                    for identical_book in identical_book_list:
-                        if gprefs['automerge'] == 'ignore':
-                            self.add_formats(identical_book, formats, replace=False)
-                        if gprefs['automerge'] == 'overwrite':
-                            self.add_formats(identical_book, formats, replace=True)
-                        if gprefs['automerge'] == 'new record' and not a_new_record_has_been_created:
-                            '''
-                            We are here because we have at least one book record in the db that matches the one file/format being processed
-                            We need to check if the file/format being processed matches a format in the matching book record.
-                            If so, create new record (as below), else, add to existing record, as above.
-                            Test if format exists in matching record. identical_book is an id, formats is a FQPN path in a list
-                            '''
-                            for path in formats:
-                                fmt = os.path.splitext(path)[-1].replace('.', '').upper()
-                                ib_fmts = self.db.formats(identical_book, index_is_id=True)
-                                if ib_fmts and fmt in ib_fmts: # Create a new record
-                                    if not a_new_record_has_been_created:
-                                        id_ = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
-                                        self.number_of_books_added += 1
-                                        self.add_formats(id_, formats)
-                                        a_new_record_has_been_created = True
-                                else: #new record not required
-                                    self.add_formats(identical_book, formats, replace=False)
+                    seen_fmts = set([])
 
-                else: # books with same author and nearly same title do not exist in db
+                    for identical_book in identical_book_list:
+                        ib_fmts = self.db.formats(identical_book, index_is_id=True)
+                        if ib_fmts:
+                            seen_fmts |= set(ib_fmts.split(','))
+                        replace = gprefs['automerge'] == 'overwrite'
+                        self.add_formats(identical_book, formats,
+                                replace=replace)
+                    if gprefs['automerge'] == 'new record':
+                        incoming_fmts = \
+                            set([os.path.splitext(path)[-1].replace('.',
+                                '').upper() for path in formats])
+                        if incoming_fmts.intersection(seen_fmts):
+                            # There was at least one duplicate format
+                            # so create a new record and put the
+                            # incoming formats into it
+                            # We should arguably put only the duplicate
+                            # formats, but no real harm is done by having
+                            # all formats
+                            id_ = self.db.create_book_entry(mi, cover=cover,
+                                    add_duplicates=True)
+                            self.number_of_books_added += 1
+                            self.add_formats(id_, formats)
+
+                else:
+                    # books with same author and nearly same title do not exist in db
                     id_ = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
                     self.number_of_books_added += 1
                     self.add_formats(id_, formats)
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index 3e711edd2d..52d263fe36 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -616,6 +616,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         self.original_series_name = unicode(self.series.text()).strip()
         if len(db.custom_column_label_map) == 0:
             self.central_widget.tabBar().setVisible(False)
+            self.central_widget.setTabEnabled(1, False)
         else:
             self.create_custom_column_editors()
         self.generate_cover_button.clicked.connect(self.generate_cover)
@@ -780,8 +781,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                     _('You have changed the tags. In order to use the tags'
                        ' editor, you must either discard or apply these '
                        'changes. Apply changes?'), show_copy_button=False):
-                self.books_to_refresh |= self.apply_tags(commit=True, notify=True,
-                                                         allow_case_change=True)
+                self.books_to_refresh |= self.apply_tags(commit=True,
+                        notify=True)
                 self.original_tags = unicode(self.tags.text())
             else:
                 self.tags.setText(self.original_tags)
diff --git a/src/calibre/gui2/metadata/single.py b/src/calibre/gui2/metadata/single.py
index 1be954155c..0fa5c746e7 100644
--- a/src/calibre/gui2/metadata/single.py
+++ b/src/calibre/gui2/metadata/single.py
@@ -197,7 +197,7 @@ class MetadataSingleDialogBase(ResizableDialog):
         self.books_to_refresh = set([])
         for widget in self.basic_metadata_widgets:
             widget.initialize(self.db, id_)
-        for widget in self.custom_metadata_widgets:
+        for widget in getattr(self, 'custom_metadata_widgets', []):
             widget.initialize(id_)
         # Commented out as it doesn't play nice with Next, Prev buttons
         #self.fetch_metadata_button.setFocus(Qt.OtherFocusReason)
diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py
index 037a147e28..97356df081 100644
--- a/src/calibre/utils/localization.py
+++ b/src/calibre/utils/localization.py
@@ -104,6 +104,7 @@ _extra_lang_codes = {
         'en_IN' : _('English (India)'),
         'en_TH' : _('English (Thailand)'),
         'en_CY' : _('English (Cyprus)'),
+        'en_CZ' : _('English (Czech Republic)'),
         'en_PK' : _('English (Pakistan)'),
         'en_HR' : _('English (Croatia)'),
         'en_IL' : _('English (Israel)'),