Merge from trunk

This commit is contained in:
Charles Haley 2011-02-08 12:03:41 +00:00
commit 4c523826b7
13 changed files with 188 additions and 50 deletions

View File

@ -325,6 +325,17 @@ class TXTMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.txt import get_metadata
return get_metadata(stream)
class TXTZMetadataReader(MetadataReaderPlugin):
    # Metadata reader plugin registered for the 'txtz' file type.

    author = 'John Schember'
    name = 'Read TXTZ metadata'
    description = _('Read metadata from %s files') % 'TXTZ'
    file_types = set(['txtz'])

    def get_metadata(self, stream, ftype):
        # The actual reader is imported at call time and only receives the
        # stream; ``ftype`` is not forwarded.
        from calibre.ebooks.metadata.txtz import get_metadata as read_txtz
        return read_txtz(stream)
class ZipMetadataReader(MetadataReaderPlugin):
name = 'Read ZIP metadata'
@ -412,6 +423,17 @@ class TOPAZMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.topaz import set_metadata
set_metadata(stream, mi)
class TXTZMetadataWriter(MetadataWriterPlugin):
    # Metadata writer plugin registered for the 'txtz' file type.

    name = 'Set TXTZ metadata'
    file_types = set(['txtz'])
    # This plugin writes metadata, so the description says "in", not "from".
    description = _('Set metadata in %s files') % 'TXTZ'
    author = 'John Schember'

    def set_metadata(self, stream, mi, type):
        # ``type`` is part of the plugin call signature but is not forwarded;
        # the writer only needs the stream and the metadata object.
        from calibre.ebooks.metadata.txtz import set_metadata
        set_metadata(stream, mi)
# }}}
from calibre.ebooks.comic.input import ComicInput
@ -446,6 +468,7 @@ from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.txt.output import TXTZOutput
from calibre.ebooks.html.output import HTMLOutput
from calibre.ebooks.snb.output import SNBOutput
@ -531,6 +554,7 @@ plugins += [
RTFOutput,
TCROutput,
TXTOutput,
TXTZOutput,
HTMLOutput,
SNBOutput,
]

View File

@ -25,7 +25,7 @@ class DRMError(ValueError):
class ParserError(ValueError):
pass
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm',
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'htm', 'xhtm',
'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb']

View File

@ -1,16 +1,20 @@
'''Read meta information from TXT files'''
from __future__ import with_statement
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
'''
Read meta information from TXT files
'''
import re
from calibre.ebooks.metadata import MetaInformation
def get_metadata(stream, extract_cover=True):
""" Return metadata as a L{MetaInfo} object """
'''
Return metadata as a L{MetaInfo} object
'''
mi = MetaInformation(_('Unknown'), [_('Unknown')])
stream.seek(0)

View File

@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
'''
Read and write meta information for TXTZ files
'''
import os
from cStringIO import StringIO
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile, safe_replace
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    The stream is opened as a zip archive and its ``metadata.opf`` member is
    extracted to a temporary directory and parsed. If anything goes wrong
    (not a zip file, no ``metadata.opf`` member, unparsable OPF, ...) the
    default "Unknown" metadata is returned instead of raising.

    ``extract_cover`` is accepted but not used by this function.
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    with TemporaryDirectory('_untxtz_mdata') as tdir:
        try:
            zf = ZipFile(stream)
            zf.extract('metadata.opf', tdir)
            with open(os.path.join(tdir, 'metadata.opf'), 'rb') as opff:
                mi = OPF(opff).to_book_metadata()
        except Exception:
            # Reading is best effort: fall through with the default metadata.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit are not silently swallowed.
            pass
    return mi
def set_metadata(stream, mi):
    '''
    Serialize ``mi`` to OPF and store it as the ``metadata.opf`` member of
    the archive in ``stream``.
    '''
    safe_replace(stream, 'metadata.opf', StringIO(metadata_to_opf(mi)))

View File

@ -4,23 +4,27 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import glob
import os
from calibre import _ent_pat, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
from calibre import _ent_pat, xml_entity_to_unicode
normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
separate_hard_scene_breaks
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
class TXTInput(InputFormatPlugin):
name = 'TXT Input'
author = 'John Schember'
description = 'Convert TXT files to HTML'
file_types = set(['txt'])
file_types = set(['txt', 'txtz'])
options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto',
@ -57,10 +61,23 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
self.log = log
txt = ''
log.debug('Reading text from file...')
length = 0
txt = stream.read()
# Extract content from zip archive.
if file_ext == 'txtz':
log.debug('De-compressing content to temporary directory...')
with TemporaryDirectory('_untxtz') as tdir:
zf = ZipFile(stream)
zf.extractall(tdir)
txts = glob.glob(os.path.join(tdir, '*.txt'))
for t in txts:
with open(t, 'rb') as tf:
txt += tf.read()
else:
txt = stream.read()
# Get the encoding of the document.
if options.input_encoding:
@ -98,6 +115,7 @@ class TXTInput(InputFormatPlugin):
if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True)
setattr(options, 'unwrap_lines', False)
setattr(options, 'smarten_punctuation', True)
# Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block.
@ -105,6 +123,7 @@ class TXTInput(InputFormatPlugin):
if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_hard_scene_breaks(txt)
txt = separate_paragraphs_print_formatted(txt)
txt = block_to_single_line(txt)
elif options.paragraph_type == 'unformatted':
@ -116,6 +135,7 @@ class TXTInput(InputFormatPlugin):
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt)
else:
txt = separate_hard_scene_breaks(txt)
txt = block_to_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
@ -175,4 +195,11 @@ class TXTInput(InputFormatPlugin):
{})
options.debug_pipeline = odi
os.remove(htmlfile.name)
# Set metadata from file.
from calibre.customize.ui import get_file_type_metadata
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
mi = get_file_type_metadata(stream, file_ext)
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
return oeb

View File

@ -35,11 +35,9 @@ class MarkdownMLizer(object):
html = unicode(etree.tostring(item.data, encoding=unicode))
if not self.opts.keep_links:
html = re.sub(r'<\s*a[^>]*>', '', html)
html = re.sub(r'<\s*/\s*a\s*>', '', html)
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html)
html = re.sub(r'<\s*img\s*>', '', html)
html = re.sub(r'<\s*img[^>]*>', '', html)\
text = html2text(html)

View File

@ -5,11 +5,18 @@ __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import shutil
from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.zipfile import ZipFile
class TXTOutput(OutputFormatPlugin):
@ -73,6 +80,7 @@ class TXTOutput(OutputFormatPlugin):
writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts)
txt = clean_ascii_chars(txt)
log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
@ -93,3 +101,32 @@ class TXTOutput(OutputFormatPlugin):
if close:
out_stream.close()
class TXTZOutput(TXTOutput):
    # Output plugin for the 'txtz' file type: the TXT output plus images and
    # an OPF metadata file, packed into a single zip archive.

    name = 'TXTZ Output'
    author = 'John Schember'
    file_type = 'txtz'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` to ``output_path`` as a TXTZ archive.

        The archive is assembled in a temporary directory and contains:

          * ``index.txt``    - the text produced by TXTOutput.convert
          * image files      - every manifest item whose media type is in
                               OEB_IMAGES, stored under its href
          * ``metadata.opf`` - the book metadata serialized via to_opf1()
        '''
        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT
            # NOTE(review): ``tf`` is used both as the output target of
            # TXTOutput.convert and as the source of shutil.copy, so it is
            # presumably a file path -- confirm against TemporaryFile.
            with TemporaryFile('index.txt') as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, 'index.txt'))

            # Images
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES:
                    # Recreate the href's directory structure under tdir.
                    path = os.path.join(tdir, os.path.dirname(item.href))
                    if not os.path.exists(path):
                        os.makedirs(path)
                    with open(os.path.join(tdir, item.href), 'wb') as imgf:
                        imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            # Close the archive explicitly (even if adding files fails) so
            # that the zip's final records are written before the temporary
            # directory is removed, instead of relying on garbage collection.
            txtz = ZipFile(output_path, 'w')
            try:
                txtz.add_dir(tdir)
            finally:
                txtz.close()

View File

@ -29,8 +29,7 @@ def clean_txt(txt):
txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
# Replace whitespace at the beginning of the line with &nbsp;
txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt)
txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt)
txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)
# Condense redundant spaces
txt = re.sub('[ ]{2,}', ' ', txt)
@ -121,6 +120,15 @@ def separate_paragraphs_print_formatted(txt):
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
return txt
def separate_hard_scene_breaks(txt):
    '''
    Surround hard scene-break lines with blank lines.

    A scene-break line is a line made up only of spaces, tabs and the marker
    characters ``-``, ``=``, ``~``, ``/`` and ``*``. Each such line that is
    not pure whitespace gets a newline inserted before and after it, so that
    it ends up as a paragraph of its own. Whitespace-only lines are left
    untouched.

    :param txt: the text to process
    :return: the text with scene-break lines separated
    '''
    def sep_break(line):
        # Lines consisting solely of whitespace also match the pattern but
        # are not scene breaks; return them unchanged.
        if line.strip():
            return '\n%s\n' % line
        return line

    # The hyphen is placed last in the character class so it is a literal
    # '-'. Written as ``\t-=`` it formed a range from tab to '=', which also
    # matched newlines, digits and most punctuation, so ordinary lines such
    # as "123" were treated as scene breaks. '*' is listed explicitly because
    # that accidental range used to cover it and "* * *" is a common break.
    txt = re.sub(r'(?miu)^[ \t=~/*-]+$', lambda mo: sep_break(mo.group()), txt)
    return txt
def block_to_single_line(txt):
    '''
    Join the lines of each block into one line.

    A newline that sits directly between two non-newline characters is
    replaced by a single space; blank lines are left in place.
    '''
    soft_break = re.compile(r'(?<=.)\n(?=.)')
    return soft_break.sub(' ', txt)
@ -221,9 +229,9 @@ def detect_formatting_type(txt):
markdown_count += len(re.findall('(?mu)^=+$', txt))
markdown_count += len(re.findall('(?mu)^-+$', txt))
# Images
markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt))
markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
# Links
markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))
# Check for textile
# Headings
@ -231,9 +239,9 @@ def detect_formatting_type(txt):
# Block quote.
textile_count += len(re.findall(r'(?mu)^bq\.', txt))
# Images
textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt))
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
# Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
# Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found.

View File

@ -36,13 +36,12 @@ class TextileMLizer(object):
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
if not self.opts.keep_links:
html = re.sub(r'<\s*a[^>]*>', '', html)
html = re.sub(r'<\s*/\s*a\s*>', '', html)
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html)
html = re.sub(r'<\s*img\s*>', '', html)
text = html2textile(html)
text = text.replace('%', '')
# Ensure the section ends with at least two new line characters.
# This is to prevent the last paragraph from a section being

View File

@ -183,32 +183,33 @@ class DBAdder(QObject): # {{{
identical_book_list = self.db.find_identical_books(mi)
if identical_book_list: # books with same author and nearly same title exist in db
self.merged_books.add(mi.title)
a_new_record_has_been_created = False
for identical_book in identical_book_list:
if gprefs['automerge'] == 'ignore':
self.add_formats(identical_book, formats, replace=False)
if gprefs['automerge'] == 'overwrite':
self.add_formats(identical_book, formats, replace=True)
if gprefs['automerge'] == 'new record' and not a_new_record_has_been_created:
'''
We are here because we have at least one book record in the db that matches the one file/format being processed
We need to check if the file/format being processed matches a format in the matching book record.
If so, create new record (as below), else, add to existing record, as above.
Test if format exists in matching record. identical_book is an id, formats is a FQPN path in a list
'''
for path in formats:
fmt = os.path.splitext(path)[-1].replace('.', '').upper()
ib_fmts = self.db.formats(identical_book, index_is_id=True)
if ib_fmts and fmt in ib_fmts: # Create a new record
if not a_new_record_has_been_created:
id_ = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
self.number_of_books_added += 1
self.add_formats(id_, formats)
a_new_record_has_been_created = True
else: #new record not required
self.add_formats(identical_book, formats, replace=False)
seen_fmts = set([])
else: # books with same author and nearly same title do not exist in db
for identical_book in identical_book_list:
ib_fmts = self.db.formats(identical_book, index_is_id=True)
if ib_fmts:
seen_fmts |= set(ib_fmts.split(','))
replace = gprefs['automerge'] == 'overwrite'
self.add_formats(identical_book, formats,
replace=replace)
if gprefs['automerge'] == 'new record':
incoming_fmts = \
set([os.path.splitext(path)[-1].replace('.',
'').upper() for path in formats])
if incoming_fmts.intersection(seen_fmts):
# There was at least one duplicate format
# so create a new record and put the
# incoming formats into it
# We should arguably put only the duplicate
# formats, but no real harm is done by having
# all formats
id_ = self.db.create_book_entry(mi, cover=cover,
add_duplicates=True)
self.number_of_books_added += 1
self.add_formats(id_, formats)
else:
# books with same author and nearly same title do not exist in db
id_ = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
self.number_of_books_added += 1
self.add_formats(id_, formats)

View File

@ -616,6 +616,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.original_series_name = unicode(self.series.text()).strip()
if len(db.custom_column_label_map) == 0:
self.central_widget.tabBar().setVisible(False)
self.central_widget.setTabEnabled(1, False)
else:
self.create_custom_column_editors()
self.generate_cover_button.clicked.connect(self.generate_cover)
@ -780,8 +781,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
_('You have changed the tags. In order to use the tags'
' editor, you must either discard or apply these '
'changes. Apply changes?'), show_copy_button=False):
self.books_to_refresh |= self.apply_tags(commit=True, notify=True,
allow_case_change=True)
self.books_to_refresh |= self.apply_tags(commit=True,
notify=True)
self.original_tags = unicode(self.tags.text())
else:
self.tags.setText(self.original_tags)

View File

@ -197,7 +197,7 @@ class MetadataSingleDialogBase(ResizableDialog):
self.books_to_refresh = set([])
for widget in self.basic_metadata_widgets:
widget.initialize(self.db, id_)
for widget in self.custom_metadata_widgets:
for widget in getattr(self, 'custom_metadata_widgets', []):
widget.initialize(id_)
# Commented out as it doesn't play nice with Next, Prev buttons
#self.fetch_metadata_button.setFocus(Qt.OtherFocusReason)

View File

@ -104,6 +104,7 @@ _extra_lang_codes = {
'en_IN' : _('English (India)'),
'en_TH' : _('English (Thailand)'),
'en_CY' : _('English (Cyprus)'),
'en_CZ' : _('English (Czechoslovakia)'),
'en_PK' : _('English (Pakistan)'),
'en_HR' : _('English (Croatia)'),
'en_IL' : _('English (Israel)'),