Merge from trunk

This commit is contained in:
Charles Haley 2011-02-08 12:03:41 +00:00
commit 4c523826b7
13 changed files with 188 additions and 50 deletions

View File

@ -325,6 +325,17 @@ class TXTMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.txt import get_metadata from calibre.ebooks.metadata.txt import get_metadata
return get_metadata(stream) return get_metadata(stream)
class TXTZMetadataReader(MetadataReaderPlugin):
    '''
    Metadata reader plugin registered for the ``txtz`` file type.
    '''

    name = 'Read TXTZ metadata'
    author = 'John Schember'
    file_types = set(['txtz'])
    description = _('Read metadata from %s files') % 'TXTZ'

    def get_metadata(self, stream, ftype):
        '''
        Return the result of ``calibre.ebooks.metadata.txtz.get_metadata``
        for ``stream``. ``ftype`` is accepted but not consulted here.
        '''
        # The TXTZ metadata module is imported at call time, not at
        # module import time.
        from calibre.ebooks.metadata import txtz
        return txtz.get_metadata(stream)
class ZipMetadataReader(MetadataReaderPlugin): class ZipMetadataReader(MetadataReaderPlugin):
name = 'Read ZIP metadata' name = 'Read ZIP metadata'
@ -412,6 +423,17 @@ class TOPAZMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.topaz import set_metadata from calibre.ebooks.metadata.topaz import set_metadata
set_metadata(stream, mi) set_metadata(stream, mi)
class TXTZMetadataWriter(MetadataWriterPlugin):
    '''
    Metadata writer plugin registered for the ``txtz`` file type.
    '''

    name = 'Set TXTZ metadata'
    file_types = set(['txtz'])
    # A writer stores metadata *in* the file; the previous wording
    # ("Set metadata from %s files") read like the reader's description.
    description = _('Set metadata in %s files') % 'TXTZ'
    author = 'John Schember'

    def set_metadata(self, stream, mi, type):
        '''
        Write ``mi`` into ``stream`` by delegating to
        ``calibre.ebooks.metadata.txtz.set_metadata``.

        ``type`` is accepted but not consulted here. The parameter name
        shadows the builtin; it is left unchanged so the signature stays
        the same for callers.
        '''
        # The TXTZ metadata module is imported at call time, not at
        # module import time.
        from calibre.ebooks.metadata.txtz import set_metadata
        set_metadata(stream, mi)
# }}} # }}}
from calibre.ebooks.comic.input import ComicInput from calibre.ebooks.comic.input import ComicInput
@ -446,6 +468,7 @@ from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.tcr.output import TCROutput from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.txt.output import TXTZOutput
from calibre.ebooks.html.output import HTMLOutput from calibre.ebooks.html.output import HTMLOutput
from calibre.ebooks.snb.output import SNBOutput from calibre.ebooks.snb.output import SNBOutput
@ -531,6 +554,7 @@ plugins += [
RTFOutput, RTFOutput,
TCROutput, TCROutput,
TXTOutput, TXTOutput,
TXTZOutput,
HTMLOutput, HTMLOutput,
SNBOutput, SNBOutput,
] ]

View File

@ -25,7 +25,7 @@ class DRMError(ValueError):
class ParserError(ValueError): class ParserError(ValueError):
pass pass
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm', BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'htm', 'xhtm',
'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc', 'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip', 'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb'] 'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb']

View File

@ -1,16 +1,20 @@
'''Read meta information from TXT files''' # -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
'''
Read meta information from TXT files
'''
import re import re
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
def get_metadata(stream, extract_cover=True): def get_metadata(stream, extract_cover=True):
""" Return metadata as a L{MetaInfo} object """ '''
Return metadata as a L{MetaInfo} object
'''
mi = MetaInformation(_('Unknown'), [_('Unknown')]) mi = MetaInformation(_('Unknown'), [_('Unknown')])
stream.seek(0) stream.seek(0)

View File

@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
'''
Read meta information from TXTZ files
'''
import os
from cStringIO import StringIO
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile, safe_replace
def get_metadata(stream, extract_cover=True):
    '''
    Return the metadata stored in the ``metadata.opf`` member of a TXTZ
    archive.

    :param stream: Seekable file-like object containing the archive; it is
        rewound to offset 0 before being opened with ``ZipFile``.
    :param extract_cover: Accepted but not used by this function.
    :return: The result of ``OPF(...).to_book_metadata()`` when the OPF can
        be extracted and parsed; otherwise a ``MetaInformation`` whose title
        and author are the translated string "Unknown".
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    with TemporaryDirectory('_untxtz_mdata') as tdir:
        try:
            zf = ZipFile(stream)
            zf.extract('metadata.opf', tdir)
            with open(os.path.join(tdir, 'metadata.opf'), 'rb') as opff:
                mi = OPF(opff).to_book_metadata()
        except Exception:
            # Best effort: an archive without a readable metadata.opf simply
            # yields the placeholder metadata. Catching Exception (rather
            # than using a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

    return mi
def set_metadata(stream, mi):
    '''
    Serialise ``mi`` with ``metadata_to_opf`` and hand the result to
    ``safe_replace`` as the ``metadata.opf`` member of ``stream``.
    '''
    opf_data = metadata_to_opf(mi)
    safe_replace(stream, 'metadata.opf', StringIO(opf_data))

View File

@ -4,23 +4,27 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import glob
import os import os
from calibre import _ent_pat, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents, block_to_single_line normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
from calibre import _ent_pat, xml_entity_to_unicode separate_hard_scene_breaks
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
name = 'TXT Input' name = 'TXT Input'
author = 'John Schember' author = 'John Schember'
description = 'Convert TXT files to HTML' description = 'Convert TXT files to HTML'
file_types = set(['txt']) file_types = set(['txt', 'txtz'])
options = set([ options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto', OptionRecommendation(name='paragraph_type', recommended_value='auto',
@ -57,10 +61,23 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
self.log = log self.log = log
txt = ''
log.debug('Reading text from file...') log.debug('Reading text from file...')
length = 0 length = 0
txt = stream.read() # Extract content from zip archive.
if file_ext == 'txtz':
log.debug('De-compressing content to temporary directory...')
with TemporaryDirectory('_untxtz') as tdir:
zf = ZipFile(stream)
zf.extractall(tdir)
txts = glob.glob(os.path.join(tdir, '*.txt'))
for t in txts:
with open(t, 'rb') as tf:
txt += tf.read()
else:
txt = stream.read()
# Get the encoding of the document. # Get the encoding of the document.
if options.input_encoding: if options.input_encoding:
@ -98,6 +115,7 @@ class TXTInput(InputFormatPlugin):
if options.formatting_type == 'heuristic': if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True) setattr(options, 'enable_heuristics', True)
setattr(options, 'unwrap_lines', False) setattr(options, 'unwrap_lines', False)
setattr(options, 'smarten_punctuation', True)
# Reformat paragraphs to block formatting based on the detected type. # Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
@ -105,6 +123,7 @@ class TXTInput(InputFormatPlugin):
if options.paragraph_type == 'single': if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print': elif options.paragraph_type == 'print':
txt = separate_hard_scene_breaks(txt)
txt = separate_paragraphs_print_formatted(txt) txt = separate_paragraphs_print_formatted(txt)
txt = block_to_single_line(txt) txt = block_to_single_line(txt)
elif options.paragraph_type == 'unformatted': elif options.paragraph_type == 'unformatted':
@ -116,6 +135,7 @@ class TXTInput(InputFormatPlugin):
txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
else: else:
txt = separate_hard_scene_breaks(txt)
txt = block_to_single_line(txt) txt = block_to_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
@ -175,4 +195,11 @@ class TXTInput(InputFormatPlugin):
{}) {})
options.debug_pipeline = odi options.debug_pipeline = odi
os.remove(htmlfile.name) os.remove(htmlfile.name)
# Set metadata from file.
from calibre.customize.ui import get_file_type_metadata
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
mi = get_file_type_metadata(stream, file_ext)
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
return oeb return oeb

View File

@ -35,11 +35,9 @@ class MarkdownMLizer(object):
html = unicode(etree.tostring(item.data, encoding=unicode)) html = unicode(etree.tostring(item.data, encoding=unicode))
if not self.opts.keep_links: if not self.opts.keep_links:
html = re.sub(r'<\s*a[^>]*>', '', html) html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
html = re.sub(r'<\s*/\s*a\s*>', '', html)
if not self.opts.keep_image_references: if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html) html = re.sub(r'<\s*img[^>]*>', '', html)\
html = re.sub(r'<\s*img\s*>', '', html)
text = html2text(html) text = html2text(html)

View File

@ -5,11 +5,18 @@ __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os import os
import shutil
from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin, \ from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation OptionRecommendation
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ebooks.txt.txtml import TXTMLizer from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.zipfile import ZipFile
class TXTOutput(OutputFormatPlugin): class TXTOutput(OutputFormatPlugin):
@ -73,6 +80,7 @@ class TXTOutput(OutputFormatPlugin):
writer = TXTMLizer(log) writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts) txt = writer.extract_content(oeb_book, opts)
txt = clean_ascii_chars(txt)
log.debug('\tReplacing newlines with selected type...') log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt) txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
@ -93,3 +101,32 @@ class TXTOutput(OutputFormatPlugin):
if close: if close:
out_stream.close() out_stream.close()
class TXTZOutput(TXTOutput):
    '''
    Output plugin for the ``txtz`` file type: a zip archive assembled from
    the TXT conversion (``index.txt``), the book's image items and a
    ``metadata.opf`` file.
    '''

    name = 'TXTZ Output'
    author = 'John Schember'
    file_type = 'txtz'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Build the archive contents in a temporary directory and zip that
        directory to ``output_path``.

        The parameters are forwarded unchanged to ``TXTOutput.convert``,
        except that the text is first written to a temporary file instead
        of ``output_path``.
        '''
        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT: run the plain TXT conversion into a temporary file and
            # copy the result into the staging directory as index.txt.
            # NOTE(review): ``tf`` is used as a path by both calls below --
            # presumably TemporaryFile yields a file name; confirm.
            with TemporaryFile('index.txt') as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, 'index.txt'))

            # Images: write every manifest item with an image media type
            # to the location given by its href, creating parent
            # directories as needed.
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES:
                    path = os.path.join(tdir, os.path.dirname(item.href))
                    if not os.path.exists(path):
                        os.makedirs(path)
                    with open(os.path.join(tdir, item.href), 'wb') as imgf:
                        imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            # Close the archive explicitly so it is finalised before the
            # staging directory is removed and before this method returns,
            # instead of relying on garbage collection.
            txtz = ZipFile(output_path, 'w')
            try:
                txtz.add_dir(tdir)
            finally:
                txtz.close()

View File

@ -29,8 +29,7 @@ def clean_txt(txt):
txt = '\n'.join([line.rstrip() for line in txt.splitlines()]) txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
# Replace whitespace at the beginning of the line with &nbsp; # Replace whitespace at the beginning of the line with &nbsp;
txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt) txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)
txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt)
# Condense redundant spaces # Condense redundant spaces
txt = re.sub('[ ]{2,}', ' ', txt) txt = re.sub('[ ]{2,}', ' ', txt)
@ -121,6 +120,15 @@ def separate_paragraphs_print_formatted(txt):
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt) txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
return txt return txt
def separate_hard_scene_breaks(txt):
    '''
    Surround every line made up only of scene-break characters with
    newlines.

    A line counts as a break when it consists solely of spaces, tabs and
    the characters ``-``, ``=``, ``~``, ``/`` and ``*``. Lines that contain
    only whitespace are left untouched.

    :param txt: The text to process.
    :return: ``txt`` with a newline added before and after each break line.
    '''
    def sep_break(line):
        # Whitespace-only lines match the pattern too, but are not breaks.
        if line.strip():
            return '\n%s\n' % line
        return line

    # The hyphen is escaped so it is a literal character. Unescaped,
    # ``\t-=`` formed a range (0x09-0x3D) that also matched digits,
    # newlines and most ASCII punctuation, so lines such as "1234" were
    # treated as scene breaks. ``*`` used to match only through that range
    # and is listed explicitly so "* * *" is still recognised.
    return re.sub(r'(?miu)^[ \t\-=~/*]+$',
                  lambda mo: sep_break(mo.group()), txt)
def block_to_single_line(txt): def block_to_single_line(txt):
txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt) txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt)
return txt return txt
@ -221,9 +229,9 @@ def detect_formatting_type(txt):
markdown_count += len(re.findall('(?mu)^=+$', txt)) markdown_count += len(re.findall('(?mu)^=+$', txt))
markdown_count += len(re.findall('(?mu)^-+$', txt)) markdown_count += len(re.findall('(?mu)^-+$', txt))
# Images # Images
markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
# Links # Links
markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))
# Check for textile # Check for textile
# Headings # Headings
@ -231,9 +239,9 @@ def detect_formatting_type(txt):
# Block quote. # Block quote.
textile_count += len(re.findall(r'(?mu)^bq\.', txt)) textile_count += len(re.findall(r'(?mu)^bq\.', txt))
# Images # Images
textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt)) textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
# Links # Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) textile_count += len(re.findall(r'"[^"]*":\S+', txt))
# Decide if either markdown or textile is used in the text # Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found. # based on the number of unique formatting elements found.

View File

@ -36,13 +36,12 @@ class TextileMLizer(object):
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
if not self.opts.keep_links: if not self.opts.keep_links:
html = re.sub(r'<\s*a[^>]*>', '', html) html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
html = re.sub(r'<\s*/\s*a\s*>', '', html)
if not self.opts.keep_image_references: if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html) html = re.sub(r'<\s*img[^>]*>', '', html)
html = re.sub(r'<\s*img\s*>', '', html)
text = html2textile(html) text = html2textile(html)
text = text.replace('%', '')
# Ensure the section ends with at least two new line characters. # Ensure the section ends with at least two new line characters.
# This is to prevent the last paragraph from a section being # This is to prevent the last paragraph from a section being

View File

@ -183,32 +183,33 @@ class DBAdder(QObject): # {{{
identical_book_list = self.db.find_identical_books(mi) identical_book_list = self.db.find_identical_books(mi)
if identical_book_list: # books with same author and nearly same title exist in db if identical_book_list: # books with same author and nearly same title exist in db
self.merged_books.add(mi.title) self.merged_books.add(mi.title)
a_new_record_has_been_created = False seen_fmts = set([])
for identical_book in identical_book_list:
if gprefs['automerge'] == 'ignore':
self.add_formats(identical_book, formats, replace=False)
if gprefs['automerge'] == 'overwrite':
self.add_formats(identical_book, formats, replace=True)
if gprefs['automerge'] == 'new record' and not a_new_record_has_been_created:
'''
We are here because we have at least one book record in the db that matches the one file/format being processed
We need to check if the file/format being processed matches a format in the matching book record.
If so, create new record (as below), else, add to existing record, as above.
Test if format exists in matching record. identical_book is an id, formats is a FQPN path in a list
'''
for path in formats:
fmt = os.path.splitext(path)[-1].replace('.', '').upper()
ib_fmts = self.db.formats(identical_book, index_is_id=True)
if ib_fmts and fmt in ib_fmts: # Create a new record
if not a_new_record_has_been_created:
id_ = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
self.number_of_books_added += 1
self.add_formats(id_, formats)
a_new_record_has_been_created = True
else: #new record not required
self.add_formats(identical_book, formats, replace=False)
else: # books with same author and nearly same title do not exist in db for identical_book in identical_book_list:
ib_fmts = self.db.formats(identical_book, index_is_id=True)
if ib_fmts:
seen_fmts |= set(ib_fmts.split(','))
replace = gprefs['automerge'] == 'overwrite'
self.add_formats(identical_book, formats,
replace=replace)
if gprefs['automerge'] == 'new record':
incoming_fmts = \
set([os.path.splitext(path)[-1].replace('.',
'').upper() for path in formats])
if incoming_fmts.intersection(seen_fmts):
# There was at least one duplicate format
# so create a new record and put the
# incoming formats into it
# We should arguably put only the duplicate
# formats, but no real harm is done by having
# all formats
id_ = self.db.create_book_entry(mi, cover=cover,
add_duplicates=True)
self.number_of_books_added += 1
self.add_formats(id_, formats)
else:
# books with same author and nearly same title do not exist in db
id_ = self.db.create_book_entry(mi, cover=cover, add_duplicates=True) id_ = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
self.number_of_books_added += 1 self.number_of_books_added += 1
self.add_formats(id_, formats) self.add_formats(id_, formats)

View File

@ -616,6 +616,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.original_series_name = unicode(self.series.text()).strip() self.original_series_name = unicode(self.series.text()).strip()
if len(db.custom_column_label_map) == 0: if len(db.custom_column_label_map) == 0:
self.central_widget.tabBar().setVisible(False) self.central_widget.tabBar().setVisible(False)
self.central_widget.setTabEnabled(1, False)
else: else:
self.create_custom_column_editors() self.create_custom_column_editors()
self.generate_cover_button.clicked.connect(self.generate_cover) self.generate_cover_button.clicked.connect(self.generate_cover)
@ -780,8 +781,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
_('You have changed the tags. In order to use the tags' _('You have changed the tags. In order to use the tags'
' editor, you must either discard or apply these ' ' editor, you must either discard or apply these '
'changes. Apply changes?'), show_copy_button=False): 'changes. Apply changes?'), show_copy_button=False):
self.books_to_refresh |= self.apply_tags(commit=True, notify=True, self.books_to_refresh |= self.apply_tags(commit=True,
allow_case_change=True) notify=True)
self.original_tags = unicode(self.tags.text()) self.original_tags = unicode(self.tags.text())
else: else:
self.tags.setText(self.original_tags) self.tags.setText(self.original_tags)

View File

@ -197,7 +197,7 @@ class MetadataSingleDialogBase(ResizableDialog):
self.books_to_refresh = set([]) self.books_to_refresh = set([])
for widget in self.basic_metadata_widgets: for widget in self.basic_metadata_widgets:
widget.initialize(self.db, id_) widget.initialize(self.db, id_)
for widget in self.custom_metadata_widgets: for widget in getattr(self, 'custom_metadata_widgets', []):
widget.initialize(id_) widget.initialize(id_)
# Commented out as it doesn't play nice with Next, Prev buttons # Commented out as it doesn't play nice with Next, Prev buttons
#self.fetch_metadata_button.setFocus(Qt.OtherFocusReason) #self.fetch_metadata_button.setFocus(Qt.OtherFocusReason)

View File

@ -104,6 +104,7 @@ _extra_lang_codes = {
'en_IN' : _('English (India)'), 'en_IN' : _('English (India)'),
'en_TH' : _('English (Thailand)'), 'en_TH' : _('English (Thailand)'),
'en_CY' : _('English (Cyprus)'), 'en_CY' : _('English (Cyprus)'),
'en_CZ' : _('English (Czechoslovakia)'),
'en_PK' : _('English (Pakistan)'), 'en_PK' : _('English (Pakistan)'),
'en_HR' : _('English (Croatia)'), 'en_HR' : _('English (Croatia)'),
'en_IL' : _('English (Israel)'), 'en_IL' : _('English (Israel)'),