Initial TXTZ format support

This commit is contained in:
Kovid Goyal 2011-02-07 15:43:50 -07:00
commit b8ec84468d
9 changed files with 143 additions and 21 deletions

View File

@ -325,6 +325,17 @@ class TXTMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.txt import get_metadata from calibre.ebooks.metadata.txt import get_metadata
return get_metadata(stream) return get_metadata(stream)
class TXTZMetadataReader(MetadataReaderPlugin):
    # Metadata reader plugin registered for the 'txtz' file type.

    name = 'Read TXTZ metadata'
    file_types = set(['txtz'])
    description = _('Read metadata from %s files') % 'TXTZ'
    author = 'John Schember'

    def get_metadata(self, stream, ftype):
        # ftype is accepted as part of the method signature but is not used
        # here; the work is delegated to the TXTZ metadata module.
        # The import lives inside the method, so the module is only imported
        # when this method is actually called.
        from calibre.ebooks.metadata.txtz import get_metadata as read_txtz
        return read_txtz(stream)
class ZipMetadataReader(MetadataReaderPlugin): class ZipMetadataReader(MetadataReaderPlugin):
name = 'Read ZIP metadata' name = 'Read ZIP metadata'
@ -412,6 +423,17 @@ class TOPAZMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.topaz import set_metadata from calibre.ebooks.metadata.topaz import set_metadata
set_metadata(stream, mi) set_metadata(stream, mi)
class TXTZMetadataWriter(MetadataWriterPlugin):
    # Metadata writer plugin registered for the 'txtz' file type.

    name = 'Set TXTZ metadata'
    file_types = set(['txtz'])
    description = _('Set metadata from %s files') % 'TXTZ'
    author = 'John Schember'

    def set_metadata(self, stream, mi, type):
        # The third parameter (named `type`, kept as-is) is not used here;
        # the work is delegated to the TXTZ metadata module.
        # The import lives inside the method, so the module is only imported
        # when this method is actually called.
        from calibre.ebooks.metadata.txtz import set_metadata as write_txtz
        write_txtz(stream, mi)
# }}} # }}}
from calibre.ebooks.comic.input import ComicInput from calibre.ebooks.comic.input import ComicInput
@ -446,6 +468,7 @@ from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.tcr.output import TCROutput from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.txt.output import TXTZOutput
from calibre.ebooks.html.output import HTMLOutput from calibre.ebooks.html.output import HTMLOutput
from calibre.ebooks.snb.output import SNBOutput from calibre.ebooks.snb.output import SNBOutput
@ -531,6 +554,7 @@ plugins += [
RTFOutput, RTFOutput,
TCROutput, TCROutput,
TXTOutput, TXTOutput,
TXTZOutput,
HTMLOutput, HTMLOutput,
SNBOutput, SNBOutput,
] ]

View File

@ -25,7 +25,7 @@ class DRMError(ValueError):
class ParserError(ValueError): class ParserError(ValueError):
pass pass
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm', BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'htm', 'xhtm',
'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc', 'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip', 'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb'] 'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb']

View File

@ -1,16 +1,20 @@
'''Read meta information from TXT files''' # -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
'''
Read meta information from TXT files
'''
import re import re
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
def get_metadata(stream, extract_cover=True): def get_metadata(stream, extract_cover=True):
""" Return metadata as a L{MetaInfo} object """ '''
Return metadata as a L{MetaInfo} object
'''
mi = MetaInformation(_('Unknown'), [_('Unknown')]) mi = MetaInformation(_('Unknown'), [_('Unknown')])
stream.seek(0) stream.seek(0)

View File

@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
'''
Read and write meta information for TXTZ files
'''
import os
from cStringIO import StringIO
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile, safe_replace
def get_metadata(stream, extract_cover=True):
    '''
    Return the metadata stored in the metadata.opf member of a TXTZ archive.

    stream is rewound to its start and opened as a zip archive. The
    metadata.opf member is extracted into a temporary directory and parsed
    with OPF; the result of its to_book_metadata() is returned.

    Reading is best effort: if the archive cannot be opened, has no
    metadata.opf member, or the OPF cannot be parsed, a MetaInformation
    with an 'Unknown' title and author is returned instead of raising.

    extract_cover is accepted for signature compatibility but is not used
    by this function.
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    with TemporaryDirectory('_untxtz_mdata') as tdir:
        try:
            zf = ZipFile(stream)
            zf.extract('metadata.opf', tdir)
            with open(os.path.join(tdir, 'metadata.opf'), 'rb') as opff:
                mi = OPF(opff).to_book_metadata()
        except Exception:
            # Deliberately best effort: keep the 'Unknown' placeholder.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
    return mi
def set_metadata(stream, mi):
    '''
    Write mi into the archive in stream as its metadata.opf member.

    mi is serialised with metadata_to_opf, wrapped in an in-memory file
    object and handed to safe_replace under the name 'metadata.opf'.
    '''
    opf_data = metadata_to_opf(mi)
    safe_replace(stream, 'metadata.opf', StringIO(opf_data))

View File

@ -4,8 +4,10 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import glob
import os import os
from calibre import _ent_pat, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect from calibre.ebooks.chardet import detect
@ -13,14 +15,15 @@ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents, block_to_single_line normalize_line_endings, convert_textile, remove_indents, block_to_single_line
from calibre import _ent_pat, xml_entity_to_unicode from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
name = 'TXT Input' name = 'TXT Input'
author = 'John Schember' author = 'John Schember'
description = 'Convert TXT files to HTML' description = 'Convert TXT files to HTML'
file_types = set(['txt']) file_types = set(['txt', 'txtz'])
options = set([ options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto', OptionRecommendation(name='paragraph_type', recommended_value='auto',
@ -57,9 +60,22 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
self.log = log self.log = log
txt = ''
log.debug('Reading text from file...') log.debug('Reading text from file...')
length = 0 length = 0
# Extract content from zip archive.
if file_ext == 'txtz':
log.debug('De-compressing content to temporary directory...')
with TemporaryDirectory('_untxtz') as tdir:
zf = ZipFile(stream)
zf.extractall(tdir)
txts = glob.glob(os.path.join(tdir, '*.txt'))
for t in txts:
with open(t, 'rb') as tf:
txt += tf.read()
else:
txt = stream.read() txt = stream.read()
# Get the encoding of the document. # Get the encoding of the document.
@ -175,4 +191,11 @@ class TXTInput(InputFormatPlugin):
{}) {})
options.debug_pipeline = odi options.debug_pipeline = odi
os.remove(htmlfile.name) os.remove(htmlfile.name)
# Set metadata from file.
from calibre.customize.ui import get_file_type_metadata
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
mi = get_file_type_metadata(stream, file_ext)
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
return oeb return oeb

View File

@ -35,11 +35,9 @@ class MarkdownMLizer(object):
html = unicode(etree.tostring(item.data, encoding=unicode)) html = unicode(etree.tostring(item.data, encoding=unicode))
if not self.opts.keep_links: if not self.opts.keep_links:
html = re.sub(r'<\s*a[^>]*>', '', html) html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
html = re.sub(r'<\s*/\s*a\s*>', '', html)
if not self.opts.keep_image_references: if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html) html = re.sub(r'<\s*img[^>]*>', '', html)\
html = re.sub(r'<\s*img\s*>', '', html)
text = html2text(html) text = html2text(html)

View File

@ -5,11 +5,18 @@ __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os import os
import shutil
from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin, \ from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation OptionRecommendation
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ebooks.txt.txtml import TXTMLizer from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.zipfile import ZipFile
class TXTOutput(OutputFormatPlugin): class TXTOutput(OutputFormatPlugin):
@ -73,6 +80,7 @@ class TXTOutput(OutputFormatPlugin):
writer = TXTMLizer(log) writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts) txt = writer.extract_content(oeb_book, opts)
txt = clean_ascii_chars(txt)
log.debug('\tReplacing newlines with selected type...') log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt) txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
@ -93,3 +101,32 @@ class TXTOutput(OutputFormatPlugin):
if close: if close:
out_stream.close() out_stream.close()
class TXTZOutput(TXTOutput):
    # Output plugin for the 'txtz' file type: a zip archive holding the
    # converted text (index.txt), the book's images and a metadata.opf.

    name = 'TXTZ Output'
    author = 'John Schember'
    file_type = 'txtz'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Build the TXTZ archive for oeb_book at output_path.

        The archive contents are first assembled in a temporary directory:
        index.txt (produced by TXTOutput.convert), every manifest item whose
        media type is in OEB_IMAGES (written at its href), and metadata.opf
        (serialised from oeb_book.metadata.to_opf1()). The directory is then
        added to a new zip file at output_path.
        '''
        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT
            with TemporaryFile('index.txt') as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, 'index.txt'))

            # Images
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES:
                    # Recreate the item's directory structure inside tdir.
                    path = os.path.join(tdir, os.path.dirname(item.href))
                    if not os.path.exists(path):
                        os.makedirs(path)
                    with open(os.path.join(tdir, item.href), 'wb') as imgf:
                        imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            # The archive must be built while tdir still exists, and closed
            # explicitly so that its final records are written even if
            # adding the directory fails part-way through.
            txtz = ZipFile(output_path, 'w')
            try:
                txtz.add_dir(tdir)
            finally:
                txtz.close()

View File

@ -29,8 +29,7 @@ def clean_txt(txt):
txt = '\n'.join([line.rstrip() for line in txt.splitlines()]) txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
# Replace whitespace at the beginning of the line with &nbsp; # Replace whitespace at the beginning of the line with &nbsp;
txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt) txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)
txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt)
# Condense redundant spaces # Condense redundant spaces
txt = re.sub('[ ]{2,}', ' ', txt) txt = re.sub('[ ]{2,}', ' ', txt)
@ -221,9 +220,9 @@ def detect_formatting_type(txt):
markdown_count += len(re.findall('(?mu)^=+$', txt)) markdown_count += len(re.findall('(?mu)^=+$', txt))
markdown_count += len(re.findall('(?mu)^-+$', txt)) markdown_count += len(re.findall('(?mu)^-+$', txt))
# Images # Images
markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
# Links # Links
markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))
# Check for textile # Check for textile
# Headings # Headings
@ -231,9 +230,9 @@ def detect_formatting_type(txt):
# Block quote. # Block quote.
textile_count += len(re.findall(r'(?mu)^bq\.', txt)) textile_count += len(re.findall(r'(?mu)^bq\.', txt))
# Images # Images
textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt)) textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
# Links # Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) textile_count += len(re.findall(r'"[^"]*":\S+', txt))
# Decide if either markdown or textile is used in the text # Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found. # based on the number of unique formatting elements found.

View File

@ -36,13 +36,12 @@ class TextileMLizer(object):
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
if not self.opts.keep_links: if not self.opts.keep_links:
html = re.sub(r'<\s*a[^>]*>', '', html) html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
html = re.sub(r'<\s*/\s*a\s*>', '', html)
if not self.opts.keep_image_references: if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html) html = re.sub(r'<\s*img[^>]*>', '', html)
html = re.sub(r'<\s*img\s*>', '', html)
text = html2textile(html) text = html2textile(html)
text = text.replace('%', '')
# Ensure the section ends with at least two new line characters. # Ensure the section ends with at least two new line characters.
# This is to prevent the last paragraph from a section being # This is to prevent the last paragraph from a section being