mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Initial TXTZ format support
This commit is contained in:
commit
b8ec84468d
@ -325,6 +325,17 @@ class TXTMetadataReader(MetadataReaderPlugin):
|
||||
from calibre.ebooks.metadata.txt import get_metadata
|
||||
return get_metadata(stream)
|
||||
|
||||
class TXTZMetadataReader(MetadataReaderPlugin):
    '''
    Metadata reader plugin registered for the 'txtz' file type. The actual
    reading is delegated to calibre.ebooks.metadata.txtz.get_metadata.
    '''

    name = 'Read TXTZ metadata'
    author = 'John Schember'
    description = _('Read metadata from %s files') % 'TXTZ'
    file_types = set(['txtz'])

    def get_metadata(self, stream, ftype):
        '''
        Return the metadata read from stream. ftype is accepted but is not
        used by this method.
        '''
        # The backend module is imported inside the method, not at module
        # level, exactly as before.
        from calibre.ebooks.metadata import txtz
        return txtz.get_metadata(stream)
|
||||
|
||||
class ZipMetadataReader(MetadataReaderPlugin):
|
||||
|
||||
name = 'Read ZIP metadata'
|
||||
@ -412,6 +423,17 @@ class TOPAZMetadataWriter(MetadataWriterPlugin):
|
||||
from calibre.ebooks.metadata.topaz import set_metadata
|
||||
set_metadata(stream, mi)
|
||||
|
||||
class TXTZMetadataWriter(MetadataWriterPlugin):
    '''
    Metadata writer plugin registered for the 'txtz' file type. The actual
    writing is delegated to calibre.ebooks.metadata.txtz.set_metadata.
    '''

    name = 'Set TXTZ metadata'
    file_types = set(['txtz'])
    # A writer stores metadata in the file; "from" was carried over from the
    # reader plugin's description.
    description = _('Set metadata in %s files') % 'TXTZ'
    author = 'John Schember'

    def set_metadata(self, stream, mi, type):
        '''
        Write the metadata mi into the TXTZ file open as stream.

        type is accepted but is not used by this method. The parameter name
        is kept as-is so existing callers are unaffected.
        '''
        from calibre.ebooks.metadata.txtz import set_metadata
        set_metadata(stream, mi)
|
||||
|
||||
# }}}
|
||||
|
||||
from calibre.ebooks.comic.input import ComicInput
|
||||
@ -446,6 +468,7 @@ from calibre.ebooks.rb.output import RBOutput
|
||||
from calibre.ebooks.rtf.output import RTFOutput
|
||||
from calibre.ebooks.tcr.output import TCROutput
|
||||
from calibre.ebooks.txt.output import TXTOutput
|
||||
from calibre.ebooks.txt.output import TXTZOutput
|
||||
from calibre.ebooks.html.output import HTMLOutput
|
||||
from calibre.ebooks.snb.output import SNBOutput
|
||||
|
||||
@ -531,6 +554,7 @@ plugins += [
|
||||
RTFOutput,
|
||||
TCROutput,
|
||||
TXTOutput,
|
||||
TXTZOutput,
|
||||
HTMLOutput,
|
||||
SNBOutput,
|
||||
]
|
||||
|
@ -25,7 +25,7 @@ class DRMError(ValueError):
|
||||
class ParserError(ValueError):
|
||||
pass
|
||||
|
||||
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm',
|
||||
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'htm', 'xhtm',
|
||||
'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
|
||||
'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
|
||||
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb']
|
||||
|
@ -1,16 +1,20 @@
|
||||
'''Read meta information from TXT files'''
|
||||
|
||||
from __future__ import with_statement
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
|
||||
'''
|
||||
Read meta information from TXT files
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
|
||||
def get_metadata(stream, extract_cover=True):
|
||||
""" Return metadata as a L{MetaInfo} object """
|
||||
'''
|
||||
Return metadata as a L{MetaInfo} object
|
||||
'''
|
||||
mi = MetaInformation(_('Unknown'), [_('Unknown')])
|
||||
stream.seek(0)
|
||||
|
||||
|
38
src/calibre/ebooks/metadata/txtz.py
Normal file
38
src/calibre/ebooks/metadata/txtz.py
Normal file
@ -0,0 +1,38 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
|
||||
'''
|
||||
Read meta information from TXTZ files
|
||||
'''
|
||||
|
||||
import os
|
||||
|
||||
from cStringIO import StringIO
|
||||
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.utils.zipfile import ZipFile, safe_replace
|
||||
|
||||
def get_metadata(stream, extract_cover=True):
    '''
    Return the metadata stored in the metadata.opf entry of a TXTZ archive.

    stream is a seekable file-like object containing the archive; it is
    rewound before reading. extract_cover is accepted but is not used by
    this function.

    Reading is best effort: if the archive cannot be opened, has no
    metadata.opf entry, or the OPF cannot be parsed, a placeholder
    MetaInformation with an unknown title and author is returned instead
    of raising.
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    with TemporaryDirectory('_untxtz_mdata') as tdir:
        try:
            zf = ZipFile(stream)
            try:
                zf.extract('metadata.opf', tdir)
            finally:
                # Release the archive handle as soon as the entry is on disk.
                zf.close()
            with open(os.path.join(tdir, 'metadata.opf'), 'rb') as opff:
                mi = OPF(opff).to_book_metadata()
        except Exception:
            # Deliberate best effort: fall back to the placeholder metadata.
            # Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass
    return mi
|
||||
|
||||
def set_metadata(stream, mi):
    '''
    Serialize mi with metadata_to_opf and hand the result to safe_replace
    under the entry name metadata.opf for the archive in stream.
    '''
    safe_replace(stream, 'metadata.opf', StringIO(metadata_to_opf(mi)))
|
@ -4,8 +4,10 @@ __license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import glob
|
||||
import os
|
||||
|
||||
from calibre import _ent_pat, xml_entity_to_unicode
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.ebooks.chardet import detect
|
||||
@ -13,14 +15,15 @@ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
|
||||
from calibre import _ent_pat, xml_entity_to_unicode
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
|
||||
class TXTInput(InputFormatPlugin):
|
||||
|
||||
name = 'TXT Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert TXT files to HTML'
|
||||
file_types = set(['txt'])
|
||||
file_types = set(['txt', 'txtz'])
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||
@ -57,10 +60,23 @@ class TXTInput(InputFormatPlugin):
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
self.log = log
|
||||
txt = ''
|
||||
log.debug('Reading text from file...')
|
||||
length = 0
|
||||
|
||||
txt = stream.read()
|
||||
# Extract content from zip archive.
|
||||
if file_ext == 'txtz':
|
||||
log.debug('De-compressing content to temporary directory...')
|
||||
with TemporaryDirectory('_untxtz') as tdir:
|
||||
zf = ZipFile(stream)
|
||||
zf.extractall(tdir)
|
||||
|
||||
txts = glob.glob(os.path.join(tdir, '*.txt'))
|
||||
for t in txts:
|
||||
with open(t, 'rb') as tf:
|
||||
txt += tf.read()
|
||||
else:
|
||||
txt = stream.read()
|
||||
|
||||
# Get the encoding of the document.
|
||||
if options.input_encoding:
|
||||
@ -175,4 +191,11 @@ class TXTInput(InputFormatPlugin):
|
||||
{})
|
||||
options.debug_pipeline = odi
|
||||
os.remove(htmlfile.name)
|
||||
|
||||
# Set metadata from file.
|
||||
from calibre.customize.ui import get_file_type_metadata
|
||||
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
|
||||
mi = get_file_type_metadata(stream, file_ext)
|
||||
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
|
||||
|
||||
return oeb
|
||||
|
@ -35,11 +35,9 @@ class MarkdownMLizer(object):
|
||||
html = unicode(etree.tostring(item.data, encoding=unicode))
|
||||
|
||||
if not self.opts.keep_links:
|
||||
html = re.sub(r'<\s*a[^>]*>', '', html)
|
||||
html = re.sub(r'<\s*/\s*a\s*>', '', html)
|
||||
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
|
||||
if not self.opts.keep_image_references:
|
||||
html = re.sub(r'<\s*img[^>]*>', '', html)
|
||||
html = re.sub(r'<\s*img\s*>', '', html)
|
||||
html = re.sub(r'<\s*img[^>]*>', '', html)\
|
||||
|
||||
text = html2text(html)
|
||||
|
||||
|
@ -5,11 +5,18 @@ __copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
|
||||
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
|
||||
class TXTOutput(OutputFormatPlugin):
|
||||
|
||||
@ -73,6 +80,7 @@ class TXTOutput(OutputFormatPlugin):
|
||||
writer = TXTMLizer(log)
|
||||
|
||||
txt = writer.extract_content(oeb_book, opts)
|
||||
txt = clean_ascii_chars(txt)
|
||||
|
||||
log.debug('\tReplacing newlines with selected type...')
|
||||
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
|
||||
@ -93,3 +101,32 @@ class TXTOutput(OutputFormatPlugin):
|
||||
if close:
|
||||
out_stream.close()
|
||||
|
||||
|
||||
class TXTZOutput(TXTOutput):
    '''
    Output plugin for the 'txtz' file type: a zip archive containing the
    converted text as index.txt, the book's image items and a metadata.opf
    file.
    '''

    name = 'TXTZ Output'
    author = 'John Schember'
    file_type = 'txtz'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Convert oeb_book to TXTZ and write the archive to output_path.

        The archive contents are first assembled in a temporary directory,
        which is then added to the zip file as a whole.
        '''
        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT
            # NOTE(review): tf is handed to shutil.copy as the source, so it
            # is presumably a file name rather than a file object -- confirm
            # against calibre.ptempfile.TemporaryFile.
            with TemporaryFile('index.txt') as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, 'index.txt'))

            # Images
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES:
                    # Recreate the directory part of the href under tdir.
                    path = os.path.join(tdir, os.path.dirname(item.href))
                    if not os.path.exists(path):
                        os.makedirs(path)
                    with open(os.path.join(tdir, item.href), 'wb') as imgf:
                        imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            # Close the archive explicitly, while tdir still exists, so that
            # finalizing the zip file does not depend on garbage collection.
            txtz = ZipFile(output_path, 'w')
            try:
                txtz.add_dir(tdir)
            finally:
                txtz.close()
|
||||
|
@ -29,8 +29,7 @@ def clean_txt(txt):
|
||||
txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
|
||||
|
||||
# Replace whitespace at the beginning of the line with
|
||||
txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: ' ' * mo.groups('space').count(' '), txt)
|
||||
txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt)
|
||||
txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt)
|
||||
|
||||
# Condense redundant spaces
|
||||
txt = re.sub('[ ]{2,}', ' ', txt)
|
||||
@ -221,9 +220,9 @@ def detect_formatting_type(txt):
|
||||
markdown_count += len(re.findall('(?mu)^=+$', txt))
|
||||
markdown_count += len(re.findall('(?mu)^-+$', txt))
|
||||
# Images
|
||||
markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt))
|
||||
markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
|
||||
# Links
|
||||
markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
|
||||
# The "start of text or a character other than !" alternative must be grouped:
# ungrouped, "|" splits the whole pattern and "^" matches on its own, adding
# a spurious hit to the count and missing a link at the very start of txt.
markdown_count += len(re.findall(r'(?u)(^|[^!])\[.*?\](\[|\()', txt))
|
||||
|
||||
# Check for textile
|
||||
# Headings
|
||||
@ -231,9 +230,9 @@ def detect_formatting_type(txt):
|
||||
# Block quote.
|
||||
textile_count += len(re.findall(r'(?mu)^bq\.', txt))
|
||||
# Images
|
||||
textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt))
|
||||
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
|
||||
# Links
|
||||
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
|
||||
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
|
||||
|
||||
# Decide if either markdown or textile is used in the text
|
||||
# based on the number of unique formatting elements found.
|
||||
|
@ -36,13 +36,12 @@ class TextileMLizer(object):
|
||||
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
||||
|
||||
if not self.opts.keep_links:
|
||||
html = re.sub(r'<\s*a[^>]*>', '', html)
|
||||
html = re.sub(r'<\s*/\s*a\s*>', '', html)
|
||||
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
|
||||
if not self.opts.keep_image_references:
|
||||
html = re.sub(r'<\s*img[^>]*>', '', html)
|
||||
html = re.sub(r'<\s*img\s*>', '', html)
|
||||
|
||||
text = html2textile(html)
|
||||
text = text.replace('%', '')
|
||||
|
||||
# Ensure the section ends with at least two new line characters.
|
||||
# This is to prevent the last paragraph from a section being
|
||||
|
Loading…
x
Reference in New Issue
Block a user