TXT Input: Enhance formatting detection regexes. Add basic TXTZ input support.

This commit is contained in:
John Schember 2011-02-06 13:49:50 -05:00
parent 39c57bba49
commit 52c0a1899b
2 changed files with 27 additions and 8 deletions

View File

@ -4,23 +4,29 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import glob
import mimetypes
import os
import shutil
from calibre import _ent_pat, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
from calibre import _ent_pat, xml_entity_to_unicode
normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
image_list
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
class TXTInput(InputFormatPlugin):
name = 'TXT Input'
author = 'John Schember'
description = 'Convert TXT files to HTML'
file_types = set(['txt'])
file_types = set(['txt', 'txtz'])
options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto',
@ -57,10 +63,23 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
self.log = log
txt = ''
log.debug('Reading text from file...')
length = 0
txt = stream.read()
# Extract content from zip archive.
if file_ext == 'txtz':
log.debug('De-compressing content to temporary directory...')
with TemporaryDirectory('_untxtz') as tdir:
zf = ZipFile(stream)
zf.extractall(tdir)
txts = glob.glob(os.path.join(tdir, '*.txt'))
for t in txts:
with open(t, 'rb') as tf:
txt += tf.read()
else:
txt = stream.read()
# Get the encoding of the document.
if options.input_encoding:

View File

@ -221,9 +221,9 @@ def detect_formatting_type(txt):
markdown_count += len(re.findall('(?mu)^=+$', txt))
markdown_count += len(re.findall('(?mu)^-+$', txt))
# Images
markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt))
markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
# Links
markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))
# Check for textile
# Headings
@ -231,9 +231,9 @@ def detect_formatting_type(txt):
# Block quote.
textile_count += len(re.findall(r'(?mu)^bq\.', txt))
# Images
textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt))
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
# Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
# Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found.