TXT Input: Enhance formatting detection regexes. Add basic TXTZ input support.

This commit is contained in:
John Schember 2011-02-06 13:49:50 -05:00
parent 39c57bba49
commit 52c0a1899b
2 changed files with 27 additions and 8 deletions

View File

@ -4,23 +4,29 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import glob
import mimetypes
import os import os
import shutil
from calibre import _ent_pat, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents, block_to_single_line normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
from calibre import _ent_pat, xml_entity_to_unicode image_list
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
name = 'TXT Input' name = 'TXT Input'
author = 'John Schember' author = 'John Schember'
description = 'Convert TXT files to HTML' description = 'Convert TXT files to HTML'
file_types = set(['txt']) file_types = set(['txt', 'txtz'])
options = set([ options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto', OptionRecommendation(name='paragraph_type', recommended_value='auto',
@ -57,10 +63,23 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
self.log = log self.log = log
txt = ''
log.debug('Reading text from file...') log.debug('Reading text from file...')
length = 0 length = 0
txt = stream.read() # Extract content from zip archive.
if file_ext == 'txtz':
log.debug('De-compressing content to temporary directory...')
with TemporaryDirectory('_untxtz') as tdir:
zf = ZipFile(stream)
zf.extractall(tdir)
txts = glob.glob(os.path.join(tdir, '*.txt'))
for t in txts:
with open(t, 'rb') as tf:
txt += tf.read()
else:
txt = stream.read()
# Get the encoding of the document. # Get the encoding of the document.
if options.input_encoding: if options.input_encoding:

View File

@ -221,9 +221,9 @@ def detect_formatting_type(txt):
markdown_count += len(re.findall('(?mu)^=+$', txt)) markdown_count += len(re.findall('(?mu)^=+$', txt))
markdown_count += len(re.findall('(?mu)^-+$', txt)) markdown_count += len(re.findall('(?mu)^-+$', txt))
# Images # Images
markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
# Links # Links
markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))
# Check for textile # Check for textile
# Headings # Headings
@ -231,9 +231,9 @@ def detect_formatting_type(txt):
# Block quote. # Block quote.
textile_count += len(re.findall(r'(?mu)^bq\.', txt)) textile_count += len(re.findall(r'(?mu)^bq\.', txt))
# Images # Images
textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt)) textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
# Links # Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) textile_count += len(re.findall(r'"[^"]*":\S+', txt))
# Decide if either markdown or textile is used in the text # Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found. # based on the number of unique formatting elements found.