mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Enhance formatting detection regexes. Add basic TXTZ input support.
This commit is contained in:
parent
39c57bba49
commit
52c0a1899b
@@ -4,23 +4,29 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||||
from calibre.ebooks.chardet import detect
|
from calibre.ebooks.chardet import detect
|
||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
|
normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
image_list
|
||||||
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
|
from calibre.utils.zipfile import ZipFile
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
|
|
||||||
name = 'TXT Input'
|
name = 'TXT Input'
|
||||||
author = 'John Schember'
|
author = 'John Schember'
|
||||||
description = 'Convert TXT files to HTML'
|
description = 'Convert TXT files to HTML'
|
||||||
file_types = set(['txt'])
|
file_types = set(['txt', 'txtz'])
|
||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||||
@@ -57,10 +63,23 @@ class TXTInput(InputFormatPlugin):
|
|||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
self.log = log
|
self.log = log
|
||||||
|
txt = ''
|
||||||
log.debug('Reading text from file...')
|
log.debug('Reading text from file...')
|
||||||
length = 0
|
length = 0
|
||||||
|
|
||||||
txt = stream.read()
|
# Extract content from zip archive.
|
||||||
|
if file_ext == 'txtz':
|
||||||
|
log.debug('De-compressing content to temporary directory...')
|
||||||
|
with TemporaryDirectory('_untxtz') as tdir:
|
||||||
|
zf = ZipFile(stream)
|
||||||
|
zf.extractall(tdir)
|
||||||
|
|
||||||
|
txts = glob.glob(os.path.join(tdir, '*.txt'))
|
||||||
|
for t in txts:
|
||||||
|
with open(t, 'rb') as tf:
|
||||||
|
txt += tf.read()
|
||||||
|
else:
|
||||||
|
txt = stream.read()
|
||||||
|
|
||||||
# Get the encoding of the document.
|
# Get the encoding of the document.
|
||||||
if options.input_encoding:
|
if options.input_encoding:
|
||||||
|
@@ -221,9 +221,9 @@ def detect_formatting_type(txt):
|
|||||||
markdown_count += len(re.findall('(?mu)^=+$', txt))
|
markdown_count += len(re.findall('(?mu)^=+$', txt))
|
||||||
markdown_count += len(re.findall('(?mu)^-+$', txt))
|
markdown_count += len(re.findall('(?mu)^-+$', txt))
|
||||||
# Images
|
# Images
|
||||||
markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt))
|
markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
|
||||||
# Links
|
# Links
|
||||||
markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
|
markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))
|
||||||
|
|
||||||
# Check for textile
|
# Check for textile
|
||||||
# Headings
|
# Headings
|
||||||
@@ -231,9 +231,9 @@ def detect_formatting_type(txt):
|
|||||||
# Block quote.
|
# Block quote.
|
||||||
textile_count += len(re.findall(r'(?mu)^bq\.', txt))
|
textile_count += len(re.findall(r'(?mu)^bq\.', txt))
|
||||||
# Images
|
# Images
|
||||||
textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt))
|
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
|
||||||
# Links
|
# Links
|
||||||
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
|
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
|
||||||
|
|
||||||
# Decide if either markdown or textile is used in the text
|
# Decide if either markdown or textile is used in the text
|
||||||
# based on the number of unique formatting elements found.
|
# based on the number of unique formatting elements found.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user