TXT Input: remove unnecessary try block. Rework markdown and textile detection.

This commit is contained in:
John Schember 2011-01-11 18:08:55 -05:00
parent 626f1b2558
commit 9585ba655c
2 changed files with 19 additions and 28 deletions

View File

@@ -94,11 +94,7 @@ class TXTInput(InputFormatPlugin):
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
elif options.formatting_type == 'textile': elif options.formatting_type == 'textile':
log.debug('Running text though textile conversion...') log.debug('Running text though textile conversion...')
try: html = convert_textile(txt)
html = convert_textile(txt)
except RuntimeError:
raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre.')
else: else:
# Determine the paragraph type of the document. # Determine the paragraph type of the document.
if options.paragraph_type == 'auto': if options.paragraph_type == 'auto':

View File

@@ -162,38 +162,33 @@ def detect_paragraph_type(txt):
def detect_formatting_type(txt): def detect_formatting_type(txt):
markdown_count = 0
textile_count = 0
# Check for markdown # Check for markdown
# Headings # Headings
if len(re.findall('(?mu)^#+', txt)) >= 5: markdown_count += len(re.findall('(?mu)^#+', txt))
return 'markdown' markdown_count += len(re.findall('(?mu)^=+$', txt))
if len(re.findall('(?mu)^=+$', txt)) >= 5: markdown_count += len(re.findall('(?mu)^-+$', txt))
return 'markdown'
if len(re.findall('(?mu)^-+$', txt)) >= 5:
return 'markdown'
# Images # Images
if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt))
return 'markdown'
# Links # Links
if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5: markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
return 'markdown'
# Escaped characters
md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
for c in md_escapted_characters:
if txt.count('\\'+c) > 10:
return 'markdown'
# Check for textile # Check for textile
# Headings # Headings
if len(re.findall(r'h[1-6]\.', txt)) >= 5: textile_count += len(re.findall(r'(?mu)^h[1-6]\.', txt))
return 'textile'
# Block quote. # Block quote.
if len(re.findall(r'bq\.', txt)) >= 5: textile_count += len(re.findall(r'(?mu)^bq\.', txt))
return 'textile'
# Images # Images
if len(re.findall(r'\![^\s]+(:[^\s]+)*', txt)) >= 5: textile_count += len(re.findall(r'\![^\s]+(:[^\s]+)*', txt))
return 'textile'
# Links # Links
if len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) >= 5: textile_count += len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
return 'textile'
if markdown_count > 5 or textile_count > 5:
if markdown_count > textile_count:
return 'markdown'
else:
return 'textile'
return 'heuristic' return 'heuristic'