From 9585ba655c810bb9132f3d6d7299455d23d47493 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 11 Jan 2011 18:08:55 -0500 Subject: [PATCH] TXT Input: remove unnecessary try block. Rework markdown and textile detection. --- src/calibre/ebooks/txt/input.py | 6 +---- src/calibre/ebooks/txt/processor.py | 41 +++++++++++++---------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 73af3acde4..0b0bd6d570 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -94,11 +94,7 @@ class TXTInput(InputFormatPlugin): ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') elif options.formatting_type == 'textile': log.debug('Running text though textile conversion...') - try: - html = convert_textile(txt) - except RuntimeError: - raise ValueError('This txt file has malformed markup, it cannot be' - ' converted by calibre.') + html = convert_textile(txt) else: # Determine the paragraph type of the document. 
if options.paragraph_type == 'auto': diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index d0526bd9fc..d59fd4121a 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -162,38 +162,33 @@ def detect_paragraph_type(txt): def detect_formatting_type(txt): + markdown_count = 0 + textile_count = 0 + # Check for markdown # Headings - if len(re.findall('(?mu)^#+', txt)) >= 5: - return 'markdown' - if len(re.findall('(?mu)^=+$', txt)) >= 5: - return 'markdown' - if len(re.findall('(?mu)^-+$', txt)) >= 5: - return 'markdown' + markdown_count += len(re.findall('(?mu)^#+', txt)) + markdown_count += len(re.findall('(?mu)^=+$', txt)) + markdown_count += len(re.findall('(?mu)^-+$', txt)) # Images - if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: - return 'markdown' + markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) # Links - if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
-        return 'markdown'
-    # Escaped characters
-    md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
-    for c in md_escapted_characters:
-        if txt.count('\\'+c) > 10:
-            return 'markdown'
+    markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
         
     # Check for textile
     # Headings
-    if len(re.findall(r'h[1-6]\.', txt)) >= 5:
-        return 'textile'
+    textile_count += len(re.findall(r'(?mu)^h[1-6]\.', txt))
     # Block quote.
-    if len(re.findall(r'bq\.', txt)) >= 5:
-        return 'textile'
+    textile_count += len(re.findall(r'(?mu)^bq\.', txt))
     # Images
-    if len(re.findall(r'\![^\s]+(:[^\s]+)*', txt)) >= 5:
-        return 'textile'
+    textile_count += len(re.findall(r'\![^\s]+(:[^\s]+)*', txt))
     # Links
-    if len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) >= 5:
-        return 'textile'
+    textile_count += len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
+    
+    if markdown_count > 5 or textile_count > 5:
+        if markdown_count > textile_count:
+            return 'markdown'
+        else:
+            return 'textile'
     
     return 'heuristic'