TXT: small Textile changes. Remove old textile conversion code.

2025-07-09 03:04:10 -04:00 · 2011-05-10 18:55:19 -04:00 · 2011-05-10 18:55:19 -04:00 · 441718f76c
commit 441718f76c
parent b95f9949be
2 changed files with 34 additions and 233 deletions
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@@ -78,44 +78,55 @@ class TextileMLizer(OEB2HTML):
            for i in self.our_links:
                if i[0] == '#':
                    if i not in self.our_ids:
-                        self.log.debug('Link has no target - %s ...' % i)
                        text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
            for i in self.our_ids:
                if i not in self.our_links:
-                    self.log.debug('ID has no link - %s ...' % i)
                    text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
                    
        # Remove obvious non-needed escaping, add sub/sup-script ones
        text = check_escaping(text, ['\*', '_', '\*'])
-        text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed
-        text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed
+        # escape the super/sub-scripts if needed
+        text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
+        # escape the super/sub-scripts if needed
+        text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)

-        text = re.sub(r'%\xa0+', r'%', text)                            #remove empty spans
-        text = re.sub(r'%%', r'', text)                                 #remove empty spans - MAY MERGE SOME ?
-        text = re.sub(r'%([_+*-]+)%', r'\1', text)                      #remove spans from tagged output
-        text = re.sub(r' +\n', r'\n', text)                             #remove spaces before a newline
-        text = re.sub(r'^\n+', r'', text)                               #remove newlines at top of file
-        text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)              #correct blockcode paras
-        text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)            #correct blockquote paras
-#        text = re.sub(r'\n{4,}', r'\n\np. \n\n', text)                  #reduce blank lines + insert blank para
-        text = re.sub(r'\n{3}', r'\n\n', text)                          #reduce blank lines
-#        text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text)
+        #remove empty spans
+        text = re.sub(r'%\xa0+', r'%', text)
+        #remove empty spans - MAY MERGE SOME ?
+        text = re.sub(r'%%', r'', text)
+        #remove spans from tagged output
+        text = re.sub(r'%([_+*-]+)%', r'\1', text)
+        #remove spaces before a newline
+        text = re.sub(r' +\n', r'\n', text)
+        #remove newlines at top of file
+        text = re.sub(r'^\n+', r'', text)
+        #correct blockcode paras
+        text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
+        #correct blockquote paras
+        text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)
+
+        #reduce blank lines
+        text = re.sub(r'\n{3}', r'\n\n', text)
        text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
-        text = re.sub(r'\n\n {2,4}%', r'%', text)                          #Check span following blank para
+        #Check span following blank para
+        text = re.sub(r'\n+ +%', r' %', text)
        text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
-        text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text)                # blank paragraph
-        text = re.sub(u'\n\xa0',   r'\np. ', text)                     # blank paragraph
-        text = re.sub(u'\np[<>=]{1,2}?\. \xa0',   r'\np. ', text)       # blank paragraph
+        # blank paragraph
+        text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text)
+        # blank paragraph
+        text = re.sub(u'\n\xa0',   r'\np. ', text)
+        # blank paragraph
+        text = re.sub(u'\np[<>=]{1,2}?\. \xa0',   r'\np. ', text)
        text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
        text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
-        text = re.sub(r' {2,}\|', r' |', text)                               #sort out spaces in tables
+        #sort out spaces in tables
+        text = re.sub(r' {2,}\|', r' |', text)

        # Now put back spaces removed earlier as they're needed here
        text = re.sub(r'\np\.\n', r'\np. \n', text)
-        text = re.sub(r' \n\n\n', r' \n\n', text)                          #reduce blank lines
-        
-        # started work on trying to fix footnotes
-#        text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text)
+        #reduce blank lines
+        text = re.sub(r' \n\n\n', r' \n\n', text)
+
        return text

    def remove_newlines(self, text):
@@ -198,7 +209,6 @@ class TextileMLizer(OEB2HTML):
        return txt

    def prepare_string_for_textile(self, txt):
-#        if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt):
        if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt):
            return ' ==%s== ' % txt
        return txt
--- a/src/calibre/utils/html2textile.py
+++ b/src/calibre/utils/html2textile.py
@@ -1,209 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright (c) 2010, Webreactor - Marcin Lulek <info@webreactor.eu>
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#    * Redistributions of source code must retain the above copyright
-#      notice, this list of conditions and the following disclaimer.
-#    * Redistributions in binary form must reproduce the above copyright
-#      notice, this list of conditions and the following disclaimer in the
-#      documentation and/or other materials provided with the distribution.
-#    * Neither the name of the <organization> nor the
-#      names of its contributors may be used to endorse or promote products
-#      derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-from lxml import etree
-from calibre.ebooks.oeb.base import barename
-
-class EchoTarget:
-
-    def __init__(self):
-        self.final_output = []
-        self.block = False
-        self.ol_ident = 0
-        self.ul_ident = 0
-        self.list_types = []
-        self.haystack = []
-
-    def start(self, tag, attrib):
-        tag = barename(tag)
-
-        newline = '\n'
-        dot = ''
-        new_tag = ''
-
-        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
-            new_tag = tag
-            dot = '. '
-        elif tag == 'p':
-                new_tag = ''
-                dot = ''
-        elif tag == 'blockquote':
-            new_tag = 'bq'
-            dot = '. '
-        elif tag in ('b', 'strong'):
-            new_tag = '*'
-            newline = ''
-        elif tag in ('em', 'i'):
-            new_tag = '_'
-            newline = ''
-        elif tag == 'cite':
-            new_tag = '??'
-            newline = ''
-        elif tag == 'del':
-            new_tag = '-'
-            newline = ''
-        elif tag == 'ins':
-            new_tag = '+'
-            newline = ''
-        elif tag == 'sup':
-            new_tag = '^'
-            newline = ''
-        elif tag == 'sub':
-            new_tag = '~'
-            newline = ''
-        elif tag == 'span':
-            new_tag = ''
-            newline = ''
-        elif tag == 'a':
-            self.block = True
-            if 'title' in attrib:
-                self.a_part = {'title':attrib.get('title'),
-                               'href':attrib.get('href', '')}
-            else:
-                self.a_part = {'title':None, 'href':attrib.get('href', '')}
-            new_tag = ''
-            newline = ''
-
-        elif tag == 'img':
-            if 'alt' in attrib:
-                new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),)
-            else:
-                new_tag = ' !%s' % attrib.get('src')
-            newline = ''
-
-        elif tag in ('ul', 'ol'):
-            new_tag = ''
-            newline = ''
-            self.list_types.append(tag)
-            if tag == 'ul':
-                self.ul_ident += 1
-            else:
-                self.ol_ident += 1
-
-        elif tag == 'li':
-            indent = self.ul_ident + self.ol_ident
-            if self.list_types[-1] == 'ul':
-                new_tag = '*' * indent + ' '
-                newline = '\n'
-            else:
-                new_tag = '#' * indent + ' '
-                newline = '\n'
-
-
-        if tag not in ('ul', 'ol'):
-            textile = '%(newline)s%(tag)s%(dot)s' % \
-                                 {
-                                  'newline':newline,
-                                  'tag':new_tag,
-                                  'dot':dot
-                                  }
-            if not self.block:
-                self.final_output.append(textile)
-            else:
-                self.haystack.append(textile)
-
-    def end(self, tag):
-        tag = barename(tag)
-
-        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
-            self.final_output.append('\n')
-        elif tag in ('b', 'strong'):
-            self.final_output.append('*')
-        elif tag in ('em', 'i'):
-            self.final_output.append('_')
-        elif tag == 'cite':
-            self.final_output.append('??')
-        elif tag == 'del':
-            self.final_output.append('-')
-        elif tag == 'ins':
-            self.final_output.append('+')
-        elif tag == 'sup':
-            self.final_output.append('^')
-        elif tag == 'sub':
-            self.final_output.append('~')
-        elif tag == 'span':
-            self.final_output.append('')
-        elif tag == 'a':
-            if self.a_part['title']:
-                textilized = ' "%s (%s)":%s ' % (
-                                                 ''.join(self.haystack),
-                                                 self.a_part.get('title'),
-                                                 self.a_part.get('href'),
-                                                 )
-                self.haystack = []
-            else:
-                textilized = ' "%s":%s ' % (
-                                                 ''.join(self.haystack),
-                                                 self.a_part.get('href'),
-                                                 )
-                self.haystack = []
-            self.final_output.append(textilized)
-            self.block = False
-        elif tag == 'img':
-            self.final_output.append('!')
-        elif tag == 'ul':
-            self.ul_ident -= 1
-            self.list_types.pop()
-            if len(self.list_types) == 0:
-                self.final_output.append('\n')
-        elif tag == 'ol':
-            self.ol_ident -= 1
-            self.list_types.pop()
-            if len(self.list_types) == 0:
-                self.final_output.append('\n')
-
-    def data(self, data):
-        #we dont want any linebreaks inside our tags
-        node_data = data.replace('\n','')
-        if not self.block:
-            self.final_output.append(node_data)
-        else:
-            self.haystack.append(node_data)
-
-    def comment(self, text):
-        pass
-
-    def close(self):
-        return "closed!"
-
-
-def html2textile(html):
-    #1st pass
-    #clean the whitespace and convert html to xhtml
-    parser = etree.HTMLParser()
-    tree = etree.fromstring(html, parser)
-    xhtml = etree.tostring(tree, method="xml")
-    parser = etree.XMLParser(remove_blank_text=True)
-    root = etree.XML(xhtml, parser)
-    cleaned_html = etree.tostring(root)
-    #2nd pass build textile
-    target = EchoTarget()
-    parser = etree.XMLParser(target=target)
-    root = etree.fromstring(cleaned_html, parser)
-    textilized_text = ''.join(target.final_output).lstrip().rstrip()
-    return textilized_text