TXT: small Textile changes. Remove old textile conversion code.

2025-07-09 03:04:10 -04:00 · 2011-05-10 18:55:19 -04:00 · 2011-05-10 18:55:19 -04:00 · 441718f76c
commit 441718f76c
parent b95f9949be
2 changed files with 34 additions and 233 deletions
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@ -78,44 +78,55 @@ class TextileMLizer(OEB2HTML):
            for i in self.our_links:
                if i[0] == '#':
                    if i not in self.our_ids:
                        self.log.debug('Link has no target - %s ...' % i)
                        text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
            for i in self.our_ids:
                if i not in self.our_links:
                    self.log.debug('ID has no link - %s ...' % i)
                    text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
        # Remove obvious non-needed escaping, add sub/sup-script ones
        text = check_escaping(text, ['\*', '_', '\*'])
-        text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed
+        # escape the super/sub-scripts if needed
-        text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed
+        text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
        # escape the super/sub-scripts if needed
        text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
-        text = re.sub(r'%\xa0+', r'%', text)                            #remove empty spans
+        #remove empty spans
-        text = re.sub(r'%%', r'', text)                                 #remove empty spans - MAY MERGE SOME ?
+        text = re.sub(r'%\xa0+', r'%', text)
-        text = re.sub(r'%([_+*-]+)%', r'\1', text)                      #remove spans from tagged output
+        #remove empty spans - MAY MERGE SOME ?
-        text = re.sub(r' +\n', r'\n', text)                             #remove spaces before a newline
+        text = re.sub(r'%%', r'', text)
-        text = re.sub(r'^\n+', r'', text)                               #remove newlines at top of file
+        #remove spans from tagged output
-        text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)              #correct blockcode paras
+        text = re.sub(r'%([_+*-]+)%', r'\1', text)
-        text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)            #correct blockquote paras
+        #remove spaces before a newline
-#        text = re.sub(r'\n{4,}', r'\n\np. \n\n', text)                  #reduce blank lines + insert blank para
+        text = re.sub(r' +\n', r'\n', text)
-        text = re.sub(r'\n{3}', r'\n\n', text)                          #reduce blank lines
+        #remove newlines at top of file
-#        text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text)
+        text = re.sub(r'^\n+', r'', text)
        #correct blockcode paras
        text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
        #correct blockquote paras
        text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)
        #reduce blank lines
        text = re.sub(r'\n{3}', r'\n\n', text)
        text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
-        text = re.sub(r'\n\n {2,4}%', r'%', text)                          #Check span following blank para
+        #Check span following blank para
        text = re.sub(r'\n+ +%', r' %', text)
        text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
-        text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text)                # blank paragraph
+        # blank paragraph
-        text = re.sub(u'\n\xa0',   r'\np. ', text)                     # blank paragraph
+        text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text)
-        text = re.sub(u'\np[<>=]{1,2}?\. \xa0',   r'\np. ', text)       # blank paragraph
+        # blank paragraph
        text = re.sub(u'\n\xa0',   r'\np. ', text)
        # blank paragraph
        text = re.sub(u'\np[<>=]{1,2}?\. \xa0',   r'\np. ', text)
        text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
        text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
-        text = re.sub(r' {2,}\|', r' |', text)                               #sort out spaces in tables
+        #sort out spaces in tables
        text = re.sub(r' {2,}\|', r' |', text)
        # Now put back spaces removed earlier as they're needed here
        text = re.sub(r'\np\.\n', r'\np. \n', text)
-        text = re.sub(r' \n\n\n', r' \n\n', text)                          #reduce blank lines
+        #reduce blank lines
        text = re.sub(r' \n\n\n', r' \n\n', text)
        # started work on trying to fix footnotes
 #        text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text)
        return text
    def remove_newlines(self, text):
@ -198,7 +209,6 @@ class TextileMLizer(OEB2HTML):
        return txt
    def prepare_string_for_textile(self, txt):
 #        if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt):
        if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt):
            return ' ==%s== ' % txt
        return txt
--- a/src/calibre/utils/html2textile.py
+++ b/src/calibre/utils/html2textile.py
@ -1,209 +0,0 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2010, Webreactor - Marcin Lulek <info@webreactor.eu>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #    * Redistributions of source code must retain the above copyright
 #      notice, this list of conditions and the following disclaimer.
 #    * Redistributions in binary form must reproduce the above copyright
 #      notice, this list of conditions and the following disclaimer in the
 #      documentation and/or other materials provided with the distribution.
 #    * Neither the name of the <organization> nor the
 #      names of its contributors may be used to endorse or promote products
 #      derived from this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 from lxml import etree
 from calibre.ebooks.oeb.base import barename
 class EchoTarget:
    def __init__(self):
        self.final_output = []
        self.block = False
        self.ol_ident = 0
        self.ul_ident = 0
        self.list_types = []
        self.haystack = []
    def start(self, tag, attrib):
        tag = barename(tag)
        newline = '\n'
        dot = ''
        new_tag = ''
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            new_tag = tag
            dot = '. '
        elif tag == 'p':
                new_tag = ''
                dot = ''
        elif tag == 'blockquote':
            new_tag = 'bq'
            dot = '. '
        elif tag in ('b', 'strong'):
            new_tag = '*'
            newline = ''
        elif tag in ('em', 'i'):
            new_tag = '_'
            newline = ''
        elif tag == 'cite':
            new_tag = '??'
            newline = ''
        elif tag == 'del':
            new_tag = '-'
            newline = ''
        elif tag == 'ins':
            new_tag = '+'
            newline = ''
        elif tag == 'sup':
            new_tag = '^'
            newline = ''
        elif tag == 'sub':
            new_tag = '~'
            newline = ''
        elif tag == 'span':
            new_tag = ''
            newline = ''
        elif tag == 'a':
            self.block = True
            if 'title' in attrib:
                self.a_part = {'title':attrib.get('title'),
                               'href':attrib.get('href', '')}
            else:
                self.a_part = {'title':None, 'href':attrib.get('href', '')}
            new_tag = ''
            newline = ''
        elif tag == 'img':
            if 'alt' in attrib:
                new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),)
            else:
                new_tag = ' !%s' % attrib.get('src')
            newline = ''
        elif tag in ('ul', 'ol'):
            new_tag = ''
            newline = ''
            self.list_types.append(tag)
            if tag == 'ul':
                self.ul_ident += 1
            else:
                self.ol_ident += 1
        elif tag == 'li':
            indent = self.ul_ident + self.ol_ident
            if self.list_types[-1] == 'ul':
                new_tag = '*' * indent + ' '
                newline = '\n'
            else:
                new_tag = '#' * indent + ' '
                newline = '\n'
        if tag not in ('ul', 'ol'):
            textile = '%(newline)s%(tag)s%(dot)s' % \
                                 {
                                  'newline':newline,
                                  'tag':new_tag,
                                  'dot':dot
                                  }
            if not self.block:
                self.final_output.append(textile)
            else:
                self.haystack.append(textile)
    def end(self, tag):
        tag = barename(tag)
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            self.final_output.append('\n')
        elif tag in ('b', 'strong'):
            self.final_output.append('*')
        elif tag in ('em', 'i'):
            self.final_output.append('_')
        elif tag == 'cite':
            self.final_output.append('??')
        elif tag == 'del':
            self.final_output.append('-')
        elif tag == 'ins':
            self.final_output.append('+')
        elif tag == 'sup':
            self.final_output.append('^')
        elif tag == 'sub':
            self.final_output.append('~')
        elif tag == 'span':
            self.final_output.append('')
        elif tag == 'a':
            if self.a_part['title']:
                textilized = ' "%s (%s)":%s ' % (
                                                 ''.join(self.haystack),
                                                 self.a_part.get('title'),
                                                 self.a_part.get('href'),
                                                 )
                self.haystack = []
            else:
                textilized = ' "%s":%s ' % (
                                                 ''.join(self.haystack),
                                                 self.a_part.get('href'),
                                                 )
                self.haystack = []
            self.final_output.append(textilized)
            self.block = False
        elif tag == 'img':
            self.final_output.append('!')
        elif tag == 'ul':
            self.ul_ident -= 1
            self.list_types.pop()
            if len(self.list_types) == 0:
                self.final_output.append('\n')
        elif tag == 'ol':
            self.ol_ident -= 1
            self.list_types.pop()
            if len(self.list_types) == 0:
                self.final_output.append('\n')
    def data(self, data):
        #we dont want any linebreaks inside our tags
        node_data = data.replace('\n','')
        if not self.block:
            self.final_output.append(node_data)
        else:
            self.haystack.append(node_data)
    def comment(self, text):
        pass
    def close(self):
        return "closed!"
 def html2textile(html):
    #1st pass
    #clean the whitespace and convert html to xhtml
    parser = etree.HTMLParser()
    tree = etree.fromstring(html, parser)
    xhtml = etree.tostring(tree, method="xml")
    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.XML(xhtml, parser)
    cleaned_html = etree.tostring(root)
    #2nd pass build textile
    target = EchoTarget()
    parser = etree.XMLParser(target=target)
    root = etree.fromstring(cleaned_html, parser)
    textilized_text = ''.join(target.final_output).lstrip().rstrip()
    return textilized_text