From 441718f76c867da749a10607f931b8b03485d331 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 18:55:19 -0400 Subject: [PATCH] TXT: small Textile changes. Remove old textile conversion code. --- src/calibre/ebooks/txt/textileml.py | 58 ++++---- src/calibre/utils/html2textile.py | 209 ---------------------------- 2 files changed, 34 insertions(+), 233 deletions(-) delete mode 100644 src/calibre/utils/html2textile.py diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 814ba01a3e..17988053e8 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -78,44 +78,55 @@ class TextileMLizer(OEB2HTML): for i in self.our_links: if i[0] == '#': if i not in self.our_ids: - self.log.debug('Link has no target - %s ...' % i) text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - self.log.debug('ID has no link - %s ...' % i) text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) # Remove obvious non-needed escaping, add sub/sup-script ones text = check_escaping(text, ['\*', '_', '\*']) - text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed - text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed + # escape the super/sub-scripts if needed + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) + # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) - text = re.sub(r'%\xa0+', r'%', text) #remove empty spans - text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? - text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output - text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline - text = re.sub(r'^\n+', r'', text) #remove newlines at top of file - text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras - text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras -# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para - text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines -# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) + #remove empty spans + text = re.sub(r'%\xa0+', r'%', text) + #remove empty spans - MAY MERGE SOME ? + text = re.sub(r'%%', r'', text) + #remove spans from tagged output + text = re.sub(r'%([_+*-]+)%', r'\1', text) + #remove spaces before a newline + text = re.sub(r' +\n', r'\n', text) + #remove newlines at top of file + text = re.sub(r'^\n+', r'', text) + #correct blockcode paras + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + #correct blockquote paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) + + #reduce blank lines + text = re.sub(r'\n{3}', r'\n\n', text) text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) - text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para + #Check span following blank para + text = re.sub(r'\n+ +%', r' %', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) - text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph - text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph + # blank paragraph + text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) + # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) + # blank paragraph + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) - text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + #sort out spaces in tables + text = re.sub(r' {2,}\|', r' |', text) # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. \n', text) - text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - - # started work on trying to fix footnotes -# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) + #reduce blank lines + text = re.sub(r' \n\n\n', r' \n\n', text) + return text def remove_newlines(self, text): @@ -198,7 +209,6 @@ class TextileMLizer(OEB2HTML): return txt def prepare_string_for_textile(self, txt): -# if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt): if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): return ' ==%s== ' % txt return txt diff --git a/src/calibre/utils/html2textile.py b/src/calibre/utils/html2textile.py deleted file mode 100644 index 786e912e36..0000000000 --- a/src/calibre/utils/html2textile.py +++ /dev/null @@ -1,209 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2010, Webreactor - Marcin Lulek -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from lxml import etree -from calibre.ebooks.oeb.base import barename - -class EchoTarget: - - def __init__(self): - self.final_output = [] - self.block = False - self.ol_ident = 0 - self.ul_ident = 0 - self.list_types = [] - self.haystack = [] - - def start(self, tag, attrib): - tag = barename(tag) - - newline = '\n' - dot = '' - new_tag = '' - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): - new_tag = tag - dot = '. ' - elif tag == 'p': - new_tag = '' - dot = '' - elif tag == 'blockquote': - new_tag = 'bq' - dot = '. ' - elif tag in ('b', 'strong'): - new_tag = '*' - newline = '' - elif tag in ('em', 'i'): - new_tag = '_' - newline = '' - elif tag == 'cite': - new_tag = '??' - newline = '' - elif tag == 'del': - new_tag = '-' - newline = '' - elif tag == 'ins': - new_tag = '+' - newline = '' - elif tag == 'sup': - new_tag = '^' - newline = '' - elif tag == 'sub': - new_tag = '~' - newline = '' - elif tag == 'span': - new_tag = '' - newline = '' - elif tag == 'a': - self.block = True - if 'title' in attrib: - self.a_part = {'title':attrib.get('title'), - 'href':attrib.get('href', '')} - else: - self.a_part = {'title':None, 'href':attrib.get('href', '')} - new_tag = '' - newline = '' - - elif tag == 'img': - if 'alt' in attrib: - new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),) - else: - new_tag = ' !%s' % attrib.get('src') - newline = '' - - elif tag in ('ul', 'ol'): - new_tag = '' - newline = '' - self.list_types.append(tag) - if tag == 'ul': - self.ul_ident += 1 - else: - self.ol_ident += 1 - - elif tag == 'li': - indent = self.ul_ident + self.ol_ident - if self.list_types[-1] == 'ul': - new_tag = '*' * indent + ' ' - newline = '\n' - else: - new_tag = '#' * indent + ' ' - newline = '\n' - - - if tag not in ('ul', 'ol'): - textile = '%(newline)s%(tag)s%(dot)s' % \ - { - 'newline':newline, - 'tag':new_tag, - 'dot':dot - } - if not self.block: - self.final_output.append(textile) - else: - self.haystack.append(textile) - - def end(self, tag): - tag = barename(tag) - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - self.final_output.append('\n') - elif tag in ('b', 'strong'): - self.final_output.append('*') - elif tag in ('em', 'i'): - self.final_output.append('_') - elif tag == 'cite': - self.final_output.append('??') - elif tag == 'del': - self.final_output.append('-') - elif tag == 'ins': - self.final_output.append('+') - elif tag == 'sup': - self.final_output.append('^') - elif tag == 'sub': - self.final_output.append('~') - elif tag == 'span': - self.final_output.append('') - elif tag == 'a': - if self.a_part['title']: - textilized = ' "%s (%s)":%s ' % ( - ''.join(self.haystack), - self.a_part.get('title'), - self.a_part.get('href'), - ) - self.haystack = [] - else: - textilized = ' "%s":%s ' % ( - ''.join(self.haystack), - self.a_part.get('href'), - ) - self.haystack = [] - self.final_output.append(textilized) - self.block = False - elif tag == 'img': - self.final_output.append('!') - elif tag == 'ul': - self.ul_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - elif tag == 'ol': - self.ol_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - - def data(self, data): - #we dont want any linebreaks inside our tags - node_data = data.replace('\n','') - if not self.block: - self.final_output.append(node_data) - else: - self.haystack.append(node_data) - - def comment(self, text): - pass - - def close(self): - return "closed!" - - -def html2textile(html): - #1st pass - #clean the whitespace and convert html to xhtml - parser = etree.HTMLParser() - tree = etree.fromstring(html, parser) - xhtml = etree.tostring(tree, method="xml") - parser = etree.XMLParser(remove_blank_text=True) - root = etree.XML(xhtml, parser) - cleaned_html = etree.tostring(root) - #2nd pass build textile - target = EchoTarget() - parser = etree.XMLParser(target=target) - root = etree.fromstring(cleaned_html, parser) - textilized_text = ''.join(target.final_output).lstrip().rstrip() - return textilized_text