TXT: small Textile changes. Remove old textile conversion code.

This commit is contained in:
John Schember 2011-05-10 18:55:19 -04:00
parent b95f9949be
commit 441718f76c
2 changed files with 34 additions and 233 deletions

View File

@ -78,44 +78,55 @@ class TextileMLizer(OEB2HTML):
for i in self.our_links:
if i[0] == '#':
if i not in self.our_ids:
self.log.debug('Link has no target - %s ...' % i)
text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
for i in self.our_ids:
if i not in self.our_links:
self.log.debug('ID has no link - %s ...' % i)
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
# Remove obvious non-needed escaping, add sub/sup-script ones
text = check_escaping(text, ['\*', '_', '\*'])
text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed
text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed
# escape the super/sub-scripts if needed
text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
# escape the super/sub-scripts if needed
text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
text = re.sub(r'%\xa0+', r'%', text) #remove empty spans
text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ?
text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output
text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline
text = re.sub(r'^\n+', r'', text) #remove newlines at top of file
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras
# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para
text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines
# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text)
#remove empty spans
text = re.sub(r'%\xa0+', r'%', text)
#remove empty spans - MAY MERGE SOME ?
text = re.sub(r'%%', r'', text)
#remove spans from tagged output
text = re.sub(r'%([_+*-]+)%', r'\1', text)
#remove spaces before a newline
text = re.sub(r' +\n', r'\n', text)
#remove newlines at top of file
text = re.sub(r'^\n+', r'', text)
#correct blockcode paras
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
#correct blockquote paras
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)
#reduce blank lines
text = re.sub(r'\n{3}', r'\n\n', text)
text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para
#Check span following blank para
text = re.sub(r'\n+ +%', r' %', text)
text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph
text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph
text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph
# blank paragraph
text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text)
# blank paragraph
text = re.sub(u'\n\xa0', r'\np. ', text)
# blank paragraph
text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text)
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables
#sort out spaces in tables
text = re.sub(r' {2,}\|', r' |', text)
# Now put back spaces removed earlier as they're needed here
text = re.sub(r'\np\.\n', r'\np. \n', text)
text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines
# started work on trying to fix footnotes
# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text)
#reduce blank lines
text = re.sub(r' \n\n\n', r' \n\n', text)
return text
def remove_newlines(self, text):
@ -198,7 +209,6 @@ class TextileMLizer(OEB2HTML):
return txt
def prepare_string_for_textile(self, txt):
    """Wrap *txt* in Textile's ``==`` no-format markers when it needs it.

    The text is wrapped when it contains one of the characters
    ``* & _ + - ~ @ % |`` (or a ``??`` pair) that has whitespace on one
    side and a non-whitespace character on the other.  Any other text is
    returned unchanged.
    """
    # First alternative: whitespace, marker, non-whitespace.
    # Second alternative: non-whitespace, marker, whitespace.
    risky_marker = r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)'
    needs_escape = re.search(risky_marker, txt) is not None
    return ' ==%s== ' % txt if needs_escape else txt

View File

@ -1,209 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2010, Webreactor - Marcin Lulek <info@webreactor.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the <organization> nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from lxml import etree
from calibre.ebooks.oeb.base import barename
class EchoTarget:
    """Parser target that turns HTML element events into Textile fragments.

    An instance is passed as ``target`` to an lxml ``XMLParser`` (see
    ``html2textile``); the parser then calls ``start``, ``end``, ``data``,
    ``comment`` and ``close``.  Converted fragments accumulate in
    ``final_output``; the caller joins that list to obtain the Textile text.
    """

    _HEADINGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Inline HTML tags mapped to the Textile marker that both opens and
    # closes them.  <span> has no Textile equivalent and maps to nothing.
    _INLINE_MARKERS = {
        'b': '*',
        'strong': '*',
        'em': '_',
        'i': '_',
        'cite': '??',
        'del': '-',
        'ins': '+',
        'sup': '^',
        'sub': '~',
        'span': '',
    }

    def __init__(self):
        # Finished Textile fragments, in document order.
        self.final_output = []
        # True while inside an <a>; output is then diverted to ``haystack``.
        self.block = False
        # Current nesting depth of <ol> and <ul> elements respectively.
        self.ol_ident = 0
        self.ul_ident = 0
        # Stack of the currently open list tags ('ul' / 'ol').
        self.list_types = []
        # Fragments that make up the text of the link being collected.
        self.haystack = []
        # Attributes of the link being collected; initialised here so that
        # ``end`` never hits a missing attribute.
        self.a_part = {'title': None, 'href': ''}

    def _emit(self, text):
        """Append *text* to the link buffer inside an <a>, else to the output."""
        if self.block:
            self.haystack.append(text)
        else:
            self.final_output.append(text)

    def start(self, tag, attrib):
        """Handle an opening tag: emit the Textile prefix for *tag*.

        *attrib* is the mapping of the element's attributes; it is consulted
        for ``title``/``href`` on links and ``src``/``title``/``alt`` on
        images.
        """
        tag = barename(tag)
        # Defaults apply to <p> and to every tag not handled below: a line
        # break and no marker.
        newline = '\n'
        dot = ''
        new_tag = ''
        if tag in self._HEADINGS:
            new_tag = tag
            dot = '. '
        elif tag == 'blockquote':
            new_tag = 'bq'
            dot = '. '
        elif tag in self._INLINE_MARKERS:
            new_tag = self._INLINE_MARKERS[tag]
            newline = ''
        elif tag == 'a':
            # From here until the matching end tag everything is buffered in
            # ``haystack`` so it can be wrapped as "text":href.
            self.block = True
            self.a_part = {'title': attrib.get('title'),
                           'href': attrib.get('href', '')}
            newline = ''
        elif tag == 'img':
            # The original tested for ``alt`` but printed ``title``, giving
            # "(None)" when only ``alt`` was set.  Use whichever is present.
            label = attrib.get('title') or attrib.get('alt')
            if label:
                new_tag = ' !%s(%s)' % (attrib.get('src'), label)
            else:
                new_tag = ' !%s' % attrib.get('src')
            newline = ''
        elif tag in ('ul', 'ol'):
            # Lists only change nesting state; they emit nothing themselves.
            self.list_types.append(tag)
            if tag == 'ul':
                self.ul_ident += 1
            else:
                self.ol_ident += 1
            return
        elif tag == 'li':
            # A stray <li> outside any list is rendered as a first-level
            # bullet instead of raising IndexError.
            list_type = self.list_types[-1] if self.list_types else 'ul'
            depth = max(1, self.ul_ident + self.ol_ident)
            marker = '*' if list_type == 'ul' else '#'
            new_tag = marker * depth + ' '
        self._emit('%(newline)s%(tag)s%(dot)s' % {
            'newline': newline,
            'tag': new_tag,
            'dot': dot,
        })

    def end(self, tag):
        """Handle a closing tag: emit the Textile suffix for *tag*.

        Suffixes go through ``_emit`` so that markup closed inside a link
        lands in the link text next to its opening marker, rather than in
        the main output ahead of the link.
        """
        tag = barename(tag)
        if tag in self._HEADINGS or tag == 'p':
            self._emit('\n')
        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])
        elif tag == 'a':
            link_text = ''.join(self.haystack)
            title = self.a_part.get('title')
            href = self.a_part.get('href')
            if title:
                textilized = ' "%s (%s)":%s ' % (link_text, title, href)
            else:
                textilized = ' "%s":%s ' % (link_text, href)
            self.haystack = []
            self.block = False
            self.final_output.append(textilized)
        elif tag == 'img':
            self._emit('!')
        elif tag in ('ul', 'ol'):
            if tag == 'ul':
                self.ul_ident -= 1
            else:
                self.ol_ident -= 1
            self.list_types.pop()
            # Only the outermost list is followed by a line break.
            if not self.list_types:
                self._emit('\n')

    def data(self, data):
        """Handle text content, dropping any newlines it contains."""
        # Line breaks inside an element would break the Textile markup.
        self._emit(data.replace('\n', ''))

    def comment(self, text):
        """Ignore comments; they produce no Textile output."""
        pass

    def close(self):
        """Return the fixed marker string handed back when parsing ends."""
        return "closed!"
def html2textile(html):
    """Convert the markup in *html* to Textile and return it as a string.

    The input is parsed twice: once to normalise it, and once with an
    ``EchoTarget`` that collects the Textile fragments.  The joined result
    is returned with leading and trailing whitespace removed.
    """
    # Pass 1: parse leniently as HTML, serialise as XML, then re-parse
    # with blank text removed to get a cleaned document.
    html_tree = etree.fromstring(html, etree.HTMLParser())
    as_xml = etree.tostring(html_tree, method="xml")
    tidy_root = etree.XML(as_xml, etree.XMLParser(remove_blank_text=True))
    cleaned_html = etree.tostring(tidy_root)

    # Pass 2: feed the cleaned markup through the Textile-building target.
    target = EchoTarget()
    etree.fromstring(cleaned_html, etree.XMLParser(target=target))
    return ''.join(target.final_output).strip()