mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT: small Textile changes. Remove old textile conversion code.
This commit is contained in:
parent
b95f9949be
commit
441718f76c
@ -78,44 +78,55 @@ class TextileMLizer(OEB2HTML):
|
|||||||
for i in self.our_links:
|
for i in self.our_links:
|
||||||
if i[0] == '#':
|
if i[0] == '#':
|
||||||
if i not in self.our_ids:
|
if i not in self.our_ids:
|
||||||
self.log.debug('Link has no target - %s ...' % i)
|
|
||||||
text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
|
text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
|
||||||
for i in self.our_ids:
|
for i in self.our_ids:
|
||||||
if i not in self.our_links:
|
if i not in self.our_links:
|
||||||
self.log.debug('ID has no link - %s ...' % i)
|
|
||||||
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
|
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
|
||||||
|
|
||||||
# Remove obvious non-needed escaping, add sub/sup-script ones
|
# Remove obvious non-needed escaping, add sub/sup-script ones
|
||||||
text = check_escaping(text, ['\*', '_', '\*'])
|
text = check_escaping(text, ['\*', '_', '\*'])
|
||||||
text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed
|
# escape the super/sub-scripts if needed
|
||||||
text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed
|
text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
|
||||||
|
# escape the super/sub-scripts if needed
|
||||||
|
text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
|
||||||
|
|
||||||
text = re.sub(r'%\xa0+', r'%', text) #remove empty spans
|
#remove empty spans
|
||||||
text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ?
|
text = re.sub(r'%\xa0+', r'%', text)
|
||||||
text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output
|
#remove empty spans - MAY MERGE SOME ?
|
||||||
text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline
|
text = re.sub(r'%%', r'', text)
|
||||||
text = re.sub(r'^\n+', r'', text) #remove newlines at top of file
|
#remove spans from tagged output
|
||||||
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras
|
text = re.sub(r'%([_+*-]+)%', r'\1', text)
|
||||||
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras
|
#remove spaces before a newline
|
||||||
# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para
|
text = re.sub(r' +\n', r'\n', text)
|
||||||
text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines
|
#remove newlines at top of file
|
||||||
# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text)
|
text = re.sub(r'^\n+', r'', text)
|
||||||
|
#correct blockcode paras
|
||||||
|
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
|
||||||
|
#correct blockquote paras
|
||||||
|
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)
|
||||||
|
|
||||||
|
#reduce blank lines
|
||||||
|
text = re.sub(r'\n{3}', r'\n\n', text)
|
||||||
text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
|
text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
|
||||||
text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para
|
#Check span following blank para
|
||||||
|
text = re.sub(r'\n+ +%', r' %', text)
|
||||||
text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
|
text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
|
||||||
text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph
|
# blank paragraph
|
||||||
text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph
|
text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text)
|
||||||
text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph
|
# blank paragraph
|
||||||
|
text = re.sub(u'\n\xa0', r'\np. ', text)
|
||||||
|
# blank paragraph
|
||||||
|
text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text)
|
||||||
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
|
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
|
||||||
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
|
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
|
||||||
text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables
|
#sort out spaces in tables
|
||||||
|
text = re.sub(r' {2,}\|', r' |', text)
|
||||||
|
|
||||||
# Now put back spaces removed earlier as they're needed here
|
# Now put back spaces removed earlier as they're needed here
|
||||||
text = re.sub(r'\np\.\n', r'\np. \n', text)
|
text = re.sub(r'\np\.\n', r'\np. \n', text)
|
||||||
text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines
|
#reduce blank lines
|
||||||
|
text = re.sub(r' \n\n\n', r' \n\n', text)
|
||||||
|
|
||||||
# started work on trying to fix footnotes
|
|
||||||
# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text)
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def remove_newlines(self, text):
|
def remove_newlines(self, text):
|
||||||
@ -198,7 +209,6 @@ class TextileMLizer(OEB2HTML):
|
|||||||
return txt
|
return txt
|
||||||
|
|
||||||
def prepare_string_for_textile(self, txt):
    """Protect text that Textile could mistake for inline markup.

    If ``txt`` contains one of the characters ``* & _ + - ~ @ % |`` (or
    the pair ``??``) with whitespace on exactly one side of it and
    non-whitespace on the other, the whole string is returned wrapped as
    `` ==...== `` (Textile's "notextile" escape).  Otherwise ``txt`` is
    returned unchanged.
    """
    # Two alternatives: "<space><marker><non-space>" (looks like an opening
    # marker) or "<non-space><marker><space>" (looks like a closing marker).
    marker_next_to_text = (
        r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)'
    )
    looks_like_markup = re.search(marker_next_to_text, txt) is not None
    return ' ==%s== ' % txt if looks_like_markup else txt
|
||||||
|
@ -1,209 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
# Copyright (c) 2010, Webreactor - Marcin Lulek <info@webreactor.eu>
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# Redistribution and use in source and binary forms, with or without
|
|
||||||
# modification, are permitted provided that the following conditions are met:
|
|
||||||
# * Redistributions of source code must retain the above copyright
|
|
||||||
# notice, this list of conditions and the following disclaimer.
|
|
||||||
# * Redistributions in binary form must reproduce the above copyright
|
|
||||||
# notice, this list of conditions and the following disclaimer in the
|
|
||||||
# documentation and/or other materials provided with the distribution.
|
|
||||||
# * Neither the name of the <organization> nor the
|
|
||||||
# names of its contributors may be used to endorse or promote products
|
|
||||||
# derived from this software without specific prior written permission.
|
|
||||||
#
|
|
||||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
||||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
|
||||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
||||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
||||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
||||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
||||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
from calibre.ebooks.oeb.base import barename
|
|
||||||
|
|
||||||
class EchoTarget:
    """Parser target that turns (X)HTML parse events into Textile markup.

    An instance is handed to ``etree.XMLParser(target=...)``; the parser
    then calls :meth:`start`, :meth:`end`, :meth:`data`, :meth:`comment`
    and :meth:`close`.  The generated Textile fragments accumulate in
    ``final_output`` (a list of strings to be joined by the caller).

    While an ``<a>`` element is open, everything is diverted to
    ``haystack`` so that the complete link text can be wrapped in the
    Textile link syntax once the element closes.
    """

    _HEADINGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Inline HTML tag -> Textile marker written both before and after the
    # element's content.  ``span`` has no Textile equivalent here.
    _INLINE_MARKERS = {
        'b': '*', 'strong': '*',
        'em': '_', 'i': '_',
        'cite': '??',
        'del': '-',
        'ins': '+',
        'sup': '^',
        'sub': '~',
        'span': '',
    }

    def __init__(self):
        self.final_output = []  # finished Textile fragments, in order
        self.block = False      # True while inside an <a> element
        self.ol_ident = 0       # number of currently open <ol> elements
        self.ul_ident = 0       # number of currently open <ul> elements
        self.list_types = []    # stack of open list tags ('ul' / 'ol')
        self.haystack = []      # fragments collected for the open link
        # Attributes of the link being collected; initialised so that
        # end('a') never hits a missing attribute.
        self.a_part = {'title': None, 'href': ''}

    def _emit(self, text):
        """Append ``text`` to the link buffer if a link is open, else to
        the final output."""
        if self.block:
            self.haystack.append(text)
        else:
            self.final_output.append(text)

    def start(self, tag, attrib):
        """Handle an opening tag.

        ``tag`` is passed through ``barename`` before being compared with
        plain HTML tag names; ``attrib`` is the mapping of the element's
        attributes.
        """
        tag = barename(tag)

        if tag in ('ul', 'ol'):
            # Lists produce no output themselves, they only set the
            # nesting state used by their <li> children.
            self.list_types.append(tag)
            if tag == 'ul':
                self.ul_ident += 1
            else:
                self.ol_ident += 1
            return

        # Defaults; these are also what <p> and unrecognised tags get: a
        # bare newline with no Textile signature.
        newline = '\n'
        dot = ''
        new_tag = ''

        if tag in self._HEADINGS:
            new_tag = tag
            dot = '. '
        elif tag == 'blockquote':
            new_tag = 'bq'
            dot = '. '
        elif tag in self._INLINE_MARKERS:
            new_tag = self._INLINE_MARKERS[tag]
            newline = ''
        elif tag == 'a':
            # From here until the matching end('a') output is buffered.
            self.block = True
            self.a_part = {'title': attrib.get('title'),
                           'href': attrib.get('href', '')}
            newline = ''
        elif tag == 'img':
            # Textile images are "!src(text)!".  Use the title if present
            # and fall back to the alt text; previously the presence of
            # ``alt`` was tested but ``title`` was formatted, yielding
            # "(None)" for images that only carried an alt attribute.
            caption = attrib.get('title') or attrib.get('alt')
            src = attrib.get('src', '')
            if caption:
                new_tag = ' !%s(%s)' % (src, caption)
            else:
                new_tag = ' !%s' % src
            newline = ''
        elif tag == 'li':
            # One marker character per level of list nesting.  A stray
            # <li> outside any list is treated as a first-level bullet
            # instead of raising IndexError on the empty stack.
            depth = max(self.ul_ident + self.ol_ident, 1)
            ordered = bool(self.list_types) and self.list_types[-1] == 'ol'
            new_tag = ('#' if ordered else '*') * depth + ' '

        self._emit('%s%s%s' % (newline, new_tag, dot))

    def end(self, tag):
        """Handle a closing tag.

        Closing markers are routed through the same buffer as the opening
        ones, so markup nested inside a link stays inside the link text.
        """
        tag = barename(tag)

        if tag in self._HEADINGS or tag == 'p':
            self._emit('\n')
        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])
        elif tag == 'a':
            label = ''.join(self.haystack)
            self.haystack = []
            href = self.a_part.get('href')
            title = self.a_part.get('title')
            if title:
                textilized = ' "%s (%s)":%s ' % (label, title, href)
            else:
                textilized = ' "%s":%s ' % (label, href)
            self.block = False
            self.final_output.append(textilized)
        elif tag == 'img':
            self._emit('!')
        elif tag in ('ul', 'ol'):
            if tag == 'ul':
                self.ul_ident -= 1
            else:
                self.ol_ident -= 1
            self.list_types.pop()
            # Only the outermost list is followed by a newline.
            if not self.list_types:
                self._emit('\n')

    def data(self, data):
        """Handle character data; line breaks are dropped so they cannot
        split Textile inline markup."""
        self._emit(data.replace('\n', ''))

    def comment(self, text):
        """Ignore comments."""
        pass

    def close(self):
        """Called by the parser at end of input; the result is collected
        from ``final_output``, not from this return value."""
        return "closed!"
|
|
||||||
|
|
||||||
|
|
||||||
def html2textile(html):
    """Convert an HTML string to Textile markup and return it.

    Works in two passes: the input is first normalised to XML with blank
    text removed, then re-parsed with an ``EchoTarget`` that builds the
    Textile output.
    """
    # Pass 1: parse leniently as HTML, serialise as XML, then re-parse
    # with remove_blank_text=True to discard whitespace-only text.
    html_tree = etree.fromstring(html, etree.HTMLParser())
    as_xml = etree.tostring(html_tree, method="xml")
    stripped_root = etree.XML(as_xml, etree.XMLParser(remove_blank_text=True))
    cleaned_html = etree.tostring(stripped_root)

    # Pass 2: feed the cleaned markup to the Textile-building target.  The
    # parse result itself is not needed; the output lives on the target.
    target = EchoTarget()
    etree.fromstring(cleaned_html, etree.XMLParser(target=target))

    return ''.join(target.final_output).strip()
|
|
Loading…
x
Reference in New Issue
Block a user