TXT: Textile changes.

This commit is contained in:
John Schember 2011-04-30 09:43:09 -04:00
parent a168a3d11a
commit 05331d7f05
2 changed files with 135 additions and 98 deletions

View File

@ -242,6 +242,8 @@ def detect_formatting_type(txt):
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt)) textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
# Links # Links
textile_count += len(re.findall(r'"[^"]*":\S+', txt)) textile_count += len(re.findall(r'"[^"]*":\S+', txt))
# paragraph blocks
textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt))
# Decide if either markdown or textile is used in the text # Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found. # based on the number of unique formatting elements found.

View File

@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
''' '''
Transform OEB content into Textile formatted plain text Transform OEB content into Textile formatted plain text
''' '''
import re import re
from functools import partial from functools import partial
@ -16,8 +15,6 @@ from calibre.ebooks.htmlz.oeb2html import OEB2HTML
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.txt.unsmarten import unsmarten from calibre.ebooks.txt.unsmarten import unsmarten
from operator import itemgetter
class TextileMLizer(OEB2HTML): class TextileMLizer(OEB2HTML):
@ -29,17 +26,20 @@ class TextileMLizer(OEB2HTML):
self.links = {} self.links = {}
self.list = [] self.list = []
self.our_links = [] self.our_links = []
self.in_a_link = False
self.our_ids = [] self.our_ids = []
self.images = {} self.images = {}
self.id_no_text = u''
self.style_embed = []
self.remove_space_after_newline = False self.remove_space_after_newline = False
self.base_hrefs = [item.href for item in oeb_book.spine] self.base_hrefs = [item.href for item in oeb_book.spine]
self.map_resources(oeb_book) self.map_resources(oeb_book)
# self.style_bold = False self.style_bold = False
# self.style_italic = False self.style_italic = False
# self.style_under = False self.style_under = False
# self.style_strike = False self.style_strike = False
# self.style_smallcap = False self.style_smallcap = False
txt = self.mlize_spine(oeb_book) txt = self.mlize_spine(oeb_book)
txt = unsmarten(txt) txt = unsmarten(txt)
@ -56,7 +56,7 @@ class TextileMLizer(OEB2HTML):
self.rewrite_ids(item.data, item) self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item)) rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output.append('\n\n') output.append('\n\n')
return ''.join(output) return ''.join(output)
@ -64,36 +64,47 @@ class TextileMLizer(OEB2HTML):
# Needs tweaking and finetuning # Needs tweaking and finetuning
def check_escaping(text, tests): def check_escaping(text, tests):
for t in tests: for t in tests:
text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
txt = '%s' % t
self.log.debug('DEBUG: ' + txt)
if txt != '%':
text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text)
text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text)
text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text)
return text return text
# Note - I'm not checking for escaped '-' as this will also get hyphenated words
text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%'])
text = re.sub(r' +\n', r'\n', text)
text = re.sub(r'^\n+', r'', text)
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text)
text = re.sub(r'\n{4,}', r'\n\np. \n\n', text)
text = re.sub(r'\n{3}', r'\n\n', text)
text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text)
text = re.sub(r'p.*\. \n\n', r'', text)
text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph
text = re.sub(r' \|', r'|', text)
# Now put back spaces removed earlier as they're needed here
text = re.sub(r'\np\.\n', r'\np. \n', text)
# Now tidy up links and ids - remove ones that don't have a corresponding opposite # Now tidy up links and ids - remove ones that don't have a corresponding opposite
if self.opts.keep_links: if self.opts.keep_links:
for i in self.our_links: for i in self.our_links:
if i not in self.our_ids: if i[0] == '#':
text = re.sub(r'"(.+)":'+i, '\1', text) if i not in self.our_ids:
text = re.sub(r'"(.+)":'+i, '\1', text)
for i in self.our_ids: for i in self.our_ids:
if i not in self.our_links: if i not in self.our_links:
text = re.sub(r'\('+i+'\)', '', text) text = re.sub(r'\('+i+'\)', '', text)
# Note - I'm not checking for escaped '-' as this will also get hyphenated words
text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%'])
text = re.sub(r'%\xa0+', r'%', text) #remove empty spans
text = re.sub(r'%%', r'', text) #remove empty spans
text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output
text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline
text = re.sub(r'^\n+', r'', text) #remove newlines at top of file
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras
# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para
text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines
text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text)
text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text)
text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph
text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph
text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables
# Now put back spaces removed earlier as they're needed here
text = re.sub(r'\np\.\n', r'\np. \n', text)
text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines
# started work on trying to fix footnotes # started work on trying to fix footnotes
# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) # text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text)
return text return text
@ -110,21 +121,15 @@ class TextileMLizer(OEB2HTML):
self.remove_space_after_newline = False self.remove_space_after_newline = False
return text return text
# def remove_leading_ws(self, text):
# text = text.replace('\r\n', '\n')
# text = text.replace('\r', '\n')
# text = re.sub(r'\n[\t ]+', '\n', text)
# text = re.sub(r'\n{2,}', '\n', text)
# return text
def check_styles(self, style): def check_styles(self, style):
txt = '{' txt = '{'
# style_string = '%s;' % style
# txt += style_string
if style['color'] and style['color'] != 'black': if style['color'] and style['color'] != 'black':
txt += 'color:'+style['color']+';' txt += 'color:'+style['color']+';'
# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'): try:
# txt += 'font-size: %d;' % style['font-size'] if style['background']:
txt += 'background:'+style['background']+';'
except:
pass
txt += '}' txt += '}'
if txt == '{}': txt = '' if txt == '{}': txt = ''
return txt return txt
@ -137,7 +142,7 @@ class TextileMLizer(OEB2HTML):
return '' return ''
def check_valign(self, style): def check_valign(self, style):
tests = {'top':'^','bottom':'~', 'middle':'-'} tests = {'top':'^','bottom':'~'} #, 'middle':'-'}
for i in tests: for i in tests:
if style['vertical-align'] == i: if style['vertical-align'] == i:
return tests[i] return tests[i]
@ -157,8 +162,9 @@ class TextileMLizer(OEB2HTML):
def check_id_tag(self, attribs): def check_id_tag(self, attribs):
txt = '' txt = ''
if attribs.has_key('id'): # and attribs['id'] in self.links.values(): if attribs.has_key('id'): # and attribs['id'] in self.links.values():
txt = '(#'+attribs['id']+ ')' txt = '(#'+attribs['id']+ ')'
self.our_ids.append('#'+attribs['id']) self.our_ids.append('#'+attribs['id'])
self.id_no_text = u'\xa0'
return txt return txt
def build_block(self, tag, style, attribs): def build_block(self, tag, style, attribs):
@ -170,7 +176,7 @@ class TextileMLizer(OEB2HTML):
txt += self.check_styles(style) txt += self.check_styles(style)
return txt return txt
def dump_text(self, elem, stylizer, page, tag_stack=[]): def dump_text(self, elem, stylizer):
''' '''
@elem: The element in the etree that we are working on. @elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element. @stylizer: The style information attached to the element.
@ -197,45 +203,59 @@ class TextileMLizer(OEB2HTML):
or style['visibility'] == 'hidden': or style['visibility'] == 'hidden':
return [''] return ['']
# Soft scene breaks.
text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0']))
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
if tag == 'div': if tag == 'div':
tag = 'p' tag = 'p'
text.append(self.build_block(tag, style, attribs)) block = self.build_block(tag, style, attribs)
text.append('. ') # Normal paragraph with no styling.
tags.append('\n') if block == '\np':
text.append('\n\n')
tags.append('\n')
else:
text.append(block)
text.append('. ')
tags.append('\n')
#self.style_embed = []
if style['font-style'] == 'italic' or tag in ('i', 'em'): if style['font-style'] == 'italic' or tag in ('i', 'em'):
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
if self.style_italic == False: if self.style_italic == False:
text.append('_') text.append('_')
# text.append('from '+tag)
tags.append('_') tags.append('_')
self.style_embed.append ('_')
self.style_italic = True self.style_italic = True
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
style_string = '%s;' % style
text.append(style_string)
if self.style_bold == False: if self.style_bold == False:
text.append('*') text.append('*')
# text.append('from '+tag)
tags.append('*') tags.append('*')
self.style_embed.append ('*')
self.style_bold = True self.style_bold = True
if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
if tag != 'a': if tag != 'a':
if self.style_under == False: if self.style_under == False:
text.append('+') text.append('+')
tags.append('+') tags.append('+')
self.style_embed.append ('+')
self.style_under = True self.style_under = True
if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
if self.style_strike == False: if self.style_strike == False:
text.append('-') text.append('-')
tags.append('-') tags.append('-')
self.style_embed.append ('-')
self.style_strike = True self.style_strike = True
if tag == 'br': if tag == 'br':
text.append('') for i in reversed(self.style_embed):
tags.append('\n') text.append(i)
text.append('\n')
for i in self.style_embed:
text.append(i)
tags.append('')
self.remove_space_after_newline = True self.remove_space_after_newline = True
elif tag == 'blockquote': if tag == 'blockquote':
text.append('\nbq. ') text.append('\nbq. ')
tags.append('\n') tags.append('\n')
elif tag in ('abbr', 'acronym'): elif tag in ('abbr', 'acronym'):
@ -259,7 +279,7 @@ class TextileMLizer(OEB2HTML):
text.append('??') text.append('??')
tags.append('??') tags.append('??')
elif tag == 'hr': elif tag == 'hr':
text.append('\n***\n') text.append('\n***')
tags.append('\n') tags.append('\n')
elif tag == 'pre': elif tag == 'pre':
self.in_pre = True self.in_pre = True
@ -267,12 +287,14 @@ class TextileMLizer(OEB2HTML):
tags.append('pre\n') tags.append('pre\n')
elif tag == 'a': elif tag == 'a':
if self.opts.keep_links: if self.opts.keep_links:
text.append ('"') text.append('"')
tags.append('a')
if attribs.has_key('href'): if attribs.has_key('href'):
tags.append('":' + attribs['href']) tags.append('":' + attribs['href'])
self.our_links.append(attribs['href']) self.our_links.append(attribs['href'])
if attribs.has_key('title'): if attribs.has_key('title'):
tags.append('(' + attribs['title'] + ')') tags.append('(' + attribs['title'] + ')')
self.in_a_link = True
elif tag == 'img': elif tag == 'img':
if self.opts.keep_image_references: if self.opts.keep_image_references:
txt = '!' + self.check_halign(style) txt = '!' + self.check_halign(style)
@ -286,7 +308,7 @@ class TextileMLizer(OEB2HTML):
tags.append('!') tags.append('!')
elif tag in ('ol', 'ul'): elif tag in ('ol', 'ul'):
self.list.append({'name':tag, 'num':0}) self.list.append({'name':tag, 'num':0})
text.append('\n') text.append('')
tags.append(tag) tags.append(tag)
elif tag == 'li': elif tag == 'li':
if self.list: li = self.list[-1] if self.list: li = self.list[-1]
@ -294,7 +316,7 @@ class TextileMLizer(OEB2HTML):
text.append('\n') text.append('\n')
if li['name'] == 'ul': text.append('*'*len(self.list)+' ') if li['name'] == 'ul': text.append('*'*len(self.list)+' ')
elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') elif li['name'] == 'ol': text.append('#'*len(self.list)+' ')
tags.append('\n') tags.append('')
elif tag == 'dl': elif tag == 'dl':
text.append('\n') text.append('\n')
tags.append('') tags.append('')
@ -308,12 +330,19 @@ class TextileMLizer(OEB2HTML):
text.append('') text.append('')
tags.append('\n') tags.append('\n')
elif tag == 'table': elif tag == 'table':
self.in_table = True txt = self.build_block(tag, style, attribs)
text.append('') txt += '. \n'
if txt != '\ntable. \n':
text.append(txt)
else:
text.append('\n')
tags.append('') tags.append('')
tags.append('table')
elif tag == 'tr': elif tag == 'tr':
text.append('') txt = self.build_block('', style, attribs)
txt += '. '
if txt != '\n. ':
txt = re.sub ('\n','',txt)
text.append(txt)
tags.append('|\n') tags.append('|\n')
elif tag == 'td': elif tag == 'td':
text.append('|') text.append('|')
@ -324,13 +353,15 @@ class TextileMLizer(OEB2HTML):
txt += '\\' + attribs['colspan'] txt += '\\' + attribs['colspan']
if attribs.has_key ('rowspan'): if attribs.has_key ('rowspan'):
txt += '/' + attribs['rowspan'] txt += '/' + attribs['rowspan']
try:
txt += self.check_styles(style)
except:
pass
if txt != '': if txt != '':
text.append(txt+'. ') text.append(txt+'. ')
tags.append('') tags.append('')
elif tag == 'th': elif tag == 'th':
text.append('|_') text.append('|_. ')
text.append('. ')
tags.append('') tags.append('')
elif tag == 'span': elif tag == 'span':
if style['font-variant'] == 'small-caps': if style['font-variant'] == 'small-caps':
@ -339,35 +370,36 @@ class TextileMLizer(OEB2HTML):
tags.append('&') tags.append('&')
self.style_smallcap = True self.style_smallcap = True
else: else:
txt = '%' if self.in_a_link == False:
if self.opts.keep_links: txt = '%'
txt += self.check_id_tag(attribs) if self.opts.keep_links:
txt += self.check_styles(style) txt += self.check_id_tag(attribs)
if txt != '%': txt += self.check_styles(style)
text.append(txt) if txt != '%':
tags.append('%') text.append(txt)
tags.append('%')
if self.opts.keep_links and attribs.has_key('id'): if self.opts.keep_links and attribs.has_key('id'):
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'): if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
text.append(self.check_id_tag(attribs)) text.append(self.check_id_tag(attribs))
# Process the styles for any that we want to keep # Process the styles for any that we want to keep
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'): if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \
text.append(self.check_styles(style)) 'span', 'table', 'tr', 'td'):
if not self.in_a_link:
text.append(self.check_styles(style))
# Process tags that contain text. # Process tags that contain text.
if hasattr(elem, 'text') and elem.text: if hasattr(elem, 'text') and elem.text:
txt = elem.text txt = elem.text
if not self.in_pre: if not self.in_pre:
if self.in_table: txt = self.remove_newlines(txt)
txt = self.remove_newlines(txt)
else:
txt = self.remove_leading_ws(txt)
text.append(txt) text.append(txt)
self.id_no_text = u''
# Recurse down into tags within the tag we are in. # Recurse down into tags within the tag we are in.
for item in elem: for item in elem:
text += self.dump_text(item, stylizer, page, tag_stack+tags) text += self.dump_text(item, stylizer)
# Close all open tags. # Close all open tags.
tags.reverse() tags.reverse()
@ -375,36 +407,39 @@ class TextileMLizer(OEB2HTML):
if tag in ('pre', 'ul', 'ol', 'li', 'table'): if tag in ('pre', 'ul', 'ol', 'li', 'table'):
if tag == 'pre': if tag == 'pre':
self.in_pre = False self.in_pre = False
if tag == 'table': elif tag in ('ul', 'ol'):
self.in_table = False
if tag in ('ul', 'ol'):
if self.list: self.list.pop() if self.list: self.list.pop()
if not self.list: text.append('\n') if not self.list: text.append('\n')
else: else:
text.append('%s' % t) if t == 'a':
if t == '*': self.style_bold = False self.in_a_link = False
if t == '_': self.style_italic = False t = ''
if t == '+': self.style_under = False text.append(self.id_no_text)
if t == '-': self.style_strike = False self.id_no_text = u''
if t == '&': self.style_smallcap = False if t == '*':
self.style_bold = False
elif t == '_':
self.style_italic = False
elif t == '+':
self.style_under = False
elif t == '-':
self.style_strike = False
elif t == '&':
self.style_smallcap = False
if t in ('*', '_', '+', '-'):
txt = self.style_embed.pop()
text.append(txt)
else:
text.append('%s' % t)
# Soft scene breaks. # Soft scene breaks.
text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0']))
# try:
# ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
# if ems >= 1:
# text.append('\n' * ems)
# except:
# pass
# Add the text that is outside of the tag. # Add the text that is outside of the tag.
if hasattr(elem, 'tail') and elem.tail: if hasattr(elem, 'tail') and elem.tail:
tail = elem.tail tail = elem.tail
if not self.in_pre: if not self.in_pre:
if self.in_table: tail = self.remove_newlines(tail)
tail = self.remove_newlines(tail)
else:
tail = self.remove_leading_ws(tail)
text.append(tail) text.append(tail)
return text return text