mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT: Textile changes.
This commit is contained in:
parent
a168a3d11a
commit
05331d7f05
@ -242,6 +242,8 @@ def detect_formatting_type(txt):
|
||||
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
|
||||
# Links
|
||||
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
|
||||
# paragraph blocks
|
||||
textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt))
|
||||
|
||||
# Decide if either markdown or textile is used in the text
|
||||
# based on the number of unique formatting elements found.
|
||||
|
@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
|
||||
'''
|
||||
Transform OEB content into Textile formatted plain text
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from functools import partial
|
||||
@ -16,8 +15,6 @@ from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||
from operator import itemgetter
|
||||
|
||||
|
||||
class TextileMLizer(OEB2HTML):
|
||||
|
||||
@ -29,17 +26,20 @@ class TextileMLizer(OEB2HTML):
|
||||
self.links = {}
|
||||
self.list = []
|
||||
self.our_links = []
|
||||
self.in_a_link = False
|
||||
self.our_ids = []
|
||||
self.images = {}
|
||||
self.id_no_text = u''
|
||||
self.style_embed = []
|
||||
self.remove_space_after_newline = False
|
||||
self.base_hrefs = [item.href for item in oeb_book.spine]
|
||||
self.map_resources(oeb_book)
|
||||
|
||||
# self.style_bold = False
|
||||
# self.style_italic = False
|
||||
# self.style_under = False
|
||||
# self.style_strike = False
|
||||
# self.style_smallcap = False
|
||||
self.style_bold = False
|
||||
self.style_italic = False
|
||||
self.style_under = False
|
||||
self.style_strike = False
|
||||
self.style_smallcap = False
|
||||
|
||||
txt = self.mlize_spine(oeb_book)
|
||||
txt = unsmarten(txt)
|
||||
@ -56,7 +56,7 @@ class TextileMLizer(OEB2HTML):
|
||||
self.rewrite_ids(item.data, item)
|
||||
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
||||
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
|
||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
|
||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
||||
output.append('\n\n')
|
||||
return ''.join(output)
|
||||
|
||||
@ -64,36 +64,47 @@ class TextileMLizer(OEB2HTML):
|
||||
# Needs tweaking and finetuning
|
||||
def check_escaping(text, tests):
|
||||
for t in tests:
|
||||
# I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
|
||||
txt = '%s' % t
|
||||
self.log.debug('DEBUG: ' + txt)
|
||||
if txt != '%':
|
||||
text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text)
|
||||
text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text)
|
||||
text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text)
|
||||
return text
|
||||
|
||||
# Note - I'm not checking for escaped '-' as this will also get hyphenated words
|
||||
text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%'])
|
||||
|
||||
text = re.sub(r' +\n', r'\n', text)
|
||||
text = re.sub(r'^\n+', r'', text)
|
||||
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
|
||||
text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text)
|
||||
text = re.sub(r'\n{4,}', r'\n\np. \n\n', text)
|
||||
text = re.sub(r'\n{3}', r'\n\n', text)
|
||||
text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text)
|
||||
text = re.sub(r'p.*\. \n\n', r'', text)
|
||||
text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph
|
||||
text = re.sub(r' \|', r'|', text)
|
||||
# Now put back spaces removed earlier as they're needed here
|
||||
text = re.sub(r'\np\.\n', r'\np. \n', text)
|
||||
|
||||
# Now tidy up links and ids - remove ones that don't have a corresponding opposite
|
||||
if self.opts.keep_links:
|
||||
for i in self.our_links:
|
||||
if i[0] == '#':
|
||||
if i not in self.our_ids:
|
||||
text = re.sub(r'"(.+)":'+i, '\1', text)
|
||||
for i in self.our_ids:
|
||||
if i not in self.our_links:
|
||||
text = re.sub(r'\('+i+'\)', '', text)
|
||||
|
||||
# Note - I'm not checking for escaped '-' as this will also get hyphenated words
|
||||
text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%'])
|
||||
|
||||
text = re.sub(r'%\xa0+', r'%', text) #remove empty spans
|
||||
text = re.sub(r'%%', r'', text) #remove empty spans
|
||||
text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output
|
||||
text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline
|
||||
text = re.sub(r'^\n+', r'', text) #remove newlines at top of file
|
||||
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras
|
||||
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras
|
||||
# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para
|
||||
text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines
|
||||
text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text)
|
||||
text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
|
||||
text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text)
|
||||
text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph
|
||||
text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph
|
||||
text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables
|
||||
# Now put back spaces removed earlier as they're needed here
|
||||
text = re.sub(r'\np\.\n', r'\np. \n', text)
|
||||
text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines
|
||||
|
||||
# started work on trying to fix footnotes
|
||||
# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text)
|
||||
return text
|
||||
@ -110,21 +121,15 @@ class TextileMLizer(OEB2HTML):
|
||||
self.remove_space_after_newline = False
|
||||
return text
|
||||
|
||||
# def remove_leading_ws(self, text):
|
||||
# text = text.replace('\r\n', '\n')
|
||||
# text = text.replace('\r', '\n')
|
||||
# text = re.sub(r'\n[\t ]+', '\n', text)
|
||||
# text = re.sub(r'\n{2,}', '\n', text)
|
||||
# return text
|
||||
|
||||
def check_styles(self, style):
|
||||
txt = '{'
|
||||
# style_string = '%s;' % style
|
||||
# txt += style_string
|
||||
if style['color'] and style['color'] != 'black':
|
||||
txt += 'color:'+style['color']+';'
|
||||
# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'):
|
||||
# txt += 'font-size: %d;' % style['font-size']
|
||||
try:
|
||||
if style['background']:
|
||||
txt += 'background:'+style['background']+';'
|
||||
except:
|
||||
pass
|
||||
txt += '}'
|
||||
if txt == '{}': txt = ''
|
||||
return txt
|
||||
@ -137,7 +142,7 @@ class TextileMLizer(OEB2HTML):
|
||||
return ''
|
||||
|
||||
def check_valign(self, style):
|
||||
tests = {'top':'^','bottom':'~', 'middle':'-'}
|
||||
tests = {'top':'^','bottom':'~'} #, 'middle':'-'}
|
||||
for i in tests:
|
||||
if style['vertical-align'] == i:
|
||||
return tests[i]
|
||||
@ -159,6 +164,7 @@ class TextileMLizer(OEB2HTML):
|
||||
if attribs.has_key('id'): # and attribs['id'] in self.links.values():
|
||||
txt = '(#'+attribs['id']+ ')'
|
||||
self.our_ids.append('#'+attribs['id'])
|
||||
self.id_no_text = u'\xa0'
|
||||
return txt
|
||||
|
||||
def build_block(self, tag, style, attribs):
|
||||
@ -170,7 +176,7 @@ class TextileMLizer(OEB2HTML):
|
||||
txt += self.check_styles(style)
|
||||
return txt
|
||||
|
||||
def dump_text(self, elem, stylizer, page, tag_stack=[]):
|
||||
def dump_text(self, elem, stylizer):
|
||||
'''
|
||||
@elem: The element in the etree that we are working on.
|
||||
@stylizer: The style information attached to the element.
|
||||
@ -197,45 +203,59 @@ class TextileMLizer(OEB2HTML):
|
||||
or style['visibility'] == 'hidden':
|
||||
return ['']
|
||||
|
||||
# Soft scene breaks.
|
||||
text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0']))
|
||||
|
||||
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
|
||||
if tag == 'div':
|
||||
tag = 'p'
|
||||
text.append(self.build_block(tag, style, attribs))
|
||||
block = self.build_block(tag, style, attribs)
|
||||
# Normal paragraph with no styling.
|
||||
if block == '\np':
|
||||
text.append('\n\n')
|
||||
tags.append('\n')
|
||||
else:
|
||||
text.append(block)
|
||||
text.append('. ')
|
||||
tags.append('\n')
|
||||
#self.style_embed = []
|
||||
|
||||
if style['font-style'] == 'italic' or tag in ('i', 'em'):
|
||||
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
|
||||
if self.style_italic == False:
|
||||
text.append('_')
|
||||
# text.append('from '+tag)
|
||||
tags.append('_')
|
||||
self.style_embed.append ('_')
|
||||
self.style_italic = True
|
||||
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
|
||||
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
|
||||
style_string = '%s;' % style
|
||||
text.append(style_string)
|
||||
if self.style_bold == False:
|
||||
text.append('*')
|
||||
# text.append('from '+tag)
|
||||
tags.append('*')
|
||||
self.style_embed.append ('*')
|
||||
self.style_bold = True
|
||||
if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
|
||||
if tag != 'a':
|
||||
if self.style_under == False:
|
||||
text.append('+')
|
||||
tags.append('+')
|
||||
self.style_embed.append ('+')
|
||||
self.style_under = True
|
||||
if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
|
||||
if self.style_strike == False:
|
||||
text.append('-')
|
||||
tags.append('-')
|
||||
self.style_embed.append ('-')
|
||||
self.style_strike = True
|
||||
if tag == 'br':
|
||||
text.append('')
|
||||
tags.append('\n')
|
||||
for i in reversed(self.style_embed):
|
||||
text.append(i)
|
||||
text.append('\n')
|
||||
for i in self.style_embed:
|
||||
text.append(i)
|
||||
tags.append('')
|
||||
self.remove_space_after_newline = True
|
||||
elif tag == 'blockquote':
|
||||
if tag == 'blockquote':
|
||||
text.append('\nbq. ')
|
||||
tags.append('\n')
|
||||
elif tag in ('abbr', 'acronym'):
|
||||
@ -259,7 +279,7 @@ class TextileMLizer(OEB2HTML):
|
||||
text.append('??')
|
||||
tags.append('??')
|
||||
elif tag == 'hr':
|
||||
text.append('\n***\n')
|
||||
text.append('\n***')
|
||||
tags.append('\n')
|
||||
elif tag == 'pre':
|
||||
self.in_pre = True
|
||||
@ -267,12 +287,14 @@ class TextileMLizer(OEB2HTML):
|
||||
tags.append('pre\n')
|
||||
elif tag == 'a':
|
||||
if self.opts.keep_links:
|
||||
text.append ('"')
|
||||
text.append('"')
|
||||
tags.append('a')
|
||||
if attribs.has_key('href'):
|
||||
tags.append('":' + attribs['href'])
|
||||
self.our_links.append(attribs['href'])
|
||||
if attribs.has_key('title'):
|
||||
tags.append('(' + attribs['title'] + ')')
|
||||
self.in_a_link = True
|
||||
elif tag == 'img':
|
||||
if self.opts.keep_image_references:
|
||||
txt = '!' + self.check_halign(style)
|
||||
@ -286,7 +308,7 @@ class TextileMLizer(OEB2HTML):
|
||||
tags.append('!')
|
||||
elif tag in ('ol', 'ul'):
|
||||
self.list.append({'name':tag, 'num':0})
|
||||
text.append('\n')
|
||||
text.append('')
|
||||
tags.append(tag)
|
||||
elif tag == 'li':
|
||||
if self.list: li = self.list[-1]
|
||||
@ -294,7 +316,7 @@ class TextileMLizer(OEB2HTML):
|
||||
text.append('\n')
|
||||
if li['name'] == 'ul': text.append('*'*len(self.list)+' ')
|
||||
elif li['name'] == 'ol': text.append('#'*len(self.list)+' ')
|
||||
tags.append('\n')
|
||||
tags.append('')
|
||||
elif tag == 'dl':
|
||||
text.append('\n')
|
||||
tags.append('')
|
||||
@ -308,12 +330,19 @@ class TextileMLizer(OEB2HTML):
|
||||
text.append('')
|
||||
tags.append('\n')
|
||||
elif tag == 'table':
|
||||
self.in_table = True
|
||||
text.append('')
|
||||
txt = self.build_block(tag, style, attribs)
|
||||
txt += '. \n'
|
||||
if txt != '\ntable. \n':
|
||||
text.append(txt)
|
||||
else:
|
||||
text.append('\n')
|
||||
tags.append('')
|
||||
tags.append('table')
|
||||
elif tag == 'tr':
|
||||
text.append('')
|
||||
txt = self.build_block('', style, attribs)
|
||||
txt += '. '
|
||||
if txt != '\n. ':
|
||||
txt = re.sub ('\n','',txt)
|
||||
text.append(txt)
|
||||
tags.append('|\n')
|
||||
elif tag == 'td':
|
||||
text.append('|')
|
||||
@ -324,13 +353,15 @@ class TextileMLizer(OEB2HTML):
|
||||
txt += '\\' + attribs['colspan']
|
||||
if attribs.has_key ('rowspan'):
|
||||
txt += '/' + attribs['rowspan']
|
||||
try:
|
||||
txt += self.check_styles(style)
|
||||
except:
|
||||
pass
|
||||
if txt != '':
|
||||
text.append(txt+'. ')
|
||||
tags.append('')
|
||||
elif tag == 'th':
|
||||
text.append('|_')
|
||||
|
||||
text.append('. ')
|
||||
text.append('|_. ')
|
||||
tags.append('')
|
||||
elif tag == 'span':
|
||||
if style['font-variant'] == 'small-caps':
|
||||
@ -339,6 +370,7 @@ class TextileMLizer(OEB2HTML):
|
||||
tags.append('&')
|
||||
self.style_smallcap = True
|
||||
else:
|
||||
if self.in_a_link == False:
|
||||
txt = '%'
|
||||
if self.opts.keep_links:
|
||||
txt += self.check_id_tag(attribs)
|
||||
@ -348,26 +380,26 @@ class TextileMLizer(OEB2HTML):
|
||||
tags.append('%')
|
||||
|
||||
if self.opts.keep_links and attribs.has_key('id'):
|
||||
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'):
|
||||
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
|
||||
text.append(self.check_id_tag(attribs))
|
||||
|
||||
# Process the styles for any that we want to keep
|
||||
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'):
|
||||
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \
|
||||
'span', 'table', 'tr', 'td'):
|
||||
if not self.in_a_link:
|
||||
text.append(self.check_styles(style))
|
||||
|
||||
# Process tags that contain text.
|
||||
if hasattr(elem, 'text') and elem.text:
|
||||
txt = elem.text
|
||||
if not self.in_pre:
|
||||
if self.in_table:
|
||||
txt = self.remove_newlines(txt)
|
||||
else:
|
||||
txt = self.remove_leading_ws(txt)
|
||||
text.append(txt)
|
||||
self.id_no_text = u''
|
||||
|
||||
# Recurse down into tags within the tag we are in.
|
||||
for item in elem:
|
||||
text += self.dump_text(item, stylizer, page, tag_stack+tags)
|
||||
text += self.dump_text(item, stylizer)
|
||||
|
||||
# Close all open tags.
|
||||
tags.reverse()
|
||||
@ -375,36 +407,39 @@ class TextileMLizer(OEB2HTML):
|
||||
if tag in ('pre', 'ul', 'ol', 'li', 'table'):
|
||||
if tag == 'pre':
|
||||
self.in_pre = False
|
||||
if tag == 'table':
|
||||
self.in_table = False
|
||||
if tag in ('ul', 'ol'):
|
||||
elif tag in ('ul', 'ol'):
|
||||
if self.list: self.list.pop()
|
||||
if not self.list: text.append('\n')
|
||||
else:
|
||||
if t == 'a':
|
||||
self.in_a_link = False
|
||||
t = ''
|
||||
text.append(self.id_no_text)
|
||||
self.id_no_text = u''
|
||||
if t == '*':
|
||||
self.style_bold = False
|
||||
elif t == '_':
|
||||
self.style_italic = False
|
||||
elif t == '+':
|
||||
self.style_under = False
|
||||
elif t == '-':
|
||||
self.style_strike = False
|
||||
elif t == '&':
|
||||
self.style_smallcap = False
|
||||
if t in ('*', '_', '+', '-'):
|
||||
txt = self.style_embed.pop()
|
||||
text.append(txt)
|
||||
else:
|
||||
text.append('%s' % t)
|
||||
if t == '*': self.style_bold = False
|
||||
if t == '_': self.style_italic = False
|
||||
if t == '+': self.style_under = False
|
||||
if t == '-': self.style_strike = False
|
||||
if t == '&': self.style_smallcap = False
|
||||
|
||||
# Soft scene breaks.
|
||||
text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0']))
|
||||
# try:
|
||||
# ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
|
||||
# if ems >= 1:
|
||||
# text.append('\n' * ems)
|
||||
# except:
|
||||
# pass
|
||||
|
||||
# Add the text that is outside of the tag.
|
||||
if hasattr(elem, 'tail') and elem.tail:
|
||||
tail = elem.tail
|
||||
if not self.in_pre:
|
||||
if self.in_table:
|
||||
tail = self.remove_newlines(tail)
|
||||
else:
|
||||
tail = self.remove_leading_ws(tail)
|
||||
text.append(tail)
|
||||
|
||||
return text
|
||||
|
Loading…
x
Reference in New Issue
Block a user