From 05331d7f05de3ed3010a63b5c0d754452ee23782 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 30 Apr 2011 09:43:09 -0400 Subject: [PATCH] TXT: Textile changes. --- src/calibre/ebooks/txt/processor.py | 2 + src/calibre/ebooks/txt/textileml.py | 231 ++++++++++++++++------------ 2 files changed, 135 insertions(+), 98 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 7e161f63bd..54369190de 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -242,6 +242,8 @@ def detect_formatting_type(txt): textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt)) # Links textile_count += len(re.findall(r'"[^"]*":\S+', txt)) + # paragraph blocks + textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt)) # Decide if either markdown or textile is used in the text # based on the number of unique formatting elements found. diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 42b709a681..622ff8d2e3 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en' ''' Transform OEB content into Textile formatted plain text ''' - import re from functools import partial @@ -16,8 +15,6 @@ from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.txt.unsmarten import unsmarten -from operator import itemgetter - class TextileMLizer(OEB2HTML): @@ -29,17 +26,20 @@ class TextileMLizer(OEB2HTML): self.links = {} self.list = [] self.our_links = [] + self.in_a_link = False self.our_ids = [] self.images = {} + self.id_no_text = u'' + self.style_embed = [] self.remove_space_after_newline = False self.base_hrefs = [item.href for item in oeb_book.spine] self.map_resources(oeb_book) -# self.style_bold = False -# self.style_italic = False -# self.style_under = False -# self.style_strike = False -# self.style_smallcap = False + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False txt = self.mlize_spine(oeb_book) txt = unsmarten(txt) @@ -56,7 +56,7 @@ class TextileMLizer(OEB2HTML): self.rewrite_ids(item.data, item) rewrite_links(item.data, partial(self.rewrite_link, page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) output.append('\n\n') return ''.join(output) @@ -64,36 +64,47 @@ class TextileMLizer(OEB2HTML): # Needs tweaking and finetuning def check_escaping(text, tests): for t in tests: - text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) + # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged + txt = '%s' % t + self.log.debug('DEBUG: ' + txt) + if txt != '%': + text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text - # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) - - text = re.sub(r' +\n', r'\n', text) - text = re.sub(r'^\n+', r'', text) - text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) - text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text) - text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) - text = re.sub(r'\n{3}', r'\n\n', text) - text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) - text = re.sub(r'p.*\. \n\n', r'', text) - text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph - text = re.sub(r' \|', r'|', text) - # Now put back spaces removed earlier as they're needed here - text = re.sub(r'\np\.\n', r'\np. \n', text) - # Now tidyup links and ids - remove ones that don't have a correponding opposite if self.opts.keep_links: for i in self.our_links: - if i not in self.our_ids: - text = re.sub(r'"(.+)":'+i, '\1', text) + if i[0] == '#': + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i, '\1', text) for i in self.our_ids: if i not in self.our_links: text = re.sub(r'\('+i+'\)', '', text) + + # Note - I'm not checking for escaped '-' as this will also get hypenated words + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = re.sub(r'%\xa0+', r'%', text) #remove empty spans + text = re.sub(r'%%', r'', text) #remove empty spans + text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output + text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline + text = re.sub(r'^\n+', r'', text) #remove newlines at top of file + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras +# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para + text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines + text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) + text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) + text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph + text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines + # started work on trying to fix footnotes # text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text @@ -110,21 +121,15 @@ class TextileMLizer(OEB2HTML): self.remove_space_after_newline = False return text -# def remove_leading_ws(self, text): -# text = text.replace('\r\n', '\n') -# text = text.replace('\r', '\n') -# text = re.sub(r'\n[\t ]+', '\n', text) -# text = re.sub(r'\n{2,}', '\n', text) -# return text - def check_styles(self, style): txt = '{' -# style_string = '%s;' % style -# txt += style_string if style['color'] and style['color'] != 'black': txt += 'color:'+style['color']+';' -# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'): -# txt += 'font-size: %d;' % style['font-size'] + try: + if style['background']: + txt += 'background:'+style['background']+';' + except: + pass txt += '}' if txt == '{}': txt = '' return txt @@ -137,7 +142,7 @@ class TextileMLizer(OEB2HTML): return '' def check_valign(self, style): - tests = {'top':'^','bottom':'~', 'middle':'-'} + tests = {'top':'^','bottom':'~'} #, 'middle':'-'} for i in tests: if style['vertical-align'] == i: return tests[i] @@ -157,8 +162,9 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' if attribs.has_key('id'): # and attribs['id'] in self.links.values(): - txt = '(#'+attribs['id']+ ')' - self.our_ids.append('#'+attribs['id']) + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) + self.id_no_text = u'\xa0' return txt def build_block(self, tag, style, attribs): @@ -170,7 +176,7 @@ class TextileMLizer(OEB2HTML): txt += self.check_styles(style) return txt - def dump_text(self, elem, stylizer, page, tag_stack=[]): + def dump_text(self, elem, stylizer): ''' @elem: The element in the etree that we are working on. @stylizer: The style information attached to the element. @@ -197,45 +203,59 @@ class TextileMLizer(OEB2HTML): or style['visibility'] == 'hidden': return [''] + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs)) - text.append('. ') - tags.append('\n') + block = self.build_block(tag, style, attribs) + # Normal paragraph with no styling. + if block == '\np': + text.append('\n\n') + tags.append('\n') + else: + text.append(block) + text.append('. ') + tags.append('\n') + #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: text.append('_') -# text.append('from '+tag) tags.append('_') + self.style_embed.append ('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): - style_string = '%s;' % style - text.append(style_string) if self.style_bold == False: text.append('*') -# text.append('from '+tag) tags.append('*') + self.style_embed.append ('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: text.append('+') tags.append('+') + self.style_embed.append ('+') self.style_under = True if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): if self.style_strike == False: text.append('-') tags.append('-') + self.style_embed.append ('-') self.style_strike = True if tag == 'br': - text.append('') - tags.append('\n') + for i in reversed(self.style_embed): + text.append(i) + text.append('\n') + for i in self.style_embed: + text.append(i) + tags.append('') self.remove_space_after_newline = True - elif tag == 'blockquote': + if tag == 'blockquote': text.append('\nbq. ') tags.append('\n') elif tag in ('abbr', 'acronym'): @@ -259,7 +279,7 @@ class TextileMLizer(OEB2HTML): text.append('??') tags.append('??') elif tag == 'hr': - text.append('\n***\n') + text.append('\n***') tags.append('\n') elif tag == 'pre': self.in_pre = True @@ -267,12 +287,14 @@ class TextileMLizer(OEB2HTML): tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: - text.append ('"') + text.append('"') + tags.append('a') if attribs.has_key('href'): tags.append('":' + attribs['href']) self.our_links.append(attribs['href']) if attribs.has_key('title'): tags.append('(' + attribs['title'] + ')') + self.in_a_link = True elif tag == 'img': if self.opts.keep_image_references: txt = '!' + self.check_halign(style) @@ -286,7 +308,7 @@ class TextileMLizer(OEB2HTML): tags.append('!') elif tag in ('ol', 'ul'): self.list.append({'name':tag, 'num':0}) - text.append('\n') + text.append('') tags.append(tag) elif tag == 'li': if self.list: li = self.list[-1] @@ -294,7 +316,7 @@ class TextileMLizer(OEB2HTML): text.append('\n') if li['name'] == 'ul': text.append('*'*len(self.list)+' ') elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') - tags.append('\n') + tags.append('') elif tag == 'dl': text.append('\n') tags.append('') @@ -308,12 +330,19 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append('\n') elif tag == 'table': - self.in_table = True - text.append('') + txt = self.build_block(tag, style, attribs) + txt += '. \n' + if txt != '\ntable. \n': + text.append(txt) + else: + text.append('\n') tags.append('') - tags.append('table') elif tag == 'tr': - text.append('') + txt = self.build_block('', style, attribs) + txt += '. ' + if txt != '\n. ': + txt = re.sub ('\n','',txt) + text.append(txt) tags.append('|\n') elif tag == 'td': text.append('|') @@ -324,13 +353,15 @@ class TextileMLizer(OEB2HTML): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): txt += '/' + attribs['rowspan'] + try: + txt += self.check_styles(style) + except: + pass if txt != '': text.append(txt+'. ') tags.append('') elif tag == 'th': - text.append('|_') - - text.append('. ') + text.append('|_. ') tags.append('') elif tag == 'span': if style['font-variant'] == 'small-caps': @@ -339,35 +370,36 @@ class TextileMLizer(OEB2HTML): tags.append('&') self.style_smallcap = True else: - txt = '%' - if self.opts.keep_links: - txt += self.check_id_tag(attribs) - txt += self.check_styles(style) - if txt != '%': - text.append(txt) - tags.append('%') + if self.in_a_link == False: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') if self.opts.keep_links and attribs.has_key('id'): - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'): text.append(self.check_id_tag(attribs)) # Process the styles for any that we want to keep - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'): - text.append(self.check_styles(style)) + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \ + 'span', 'table', 'tr', 'td'): + if not self.in_a_link: + text.append(self.check_styles(style)) # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text if not self.in_pre: - if self.in_table: - txt = self.remove_newlines(txt) - else: - txt = self.remove_leading_ws(txt) + txt = self.remove_newlines(txt) text.append(txt) + self.id_no_text = u'' # Recurse down into tags within the tag we are in. for item in elem: - text += self.dump_text(item, stylizer, page, tag_stack+tags) + text += self.dump_text(item, stylizer) # Close all open tags. tags.reverse() @@ -375,36 +407,39 @@ class TextileMLizer(OEB2HTML): if tag in ('pre', 'ul', 'ol', 'li', 'table'): if tag == 'pre': self.in_pre = False - if tag == 'table': - self.in_table = False - if tag in ('ul', 'ol'): + elif tag in ('ul', 'ol'): if self.list: self.list.pop() if not self.list: text.append('\n') else: - text.append('%s' % t) - if t == '*': self.style_bold = False - if t == '_': self.style_italic = False - if t == '+': self.style_under = False - if t == '-': self.style_strike = False - if t == '&': self.style_smallcap = False + if t == 'a': + self.in_a_link = False + t = '' + text.append(self.id_no_text) + self.id_no_text = u'' + if t == '*': + self.style_bold = False + elif t == '_': + self.style_italic = False + elif t == '+': + self.style_under = False + elif t == '-': + self.style_strike = False + elif t == '&': + self.style_smallcap = False + if t in ('*', '_', '+', '-'): + txt = self.style_embed.pop() + text.append(txt) + else: + text.append('%s' % t) # Soft scene breaks. text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) -# try: -# ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) -# if ems >= 1: -# text.append('\n' * ems) -# except: -# pass # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: tail = elem.tail if not self.in_pre: - if self.in_table: - tail = self.remove_newlines(tail) - else: - tail = self.remove_leading_ws(tail) + tail = self.remove_newlines(tail) text.append(tail) return text