From 842ba755575c108fc0c8ab93cac383185776f212 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 21:19:28 -0400 Subject: [PATCH] More changes. --- src/calibre/ebooks/textile/functions.py | 34 +++++++++----- src/calibre/ebooks/txt/textileml.py | 59 ++++++++++++++++--------- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index b186e79ad4..0e1811f195 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -219,14 +219,13 @@ class Textile(object): ] glyph_defaults = [ (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign - (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime - (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime + (re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
'), #
scene-break -# (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'([^-])--([^-])'), r'\1—\2'), # em dash + (re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -706,6 +705,21 @@ class Textile(object): result.append(line) return ''.join(result) + def glyphs_only(self, text): + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + for s, r in rules: + line = s.sub(r, line) + result.append(line) + return ''.join(result) + def vAlign(self, input): d = {'^':'top', '-':'middle', '~':'bottom'} return d.get(input, '') @@ -792,7 +806,6 @@ class Textile(object): text = self.noTextile(text) text = self.code(text) - text = self.glyphs(text) text = self.links(text) if not self.noimage: @@ -804,6 +817,7 @@ class Textile(object): text = self.span(text) text = self.footnoteRef(text) + text = self.glyphs(text) return text.rstrip('\n') @@ -814,6 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ + text = self.glyphs_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' @@ -868,7 +883,7 @@ class Textile(object): >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye") 'hello span strong and bold goodbye' """ - qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^', r'&') + qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') pnct = ".,\"'?!;:" for qtag in qtags: @@ -900,9 +915,7 @@ class Textile(object): '%' : 'span', '+' : 'ins', '~' : 'sub', - '^' : 'sup', - '&' : 'span style="font-variant:small-caps;"' -# '&' : 'span style="font-transform:uppercase;font-size:smaller;"' + '^' : 'sup' } tag = qtags[tag] atts = self.pba(atts) @@ -1046,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): return Textile(restricted=True, lite=lite, noimage=noimage).textile(text, rel='nofollow', html_type=html_type) - diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 31c118251d..814ba01a3e 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -69,7 +69,8 @@ class TextileMLizer(OEB2HTML): txt = '%s' % t if txt != '%': text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'(\s|[*_])\[('+t+'[a-zA-Z0-9 \',.*_]+'+t+')\](\s|[*_])', r'\1\2\3', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite @@ -77,14 +78,17 @@ class TextileMLizer(OEB2HTML): for i in self.our_links: if i[0] == '#': if i not in self.our_ids: - text = re.sub(r'"(.+)":'+i, '\1', text) + self.log.debug('Link has no target - %s ...' % i) + text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - text = re.sub(r'\('+i+'\)', '', text) + self.log.debug('ID has no link - %s ...' % i) + text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) - # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\*', '_', '\+', '-']) -# text = check_escaping(text, ['\*', '_', '\+', '-']) + # Remove obvious non-needed escaping, add sub/sup-script ones + text = check_escaping(text, ['\*', '_', '\*']) + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed text = re.sub(r'%\xa0+', r'%', text) #remove empty spans text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? @@ -96,13 +100,14 @@ class TextileMLizer(OEB2HTML): # text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines # text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) - text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) + text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) + text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) - text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) + text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here @@ -193,7 +198,8 @@ class TextileMLizer(OEB2HTML): return txt def prepare_string_for_textile(self, txt): - if re.search(r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)', txt): +# if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt): + if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): return ' ==%s== ' % txt return txt @@ -240,15 +246,23 @@ class TextileMLizer(OEB2HTML): if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: - text.append('[_') - tags.append('_]') + if self.in_a_link: + text.append('_') + tags.append('_') + else: + text.append('[_') + tags.append('_]') self.style_embed.append('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): if self.style_bold == False: - text.append('[*') - tags.append('*]') + if self.in_a_link: + text.append('*') + tags.append('*') + else: + text.append('[*') + tags.append('*]') self.style_embed.append('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): @@ -304,14 +318,17 @@ class TextileMLizer(OEB2HTML): tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: - text.append('"') - tags.append('a') if attribs.has_key('href'): + text.append('"') + tags.append('a') tags.append('":' + attribs['href']) self.our_links.append(attribs['href']) - if attribs.has_key('title'): - tags.append('(' + attribs['title'] + ')') - self.in_a_link = True + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + self.in_a_link = True + else: + text.append('%') + tags.append('%') elif tag == 'img': if self.opts.keep_image_references: txt = '!' + self.check_halign(style) @@ -432,9 +449,9 @@ class TextileMLizer(OEB2HTML): t = '' text.append(self.id_no_text) self.id_no_text = u'' - if t == '*]': + if t in ('*]', '*'): self.style_bold = False - elif t == '_]': + elif t in ('_]', '_'): self.style_italic = False elif t == '+]': self.style_under = False @@ -442,7 +459,7 @@ class TextileMLizer(OEB2HTML): self.style_strike = False elif t == '&': self.style_smallcap = False - if t in ('*]', '_]', '+]', '-]'): + if t in ('*]', '_]', '+]', '-]', '*', '_'): txt = self.style_embed.pop() text.append('%s' % t)