From 804b248d46c71e5169c57da794ec2f69f2998dbf Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 11:55:44 -0400 Subject: [PATCH 01/15] Add new but still wip textile output generator. --- src/calibre/ebooks/txt/output.py | 21 +- src/calibre/ebooks/txt/textileml.py | 341 +++++++++++++++++++++++++--- src/calibre/ebooks/txt/unsmarten.py | 109 +++++++++ 3 files changed, 432 insertions(+), 39 deletions(-) create mode 100644 src/calibre/ebooks/txt/unsmarten.py diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 4e54a97b45..7b50afb345 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -70,16 +70,17 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): + print 'New' if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer - writer = MarkdownMLizer(log) + self.writer = MarkdownMLizer(log) elif opts.txt_output_formatting.lower() == 'textile': from calibre.ebooks.txt.textileml import TextileMLizer - writer = TextileMLizer(log) + self.writer = TextileMLizer(log) else: - writer = TXTMLizer(log) + self.writer = TXTMLizer(log) - txt = writer.extract_content(oeb_book, opts) + txt = self.writer.extract_content(oeb_book, opts) txt = clean_ascii_chars(txt) log.debug('\tReplacing newlines with selected type...') @@ -118,10 +119,18 @@ class TXTZOutput(TXTOutput): # Images for item in oeb_book.manifest: if item.media_type in OEB_IMAGES: - path = os.path.join(tdir, os.path.dirname(item.href)) + if hasattr(self.writer, 'images'): + path = os.path.join(tdir, 'images') + if item.href in self.writer.images: + href = self.writer.images[item.href] + else: + continue + else: + path = os.path.join(tdir, os.path.dirname(item.href)) + href = os.path.basename(item.href) if not os.path.exists(path): os.makedirs(path) - with open(os.path.join(tdir, item.href), 'wb') as imgf: + with open(os.path.join(path, href), 'wb') as imgf: imgf.write(item.data) # Metadata diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index d7e11695c5..9651fa8971 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- __license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' +__copyright__ = '2011, Leigh Parry ' __docformat__ = 'restructuredtext en' ''' @@ -10,53 +10,328 @@ Transform OEB content into Textile formatted plain text import re -from lxml import etree +from functools import partial -from calibre.ebooks.oeb.base import XHTML -from calibre.utils.html2textile import html2textile +from calibre.ebooks.htmlz.oeb2html import OEB2HTML +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks.txt.unsmarten import unsmarten +from operator import itemgetter -class TextileMLizer(object): - def __init__(self, log): - self.log = log +class TextileMLizer(OEB2HTML): def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to Textile formatted TXT...') - self.oeb_book = oeb_book self.opts = opts + self.in_pre = False + self.in_table = False + self.links = {} + self.list = [] + self.images = {} + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) - return self.mlize_spine() + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False - def mlize_spine(self): + txt = self.mlize_spine(oeb_book) + txt = unsmarten(txt) + + # Do some tidying up + txt = self.tidy_up(txt) + + return txt + + def mlize_spine(self, oeb_book): output = [u''] - - for item in self.oeb_book.spine: + for item in oeb_book.spine: self.log.debug('Converting %s to Textile formatted TXT...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output.append('\n\n') + return ''.join(output) - html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + def tidy_up(self, text): + def check_count(text, tests): + x = [] + for i, t in enumerate(reversed(tests)): + x.append((text.count(t), i, t)) + if x: + return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] + return '' - if not self.opts.keep_links: - html = re.sub(r'<\s*/*\s*a[^>]*>', '', html) - if not self.opts.keep_image_references: - html = re.sub(r'<\s*img[^>]*>', '', html) + # NEEDS TWEAKING +# def check_escaping(text, tests): +# for t in tests: +# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text) +# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text) +# return text - text = html2textile(html) + txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) + text = re.sub(txt+'(\S)', r'\n\1', text) - # Ensure the section ends with at least two new line characters. - # This is to prevent the last paragraph from a section being - # combined into the fist paragraph of the next. - end_chars = text[-4:] - # Convert all newlines to \n - end_chars = end_chars.replace('\r\n', '\n') - end_chars = end_chars.replace('\r', '\n') - end_chars = end_chars[-2:] - if not end_chars[1] == '\n': - text += '\n\n' - if end_chars[1] == '\n' and not end_chars[0] == '\n': - text += '\n' +# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) - output += text + text = re.sub('\npre\. bc\.', '\nbc.', text) + text = re.sub('\np=. p. ', '\np. ', text) + text = re.sub('\np=. \n', '\n', text) + text = re.sub('\n{3,}', '\n\n', text) + text = re.sub(' \|', '|', text) - output = u''.join(output) + # started work on trying to fix footnotes +# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) + return text - return output + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. + text = re.sub(r'[ ]{2,}', ' ', text) + text = re.sub(r'\t+', '', text) + return text + + def remove_leading_ws(self, text): + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + text = re.sub(r'\n+', '\n', text) + text = re.sub(r'\n[\t ]+', '\n', text) + return text + + def check_align(self, style, align, tests): + for i in tests: + if style[align] == i[0]: + return i[1] + return '' + + def check_padding(self, style, tests): + txt = '' + for i in tests: + try: + ems = int(round(float(style[i[0]] / style['font-size']))) + if ems >=1: + txt += i[1] * ems + except: + pass + return txt + + def check_id_tag(self, attribs): + txt = '' + if attribs.has_key('id'): + txt = '(#'+attribs['id']+')' + return txt + + def build_block(self, tag, style, attribs, finish): + txt = tag + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) + txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += finish + return txt + + def dump_text(self, elem, stylizer, page, tag_stack=[]): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + # Ignore anything that is set to not be displayed. + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return [''] + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + text.append(self.build_block(tag, style, attribs, '. ')) + tags.append('\n') + + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + if self.style_bold == False: + text.append('*') + tags.append('*') + self.style_bold = True + if style['font-style'] == 'italic' or tag in ('i', 'em'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): + if self.style_italic == False: + text.append('_') + tags.append('_') + self.style_italic = True + if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): + if tag != 'a': + if self.style_under == False: + text.append('+') + tags.append('+') + self.style_under = True + if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): + if self.style_strike == False: + text.append('-') + tags.append('-') + self.style_strike = True + if style['font-variant'] == 'small-caps': + if self.style_smallcap == 0: + text.append('&') + tags.append('&') + self.style_smallcap = 1 + if tag == 'br': + text.append('') + tags.append('\n') + elif tag == 'blockquote': + text.append('bq. ') + tags.append('\n') + elif tag in ('abbr', 'acronym'): + text.append('') + txt = attribs['title'] + tags.append('(' + txt + ')') + elif tag == 'sup': + text.append('^') + tags.append('^') + elif tag == 'sub': + text.append('~') + tags.append('~') + elif tag == 'code': + if self.in_pre: + text.append('bc. ') + tags.append('\n') + else: + text.append('@') + tags.append('@') + elif tag == 'cite': + text.append('??') + tags.append('??') + elif tag == 'hr': + text.append('\n***\n') + tags.append('\n') + elif tag == 'pre': + self.in_pre = True + text.append('pre. ') + tags.append('pre') + elif tag == 'a': + if self.opts.keep_links: + text.append ('"') + tags.append('":' + attribs['href']) + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + elif tag == 'img': + if self.opts.keep_image_references: + text.append ('!' + attribs['src']) + if attribs.has_key('alt'): + txt = attribs['alt'] + if txt != '': + text.append('(' + txt + ')') + tags.append('!') + elif tag in ('ol', 'ul'): + self.list.append({'name':tag, 'num':0}) + text.append('') + tags.append(tag) + elif tag == 'li': + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + if li['name'] == 'ul': text.append('*'*len(self.list)+' ') + elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + elif tag == 'dl': + text.append('\n') + tags.append('') + elif tag == 'dt': + text.append('') + tags.append('\n') + elif tag == 'dd': + text.append(' ') + tags.append('') + elif tag == 'dd': + text.append('') + tags.append('\n') + elif tag == 'table': + self.in_table = True + text.append('') + tags.append('table') + elif tag == 'tr': + text.append('') + tags.append('|\n') + elif tag == 'td': + text.append('|') + txt = '' + txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']]) + if attribs.has_key ('colspan'): + txt += '\\' + attribs['colspan'] + if attribs.has_key ('rowspan'): + txt += '/' + attribs['rowspan'] + if txt != '': + text.append(txt+'. ') + tags.append('') + elif tag == 'th': + text.append('|_. ') + tags.append('') + + if self.opts.keep_links and attribs.has_key('id'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + text.append('(#' + attribs['id'] + ')') + + # If wanted process all style tags here - before taxt in tags is written + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + txt = elem.text + if not self.in_pre: + if self.in_table: + txt = self.remove_newlines(txt) + else: + txt = self.remove_leading_ws(txt) + text.append(txt) + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer, page, tag_stack+tags) + + # Close all open tags. + tags.reverse() + for t in tags: + if tag in ('pre', 'ul', 'ol', 'li', 'table'): + if tag == 'pre': + self.in_pre = False + if tag == 'table': + self.in_table = False + if tag in ('ul', 'ol'): + if self.list: self.list.pop() + else: + text.append('%s' % t) + if t == '*': self.style_bold = False + if t == '_': self.style_italic = False + if t == '+': self.style_under = False + if t == '-': self.style_strike = False + if t == '&': self.style_smallcap = False + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + tail = elem.tail + if not self.in_pre: + if self.in_table: + tail = self.remove_newlines(tail) + else: + tail = self.remove_leading_ws(tail) + text.append(tail) + + return text diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py new file mode 100644 index 0000000000..30a22bf069 --- /dev/null +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +''' + +''' + +__version__ = '0.1' +__author__ = 'Leigh Parry' + +import re + +def unsmarten(txt): + txt = re.sub(u'–|–|–', r'-', txt) # en-dash + txt = re.sub(u'—|—|—', r'--', txt) # em-dash + txt = re.sub(u'…|…|…', r'...', txt) # ellipsis + + txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote + txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe + txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote + + txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent + txt = re.sub(u'£|£|£', r'{L-}', txt) # pound + txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen + txt = re.sub(u'©|©|©', r'{(c)}', txt) # copyright + txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered + txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter + txt = re.sub(u'½|½|½', r'{1/2}', txt) # half + txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter + txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave + txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute + txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex + txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde + txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut + txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring + txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE + txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla + txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave + txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute + txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex + txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut + txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave + txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute + txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex + txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut + txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH + txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde + txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # O-grave + txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute + txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex + txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde + txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut + txt = re.sub(u'×|×|×', r'{x}', txt) # dimension + txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash + txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave + txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute + txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex + txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut + txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave + txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s + txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave + txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute + txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex + txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde + txt = re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut + txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring + txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae + txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla + txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave + txt = re.sub(u'é|é|é', r"{e'}", txt) # e-acute + txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex + txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut + txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave + txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute + txt = re.sub(u'î|î|î', r'{i^}', txt) # i-circumflex + txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut + txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth + txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde + txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave + txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute + txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex + txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde + txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut + txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke + txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave + txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute + txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex + txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut + txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute + txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut + txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE + txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe + txt = re.sub(u'Ŝ|Š|Ŝ', r'{S^}', txt) # Scaron + txt = re.sub(u'ŝ|š|ŝ', r'{s^}', txt) # scaron + txt = re.sub(u'•|•|•', r'{*}', txt) # bullet + txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc + txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira + txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee + txt = re.sub(u'€|€|€', r'{C=}', txt) # euro + txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark + txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade + txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club + txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart + txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond + + txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph + txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph + txt = re.sub(u'\n \n', r'\n
\n', txt) # blank paragraph - br tag + + return txt From be3d441d3bb4705fc24261312644ae148a0581c4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 19 Apr 2011 06:49:27 -0400 Subject: [PATCH 02/15] More textile work. --- src/calibre/ebooks/txt/output.py | 1 - src/calibre/ebooks/txt/textileml.py | 125 +++++++++++++++++++--------- src/calibre/ebooks/txt/unsmarten.py | 11 ++- 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 7b50afb345..606dec4a63 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -70,7 +70,6 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - print 'New' if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer self.writer = MarkdownMLizer(log) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 9651fa8971..9a025e0aef 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -58,31 +58,39 @@ class TextileMLizer(OEB2HTML): return ''.join(output) def tidy_up(self, text): - def check_count(text, tests): - x = [] - for i, t in enumerate(reversed(tests)): - x.append((text.count(t), i, t)) - if x: - return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] - return '' +# def check_count(text, tests): +# x = [] +# for i, t in enumerate(reversed(tests)): +# x.append((text.count(t), i, t)) +# if x: +# return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] +# return '' - # NEEDS TWEAKING -# def check_escaping(text, tests): -# for t in tests: -# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text) -# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text) -# return text + # Needs tweaking and finetuning - don't use yet. + def check_escaping(text, tests): + for t in tests: + text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) +# text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) +# text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) +# text = re.sub(r'(["\'])\[('+t+'\w+'+t+')\]', r'\1\2', text) +# text = re.sub(r'\[('+t+'\w+'+t+')\](["\',\.!\?])', r'\1\2', text) + return text - txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) - text = re.sub(txt+'(\S)', r'\n\1', text) +# txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) +# text = re.sub(txt+'(\S)', r'\n\1', text) -# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) - text = re.sub('\npre\. bc\.', '\nbc.', text) - text = re.sub('\np=. p. ', '\np. ', text) - text = re.sub('\np=. \n', '\n', text) - text = re.sub('\n{3,}', '\n\n', text) - text = re.sub(' \|', '|', text) + text = re.sub(r'^\n+', r'', text) + text = re.sub(r'\npre\. bc\.', r'\nbc.', text) + text = re.sub(r'\nbq\. \n\np\. ', r'\nbq. ', text) + text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) + text = re.sub(r'\n{3}', r'\n\n', text) + text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) + text = re.sub(r'p.*\. \n\n', r'', text) +# text = re.sub(u'\n \n', r'\n
\n', text) # blank paragraph - br tag + text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph + text = re.sub(r' \|', r'|', text) # started work on trying to fix footnotes # text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) @@ -94,20 +102,29 @@ class TextileMLizer(OEB2HTML): text = text.replace('\r', ' ') # Condense redundant spaces created by replacing newlines with spaces. text = re.sub(r'[ ]{2,}', ' ', text) - text = re.sub(r'\t+', '', text) + text = re.sub(r'\t +', '', text) +# text = re.sub(r'\n +', '', text) return text def remove_leading_ws(self, text): text = text.replace('\r\n', '\n') text = text.replace('\r', '\n') - text = re.sub(r'\n+', '\n', text) text = re.sub(r'\n[\t ]+', '\n', text) + text = re.sub(r'\n{2,}', '\n', text) return text - def check_align(self, style, align, tests): + def check_halign(self, style): + tests = {'left':'<','justify':'<>','center':'=','right':'>'} for i in tests: - if style[align] == i[0]: - return i[1] + if style['text-align'] == i: + return tests[i] + return '' + + def check_valign(self, style): + tests = {'top':'^','bottom':'~', 'middle':'-'} + for i in tests: + if style['vertical-align'] == i: + return tests[i] return '' def check_padding(self, style, tests): @@ -124,15 +141,16 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' if attribs.has_key('id'): - txt = '(#'+attribs['id']+')' + #if attribs['id'] in self.links: + txt = '(#'+attribs['id']+')' return txt def build_block(self, tag, style, attribs, finish): - txt = tag + txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) - txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += self.check_halign(style) txt += finish return txt @@ -163,7 +181,17 @@ class TextileMLizer(OEB2HTML): or style['visibility'] == 'hidden': return [''] - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): + #For debugging + if tag == 'h1': + for i in self.links: + text.append(i) + text.append('\n') + if tag == 'div': + tag = 'p' text.append(self.build_block(tag, style, attribs, '. ')) tags.append('\n') @@ -191,10 +219,10 @@ class TextileMLizer(OEB2HTML): tags.append('-') self.style_strike = True if style['font-variant'] == 'small-caps': - if self.style_smallcap == 0: + if self.style_smallcap == False: text.append('&') tags.append('&') - self.style_smallcap = 1 + self.style_smallcap = True if tag == 'br': text.append('') tags.append('\n') @@ -236,7 +264,10 @@ class TextileMLizer(OEB2HTML): tags.append('(' + attribs['title'] + ')') elif tag == 'img': if self.opts.keep_image_references: - text.append ('!' + attribs['src']) + txt = '!' + self.check_halign(style) + txt += self.check_valign(style) + txt += attribs['src'] + text.append(txt) if attribs.has_key('alt'): txt = attribs['alt'] if txt != '': @@ -247,6 +278,7 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append(tag) elif tag == 'li': +# text.append('\n') if self.list: li = self.list[-1] else: li = {'name':'ul', 'num':0} if li['name'] == 'ul': text.append('*'*len(self.list)+' ') @@ -273,8 +305,8 @@ class TextileMLizer(OEB2HTML): elif tag == 'td': text.append('|') txt = '' - txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) - txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']]) + txt += self.check_halign(style) + txt += self.check_valign(style) if attribs.has_key ('colspan'): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): @@ -288,7 +320,10 @@ class TextileMLizer(OEB2HTML): if self.opts.keep_links and attribs.has_key('id'): if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - text.append('(#' + attribs['id'] + ')') + if tag == 'span': + text.append(' %') + tags.append('% ') + text.append('(#' + attribs['id'] + u')\xa0') # If wanted process all style tags here - before taxt in tags is written @@ -318,11 +353,19 @@ class TextileMLizer(OEB2HTML): if self.list: self.list.pop() else: text.append('%s' % t) - if t == '*': self.style_bold = False - if t == '_': self.style_italic = False - if t == '+': self.style_under = False - if t == '-': self.style_strike = False - if t == '&': self.style_smallcap = False + if t == '*': + self.style_bold = False + if t == '_': + self.style_italic = False + if t == '+': + self.style_under = False + if t == '-': + self.style_strike = False + if t == '&': + self.style_smallcap = False + + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py index 30a22bf069..40444ba601 100644 --- a/src/calibre/ebooks/txt/unsmarten.py +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -''' - -''' +"""unsmarten : html2textile helper function""" __version__ = '0.1' __author__ = 'Leigh Parry' @@ -102,8 +100,9 @@ def unsmarten(txt): txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond - txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph - txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph - txt = re.sub(u'\n \n', r'\n
\n', txt) # blank paragraph - br tag + # Move into main code? +# txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph +# txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph +# txt = re.sub(u'\n \n', r'\n
\n', txt) # blank paragraph - br tag return txt From ff6043ce0f0659edce0c05e7e669f5e9c106ea96 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 30 Apr 2011 08:44:30 -0400 Subject: [PATCH 03/15] ... --- src/calibre/ebooks/textile/functions.py | 12 +- src/calibre/ebooks/txt/textileml.py | 202 ++++++++++++++---------- 2 files changed, 123 insertions(+), 91 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 39f793face..dd1914cf9f 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -225,8 +225,8 @@ class Textile(object): (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
'), #
scene-break - (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'(\s)--(\s)'), r'\1—\2'), # em dash +# (re.compile(r'\b--\b'), r'—'), # em dash + (re.compile(r'([^-])--([^-])'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -868,7 +868,7 @@ class Textile(object): >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye") 'hello span strong and bold goodbye' """ - qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') + qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^', r'&') pnct = ".,\"'?!;:" for qtag in qtags: @@ -900,7 +900,9 @@ class Textile(object): '%' : 'span', '+' : 'ins', '~' : 'sub', - '^' : 'sup' + '^' : 'sup', + '&' : 'span style="font-variant:small-caps;"' +# '&' : 'span style="font-transform:uppercase;font-size:smaller;"' } tag = qtags[tag] atts = self.pba(atts) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 9a025e0aef..42b709a681 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -28,15 +28,18 @@ class TextileMLizer(OEB2HTML): self.in_table = False self.links = {} self.list = [] + self.our_links = [] + self.our_ids = [] self.images = {} + self.remove_space_after_newline = False self.base_hrefs = [item.href for item in oeb_book.spine] self.map_resources(oeb_book) - self.style_bold = False - self.style_italic = False - self.style_under = False - self.style_strike = False - self.style_smallcap = False +# self.style_bold = False +# self.style_italic = False +# self.style_under = False +# self.style_strike = False +# self.style_smallcap = False txt = self.mlize_spine(oeb_book) txt = unsmarten(txt) @@ -58,42 +61,41 @@ class TextileMLizer(OEB2HTML): return ''.join(output) def tidy_up(self, text): -# def check_count(text, tests): -# x = [] -# for i, t in enumerate(reversed(tests)): -# x.append((text.count(t), i, t)) -# if x: -# return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] -# return '' - - # Needs tweaking and finetuning - don't use yet. + # Needs tweaking and finetuning def check_escaping(text, tests): for t in tests: text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) -# text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) -# text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) -# text = re.sub(r'(["\'])\[('+t+'\w+'+t+')\]', r'\1\2', text) -# text = re.sub(r'\[('+t+'\w+'+t+')\](["\',\.!\?])', r'\1\2', text) + text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) + text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text -# txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) -# text = re.sub(txt+'(\S)', r'\n\1', text) - - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) + # Note - I'm not checking for escaped '-' as this will also get hypenated words + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = re.sub(r' +\n', r'\n', text) text = re.sub(r'^\n+', r'', text) - text = re.sub(r'\npre\. bc\.', r'\nbc.', text) - text = re.sub(r'\nbq\. \n\np\. ', r'\nbq. ', text) + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text) text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) text = re.sub(r'\n{3}', r'\n\n', text) text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) text = re.sub(r'p.*\. \n\n', r'', text) -# text = re.sub(u'\n \n', r'\n
\n', text) # blank paragraph - br tag text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph text = re.sub(r' \|', r'|', text) + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + + # Now tidyup links and ids - remove ones that don't have a correponding opposite + if self.opts.keep_links: + for i in self.our_links: + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i, '\1', text) + for i in self.our_ids: + if i not in self.our_links: + text = re.sub(r'\('+i+'\)', '', text) # started work on trying to fix footnotes -# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) +# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text def remove_newlines(self, text): @@ -102,16 +104,30 @@ class TextileMLizer(OEB2HTML): text = text.replace('\r', ' ') # Condense redundant spaces created by replacing newlines with spaces. text = re.sub(r'[ ]{2,}', ' ', text) - text = re.sub(r'\t +', '', text) -# text = re.sub(r'\n +', '', text) + text = re.sub(r'\t+', '', text) + if self.remove_space_after_newline == True: + text = re.sub(r'^ +', '', text) + self.remove_space_after_newline = False return text - def remove_leading_ws(self, text): - text = text.replace('\r\n', '\n') - text = text.replace('\r', '\n') - text = re.sub(r'\n[\t ]+', '\n', text) - text = re.sub(r'\n{2,}', '\n', text) - return text +# def remove_leading_ws(self, text): +# text = text.replace('\r\n', '\n') +# text = text.replace('\r', '\n') +# text = re.sub(r'\n[\t ]+', '\n', text) +# text = re.sub(r'\n{2,}', '\n', text) +# return text + + def check_styles(self, style): + txt = '{' +# style_string = '%s;' % style +# txt += style_string + if style['color'] and style['color'] != 'black': + txt += 'color:'+style['color']+';' +# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'): +# txt += 'font-size: %d;' % style['font-size'] + txt += '}' + if txt == '{}': txt = '' + return txt def check_halign(self, style): tests = {'left':'<','justify':'<>','center':'=','right':'>'} @@ -140,18 +156,18 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' - if attribs.has_key('id'): - #if attribs['id'] in self.links: - txt = '(#'+attribs['id']+')' + if attribs.has_key('id'): # and attribs['id'] in self.links.values(): + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) return txt - def build_block(self, tag, style, attribs, finish): + def build_block(self, tag, style, attribs): txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) txt += self.check_halign(style) - txt += finish + txt += self.check_styles(style) return txt def dump_text(self, elem, stylizer, page, tag_stack=[]): @@ -175,38 +191,35 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': return [''] - # Soft scene breaks. - text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): - #For debugging - if tag == 'h1': - for i in self.links: - text.append(i) - text.append('\n') if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs, '. ')) + text.append(self.build_block(tag, style, attribs)) + text.append('. ') tags.append('\n') - if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): - if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): - if self.style_bold == False: - text.append('*') - tags.append('*') - self.style_bold = True if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: text.append('_') +# text.append('from '+tag) tags.append('_') self.style_italic = True + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + style_string = '%s;' % style + text.append(style_string) + if self.style_bold == False: + text.append('*') +# text.append('from '+tag) + tags.append('*') + self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: @@ -218,16 +231,12 @@ class TextileMLizer(OEB2HTML): text.append('-') tags.append('-') self.style_strike = True - if style['font-variant'] == 'small-caps': - if self.style_smallcap == False: - text.append('&') - tags.append('&') - self.style_smallcap = True if tag == 'br': text.append('') tags.append('\n') + self.remove_space_after_newline = True elif tag == 'blockquote': - text.append('bq. ') + text.append('\nbq. ') tags.append('\n') elif tag in ('abbr', 'acronym'): text.append('') @@ -241,8 +250,8 @@ class TextileMLizer(OEB2HTML): tags.append('~') elif tag == 'code': if self.in_pre: - text.append('bc. ') - tags.append('\n') + text.append('\nbc. ') + tags.append('') else: text.append('@') tags.append('@') @@ -254,12 +263,14 @@ class TextileMLizer(OEB2HTML): tags.append('\n') elif tag == 'pre': self.in_pre = True - text.append('pre. ') - tags.append('pre') + text.append('\npre. ') + tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: text.append ('"') - tags.append('":' + attribs['href']) + if attribs.has_key('href'): + tags.append('":' + attribs['href']) + self.our_links.append(attribs['href']) if attribs.has_key('title'): tags.append('(' + attribs['title'] + ')') elif tag == 'img': @@ -275,14 +286,15 @@ class TextileMLizer(OEB2HTML): tags.append('!') elif tag in ('ol', 'ul'): self.list.append({'name':tag, 'num':0}) - text.append('') + text.append('\n') tags.append(tag) elif tag == 'li': -# text.append('\n') if self.list: li = self.list[-1] else: li = {'name':'ul', 'num':0} + text.append('\n') if li['name'] == 'ul': text.append('*'*len(self.list)+' ') elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + tags.append('\n') elif tag == 'dl': text.append('\n') tags.append('') @@ -298,6 +310,7 @@ class TextileMLizer(OEB2HTML): elif tag == 'table': self.in_table = True text.append('') + tags.append('') tags.append('table') elif tag == 'tr': text.append('') @@ -315,18 +328,33 @@ class TextileMLizer(OEB2HTML): text.append(txt+'. ') tags.append('') elif tag == 'th': - text.append('|_. ') + text.append('|_') + + text.append('. ') tags.append('') + elif tag == 'span': + if style['font-variant'] == 'small-caps': + if self.style_smallcap == False: + text.append('&') + tags.append('&') + self.style_smallcap = True + else: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') if self.opts.keep_links and attribs.has_key('id'): - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - if tag == 'span': - text.append(' %') - tags.append('% ') - text.append('(#' + attribs['id'] + u')\xa0') - - # If wanted process all style tags here - before taxt in tags is written + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'): + text.append(self.check_id_tag(attribs)) + # Process the styles for any that we want to keep + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'): + text.append(self.check_styles(style)) + # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text @@ -351,21 +379,23 @@ class TextileMLizer(OEB2HTML): self.in_table = False if tag in ('ul', 'ol'): if self.list: self.list.pop() + if not self.list: text.append('\n') else: text.append('%s' % t) - if t == '*': - self.style_bold = False - if t == '_': - self.style_italic = False - if t == '+': - self.style_under = False - if t == '-': - self.style_strike = False - if t == '&': - self.style_smallcap = False + if t == '*': self.style_bold = False + if t == '_': self.style_italic = False + if t == '+': self.style_under = False + if t == '-': self.style_strike = False + if t == '&': self.style_smallcap = False # Soft scene breaks. text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) +# try: +# ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) +# if ems >= 1: +# text.append('\n' * ems) +# except: +# pass # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: From 05331d7f05de3ed3010a63b5c0d754452ee23782 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 30 Apr 2011 09:43:09 -0400 Subject: [PATCH 04/15] TXT: Textile changes. --- src/calibre/ebooks/txt/processor.py | 2 + src/calibre/ebooks/txt/textileml.py | 231 ++++++++++++++++------------ 2 files changed, 135 insertions(+), 98 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 7e161f63bd..54369190de 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -242,6 +242,8 @@ def detect_formatting_type(txt): textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt)) # Links textile_count += len(re.findall(r'"[^"]*":\S+', txt)) + # paragraph blocks + textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt)) # Decide if either markdown or textile is used in the text # based on the number of unique formatting elements found. diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 42b709a681..622ff8d2e3 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en' ''' Transform OEB content into Textile formatted plain text ''' - import re from functools import partial @@ -16,8 +15,6 @@ from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.txt.unsmarten import unsmarten -from operator import itemgetter - class TextileMLizer(OEB2HTML): @@ -29,17 +26,20 @@ class TextileMLizer(OEB2HTML): self.links = {} self.list = [] self.our_links = [] + self.in_a_link = False self.our_ids = [] self.images = {} + self.id_no_text = u'' + self.style_embed = [] self.remove_space_after_newline = False self.base_hrefs = [item.href for item in oeb_book.spine] self.map_resources(oeb_book) -# self.style_bold = False -# self.style_italic = False -# self.style_under = False -# self.style_strike = False -# self.style_smallcap = False + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False txt = self.mlize_spine(oeb_book) txt = unsmarten(txt) @@ -56,7 +56,7 @@ class TextileMLizer(OEB2HTML): self.rewrite_ids(item.data, item) rewrite_links(item.data, partial(self.rewrite_link, page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) output.append('\n\n') return ''.join(output) @@ -64,36 +64,47 @@ class TextileMLizer(OEB2HTML): # Needs tweaking and finetuning def check_escaping(text, tests): for t in tests: - text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) + # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged + txt = '%s' % t + self.log.debug('DEBUG: ' + txt) + if txt != '%': + text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text - # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) - - text = re.sub(r' +\n', r'\n', text) - text = re.sub(r'^\n+', r'', text) - text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) - text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text) - text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) - text = re.sub(r'\n{3}', r'\n\n', text) - text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) - text = re.sub(r'p.*\. \n\n', r'', text) - text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph - text = re.sub(r' \|', r'|', text) - # Now put back spaces removed earlier as they're needed here - text = re.sub(r'\np\.\n', r'\np. \n', text) - # Now tidyup links and ids - remove ones that don't have a correponding opposite if self.opts.keep_links: for i in self.our_links: - if i not in self.our_ids: - text = re.sub(r'"(.+)":'+i, '\1', text) + if i[0] == '#': + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i, '\1', text) for i in self.our_ids: if i not in self.our_links: text = re.sub(r'\('+i+'\)', '', text) + + # Note - I'm not checking for escaped '-' as this will also get hypenated words + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = re.sub(r'%\xa0+', r'%', text) #remove empty spans + text = re.sub(r'%%', r'', text) #remove empty spans + text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output + text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline + text = re.sub(r'^\n+', r'', text) #remove newlines at top of file + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras +# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para + text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines + text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) + text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) + text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph + text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines + # started work on trying to fix footnotes # text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text @@ -110,21 +121,15 @@ class TextileMLizer(OEB2HTML): self.remove_space_after_newline = False return text -# def remove_leading_ws(self, text): -# text = text.replace('\r\n', '\n') -# text = text.replace('\r', '\n') -# text = re.sub(r'\n[\t ]+', '\n', text) -# text = re.sub(r'\n{2,}', '\n', text) -# return text - def check_styles(self, style): txt = '{' -# style_string = '%s;' % style -# txt += style_string if style['color'] and style['color'] != 'black': txt += 'color:'+style['color']+';' -# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'): -# txt += 'font-size: %d;' % style['font-size'] + try: + if style['background']: + txt += 'background:'+style['background']+';' + except: + pass txt += '}' if txt == '{}': txt = '' return txt @@ -137,7 +142,7 @@ class TextileMLizer(OEB2HTML): return '' def check_valign(self, style): - tests = {'top':'^','bottom':'~', 'middle':'-'} + tests = {'top':'^','bottom':'~'} #, 'middle':'-'} for i in tests: if style['vertical-align'] == i: return tests[i] @@ -157,8 +162,9 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' if attribs.has_key('id'): # and attribs['id'] in self.links.values(): - txt = '(#'+attribs['id']+ ')' - self.our_ids.append('#'+attribs['id']) + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) + self.id_no_text = u'\xa0' return txt def build_block(self, tag, style, attribs): @@ -170,7 +176,7 @@ class TextileMLizer(OEB2HTML): txt += self.check_styles(style) return txt - def dump_text(self, elem, stylizer, page, tag_stack=[]): + def dump_text(self, elem, stylizer): ''' @elem: The element in the etree that we are working on. @stylizer: The style information attached to the element. @@ -197,45 +203,59 @@ class TextileMLizer(OEB2HTML): or style['visibility'] == 'hidden': return [''] + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs)) - text.append('. ') - tags.append('\n') + block = self.build_block(tag, style, attribs) + # Normal paragraph with no styling. + if block == '\np': + text.append('\n\n') + tags.append('\n') + else: + text.append(block) + text.append('. ') + tags.append('\n') + #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: text.append('_') -# text.append('from '+tag) tags.append('_') + self.style_embed.append ('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): - style_string = '%s;' % style - text.append(style_string) if self.style_bold == False: text.append('*') -# text.append('from '+tag) tags.append('*') + self.style_embed.append ('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: text.append('+') tags.append('+') + self.style_embed.append ('+') self.style_under = True if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): if self.style_strike == False: text.append('-') tags.append('-') + self.style_embed.append ('-') self.style_strike = True if tag == 'br': - text.append('') - tags.append('\n') + for i in reversed(self.style_embed): + text.append(i) + text.append('\n') + for i in self.style_embed: + text.append(i) + tags.append('') self.remove_space_after_newline = True - elif tag == 'blockquote': + if tag == 'blockquote': text.append('\nbq. ') tags.append('\n') elif tag in ('abbr', 'acronym'): @@ -259,7 +279,7 @@ class TextileMLizer(OEB2HTML): text.append('??') tags.append('??') elif tag == 'hr': - text.append('\n***\n') + text.append('\n***') tags.append('\n') elif tag == 'pre': self.in_pre = True @@ -267,12 +287,14 @@ class TextileMLizer(OEB2HTML): tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: - text.append ('"') + text.append('"') + tags.append('a') if attribs.has_key('href'): tags.append('":' + attribs['href']) self.our_links.append(attribs['href']) if attribs.has_key('title'): tags.append('(' + attribs['title'] + ')') + self.in_a_link = True elif tag == 'img': if self.opts.keep_image_references: txt = '!' + self.check_halign(style) @@ -286,7 +308,7 @@ class TextileMLizer(OEB2HTML): tags.append('!') elif tag in ('ol', 'ul'): self.list.append({'name':tag, 'num':0}) - text.append('\n') + text.append('') tags.append(tag) elif tag == 'li': if self.list: li = self.list[-1] @@ -294,7 +316,7 @@ class TextileMLizer(OEB2HTML): text.append('\n') if li['name'] == 'ul': text.append('*'*len(self.list)+' ') elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') - tags.append('\n') + tags.append('') elif tag == 'dl': text.append('\n') tags.append('') @@ -308,12 +330,19 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append('\n') elif tag == 'table': - self.in_table = True - text.append('') + txt = self.build_block(tag, style, attribs) + txt += '. \n' + if txt != '\ntable. \n': + text.append(txt) + else: + text.append('\n') tags.append('') - tags.append('table') elif tag == 'tr': - text.append('') + txt = self.build_block('', style, attribs) + txt += '. ' + if txt != '\n. ': + txt = re.sub ('\n','',txt) + text.append(txt) tags.append('|\n') elif tag == 'td': text.append('|') @@ -324,13 +353,15 @@ class TextileMLizer(OEB2HTML): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): txt += '/' + attribs['rowspan'] + try: + txt += self.check_styles(style) + except: + pass if txt != '': text.append(txt+'. ') tags.append('') elif tag == 'th': - text.append('|_') - - text.append('. ') + text.append('|_. ') tags.append('') elif tag == 'span': if style['font-variant'] == 'small-caps': @@ -339,35 +370,36 @@ class TextileMLizer(OEB2HTML): tags.append('&') self.style_smallcap = True else: - txt = '%' - if self.opts.keep_links: - txt += self.check_id_tag(attribs) - txt += self.check_styles(style) - if txt != '%': - text.append(txt) - tags.append('%') + if self.in_a_link == False: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') if self.opts.keep_links and attribs.has_key('id'): - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'): text.append(self.check_id_tag(attribs)) # Process the styles for any that we want to keep - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'): - text.append(self.check_styles(style)) + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \ + 'span', 'table', 'tr', 'td'): + if not self.in_a_link: + text.append(self.check_styles(style)) # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text if not self.in_pre: - if self.in_table: - txt = self.remove_newlines(txt) - else: - txt = self.remove_leading_ws(txt) + txt = self.remove_newlines(txt) text.append(txt) + self.id_no_text = u'' # Recurse down into tags within the tag we are in. for item in elem: - text += self.dump_text(item, stylizer, page, tag_stack+tags) + text += self.dump_text(item, stylizer) # Close all open tags. tags.reverse() @@ -375,36 +407,39 @@ class TextileMLizer(OEB2HTML): if tag in ('pre', 'ul', 'ol', 'li', 'table'): if tag == 'pre': self.in_pre = False - if tag == 'table': - self.in_table = False - if tag in ('ul', 'ol'): + elif tag in ('ul', 'ol'): if self.list: self.list.pop() if not self.list: text.append('\n') else: - text.append('%s' % t) - if t == '*': self.style_bold = False - if t == '_': self.style_italic = False - if t == '+': self.style_under = False - if t == '-': self.style_strike = False - if t == '&': self.style_smallcap = False + if t == 'a': + self.in_a_link = False + t = '' + text.append(self.id_no_text) + self.id_no_text = u'' + if t == '*': + self.style_bold = False + elif t == '_': + self.style_italic = False + elif t == '+': + self.style_under = False + elif t == '-': + self.style_strike = False + elif t == '&': + self.style_smallcap = False + if t in ('*', '_', '+', '-'): + txt = self.style_embed.pop() + text.append(txt) + else: + text.append('%s' % t) # Soft scene breaks. text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) -# try: -# ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) -# if ems >= 1: -# text.append('\n' * ems) -# except: -# pass # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: tail = elem.tail if not self.in_pre: - if self.in_table: - tail = self.remove_newlines(tail) - else: - tail = self.remove_leading_ws(tail) + tail = self.remove_newlines(tail) text.append(tail) return text From 8853f6c1468bebd72e360517c4117a3764f9edfe Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 1 May 2011 10:24:56 -0400 Subject: [PATCH 05/15] ... --- src/calibre/ebooks/txt/output.py | 14 ++++++++++++-- src/calibre/ebooks/txt/textileml.py | 25 ++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 261ace2f91..d9c42eb1dc 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -66,6 +66,13 @@ class TXTOutput(OutputFormatPlugin): help=_('Do not remove image references within the document. This is only ' \ 'useful when paired with a txt-output-formatting option that ' 'is not none because links are always removed with plain text output.')), + OptionRecommendation(name='keep_color', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not remove font color from output. This is only useful when ' \ + 'txt-output-formatting is set to textile. Textile is the only ' \ + 'formatting that supports setting font color. If this option is ' \ + 'not specified font color will not be set and default to the ' \ + 'color displayed by the reader (generally this is black).')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): @@ -111,9 +118,12 @@ class TXTZOutput(TXTOutput): from calibre.ebooks.oeb.base import OEB_IMAGES with TemporaryDirectory('_txtz_output') as tdir: # TXT - with TemporaryFile('index.txt') as tf: + txt_name = 'index.txt' + if opts.txt_output_formatting.lower() == 'textile': + txt_name = 'index.text' + with TemporaryFile(txt_name) as tf: TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log) - shutil.copy(tf, os.path.join(tdir, 'index.txt')) + shutil.copy(tf, os.path.join(tdir, txt_name)) # Images for item in oeb_book.manifest: diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 622ff8d2e3..1c35670596 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -98,7 +98,7 @@ class TextileMLizer(OEB2HTML): text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) - text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here @@ -176,6 +176,11 @@ class TextileMLizer(OEB2HTML): txt += self.check_styles(style) return txt + def prepare_string_for_textile(self, txt): + if re.search(r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)', txt): + return ' ==%s== ' % txt + return txt + def dump_text(self, elem, stylizer): ''' @elem: The element in the etree that we are working on. @@ -197,7 +202,7 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': @@ -209,15 +214,9 @@ class TextileMLizer(OEB2HTML): if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - block = self.build_block(tag, style, attribs) - # Normal paragraph with no styling. - if block == '\np': - text.append('\n\n') - tags.append('\n') - else: - text.append(block) - text.append('. ') - tags.append('\n') + text.append(self.build_block(tag, style, attribs)) + text.append('. ') + tags.append('\n') #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): @@ -393,7 +392,7 @@ class TextileMLizer(OEB2HTML): if hasattr(elem, 'text') and elem.text: txt = elem.text if not self.in_pre: - txt = self.remove_newlines(txt) + txt = self.prepare_string_for_textile(self.remove_newlines(txt)) text.append(txt) self.id_no_text = u'' @@ -439,7 +438,7 @@ class TextileMLizer(OEB2HTML): if hasattr(elem, 'tail') and elem.tail: tail = elem.tail if not self.in_pre: - tail = self.remove_newlines(tail) + tail = self.prepare_string_for_textile(self.remove_newlines(tail)) text.append(tail) return text From 803c0449b9b2d7e479658e03f555c215eacad026 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 8 May 2011 16:01:36 -0400 Subject: [PATCH 06/15] ... --- src/calibre/customize/profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 5c29f1e79b..de82aaffa1 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -253,7 +253,7 @@ class OutputProfile(Plugin): periodical_date_in_title = True #: Characters used in jackets and catalogs - missing_char = u'x' + missing_char = u'x' ratings_char = u'*' empty_ratings_char = u' ' read_char = u'+' From 5ac915b416c49189606311a2524d59d5a3f8feeb Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 8 May 2011 16:39:45 -0400 Subject: [PATCH 07/15] Leigh's latest changes. --- src/calibre/ebooks/txt/textileml.py | 106 ++++++++++++++++------------ 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 1c35670596..2f04c4676b 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -14,6 +14,7 @@ from functools import partial from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks import unit_convert from calibre.ebooks.txt.unsmarten import unsmarten class TextileMLizer(OEB2HTML): @@ -55,20 +56,19 @@ class TextileMLizer(OEB2HTML): self.log.debug('Converting %s to Textile formatted TXT...' % item.href) self.rewrite_ids(item.data, item) rewrite_links(item.data, partial(self.rewrite_link, page=item)) - stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output.append('\n\n') return ''.join(output) def tidy_up(self, text): - # Needs tweaking and finetuning + # May need tweaking and finetuning def check_escaping(text, tests): for t in tests: # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged txt = '%s' % t - self.log.debug('DEBUG: ' + txt) if txt != '%': - text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text @@ -87,26 +87,26 @@ class TextileMLizer(OEB2HTML): text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) text = re.sub(r'%\xa0+', r'%', text) #remove empty spans - text = re.sub(r'%%', r'', text) #remove empty spans + text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline text = re.sub(r'^\n+', r'', text) #remove newlines at top of file text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras -# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) - text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph + text = re.sub(u'\np[<>=]{0,2}\. \xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. \n', text) text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - - # started work on trying to fix footnotes -# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) + return text def remove_newlines(self, text): @@ -123,13 +123,11 @@ class TextileMLizer(OEB2HTML): def check_styles(self, style): txt = '{' - if style['color'] and style['color'] != 'black': - txt += 'color:'+style['color']+';' - try: - if style['background']: + if self.opts.keep_color: + if 'color' in style.cssdict() and style['color'] != 'black': + txt += 'color:'+style['color']+';' + if 'background' in style.cssdict(): txt += 'background:'+style['background']+';' - except: - pass txt += '}' if txt == '{}': txt = '' return txt @@ -148,30 +146,44 @@ class TextileMLizer(OEB2HTML): return tests[i] return '' - def check_padding(self, style, tests): + def check_padding(self, style, stylizer): txt = '' - for i in tests: - try: - ems = int(round(float(style[i[0]] / style['font-size']))) - if ems >=1: - txt += i[1] * ems - except: - pass + left_padding_pts = 0 + left_margin_pts = 0 + if 'padding-left' in style.cssdict() and style['padding-left'] != 'auto': + left_padding_pts = unit_convert(style['padding-left'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-left' in style.cssdict() and style['margin-left'] != 'auto': + left_margin_pts = unit_convert(style['margin-left'], style.width, style.fontSize, stylizer.profile.dpi) + left = left_margin_pts + left_padding_pts + emleft = int(round(left / stylizer.profile.fbase)) + if emleft >= 1: + txt += '(' * emleft + right_padding_pts = 0 + right_margin_pts = 0 + if 'padding-right' in style.cssdict() and style['padding-right'] != 'auto': + right_padding_pts = unit_convert(style['padding-right'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-right' in style.cssdict() and style['margin-right'] != 'auto': + right_margin_pts = unit_convert(style['margin-right'], style.width, style.fontSize, stylizer.profile.dpi) + right = right_margin_pts + right_padding_pts + emright = int(round(right / stylizer.profile.fbase)) + if emright >= 1: + txt += ')' * emright + return txt def check_id_tag(self, attribs): txt = '' - if attribs.has_key('id'): # and attribs['id'] in self.links.values(): + if attribs.has_key('id'): txt = '(#'+attribs['id']+ ')' self.our_ids.append('#'+attribs['id']) self.id_no_text = u'\xa0' return txt - def build_block(self, tag, style, attribs): + def build_block(self, tag, style, attribs, stylizer): txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) - txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) + txt += self.check_padding(style, stylizer) txt += self.check_halign(style) txt += self.check_styles(style) return txt @@ -202,22 +214,24 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': return [''] # Soft scene breaks. - text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) - + if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto': + ems = int(round(float(style.marginTop) / style.fontSize) - 1) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs)) + text.append(self.build_block(tag, style, attribs, stylizer)) text.append('. ') tags.append('\n') - #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): @@ -306,15 +320,17 @@ class TextileMLizer(OEB2HTML): text.append('(' + txt + ')') tags.append('!') elif tag in ('ol', 'ul'): - self.list.append({'name':tag, 'num':0}) + self.list.append({'name': tag, 'num': 0}) text.append('') tags.append(tag) elif tag == 'li': if self.list: li = self.list[-1] - else: li = {'name':'ul', 'num':0} + else: li = {'name': 'ul', 'num': 0} text.append('\n') - if li['name'] == 'ul': text.append('*'*len(self.list)+' ') - elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + if li['name'] == 'ul': + text.append('*' * len(self.list) + ' ') + elif li['name'] == 'ol': + text.append('#' * len(self.list) + ' ') tags.append('') elif tag == 'dl': text.append('\n') @@ -329,7 +345,7 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append('\n') elif tag == 'table': - txt = self.build_block(tag, style, attribs) + txt = self.build_block(tag, style, attribs, stylizer) txt += '. \n' if txt != '\ntable. \n': text.append(txt) @@ -337,10 +353,10 @@ class TextileMLizer(OEB2HTML): text.append('\n') tags.append('') elif tag == 'tr': - txt = self.build_block('', style, attribs) + txt = self.build_block('', style, attribs, stylizer) txt += '. ' if txt != '\n. ': - txt = re.sub ('\n','',txt) + txt = re.sub ('\n', '', txt) text.append(txt) tags.append('|\n') elif tag == 'td': @@ -352,12 +368,9 @@ class TextileMLizer(OEB2HTML): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): txt += '/' + attribs['rowspan'] - try: - txt += self.check_styles(style) - except: - pass + txt += self.check_styles(style) if txt != '': - text.append(txt+'. ') + text.append(txt + '. ') tags.append('') elif tag == 'th': text.append('|_. ') @@ -432,7 +445,10 @@ class TextileMLizer(OEB2HTML): text.append('%s' % t) # Soft scene breaks. - text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) + if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': + ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) + if ems >=1: + text.append(u'\n\n\xa0' * ems) # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: From d6ec680ebbbadc659a09105d66aaa60299ac1be9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 06:43:19 -0400 Subject: [PATCH 08/15] Leigh's latest changes. --- src/calibre/ebooks/textile/functions.py | 2 +- src/calibre/ebooks/txt/textileml.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index dd1914cf9f..b186e79ad4 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -792,6 +792,7 @@ class Textile(object): text = self.noTextile(text) text = self.code(text) + text = self.glyphs(text) text = self.links(text) if not self.noimage: @@ -803,7 +804,6 @@ class Textile(object): text = self.span(text) text = self.footnoteRef(text) - text = self.glyphs(text) return text.rstrip('\n') diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 2f04c4676b..082332ffd8 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -69,8 +69,8 @@ class TextileMLizer(OEB2HTML): txt = '%s' % t if txt != '%': text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) - text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) + text = re.sub(r'([a-zA-Z0-9\'"\-])('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')', r'\1[\2]', text) + text = re.sub(r'('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')([a-zA-Z0-9\'"!?\-])', r'[\1]\2', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite From c384188057639b42e5e10c142f6e1425f94d09ba Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 06:47:45 -0400 Subject: [PATCH 09/15] Leigh's latest changes. --- src/calibre/ebooks/txt/textileml.py | 58 +++++++++++++++-------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 082332ffd8..31c118251d 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -68,9 +68,8 @@ class TextileMLizer(OEB2HTML): # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged txt = '%s' % t if txt != '%': - text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'([a-zA-Z0-9\'"\-])('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')', r'\1[\2]', text) - text = re.sub(r'('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')([a-zA-Z0-9\'"!?\-])', r'[\1]\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_])\[('+t+'[a-zA-Z0-9 \',.*_]+'+t+')\](\s|[*_])', r'\1\2\3', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite @@ -84,7 +83,8 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'\('+i+'\)', '', text) # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = check_escaping(text, ['\*', '_', '\+', '-']) +# text = check_escaping(text, ['\*', '_', '\+', '-']) text = re.sub(r'%\xa0+', r'%', text) #remove empty spans text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? @@ -93,20 +93,24 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'^\n+', r'', text) #remove newlines at top of file text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras +# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines +# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{0,2}\. \xa0', r'\np. ', text) # blank paragraph - text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. \n', text) text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - + + # started work on trying to fix footnotes +# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text def remove_newlines(self, text): @@ -236,29 +240,29 @@ class TextileMLizer(OEB2HTML): if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: - text.append('_') - tags.append('_') - self.style_embed.append ('_') + text.append('[_') + tags.append('_]') + self.style_embed.append('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): if self.style_bold == False: - text.append('*') - tags.append('*') - self.style_embed.append ('*') + text.append('[*') + tags.append('*]') + self.style_embed.append('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: - text.append('+') - tags.append('+') - self.style_embed.append ('+') + text.append('[+') + tags.append('+]') + self.style_embed.append('+') self.style_under = True if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): if self.style_strike == False: - text.append('-') - tags.append('-') - self.style_embed.append ('-') + text.append('[-') + tags.append('-]') + self.style_embed.append('-') self.style_strike = True if tag == 'br': for i in reversed(self.style_embed): @@ -428,26 +432,24 @@ class TextileMLizer(OEB2HTML): t = '' text.append(self.id_no_text) self.id_no_text = u'' - if t == '*': + if t == '*]': self.style_bold = False - elif t == '_': + elif t == '_]': self.style_italic = False - elif t == '+': + elif t == '+]': self.style_under = False - elif t == '-': + elif t == '-]': self.style_strike = False elif t == '&': self.style_smallcap = False - if t in ('*', '_', '+', '-'): + if t in ('*]', '_]', '+]', '-]'): txt = self.style_embed.pop() - text.append(txt) - else: - text.append('%s' % t) + text.append('%s' % t) # Soft scene breaks. if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) - if ems >=1: + if ems >= 1: text.append(u'\n\n\xa0' * ems) # Add the text that is outside of the tag. From 842ba755575c108fc0c8ab93cac383185776f212 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 21:19:28 -0400 Subject: [PATCH 10/15] More changes. --- src/calibre/ebooks/textile/functions.py | 34 +++++++++----- src/calibre/ebooks/txt/textileml.py | 59 ++++++++++++++++--------- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index b186e79ad4..0e1811f195 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -219,14 +219,13 @@ class Textile(object): ] glyph_defaults = [ (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign - (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime - (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime + (re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
'), #
scene-break -# (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'([^-])--([^-])'), r'\1—\2'), # em dash + (re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -706,6 +705,21 @@ class Textile(object): result.append(line) return ''.join(result) + def glyphs_only(self, text): + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + for s, r in rules: + line = s.sub(r, line) + result.append(line) + return ''.join(result) + def vAlign(self, input): d = {'^':'top', '-':'middle', '~':'bottom'} return d.get(input, '') @@ -792,7 +806,6 @@ class Textile(object): text = self.noTextile(text) text = self.code(text) - text = self.glyphs(text) text = self.links(text) if not self.noimage: @@ -804,6 +817,7 @@ class Textile(object): text = self.span(text) text = self.footnoteRef(text) + text = self.glyphs(text) return text.rstrip('\n') @@ -814,6 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ + text = self.glyphs_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' @@ -868,7 +883,7 @@ class Textile(object): >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye") 'hello span strong and bold goodbye' """ - qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^', r'&') + qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') pnct = ".,\"'?!;:" for qtag in qtags: @@ -900,9 +915,7 @@ class Textile(object): '%' : 'span', '+' : 'ins', '~' : 'sub', - '^' : 'sup', - '&' : 'span style="font-variant:small-caps;"' -# '&' : 'span style="font-transform:uppercase;font-size:smaller;"' + '^' : 'sup' } tag = qtags[tag] atts = self.pba(atts) @@ -1046,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): return Textile(restricted=True, lite=lite, noimage=noimage).textile(text, rel='nofollow', html_type=html_type) - diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 31c118251d..814ba01a3e 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -69,7 +69,8 @@ class TextileMLizer(OEB2HTML): txt = '%s' % t if txt != '%': text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'(\s|[*_])\[('+t+'[a-zA-Z0-9 \',.*_]+'+t+')\](\s|[*_])', r'\1\2\3', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite @@ -77,14 +78,17 @@ class TextileMLizer(OEB2HTML): for i in self.our_links: if i[0] == '#': if i not in self.our_ids: - text = re.sub(r'"(.+)":'+i, '\1', text) + self.log.debug('Link has no target - %s ...' % i) + text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - text = re.sub(r'\('+i+'\)', '', text) + self.log.debug('ID has no link - %s ...' % i) + text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) - # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\*', '_', '\+', '-']) -# text = check_escaping(text, ['\*', '_', '\+', '-']) + # Remove obvious non-needed escaping, add sub/sup-script ones + text = check_escaping(text, ['\*', '_', '\*']) + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed text = re.sub(r'%\xa0+', r'%', text) #remove empty spans text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? @@ -96,13 +100,14 @@ class TextileMLizer(OEB2HTML): # text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines # text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) - text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) + text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) + text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) - text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) + text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here @@ -193,7 +198,8 @@ class TextileMLizer(OEB2HTML): return txt def prepare_string_for_textile(self, txt): - if re.search(r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)', txt): +# if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt): + if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): return ' ==%s== ' % txt return txt @@ -240,15 +246,23 @@ class TextileMLizer(OEB2HTML): if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: - text.append('[_') - tags.append('_]') + if self.in_a_link: + text.append('_') + tags.append('_') + else: + text.append('[_') + tags.append('_]') self.style_embed.append('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): if self.style_bold == False: - text.append('[*') - tags.append('*]') + if self.in_a_link: + text.append('*') + tags.append('*') + else: + text.append('[*') + tags.append('*]') self.style_embed.append('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): @@ -304,14 +318,17 @@ class TextileMLizer(OEB2HTML): tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: - text.append('"') - tags.append('a') if attribs.has_key('href'): + text.append('"') + tags.append('a') tags.append('":' + attribs['href']) self.our_links.append(attribs['href']) - if attribs.has_key('title'): - tags.append('(' + attribs['title'] + ')') - self.in_a_link = True + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + self.in_a_link = True + else: + text.append('%') + tags.append('%') elif tag == 'img': if self.opts.keep_image_references: txt = '!' + self.check_halign(style) @@ -432,9 +449,9 @@ class TextileMLizer(OEB2HTML): t = '' text.append(self.id_no_text) self.id_no_text = u'' - if t == '*]': + if t in ('*]', '*'): self.style_bold = False - elif t == '_]': + elif t in ('_]', '_'): self.style_italic = False elif t == '+]': self.style_under = False @@ -442,7 +459,7 @@ class TextileMLizer(OEB2HTML): self.style_strike = False elif t == '&': self.style_smallcap = False - if t in ('*]', '_]', '+]', '-]'): + if t in ('*]', '_]', '+]', '-]', '*', '_'): txt = self.style_embed.pop() text.append('%s' % t) From 3ca59beaf5c825fbb14af90b0108b7792a011924 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 21:21:35 -0400 Subject: [PATCH 11/15] Add email. --- src/calibre/ebooks/textile/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 0e1811f195..8a9c6b082a 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ From b95f9949be04a4d92eeabc76629cff0361817d47 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 06:37:40 -0400 Subject: [PATCH 12/15] Rename function. --- src/calibre/ebooks/textile/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 8a9c6b082a..e088d264fc 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -705,7 +705,7 @@ class Textile(object): result.append(line) return ''.join(result) - def glyphs_only(self, text): + def macros_only(self, text): # fix: hackish text = re.sub(r'"\Z', '\" ', text) @@ -828,7 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ - text = self.glyphs_only(text) + text = self.macros_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' From 441718f76c867da749a10607f931b8b03485d331 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 18:55:19 -0400 Subject: [PATCH 13/15] TXT: small Textile changes. Remove old textile conversion code. --- src/calibre/ebooks/txt/textileml.py | 58 ++++---- src/calibre/utils/html2textile.py | 209 ---------------------------- 2 files changed, 34 insertions(+), 233 deletions(-) delete mode 100644 src/calibre/utils/html2textile.py diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 814ba01a3e..17988053e8 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -78,44 +78,55 @@ class TextileMLizer(OEB2HTML): for i in self.our_links: if i[0] == '#': if i not in self.our_ids: - self.log.debug('Link has no target - %s ...' % i) text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - self.log.debug('ID has no link - %s ...' % i) text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) # Remove obvious non-needed escaping, add sub/sup-script ones text = check_escaping(text, ['\*', '_', '\*']) - text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed - text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed + # escape the super/sub-scripts if needed + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) + # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) - text = re.sub(r'%\xa0+', r'%', text) #remove empty spans - text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? - text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output - text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline - text = re.sub(r'^\n+', r'', text) #remove newlines at top of file - text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras - text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras -# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para - text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines -# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) + #remove empty spans + text = re.sub(r'%\xa0+', r'%', text) + #remove empty spans - MAY MERGE SOME ? + text = re.sub(r'%%', r'', text) + #remove spans from tagged output + text = re.sub(r'%([_+*-]+)%', r'\1', text) + #remove spaces before a newline + text = re.sub(r' +\n', r'\n', text) + #remove newlines at top of file + text = re.sub(r'^\n+', r'', text) + #correct blockcode paras + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + #correct blockquote paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) + + #reduce blank lines + text = re.sub(r'\n{3}', r'\n\n', text) text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) - text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para + #Check span following blank para + text = re.sub(r'\n+ +%', r' %', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) - text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph - text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph + # blank paragraph + text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) + # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) + # blank paragraph + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) - text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + #sort out spaces in tables + text = re.sub(r' {2,}\|', r' |', text) # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. \n', text) - text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - - # started work on trying to fix footnotes -# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) + #reduce blank lines + text = re.sub(r' \n\n\n', r' \n\n', text) + return text def remove_newlines(self, text): @@ -198,7 +209,6 @@ class TextileMLizer(OEB2HTML): return txt def prepare_string_for_textile(self, txt): -# if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt): if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): return ' ==%s== ' % txt return txt diff --git a/src/calibre/utils/html2textile.py b/src/calibre/utils/html2textile.py deleted file mode 100644 index 786e912e36..0000000000 --- a/src/calibre/utils/html2textile.py +++ /dev/null @@ -1,209 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2010, Webreactor - Marcin Lulek -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from lxml import etree -from calibre.ebooks.oeb.base import barename - -class EchoTarget: - - def __init__(self): - self.final_output = [] - self.block = False - self.ol_ident = 0 - self.ul_ident = 0 - self.list_types = [] - self.haystack = [] - - def start(self, tag, attrib): - tag = barename(tag) - - newline = '\n' - dot = '' - new_tag = '' - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): - new_tag = tag - dot = '. ' - elif tag == 'p': - new_tag = '' - dot = '' - elif tag == 'blockquote': - new_tag = 'bq' - dot = '. ' - elif tag in ('b', 'strong'): - new_tag = '*' - newline = '' - elif tag in ('em', 'i'): - new_tag = '_' - newline = '' - elif tag == 'cite': - new_tag = '??' - newline = '' - elif tag == 'del': - new_tag = '-' - newline = '' - elif tag == 'ins': - new_tag = '+' - newline = '' - elif tag == 'sup': - new_tag = '^' - newline = '' - elif tag == 'sub': - new_tag = '~' - newline = '' - elif tag == 'span': - new_tag = '' - newline = '' - elif tag == 'a': - self.block = True - if 'title' in attrib: - self.a_part = {'title':attrib.get('title'), - 'href':attrib.get('href', '')} - else: - self.a_part = {'title':None, 'href':attrib.get('href', '')} - new_tag = '' - newline = '' - - elif tag == 'img': - if 'alt' in attrib: - new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),) - else: - new_tag = ' !%s' % attrib.get('src') - newline = '' - - elif tag in ('ul', 'ol'): - new_tag = '' - newline = '' - self.list_types.append(tag) - if tag == 'ul': - self.ul_ident += 1 - else: - self.ol_ident += 1 - - elif tag == 'li': - indent = self.ul_ident + self.ol_ident - if self.list_types[-1] == 'ul': - new_tag = '*' * indent + ' ' - newline = '\n' - else: - new_tag = '#' * indent + ' ' - newline = '\n' - - - if tag not in ('ul', 'ol'): - textile = '%(newline)s%(tag)s%(dot)s' % \ - { - 'newline':newline, - 'tag':new_tag, - 'dot':dot - } - if not self.block: - self.final_output.append(textile) - else: - self.haystack.append(textile) - - def end(self, tag): - tag = barename(tag) - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - self.final_output.append('\n') - elif tag in ('b', 'strong'): - self.final_output.append('*') - elif tag in ('em', 'i'): - self.final_output.append('_') - elif tag == 'cite': - self.final_output.append('??') - elif tag == 'del': - self.final_output.append('-') - elif tag == 'ins': - self.final_output.append('+') - elif tag == 'sup': - self.final_output.append('^') - elif tag == 'sub': - self.final_output.append('~') - elif tag == 'span': - self.final_output.append('') - elif tag == 'a': - if self.a_part['title']: - textilized = ' "%s (%s)":%s ' % ( - ''.join(self.haystack), - self.a_part.get('title'), - self.a_part.get('href'), - ) - self.haystack = [] - else: - textilized = ' "%s":%s ' % ( - ''.join(self.haystack), - self.a_part.get('href'), - ) - self.haystack = [] - self.final_output.append(textilized) - self.block = False - elif tag == 'img': - self.final_output.append('!') - elif tag == 'ul': - self.ul_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - elif tag == 'ol': - self.ol_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - - def data(self, data): - #we dont want any linebreaks inside our tags - node_data = data.replace('\n','') - if not self.block: - self.final_output.append(node_data) - else: - self.haystack.append(node_data) - - def comment(self, text): - pass - - def close(self): - return "closed!" - - -def html2textile(html): - #1st pass - #clean the whitespace and convert html to xhtml - parser = etree.HTMLParser() - tree = etree.fromstring(html, parser) - xhtml = etree.tostring(tree, method="xml") - parser = etree.XMLParser(remove_blank_text=True) - root = etree.XML(xhtml, parser) - cleaned_html = etree.tostring(root) - #2nd pass build textile - target = EchoTarget() - parser = etree.XMLParser(target=target) - root = etree.fromstring(cleaned_html, parser) - textilized_text = ''.join(target.final_output).lstrip().rstrip() - return textilized_text From 5c1b683536ccb7fb221b13e35c4ae73db46cd35b Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 18:58:30 -0400 Subject: [PATCH 14/15] TXT: Add keep color GUI option. --- src/calibre/gui2/convert/txt_output.py | 2 +- src/calibre/gui2/convert/txt_output.ui | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 8427f83824..816e8d7785 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', 'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references', - 'txt_output_encoding']) + 'keep_color', 'txt_output_encoding']) self.db, self.book_id = db, book_id for x in get_option('newline').option.choices: self.opt_newline.addItem(x) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 1ef9e6e6b9..36ffabb07e 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -122,6 +122,13 @@ + + + + Do not remove font color before processing + + + From dc0834e8bcfdd49a84ab68cafec51d0433ab7988 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 12 May 2011 18:06:17 -0400 Subject: [PATCH 15/15] TXT: Textileml tweaks. --- src/calibre/ebooks/txt/textileml.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 17988053e8..36dc9952d2 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -106,17 +106,17 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #reduce blank lines - text = re.sub(r'\n{3}', r'\n\n', text) + text = re.sub(r'\n{3}', r'\n\np. \n\n', text) text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) #Check span following blank para text = re.sub(r'\n+ +%', r' %', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) # blank paragraph - text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) + text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text) # blank paragraph - text = re.sub(u'\n\xa0', r'\np. ', text) + text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) #sort out spaces in tables