From 804b248d46c71e5169c57da794ec2f69f2998dbf Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 11:55:44 -0400 Subject: [PATCH] Add new but still wip textile output generator. --- src/calibre/ebooks/txt/output.py | 21 +- src/calibre/ebooks/txt/textileml.py | 341 +++++++++++++++++++++++++--- src/calibre/ebooks/txt/unsmarten.py | 109 +++++++++ 3 files changed, 432 insertions(+), 39 deletions(-) create mode 100644 src/calibre/ebooks/txt/unsmarten.py diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 4e54a97b45..7b50afb345 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -70,16 +70,17 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): + print 'New' if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer - writer = MarkdownMLizer(log) + self.writer = MarkdownMLizer(log) elif opts.txt_output_formatting.lower() == 'textile': from calibre.ebooks.txt.textileml import TextileMLizer - writer = TextileMLizer(log) + self.writer = TextileMLizer(log) else: - writer = TXTMLizer(log) + self.writer = TXTMLizer(log) - txt = writer.extract_content(oeb_book, opts) + txt = self.writer.extract_content(oeb_book, opts) txt = clean_ascii_chars(txt) log.debug('\tReplacing newlines with selected type...') @@ -118,10 +119,18 @@ class TXTZOutput(TXTOutput): # Images for item in oeb_book.manifest: if item.media_type in OEB_IMAGES: - path = os.path.join(tdir, os.path.dirname(item.href)) + if hasattr(self.writer, 'images'): + path = os.path.join(tdir, 'images') + if item.href in self.writer.images: + href = self.writer.images[item.href] + else: + continue + else: + path = os.path.join(tdir, os.path.dirname(item.href)) + href = os.path.basename(item.href) if not os.path.exists(path): os.makedirs(path) - with open(os.path.join(tdir, item.href), 'wb') as imgf: + with open(os.path.join(path, href), 'wb') as imgf: imgf.write(item.data) # Metadata diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index d7e11695c5..9651fa8971 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- __license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' +__copyright__ = '2011, Leigh Parry ' __docformat__ = 'restructuredtext en' ''' @@ -10,53 +10,328 @@ Transform OEB content into Textile formatted plain text import re -from lxml import etree +from functools import partial -from calibre.ebooks.oeb.base import XHTML -from calibre.utils.html2textile import html2textile +from calibre.ebooks.htmlz.oeb2html import OEB2HTML +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks.txt.unsmarten import unsmarten +from operator import itemgetter -class TextileMLizer(object): - def __init__(self, log): - self.log = log +class TextileMLizer(OEB2HTML): def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to Textile formatted TXT...') - self.oeb_book = oeb_book self.opts = opts + self.in_pre = False + self.in_table = False + self.links = {} + self.list = [] + self.images = {} + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) - return self.mlize_spine() + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False - def mlize_spine(self): + txt = self.mlize_spine(oeb_book) + txt = unsmarten(txt) + + # Do some tidying up + txt = self.tidy_up(txt) + + return txt + + def mlize_spine(self, oeb_book): output = [u''] - - for item in self.oeb_book.spine: + for item in oeb_book.spine: self.log.debug('Converting %s to Textile formatted TXT...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output.append('\n\n') + return ''.join(output) - html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + def tidy_up(self, text): + def check_count(text, tests): + x = [] + for i, t in enumerate(reversed(tests)): + x.append((text.count(t), i, t)) + if x: + return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] + return '' - if not self.opts.keep_links: - html = re.sub(r'<\s*/*\s*a[^>]*>', '', html) - if not self.opts.keep_image_references: - html = re.sub(r'<\s*img[^>]*>', '', html) + # NEEDS TWEAKING +# def check_escaping(text, tests): +# for t in tests: +# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text) +# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text) +# return text - text = html2textile(html) + txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) + text = re.sub(txt+'(\S)', r'\n\1', text) - # Ensure the section ends with at least two new line characters. - # This is to prevent the last paragraph from a section being - # combined into the fist paragraph of the next. - end_chars = text[-4:] - # Convert all newlines to \n - end_chars = end_chars.replace('\r\n', '\n') - end_chars = end_chars.replace('\r', '\n') - end_chars = end_chars[-2:] - if not end_chars[1] == '\n': - text += '\n\n' - if end_chars[1] == '\n' and not end_chars[0] == '\n': - text += '\n' +# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) - output += text + text = re.sub('\npre\. bc\.', '\nbc.', text) + text = re.sub('\np=. p. ', '\np. ', text) + text = re.sub('\np=. \n', '\n', text) + text = re.sub('\n{3,}', '\n\n', text) + text = re.sub(' \|', '|', text) - output = u''.join(output) + # started work on trying to fix footnotes +# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) + return text - return output + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. + text = re.sub(r'[ ]{2,}', ' ', text) + text = re.sub(r'\t+', '', text) + return text + + def remove_leading_ws(self, text): + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + text = re.sub(r'\n+', '\n', text) + text = re.sub(r'\n[\t ]+', '\n', text) + return text + + def check_align(self, style, align, tests): + for i in tests: + if style[align] == i[0]: + return i[1] + return '' + + def check_padding(self, style, tests): + txt = '' + for i in tests: + try: + ems = int(round(float(style[i[0]] / style['font-size']))) + if ems >=1: + txt += i[1] * ems + except: + pass + return txt + + def check_id_tag(self, attribs): + txt = '' + if attribs.has_key('id'): + txt = '(#'+attribs['id']+')' + return txt + + def build_block(self, tag, style, attribs, finish): + txt = tag + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) + txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += finish + return txt + + def dump_text(self, elem, stylizer, page, tag_stack=[]): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + # Ignore anything that is set to not be displayed. + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return [''] + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + text.append(self.build_block(tag, style, attribs, '. ')) + tags.append('\n') + + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + if self.style_bold == False: + text.append('*') + tags.append('*') + self.style_bold = True + if style['font-style'] == 'italic' or tag in ('i', 'em'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): + if self.style_italic == False: + text.append('_') + tags.append('_') + self.style_italic = True + if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): + if tag != 'a': + if self.style_under == False: + text.append('+') + tags.append('+') + self.style_under = True + if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): + if self.style_strike == False: + text.append('-') + tags.append('-') + self.style_strike = True + if style['font-variant'] == 'small-caps': + if self.style_smallcap == 0: + text.append('&') + tags.append('&') + self.style_smallcap = 1 + if tag == 'br': + text.append('') + tags.append('\n') + elif tag == 'blockquote': + text.append('bq. ') + tags.append('\n') + elif tag in ('abbr', 'acronym'): + text.append('') + txt = attribs['title'] + tags.append('(' + txt + ')') + elif tag == 'sup': + text.append('^') + tags.append('^') + elif tag == 'sub': + text.append('~') + tags.append('~') + elif tag == 'code': + if self.in_pre: + text.append('bc. ') + tags.append('\n') + else: + text.append('@') + tags.append('@') + elif tag == 'cite': + text.append('??') + tags.append('??') + elif tag == 'hr': + text.append('\n***\n') + tags.append('\n') + elif tag == 'pre': + self.in_pre = True + text.append('pre. ') + tags.append('pre') + elif tag == 'a': + if self.opts.keep_links: + text.append ('"') + tags.append('":' + attribs['href']) + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + elif tag == 'img': + if self.opts.keep_image_references: + text.append ('!' + attribs['src']) + if attribs.has_key('alt'): + txt = attribs['alt'] + if txt != '': + text.append('(' + txt + ')') + tags.append('!') + elif tag in ('ol', 'ul'): + self.list.append({'name':tag, 'num':0}) + text.append('') + tags.append(tag) + elif tag == 'li': + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + if li['name'] == 'ul': text.append('*'*len(self.list)+' ') + elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + elif tag == 'dl': + text.append('\n') + tags.append('') + elif tag == 'dt': + text.append('') + tags.append('\n') + elif tag == 'dd': + text.append(' ') + tags.append('') + elif tag == 'dd': + text.append('') + tags.append('\n') + elif tag == 'table': + self.in_table = True + text.append('') + tags.append('table') + elif tag == 'tr': + text.append('') + tags.append('|\n') + elif tag == 'td': + text.append('|') + txt = '' + txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']]) + if attribs.has_key ('colspan'): + txt += '\\' + attribs['colspan'] + if attribs.has_key ('rowspan'): + txt += '/' + attribs['rowspan'] + if txt != '': + text.append(txt+'. ') + tags.append('') + elif tag == 'th': + text.append('|_. ') + tags.append('') + + if self.opts.keep_links and attribs.has_key('id'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + text.append('(#' + attribs['id'] + ')') + + # If wanted process all style tags here - before taxt in tags is written + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + txt = elem.text + if not self.in_pre: + if self.in_table: + txt = self.remove_newlines(txt) + else: + txt = self.remove_leading_ws(txt) + text.append(txt) + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer, page, tag_stack+tags) + + # Close all open tags. + tags.reverse() + for t in tags: + if tag in ('pre', 'ul', 'ol', 'li', 'table'): + if tag == 'pre': + self.in_pre = False + if tag == 'table': + self.in_table = False + if tag in ('ul', 'ol'): + if self.list: self.list.pop() + else: + text.append('%s' % t) + if t == '*': self.style_bold = False + if t == '_': self.style_italic = False + if t == '+': self.style_under = False + if t == '-': self.style_strike = False + if t == '&': self.style_smallcap = False + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + tail = elem.tail + if not self.in_pre: + if self.in_table: + tail = self.remove_newlines(tail) + else: + tail = self.remove_leading_ws(tail) + text.append(tail) + + return text diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py new file mode 100644 index 0000000000..30a22bf069 --- /dev/null +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +''' + +''' + +__version__ = '0.1' +__author__ = 'Leigh Parry' + +import re + +def unsmarten(txt): + txt = re.sub(u'–|–|–', r'-', txt) # en-dash + txt = re.sub(u'—|—|—', r'--', txt) # em-dash + txt = re.sub(u'…|…|…', r'...', txt) # ellipsis + + txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote + txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe + txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote + + txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent + txt = re.sub(u'£|£|£', r'{L-}', txt) # pound + txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen + txt = re.sub(u'©|©|©', r'{(c)}', txt) # copyright + txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered + txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter + txt = re.sub(u'½|½|½', r'{1/2}', txt) # half + txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter + txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave + txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute + txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex + txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde + txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut + txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring + txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE + txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla + txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave + txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute + txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex + txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut + txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave + txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute + txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex + txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut + txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH + txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde + txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # O-grave + txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute + txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex + txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde + txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut + txt = re.sub(u'×|×|×', r'{x}', txt) # dimension + txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash + txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave + txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute + txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex + txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut + txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave + txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s + txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave + txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute + txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex + txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde + txt = re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut + txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring + txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae + txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla + txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave + txt = re.sub(u'é|é|é', r"{e'}", txt) # e-acute + txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex + txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut + txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave + txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute + txt = re.sub(u'î|î|î', r'{i^}', txt) # i-circumflex + txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut + txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth + txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde + txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave + txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute + txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex + txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde + txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut + txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke + txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave + txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute + txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex + txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut + txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute + txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut + txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE + txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe + txt = re.sub(u'Ŝ|Š|Ŝ', r'{S^}', txt) # Scaron + txt = re.sub(u'ŝ|š|ŝ', r'{s^}', txt) # scaron + txt = re.sub(u'•|•|•', r'{*}', txt) # bullet + txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc + txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira + txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee + txt = re.sub(u'€|€|€', r'{C=}', txt) # euro + txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark + txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade + txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club + txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart + txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond + + txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph + txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph + txt = re.sub(u'\n \n', r'\n
\n', txt) # blank paragraph - br tag + + return txt