Add new but still wip textile output generator.

This commit is contained in:
John Schember 2011-04-16 11:55:44 -04:00
parent 9b581963ed
commit 804b248d46
3 changed files with 432 additions and 39 deletions

View File

@ -70,16 +70,17 @@ class TXTOutput(OutputFormatPlugin):
])
def convert(self, oeb_book, output_path, input_plugin, opts, log):
print 'New'
if opts.txt_output_formatting.lower() == 'markdown':
from calibre.ebooks.txt.markdownml import MarkdownMLizer
writer = MarkdownMLizer(log)
self.writer = MarkdownMLizer(log)
elif opts.txt_output_formatting.lower() == 'textile':
from calibre.ebooks.txt.textileml import TextileMLizer
writer = TextileMLizer(log)
self.writer = TextileMLizer(log)
else:
writer = TXTMLizer(log)
self.writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts)
txt = self.writer.extract_content(oeb_book, opts)
txt = clean_ascii_chars(txt)
log.debug('\tReplacing newlines with selected type...')
@ -118,10 +119,18 @@ class TXTZOutput(TXTOutput):
# Images
for item in oeb_book.manifest:
if item.media_type in OEB_IMAGES:
path = os.path.join(tdir, os.path.dirname(item.href))
if hasattr(self.writer, 'images'):
path = os.path.join(tdir, 'images')
if item.href in self.writer.images:
href = self.writer.images[item.href]
else:
continue
else:
path = os.path.join(tdir, os.path.dirname(item.href))
href = os.path.basename(item.href)
if not os.path.exists(path):
os.makedirs(path)
with open(os.path.join(tdir, item.href), 'wb') as imgf:
with open(os.path.join(path, href), 'wb') as imgf:
imgf.write(item.data)
# Metadata

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
__docformat__ = 'restructuredtext en'
'''
@ -10,53 +10,328 @@ Transform OEB content into Textile formatted plain text
import re
from lxml import etree
from functools import partial
from calibre.ebooks.oeb.base import XHTML
from calibre.utils.html2textile import html2textile
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.txt.unsmarten import unsmarten
from operator import itemgetter
class TextileMLizer(object):
def __init__(self, log):
self.log = log
class TextileMLizer(OEB2HTML):
def extract_content(self, oeb_book, opts):
self.log.info('Converting XHTML to Textile formatted TXT...')
self.oeb_book = oeb_book
self.opts = opts
self.in_pre = False
self.in_table = False
self.links = {}
self.list = []
self.images = {}
self.base_hrefs = [item.href for item in oeb_book.spine]
self.map_resources(oeb_book)
return self.mlize_spine()
self.style_bold = False
self.style_italic = False
self.style_under = False
self.style_strike = False
self.style_smallcap = False
def mlize_spine(self):
txt = self.mlize_spine(oeb_book)
txt = unsmarten(txt)
# Do some tidying up
txt = self.tidy_up(txt)
return txt
def mlize_spine(self, oeb_book):
output = [u'']
for item in self.oeb_book.spine:
for item in oeb_book.spine:
self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
output.append('\n\n')
return ''.join(output)
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
def tidy_up(self, text):
def check_count(text, tests):
x = []
for i, t in enumerate(reversed(tests)):
x.append((text.count(t), i, t))
if x:
return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2]
return ''
if not self.opts.keep_links:
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html)
# NEEDS TWEAKING
# def check_escaping(text, tests):
# for t in tests:
# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text)
# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text)
# return text
text = html2textile(html)
txt = check_count(text, ['\np<. ', '\np<>. ', '\np. '])
text = re.sub(txt+'(\S)', r'\n\1', text)
# Ensure the section ends with at least two new line characters.
# This is to prevent the last paragraph from a section being
# combined into the first paragraph of the next.
end_chars = text[-4:]
# Convert all newlines to \n
end_chars = end_chars.replace('\r\n', '\n')
end_chars = end_chars.replace('\r', '\n')
end_chars = end_chars[-2:]
if not end_chars[1] == '\n':
text += '\n\n'
if end_chars[1] == '\n' and not end_chars[0] == '\n':
text += '\n'
# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-'])
output += text
text = re.sub('\npre\. bc\.', '\nbc.', text)
text = re.sub('\np=. p. ', '\np. ', text)
text = re.sub('\np=. \n', '\n', text)
text = re.sub('\n{3,}', '\n\n', text)
text = re.sub(' \|', '|', text)
output = u''.join(output)
# started work on trying to fix footnotes
# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text)
return text
return output
def remove_newlines(self, text):
    '''
    Flatten text onto a single line.

    Every line break (CR LF, LF or CR) becomes one space, runs of two or
    more spaces are then squeezed to a single space, and finally all tab
    characters are dropped.
    '''
    # CR LF has to be handled first so it turns into one space, not two.
    for line_break in ('\r\n', '\n', '\r'):
        text = text.replace(line_break, ' ')
    # Condense redundant spaces created by replacing newlines with spaces.
    squeezed = re.sub(r'[ ]{2,}', ' ', text)
    return re.sub(r'\t+', '', squeezed)
def remove_leading_ws(self, text):
    '''
    Normalise line breaks and strip indentation from continuation lines.

    CR LF and CR are converted to LF, consecutive line breaks are collapsed
    into one, and tabs/spaces that directly follow a line break are removed.
    Whitespace at the very start of the text (not preceded by a line break)
    is left alone.
    '''
    unix_breaks = text.replace('\r\n', '\n').replace('\r', '\n')
    single_breaks = re.sub(r'\n+', '\n', unix_breaks)
    return re.sub(r'\n[\t ]+', '\n', single_breaks)
def check_align(self, style, align, tests):
    '''
    Translate the value of one style property into a Textile marker.

    @style: Mapping-like style object, indexed by property name.
    @align: Name of the property to look up, e.g. 'text-align'.
    @tests: Sequence of [property value, marker] pairs.

    Returns the marker of the first pair whose value equals style[align],
    or an empty string when no pair matches.
    '''
    return next((pair[1] for pair in tests if style[align] == pair[0]), '')
def check_padding(self, style, tests):
    '''
    Build the Textile padding markers for a block.

    @style: Mapping-like style object, indexed by property name. It is
            also asked for 'font-size', which is used as the size of one em.
    @tests: Sequence of [property name, marker] pairs, e.g.
            ['padding-left', '('].

    For every pair the property value is divided by the font size and
    rounded to whole ems; the marker is repeated once per em. Returns the
    concatenated markers, or an empty string when nothing applies.
    '''
    txt = ''
    for i in tests:
        try:
            ems = int(round(float(style[i[0]] / style['font-size'])))
        except Exception:
            # Best effort: a missing property, a non-numeric value or a zero
            # font size simply means "no padding marker". Unlike a bare
            # except this does not swallow KeyboardInterrupt or SystemExit.
            continue
        if ems >= 1:
            txt += i[1] * ems
    return txt
def check_id_tag(self, attribs):
    '''
    Return the Textile id modifier for an element.

    @attribs: Mapping of the element's attributes.

    Returns '(#<id>)' when an 'id' attribute is present, otherwise an
    empty string.
    '''
    # "in" works on every mapping type; has_key() is deprecated and does not
    # exist on plain dicts in newer Python versions.
    if 'id' in attribs:
        return '(#' + attribs['id'] + ')'
    return ''
def build_block(self, tag, style, attribs, finish):
    '''
    Assemble the Textile signature that opens a block element.

    @tag: Block name the signature starts with, e.g. 'p' or 'h1'.
    @style: Style object of the element; queried for padding and
            text alignment.
    @attribs: The element's attributes; only consulted for an id, and
              only when links are kept.
    @finish: Text that terminates the signature, e.g. '. '.

    The parts are emitted in this order: tag, id modifier, padding
    markers, alignment marker, finish.
    '''
    pieces = [tag]
    if self.opts.keep_links:
        pieces.append(self.check_id_tag(attribs))
    padding_markers = [['padding-left','('],['padding-right',')']]
    pieces.append(self.check_padding(style, padding_markers))
    align_markers = [['left','<'],['justify','<>'],['center','='],['right','>']]
    pieces.append(self.check_align(style, 'text-align', align_markers))
    pieces.append(finish)
    return ''.join(pieces)
def dump_text(self, elem, stylizer, page, tag_stack=None):
    '''
    Convert an element, and recursively all of its children, into a list
    of Textile text fragments.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: The spine item being converted. It is only handed on to the
           recursive calls.
    @tag_stack: Closing markers collected by the enclosing elements. It is
                only extended and handed on to the recursive calls.

    Returns a list of strings that the caller is expected to join.
    '''
    # A fresh list per call instead of a shared mutable default argument.
    if tag_stack is None:
        tag_stack = []

    headings = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    # text collects the output fragments; tags collects what has to be
    # written (or undone) once the children have been processed.
    text = ['']
    style = stylizer.style(elem)
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        return ['']

    if tag in headings + ('p',):
        text.append(self.build_block(tag, style, attribs, '. '))
        tags.append('\n')

    # Inline styles. Each self.style_* flag prevents a marker from being
    # opened again while an enclosing element already has it open.
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in headings + ('th',):
            if not self.style_bold:
                text.append('*')
                tags.append('*')
                self.style_bold = True
    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in headings + ('cite',):
            if not self.style_italic:
                text.append('_')
                tags.append('_')
                self.style_italic = True
    if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
        if tag != 'a':
            if not self.style_under:
                text.append('+')
                tags.append('+')
                self.style_under = True
    if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
        if not self.style_strike:
            text.append('-')
            tags.append('-')
            self.style_strike = True
    if style['font-variant'] == 'small-caps':
        if not self.style_smallcap:
            text.append('&')
            tags.append('&')
            self.style_smallcap = True

    if tag == 'br':
        tags.append('\n')
    elif tag == 'blockquote':
        text.append('bq. ')
        tags.append('\n')
    elif tag in ('abbr', 'acronym'):
        # Without a title there is no expansion to write.
        if 'title' in attribs:
            tags.append('(' + attribs['title'] + ')')
    elif tag == 'sup':
        text.append('^')
        tags.append('^')
    elif tag == 'sub':
        text.append('~')
        tags.append('~')
    elif tag == 'code':
        if self.in_pre:
            text.append('bc. ')
            tags.append('\n')
        else:
            text.append('@')
            tags.append('@')
    elif tag == 'cite':
        text.append('??')
        tags.append('??')
    elif tag == 'hr':
        text.append('\n***\n')
        tags.append('\n')
    elif tag == 'pre':
        self.in_pre = True
        text.append('pre. ')
        # Sentinel: never written out, resets in_pre when closing.
        tags.append('pre')
    elif tag == 'a':
        # An anchor without href is only a link target; there is nothing
        # to link to, so no link markup is written for it.
        if self.opts.keep_links and 'href' in attribs:
            text.append('"')
            tags.append('":' + attribs['href'])
            # tags is written in reverse, so the title ends up in front
            # of the closing quote: "text(title)":href
            if 'title' in attribs:
                tags.append('(' + attribs['title'] + ')')
    elif tag == 'img':
        if self.opts.keep_image_references and 'src' in attribs:
            text.append('!' + attribs['src'])
            if 'alt' in attribs:
                txt = attribs['alt']
                if txt != '':
                    text.append('(' + txt + ')')
            tags.append('!')
    elif tag in ('ol', 'ul'):
        self.list.append({'name': tag, 'num': 0})
        # Sentinel: never written out, pops the list stack when closing.
        tags.append(tag)
    elif tag == 'li':
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        # An item outside of any list is still written as a first level
        # bullet instead of getting no marker at all.
        depth = len(self.list) or 1
        if li['name'] == 'ul':
            text.append('*' * depth + ' ')
        elif li['name'] == 'ol':
            text.append('#' * depth + ' ')
    elif tag == 'dl':
        text.append('\n')
        tags.append('')
    elif tag == 'dt':
        tags.append('\n')
    elif tag == 'dd':
        text.append('    ')
        tags.append('')
    elif tag == 'table':
        self.in_table = True
        # Sentinel: never written out, resets in_table when closing.
        tags.append('table')
    elif tag == 'tr':
        tags.append('|\n')
    elif tag == 'td':
        text.append('|')
        txt = ''
        txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']])
        txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']])
        if 'colspan' in attribs:
            txt += '\\' + attribs['colspan']
        if 'rowspan' in attribs:
            txt += '/' + attribs['rowspan']
        if txt != '':
            text.append(txt + '. ')
        tags.append('')
    elif tag == 'th':
        text.append('|_. ')
        tags.append('')

    # Headings and paragraphs get their id from build_block; body and div
    # produce no output of their own.
    if self.opts.keep_links and 'id' in attribs:
        if tag not in ('body', 'div') + headings + ('p',):
            text.append('(#' + attribs['id'] + ')')

    # If wanted process all style tags here - before text in tags is written

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if not self.in_pre:
            if self.in_table:
                txt = self.remove_newlines(txt)
            else:
                txt = self.remove_leading_ws(txt)
        text.append(txt)

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close all open tags.
    # The decision is made per entry: sentinels only restore state, every
    # other entry is written out. This way inline style markers opened on
    # a pre, list or table element are closed and their flags reset, and
    # the list stack is popped exactly once per list.
    tags.reverse()
    for t in tags:
        if t == 'pre':
            self.in_pre = False
        elif t == 'table':
            self.in_table = False
        elif t in ('ul', 'ol'):
            if self.list:
                self.list.pop()
        else:
            text.append('%s' % t)
            if t == '*':
                self.style_bold = False
            if t == '_':
                self.style_italic = False
            if t == '+':
                self.style_under = False
            if t == '-':
                self.style_strike = False
            if t == '&':
                self.style_smallcap = False

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if not self.in_pre:
            if self.in_table:
                tail = self.remove_newlines(tail)
            else:
                tail = self.remove_leading_ws(tail)
        text.append(tail)

    return text

View File

@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
'''
'''
__version__ = '0.1'
__author__ = 'Leigh Parry'
import re
# (pattern, replacement) pairs, applied one after the other in this order.
# Each pattern matches the numeric entity, the named entity (where one is
# listed) and the literal character. Characters that are easily lost or
# mangled in transit are written as escapes so that no alternative can end up
# empty - an empty alternative matches between every character of the text.
_UNSMARTEN_RULES = tuple((re.compile(_pattern), _replacement) for _pattern, _replacement in (
    (u'&#8211;|&ndash;|\u2013', r'-'),       # en-dash
    (u'&#8212;|&mdash;|—', r'--'),           # em-dash
    (u'&#8230;|&hellip;|…', r'...'),         # ellipsis

    (u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"'),  # double quote
    # A right single quote directly after a quote character or whitespace
    # is an apostrophe that starts a word. Must run before the single quote
    # rule, which would otherwise consume it.
    (u'(["\'\u2018\u201c]|\\s)\u2019', r"\1{'/}"),  # apostrophe
    (u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|\u2018|\u2019|\u2032', r"'"),  # single quote

    (u'&#162;|&cent;|¢', r'{c\}'),           # cent
    (u'&#163;|&pound;|£', r'{L-}'),          # pound
    (u'&#165;|&yen;|¥', r'{Y=}'),            # yen
    (u'&#169;|&copy;|©', r'{(c)}'),          # copyright
    (u'&#174;|&reg;|®', r'{(r)}'),           # registered
    (u'&#188;|&frac14;|¼', r'{1/4}'),        # quarter
    (u'&#189;|&frac12;|½', r'{1/2}'),        # half
    (u'&#190;|&frac34;|¾', r'{3/4}'),        # three-quarter
    (u'&#192;|&Agrave;|À', r'{A`}'),         # A-grave
    (u'&#193;|&Aacute;|Á', r"{A'}"),         # A-acute
    (u'&#194;|&Acirc;|Â', r'{A^}'),          # A-circumflex
    (u'&#195;|&Atilde;|Ã', r'{A~}'),         # A-tilde
    (u'&#196;|&Auml;|Ä', r'{A"}'),           # A-umlaut
    (u'&#197;|&Aring;|Å', r'{Ao}'),          # A-ring
    (u'&#198;|&AElig;|Æ', r'{AE}'),          # AE
    (u'&#199;|&Ccedil;|Ç', r'{C,}'),         # C-cedilla
    (u'&#200;|&Egrave;|È', r'{E`}'),         # E-grave
    (u'&#201;|&Eacute;|É', r"{E'}"),         # E-acute
    (u'&#202;|&Ecirc;|Ê', r'{E^}'),          # E-circumflex
    (u'&#203;|&Euml;|Ë', r'{E"}'),           # E-umlaut
    (u'&#204;|&Igrave;|Ì', r'{I`}'),         # I-grave
    (u'&#205;|&Iacute;|Í', r"{I'}"),         # I-acute
    (u'&#206;|&Icirc;|Î', r'{I^}'),          # I-circumflex
    (u'&#207;|&Iuml;|Ï', r'{I"}'),           # I-umlaut
    (u'&#208;|&ETH;|Ð', r'{D-}'),            # ETH
    (u'&#209;|&Ntilde;|Ñ', r'{N~}'),         # N-tilde
    (u'&#210;|&Ograve;|Ò', r'{O`}'),         # O-grave
    (u'&#211;|&Oacute;|Ó', r"{O'}"),         # O-acute
    (u'&#212;|&Ocirc;|Ô', r'{O^}'),          # O-circumflex
    (u'&#213;|&Otilde;|Õ', r'{O~}'),         # O-tilde
    (u'&#214;|&Ouml;|Ö', r'{O"}'),           # O-umlaut
    (u'&#215;|&times;|×', r'{x}'),           # dimension
    (u'&#216;|&Oslash;|Ø', r'{O/}'),         # O-slash
    (u'&#217;|&Ugrave;|Ù', r"{U`}"),         # U-grave
    (u'&#218;|&Uacute;|Ú', r"{U'}"),         # U-acute
    (u'&#219;|&Ucirc;|Û', r'{U^}'),          # U-circumflex
    (u'&#220;|&Uuml;|Ü', r'{U"}'),           # U-umlaut
    (u'&#221;|&Yacute;|Ý', r"{Y'}"),         # Y-acute
    (u'&#223;|&szlig;|ß', r'{sz}'),          # sharp-s
    (u'&#224;|&agrave;|à', r'{a`}'),         # a-grave
    (u'&#225;|&aacute;|á', r"{a'}"),         # a-acute
    (u'&#226;|&acirc;|â', r'{a^}'),          # a-circumflex
    (u'&#227;|&atilde;|ã', r'{a~}'),         # a-tilde
    (u'&#228;|&auml;|ä', r'{a"}'),           # a-umlaut
    (u'&#229;|&aring;|å', r'{ao}'),          # a-ring
    (u'&#230;|&aelig;|æ', r'{ae}'),          # ae
    (u'&#231;|&ccedil;|ç', r'{c,}'),         # c-cedilla
    (u'&#232;|&egrave;|è', r'{e`}'),         # e-grave
    (u'&#233;|&eacute;|é', r"{e'}"),         # e-acute
    (u'&#234;|&ecirc;|ê', r'{e^}'),          # e-circumflex
    (u'&#235;|&euml;|ë', r'{e"}'),           # e-umlaut
    (u'&#236;|&igrave;|ì', r'{i`}'),         # i-grave
    (u'&#237;|&iacute;|í', r"{i'}"),         # i-acute
    (u'&#238;|&icirc;|î', r'{i^}'),          # i-circumflex
    (u'&#239;|&iuml;|ï', r'{i"}'),           # i-umlaut
    (u'&#240;|&eth;|ð', r'{d-}'),            # eth
    (u'&#241;|&ntilde;|ñ', r'{n~}'),         # n-tilde
    (u'&#242;|&ograve;|ò', r'{o`}'),         # o-grave
    (u'&#243;|&oacute;|ó', r"{o'}"),         # o-acute
    (u'&#244;|&ocirc;|ô', r'{o^}'),          # o-circumflex
    (u'&#245;|&otilde;|õ', r'{o~}'),         # o-tilde
    (u'&#246;|&ouml;|ö', r'{o"}'),           # o-umlaut
    (u'&#248;|&oslash;|ø', r'{o/}'),         # o-stroke
    (u'&#249;|&ugrave;|ù', r'{u`}'),         # u-grave
    (u'&#250;|&uacute;|ú', r"{u'}"),         # u-acute
    (u'&#251;|&ucirc;|û', r'{u^}'),          # u-circumflex
    (u'&#252;|&uuml;|ü', r'{u"}'),           # u-umlaut
    (u'&#253;|&yacute;|ý', r"{y'}"),         # y-acute
    (u'&#255;|&yuml;|ÿ', r'{y"}'),           # y-umlaut
    (u'&#338;|&OElig;|Œ', r'{OE}'),          # OE
    (u'&#339;|&oelig;|œ', r'{oe}'),          # oe
    # NOTE(review): 348/349 are S/s with circumflex, while the entity name
    # listed is Scaron/scaron - kept as found, confirm which one is meant.
    (u'&#348;|&Scaron;|Ŝ', r'{S^}'),         # Scaron
    (u'&#349;|&scaron;|ŝ', r'{s^}'),         # scaron
    (u'&#8226;|&bull;|•', r'{*}'),           # bullet
    (u'&#8355;|₣', r'{Fr}'),                 # Franc
    (u'&#8356;|₤', r'{L=}'),                 # Lira
    (u'&#8360;|₨', r'{Rs}'),                 # Rupee
    (u'&#8364;|&euro;|€', r'{C=}'),          # euro
    (u'&#8482;|&trade;|™', r'{tm}'),         # trademark
    (u'&#9824;|&spades;|♠', r'{spade}'),     # spade
    (u'&#9827;|&clubs;|♣', r'{club}'),       # club
    (u'&#9829;|&hearts;|♥', r'{heart}'),     # heart
    (u'&#9830;|&diams;|♦', r'{diamond}'),    # diamond

    (u'\xa0', r'p. '),                       # blank paragraph
    (u'\n\n\n\n', r'\n\np. \n\n'),           # blank paragraph
    (u'\n \n', r'\n<br />\n'),               # blank paragraph - br tag
))


def unsmarten(txt):
    '''
    Replace typographic characters and HTML entities with plain text.

    Dashes, ellipses and curly quotes become their plain ASCII forms. Other
    symbols and accented letters are written as curly-brace codes, for
    example {(c)} for the copyright sign or {e'} for e-acute. The last three
    rules turn a non-breaking space, four consecutive newlines and a line
    that only holds a space into blank paragraph / line break markup.

    @txt: The text to convert.

    Returns the converted text. Text that contains none of the handled
    characters or entities is returned unchanged.
    '''
    for pattern, replacement in _UNSMARTEN_RULES:
        txt = pattern.sub(replacement, txt)
    return txt