diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 5c29f1e79b..e04930dd0c 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -253,7 +253,7 @@ class OutputProfile(Plugin): periodical_date_in_title = True #: Characters used in jackets and catalogs - missing_char = u'x' + missing_char = u'x' ratings_char = u'*' empty_ratings_char = u' ' read_char = u'+' @@ -293,38 +293,38 @@ class iPadOutput(OutputProfile): } ] - missing_char = u'\u2715\u200a' # stylized 'x' plus hair space - ratings_char = u'\u2605' # filled star - empty_ratings_char = u'\u2606' # hollow star - read_char = u'\u2713' # check mark + missing_char = u'\u2715\u200a' # stylized 'x' plus hair space + ratings_char = u'\u2605' # filled star + empty_ratings_char = u'\u2606' # hollow star + read_char = u'\u2713' # check mark touchscreen = True # touchscreen_news_css {{{ touchscreen_news_css = u''' - /* hr used in articles */ - .article_articles_list { + /* hr used in articles */ + .article_articles_list { width:18%; - } + } .article_link { - color: #593f29; + color: #593f29; font-style: italic; } .article_next { - -webkit-border-top-right-radius:4px; - -webkit-border-bottom-right-radius:4px; + -webkit-border-top-right-radius:4px; + -webkit-border-bottom-right-radius:4px; font-style: italic; width:32%; } .article_prev { - -webkit-border-top-left-radius:4px; - -webkit-border-bottom-left-radius:4px; + -webkit-border-top-left-radius:4px; + -webkit-border-bottom-left-radius:4px; font-style: italic; width:32%; } - .article_sections_list { + .article_sections_list { width:18%; - } + } .articles_link { font-weight: bold; } @@ -334,8 +334,8 @@ class iPadOutput(OutputProfile): .caption_divider { - border:#ccc 1px solid; - } + border:#ccc 1px solid; + } .touchscreen_navbar { background:#c3bab2; @@ -357,50 +357,50 @@ class iPadOutput(OutputProfile): text-align:center; } - .touchscreen_navbar td a:link { - color: #593f29; - text-decoration: none; - 
} + .touchscreen_navbar td a:link { + color: #593f29; + text-decoration: none; + } - /* Index formatting */ - .publish_date { - text-align:center; - } - .divider { - border-bottom:1em solid white; - border-top:1px solid gray; - } + /* Index formatting */ + .publish_date { + text-align:center; + } + .divider { + border-bottom:1em solid white; + border-top:1px solid gray; + } - hr.caption_divider { - border-color:black; - border-style:solid; - border-width:1px; - } + hr.caption_divider { + border-color:black; + border-style:solid; + border-width:1px; + } /* Feed summary formatting */ .article_summary { - display:inline-block; - } + display:inline-block; + } .feed { font-family:sans-serif; font-weight:bold; font-size:larger; - } + } .feed_link { font-style: italic; } .feed_next { - -webkit-border-top-right-radius:4px; - -webkit-border-bottom-right-radius:4px; + -webkit-border-top-right-radius:4px; + -webkit-border-bottom-right-radius:4px; font-style: italic; width:40%; } .feed_prev { - -webkit-border-top-left-radius:4px; - -webkit-border-bottom-left-radius:4px; + -webkit-border-top-left-radius:4px; + -webkit-border-bottom-left-radius:4px; font-style: italic; width:40%; } @@ -410,24 +410,24 @@ class iPadOutput(OutputProfile): font-size: 160%; } - .feed_up { + .feed_up { font-weight: bold; width:20%; - } + } .summary_headline { font-weight:bold; text-align:left; - } + } .summary_byline { text-align:left; font-family:monospace; - } + } .summary_text { text-align:left; - } + } ''' # }}} @@ -617,8 +617,8 @@ class KindleOutput(OutputProfile): supports_mobi_indexing = True periodical_date_in_title = False - missing_char = u'x\u2009' - empty_ratings_char = u'\u2606' + missing_char = u'x\u2009' + empty_ratings_char = u'\u2606' ratings_char = u'\u2605' read_char = u'\u2713' @@ -642,8 +642,8 @@ class KindleDXOutput(OutputProfile): #comic_screen_size = (741, 1022) supports_mobi_indexing = True periodical_date_in_title = False - missing_char = u'x\u2009' - empty_ratings_char = 
u'\u2606' + missing_char = u'x\u2009' + empty_ratings_char = u'\u2606' ratings_char = u'\u2605' read_char = u'\u2713' mobi_ems_per_blockquote = 2.0 diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 39f793face..e088d264fc 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -219,14 +219,13 @@ class Textile(object): ] glyph_defaults = [ (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign - (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime - (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime + (re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
<hr />'), # <hr>
scene-break - (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'(\s)--(\s)'), r'\1—\2'), # em dash + (re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -706,6 +705,21 @@ class Textile(object): result.append(line) return ''.join(result) + def macros_only(self, text): + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + for s, r in rules: + line = s.sub(r, line) + result.append(line) + return ''.join(result) + def vAlign(self, input): d = {'^':'top', '-':'middle', '~':'bottom'} return d.get(input, '') @@ -814,6 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ + text = self.macros_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' @@ -1044,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): return Textile(restricted=True, lite=lite, noimage=noimage).textile(text, rel='nofollow', html_type=html_type) - diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index ac63690996..d9c42eb1dc 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -66,19 +66,26 @@ class TXTOutput(OutputFormatPlugin): help=_('Do not remove image references within the document. This is only ' \ 'useful when paired with a txt-output-formatting option that ' 'is not none because links are always removed with plain text output.')), + OptionRecommendation(name='keep_color', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not remove font color from output. This is only useful when ' \ + 'txt-output-formatting is set to textile. 
Textile is the only ' \ + 'formatting that supports setting font color. If this option is ' \ + 'not specified font color will not be set and default to the ' \ + 'color displayed by the reader (generally this is black).')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer - writer = MarkdownMLizer(log) + self.writer = MarkdownMLizer(log) elif opts.txt_output_formatting.lower() == 'textile': from calibre.ebooks.txt.textileml import TextileMLizer - writer = TextileMLizer(log) + self.writer = TextileMLizer(log) else: - writer = TXTMLizer(log) + self.writer = TXTMLizer(log) - txt = writer.extract_content(oeb_book, opts) + txt = self.writer.extract_content(oeb_book, opts) txt = clean_ascii_chars(txt) log.debug('\tReplacing newlines with selected type...') @@ -111,17 +118,28 @@ class TXTZOutput(TXTOutput): from calibre.ebooks.oeb.base import OEB_IMAGES with TemporaryDirectory('_txtz_output') as tdir: # TXT - with TemporaryFile('index.txt') as tf: + txt_name = 'index.txt' + if opts.txt_output_formatting.lower() == 'textile': + txt_name = 'index.text' + with TemporaryFile(txt_name) as tf: TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log) - shutil.copy(tf, os.path.join(tdir, 'index.txt')) + shutil.copy(tf, os.path.join(tdir, txt_name)) # Images for item in oeb_book.manifest: if item.media_type in OEB_IMAGES: - path = os.path.join(tdir, os.path.dirname(item.href)) + if hasattr(self.writer, 'images'): + path = os.path.join(tdir, 'images') + if item.href in self.writer.images: + href = self.writer.images[item.href] + else: + continue + else: + path = os.path.join(tdir, os.path.dirname(item.href)) + href = os.path.basename(item.href) if not os.path.exists(path): os.makedirs(path) - with open(os.path.join(tdir, item.href), 'wb') as imgf: + with open(os.path.join(path, href), 'wb') as imgf: imgf.write(item.data) # Metadata diff --git 
a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 7e161f63bd..54369190de 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -242,6 +242,8 @@ def detect_formatting_type(txt): textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt)) # Links textile_count += len(re.findall(r'"[^"]*":\S+', txt)) + # paragraph blocks + textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt)) # Decide if either markdown or textile is used in the text # based on the number of unique formatting elements found. diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index d7e11695c5..36dc9952d2 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -1,62 +1,489 @@ # -*- coding: utf-8 -*- __license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' +__copyright__ = '2011, Leigh Parry ' __docformat__ = 'restructuredtext en' ''' Transform OEB content into Textile formatted plain text ''' - import re -from lxml import etree +from functools import partial -from calibre.ebooks.oeb.base import XHTML -from calibre.utils.html2textile import html2textile +from calibre.ebooks.htmlz.oeb2html import OEB2HTML +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks import unit_convert +from calibre.ebooks.txt.unsmarten import unsmarten -class TextileMLizer(object): - - def __init__(self, log): - self.log = log +class TextileMLizer(OEB2HTML): def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to Textile formatted TXT...') - self.oeb_book = oeb_book self.opts = opts + self.in_pre = False + self.in_table = False + self.links = {} + self.list = [] + self.our_links = [] + self.in_a_link = False + self.our_ids = [] + self.images = {} + self.id_no_text = u'' + self.style_embed = [] + self.remove_space_after_newline = False + 
self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) - return self.mlize_spine() + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False - def mlize_spine(self): + txt = self.mlize_spine(oeb_book) + txt = unsmarten(txt) + + # Do some tidying up + txt = self.tidy_up(txt) + + return txt + + def mlize_spine(self, oeb_book): output = [u''] - - for item in self.oeb_book.spine: + for item in oeb_book.spine: self.log.debug('Converting %s to Textile formatted TXT...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output.append('\n\n') + return ''.join(output) - html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + def tidy_up(self, text): + # May need tweaking and finetuning + def check_escaping(text, tests): + for t in tests: + # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged + txt = '%s' % t + if txt != '%': + text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text) + return text - if not self.opts.keep_links: - html = re.sub(r'<\s*/*\s*a[^>]*>', '', html) - if not self.opts.keep_image_references: - html = re.sub(r'<\s*img[^>]*>', '', html) + # Now tidyup links and ids - remove ones that don't have a correponding opposite + if self.opts.keep_links: + for i in self.our_links: + if i[0] == '#': + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) + for i in self.our_ids: + if i not in self.our_links: + text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', 
text) + + # Remove obvious non-needed escaping, add sub/sup-script ones + text = check_escaping(text, ['\*', '_', '\*']) + # escape the super/sub-scripts if needed + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) + # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) - text = html2textile(html) + #remove empty spans + text = re.sub(r'%\xa0+', r'%', text) + #remove empty spans - MAY MERGE SOME ? + text = re.sub(r'%%', r'', text) + #remove spans from tagged output + text = re.sub(r'%([_+*-]+)%', r'\1', text) + #remove spaces before a newline + text = re.sub(r' +\n', r'\n', text) + #remove newlines at top of file + text = re.sub(r'^\n+', r'', text) + #correct blockcode paras + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + #correct blockquote paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) - # Ensure the section ends with at least two new line characters. - # This is to prevent the last paragraph from a section being - # combined into the fist paragraph of the next. - end_chars = text[-4:] - # Convert all newlines to \n - end_chars = end_chars.replace('\r\n', '\n') - end_chars = end_chars.replace('\r', '\n') - end_chars = end_chars[-2:] - if not end_chars[1] == '\n': - text += '\n\n' - if end_chars[1] == '\n' and not end_chars[0] == '\n': - text += '\n' + #reduce blank lines + text = re.sub(r'\n{3}', r'\n\np. \n\n', text) + text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) + #Check span following blank para + text = re.sub(r'\n+ +%', r' %', text) + text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) + # blank paragraph + text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text) + # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) + # blank paragraph + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) + text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) + text = re.sub(r'\n(p\. 
\n)(p.*\.|h.*\.)', r'\n\2', text) + #sort out spaces in tables + text = re.sub(r' {2,}\|', r' |', text) - output += text + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + #reduce blank lines + text = re.sub(r' \n\n\n', r' \n\n', text) - output = u''.join(output) + return text - return output + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. + text = re.sub(r'[ ]{2,}', ' ', text) + text = re.sub(r'\t+', '', text) + if self.remove_space_after_newline == True: + text = re.sub(r'^ +', '', text) + self.remove_space_after_newline = False + return text + + def check_styles(self, style): + txt = '{' + if self.opts.keep_color: + if 'color' in style.cssdict() and style['color'] != 'black': + txt += 'color:'+style['color']+';' + if 'background' in style.cssdict(): + txt += 'background:'+style['background']+';' + txt += '}' + if txt == '{}': txt = '' + return txt + + def check_halign(self, style): + tests = {'left':'<','justify':'<>','center':'=','right':'>'} + for i in tests: + if style['text-align'] == i: + return tests[i] + return '' + + def check_valign(self, style): + tests = {'top':'^','bottom':'~'} #, 'middle':'-'} + for i in tests: + if style['vertical-align'] == i: + return tests[i] + return '' + + def check_padding(self, style, stylizer): + txt = '' + left_padding_pts = 0 + left_margin_pts = 0 + if 'padding-left' in style.cssdict() and style['padding-left'] != 'auto': + left_padding_pts = unit_convert(style['padding-left'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-left' in style.cssdict() and style['margin-left'] != 'auto': + left_margin_pts = unit_convert(style['margin-left'], style.width, style.fontSize, stylizer.profile.dpi) + left = left_margin_pts + left_padding_pts + emleft = int(round(left / stylizer.profile.fbase)) + if 
emleft >= 1: + txt += '(' * emleft + right_padding_pts = 0 + right_margin_pts = 0 + if 'padding-right' in style.cssdict() and style['padding-right'] != 'auto': + right_padding_pts = unit_convert(style['padding-right'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-right' in style.cssdict() and style['margin-right'] != 'auto': + right_margin_pts = unit_convert(style['margin-right'], style.width, style.fontSize, stylizer.profile.dpi) + right = right_margin_pts + right_padding_pts + emright = int(round(right / stylizer.profile.fbase)) + if emright >= 1: + txt += ')' * emright + + return txt + + def check_id_tag(self, attribs): + txt = '' + if attribs.has_key('id'): + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) + self.id_no_text = u'\xa0' + return txt + + def build_block(self, tag, style, attribs, stylizer): + txt = '\n' + tag + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_padding(style, stylizer) + txt += self.check_halign(style) + txt += self.check_styles(style) + return txt + + def prepare_string_for_textile(self, txt): + if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): + return ' ==%s== ' % txt + return txt + + def dump_text(self, elem, stylizer): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + # Ignore anything that is set to not be displayed. 
+ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return [''] + + # Soft scene breaks. + if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto': + ems = int(round(float(style.marginTop) / style.fontSize) - 1) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): + if tag == 'div': + tag = 'p' + text.append(self.build_block(tag, style, attribs, stylizer)) + text.append('. ') + tags.append('\n') + + if style['font-style'] == 'italic' or tag in ('i', 'em'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): + if self.style_italic == False: + if self.in_a_link: + text.append('_') + tags.append('_') + else: + text.append('[_') + tags.append('_]') + self.style_embed.append('_') + self.style_italic = True + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + if self.style_bold == False: + if self.in_a_link: + text.append('*') + tags.append('*') + else: + text.append('[*') + tags.append('*]') + self.style_embed.append('*') + self.style_bold = True + if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): + if tag != 'a': + if self.style_under == False: + text.append('[+') + tags.append('+]') + self.style_embed.append('+') + self.style_under = True + if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): + if self.style_strike == False: + text.append('[-') + tags.append('-]') + self.style_embed.append('-') + self.style_strike = True + if tag == 'br': + for i in reversed(self.style_embed): + text.append(i) + text.append('\n') + for i in self.style_embed: + text.append(i) + tags.append('') + self.remove_space_after_newline = True + if tag == 'blockquote': + text.append('\nbq. 
') + tags.append('\n') + elif tag in ('abbr', 'acronym'): + text.append('') + txt = attribs['title'] + tags.append('(' + txt + ')') + elif tag == 'sup': + text.append('^') + tags.append('^') + elif tag == 'sub': + text.append('~') + tags.append('~') + elif tag == 'code': + if self.in_pre: + text.append('\nbc. ') + tags.append('') + else: + text.append('@') + tags.append('@') + elif tag == 'cite': + text.append('??') + tags.append('??') + elif tag == 'hr': + text.append('\n***') + tags.append('\n') + elif tag == 'pre': + self.in_pre = True + text.append('\npre. ') + tags.append('pre\n') + elif tag == 'a': + if self.opts.keep_links: + if attribs.has_key('href'): + text.append('"') + tags.append('a') + tags.append('":' + attribs['href']) + self.our_links.append(attribs['href']) + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + self.in_a_link = True + else: + text.append('%') + tags.append('%') + elif tag == 'img': + if self.opts.keep_image_references: + txt = '!' + self.check_halign(style) + txt += self.check_valign(style) + txt += attribs['src'] + text.append(txt) + if attribs.has_key('alt'): + txt = attribs['alt'] + if txt != '': + text.append('(' + txt + ')') + tags.append('!') + elif tag in ('ol', 'ul'): + self.list.append({'name': tag, 'num': 0}) + text.append('') + tags.append(tag) + elif tag == 'li': + if self.list: li = self.list[-1] + else: li = {'name': 'ul', 'num': 0} + text.append('\n') + if li['name'] == 'ul': + text.append('*' * len(self.list) + ' ') + elif li['name'] == 'ol': + text.append('#' * len(self.list) + ' ') + tags.append('') + elif tag == 'dl': + text.append('\n') + tags.append('') + elif tag == 'dt': + text.append('') + tags.append('\n') + elif tag == 'dd': + text.append(' ') + tags.append('') + elif tag == 'dd': + text.append('') + tags.append('\n') + elif tag == 'table': + txt = self.build_block(tag, style, attribs, stylizer) + txt += '. \n' + if txt != '\ntable. 
\n': + text.append(txt) + else: + text.append('\n') + tags.append('') + elif tag == 'tr': + txt = self.build_block('', style, attribs, stylizer) + txt += '. ' + if txt != '\n. ': + txt = re.sub ('\n', '', txt) + text.append(txt) + tags.append('|\n') + elif tag == 'td': + text.append('|') + txt = '' + txt += self.check_halign(style) + txt += self.check_valign(style) + if attribs.has_key ('colspan'): + txt += '\\' + attribs['colspan'] + if attribs.has_key ('rowspan'): + txt += '/' + attribs['rowspan'] + txt += self.check_styles(style) + if txt != '': + text.append(txt + '. ') + tags.append('') + elif tag == 'th': + text.append('|_. ') + tags.append('') + elif tag == 'span': + if style['font-variant'] == 'small-caps': + if self.style_smallcap == False: + text.append('&') + tags.append('&') + self.style_smallcap = True + else: + if self.in_a_link == False: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') + + if self.opts.keep_links and attribs.has_key('id'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'): + text.append(self.check_id_tag(attribs)) + + # Process the styles for any that we want to keep + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \ + 'span', 'table', 'tr', 'td'): + if not self.in_a_link: + text.append(self.check_styles(style)) + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + txt = elem.text + if not self.in_pre: + txt = self.prepare_string_for_textile(self.remove_newlines(txt)) + text.append(txt) + self.id_no_text = u'' + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer) + + # Close all open tags. 
+ tags.reverse() + for t in tags: + if tag in ('pre', 'ul', 'ol', 'li', 'table'): + if tag == 'pre': + self.in_pre = False + elif tag in ('ul', 'ol'): + if self.list: self.list.pop() + if not self.list: text.append('\n') + else: + if t == 'a': + self.in_a_link = False + t = '' + text.append(self.id_no_text) + self.id_no_text = u'' + if t in ('*]', '*'): + self.style_bold = False + elif t in ('_]', '_'): + self.style_italic = False + elif t == '+]': + self.style_under = False + elif t == '-]': + self.style_strike = False + elif t == '&': + self.style_smallcap = False + if t in ('*]', '_]', '+]', '-]', '*', '_'): + txt = self.style_embed.pop() + text.append('%s' % t) + + # Soft scene breaks. + if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': + ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + tail = elem.tail + if not self.in_pre: + tail = self.prepare_string_for_textile(self.remove_newlines(tail)) + text.append(tail) + + return text diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py new file mode 100644 index 0000000000..40444ba601 --- /dev/null +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + +"""unsmarten : html2textile helper function""" + +__version__ = '0.1' +__author__ = 'Leigh Parry' + +import re + +def unsmarten(txt): + txt = re.sub(u'–|–|–', r'-', txt) # en-dash + txt = re.sub(u'—|—|—', r'--', txt) # em-dash + txt = re.sub(u'…|…|…', r'...', txt) # ellipsis + + txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote + txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe + txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote + + txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent + txt = re.sub(u'£|£|£', r'{L-}', txt) # pound + txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen + txt = re.sub(u'©|©|©', 
r'{(c)}', txt) # copyright + txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered + txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter + txt = re.sub(u'½|½|½', r'{1/2}', txt) # half + txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter + txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave + txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute + txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex + txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde + txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut + txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring + txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE + txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla + txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave + txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute + txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex + txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut + txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave + txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute + txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex + txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut + txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH + txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde + txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # O-grave + txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute + txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex + txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde + txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut + txt = re.sub(u'×|×|×', r'{x}', txt) # dimension + txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash + txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave + txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute + txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex + txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut + txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave + txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s + txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave + txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute + txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex + txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde + txt = 
re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut + txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring + txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae + txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla + txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave + txt = re.sub(u'é|é|é', r"{e'}", txt) # e-acute + txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex + txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut + txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave + txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute + txt = re.sub(u'î|î|î', r'{i^}', txt) # i-circumflex + txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut + txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth + txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde + txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave + txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute + txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex + txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde + txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut + txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke + txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave + txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute + txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex + txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut + txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute + txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut + txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE + txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe + txt = re.sub(u'Ŝ|Š|Ŝ', r'{S^}', txt) # Scaron + txt = re.sub(u'ŝ|š|ŝ', r'{s^}', txt) # scaron + txt = re.sub(u'•|•|•', r'{*}', txt) # bullet + txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc + txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira + txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee + txt = re.sub(u'€|€|€', r'{C=}', txt) # euro + txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark + txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade + txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club + txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart + txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond + + # Move into main code? 
+# txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph +# txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph +# txt = re.sub(u'\n \n', r'\n<br />
\n', txt) # blank paragraph - br tag + + return txt diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 8427f83824..816e8d7785 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', 'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references', - 'txt_output_encoding']) + 'keep_color', 'txt_output_encoding']) self.db, self.book_id = db, book_id for x in get_option('newline').option.choices: self.opt_newline.addItem(x) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 1ef9e6e6b9..3a62643551 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -122,6 +122,13 @@ + + + + Keep text color, when possible + + + diff --git a/src/calibre/gui2/store/wizards_tower_books_plugin.py b/src/calibre/gui2/store/wizards_tower_books_plugin.py index 56bb00ff7e..c17ea2ca64 100644 --- a/src/calibre/gui2/store/wizards_tower_books_plugin.py +++ b/src/calibre/gui2/store/wizards_tower_books_plugin.py @@ -29,7 +29,7 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin): detail_item = self.url + detail_item if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url))) + open_url(QUrl(url_slash_cleaner(detail_item))) else: d = WebStoreDialog(self.gui, self.url, parent, detail_item) d.setWindowTitle(self.name) @@ -38,9 +38,9 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin): def search(self, query, max_results=10, timeout=60): url = 'http://www.wizardstowerbooks.com/search.html?for=' + urllib.quote(query) - + br = browser() - + counter = max_results with closing(br.open(url, timeout=timeout)) as f: doc = html.fromstring(f.read()) @@ -60,13 +60,13 @@ class 
WizardsTowerBooksStore(BasicStoreConfig, StorePlugin): price = price.strip() if not price: continue - + title = ''.join(data.xpath('.//span[@class="prti"]/a/b/text()')) author = ''.join(data.xpath('.//p[@class="last"]/text()')) a, b, author = author.partition(' by ') - + counter -= 1 - + s = SearchResult() s.cover_url = cover_url s.title = title.strip() @@ -74,15 +74,15 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin): s.price = price.strip() s.detail_item = id.strip() s.drm = SearchResult.DRM_UNLOCKED - + yield s def get_details(self, search_result, timeout): br = browser() with closing(br.open(url_slash_cleaner(self.url + search_result.detail_item), timeout=timeout)) as nf: idata = html.fromstring(nf.read()) - + formats = ', '.join(idata.xpath('//select[@id="N1_"]//option//text()')) search_result.formats = formats.upper() - + return True diff --git a/src/calibre/utils/html2textile.py b/src/calibre/utils/html2textile.py deleted file mode 100644 index 786e912e36..0000000000 --- a/src/calibre/utils/html2textile.py +++ /dev/null @@ -1,209 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2010, Webreactor - Marcin Lulek -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from lxml import etree -from calibre.ebooks.oeb.base import barename - -class EchoTarget: - - def __init__(self): - self.final_output = [] - self.block = False - self.ol_ident = 0 - self.ul_ident = 0 - self.list_types = [] - self.haystack = [] - - def start(self, tag, attrib): - tag = barename(tag) - - newline = '\n' - dot = '' - new_tag = '' - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): - new_tag = tag - dot = '. ' - elif tag == 'p': - new_tag = '' - dot = '' - elif tag == 'blockquote': - new_tag = 'bq' - dot = '. ' - elif tag in ('b', 'strong'): - new_tag = '*' - newline = '' - elif tag in ('em', 'i'): - new_tag = '_' - newline = '' - elif tag == 'cite': - new_tag = '??' 
- newline = '' - elif tag == 'del': - new_tag = '-' - newline = '' - elif tag == 'ins': - new_tag = '+' - newline = '' - elif tag == 'sup': - new_tag = '^' - newline = '' - elif tag == 'sub': - new_tag = '~' - newline = '' - elif tag == 'span': - new_tag = '' - newline = '' - elif tag == 'a': - self.block = True - if 'title' in attrib: - self.a_part = {'title':attrib.get('title'), - 'href':attrib.get('href', '')} - else: - self.a_part = {'title':None, 'href':attrib.get('href', '')} - new_tag = '' - newline = '' - - elif tag == 'img': - if 'alt' in attrib: - new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),) - else: - new_tag = ' !%s' % attrib.get('src') - newline = '' - - elif tag in ('ul', 'ol'): - new_tag = '' - newline = '' - self.list_types.append(tag) - if tag == 'ul': - self.ul_ident += 1 - else: - self.ol_ident += 1 - - elif tag == 'li': - indent = self.ul_ident + self.ol_ident - if self.list_types[-1] == 'ul': - new_tag = '*' * indent + ' ' - newline = '\n' - else: - new_tag = '#' * indent + ' ' - newline = '\n' - - - if tag not in ('ul', 'ol'): - textile = '%(newline)s%(tag)s%(dot)s' % \ - { - 'newline':newline, - 'tag':new_tag, - 'dot':dot - } - if not self.block: - self.final_output.append(textile) - else: - self.haystack.append(textile) - - def end(self, tag): - tag = barename(tag) - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - self.final_output.append('\n') - elif tag in ('b', 'strong'): - self.final_output.append('*') - elif tag in ('em', 'i'): - self.final_output.append('_') - elif tag == 'cite': - self.final_output.append('??') - elif tag == 'del': - self.final_output.append('-') - elif tag == 'ins': - self.final_output.append('+') - elif tag == 'sup': - self.final_output.append('^') - elif tag == 'sub': - self.final_output.append('~') - elif tag == 'span': - self.final_output.append('') - elif tag == 'a': - if self.a_part['title']: - textilized = ' "%s (%s)":%s ' % ( - ''.join(self.haystack), - self.a_part.get('title'), - 
self.a_part.get('href'), - ) - self.haystack = [] - else: - textilized = ' "%s":%s ' % ( - ''.join(self.haystack), - self.a_part.get('href'), - ) - self.haystack = [] - self.final_output.append(textilized) - self.block = False - elif tag == 'img': - self.final_output.append('!') - elif tag == 'ul': - self.ul_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - elif tag == 'ol': - self.ol_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - - def data(self, data): - #we dont want any linebreaks inside our tags - node_data = data.replace('\n','') - if not self.block: - self.final_output.append(node_data) - else: - self.haystack.append(node_data) - - def comment(self, text): - pass - - def close(self): - return "closed!" - - -def html2textile(html): - #1st pass - #clean the whitespace and convert html to xhtml - parser = etree.HTMLParser() - tree = etree.fromstring(html, parser) - xhtml = etree.tostring(tree, method="xml") - parser = etree.XMLParser(remove_blank_text=True) - root = etree.XML(xhtml, parser) - cleaned_html = etree.tostring(root) - #2nd pass build textile - target = EchoTarget() - parser = etree.XMLParser(target=target) - root = etree.fromstring(cleaned_html, parser) - textilized_text = ''.join(target.final_output).lstrip().rstrip() - return textilized_text