From 804b248d46c71e5169c57da794ec2f69f2998dbf Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 11:55:44 -0400 Subject: [PATCH 01/25] Add new but still wip textile output generator. --- src/calibre/ebooks/txt/output.py | 21 +- src/calibre/ebooks/txt/textileml.py | 341 +++++++++++++++++++++++++--- src/calibre/ebooks/txt/unsmarten.py | 109 +++++++++ 3 files changed, 432 insertions(+), 39 deletions(-) create mode 100644 src/calibre/ebooks/txt/unsmarten.py diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 4e54a97b45..7b50afb345 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -70,16 +70,17 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): + print 'New' if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer - writer = MarkdownMLizer(log) + self.writer = MarkdownMLizer(log) elif opts.txt_output_formatting.lower() == 'textile': from calibre.ebooks.txt.textileml import TextileMLizer - writer = TextileMLizer(log) + self.writer = TextileMLizer(log) else: - writer = TXTMLizer(log) + self.writer = TXTMLizer(log) - txt = writer.extract_content(oeb_book, opts) + txt = self.writer.extract_content(oeb_book, opts) txt = clean_ascii_chars(txt) log.debug('\tReplacing newlines with selected type...') @@ -118,10 +119,18 @@ class TXTZOutput(TXTOutput): # Images for item in oeb_book.manifest: if item.media_type in OEB_IMAGES: - path = os.path.join(tdir, os.path.dirname(item.href)) + if hasattr(self.writer, 'images'): + path = os.path.join(tdir, 'images') + if item.href in self.writer.images: + href = self.writer.images[item.href] + else: + continue + else: + path = os.path.join(tdir, os.path.dirname(item.href)) + href = os.path.basename(item.href) if not os.path.exists(path): os.makedirs(path) - with open(os.path.join(tdir, item.href), 'wb') as imgf: + with open(os.path.join(path, href), 'wb') as imgf: imgf.write(item.data) # Metadata diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index d7e11695c5..9651fa8971 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- __license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' +__copyright__ = '2011, Leigh Parry ' __docformat__ = 'restructuredtext en' ''' @@ -10,53 +10,328 @@ Transform OEB content into Textile formatted plain text import re -from lxml import etree +from functools import partial -from calibre.ebooks.oeb.base import XHTML -from calibre.utils.html2textile import html2textile +from calibre.ebooks.htmlz.oeb2html import OEB2HTML +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks.txt.unsmarten import unsmarten +from operator import itemgetter -class TextileMLizer(object): - def __init__(self, log): - self.log = log +class TextileMLizer(OEB2HTML): def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to Textile formatted TXT...') - self.oeb_book = oeb_book self.opts = opts + self.in_pre = False + self.in_table = False + self.links = {} + self.list = [] + self.images = {} + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) - return self.mlize_spine() + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + 
self.style_smallcap = False - def mlize_spine(self): + txt = self.mlize_spine(oeb_book) + txt = unsmarten(txt) + + # Do some tidying up + txt = self.tidy_up(txt) + + return txt + + def mlize_spine(self, oeb_book): output = [u''] - - for item in self.oeb_book.spine: + for item in oeb_book.spine: self.log.debug('Converting %s to Textile formatted TXT...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output.append('\n\n') + return ''.join(output) - html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + def tidy_up(self, text): + def check_count(text, tests): + x = [] + for i, t in enumerate(reversed(tests)): + x.append((text.count(t), i, t)) + if x: + return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] + return '' - if not self.opts.keep_links: - html = re.sub(r'<\s*/*\s*a[^>]*>', '', html) - if not self.opts.keep_image_references: - html = re.sub(r'<\s*img[^>]*>', '', html) + # NEEDS TWEAKING +# def check_escaping(text, tests): +# for t in tests: +# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text) +# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text) +# return text - text = html2textile(html) + txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) + text = re.sub(txt+'(\S)', r'\n\1', text) - # Ensure the section ends with at least two new line characters. - # This is to prevent the last paragraph from a section being - # combined into the fist paragraph of the next. - end_chars = text[-4:] - # Convert all newlines to \n - end_chars = end_chars.replace('\r\n', '\n') - end_chars = end_chars.replace('\r', '\n') - end_chars = end_chars[-2:] - if not end_chars[1] == '\n': - text += '\n\n' - if end_chars[1] == '\n' and not end_chars[0] == '\n': - text += '\n' +# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) - output += text + text = re.sub('\npre\. bc\.', '\nbc.', text) + text = re.sub('\np=. p. ', '\np. ', text) + text = re.sub('\np=. \n', '\n', text) + text = re.sub('\n{3,}', '\n\n', text) + text = re.sub(' \|', '|', text) - output = u''.join(output) + # started work on trying to fix footnotes +# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) + return text - return output + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. 
+ text = re.sub(r'[ ]{2,}', ' ', text) + text = re.sub(r'\t+', '', text) + return text + + def remove_leading_ws(self, text): + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + text = re.sub(r'\n+', '\n', text) + text = re.sub(r'\n[\t ]+', '\n', text) + return text + + def check_align(self, style, align, tests): + for i in tests: + if style[align] == i[0]: + return i[1] + return '' + + def check_padding(self, style, tests): + txt = '' + for i in tests: + try: + ems = int(round(float(style[i[0]] / style['font-size']))) + if ems >=1: + txt += i[1] * ems + except: + pass + return txt + + def check_id_tag(self, attribs): + txt = '' + if attribs.has_key('id'): + txt = '(#'+attribs['id']+')' + return txt + + def build_block(self, tag, style, attribs, finish): + txt = tag + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) + txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += finish + return txt + + def dump_text(self, elem, stylizer, page, tag_stack=[]): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + # Ignore anything that is set to not be displayed. + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return [''] + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + text.append(self.build_block(tag, style, attribs, '. ')) + tags.append('\n') + + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + if self.style_bold == False: + text.append('*') + tags.append('*') + self.style_bold = True + if style['font-style'] == 'italic' or tag in ('i', 'em'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): + if self.style_italic == False: + text.append('_') + tags.append('_') + self.style_italic = True + if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): + if tag != 'a': + if self.style_under == False: + text.append('+') + tags.append('+') + self.style_under = True + if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): + if self.style_strike == False: + text.append('-') + tags.append('-') + self.style_strike = True + if style['font-variant'] == 'small-caps': + if self.style_smallcap == 0: + text.append('&') + tags.append('&') + self.style_smallcap = 1 + if tag == 'br': + text.append('') + tags.append('\n') + elif tag == 'blockquote': + text.append('bq. ') + tags.append('\n') + elif tag in ('abbr', 'acronym'): + text.append('') + txt = attribs['title'] + tags.append('(' + txt + ')') + elif tag == 'sup': + text.append('^') + tags.append('^') + elif tag == 'sub': + text.append('~') + tags.append('~') + elif tag == 'code': + if self.in_pre: + text.append('bc. 
') + tags.append('\n') + else: + text.append('@') + tags.append('@') + elif tag == 'cite': + text.append('??') + tags.append('??') + elif tag == 'hr': + text.append('\n***\n') + tags.append('\n') + elif tag == 'pre': + self.in_pre = True + text.append('pre. ') + tags.append('pre') + elif tag == 'a': + if self.opts.keep_links: + text.append ('"') + tags.append('":' + attribs['href']) + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + elif tag == 'img': + if self.opts.keep_image_references: + text.append ('!' + attribs['src']) + if attribs.has_key('alt'): + txt = attribs['alt'] + if txt != '': + text.append('(' + txt + ')') + tags.append('!') + elif tag in ('ol', 'ul'): + self.list.append({'name':tag, 'num':0}) + text.append('') + tags.append(tag) + elif tag == 'li': + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + if li['name'] == 'ul': text.append('*'*len(self.list)+' ') + elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + elif tag == 'dl': + text.append('\n') + tags.append('') + elif tag == 'dt': + text.append('') + tags.append('\n') + elif tag == 'dd': + text.append(' ') + tags.append('') + elif tag == 'dd': + text.append('') + tags.append('\n') + elif tag == 'table': + self.in_table = True + text.append('') + tags.append('table') + elif tag == 'tr': + text.append('') + tags.append('|\n') + elif tag == 'td': + text.append('|') + txt = '' + txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']]) + if attribs.has_key ('colspan'): + txt += '\\' + attribs['colspan'] + if attribs.has_key ('rowspan'): + txt += '/' + attribs['rowspan'] + if txt != '': + text.append(txt+'. ') + tags.append('') + elif tag == 'th': + text.append('|_. ') + tags.append('') + + if self.opts.keep_links and attribs.has_key('id'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + text.append('(#' + attribs['id'] + ')') + + # If wanted process all style tags here - before taxt in tags is written + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + txt = elem.text + if not self.in_pre: + if self.in_table: + txt = self.remove_newlines(txt) + else: + txt = self.remove_leading_ws(txt) + text.append(txt) + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer, page, tag_stack+tags) + + # Close all open tags. + tags.reverse() + for t in tags: + if tag in ('pre', 'ul', 'ol', 'li', 'table'): + if tag == 'pre': + self.in_pre = False + if tag == 'table': + self.in_table = False + if tag in ('ul', 'ol'): + if self.list: self.list.pop() + else: + text.append('%s' % t) + if t == '*': self.style_bold = False + if t == '_': self.style_italic = False + if t == '+': self.style_under = False + if t == '-': self.style_strike = False + if t == '&': self.style_smallcap = False + + # Add the text that is outside of the tag. 
+ if hasattr(elem, 'tail') and elem.tail: + tail = elem.tail + if not self.in_pre: + if self.in_table: + tail = self.remove_newlines(tail) + else: + tail = self.remove_leading_ws(tail) + text.append(tail) + + return text diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py new file mode 100644 index 0000000000..30a22bf069 --- /dev/null +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +''' + +''' + +__version__ = '0.1' +__author__ = 'Leigh Parry' + +import re + +def unsmarten(txt): + txt = re.sub(u'–|–|–', r'-', txt) # en-dash + txt = re.sub(u'—|—|—', r'--', txt) # em-dash + txt = re.sub(u'…|…|…', r'...', txt) # ellipsis + + txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote + txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe + txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote + + txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent + txt = re.sub(u'£|£|£', r'{L-}', txt) # pound + txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen + txt = re.sub(u'©|©|©', r'{(c)}', txt) # copyright + txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered + txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter + txt = re.sub(u'½|½|½', r'{1/2}', txt) # half + txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter + txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave + txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute + txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex + txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde + txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut + txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring + txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE + txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla + txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave + txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute + txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex + txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut + txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave + txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute + txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex + txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut + txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH + txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde + txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # O-grave + txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute + txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex + txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde + txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut + txt = re.sub(u'×|×|×', r'{x}', txt) # dimension + txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash + txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave + txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute + txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex + txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut + txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave + txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s + txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave + txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute + txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex + txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde + txt = re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut + txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring + txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae + txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla + txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave + txt = re.sub(u'é|é|é', r"{e'}", txt) # e-acute + txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex + txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut + txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave + txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute + txt = re.sub(u'î|î|î', 
r'{i^}', txt) # i-circumflex + txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut + txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth + txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde + txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave + txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute + txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex + txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde + txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut + txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke + txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave + txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute + txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex + txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut + txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute + txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut + txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE + txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe + txt = re.sub(u'Ŝ|Š|Ŝ', r'{S^}', txt) # Scaron + txt = re.sub(u'ŝ|š|ŝ', r'{s^}', txt) # scaron + txt = re.sub(u'•|•|•', r'{*}', txt) # bullet + txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc + txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira + txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee + txt = re.sub(u'€|€|€', r'{C=}', txt) # euro + txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark + txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade + txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club + txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart + txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond + + txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph + txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph + txt = re.sub(u'\n \n', r'\n
\n', txt) # blank paragraph - br tag + + return txt From be3d441d3bb4705fc24261312644ae148a0581c4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 19 Apr 2011 06:49:27 -0400 Subject: [PATCH 02/25] More textile work. --- src/calibre/ebooks/txt/output.py | 1 - src/calibre/ebooks/txt/textileml.py | 125 +++++++++++++++++++--------- src/calibre/ebooks/txt/unsmarten.py | 11 ++- 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 7b50afb345..606dec4a63 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -70,7 +70,6 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - print 'New' if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer self.writer = MarkdownMLizer(log) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 9651fa8971..9a025e0aef 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -58,31 +58,39 @@ class TextileMLizer(OEB2HTML): return ''.join(output) def tidy_up(self, text): - def check_count(text, tests): - x = [] - for i, t in enumerate(reversed(tests)): - x.append((text.count(t), i, t)) - if x: - return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] - return '' +# def check_count(text, tests): +# x = [] +# for i, t in enumerate(reversed(tests)): +# x.append((text.count(t), i, t)) +# if x: +# return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] +# return '' - # NEEDS TWEAKING -# def check_escaping(text, tests): -# for t in tests: -# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text) -# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text) -# return text + # Needs tweaking and finetuning - don't use yet. + def check_escaping(text, tests): + for t in tests: + text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) +# text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) +# text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) +# text = re.sub(r'(["\'])\[('+t+'\w+'+t+')\]', r'\1\2', text) +# text = re.sub(r'\[('+t+'\w+'+t+')\](["\',\.!\?])', r'\1\2', text) + return text - txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) - text = re.sub(txt+'(\S)', r'\n\1', text) +# txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) +# text = re.sub(txt+'(\S)', r'\n\1', text) -# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) - text = re.sub('\npre\. bc\.', '\nbc.', text) - text = re.sub('\np=. p. ', '\np. ', text) - text = re.sub('\np=. \n', '\n', text) - text = re.sub('\n{3,}', '\n\n', text) - text = re.sub(' \|', '|', text) + text = re.sub(r'^\n+', r'', text) + text = re.sub(r'\npre\. bc\.', r'\nbc.', text) + text = re.sub(r'\nbq\. \n\np\. ', r'\nbq. ', text) + text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) + text = re.sub(r'\n{3}', r'\n\n', text) + text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) + text = re.sub(r'p.*\. \n\n', r'', text) +# text = re.sub(u'\n \n', r'\n
\n', text) # blank paragraph - br tag + text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph + text = re.sub(r' \|', r'|', text) # started work on trying to fix footnotes # text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) @@ -94,20 +102,29 @@ class TextileMLizer(OEB2HTML): text = text.replace('\r', ' ') # Condense redundant spaces created by replacing newlines with spaces. text = re.sub(r'[ ]{2,}', ' ', text) - text = re.sub(r'\t+', '', text) + text = re.sub(r'\t +', '', text) +# text = re.sub(r'\n +', '', text) return text def remove_leading_ws(self, text): text = text.replace('\r\n', '\n') text = text.replace('\r', '\n') - text = re.sub(r'\n+', '\n', text) text = re.sub(r'\n[\t ]+', '\n', text) + text = re.sub(r'\n{2,}', '\n', text) return text - def check_align(self, style, align, tests): + def check_halign(self, style): + tests = {'left':'<','justify':'<>','center':'=','right':'>'} for i in tests: - if style[align] == i[0]: - return i[1] + if style['text-align'] == i: + return tests[i] + return '' + + def check_valign(self, style): + tests = {'top':'^','bottom':'~', 'middle':'-'} + for i in tests: + if style['vertical-align'] == i: + return tests[i] return '' def check_padding(self, style, tests): @@ -124,15 +141,16 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' if attribs.has_key('id'): - txt = '(#'+attribs['id']+')' + #if attribs['id'] in self.links: + txt = '(#'+attribs['id']+')' return txt def build_block(self, tag, style, attribs, finish): - txt = tag + txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) - txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += self.check_halign(style) txt += finish return txt @@ -163,7 +181,17 @@ class TextileMLizer(OEB2HTML): or style['visibility'] == 'hidden': return [''] - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): + #For debugging + if tag == 'h1': + for i in self.links: + text.append(i) + text.append('\n') + if tag == 'div': + tag = 'p' text.append(self.build_block(tag, style, attribs, '. ')) tags.append('\n') @@ -191,10 +219,10 @@ class TextileMLizer(OEB2HTML): tags.append('-') self.style_strike = True if style['font-variant'] == 'small-caps': - if self.style_smallcap == 0: + if self.style_smallcap == False: text.append('&') tags.append('&') - self.style_smallcap = 1 + self.style_smallcap = True if tag == 'br': text.append('') tags.append('\n') @@ -236,7 +264,10 @@ class TextileMLizer(OEB2HTML): tags.append('(' + attribs['title'] + ')') elif tag == 'img': if self.opts.keep_image_references: - text.append ('!' + attribs['src']) + txt = '!' 
+ self.check_halign(style) + txt += self.check_valign(style) + txt += attribs['src'] + text.append(txt) if attribs.has_key('alt'): txt = attribs['alt'] if txt != '': @@ -247,6 +278,7 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append(tag) elif tag == 'li': +# text.append('\n') if self.list: li = self.list[-1] else: li = {'name':'ul', 'num':0} if li['name'] == 'ul': text.append('*'*len(self.list)+' ') @@ -273,8 +305,8 @@ class TextileMLizer(OEB2HTML): elif tag == 'td': text.append('|') txt = '' - txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) - txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']]) + txt += self.check_halign(style) + txt += self.check_valign(style) if attribs.has_key ('colspan'): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): @@ -288,7 +320,10 @@ class TextileMLizer(OEB2HTML): if self.opts.keep_links and attribs.has_key('id'): if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - text.append('(#' + attribs['id'] + ')') + if tag == 'span': + text.append(' %') + tags.append('% ') + text.append('(#' + attribs['id'] + u')\xa0') # If wanted process all style tags here - before taxt in tags is written @@ -318,11 +353,19 @@ class TextileMLizer(OEB2HTML): if self.list: self.list.pop() else: text.append('%s' % t) - if t == '*': self.style_bold = False - if t == '_': self.style_italic = False - if t == '+': self.style_under = False - if t == '-': self.style_strike = False - if t == '&': self.style_smallcap = False + if t == '*': + self.style_bold = False + if t == '_': + self.style_italic = False + if t == '+': + self.style_under = False + if t == '-': + self.style_strike = False + if t == '&': + self.style_smallcap = False + + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py index 30a22bf069..40444ba601 100644 --- a/src/calibre/ebooks/txt/unsmarten.py +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -''' - -''' +"""unsmarten : html2textile helper function""" __version__ = '0.1' __author__ = 'Leigh Parry' @@ -102,8 +100,9 @@ def unsmarten(txt): txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond - txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph - txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph - txt = re.sub(u'\n \n', r'\n
<br />\n', txt) # blank paragraph - br tag
+
+    # Move into main code?
+#    txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph
+#    txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph
+#    txt = re.sub(u'\n \n', r'\n<br />
\n', txt) # blank paragraph - br tag return txt From fabef627e3dd85d06989551614db5277e72021c7 Mon Sep 17 00:00:00 2001 From: Byron Li Date: Mon, 25 Apr 2011 21:11:24 +0800 Subject: [PATCH 03/25] Add a douban.com plugin stub. Not working yet. --- src/calibre/customize/builtins.py | 5 +- src/calibre/ebooks/metadata/sources/douban.py | 361 ++++++++++++++++++ 2 files changed, 364 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/metadata/sources/douban.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index c27fa2a57b..3c769f8dc7 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -628,8 +628,9 @@ if test_eight_code: from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary from calibre.ebooks.metadata.sources.isbndb import ISBNDB from calibre.ebooks.metadata.sources.overdrive import OverDrive - - plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive] + from calibre.ebooks.metadata.sources.douban import Douban + + plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban] # }}} else: diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py new file mode 100644 index 0000000000..b50bb6ff85 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ; 2011, Li Fanxi ' +__docformat__ = 'restructuredtext en' + +import time, hashlib +from urllib import urlencode +from functools import partial +from Queue import Queue, Empty + +from lxml import etree + +from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata.sources.base import Source +from calibre.ebooks.metadata.book.base import Metadata +from calibre.ebooks.chardet import xml_to_unicode +from calibre.utils.date import parse_date, utcnow +from calibre.utils.cleantext import clean_ascii_chars +from calibre import as_unicode + +NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'dc' : 'http://purl.org/dc/terms', + 'gd' : 'http://schemas.google.com/g/2005' + } + +NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'db': 'http://www.douban.com/xmlns/' + } +XPath = partial(etree.XPath, namespaces=NAMESPACES) +total_results = XPath('//openSearch:totalResults') +start_index = XPath('//openSearch:startIndex') +items_per_page = XPath('//openSearch:itemsPerPage') +entry = XPath('//atom:entry') +entry_id = XPath('descendant::atom:id') +title = XPath('descendant::atom:title') +description = XPath('descendant::atom:summary') +publisher = XPath("descendant::db:attribute[@name='publisher']") +isbn = XPath("descendant::db:attribute[@name='isbn13']") +date = XPath("descendant::db:attribute[@name='pubdate']") +creator = XPath("descendant::db:attribute[@name='author']") +tag = XPath("descendant::db:tag") + +def get_details(browser, url, timeout): # {{{ + try: + raw = browser.open_novisit(url, timeout=timeout).read() + except Exception as e: + gc = getattr(e, 'getcode', lambda : -1) + if gc() != 403: + raise + # Google is throttling us, wait a little + time.sleep(2) + raw = browser.open_novisit(url, timeout=timeout).read() + + return raw +# }}} + +def to_metadata(browser, log, entry_, timeout): # {{{ 
+ + def get_text(extra, x): + try: + ans = x(extra) + if ans: + ans = ans[0].text + if ans and ans.strip(): + return ans.strip() + except: + log.exception('Programming error:') + return None + + + id_url = entry_id(entry_)[0].text + google_id = id_url.split('/')[-1] + title_ = ': '.join([x.text for x in title(entry_)]).strip() + authors = [x.text.strip() for x in creator(entry_) if x.text] + if not authors: + authors = [_('Unknown')] + if not id_url or not title: + # Silently discard this entry + return None + + mi = Metadata(title_, authors) + mi.identifiers = {'google':google_id} + try: + raw = get_details(browser, id_url, timeout) + feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), + strip_encoding_pats=True)[0]) + extra = entry(feed)[0] + except: + log.exception('Failed to get additional details for', mi.title) + return mi + + mi.comments = get_text(extra, description) + #mi.language = get_text(extra, language) + mi.publisher = get_text(extra, publisher) + + # ISBN + isbns = [] + for x in identifier(extra): + t = str(x.text).strip() + if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): + if t[:5].upper() == 'ISBN:': + t = check_isbn(t[5:]) + if t: + isbns.append(t) + if isbns: + mi.isbn = sorted(isbns, key=len)[-1] + mi.all_isbns = isbns + + # Tags + try: + btags = [x.text for x in subject(extra) if x.text] + tags = [] + for t in btags: + atags = [y.strip() for y in t.split('/')] + for tag in atags: + if tag not in tags: + tags.append(tag) + except: + log.exception('Failed to parse tags:') + tags = [] + if tags: + mi.tags = [x.replace(',', ';') for x in tags] + + # pubdate + pubdate = get_text(extra, date) + if pubdate: + try: + default = utcnow().replace(day=15) + mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) + except: + log.error('Failed to parse pubdate %r'%pubdate) + + # Ratings + for x in rating(extra): + try: + mi.rating = float(x.get('average')) + if mi.rating > 5: + mi.rating /= 2 + except: + log.exception('Failed to parse rating') + + # Cover + mi.has_google_cover = None + for x in extra.xpath( + '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'): + mi.has_google_cover = x.get('href') + break + + return mi +# }}} + +class Douban(Source): + + name = 'Douban Books' + author = _('Li Fanxi') + + description = _('Downloads metadata from Douban.com') + + capabilities = frozenset(['identify', 'cover']) + touched_fields = frozenset(['title', 'authors', 'tags', + 'comments', 'publisher', 'identifier:isbn', 'rating', + 'identifier:douban']) # language currently disabled + supports_gzip_transfer_encoding = True + cached_cover_url_is_reliable = True + + DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' +# GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1' + +# DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657']) + + def get_book_url(self, identifiers): # {{{ + db = identifiers.get('douban', None) + if db is not None: + return db + else: + return None + # }}} + + def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ + SEARCH_URL = 'http://api.douban.com/book/subjects?' 
+ ISBN_URL = 'http://api.douban.com/book/subject/isbn/' + + q = '' + t = None + isbn = check_isbn(identifiers.get('isbn', None)) + if isbn is not None: + q = isbn + t = 'isbn' + elif title or authors: + def build_term(prefix, parts): + return ' '.join(x for x in parts) + title_tokens = list(self.get_title_tokens(title)) + if title_tokens: + q += build_term('title', title_tokens) + author_tokens = self.get_author_tokens(authors, + only_first_author=True) + if author_tokens: + q += ((' ' if q != '' else '') + + build_term('author', author_tokens)) + t = 'search' + if isinstance(q, unicode): + q = q.encode('utf-8') + if not q: + return None + print(q) + url = None + if t == "isbn": + url = ISBN_URL + q + else: + url = SEARCH_URL + urlencode({ + 'q': q, + }) + if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': + url = url + "?apikey=" + self.DOUBAN_API_KEY + print(url) + return url + # }}} + + def download_cover(self, log, result_queue, abort, # {{{ + title=None, authors=None, identifiers={}, timeout=30): + cached_url = self.get_cached_cover_url(identifiers) + if cached_url is None: + log.info('No cached cover found, running identify') + rq = Queue() + self.identify(log, rq, abort, title=title, authors=authors, + identifiers=identifiers) + if abort.is_set(): + return + results = [] + while True: + try: + results.append(rq.get_nowait()) + except Empty: + break + results.sort(key=self.identify_results_keygen( + title=title, authors=authors, identifiers=identifiers)) + for mi in results: + cached_url = self.get_cached_cover_url(mi.identifiers) + if cached_url is not None: + break + if cached_url is None: + log.info('No cover found') + return + + if abort.is_set(): + return + br = self.browser + log('Downloading cover from:', cached_url) + try: + cdata = br.open_novisit(cached_url, timeout=timeout).read() + if cdata: + if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5: + log.warning('Google returned a dummy image, ignoring') + else: + result_queue.put((self, cdata)) + except: + log.exception('Failed to download cover from:', cached_url) + + # }}} + + def get_cached_cover_url(self, identifiers): # {{{ + url = None + goog = identifiers.get('google', None) + if goog is None: + isbn = identifiers.get('isbn', None) + if isbn is not None: + goog = self.cached_isbn_to_identifier(isbn) + if goog is not None: + url = self.cached_identifier_to_cover_url(goog) + + return url + # }}} + + def get_all_details(self, br, log, entries, abort, # {{{ + result_queue, timeout): + for relevance, i in enumerate(entries): + try: + ans = to_metadata(br, log, i, timeout) + if isinstance(ans, Metadata): + ans.source_relevance = relevance + goog = ans.identifiers['google'] + for isbn in getattr(ans, 'all_isbns', []): + self.cache_isbn_to_identifier(isbn, goog) + if ans.has_google_cover: + self.cache_identifier_to_cover_url(goog, + self.GOOGLE_COVER%goog) + self.clean_downloaded_metadata(ans) + result_queue.put(ans) + except: + log.exception( + 'Failed to get metadata for identify entry:', + etree.tostring(i)) + if abort.is_set(): + break + # }}} + + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ + identifiers={}, timeout=30): + query = self.create_query(log, title=title, authors=authors, + identifiers=identifiers) + if not query: + log.error('Insufficient metadata to construct query') + return + br = self.browser + try: + raw = br.open_novisit(query, timeout=timeout).read() + except Exception as e: + log.exception('Failed to make identify query: %r'%query) + return as_unicode(e) + + 
try: + parser = etree.XMLParser(recover=True, no_network=True) + feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), + strip_encoding_pats=True)[0], parser=parser) + entries = entry(feed) + except Exception as e: + log.exception('Failed to parse identify results') + return as_unicode(e) + + if not entries and identifiers and title and authors and \ + not abort.is_set(): + return self.identify(log, result_queue, abort, title=title, + authors=authors, timeout=timeout) + + # There is no point running these queries in threads as google + # throttles requests returning 403 Forbidden errors + self.get_all_details(br, log, entries, abort, result_queue, timeout) + + return None + # }}} + +if __name__ == '__main__': # tests {{{ + # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py + from calibre.ebooks.metadata.sources.test import (test_identify_plugin, + title_test, authors_test) + test_identify_plugin(GoogleBooks.name, + [ + + + ( + {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby', + 'authors':['Fitzgerald']}, + [title_test('The great gatsby', exact=True), + authors_test(['Francis Scott Fitzgerald'])] + ), + + ( + {'title': 'Flatland', 'authors':['Abbott']}, + [title_test('Flatland', exact=False)] + ), + ]) +# }}} + From ea4b5b9054765bb737179d904c9168846def2e45 Mon Sep 17 00:00:00 2001 From: Byron Li Date: Fri, 29 Apr 2011 16:29:57 +0800 Subject: [PATCH 04/25] First working version of Douban book plugin. --- src/calibre/ebooks/metadata/sources/douban.py | 83 +++++++++---------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index b50bb6ff85..8f1794b33f 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -25,14 +25,8 @@ from calibre import as_unicode NAMESPACES = { 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', 'atom' : 'http://www.w3.org/2005/Atom', - 'dc' : 'http://purl.org/dc/terms', - 'gd' : 'http://schemas.google.com/g/2005' - } - -NAMESPACES = { - 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', - 'atom' : 'http://www.w3.org/2005/Atom', - 'db': 'http://www.douban.com/xmlns/' + 'db': 'http://www.douban.com/xmlns/', + 'gd': 'http://schemas.google.com/g/2005' } XPath = partial(etree.XPath, namespaces=NAMESPACES) total_results = XPath('//openSearch:totalResults') @@ -47,6 +41,8 @@ isbn = XPath("descendant::db:attribute[@name='isbn13']") date = XPath("descendant::db:attribute[@name='pubdate']") creator = XPath("descendant::db:attribute[@name='author']") tag = XPath("descendant::db:tag") +rating = XPath("descendant::gd:rating[@name='average']") +cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") def get_details(browser, url, timeout): # {{{ try: @@ -77,7 +73,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ id_url = entry_id(entry_)[0].text - google_id = id_url.split('/')[-1] + douban_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() authors = [x.text.strip() for x in creator(entry_) if x.text] if not authors: @@ -87,7 +83,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ return None mi = Metadata(title_, authors) - mi.identifiers = {'google':google_id} + mi.identifiers = {'douban':douban_id} try: raw = get_details(browser, id_url, timeout) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), @@ -103,13 +99,9 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # 
ISBN isbns = [] - for x in identifier(extra): - t = str(x.text).strip() - if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): - if t[:5].upper() == 'ISBN:': - t = check_isbn(t[5:]) - if t: - isbns.append(t) + for x in [t.text for t in isbn(extra)]: + if check_isbn(x): + isbns.append(x) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns @@ -139,21 +131,23 @@ def to_metadata(browser, log, entry_, timeout): # {{{ log.error('Failed to parse pubdate %r'%pubdate) # Ratings - for x in rating(extra): + if rating(extra): try: - mi.rating = float(x.get('average')) - if mi.rating > 5: - mi.rating /= 2 + mi.rating = float(rating(extra).text) / 2.0 except: log.exception('Failed to parse rating') + mi.rating = 0 # Cover - mi.has_google_cover = None - for x in extra.xpath( - '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'): - mi.has_google_cover = x.get('href') - break - + mi.has_douban_cover = None + u = cover_url(extra) + print(u) + if u: + u = u[0].replace('/spic/', '/lpic/'); + print(u) + # If URL contains "book-default", the book doesn't have a cover + if u.find('book-default') == -1: + mi.has_douban_cover = u return mi # }}} @@ -172,6 +166,7 @@ class Douban(Source): cached_cover_url_is_reliable = True DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' + DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s' # GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1' # DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657']) @@ -179,7 +174,7 @@ class Douban(Source): def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: - return db + return DOUBAN_ID_URL % db else: return None # }}} @@ -206,11 +201,11 @@ class Douban(Source): q += ((' ' if q != '' else '') + build_term('author', author_tokens)) t = 'search' + q = q.strip() if isinstance(q, unicode): q = q.encode('utf-8') if not q: return None - print(q) url = None if t == "isbn": url = ISBN_URL + q @@ -220,7 +215,6 @@ class Douban(Source): }) if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': url = url + "?apikey=" + self.DOUBAN_API_KEY - print(url) return url # }}} @@ -257,10 +251,7 @@ class Douban(Source): try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: - if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5: - log.warning('Google returned a dummy image, ignoring') - else: - result_queue.put((self, cdata)) + result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) @@ -268,13 +259,13 @@ class Douban(Source): def get_cached_cover_url(self, identifiers): # {{{ url = None - goog = identifiers.get('google', None) - if goog is None: + db = identifiers.get('douban', None) + if db is None: isbn = identifiers.get('isbn', None) if isbn is not None: - goog = self.cached_isbn_to_identifier(isbn) - if goog is not None: - url = self.cached_identifier_to_cover_url(goog) + db = self.cached_isbn_to_identifier(isbn) + if db is not None: + url = self.cached_identifier_to_cover_url(db) return url # }}} @@ -286,12 +277,12 @@ class Douban(Source): ans = to_metadata(br, log, i, timeout) if isinstance(ans, Metadata): ans.source_relevance = relevance - goog = ans.identifiers['google'] + db = ans.identifiers['douban'] for isbn in getattr(ans, 'all_isbns', []): - self.cache_isbn_to_identifier(isbn, goog) - if ans.has_google_cover: - self.cache_identifier_to_cover_url(goog, - self.GOOGLE_COVER%goog) + self.cache_isbn_to_identifier(isbn, db) + if ans.has_douban_cover: + 
self.cache_identifier_to_cover_url(db, + ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: @@ -315,7 +306,6 @@ class Douban(Source): except Exception as e: log.exception('Failed to make identify query: %r'%query) return as_unicode(e) - try: parser = etree.XMLParser(recover=True, no_network=True) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), @@ -324,7 +314,8 @@ class Douban(Source): except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) - + if not title: + title = "" if not entries and identifiers and title and authors and \ not abort.is_set(): return self.identify(log, result_queue, abort, title=title, From ff6043ce0f0659edce0c05e7e669f5e9c106ea96 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 30 Apr 2011 08:44:30 -0400 Subject: [PATCH 05/25] ... --- src/calibre/ebooks/textile/functions.py | 12 +- src/calibre/ebooks/txt/textileml.py | 202 ++++++++++++++---------- 2 files changed, 123 insertions(+), 91 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 39f793face..dd1914cf9f 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -225,8 +225,8 @@ class Textile(object): (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
<hr />'), # <hr />
scene-break - (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'(\s)--(\s)'), r'\1—\2'), # em dash +# (re.compile(r'\b--\b'), r'—'), # em dash + (re.compile(r'([^-])--([^-])'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -868,7 +868,7 @@ class Textile(object): >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye") 'hello span strong and bold goodbye' """ - qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') + qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^', r'&') pnct = ".,\"'?!;:" for qtag in qtags: @@ -900,7 +900,9 @@ class Textile(object): '%' : 'span', '+' : 'ins', '~' : 'sub', - '^' : 'sup' + '^' : 'sup', + '&' : 'span style="font-variant:small-caps;"' +# '&' : 'span style="font-transform:uppercase;font-size:smaller;"' } tag = qtags[tag] atts = self.pba(atts) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 9a025e0aef..42b709a681 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -28,15 +28,18 @@ class TextileMLizer(OEB2HTML): self.in_table = False self.links = {} self.list = [] + self.our_links = [] + self.our_ids = [] self.images = {} + self.remove_space_after_newline = False self.base_hrefs = [item.href for item in oeb_book.spine] self.map_resources(oeb_book) - self.style_bold = False - self.style_italic = False - self.style_under = False - self.style_strike = False - self.style_smallcap = False +# self.style_bold = False +# self.style_italic = False +# self.style_under = False +# self.style_strike = False +# self.style_smallcap = False txt = self.mlize_spine(oeb_book) txt = unsmarten(txt) @@ -58,42 +61,41 @@ class TextileMLizer(OEB2HTML): return ''.join(output) def tidy_up(self, text): -# def check_count(text, tests): -# x = [] -# for i, t in enumerate(reversed(tests)): -# x.append((text.count(t), i, t)) -# if x: -# return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] -# return '' - - # Needs tweaking and finetuning - don't use yet. + # Needs tweaking and finetuning def check_escaping(text, tests): for t in tests: text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) -# text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) -# text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) -# text = re.sub(r'(["\'])\[('+t+'\w+'+t+')\]', r'\1\2', text) -# text = re.sub(r'\[('+t+'\w+'+t+')\](["\',\.!\?])', r'\1\2', text) + text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) + text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text -# txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) -# text = re.sub(txt+'(\S)', r'\n\1', text) - - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) + # Note - I'm not checking for escaped '-' as this will also get hypenated words + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = re.sub(r' +\n', r'\n', text) text = re.sub(r'^\n+', r'', text) - text = re.sub(r'\npre\. bc\.', r'\nbc.', text) - text = re.sub(r'\nbq\. \n\np\. ', r'\nbq. ', text) + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text) text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) text = re.sub(r'\n{3}', r'\n\n', text) text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) text = re.sub(r'p.*\. \n\n', r'', text) -# text = re.sub(u'\n \n', r'\n
\n', text) # blank paragraph - br tag text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph text = re.sub(r' \|', r'|', text) + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + + # Now tidyup links and ids - remove ones that don't have a correponding opposite + if self.opts.keep_links: + for i in self.our_links: + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i, '\1', text) + for i in self.our_ids: + if i not in self.our_links: + text = re.sub(r'\('+i+'\)', '', text) # started work on trying to fix footnotes -# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) +# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text def remove_newlines(self, text): @@ -102,16 +104,30 @@ class TextileMLizer(OEB2HTML): text = text.replace('\r', ' ') # Condense redundant spaces created by replacing newlines with spaces. text = re.sub(r'[ ]{2,}', ' ', text) - text = re.sub(r'\t +', '', text) -# text = re.sub(r'\n +', '', text) + text = re.sub(r'\t+', '', text) + if self.remove_space_after_newline == True: + text = re.sub(r'^ +', '', text) + self.remove_space_after_newline = False return text - def remove_leading_ws(self, text): - text = text.replace('\r\n', '\n') - text = text.replace('\r', '\n') - text = re.sub(r'\n[\t ]+', '\n', text) - text = re.sub(r'\n{2,}', '\n', text) - return text +# def remove_leading_ws(self, text): +# text = text.replace('\r\n', '\n') +# text = text.replace('\r', '\n') +# text = re.sub(r'\n[\t ]+', '\n', text) +# text = re.sub(r'\n{2,}', '\n', text) +# return text + + def check_styles(self, style): + txt = '{' +# style_string = '%s;' % style +# txt += style_string + if style['color'] and style['color'] != 'black': + txt += 'color:'+style['color']+';' +# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'): +# txt += 'font-size: %d;' % style['font-size'] + txt += '}' + if txt == '{}': txt = '' + return txt def check_halign(self, style): tests = {'left':'<','justify':'<>','center':'=','right':'>'} @@ -140,18 +156,18 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' - if attribs.has_key('id'): - #if attribs['id'] in self.links: - txt = '(#'+attribs['id']+')' + if attribs.has_key('id'): # and attribs['id'] in self.links.values(): + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) return txt - def build_block(self, tag, style, attribs, finish): + def build_block(self, tag, style, attribs): txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) txt += self.check_halign(style) - txt += finish + txt += self.check_styles(style) return txt def dump_text(self, elem, stylizer, page, tag_stack=[]): @@ -175,38 +191,35 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': return [''] - # Soft scene breaks. - text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): - #For debugging - if tag == 'h1': - for i in self.links: - text.append(i) - text.append('\n') if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs, '. ')) + text.append(self.build_block(tag, style, attribs)) + text.append('. 
') tags.append('\n') - if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): - if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): - if self.style_bold == False: - text.append('*') - tags.append('*') - self.style_bold = True if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: text.append('_') +# text.append('from '+tag) tags.append('_') self.style_italic = True + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + style_string = '%s;' % style + text.append(style_string) + if self.style_bold == False: + text.append('*') +# text.append('from '+tag) + tags.append('*') + self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: @@ -218,16 +231,12 @@ class TextileMLizer(OEB2HTML): text.append('-') tags.append('-') self.style_strike = True - if style['font-variant'] == 'small-caps': - if self.style_smallcap == False: - text.append('&') - tags.append('&') - self.style_smallcap = True if tag == 'br': text.append('') tags.append('\n') + self.remove_space_after_newline = True elif tag == 'blockquote': - text.append('bq. ') + text.append('\nbq. ') tags.append('\n') elif tag in ('abbr', 'acronym'): text.append('') @@ -241,8 +250,8 @@ class TextileMLizer(OEB2HTML): tags.append('~') elif tag == 'code': if self.in_pre: - text.append('bc. ') - tags.append('\n') + text.append('\nbc. ') + tags.append('') else: text.append('@') tags.append('@') @@ -254,12 +263,14 @@ class TextileMLizer(OEB2HTML): tags.append('\n') elif tag == 'pre': self.in_pre = True - text.append('pre. ') - tags.append('pre') + text.append('\npre. ') + tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: text.append ('"') - tags.append('":' + attribs['href']) + if attribs.has_key('href'): + tags.append('":' + attribs['href']) + self.our_links.append(attribs['href']) if attribs.has_key('title'): tags.append('(' + attribs['title'] + ')') elif tag == 'img': @@ -275,14 +286,15 @@ class TextileMLizer(OEB2HTML): tags.append('!') elif tag in ('ol', 'ul'): self.list.append({'name':tag, 'num':0}) - text.append('') + text.append('\n') tags.append(tag) elif tag == 'li': -# text.append('\n') if self.list: li = self.list[-1] else: li = {'name':'ul', 'num':0} + text.append('\n') if li['name'] == 'ul': text.append('*'*len(self.list)+' ') elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + tags.append('\n') elif tag == 'dl': text.append('\n') tags.append('') @@ -298,6 +310,7 @@ class TextileMLizer(OEB2HTML): elif tag == 'table': self.in_table = True text.append('') + tags.append('') tags.append('table') elif tag == 'tr': text.append('') @@ -315,18 +328,33 @@ class TextileMLizer(OEB2HTML): text.append(txt+'. ') tags.append('') elif tag == 'th': - text.append('|_. ') + text.append('|_') + + text.append('. 
') tags.append('') + elif tag == 'span': + if style['font-variant'] == 'small-caps': + if self.style_smallcap == False: + text.append('&') + tags.append('&') + self.style_smallcap = True + else: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') if self.opts.keep_links and attribs.has_key('id'): - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - if tag == 'span': - text.append(' %') - tags.append('% ') - text.append('(#' + attribs['id'] + u')\xa0') - - # If wanted process all style tags here - before taxt in tags is written + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'): + text.append(self.check_id_tag(attribs)) + # Process the styles for any that we want to keep + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'): + text.append(self.check_styles(style)) + # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text @@ -351,21 +379,23 @@ class TextileMLizer(OEB2HTML): self.in_table = False if tag in ('ul', 'ol'): if self.list: self.list.pop() + if not self.list: text.append('\n') else: text.append('%s' % t) - if t == '*': - self.style_bold = False - if t == '_': - self.style_italic = False - if t == '+': - self.style_under = False - if t == '-': - self.style_strike = False - if t == '&': - self.style_smallcap = False + if t == '*': self.style_bold = False + if t == '_': self.style_italic = False + if t == '+': self.style_under = False + if t == '-': self.style_strike = False + if t == '&': self.style_smallcap = False # Soft scene breaks. text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) +# try: +# ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) +# if ems >= 1: +# text.append('\n' * ems) +# except: +# pass # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: From 05331d7f05de3ed3010a63b5c0d754452ee23782 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 30 Apr 2011 09:43:09 -0400 Subject: [PATCH 06/25] TXT: Textile changes. --- src/calibre/ebooks/txt/processor.py | 2 + src/calibre/ebooks/txt/textileml.py | 231 ++++++++++++++++------------ 2 files changed, 135 insertions(+), 98 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 7e161f63bd..54369190de 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -242,6 +242,8 @@ def detect_formatting_type(txt): textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt)) # Links textile_count += len(re.findall(r'"[^"]*":\S+', txt)) + # paragraph blocks + textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt)) # Decide if either markdown or textile is used in the text # based on the number of unique formatting elements found. 
diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 42b709a681..622ff8d2e3 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en' ''' Transform OEB content into Textile formatted plain text ''' - import re from functools import partial @@ -16,8 +15,6 @@ from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.txt.unsmarten import unsmarten -from operator import itemgetter - class TextileMLizer(OEB2HTML): @@ -29,17 +26,20 @@ class TextileMLizer(OEB2HTML): self.links = {} self.list = [] self.our_links = [] + self.in_a_link = False self.our_ids = [] self.images = {} + self.id_no_text = u'' + self.style_embed = [] self.remove_space_after_newline = False self.base_hrefs = [item.href for item in oeb_book.spine] self.map_resources(oeb_book) -# self.style_bold = False -# self.style_italic = False -# self.style_under = False -# self.style_strike = False -# self.style_smallcap = False + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False txt = self.mlize_spine(oeb_book) txt = unsmarten(txt) @@ -56,7 +56,7 @@ class TextileMLizer(OEB2HTML): self.rewrite_ids(item.data, item) rewrite_links(item.data, partial(self.rewrite_link, page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) output.append('\n\n') return ''.join(output) @@ -64,36 +64,47 @@ class TextileMLizer(OEB2HTML): # Needs tweaking and finetuning def check_escaping(text, tests): for t in tests: - text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) + # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged + txt = '%s' % t + self.log.debug('DEBUG: ' + txt) + if txt != '%': + text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text - # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) - - text = re.sub(r' +\n', r'\n', text) - text = re.sub(r'^\n+', r'', text) - text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) - text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text) - text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) - text = re.sub(r'\n{3}', r'\n\n', text) - text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) - text = re.sub(r'p.*\. \n\n', r'', text) - text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph - text = re.sub(r' \|', r'|', text) - # Now put back spaces removed earlier as they're needed here - text = re.sub(r'\np\.\n', r'\np. 
\n', text) - # Now tidyup links and ids - remove ones that don't have a correponding opposite if self.opts.keep_links: for i in self.our_links: - if i not in self.our_ids: - text = re.sub(r'"(.+)":'+i, '\1', text) + if i[0] == '#': + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i, '\1', text) for i in self.our_ids: if i not in self.our_links: text = re.sub(r'\('+i+'\)', '', text) + + # Note - I'm not checking for escaped '-' as this will also get hypenated words + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = re.sub(r'%\xa0+', r'%', text) #remove empty spans + text = re.sub(r'%%', r'', text) #remove empty spans + text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output + text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline + text = re.sub(r'^\n+', r'', text) #remove newlines at top of file + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras +# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para + text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines + text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) + text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) + text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph + text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines + # started work on trying to fix footnotes # text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text @@ -110,21 +121,15 @@ class TextileMLizer(OEB2HTML): self.remove_space_after_newline = False return text -# def remove_leading_ws(self, text): -# text = text.replace('\r\n', '\n') -# text = text.replace('\r', '\n') -# text = re.sub(r'\n[\t ]+', '\n', text) -# text = re.sub(r'\n{2,}', '\n', text) -# return text - def check_styles(self, style): txt = '{' -# style_string = '%s;' % style -# txt += style_string if style['color'] and style['color'] != 'black': txt += 'color:'+style['color']+';' -# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'): -# txt += 'font-size: %d;' % style['font-size'] + try: + if style['background']: + txt += 'background:'+style['background']+';' + except: + pass txt += '}' if txt == '{}': txt = '' return txt @@ -137,7 +142,7 @@ class TextileMLizer(OEB2HTML): return '' def check_valign(self, style): - tests = {'top':'^','bottom':'~', 'middle':'-'} + tests = {'top':'^','bottom':'~'} #, 'middle':'-'} for i in tests: if style['vertical-align'] == i: return tests[i] @@ -157,8 +162,9 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' if attribs.has_key('id'): # and attribs['id'] in self.links.values(): - txt = '(#'+attribs['id']+ ')' - self.our_ids.append('#'+attribs['id']) + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) + self.id_no_text = u'\xa0' return txt def build_block(self, tag, style, attribs): @@ -170,7 +176,7 @@ class TextileMLizer(OEB2HTML): txt += self.check_styles(style) return txt - def dump_text(self, elem, stylizer, page, tag_stack=[]): + def dump_text(self, elem, stylizer): ''' @elem: The element in the etree that we are working on. @stylizer: The style information attached to the element. 
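
# Sketch only, not the calibre code: a plain dict stands in for the Stylizer
# output so the block-signature assembly done by build_block() above can be seen
# in isolation. The '=' centre sign is assumed from check_halign() (not shown in
# this hunk), and the trailing '. ' is what dump_text() appends after build_block().
def sketch_block(tag, attribs, style, keep_links=True):
    txt = '\n' + tag
    if keep_links and 'id' in attribs:
        txt += '(#' + attribs['id'] + ')'           # check_id_tag()
    if style.get('text-align') == 'center':
        txt += '='                                  # assumed centre sign
    if style.get('color') and style['color'] != 'black':
        txt += '{color:' + style['color'] + ';}'    # check_styles()
    return txt + '. '

print repr(sketch_block('h2', {'id': 'chap1'}, {'text-align': 'center', 'color': 'gray'}))
# -> '\nh2(#chap1)={color:gray;}. '
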
@@ -197,45 +203,59 @@ class TextileMLizer(OEB2HTML): or style['visibility'] == 'hidden': return [''] + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs)) - text.append('. ') - tags.append('\n') + block = self.build_block(tag, style, attribs) + # Normal paragraph with no styling. + if block == '\np': + text.append('\n\n') + tags.append('\n') + else: + text.append(block) + text.append('. ') + tags.append('\n') + #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: text.append('_') -# text.append('from '+tag) tags.append('_') + self.style_embed.append ('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): - style_string = '%s;' % style - text.append(style_string) if self.style_bold == False: text.append('*') -# text.append('from '+tag) tags.append('*') + self.style_embed.append ('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: text.append('+') tags.append('+') + self.style_embed.append ('+') self.style_under = True if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): if self.style_strike == False: text.append('-') tags.append('-') + self.style_embed.append ('-') self.style_strike = True if tag == 'br': - text.append('') - tags.append('\n') + for i in reversed(self.style_embed): + text.append(i) + text.append('\n') + for i in self.style_embed: + text.append(i) + tags.append('') self.remove_space_after_newline = True - elif tag == 'blockquote': + if tag == 'blockquote': text.append('\nbq. ') tags.append('\n') elif tag in ('abbr', 'acronym'): @@ -259,7 +279,7 @@ class TextileMLizer(OEB2HTML): text.append('??') tags.append('??') elif tag == 'hr': - text.append('\n***\n') + text.append('\n***') tags.append('\n') elif tag == 'pre': self.in_pre = True @@ -267,12 +287,14 @@ class TextileMLizer(OEB2HTML): tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: - text.append ('"') + text.append('"') + tags.append('a') if attribs.has_key('href'): tags.append('":' + attribs['href']) self.our_links.append(attribs['href']) if attribs.has_key('title'): tags.append('(' + attribs['title'] + ')') + self.in_a_link = True elif tag == 'img': if self.opts.keep_image_references: txt = '!' + self.check_halign(style) @@ -286,7 +308,7 @@ class TextileMLizer(OEB2HTML): tags.append('!') elif tag in ('ol', 'ul'): self.list.append({'name':tag, 'num':0}) - text.append('\n') + text.append('') tags.append(tag) elif tag == 'li': if self.list: li = self.list[-1] @@ -294,7 +316,7 @@ class TextileMLizer(OEB2HTML): text.append('\n') if li['name'] == 'ul': text.append('*'*len(self.list)+' ') elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') - tags.append('\n') + tags.append('') elif tag == 'dl': text.append('\n') tags.append('') @@ -308,12 +330,19 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append('\n') elif tag == 'table': - self.in_table = True - text.append('') + txt = self.build_block(tag, style, attribs) + txt += '. \n' + if txt != '\ntable. 
\n': + text.append(txt) + else: + text.append('\n') tags.append('') - tags.append('table') elif tag == 'tr': - text.append('') + txt = self.build_block('', style, attribs) + txt += '. ' + if txt != '\n. ': + txt = re.sub ('\n','',txt) + text.append(txt) tags.append('|\n') elif tag == 'td': text.append('|') @@ -324,13 +353,15 @@ class TextileMLizer(OEB2HTML): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): txt += '/' + attribs['rowspan'] + try: + txt += self.check_styles(style) + except: + pass if txt != '': text.append(txt+'. ') tags.append('') elif tag == 'th': - text.append('|_') - - text.append('. ') + text.append('|_. ') tags.append('') elif tag == 'span': if style['font-variant'] == 'small-caps': @@ -339,35 +370,36 @@ class TextileMLizer(OEB2HTML): tags.append('&') self.style_smallcap = True else: - txt = '%' - if self.opts.keep_links: - txt += self.check_id_tag(attribs) - txt += self.check_styles(style) - if txt != '%': - text.append(txt) - tags.append('%') + if self.in_a_link == False: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') if self.opts.keep_links and attribs.has_key('id'): - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'): text.append(self.check_id_tag(attribs)) # Process the styles for any that we want to keep - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'): - text.append(self.check_styles(style)) + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \ + 'span', 'table', 'tr', 'td'): + if not self.in_a_link: + text.append(self.check_styles(style)) # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text if not self.in_pre: - if self.in_table: - txt = self.remove_newlines(txt) - else: - txt = self.remove_leading_ws(txt) + txt = self.remove_newlines(txt) text.append(txt) + self.id_no_text = u'' # Recurse down into tags within the tag we are in. for item in elem: - text += self.dump_text(item, stylizer, page, tag_stack+tags) + text += self.dump_text(item, stylizer) # Close all open tags. tags.reverse() @@ -375,36 +407,39 @@ class TextileMLizer(OEB2HTML): if tag in ('pre', 'ul', 'ol', 'li', 'table'): if tag == 'pre': self.in_pre = False - if tag == 'table': - self.in_table = False - if tag in ('ul', 'ol'): + elif tag in ('ul', 'ol'): if self.list: self.list.pop() if not self.list: text.append('\n') else: - text.append('%s' % t) - if t == '*': self.style_bold = False - if t == '_': self.style_italic = False - if t == '+': self.style_under = False - if t == '-': self.style_strike = False - if t == '&': self.style_smallcap = False + if t == 'a': + self.in_a_link = False + t = '' + text.append(self.id_no_text) + self.id_no_text = u'' + if t == '*': + self.style_bold = False + elif t == '_': + self.style_italic = False + elif t == '+': + self.style_under = False + elif t == '-': + self.style_strike = False + elif t == '&': + self.style_smallcap = False + if t in ('*', '_', '+', '-'): + txt = self.style_embed.pop() + text.append(txt) + else: + text.append('%s' % t) # Soft scene breaks. 
text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) -# try: -# ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) -# if ems >= 1: -# text.append('\n' * ems) -# except: -# pass # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: tail = elem.tail if not self.in_pre: - if self.in_table: - tail = self.remove_newlines(tail) - else: - tail = self.remove_leading_ws(tail) + tail = self.remove_newlines(tail) text.append(tail) return text From 8853f6c1468bebd72e360517c4117a3764f9edfe Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 1 May 2011 10:24:56 -0400 Subject: [PATCH 07/25] ... --- src/calibre/ebooks/txt/output.py | 14 ++++++++++++-- src/calibre/ebooks/txt/textileml.py | 25 ++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 261ace2f91..d9c42eb1dc 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -66,6 +66,13 @@ class TXTOutput(OutputFormatPlugin): help=_('Do not remove image references within the document. This is only ' \ 'useful when paired with a txt-output-formatting option that ' 'is not none because links are always removed with plain text output.')), + OptionRecommendation(name='keep_color', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not remove font color from output. This is only useful when ' \ + 'txt-output-formatting is set to textile. Textile is the only ' \ + 'formatting that supports setting font color. If this option is ' \ + 'not specified font color will not be set and default to the ' \ + 'color displayed by the reader (generally this is black).')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): @@ -111,9 +118,12 @@ class TXTZOutput(TXTOutput): from calibre.ebooks.oeb.base import OEB_IMAGES with TemporaryDirectory('_txtz_output') as tdir: # TXT - with TemporaryFile('index.txt') as tf: + txt_name = 'index.txt' + if opts.txt_output_formatting.lower() == 'textile': + txt_name = 'index.text' + with TemporaryFile(txt_name) as tf: TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log) - shutil.copy(tf, os.path.join(tdir, 'index.txt')) + shutil.copy(tf, os.path.join(tdir, txt_name)) # Images for item in oeb_book.manifest: diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 622ff8d2e3..1c35670596 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -98,7 +98,7 @@ class TextileMLizer(OEB2HTML): text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) - text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here @@ -176,6 +176,11 @@ class TextileMLizer(OEB2HTML): txt += self.check_styles(style) return txt + def prepare_string_for_textile(self, txt): + if re.search(r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)', txt): + return ' ==%s== ' % txt + return txt + def dump_text(self, elem, stylizer): ''' @elem: The element in the etree that we are working on. 
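
# Quick illustration of prepare_string_for_textile() just above, with made-up
# strings: any run that already carries stray Textile markup next to whitespace
# is wrapped in ==...== so the Textile reader will leave it alone.
import re
pattern = r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)'
for txt in ('2 + 2 = 4', 'an ordinary sentence'):
    if re.search(pattern, txt):
        txt = ' ==%s== ' % txt
    print repr(txt)
# ' ==2 + 2 = 4== '        (" + " and " = " trigger the escape)
# 'an ordinary sentence'   (left untouched)
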
@@ -197,7 +202,7 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': @@ -209,15 +214,9 @@ class TextileMLizer(OEB2HTML): if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - block = self.build_block(tag, style, attribs) - # Normal paragraph with no styling. - if block == '\np': - text.append('\n\n') - tags.append('\n') - else: - text.append(block) - text.append('. ') - tags.append('\n') + text.append(self.build_block(tag, style, attribs)) + text.append('. ') + tags.append('\n') #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): @@ -393,7 +392,7 @@ class TextileMLizer(OEB2HTML): if hasattr(elem, 'text') and elem.text: txt = elem.text if not self.in_pre: - txt = self.remove_newlines(txt) + txt = self.prepare_string_for_textile(self.remove_newlines(txt)) text.append(txt) self.id_no_text = u'' @@ -439,7 +438,7 @@ class TextileMLizer(OEB2HTML): if hasattr(elem, 'tail') and elem.tail: tail = elem.tail if not self.in_pre: - tail = self.remove_newlines(tail) + tail = self.prepare_string_for_textile(self.remove_newlines(tail)) text.append(tail) return text From 4bdbab22ca6e8818b76e0ae98ec30094dd00622d Mon Sep 17 00:00:00 2001 From: Li Fanxi Date: Sun, 8 May 2011 22:28:47 +0800 Subject: [PATCH 08/25] Finish the Douban.com books metadata source plugin --- src/calibre/ebooks/metadata/sources/douban.py | 55 +++++++++---------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index 8f1794b33f..7a8619261b 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -40,8 +40,8 @@ publisher = XPath("descendant::db:attribute[@name='publisher']") isbn = XPath("descendant::db:attribute[@name='isbn13']") date = XPath("descendant::db:attribute[@name='pubdate']") creator = XPath("descendant::db:attribute[@name='author']") -tag = XPath("descendant::db:tag") -rating = XPath("descendant::gd:rating[@name='average']") +booktag = XPath("descendant::db:tag/attribute::name") +rating = XPath("descendant::gd:rating/attribute::average") cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") def get_details(browser, url, timeout): # {{{ @@ -51,7 +51,7 @@ def get_details(browser, url, timeout): # {{{ gc = getattr(e, 'getcode', lambda : -1) if gc() != 403: raise - # Google is throttling us, wait a little + # Douban is throttling us, wait a little time.sleep(2) raw = browser.open_novisit(url, timeout=timeout).read() @@ -59,7 +59,6 @@ def get_details(browser, url, timeout): # {{{ # }}} def to_metadata(browser, log, entry_, timeout): # {{{ - def get_text(extra, x): try: ans = x(extra) @@ -71,7 +70,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{ log.exception('Programming error:') return None - id_url = entry_id(entry_)[0].text douban_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() @@ -92,9 +90,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ except: log.exception('Failed to get additional details for', mi.title) return mi - mi.comments = get_text(extra, description) - #mi.language = get_text(extra, language) mi.publisher = get_text(extra, publisher) # ISBN @@ -108,7 +104,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # 
Tags try: - btags = [x.text for x in subject(extra) if x.text] + btags = [x for x in booktag(extra) if x] tags = [] for t in btags: atags = [y.strip() for y in t.split('/')] @@ -120,7 +116,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ tags = [] if tags: mi.tags = [x.replace(',', ';') for x in tags] - + # pubdate pubdate = get_text(extra, date) if pubdate: @@ -133,7 +129,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # Ratings if rating(extra): try: - mi.rating = float(rating(extra).text) / 2.0 + mi.rating = float(rating(extra)[0]) / 2.0 except: log.exception('Failed to parse rating') mi.rating = 0 @@ -141,10 +137,8 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # Cover mi.has_douban_cover = None u = cover_url(extra) - print(u) if u: u = u[0].replace('/spic/', '/lpic/'); - print(u) # If URL contains "book-default", the book doesn't have a cover if u.find('book-default') == -1: mi.has_douban_cover = u @@ -155,26 +149,24 @@ class Douban(Source): name = 'Douban Books' author = _('Li Fanxi') + version = (2, 0, 0) description = _('Downloads metadata from Douban.com') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'tags', - 'comments', 'publisher', 'identifier:isbn', 'rating', + 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', 'identifier:douban']) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' - DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s' -# GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1' - -# DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657']) + DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/' def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: - return DOUBAN_ID_URL % db + return ('douban', db, self.DOUBAN_BOOK_URL%db) else: return None # }}} @@ -182,13 +174,18 @@ class Douban(Source): def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ SEARCH_URL = 'http://api.douban.com/book/subjects?' 
ISBN_URL = 'http://api.douban.com/book/subject/isbn/' + SUBJECT_URL = 'http://api.douban.com/book/subject/' q = '' t = None isbn = check_isbn(identifiers.get('isbn', None)) + subject = identifiers.get('douban', None) if isbn is not None: q = isbn t = 'isbn' + elif subject is not None: + q = subject + t = 'subject' elif title or authors: def build_term(prefix, parts): return ' '.join(x for x in parts) @@ -209,6 +206,8 @@ class Douban(Source): url = None if t == "isbn": url = ISBN_URL + q + elif t == 'subject': + url = SUBJECT_URL + q else: url = SEARCH_URL + urlencode({ 'q': q, @@ -314,14 +313,12 @@ class Douban(Source): except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) - if not title: - title = "" if not entries and identifiers and title and authors and \ not abort.is_set(): return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) - # There is no point running these queries in threads as google + # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout) @@ -329,23 +326,23 @@ class Douban(Source): # }}} if __name__ == '__main__': # tests {{{ - # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py + # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py from calibre.ebooks.metadata.sources.test import (test_identify_plugin, title_test, authors_test) - test_identify_plugin(GoogleBooks.name, + test_identify_plugin(Douban.name, [ ( - {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby', - 'authors':['Fitzgerald']}, - [title_test('The great gatsby', exact=True), - authors_test(['Francis Scott Fitzgerald'])] + {'identifiers':{'isbn': '9787536692930'}, 'title':'三体', + 'authors':['刘慈欣']}, + [title_test('三体', exact=True), + authors_test(['刘慈欣'])] ), ( - {'title': 'Flatland', 'authors':['Abbott']}, - [title_test('Flatland', exact=False)] + {'title': 'Linux内核修炼之道', 'authors':['任桥伟']}, + [title_test('Linux内核修炼之道', exact=False)] ), ]) # }}} From 803c0449b9b2d7e479658e03f555c215eacad026 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 8 May 2011 16:01:36 -0400 Subject: [PATCH 09/25] ... --- src/calibre/customize/profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 5c29f1e79b..de82aaffa1 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -253,7 +253,7 @@ class OutputProfile(Plugin): periodical_date_in_title = True #: Characters used in jackets and catalogs - missing_char = u'x' + missing_char = u'x' ratings_char = u'*' empty_ratings_char = u' ' read_char = u'+' From 5ac915b416c49189606311a2524d59d5a3f8feeb Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 8 May 2011 16:39:45 -0400 Subject: [PATCH 10/25] Leigh's latest changes. 
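
The textileml.py diff below reworks check_padding(): left and right margin plus
padding are converted to points with unit_convert(), divided by the output
profile's base font size, and each whole em becomes one '(' or ')' on the block
signature. A sketch of that arithmetic with invented numbers (24pt of combined
left indent on a 12pt base font):

    left_pts, fbase = 24.0, 12
    emleft = int(round(left_pts / fbase))   # 2
    prefix = '(' * emleft                   # '((' -> a block such as "p((. indented text"
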
--- src/calibre/ebooks/txt/textileml.py | 106 ++++++++++++++++------------ 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 1c35670596..2f04c4676b 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -14,6 +14,7 @@ from functools import partial from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks import unit_convert from calibre.ebooks.txt.unsmarten import unsmarten class TextileMLizer(OEB2HTML): @@ -55,20 +56,19 @@ class TextileMLizer(OEB2HTML): self.log.debug('Converting %s to Textile formatted TXT...' % item.href) self.rewrite_ids(item.data, item) rewrite_links(item.data, partial(self.rewrite_link, page=item)) - stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output.append('\n\n') return ''.join(output) def tidy_up(self, text): - # Needs tweaking and finetuning + # May need tweaking and finetuning def check_escaping(text, tests): for t in tests: # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged txt = '%s' % t - self.log.debug('DEBUG: ' + txt) if txt != '%': - text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text @@ -87,26 +87,26 @@ class TextileMLizer(OEB2HTML): text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) text = re.sub(r'%\xa0+', r'%', text) #remove empty spans - text = re.sub(r'%%', r'', text) #remove empty spans + text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline text = re.sub(r'^\n+', r'', text) #remove newlines at top of file text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras -# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) - text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph + text = re.sub(u'\np[<>=]{0,2}\. \xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. 
\n', text) text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - - # started work on trying to fix footnotes -# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) + return text def remove_newlines(self, text): @@ -123,13 +123,11 @@ class TextileMLizer(OEB2HTML): def check_styles(self, style): txt = '{' - if style['color'] and style['color'] != 'black': - txt += 'color:'+style['color']+';' - try: - if style['background']: + if self.opts.keep_color: + if 'color' in style.cssdict() and style['color'] != 'black': + txt += 'color:'+style['color']+';' + if 'background' in style.cssdict(): txt += 'background:'+style['background']+';' - except: - pass txt += '}' if txt == '{}': txt = '' return txt @@ -148,30 +146,44 @@ class TextileMLizer(OEB2HTML): return tests[i] return '' - def check_padding(self, style, tests): + def check_padding(self, style, stylizer): txt = '' - for i in tests: - try: - ems = int(round(float(style[i[0]] / style['font-size']))) - if ems >=1: - txt += i[1] * ems - except: - pass + left_padding_pts = 0 + left_margin_pts = 0 + if 'padding-left' in style.cssdict() and style['padding-left'] != 'auto': + left_padding_pts = unit_convert(style['padding-left'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-left' in style.cssdict() and style['margin-left'] != 'auto': + left_margin_pts = unit_convert(style['margin-left'], style.width, style.fontSize, stylizer.profile.dpi) + left = left_margin_pts + left_padding_pts + emleft = int(round(left / stylizer.profile.fbase)) + if emleft >= 1: + txt += '(' * emleft + right_padding_pts = 0 + right_margin_pts = 0 + if 'padding-right' in style.cssdict() and style['padding-right'] != 'auto': + right_padding_pts = unit_convert(style['padding-right'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-right' in style.cssdict() and style['margin-right'] != 'auto': + right_margin_pts = unit_convert(style['margin-right'], style.width, style.fontSize, stylizer.profile.dpi) + right = right_margin_pts + right_padding_pts + emright = int(round(right / stylizer.profile.fbase)) + if emright >= 1: + txt += ')' * emright + return txt def check_id_tag(self, attribs): txt = '' - if attribs.has_key('id'): # and attribs['id'] in self.links.values(): + if attribs.has_key('id'): txt = '(#'+attribs['id']+ ')' self.our_ids.append('#'+attribs['id']) self.id_no_text = u'\xa0' return txt - def build_block(self, tag, style, attribs): + def build_block(self, tag, style, attribs, stylizer): txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) - txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) + txt += self.check_padding(style, stylizer) txt += self.check_halign(style) txt += self.check_styles(style) return txt @@ -202,22 +214,24 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': return [''] # Soft scene breaks. - text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) - + if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto': + ems = int(round(float(style.marginTop) / style.fontSize) - 1) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs)) + text.append(self.build_block(tag, style, attribs, stylizer)) text.append('. 
') tags.append('\n') - #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): @@ -306,15 +320,17 @@ class TextileMLizer(OEB2HTML): text.append('(' + txt + ')') tags.append('!') elif tag in ('ol', 'ul'): - self.list.append({'name':tag, 'num':0}) + self.list.append({'name': tag, 'num': 0}) text.append('') tags.append(tag) elif tag == 'li': if self.list: li = self.list[-1] - else: li = {'name':'ul', 'num':0} + else: li = {'name': 'ul', 'num': 0} text.append('\n') - if li['name'] == 'ul': text.append('*'*len(self.list)+' ') - elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + if li['name'] == 'ul': + text.append('*' * len(self.list) + ' ') + elif li['name'] == 'ol': + text.append('#' * len(self.list) + ' ') tags.append('') elif tag == 'dl': text.append('\n') @@ -329,7 +345,7 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append('\n') elif tag == 'table': - txt = self.build_block(tag, style, attribs) + txt = self.build_block(tag, style, attribs, stylizer) txt += '. \n' if txt != '\ntable. \n': text.append(txt) @@ -337,10 +353,10 @@ class TextileMLizer(OEB2HTML): text.append('\n') tags.append('') elif tag == 'tr': - txt = self.build_block('', style, attribs) + txt = self.build_block('', style, attribs, stylizer) txt += '. ' if txt != '\n. ': - txt = re.sub ('\n','',txt) + txt = re.sub ('\n', '', txt) text.append(txt) tags.append('|\n') elif tag == 'td': @@ -352,12 +368,9 @@ class TextileMLizer(OEB2HTML): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): txt += '/' + attribs['rowspan'] - try: - txt += self.check_styles(style) - except: - pass + txt += self.check_styles(style) if txt != '': - text.append(txt+'. ') + text.append(txt + '. ') tags.append('') elif tag == 'th': text.append('|_. ') @@ -432,7 +445,10 @@ class TextileMLizer(OEB2HTML): text.append('%s' % t) # Soft scene breaks. - text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) + if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': + ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) + if ems >=1: + text.append(u'\n\n\xa0' * ems) # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: From d6ec680ebbbadc659a09105d66aaa60299ac1be9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 06:43:19 -0400 Subject: [PATCH 11/25] Leigh's latest changes. 
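
Two small changes follow: in the Textile reader (functions.py) glyph substitution
now runs before link processing, and in the writer's check_escaping() the
bracketing rules accept quotes, punctuation and spaces inside a marked run.
Roughly what the widened '*' rule now catches (sample string invented):

    import re
    s = "a no-space*bold run, with punctuation!*next"
    print re.sub(r'([a-zA-Z0-9\'"\-])(\*[a-zA-Z0-9\'"!? ,.\-]+\*)', r'\1[\2]', s)
    # a no-space[*bold run, with punctuation!*]next
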
--- src/calibre/ebooks/textile/functions.py | 2 +- src/calibre/ebooks/txt/textileml.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index dd1914cf9f..b186e79ad4 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -792,6 +792,7 @@ class Textile(object): text = self.noTextile(text) text = self.code(text) + text = self.glyphs(text) text = self.links(text) if not self.noimage: @@ -803,7 +804,6 @@ class Textile(object): text = self.span(text) text = self.footnoteRef(text) - text = self.glyphs(text) return text.rstrip('\n') diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 2f04c4676b..082332ffd8 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -69,8 +69,8 @@ class TextileMLizer(OEB2HTML): txt = '%s' % t if txt != '%': text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) - text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) + text = re.sub(r'([a-zA-Z0-9\'"\-])('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')', r'\1[\2]', text) + text = re.sub(r'('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')([a-zA-Z0-9\'"!?\-])', r'[\1]\2', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite From c384188057639b42e5e10c142f6e1425f94d09ba Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 06:47:45 -0400 Subject: [PATCH 12/25] Leigh's latest changes. --- src/calibre/ebooks/txt/textileml.py | 58 +++++++++++++++-------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 082332ffd8..31c118251d 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -68,9 +68,8 @@ class TextileMLizer(OEB2HTML): # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged txt = '%s' % t if txt != '%': - text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'([a-zA-Z0-9\'"\-])('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')', r'\1[\2]', text) - text = re.sub(r'('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')([a-zA-Z0-9\'"!?\-])', r'[\1]\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_])\[('+t+'[a-zA-Z0-9 \',.*_]+'+t+')\](\s|[*_])', r'\1\2\3', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite @@ -84,7 +83,8 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'\('+i+'\)', '', text) # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = check_escaping(text, ['\*', '_', '\+', '-']) +# text = check_escaping(text, ['\*', '_', '\+', '-']) text = re.sub(r'%\xa0+', r'%', text) #remove empty spans text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? @@ -93,20 +93,24 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'^\n+', r'', text) #remove newlines at top of file text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras +# text = re.sub(r'\n{4,}', r'\n\np. 
\n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines +# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{0,2}\. \xa0', r'\np. ', text) # blank paragraph - text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. \n', text) text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - + + # started work on trying to fix footnotes +# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text def remove_newlines(self, text): @@ -236,29 +240,29 @@ class TextileMLizer(OEB2HTML): if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: - text.append('_') - tags.append('_') - self.style_embed.append ('_') + text.append('[_') + tags.append('_]') + self.style_embed.append('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): if self.style_bold == False: - text.append('*') - tags.append('*') - self.style_embed.append ('*') + text.append('[*') + tags.append('*]') + self.style_embed.append('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: - text.append('+') - tags.append('+') - self.style_embed.append ('+') + text.append('[+') + tags.append('+]') + self.style_embed.append('+') self.style_under = True if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): if self.style_strike == False: - text.append('-') - tags.append('-') - self.style_embed.append ('-') + text.append('[-') + tags.append('-]') + self.style_embed.append('-') self.style_strike = True if tag == 'br': for i in reversed(self.style_embed): @@ -428,26 +432,24 @@ class TextileMLizer(OEB2HTML): t = '' text.append(self.id_no_text) self.id_no_text = u'' - if t == '*': + if t == '*]': self.style_bold = False - elif t == '_': + elif t == '_]': self.style_italic = False - elif t == '+': + elif t == '+]': self.style_under = False - elif t == '-': + elif t == '-]': self.style_strike = False elif t == '&': self.style_smallcap = False - if t in ('*', '_', '+', '-'): + if t in ('*]', '_]', '+]', '-]'): txt = self.style_embed.pop() - text.append(txt) - else: - text.append('%s' % t) + text.append('%s' % t) # Soft scene breaks. if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) - if ems >=1: + if ems >= 1: text.append(u'\n\n\xa0' * ems) # Add the text that is outside of the tag. From 842ba755575c108fc0c8ab93cac383185776f212 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 21:19:28 -0400 Subject: [PATCH 13/25] More changes. 
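
Among the glyph rules touched below, the prime-mark substitutions now require
whitespace after the quote character, so only a measurement followed by a space
is converted. A quick check of the new pattern with invented strings:

    import re
    prime = re.compile(r'(\d+)\'(\s)', re.I)
    print prime.sub(r'\1′\2', "a 9' cable and plug")   # -> a 9′ cable and plug
    print prime.sub(r'\1′\2', "don't")                 # no digit before ', untouched
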
--- src/calibre/ebooks/textile/functions.py | 34 +++++++++----- src/calibre/ebooks/txt/textileml.py | 59 ++++++++++++++++--------- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index b186e79ad4..0e1811f195 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -219,14 +219,13 @@ class Textile(object): ] glyph_defaults = [ (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign - (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime - (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime + (re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
<hr />'), #
scene-break -# (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'([^-])--([^-])'), r'\1—\2'), # em dash + (re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -706,6 +705,21 @@ class Textile(object): result.append(line) return ''.join(result) + def glyphs_only(self, text): + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + for s, r in rules: + line = s.sub(r, line) + result.append(line) + return ''.join(result) + def vAlign(self, input): d = {'^':'top', '-':'middle', '~':'bottom'} return d.get(input, '') @@ -792,7 +806,6 @@ class Textile(object): text = self.noTextile(text) text = self.code(text) - text = self.glyphs(text) text = self.links(text) if not self.noimage: @@ -804,6 +817,7 @@ class Textile(object): text = self.span(text) text = self.footnoteRef(text) + text = self.glyphs(text) return text.rstrip('\n') @@ -814,6 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ + text = self.glyphs_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' @@ -868,7 +883,7 @@ class Textile(object): >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye") 'hello span strong and bold goodbye' """ - qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^', r'&') + qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') pnct = ".,\"'?!;:" for qtag in qtags: @@ -900,9 +915,7 @@ class Textile(object): '%' : 'span', '+' : 'ins', '~' : 'sub', - '^' : 'sup', - '&' : 'span style="font-variant:small-caps;"' -# '&' : 'span style="font-transform:uppercase;font-size:smaller;"' + '^' : 'sup' } tag = qtags[tag] atts = self.pba(atts) @@ -1046,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): return Textile(restricted=True, lite=lite, noimage=noimage).textile(text, rel='nofollow', html_type=html_type) - diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 31c118251d..814ba01a3e 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -69,7 +69,8 @@ class TextileMLizer(OEB2HTML): txt = '%s' % t if txt != '%': text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'(\s|[*_])\[('+t+'[a-zA-Z0-9 \',.*_]+'+t+')\](\s|[*_])', r'\1\2\3', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite @@ -77,14 +78,17 @@ class TextileMLizer(OEB2HTML): for i in self.our_links: if i[0] == '#': if i not in self.our_ids: - text = re.sub(r'"(.+)":'+i, '\1', text) + self.log.debug('Link has no target - %s ...' % i) + text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - text = re.sub(r'\('+i+'\)', '', text) + self.log.debug('ID has no link - %s ...' 
% i) + text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) - # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\*', '_', '\+', '-']) -# text = check_escaping(text, ['\*', '_', '\+', '-']) + # Remove obvious non-needed escaping, add sub/sup-script ones + text = check_escaping(text, ['\*', '_', '\*']) + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed text = re.sub(r'%\xa0+', r'%', text) #remove empty spans text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? @@ -96,13 +100,14 @@ class TextileMLizer(OEB2HTML): # text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines # text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) - text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) + text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) + text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) - text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) + text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here @@ -193,7 +198,8 @@ class TextileMLizer(OEB2HTML): return txt def prepare_string_for_textile(self, txt): - if re.search(r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)', txt): +# if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt): + if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): return ' ==%s== ' % txt return txt @@ -240,15 +246,23 @@ class TextileMLizer(OEB2HTML): if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: - text.append('[_') - tags.append('_]') + if self.in_a_link: + text.append('_') + tags.append('_') + else: + text.append('[_') + tags.append('_]') self.style_embed.append('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): if self.style_bold == False: - text.append('[*') - tags.append('*]') + if self.in_a_link: + text.append('*') + tags.append('*') + else: + text.append('[*') + tags.append('*]') self.style_embed.append('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): @@ -304,14 +318,17 @@ class TextileMLizer(OEB2HTML): tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: - text.append('"') - tags.append('a') if attribs.has_key('href'): + text.append('"') + tags.append('a') tags.append('":' + attribs['href']) self.our_links.append(attribs['href']) - if attribs.has_key('title'): - tags.append('(' + attribs['title'] + ')') - self.in_a_link = True + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + self.in_a_link = True + else: + text.append('%') + tags.append('%') elif tag == 'img': if self.opts.keep_image_references: txt = '!' 
+ self.check_halign(style) @@ -432,9 +449,9 @@ class TextileMLizer(OEB2HTML): t = '' text.append(self.id_no_text) self.id_no_text = u'' - if t == '*]': + if t in ('*]', '*'): self.style_bold = False - elif t == '_]': + elif t in ('_]', '_'): self.style_italic = False elif t == '+]': self.style_under = False @@ -442,7 +459,7 @@ class TextileMLizer(OEB2HTML): self.style_strike = False elif t == '&': self.style_smallcap = False - if t in ('*]', '_]', '+]', '-]'): + if t in ('*]', '_]', '+]', '-]', '*', '_'): txt = self.style_embed.pop() text.append('%s' % t) From 3ca59beaf5c825fbb14af90b0108b7792a011924 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 21:21:35 -0400 Subject: [PATCH 14/25] Add email. --- src/calibre/ebooks/textile/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 0e1811f195..8a9c6b082a 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ From b95f9949be04a4d92eeabc76629cff0361817d47 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 06:37:40 -0400 Subject: [PATCH 15/25] Rename function. --- src/calibre/ebooks/textile/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 8a9c6b082a..e088d264fc 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -705,7 +705,7 @@ class Textile(object): result.append(line) return ''.join(result) - def glyphs_only(self, text): + def macros_only(self, text): # fix: hackish text = re.sub(r'"\Z', '\" ', text) @@ -828,7 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ - text = self.glyphs_only(text) + text = self.macros_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' From 441718f76c867da749a10607f931b8b03485d331 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 18:55:19 -0400 Subject: [PATCH 16/25] TXT: small Textile changes. Remove old textile conversion code. --- src/calibre/ebooks/txt/textileml.py | 58 ++++---- src/calibre/utils/html2textile.py | 209 ---------------------------- 2 files changed, 34 insertions(+), 233 deletions(-) delete mode 100644 src/calibre/utils/html2textile.py diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 814ba01a3e..17988053e8 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -78,44 +78,55 @@ class TextileMLizer(OEB2HTML): for i in self.our_links: if i[0] == '#': if i not in self.our_ids: - self.log.debug('Link has no target - %s ...' % i) text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - self.log.debug('ID has no link - %s ...' 
% i) text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) # Remove obvious non-needed escaping, add sub/sup-script ones text = check_escaping(text, ['\*', '_', '\*']) - text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed - text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed + # escape the super/sub-scripts if needed + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) + # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) - text = re.sub(r'%\xa0+', r'%', text) #remove empty spans - text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? - text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output - text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline - text = re.sub(r'^\n+', r'', text) #remove newlines at top of file - text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras - text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras -# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para - text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines -# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) + #remove empty spans + text = re.sub(r'%\xa0+', r'%', text) + #remove empty spans - MAY MERGE SOME ? + text = re.sub(r'%%', r'', text) + #remove spans from tagged output + text = re.sub(r'%([_+*-]+)%', r'\1', text) + #remove spaces before a newline + text = re.sub(r' +\n', r'\n', text) + #remove newlines at top of file + text = re.sub(r'^\n+', r'', text) + #correct blockcode paras + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + #correct blockquote paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) + + #reduce blank lines + text = re.sub(r'\n{3}', r'\n\n', text) text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) - text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para + #Check span following blank para + text = re.sub(r'\n+ +%', r' %', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) - text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph - text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph + # blank paragraph + text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) + # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) + # blank paragraph + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) - text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + #sort out spaces in tables + text = re.sub(r' {2,}\|', r' |', text) # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. 
\n', text) - text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - - # started work on trying to fix footnotes -# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) + #reduce blank lines + text = re.sub(r' \n\n\n', r' \n\n', text) + return text def remove_newlines(self, text): @@ -198,7 +209,6 @@ class TextileMLizer(OEB2HTML): return txt def prepare_string_for_textile(self, txt): -# if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt): if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): return ' ==%s== ' % txt return txt diff --git a/src/calibre/utils/html2textile.py b/src/calibre/utils/html2textile.py deleted file mode 100644 index 786e912e36..0000000000 --- a/src/calibre/utils/html2textile.py +++ /dev/null @@ -1,209 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2010, Webreactor - Marcin Lulek -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from lxml import etree -from calibre.ebooks.oeb.base import barename - -class EchoTarget: - - def __init__(self): - self.final_output = [] - self.block = False - self.ol_ident = 0 - self.ul_ident = 0 - self.list_types = [] - self.haystack = [] - - def start(self, tag, attrib): - tag = barename(tag) - - newline = '\n' - dot = '' - new_tag = '' - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): - new_tag = tag - dot = '. ' - elif tag == 'p': - new_tag = '' - dot = '' - elif tag == 'blockquote': - new_tag = 'bq' - dot = '. ' - elif tag in ('b', 'strong'): - new_tag = '*' - newline = '' - elif tag in ('em', 'i'): - new_tag = '_' - newline = '' - elif tag == 'cite': - new_tag = '??' 
- newline = '' - elif tag == 'del': - new_tag = '-' - newline = '' - elif tag == 'ins': - new_tag = '+' - newline = '' - elif tag == 'sup': - new_tag = '^' - newline = '' - elif tag == 'sub': - new_tag = '~' - newline = '' - elif tag == 'span': - new_tag = '' - newline = '' - elif tag == 'a': - self.block = True - if 'title' in attrib: - self.a_part = {'title':attrib.get('title'), - 'href':attrib.get('href', '')} - else: - self.a_part = {'title':None, 'href':attrib.get('href', '')} - new_tag = '' - newline = '' - - elif tag == 'img': - if 'alt' in attrib: - new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),) - else: - new_tag = ' !%s' % attrib.get('src') - newline = '' - - elif tag in ('ul', 'ol'): - new_tag = '' - newline = '' - self.list_types.append(tag) - if tag == 'ul': - self.ul_ident += 1 - else: - self.ol_ident += 1 - - elif tag == 'li': - indent = self.ul_ident + self.ol_ident - if self.list_types[-1] == 'ul': - new_tag = '*' * indent + ' ' - newline = '\n' - else: - new_tag = '#' * indent + ' ' - newline = '\n' - - - if tag not in ('ul', 'ol'): - textile = '%(newline)s%(tag)s%(dot)s' % \ - { - 'newline':newline, - 'tag':new_tag, - 'dot':dot - } - if not self.block: - self.final_output.append(textile) - else: - self.haystack.append(textile) - - def end(self, tag): - tag = barename(tag) - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - self.final_output.append('\n') - elif tag in ('b', 'strong'): - self.final_output.append('*') - elif tag in ('em', 'i'): - self.final_output.append('_') - elif tag == 'cite': - self.final_output.append('??') - elif tag == 'del': - self.final_output.append('-') - elif tag == 'ins': - self.final_output.append('+') - elif tag == 'sup': - self.final_output.append('^') - elif tag == 'sub': - self.final_output.append('~') - elif tag == 'span': - self.final_output.append('') - elif tag == 'a': - if self.a_part['title']: - textilized = ' "%s (%s)":%s ' % ( - ''.join(self.haystack), - self.a_part.get('title'), - self.a_part.get('href'), - ) - self.haystack = [] - else: - textilized = ' "%s":%s ' % ( - ''.join(self.haystack), - self.a_part.get('href'), - ) - self.haystack = [] - self.final_output.append(textilized) - self.block = False - elif tag == 'img': - self.final_output.append('!') - elif tag == 'ul': - self.ul_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - elif tag == 'ol': - self.ol_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - - def data(self, data): - #we dont want any linebreaks inside our tags - node_data = data.replace('\n','') - if not self.block: - self.final_output.append(node_data) - else: - self.haystack.append(node_data) - - def comment(self, text): - pass - - def close(self): - return "closed!" 
- - -def html2textile(html): - #1st pass - #clean the whitespace and convert html to xhtml - parser = etree.HTMLParser() - tree = etree.fromstring(html, parser) - xhtml = etree.tostring(tree, method="xml") - parser = etree.XMLParser(remove_blank_text=True) - root = etree.XML(xhtml, parser) - cleaned_html = etree.tostring(root) - #2nd pass build textile - target = EchoTarget() - parser = etree.XMLParser(target=target) - root = etree.fromstring(cleaned_html, parser) - textilized_text = ''.join(target.final_output).lstrip().rstrip() - return textilized_text From 5c1b683536ccb7fb221b13e35c4ae73db46cd35b Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 18:58:30 -0400 Subject: [PATCH 17/25] TXT: Add keep color GUI option. --- src/calibre/gui2/convert/txt_output.py | 2 +- src/calibre/gui2/convert/txt_output.ui | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 8427f83824..816e8d7785 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', 'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references', - 'txt_output_encoding']) + 'keep_color', 'txt_output_encoding']) self.db, self.book_id = db, book_id for x in get_option('newline').option.choices: self.opt_newline.addItem(x) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 1ef9e6e6b9..36ffabb07e 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -122,6 +122,13 @@ + + + + Do not remove font color before processing + + + From 28dfc420d758cef69b9a4ea048406152b20636bb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 11:18:03 -0600 Subject: [PATCH 18/25] Fix #778208 (Fetch news from Readers Digest) --- recipes/readers_digest.recipe | 150 ++-------------------------------- 1 file changed, 9 insertions(+), 141 deletions(-) diff --git a/recipes/readers_digest.recipe b/recipes/readers_digest.recipe index 3689ca4c53..caf5cf081d 100644 --- a/recipes/readers_digest.recipe +++ b/recipes/readers_digest.recipe @@ -3,7 +3,6 @@ __license__ = 'GPL v3' ''' ''' from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.web.feeds import Feed class ReadersDigest(BasicNewsRecipe): @@ -38,151 +37,20 @@ class ReadersDigest(BasicNewsRecipe): ''' - remove_tags = [ - dict(name='h4', attrs={'class':'close'}), - dict(name='div', attrs={'class':'fromLine'}), - dict(name='img', attrs={'class':'colorTag'}), - dict(name='div', attrs={'id':'sponsorArticleHeader'}), - dict(name='div', attrs={'class':'horizontalAd'}), - dict(name='div', attrs={'id':'imageCounterLeft'}), - dict(name='div', attrs={'id':'commentsPrint'}) - ] - - feeds = [ - ('New in RD', 'http://feeds.rd.com/ReadersDigest'), - ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'), - ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'), - ('Blogs','http://feeds.rd.com/ReadersDigestBlogs') + ('Food', 'http://www.rd.com/food/feed'), + ('Health', 'http://www.rd.com/health/feed'), + ('Home', 'http://www.rd.com/home/feed'), + ('Family', 'http://www.rd.com/family/feed'), + ('Money', 'http://www.rd.com/money/feed'), + ('Travel', 'http://www.rd.com/travel/feed'), ] cover_url = 'http://www.rd.com/images/logo-main-rd.gif' - - 
-#------------------------------------------------------------------------------------------------- - - def print_version(self, url): - - # Get the identity number of the current article and append it to the root print URL - - if url.find('/article') > 0: - ident = url[url.find('/article')+8:url.find('.html?')-4] - url = 'http://www.rd.com/content/printContent.do?contentId=' + ident - - elif url.find('/post') > 0: - - # in this case, have to get the page itself to derive the Print page. - soup = self.index_to_soup(url) - newsoup = soup.find('ul',attrs={'class':'printBlock'}) - url = 'http://www.rd.com' + newsoup('a')[0]['href'] - url = url[0:url.find('&Keep')] - - return url - -#------------------------------------------------------------------------------------------------- - - def parse_index(self): - - pages = [ - ('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}), - # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}), - ('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'}) - + keep_only_tags = dict(id='main-content') + remove_tags = [ + {'class':['post-categories']}, ] - feeds = [] - - for page in pages: - section, url, divider, attrList = page - newArticles = self.page_parse(url, divider, attrList) - feeds.append((section,newArticles)) - - # after the pages of the site have been processed, parse several RSS feeds for additional sections - newfeeds = Feed() - newfeeds = self.parse_rss() - - - # The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable - # for this module (parse_index). - - for feed in newfeeds: - newArticles = [] - for article in feed.articles: - newArt = { - 'title' : article.title, - 'url' : article.url, - 'date' : article.date, - 'description' : article.text_summary - } - newArticles.append(newArt) - - - # New and Blogs should be the first two feeds. - if feed.title == 'New in RD': - feeds.insert(0,(feed.title,newArticles)) - elif feed.title == 'Blogs': - feeds.insert(1,(feed.title,newArticles)) - else: - feeds.append((feed.title,newArticles)) - - - return feeds - -#------------------------------------------------------------------------------------------------- - - def page_parse(self, mainurl, divider, attrList): - - articles = [] - mainsoup = self.index_to_soup(mainurl) - for item in mainsoup.findAll(attrs=attrList): - newArticle = { - 'title' : item('img')[0]['alt'], - 'url' : 'http://www.rd.com'+item('a')[0]['href'], - 'date' : '', - 'description' : '' - } - articles.append(newArticle) - - - - return articles - - - -#------------------------------------------------------------------------------------------------- - - def parse_rss (self): - - # Do the "official" parse_feeds first - feeds = BasicNewsRecipe.parse_feeds(self) - - - # Loop thru the articles in all feeds to find articles with "recipe" in it - recipeArticles = [] - for curfeed in feeds: - delList = [] - for a,curarticle in enumerate(curfeed.articles): - if curarticle.title.upper().find('RECIPE') >= 0: - recipeArticles.append(curarticle) - delList.append(curarticle) - if len(delList)>0: - for d in delList: - index = curfeed.articles.index(d) - curfeed.articles[index:index+1] = [] - - # If there are any recipes found, create a new Feed object and append. 
- if len(recipeArticles) > 0: - pfeed = Feed() - pfeed.title = 'Recipes' - pfeed.descrition = 'Recipe Feed (Virtual)' - pfeed.image_url = None - pfeed.oldest_article = 30 - pfeed.id_counter = len(recipeArticles) - # Create a new Feed, add the recipe articles, and then append - # to "official" list of feeds - pfeed.articles = recipeArticles[:] - feeds.append(pfeed) - - return feeds From 751890a83f5fef83968c0de39313980c7be3d7e7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 13:15:52 -0600 Subject: [PATCH 19/25] ... --- src/calibre/ebooks/metadata/sources/identify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index b084f86294..0cc070c3c6 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -382,7 +382,7 @@ def identify(log, abort, # {{{ if key not in filter_results: filtered_results.append(r) filter_results.add(key) - presults = filtered_results + results[plugin] = presults = filtered_results plog = logs[plugin].getvalue().strip() log('\n'+'*'*30, plugin.name, '*'*30) From e19edba3efe5fa257591ed0fe1fbfb286317257d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 14:31:21 -0600 Subject: [PATCH 20/25] EPUB Input: Ignore missing cover file when converting, instead of erroring out. Fixes #781848 ([Errno 2] No such file or directory while converting) --- src/calibre/ebooks/epub/input.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 917c5ad8ae..ac1d61ce59 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -103,10 +103,11 @@ class EPUBInput(InputFormatPlugin): t.set('href', guide_cover) t.set('title', 'Title Page') from calibre.ebooks import render_html_svg_workaround - renderer = render_html_svg_workaround(guide_cover, log) - if renderer is not None: - open('calibre_raster_cover.jpg', 'wb').write( - renderer) + if os.path.exists(guide_cover): + renderer = render_html_svg_workaround(guide_cover, log) + if renderer is not None: + open('calibre_raster_cover.jpg', 'wb').write( + renderer) def find_opf(self): def attr(n, attr): From 953c8e939558ed380ae0a817cd89303a6fc959f7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 15:05:55 -0600 Subject: [PATCH 21/25] Allow the use of condensed/expanded fonts as interface fonts --- src/calibre/gui2/__init__.py | 6 +++++- src/calibre/gui2/preferences/look_feel.py | 18 +++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 1dfe1d8d14..28504f2a31 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -620,7 +620,11 @@ class Application(QApplication): self.original_font = QFont(QApplication.font()) fi = gprefs['font'] if fi is not None: - QApplication.setFont(QFont(*fi)) + font = QFont(*(fi[:4])) + s = gprefs.get('font_stretch', None) + if s is not None: + font.setStretch(s) + QApplication.setFont(font) def _send_file_open_events(self): with self._file_open_lock: diff --git a/src/calibre/gui2/preferences/look_feel.py b/src/calibre/gui2/preferences/look_feel.py index 620113cc3f..ee2d7a5428 100644 --- a/src/calibre/gui2/preferences/look_feel.py +++ b/src/calibre/gui2/preferences/look_feel.py @@ -161,7 +161,11 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): def initialize(self): 
ConfigWidgetBase.initialize(self) - self.current_font = self.initial_font = gprefs['font'] + font = gprefs['font'] + if font is not None: + font = list(font) + font.append(gprefs.get('font_stretch', QFont.Unstretched)) + self.current_font = self.initial_font = font self.update_font_display() self.display_model.initialize() @@ -178,7 +182,8 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): def build_font_obj(self): font_info = self.current_font if font_info is not None: - font = QFont(*font_info) + font = QFont(*(font_info[:4])) + font.setStretch(font_info[4]) else: font = qt_app.original_font return font @@ -215,15 +220,18 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): if fd.exec_() == fd.Accepted: font = fd.selectedFont() fi = QFontInfo(font) - self.current_font = (unicode(fi.family()), fi.pointSize(), - fi.weight(), fi.italic()) + self.current_font = [unicode(fi.family()), fi.pointSize(), + fi.weight(), fi.italic(), font.stretch()] self.update_font_display() self.changed_signal.emit() def commit(self, *args): rr = ConfigWidgetBase.commit(self, *args) if self.current_font != self.initial_font: - gprefs['font'] = self.current_font + gprefs['font'] = (self.current_font[:4] if self.current_font else + None) + gprefs['font_stretch'] = (self.current_font[4] if self.current_font + is not None else QFont.Unstretched) QApplication.setFont(self.font_display.font()) rr = True self.display_model.commit() From af23efd3d6992b488803048f48efb1a1a1f7b908 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 15:09:41 -0600 Subject: [PATCH 22/25] Fix Strategy+Business --- recipes/strategy-business.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/strategy-business.recipe b/recipes/strategy-business.recipe index ab58965e98..a4697ecfcd 100644 --- a/recipes/strategy-business.recipe +++ b/recipes/strategy-business.recipe @@ -33,7 +33,7 @@ class StrategyBusinessRecipe(BasicNewsRecipe): elif c.name.endswith('_password'): br[c.name] = self.password raw = br.submit().read() - if '>Logout' not in raw: + if 'You have been logged in' not in raw: raise ValueError('Failed to login, check your username and password') return br From dc0834e8bcfdd49a84ab68cafec51d0433ab7988 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 12 May 2011 18:06:17 -0400 Subject: [PATCH 23/25] TXT: Textileml tweaks. --- src/calibre/ebooks/txt/textileml.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 17988053e8..36dc9952d2 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -106,17 +106,17 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #reduce blank lines - text = re.sub(r'\n{3}', r'\n\n', text) + text = re.sub(r'\n{3}', r'\n\np. \n\n', text) text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) #Check span following blank para text = re.sub(r'\n+ +%', r' %', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) # blank paragraph - text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) + text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text) # blank paragraph - text = re.sub(u'\n\xa0', r'\np. ', text) + text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. 
\n)(p.*\.|h.*\.)', r'\n\2', text) #sort out spaces in tables From dc74afe1f272de99c99480a3a10b312a6fc48176 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 16:54:20 -0600 Subject: [PATCH 24/25] ... --- src/calibre/utils/Zeroconf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/utils/Zeroconf.py b/src/calibre/utils/Zeroconf.py index fbb9b4e71f..2b3661162f 100755 --- a/src/calibre/utils/Zeroconf.py +++ b/src/calibre/utils/Zeroconf.py @@ -869,7 +869,8 @@ class Engine(threading.Thread): if DEBUG: traceback.print_exc() except: - traceback.print_exc() + if DEBUG: + traceback.print_exc() except: pass From 936a6892dcaea49b3dba3e353da8af874a39a5f0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 17:15:13 -0600 Subject: [PATCH 25/25] ... --- src/calibre/ebooks/metadata/sources/amazon.py | 2 +- src/calibre/ebooks/metadata/sources/google.py | 2 +- src/calibre/ebooks/metadata/sources/overdrive.py | 2 +- src/calibre/gui2/preferences/metadata_sources.py | 5 +++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 31d815af63..40cd54cfbd 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -280,7 +280,7 @@ class Worker(Thread): # Get details {{{ class Amazon(Source): name = 'Amazon.com' - description = _('Downloads metadata from Amazon') + description = _('Downloads metadata and covers from Amazon') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'identifier:amazon', diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index b479368bac..bd1043b774 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -157,7 +157,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ class GoogleBooks(Source): name = 'Google' - description = _('Downloads metadata from Google Books') + description = _('Downloads metadata and covers from Google Books') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 4ee248579e..f52b1f423b 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -30,7 +30,7 @@ base_url = 'http://search.overdrive.com/' class OverDrive(Source): name = 'Overdrive' - description = _('Downloads metadata from Overdrive\'s Content Reserve') + description = _('Downloads metadata and covers from Overdrive\'s Content Reserve') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', diff --git a/src/calibre/gui2/preferences/metadata_sources.py b/src/calibre/gui2/preferences/metadata_sources.py index 05ff23987d..f7465fb0ee 100644 --- a/src/calibre/gui2/preferences/metadata_sources.py +++ b/src/calibre/gui2/preferences/metadata_sources.py @@ -71,9 +71,10 @@ class SourcesModel(QAbstractTableModel): # {{{ plugin.is_configured()): return QIcon(I('list_remove.png')) elif role == Qt.ToolTipRole: + base = plugin.description + '\n\n' if plugin.is_configured(): - return _('This source is configured and ready to go') - return _('This source needs configuration') + return base + _('This source is configured and ready to go') + 
return base + _('This source needs configuration') return NONE def setData(self, index, val, role):
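
The tidy_up pass added to TextileMLizer (and tweaked again in patch 23) is a chain of small re.sub calls over the generated Textile text: empty spans are removed, spaces before newlines are stripped, stray pre./bc. markers are merged, and runs of blank lines are collapsed. Below is a minimal standalone sketch of that style of pass, reusing a few of the patterns from the patch on an invented sample string; the helper name and the sample input are illustrative assumptions, not part of the patch.

# -*- coding: utf-8 -*-
# Sketch of the regex-based "tidy up" style used by TextileMLizer: each pass
# is a small re.sub over the whole generated Textile text. Sample input is invented.
import re

def tidy_up_sketch(text):
    # Strip spaces left hanging before a newline.
    text = re.sub(r' +\n', '\n', text)
    # Drop blank lines at the very top of the output.
    text = re.sub(r'^\n+', '', text)
    # Collapse a stray "pre." immediately followed by "bc." into one block-code marker.
    text = re.sub(r'\npre\.\n?\nbc\.', '\nbc.', text)
    # Reduce runs of three or more newlines to a single blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text

if __name__ == '__main__':
    sample = '\n\np. First paragraph   \n\n\n\npre.\n\nbc. some code\n'
    print(tidy_up_sketch(sample))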
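
Patch 19's one-line change to identify() matters because the de-duplicated list was previously computed and then dropped: it has to be written back into the per-plugin results mapping. A small sketch of that order-preserving filter under assumed names; the Result shape and the de-duplication key are simplified stand-ins for calibre's real metadata objects.

from collections import namedtuple

# Simplified stand-in for the metadata results a source plugin returns.
Result = namedtuple('Result', 'title authors')

def dedupe_plugin_results(results, plugin):
    seen = set()
    filtered = []
    for r in results[plugin]:
        key = (r.title.lower(), tuple(a.lower() for a in r.authors))
        if key not in seen:
            seen.add(key)
            filtered.append(r)
    # The point of the fix: store the filtered list back into the mapping
    # instead of leaving it in a local variable that is never used.
    results[plugin] = filtered
    return results[plugin]

results = {'google': [Result('A Book', ('Jane Doe',)), Result('a book', ('jane doe',))]}
dedupe_plugin_results(results, 'google')  # leaves a single entry behind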
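
Patch 21 splits the stored interface font into the old family/size/weight/italic 4-tuple under 'font' plus a separate 'font_stretch' preference, so condensed or expanded faces survive a restart. A sketch of rebuilding the QFont from those two keys, assuming PyQt4 is importable and using a plain dict in place of calibre's gprefs object.

from PyQt4.Qt import QFont

def font_from_prefs(gprefs):
    fi = gprefs.get('font')
    if fi is None:
        return None
    font = QFont(*fi[:4])
    stretch = gprefs.get('font_stretch')
    if stretch is not None:
        # Restore the condensed/expanded stretch factor saved separately.
        font.setStretch(stretch)
    return font

prefs = {'font': ('Liberation Sans', 10, QFont.Normal, False),
         'font_stretch': QFont.SemiCondensed}
ui_font = font_from_prefs(prefs)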
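
Patch 25 changes the tooltip in the metadata-sources model so the plugin description is always shown before the configuration status. The logic reduces to the small helper below; the untranslated strings and the bare plugin argument (any object with description and is_configured()) are simplifications of the Qt model code.

def source_tooltip(plugin):
    # Show the plugin's own description first, then its configuration state.
    base = plugin.description + '\n\n'
    if plugin.is_configured():
        return base + 'This source is configured and ready to go'
    return base + 'This source needs configuration'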