diff --git a/recipes/hvg.recipe b/recipes/hvg.recipe new file mode 100644 index 0000000000..8e9218d9c3 --- /dev/null +++ b/recipes/hvg.recipe @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + +class HVG(BasicNewsRecipe): + title = 'HVG.HU' + __author__ = u'István Papp' + description = u'Friss hírek a HVG-től' + timefmt = ' [%Y. %b. %d., %a.]' + oldest_article = 4 + language = 'hu' + + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf8' + publisher = 'HVG Online' + category = u'news, hírek, hvg' + extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] + remove_tags_before = dict(id='pg-content') + remove_javascript = True + remove_empty_feeds = True + + feeds = [ + (u'Itthon', u'http://hvg.hu/rss/itthon') + ,(u'Világ', u'http://hvg.hu/rss/vilag') + ,(u'Gazdaság', u'http://hvg.hu/rss/gazdasag') + ,(u'IT | Tudomány', u'http://hvg.hu/rss/tudomany') + ,(u'Panoráma', u'http://hvg.hu/rss/Panorama') + ,(u'Karrier', u'http://hvg.hu/rss/karrier') + ,(u'Gasztronómia', u'http://hvg.hu/rss/gasztronomia') + ,(u'Helyi érték', u'http://hvg.hu/rss/helyiertek') + ,(u'Kultúra', u'http://hvg.hu/rss/kultura') + ,(u'Cégautó', u'http://hvg.hu/rss/cegauto') + ,(u'Vállalkozó szellem', u'http://hvg.hu/rss/kkv') + ,(u'Egészség', u'http://hvg.hu/rss/egeszseg') + ,(u'Vélemény', u'http://hvg.hu/rss/velemeny') + ,(u'Sport', u'http://hvg.hu/rss/sport') + ] + + def print_version(self, url): + return url.replace ('#rss', '/print') + diff --git a/recipes/lwn_weekly.recipe b/recipes/lwn_weekly.recipe index 28ee35802a..7363062346 100644 --- a/recipes/lwn_weekly.recipe +++ b/recipes/lwn_weekly.recipe @@ -23,6 +23,11 @@ class WeeklyLWN(BasicNewsRecipe): remove_tags_after = dict(attrs={'class':'ArticleText'}) remove_tags = [dict(name=['h2', 'form'])] + preprocess_regexps = [ + # Remove the
and "Log in to post comments" + (re.compile(r'> sys.stderr, e parser.print_help() return 1 @@ -291,13 +291,13 @@ def main(): elif args[1].startswith("prs500:"): try: infile = open(args[0], "rb") - except IOError, e: + except IOError as e: print >> sys.stderr, e parser.print_help() return 1 try: dev.put_file(infile, args[1][7:]) - except PathError, err: + except PathError as err: if options.force and 'exists' in str(err): dev.del_file(err.path, False) dev.put_file(infile, args[1][7:]) @@ -355,7 +355,7 @@ def main(): return 1 except DeviceLocked: print >> sys.stderr, "The device is locked. Use the --unlock option" - except (ArgumentError, DeviceError), e: + except (ArgumentError, DeviceError) as e: print >>sys.stderr, e return 1 return 0 diff --git a/src/calibre/devices/prs500/driver.py b/src/calibre/devices/prs500/driver.py index 65ecc98a81..aaba094fb3 100644 --- a/src/calibre/devices/prs500/driver.py +++ b/src/calibre/devices/prs500/driver.py @@ -177,7 +177,7 @@ class PRS500(DeviceConfig, DevicePlugin): dev.send_validated_command(BeginEndSession(end=True)) dev.in_session = False raise - except USBError, err: + except USBError as err: if "No such device" in str(err): raise DeviceError() elif "Connection timed out" in str(err): @@ -272,7 +272,7 @@ class PRS500(DeviceConfig, DevicePlugin): self.bulk_read_max_packet_size = red.MaxPacketSize self.bulk_write_max_packet_size = wed.MaxPacketSize self.handle.claim_interface(self.INTERFACE_ID) - except USBError, err: + except USBError as err: raise DeviceBusy(str(err)) # Large timeout as device may still be initializing res = self.send_validated_command(GetUSBProtocolVersion(), timeout=20000) @@ -303,7 +303,7 @@ class PRS500(DeviceConfig, DevicePlugin): try: self.handle.reset() self.handle.release_interface(self.INTERFACE_ID) - except Exception, err: + except Exception as err: print >> sys.stderr, err self.handle, self.device = None, None self.in_session = False @@ -509,7 +509,7 @@ class PRS500(DeviceConfig, DevicePlugin): outfile.write("".join(map(chr, packets[0][16:]))) for i in range(1, len(packets)): outfile.write("".join(map(chr, packets[i]))) - except IOError, err: + except IOError as err: self.send_validated_command(FileClose(_id)) raise ArgumentError("File get operation failed. 
" + \ "Could not write to local location: " + str(err)) @@ -656,7 +656,7 @@ class PRS500(DeviceConfig, DevicePlugin): dest = None try: dest = self.path_properties(path, end_session=False) - except PathError, err: + except PathError as err: if "does not exist" in str(err) or "not mounted" in str(err): return (False, None) else: raise diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 37b2b061e5..c46e9539c9 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -124,11 +124,11 @@ class Device(DeviceConfig, DevicePlugin): if not prefix: return 0, 0 prefix = prefix[:-1] - win32file = __import__('win32file', globals(), locals(), [], -1) + import win32file try: sectors_per_cluster, bytes_per_sector, free_clusters, total_clusters = \ win32file.GetDiskFreeSpace(prefix) - except Exception, err: + except Exception as err: if getattr(err, 'args', [None])[0] == 21: # Disk not ready time.sleep(3) sectors_per_cluster, bytes_per_sector, free_clusters, total_clusters = \ @@ -771,7 +771,7 @@ class Device(DeviceConfig, DevicePlugin): for d in drives: try: eject(d) - except Exception, e: + except Exception as e: print 'Udisks eject call for:', d, 'failed:' print '\t', e failures = True diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index c5bac936b5..7776be5e28 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -57,7 +57,7 @@ class HTMLRenderer(object): buf.open(QBuffer.WriteOnly) image.save(buf, 'JPEG') self.data = str(ba.data()) - except Exception, e: + except Exception as e: self.exception = e self.traceback = traceback.format_exc() finally: diff --git a/src/calibre/ebooks/epub/fix/container.py b/src/calibre/ebooks/epub/fix/container.py index 539d886312..1669290a7b 100644 --- a/src/calibre/ebooks/epub/fix/container.py +++ b/src/calibre/ebooks/epub/fix/container.py @@ -151,7 +151,7 @@ class Container(object): if name in self.mime_map: try: raw = self._parse(raw, self.mime_map[name]) - except XMLSyntaxError, err: + except XMLSyntaxError as err: raise ParseError(name, unicode(err)) self.cache[name] = raw return raw diff --git a/src/calibre/ebooks/epub/fix/main.py b/src/calibre/ebooks/epub/fix/main.py index fbfe80551d..e4c1a60a77 100644 --- a/src/calibre/ebooks/epub/fix/main.py +++ b/src/calibre/ebooks/epub/fix/main.py @@ -54,7 +54,7 @@ def main(args=sys.argv): epub = os.path.abspath(args[1]) try: run(epub, opts, default_log) - except ParseError, err: + except ParseError as err: default_log.error(unicode(err)) raise SystemExit(1) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 1599d3c896..dd0a247a67 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -110,7 +110,7 @@ class HTMLFile(object): try: with open(self.path, 'rb') as f: src = f.read() - except IOError, err: + except IOError as err: msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err)) if level == 0: raise IOError(msg) @@ -202,7 +202,7 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None) raise IgnoreFile('%s is a binary file'%nf.path, -1) nl.append(nf) flat.append(nf) - except IgnoreFile, err: + except IgnoreFile as err: rejects.append(link) if not err.doesnt_exist or verbose > 1: print repr(err) diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index 3be8f85e45..4ee1538e3f 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py 
+++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -332,7 +332,7 @@ class HTMLConverter(object): soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) - except ConversionError, err: + except ConversionError as err: if 'Failed to coerce to unicode' in str(err): raw = unicode(raw, 'utf8', 'replace') soup = BeautifulSoup(raw, @@ -935,7 +935,7 @@ class HTMLConverter(object): try: im = PILImage.open(path) - except IOError, err: + except IOError as err: self.log.warning('Unable to process image: %s\n%s'%( original_path, err)) return encoding = detect_encoding(im) @@ -953,7 +953,7 @@ class HTMLConverter(object): pt.close() self.scaled_images[path] = pt return pt.name - except (IOError, SystemError), err: # PIL chokes on interlaced PNG images as well a some GIF images + except (IOError, SystemError) as err: # PIL chokes on interlaced PNG images as well a some GIF images self.log.warning(_('Unable to process image %s. Error: %s')%(path, err)) if width == None or height == None: @@ -1013,7 +1013,7 @@ class HTMLConverter(object): if not self.images.has_key(path): try: self.images[path] = ImageStream(path, encoding=encoding) - except LrsError, err: + except LrsError as err: self.log.warning(_('Could not process image: %s\n%s')%( original_path, err)) return @@ -1768,7 +1768,7 @@ class HTMLConverter(object): tag_css = self.tag_css(tag)[0] # Table should not inherit CSS try: self.process_table(tag, tag_css) - except Exception, err: + except Exception as err: self.log.warning(_('An error occurred while processing a table: %s. Ignoring table markup.')%repr(err)) self.log.exception('') self.log.debug(_('Bad table:\n%s')%unicode(tag)[:300]) @@ -1858,7 +1858,7 @@ def process_file(path, options, logger): tf.close() tim.save(tf.name) tpath = tf.name - except IOError, err: # PIL sometimes fails, for example on interlaced PNG files + except IOError as err: # PIL sometimes fails, for example on interlaced PNG files logger.warn(_('Could not read cover image: %s'), err) options.cover = None else: diff --git a/src/calibre/ebooks/markdown/markdown.py b/src/calibre/ebooks/markdown/markdown.py index e734079116..677047878a 100644 --- a/src/calibre/ebooks/markdown/markdown.py +++ b/src/calibre/ebooks/markdown/markdown.py @@ -34,7 +34,7 @@ License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD import re, sys, codecs from logging import getLogger, StreamHandler, Formatter, \ - DEBUG, INFO, WARN, ERROR, CRITICAL + DEBUG, INFO, WARN, CRITICAL MESSAGE_THRESHOLD = CRITICAL @@ -95,7 +95,7 @@ def removeBOM(text, encoding): # and uses the actual name of the executable called.) 
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py" - + # --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ---------- @@ -242,8 +242,6 @@ class Element: if bidi: - orig_bidi = self.bidi - if not self.bidi or self.isDocumentElement: # Once the bidi is set don't change it (except for doc element) self.bidi = bidi @@ -319,7 +317,7 @@ class Element: childBuffer += "/>" - + buffer += "<" + self.nodeName if self.nodeName in ['p', 'li', 'ul', 'ol', @@ -330,10 +328,10 @@ class Element: bidi = self.bidi else: bidi = self.doc.bidi - + if bidi=="rtl": self.setAttribute("dir", "rtl") - + for attr in self.attributes: value = self.attribute_values[attr] value = self.doc.normalizeEntities(value, @@ -358,7 +356,7 @@ class TextNode: attrRegExp = re.compile(r'\{@([^\}]*)=([^\}]*)}') # {@id=123} def __init__ (self, text): - self.value = text + self.value = text def attributeCallback(self, match): @@ -372,7 +370,7 @@ class TextNode: text = self.value self.parent.setBidi(getBidiType(text)) - + if not text.startswith(HTML_PLACEHOLDER_PREFIX): if self.parent.nodeName == "p": text = text.replace("\n", "\n ") @@ -413,11 +411,11 @@ There are two types of preprocessors: TextPreprocessor and Preprocessor. class TextPreprocessor: ''' TextPreprocessors are run before the text is broken into lines. - + Each TextPreprocessor implements a "run" method that takes a pointer to a text string of the document, modifies it as necessary and returns - either the same pointer or a pointer to a new string. - + either the same pointer or a pointer to a new string. + TextPreprocessors must extend markdown.TextPreprocessor. ''' @@ -431,18 +429,18 @@ class Preprocessor: Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document, modifies it as necessary and returns - either the same pointer or a pointer to a new list. - + either the same pointer or a pointer to a new list. + Preprocessors must extend markdown.Preprocessor. ''' def run(self, lines): pass - + class HtmlBlockPreprocessor(TextPreprocessor): """Removes html blocks from the source text and stores it.""" - + def _get_left_tag(self, block): return block[1:].replace(">", " ", 1).split()[0].lower() @@ -451,7 +449,7 @@ class HtmlBlockPreprocessor(TextPreprocessor): return block.rstrip()[-len(left_tag)-2:-1].lower() def _equal_tags(self, left_tag, right_tag): - + if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. 
return True if ("/" + left_tag) == right_tag: @@ -467,17 +465,17 @@ class HtmlBlockPreprocessor(TextPreprocessor): def _is_oneliner(self, tag): return (tag in ['hr', 'hr/']) - + def run(self, text): new_blocks = [] text = text.split("\n\n") - + items = [] left_tag = '' right_tag = '' in_tag = False # flag - + for block in text: if block.startswith("\n"): block = block[1:] @@ -485,7 +483,7 @@ class HtmlBlockPreprocessor(TextPreprocessor): if not in_tag: if block.startswith("<"): - + left_tag = self._get_left_tag(block) right_tag = self._get_right_tag(left_tag, block) @@ -497,13 +495,13 @@ class HtmlBlockPreprocessor(TextPreprocessor): if self._is_oneliner(left_tag): new_blocks.append(block.strip()) continue - + if block[1] == "!": # is a comment block left_tag = "--" right_tag = self._get_right_tag(left_tag, block) # keep checking conditions below and maybe just append - + if block.rstrip().endswith(">") \ and self._equal_tags(left_tag, right_tag): new_blocks.append( @@ -519,9 +517,9 @@ class HtmlBlockPreprocessor(TextPreprocessor): else: items.append(block.strip()) - + right_tag = self._get_right_tag(left_tag, block) - + if self._equal_tags(left_tag, right_tag): # if find closing tag in_tag = False @@ -532,7 +530,7 @@ class HtmlBlockPreprocessor(TextPreprocessor): if items: new_blocks.append(self.stash.store('\n\n'.join(items))) new_blocks.append('\n') - + return "\n\n".join(new_blocks) HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor() @@ -605,7 +603,7 @@ LINE_PREPROCESSOR = LinePreprocessor() class ReferencePreprocessor(Preprocessor): - ''' + ''' Removes reference definitions from the text and stores them for later use. ''' @@ -760,7 +758,7 @@ class BacktickPattern (Pattern): return el -class DoubleTagPattern (SimpleTagPattern): +class DoubleTagPattern (SimpleTagPattern): def handleMatch(self, m, doc): tag1, tag2 = self.tag.split(",") @@ -775,7 +773,6 @@ class HtmlPattern (Pattern): def handleMatch (self, m, doc): rawhtml = m.group(2) - inline = True place_holder = self.stash.store(rawhtml) return doc.createTextNode(place_holder) @@ -926,11 +923,11 @@ There are two types of post-processors: Postprocessor and TextPostprocessor class Postprocessor: ''' Postprocessors are run before the dom it converted back into text. - + Each Postprocessor implements a "run" method that takes a pointer to a - NanoDom document, modifies it as necessary and returns a NanoDom + NanoDom document, modifies it as necessary and returns a NanoDom document. - + Postprocessors must extend markdown.Postprocessor. There are currently no standard post-processors, but the footnote @@ -945,10 +942,10 @@ class Postprocessor: class TextPostprocessor: ''' TextPostprocessors are run after the dom it converted back into text. - + Each TextPostprocessor implements a "run" method that takes a pointer to a text string, modifies it as necessary and returns a text string. - + TextPostprocessors must extend markdown.TextPostprocessor. ''' @@ -971,7 +968,7 @@ class RawHtmlTextPostprocessor(TextPostprocessor): html = '' else: html = HTML_REMOVED_TEXT - + text = text.replace("
<p>%s</p>\n
" % (HTML_PLACEHOLDER % i), html + "\n") text = text.replace(HTML_PLACEHOLDER % i, html) @@ -1031,7 +1028,6 @@ class BlockGuru: remainder of the original list""" items = [] - item = -1 i = 0 # to keep track of where we are @@ -1187,7 +1183,7 @@ class Markdown: RAWHTMLTEXTPOSTPROCESSOR] self.prePatterns = [] - + self.inlinePatterns = [DOUBLE_BACKTICK_PATTERN, BACKTICK_PATTERN, @@ -1241,7 +1237,7 @@ class Markdown: configs_for_ext = configs[ext] else: configs_for_ext = [] - extension = module.makeExtension(configs_for_ext) + extension = module.makeExtension(configs_for_ext) extension.extendMarkdown(self, globals()) @@ -1310,7 +1306,7 @@ class Markdown: else: buffer.append(line) self._processSection(self.top_element, buffer) - + #self._processSection(self.top_element, self.lines) # Not sure why I put this in but let's leave it for now. @@ -1426,7 +1422,7 @@ class Markdown: for item in list: el.appendChild(item) - + def _processUList(self, parent_elem, lines, inList): self._processList(parent_elem, lines, inList, @@ -1458,7 +1454,7 @@ class Markdown: i = 0 # a counter to keep track of where we are - for line in lines: + for line in lines: loose = 0 if not line.strip(): @@ -1477,7 +1473,7 @@ class Markdown: # Check if the next non-blank line is still a part of the list if ( RE.regExp['ul'].match(next) or - RE.regExp['ol'].match(next) or + RE.regExp['ol'].match(next) or RE.regExp['tabbed'].match(next) ): # get rid of any white space in the line items[item].append(line.strip()) @@ -1618,7 +1614,7 @@ class Markdown: i = 0 while i < len(parts): - + x = parts[i] if isinstance(x, (str, unicode)): @@ -1641,14 +1637,14 @@ class Markdown: parts[i] = self.doc.createTextNode(x) return parts - + def _applyPattern(self, line, pattern, patternIndex): """ Given a pattern name, this function checks if the line fits the pattern, creates the necessary elements, and returns back a list consisting of NanoDom elements and/or strings. - + @param line: the text to be processed @param pattern: the pattern to be checked @@ -1676,19 +1672,19 @@ class Markdown: if not node.nodeName in ["code", "pre"]: for child in node.childNodes: if isinstance(child, TextNode): - + result = self._handleInline(child.value, patternIndex+1) - + if result: if result == [child]: continue - + result.reverse() #to make insertion easier position = node.childNodes.index(child) - + node.removeChild(child) for item in result: @@ -1699,7 +1695,7 @@ class Markdown: self.doc.createTextNode(item)) else: node.insertChild(position, item) - + @@ -1798,14 +1794,14 @@ def markdownFromFile(input = None, def markdown(text, extensions = [], safe_mode = False): - + message(DEBUG, "in markdown.markdown(), received text:\n%s" % text) extension_names = [] extension_configs = {} - + for ext in extensions: - pos = ext.find("(") + pos = ext.find("(") if pos == -1: extension_names.append(ext) else: @@ -1820,7 +1816,7 @@ def markdown(text, safe_mode = safe_mode) return md.convert(text) - + class Extension: @@ -1845,26 +1841,11 @@ Python 2.3 or higher required for advanced command line options. 
For lower versions of Python use: %s INPUT_FILE > OUTPUT_FILE - + """ % EXECUTABLE_NAME_FOR_USAGE def parse_options(): - - try: - optparse = __import__("optparse") - except: - if len(sys.argv) == 2: - return {'input': sys.argv[1], - 'output': None, - 'message_threshold': CRITICAL, - 'safe': False, - 'extensions': [], - 'encoding': None } - - else: - print OPTPARSE_WARNING - return None - + import optparse parser = optparse.OptionParser(usage="%prog INPUTFILE [options]") parser.add_option("-f", "--file", dest="filename", @@ -1881,7 +1862,7 @@ def parse_options(): parser.add_option("-s", "--safe", dest="safe", default=False, metavar="SAFE_MODE", help="same mode ('replace', 'remove' or 'escape' user's HTML tag)") - + parser.add_option("--noisy", action="store_const", const=DEBUG, dest="verbose", help="print debug messages") @@ -1914,14 +1895,14 @@ def main(): if not options: sys.exit(0) - + markdownFromFile(**options) if __name__ == '__main__': sys.exit(main()) """ Run Markdown from the command line. """ - + diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index 8e4dd1dd27..4100439feb 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -108,7 +108,7 @@ def _get_cover_url(br, asin): q = 'http://amzn.com/'+asin try: raw = br.open_novisit(q).read() - except Exception, e: + except Exception as e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return None @@ -139,7 +139,7 @@ def get_metadata(br, asin, mi): q = 'http://amzn.com/'+asin try: raw = br.open_novisit(q).read() - except Exception, e: + except Exception as e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return False diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py index 156fff3d75..248c8d9ed0 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -33,7 +33,7 @@ class AmazonFr(MetadataSource): try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose, lang='fr') - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -50,7 +50,7 @@ class AmazonEs(MetadataSource): try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose, lang='es') - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -67,7 +67,7 @@ class AmazonEn(MetadataSource): try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose, lang='en') - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -84,7 +84,7 @@ class AmazonDe(MetadataSource): try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose, lang='de') - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -103,7 +103,7 @@ class Amazon(MetadataSource): try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose, lang='all') - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -193,7 +193,7 @@ class Query(object): try: raw = browser.open_novisit(self.urldata, timeout=timeout).read() - except Exception, e: + except Exception as e: report(verbose) if 
callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -226,7 +226,7 @@ class Query(object): try: urldata = self.urldata + '&page=' + str(i) raw = browser.open_novisit(urldata, timeout=timeout).read() - except Exception, e: + except Exception as e: continue if '404 - ' in raw: continue @@ -413,7 +413,7 @@ class ResultList(list): def get_individual_metadata(self, browser, linkdata, verbose): try: raw = browser.open_novisit(linkdata).read() - except Exception, e: + except Exception as e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -445,7 +445,7 @@ class ResultList(list): # self.clean_entry(entry, invalid_id=inv_ids) title = self.get_title(entry) authors = self.get_authors(entry) - except Exception, e: + except Exception as e: if verbose: print 'Failed to get all details for an entry' print e diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py index 2bf23e4b82..91dcc29230 100644 --- a/src/calibre/ebooks/metadata/book/base.py +++ b/src/calibre/ebooks/metadata/book/base.py @@ -575,7 +575,10 @@ class Metadata(object): orig_res = res datatype = cmeta['datatype'] if datatype == 'text' and cmeta['is_multiple']: - res = u', '.join(sorted(res, key=sort_key)) + if cmeta['display'].get('is_names', False): + res = u' & '.join(res) + else: + res = u', '.join(sorted(res, key=sort_key)) elif datatype == 'series' and series_with_index: if self.get_extra(key) is not None: res = res + \ diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py index 6ea292aa93..aa2ae8387d 100644 --- a/src/calibre/ebooks/metadata/covers.py +++ b/src/calibre/ebooks/metadata/covers.py @@ -91,7 +91,7 @@ class OpenLibraryCovers(CoverDownload): # {{{ br.open_novisit(HeadRequest(self.OPENLIBRARY%mi.isbn), timeout=timeout) self.debug('cover for', mi.isbn, 'found') ans.set() - except Exception, e: + except Exception as e: if callable(getattr(e, 'getcode', None)) and e.getcode() == 302: self.debug('cover for', mi.isbn, 'found') ans.set() @@ -106,7 +106,7 @@ class OpenLibraryCovers(CoverDownload): # {{{ try: ans = br.open(self.OPENLIBRARY%mi.isbn, timeout=timeout).read() result_queue.put((True, ans, 'jpg', self.name)) - except Exception, e: + except Exception as e: if callable(getattr(e, 'getcode', None)) and e.getcode() == 404: result_queue.put((False, _('ISBN: %s not found')%mi.isbn, '', self.name)) else: @@ -131,7 +131,7 @@ class AmazonCovers(CoverDownload): # {{{ get_cover_url(mi.isbn, br) self.debug('cover for', mi.isbn, 'found') ans.set() - except Exception, e: + except Exception as e: self.debug(e) def get_covers(self, mi, result_queue, abort, timeout=5.): @@ -145,7 +145,7 @@ class AmazonCovers(CoverDownload): # {{{ raise ValueError('No cover found for ISBN: %s'%mi.isbn) cover_data = br.open_novisit(url).read() result_queue.put((True, cover_data, 'jpg', self.name)) - except Exception, e: + except Exception as e: result_queue.put((False, self.exception_to_string(e), traceback.format_exc(), self.name)) @@ -242,7 +242,7 @@ class DoubanCovers(CoverDownload): # {{{ try: url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY src = br.open(url, timeout=timeout).read() - except Exception, err: + except Exception as err: if isinstance(getattr(err, 'args', [None])[0], socket.timeout): err = Exception(_('Douban.com API timed out. 
Try again later.')) raise err @@ -275,7 +275,7 @@ class DoubanCovers(CoverDownload): # {{{ if self.get_cover_url(mi.isbn, br, timeout=timeout) != None: self.debug('cover for', mi.isbn, 'found') ans.set() - except Exception, e: + except Exception as e: self.debug(e) def get_covers(self, mi, result_queue, abort, timeout=5.): @@ -286,7 +286,7 @@ class DoubanCovers(CoverDownload): # {{{ url = self.get_cover_url(mi.isbn, br, timeout=timeout) cover_data = br.open_novisit(url).read() result_queue.put((True, cover_data, 'jpg', self.name)) - except Exception, e: + except Exception as e: result_queue.put((False, self.exception_to_string(e), traceback.format_exc(), self.name)) # }}} diff --git a/src/calibre/ebooks/metadata/douban.py b/src/calibre/ebooks/metadata/douban.py index c6a34b6162..98a51f69d1 100644 --- a/src/calibre/ebooks/metadata/douban.py +++ b/src/calibre/ebooks/metadata/douban.py @@ -49,7 +49,7 @@ class DoubanBooks(MetadataSource): self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose) - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -192,7 +192,7 @@ class ResultList(list): raw = browser.open(id_url).read() feed = etree.fromstring(raw) x = entry(feed)[0] - except Exception, e: + except Exception as e: if verbose: print 'Failed to get all details for an entry' print e @@ -212,7 +212,7 @@ def search(title=None, author=None, publisher=None, isbn=None, api_key = CALIBRE_DOUBAN_API_KEY while start > 0 and len(entries) <= max_results: - new, start = Query(title=title, author=author, publisher=publisher, + new, start = Query(title=title, author=author, publisher=publisher, isbn=isbn, max_results=max_results, start_index=start, api_key=api_key)(br, verbose) if not new: break diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py index f19b89eb88..27fa94e217 100644 --- a/src/calibre/ebooks/metadata/epub.py +++ b/src/calibre/ebooks/metadata/epub.py @@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' '''Read meta information from epub files''' -import os, re, posixpath, shutil +import os, re, posixpath from cStringIO import StringIO from contextlib import closing @@ -192,6 +192,13 @@ def get_metadata(stream, extract_cover=True): def get_quick_metadata(stream): return get_metadata(stream, False) +def _write_new_cover(new_cdata, cpath): + from calibre.utils.magick.draw import save_cover_data_to + new_cover = PersistentTemporaryFile(suffix=os.path.splitext(cpath)[1]) + new_cover.close() + save_cover_data_to(new_cdata, new_cover.name) + return new_cover + def set_metadata(stream, mi, apply_null=False, update_timestamp=False): stream.seek(0) reader = OCFZipReader(stream, root=os.getcwdu()) @@ -208,6 +215,7 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False): new_cdata = open(mi.cover, 'rb').read() except: pass + new_cover = cpath = None if new_cdata and raster_cover: try: cpath = posixpath.join(posixpath.dirname(reader.opf_path), @@ -215,19 +223,7 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False): cover_replacable = not reader.encryption_meta.is_encrypted(cpath) and \ os.path.splitext(cpath)[1].lower() in ('.png', '.jpg', '.jpeg') if cover_replacable: - from calibre.utils.magick.draw import save_cover_data_to, \ - identify - new_cover = PersistentTemporaryFile(suffix=os.path.splitext(cpath)[1]) - resize_to = None - if False: # Resize new cover to same size as old cover - 
shutil.copyfileobj(reader.open(cpath), new_cover) - new_cover.close() - width, height, fmt = identify(new_cover.name) - resize_to = (width, height) - else: - new_cover.close() - save_cover_data_to(new_cdata, new_cover.name, - resize_to=resize_to) + new_cover = _write_new_cover(new_cdata, cpath) replacements[cpath] = open(new_cover.name, 'rb') except: import traceback @@ -249,4 +245,11 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False): newopf = StringIO(reader.opf.render()) safe_replace(stream, reader.container[OPF.MIMETYPE], newopf, extra_replacements=replacements) + try: + if cpath is not None: + replacements[cpath].close() + os.remove(replacements[cpath].name) + except: + pass + diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 0401ee78c5..fb01c5dd71 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -93,7 +93,7 @@ class MetadataSource(Plugin): # {{{ traceback.print_exc() mi.comments = None - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -186,7 +186,7 @@ class GoogleBooks(MetadataSource): # {{{ self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose) - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -217,7 +217,7 @@ class ISBNDB(MetadataSource): # {{{ try: opts, args = option_parser().parse_args(args) self.results = create_books(opts, args) - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -244,7 +244,7 @@ class Amazon(MetadataSource): # {{{ try: self.results = get_social_metadata(self.title, self.book_author, self.publisher, self.isbn) - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -285,7 +285,7 @@ class KentDistrictLibrary(MetadataSource): # {{{ from calibre.ebooks.metadata.kdl import get_series try: self.results = get_series(self.title, self.book_author) - except Exception, e: + except Exception as e: import traceback traceback.print_exc() self.exception = e diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index b780f2b39d..145e39768d 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -30,7 +30,7 @@ class Fictionwise(MetadataSource): # {{{ try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose) - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -91,7 +91,7 @@ class Query(object): try: raw = browser.open_novisit(self.BASE_URL, self.urldata, timeout=timeout).read() - except Exception, e: + except Exception as e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -276,7 +276,7 @@ class ResultList(list): def get_individual_metadata(self, browser, linkdata, verbose): try: raw = browser.open_novisit(self.BASE_URL + linkdata).read() - except Exception, e: + except Exception as e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -311,7 +311,7 @@ class ResultList(list): #maybe strenghten the search ratings = self.get_rating(entry.xpath("./p/table")[1], verbose) authors = self.get_authors(entry) - except Exception, e: + except Exception as e: if verbose: print _('Failed to get all details for an entry') print e @@ 
-328,7 +328,7 @@ class ResultList(list): #maybe strenghten the search ratings = self.get_rating(entry.xpath("./p/table")[1], verbose) authors = self.get_authors(entry) - except Exception, e: + except Exception as e: if verbose: print _('Failed to get all details for an entry') print e diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py index 2087b7c489..5a5e09234e 100644 --- a/src/calibre/ebooks/metadata/google_books.py +++ b/src/calibre/ebooks/metadata/google_books.py @@ -176,7 +176,7 @@ class ResultList(list): raw = browser.open(id_url).read() feed = etree.fromstring(raw) x = entry(feed)[0] - except Exception, e: + except Exception as e: if verbose: print 'Failed to get all details for an entry' print e diff --git a/src/calibre/ebooks/metadata/imp.py b/src/calibre/ebooks/metadata/imp.py index e2a2b61f31..28bc2bc00f 100644 --- a/src/calibre/ebooks/metadata/imp.py +++ b/src/calibre/ebooks/metadata/imp.py @@ -38,7 +38,7 @@ def get_metadata(stream): mi.author = author if category: mi.category = category - except Exception, err: + except Exception as err: msg = u'Couldn\'t read metadata from imp: %s with error %s'%(mi.title, unicode(err)) print >>sys.stderr, msg.encode('utf8') return mi diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index 1c5f706593..54cd403c62 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -25,7 +25,7 @@ def fetch_metadata(url, max=3, timeout=5.): while len(books) < total_results and max > 0: try: raw = br.open(url, timeout=timeout).read() - except Exception, err: + except Exception as err: raise ISBNDBError('Could not fetch ISBNDB metadata. Error: '+str(err)) soup = BeautifulStoneSoup(raw, convertEntities=BeautifulStoneSoup.XML_ENTITIES) diff --git a/src/calibre/ebooks/metadata/kdl.py b/src/calibre/ebooks/metadata/kdl.py index b0b961b603..aa2f0d7246 100644 --- a/src/calibre/ebooks/metadata/kdl.py +++ b/src/calibre/ebooks/metadata/kdl.py @@ -43,7 +43,7 @@ def get_series(title, authors, timeout=60): br = browser() try: raw = br.open_novisit(url, timeout=timeout).read() - except URLError, e: + except URLError as e: if isinstance(e.reason, socket.timeout): raise Exception('KDL Server busy, try again later') raise diff --git a/src/calibre/ebooks/metadata/library_thing.py b/src/calibre/ebooks/metadata/library_thing.py index a0f28a3c21..be0cd5f324 100644 --- a/src/calibre/ebooks/metadata/library_thing.py +++ b/src/calibre/ebooks/metadata/library_thing.py @@ -45,7 +45,7 @@ def check_for_cover(isbn, timeout=5.): try: br.open_novisit(HeadRequest(OPENLIBRARY%isbn), timeout=timeout) return True - except Exception, e: + except Exception as e: if callable(getattr(e, 'getcode', None)) and e.getcode() == 302: return True return False diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 8914e2d985..2afa6c018a 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -32,7 +32,7 @@ class NiceBooks(MetadataSource): try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose) - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() @@ -54,7 +54,7 @@ class NiceBooksCovers(CoverDownload): if Covers(mi.isbn)(entry).check_cover(): self.debug('cover for', mi.isbn, 'found') ans.set() - except Exception, e: + except Exception as e: self.debug(e) def 
get_covers(self, mi, result_queue, abort, timeout=5.): @@ -67,7 +67,7 @@ class NiceBooksCovers(CoverDownload): if not ext: ext = 'jpg' result_queue.put((True, cover_data, ext, self.name)) - except Exception, e: + except Exception as e: result_queue.put((False, self.exception_to_string(e), traceback.format_exc(), self.name)) @@ -109,7 +109,7 @@ class Query(object): try: raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read() - except Exception, e: + except Exception as e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -144,7 +144,7 @@ class Query(object): try: urldata = self.urldata + '&p=' + str(i) raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read() - except Exception, e: + except Exception as e: continue if '<title>404 - ' in raw: continue @@ -233,7 +233,7 @@ class ResultList(list): def get_individual_metadata(self, browser, linkdata, verbose): try: raw = browser.open_novisit(self.BASE_URL + linkdata).read() - except Exception, e: + except Exception as e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -266,7 +266,7 @@ class ResultList(list): entry = entry.find("div[@id='book-info']") title = self.get_title(entry) authors = self.get_authors(entry) - except Exception, e: + except Exception as e: if verbose: print 'Failed to get all details for an entry' print e @@ -280,7 +280,7 @@ class ResultList(list): entry = entry.find("div[@id='book-info']") title = self.get_title(entry) authors = self.get_authors(entry) - except Exception, e: + except Exception as e: if verbose: print 'Failed to get all details for an entry' print e @@ -315,7 +315,7 @@ class Covers(object): cover, ext = browser.open_novisit(self.urlimg, timeout=timeout).read(), \ self.urlimg.rpartition('.')[-1] return cover, ext if ext else 'jpg' - except Exception, err: + except Exception as err: if isinstance(getattr(err, 'args', [None])[0], socket.timeout): raise NiceBooksError(_('Nicebooks timed out. 
Try again later.')) if not len(self.urlimg): diff --git a/src/calibre/ebooks/metadata/rb.py b/src/calibre/ebooks/metadata/rb.py index 1f13ce1d9d..c8ab657146 100644 --- a/src/calibre/ebooks/metadata/rb.py +++ b/src/calibre/ebooks/metadata/rb.py @@ -43,7 +43,7 @@ def get_metadata(stream): elif key.strip() == 'AUTHOR': mi.author = value mi.authors = string_to_authors(value) - except Exception, err: + except Exception as err: msg = u'Couldn\'t read metadata from rb: %s with error %s'%(mi.title, unicode(err)) print >>sys.stderr, msg.encode('utf8') raise diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 9460ed7ace..c9c7350a74 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -41,12 +41,12 @@ class Worker(Thread): # {{{ try: self.get_details() except: - self.log.error('get_details failed for url: %r'%self.url) + self.log.exception('get_details failed for url: %r'%self.url) def get_details(self): try: raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip() - except Exception, e: + except Exception as e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: self.log.error('URL malformed: %r'%self.url) @@ -168,7 +168,7 @@ class Worker(Thread): # {{{ if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id) if self.cover_url: - self.cache_identifier_to_cover_url(self.amazon_id, + self.plugin.cache_identifier_to_cover_url(self.amazon_id, self.cover_url) self.result_queue.put(mi) @@ -359,7 +359,7 @@ class Amazon(Source): br = self.browser try: raw = br.open_novisit(query, timeout=timeout).read().strip() - except Exception, e: + except Exception as e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: log.error('Query malformed: %r'%query) diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 6fc52eb88b..0d820c2bae 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -21,6 +21,7 @@ def create_log(ostream=None): log.outputs = [FileStream(ostream)] return log +# Comparing Metadata objects for relevance {{{ words = ("the", "a", "an", "of", "and") prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words))) trailing_paren_pat = re.compile(r'\(.*\)$') @@ -35,6 +36,55 @@ def cleanup_title(s): s = whitespace_pat.sub(' ', s) return s.strip() +class InternalMetadataCompareKeyGen(object): + + ''' + Generate a sort key for comparison of the relevance of Metadata objects, + given a search query. + + The sort key ensures that an ascending order sort is a sort by order of + decreasing relevance. + + The algorithm is: + + 1. Prefer results that have the same ISBN as specified in the query + 2. Prefer results with all available fields filled in + 3. Prefer results that are an exact title match to the query + 4. Prefer results with longer comments (greater than 10 % longer) + 5. Prefer results with a cached cover URL + 6. 
Use the relevance of the result as reported by the metadata source's search + engine + ''' + + def __init__(self, mi, source_plugin, title, authors, identifiers): + isbn = 1 if mi.isbn and mi.isbn == identifiers.get('isbn', None) else 2 + + all_fields = 1 if source_plugin.test_fields(mi) is None else 2 + + exact_title = 1 if title and \ + cleanup_title(title) == cleanup_title(mi.title) else 2 + + has_cover = 2 if source_plugin.get_cached_cover_url(mi.identifiers)\ + is None else 1 + + self.base = (isbn, all_fields, exact_title) + self.comments_len = len(mi.comments.strip() if mi.comments else '') + self.extra = (has_cover, getattr(mi, 'source_relevance', 0)) + + def __cmp__(self, other): + result = cmp(self.base, other.base) + if result == 0: + # Now prefer results with the longer comments, within 10% + cx, cy = self.comments_len, other.comments_len + t = (cx + cy) / 20 + delta = cy - cx + if abs(delta) > t: + result = delta + else: + result = cmp(self.extra, other.extra) + return result + +# }}} class Source(Plugin): @@ -70,7 +120,7 @@ class Source(Plugin): def browser(self): if self._browser is None: self._browser = browser(user_agent=random_user_agent()) - return self._browser + return self._browser.clone_browser() # }}} @@ -172,69 +222,30 @@ class Source(Plugin): def get_cached_cover_url(self, identifiers): ''' Return cached cover URL for the book identified by - the identifiers dict or Noneif no such URL exists + the identifiers dict or None if no such URL exists. + + Note that this method must only return validated URLs, i.e. not URLS + that could result in a generic cover image or a not found error. ''' return None - def compare_identify_results(self, x, y, title=None, authors=None, + def identify_results_keygen(self, title=None, authors=None, identifiers={}): ''' - Method used to sort the results from a call to identify by relevance. - Uses the actual query and various heuristics to rank results. - Re-implement in your plugin if this generic algorithm is not suitable. - Note that this method assumes x and y have a source_relevance - attribute. + Return a function that is used to generate a key that can sort Metadata + objects by their relevance given a search query (title, authors, + identifiers). - one < two iff one is more relevant than two + These keys are used to sort the results of a call to :meth:`identify`. + + For details on the default algorithm see + :class:`InternalMetadataCompareKeyGen`. Re-implement this function in + your plugin if the default algorithm is not suitable. 
''' - # First, guarantee that if the query specifies an ISBN, the result with - # the same isbn is the most relevant - def isbn_test(mi): - return mi.isbn and mi.isbn == identifiers.get('isbn', None) - - def boolcmp(a, b): - return -1 if a and not b else 1 if not a and b else 0 - - x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y) - result = boolcmp(x_has_isbn, y_has_isbn) - if result != 0: - return result - - # Now prefer results that have complete metadata over those that don't - x_has_all_fields = self.test_fields(x) is None - y_has_all_fields = self.test_fields(y) is None - - result = boolcmp(x_has_all_fields, y_has_all_fields) - if result != 0: - return result - - # Now prefer results whose title matches the search query - if title: - x_title = cleanup_title(x.title) - y_title = cleanup_title(y.title) - t = cleanup_title(title) - x_has_title, y_has_title = x_title == t, y_title == t - result = boolcmp(x_has_title, y_has_title) - if result != 0: - return result - - # Now prefer results with the longer comments, within 10% - cx = len(x.comments.strip() if x.comments else '') - cy = len(y.comments.strip() if y.comments else '') - t = (cx + cy) / 20 - result = cy - cx - if result != 0 and abs(cx - cy) > t: - return result - - # Now prefer results with cached cover URLs - x_has_cover = self.get_cached_cover_url(x.identifiers) is not None - y_has_cover = self.get_cached_cover_url(y.identifiers) is not None - result = boolcmp(x_has_cover, y_has_cover) - if result != 0: - return result - - # Now use the relevance reported by the remote search engine - return x.source_relevance - y.source_relevance + def keygen(mi): + return InternalMetadataCompareKeyGen(mi, self, title, authors, + identifiers) + return keygen def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=5): diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index b7298c0099..06362cf8b8 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -213,7 +213,7 @@ class GoogleBooks(Source): br = self.browser try: raw = br.open_novisit(query, timeout=timeout).read() - except Exception, e: + except Exception as e: log.exception('Failed to make identify query: %r'%query) return as_unicode(e) @@ -222,7 +222,7 @@ class GoogleBooks(Source): feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], parser=parser) entries = entry(feed) - except Exception, e: + except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py index 032041ef29..3419a91d31 100644 --- a/src/calibre/ebooks/metadata/sources/test.py +++ b/src/calibre/ebooks/metadata/sources/test.py @@ -11,7 +11,6 @@ import os, tempfile, time from Queue import Queue, Empty from threading import Event - from calibre.customize.ui import metadata_plugins from calibre import prints from calibre.ebooks.metadata import check_isbn @@ -90,11 +89,17 @@ def test_identify_plugin(name, tests): except Empty: break - prints('Found', len(results), 'matches:') + prints('Found', len(results), 'matches:', end=' ') + prints('Smaller relevance means better match') - for mi in results: + results.sort(key=plugin.identify_results_keygen( + title=kwargs.get('title', None), authors=kwargs.get('authors', + None), identifiers=kwargs.get('identifiers', {}))) + + for i, mi in enumerate(results): + 
prints('*'*30, 'Relevance:', i, '*'*30) prints(mi) - prints('\n\n') + prints('*'*75, '\n\n') possibles = [] for mi in results: @@ -117,6 +122,9 @@ def test_identify_plugin(name, tests): prints('Failed to find', plugin.test_fields(possibles[0])) raise SystemExit(1) + if results[0] is not possibles[0]: + prints('Most relevant result failed the tests') + prints('Average time per query', sum(times)/len(times)) diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index 10d45186de..17f99150be 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -147,7 +147,7 @@ class TOC(list): if path and os.access(path, os.R_OK): try: self.read_ncx_toc(path) - except Exception, err: + except Exception as err: print 'WARNING: Invalid NCX file:', err return cwd = os.path.abspath(self.base_path) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index f1b1b1ef63..8877ecdd0b 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -769,7 +769,8 @@ class MobiReader(object): def extract_text(self): self.log.debug('Extracting text...') - text_sections = [self.text_section(i) for i in range(1, self.book_header.records + 1)] + text_sections = [self.text_section(i) for i in range(1, + min(self.book_header.records + 1, len(self.sections)))] processed_records = list(range(0, self.book_header.records + 1)) self.mobi_html = '' diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 3bd936b803..e5f2cace7f 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -884,13 +884,13 @@ class Manifest(object): def first_pass(data): try: data = etree.fromstring(data, parser=parser) - except etree.XMLSyntaxError, err: + except etree.XMLSyntaxError as err: self.oeb.log.exception('Initial parse failed:') repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) data = ENTITY_RE.sub(repl, data) try: data = etree.fromstring(data, parser=parser) - except etree.XMLSyntaxError, err: + except etree.XMLSyntaxError as err: self.oeb.logger.warn('Parsing file %r as HTML' % self.href) if err.args and err.args[0].startswith('Excessive depth'): from lxml.html import soupparser diff --git a/src/calibre/ebooks/pdf/manipulate/decrypt.py b/src/calibre/ebooks/pdf/manipulate/decrypt.py index ede12f15ee..fd8510efc7 100644 --- a/src/calibre/ebooks/pdf/manipulate/decrypt.py +++ b/src/calibre/ebooks/pdf/manipulate/decrypt.py @@ -103,7 +103,7 @@ def main(args=sys.argv, name=''): try: decrypt(args[0], opts.output, args[1]) - except DecryptionError, e: + except DecryptionError as e: print e.value return 1 diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 564ba14a32..4ac1d0e368 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -50,7 +50,7 @@ def pdftohtml(output_dir, pdf_path, no_images): try: p = popen(cmd, stderr=logf._fd, stdout=logf._fd, stdin=subprocess.PIPE) - except OSError, err: + except OSError as err: if err.errno == 2: raise ConversionError(_('Could not find pdftohtml, check it is in your PATH')) else: @@ -60,7 +60,7 @@ def pdftohtml(output_dir, pdf_path, no_images): try: ret = p.wait() break - except OSError, e: + except OSError as e: if e.errno == errno.EINTR: continue else: diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 1594b2fbce..23c16f473d 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -268,7 +268,7 
@@ class RTFInput(InputFormatPlugin): self.log('Converting RTF to XML...') try: xml = self.generate_xml(stream.name) - except RtfInvalidCodeException, e: + except RtfInvalidCodeException as e: raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.\n%s')%e) diff --git a/src/calibre/ebooks/snb/snbfile.py b/src/calibre/ebooks/snb/snbfile.py index 9a7d65e417..1a0986baf4 100644 --- a/src/calibre/ebooks/snb/snbfile.py +++ b/src/calibre/ebooks/snb/snbfile.py @@ -85,7 +85,7 @@ class SNBFile: uncompressedData += bzdc.decompress(data) else: uncompressedData += data - except Exception, e: + except Exception as e: print e if len(uncompressedData) != self.plainStreamSizeUncompressed: raise Exception() diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index c3c82ef893..b01a7bcdb7 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- + """ PyTextile @@ -206,6 +208,12 @@ class Textile(object): (re.compile(r'{clubs?}'), r'♣'), # club (re.compile(r'{hearts?}'), r'♥'), # heart (re.compile(r'{diam(onds?|s)}'), r'♦'), # diamond + (re.compile(r'{"}'), r'"'), # double-quote + (re.compile(r"{'}"), r'''), # single-quote + (re.compile(r"{(’|'/|/')}"), r'’'), # closing-single-quote - apostrophe + (re.compile(r"{(‘|\\'|'\\)}"), r'‘'), # opening-single-quote + (re.compile(r'{(”|"/|/")}'), r'”'), # closing-double-quote + (re.compile(r'{(“|\\"|"\\)}'), r'“'), # opening-double-quote ] glyph_defaults = [ (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 7face4c24f..cad55b8c3f 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -165,7 +165,7 @@ class TXTInput(InputFormatPlugin): elif options.formatting_type == 'textile': log.debug('Running text through textile conversion...') html = convert_textile(txt) - setattr(options, 'smarten_punctuation', True) + #setattr(options, 'smarten_punctuation', True) else: log.debug('Running text through basic conversion...') flow_size = getattr(options, 'flow_size', 0) diff --git a/src/calibre/gui2/actions/copy_to_library.py b/src/calibre/gui2/actions/copy_to_library.py index 0668baeac6..2e4d0380be 100644 --- a/src/calibre/gui2/actions/copy_to_library.py +++ b/src/calibre/gui2/actions/copy_to_library.py @@ -32,7 +32,7 @@ class Worker(Thread): def run(self): try: self.doit() - except Exception, err: + except Exception as err: import traceback try: err = unicode(err) diff --git a/src/calibre/gui2/add.py b/src/calibre/gui2/add.py index f40cf0ff75..44b5bb446b 100644 --- a/src/calibre/gui2/add.py +++ b/src/calibre/gui2/add.py @@ -78,7 +78,7 @@ class RecursiveFind(QThread): # {{{ if isinstance(root, unicode): root = root.encode(filesystem_encoding) self.walk(root) - except Exception, err: + except Exception as err: import traceback traceback.print_exc() try: diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py index 925fecd693..bdcf9ede05 100644 --- a/src/calibre/gui2/convert/__init__.py +++ b/src/calibre/gui2/convert/__init__.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import textwrap, codecs +import textwrap, codecs, importlib from functools import partial from PyQt4.Qt import QWidget, QSpinBox, QDoubleSpinBox, 
QLineEdit, QTextEdit, \ @@ -22,8 +22,8 @@ from calibre.customize.ui import plugin_for_input_format def config_widget_for_input_plugin(plugin): name = plugin.name.lower().replace(' ', '_') try: - return __import__('calibre.gui2.convert.'+name, - fromlist=[1]).PluginWidget + return importlib.import_module( + 'calibre.gui2.convert.'+name).PluginWidget except ImportError: pass diff --git a/src/calibre/gui2/convert/bulk.py b/src/calibre/gui2/convert/bulk.py index 349f39ac76..576b3ca3e7 100644 --- a/src/calibre/gui2/convert/bulk.py +++ b/src/calibre/gui2/convert/bulk.py @@ -4,7 +4,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import shutil +import shutil, importlib from PyQt4.Qt import QString, SIGNAL @@ -82,8 +82,8 @@ class BulkConfig(Config): output_widget = None name = self.plumber.output_plugin.name.lower().replace(' ', '_') try: - output_widget = __import__('calibre.gui2.convert.'+name, - fromlist=[1]) + output_widget = importlib.import_module( + 'calibre.gui2.convert.'+name) pw = output_widget.PluginWidget pw.ICON = I('back.png') pw.HELP = _('Options specific to the output format.') diff --git a/src/calibre/gui2/convert/metadata.py b/src/calibre/gui2/convert/metadata.py index 95dd7623c9..80311502e8 100644 --- a/src/calibre/gui2/convert/metadata.py +++ b/src/calibre/gui2/convert/metadata.py @@ -192,7 +192,7 @@ class MetadataWidget(Widget, Ui_Form): try: cf = open(_file, "rb") cover = cf.read() - except IOError, e: + except IOError as e: d = error_dialog(self.parent(), _('Error reading file'), _("<p>There was an error reading from file: <br /><b>") + _file + "</b></p><br />"+str(e)) d.exec_() diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py index c2241ff8eb..407e7922e7 100644 --- a/src/calibre/gui2/convert/search_and_replace.py +++ b/src/calibre/gui2/convert/search_and_replace.py @@ -69,7 +69,7 @@ class SearchAndReplaceWidget(Widget, Ui_Form): try: pat = unicode(x.regex) re.compile(pat) - except Exception, err: + except Exception as err: error_dialog(self, _('Invalid regular expression'), _('Invalid regular expression: %s')%err, show=True) return False diff --git a/src/calibre/gui2/convert/single.py b/src/calibre/gui2/convert/single.py index 59fcbb65ad..3575fb5ffb 100644 --- a/src/calibre/gui2/convert/single.py +++ b/src/calibre/gui2/convert/single.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import sys, cPickle, shutil +import sys, cPickle, shutil, importlib from PyQt4.Qt import QString, SIGNAL, QAbstractListModel, Qt, QVariant, QFont @@ -182,8 +182,8 @@ class Config(ResizableDialog, Ui_Dialog): output_widget = None name = self.plumber.output_plugin.name.lower().replace(' ', '_') try: - output_widget = __import__('calibre.gui2.convert.'+name, - fromlist=[1]) + output_widget = importlib.import_module( + 'calibre.gui2.convert.'+name) pw = output_widget.PluginWidget pw.ICON = I('back.png') pw.HELP = _('Options specific to the output format.') @@ -193,8 +193,8 @@ class Config(ResizableDialog, Ui_Dialog): input_widget = None name = self.plumber.input_plugin.name.lower().replace(' ', '_') try: - input_widget = __import__('calibre.gui2.convert.'+name, - fromlist=[1]) + input_widget = importlib.import_module( + 'calibre.gui2.convert.'+name) pw = input_widget.PluginWidget pw.ICON = I('forward.png') pw.HELP = _('Options specific to the input format.') diff --git 
a/src/calibre/gui2/custom_column_widgets.py b/src/calibre/gui2/custom_column_widgets.py index beaca77a38..10602fb28c 100644 --- a/src/calibre/gui2/custom_column_widgets.py +++ b/src/calibre/gui2/custom_column_widgets.py @@ -226,10 +226,18 @@ class Comments(Base): class Text(Base): def setup_ui(self, parent): + if self.col_metadata['display'].get('is_names', False): + self.sep = u' & ' + else: + self.sep = u', ' values = self.all_values = list(self.db.all_custom(num=self.col_id)) values.sort(key=sort_key) if self.col_metadata['is_multiple']: w = MultiCompleteLineEdit(parent) + w.set_separator(self.sep.strip()) + if self.sep == u' & ': + w.set_space_before_sep(True) + w.set_add_separator(tweaks['authors_completer_append_separator']) w.update_items_cache(values) w.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Preferred) else: @@ -261,12 +269,12 @@ class Text(Base): if self.col_metadata['is_multiple']: if not val: val = [] - self.widgets[1].setText(u', '.join(val)) + self.widgets[1].setText(self.sep.join(val)) def getter(self): if self.col_metadata['is_multiple']: val = unicode(self.widgets[1].text()).strip() - ans = [x.strip() for x in val.split(',') if x.strip()] + ans = [x.strip() for x in val.split(self.sep.strip()) if x.strip()] if not ans: ans = None return ans @@ -847,13 +855,20 @@ class BulkText(BulkBase): self.main_widget.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Preferred) self.adding_widget = self.main_widget - w = RemoveTags(parent, values) - self.widgets.append(QLabel('&'+self.col_metadata['name']+': ' + - _('tags to remove'), parent)) - self.widgets.append(w) - self.removing_widget = w - w.tags_box.textChanged.connect(self.a_c_checkbox_changed) - w.checkbox.stateChanged.connect(self.a_c_checkbox_changed) + if not self.col_metadata['display'].get('is_names', False): + w = RemoveTags(parent, values) + self.widgets.append(QLabel('&'+self.col_metadata['name']+': ' + + _('tags to remove'), parent)) + self.widgets.append(w) + self.removing_widget = w + self.main_widget.set_separator(',') + w.tags_box.textChanged.connect(self.a_c_checkbox_changed) + w.checkbox.stateChanged.connect(self.a_c_checkbox_changed) + else: + self.main_widget.set_separator('&') + self.main_widget.set_space_before_sep(True) + self.main_widget.set_add_separator( + tweaks['authors_completer_append_separator']) else: self.make_widgets(parent, MultiCompleteComboBox) self.main_widget.set_separator(None) @@ -882,21 +897,26 @@ class BulkText(BulkBase): if not self.a_c_checkbox.isChecked(): return if self.col_metadata['is_multiple']: - remove_all, adding, rtext = self.gui_val - remove = set() - if remove_all: - remove = set(self.db.all_custom(num=self.col_id)) + if self.col_metadata['display'].get('is_names', False): + val = self.gui_val + add = [v.strip() for v in val.split('&') if v.strip()] + self.db.set_custom_bulk(book_ids, add, num=self.col_id) else: - txt = rtext + remove_all, adding, rtext = self.gui_val + remove = set() + if remove_all: + remove = set(self.db.all_custom(num=self.col_id)) + else: + txt = rtext + if txt: + remove = set([v.strip() for v in txt.split(',')]) + txt = adding if txt: - remove = set([v.strip() for v in txt.split(',')]) - txt = adding - if txt: - add = set([v.strip() for v in txt.split(',')]) - else: - add = set() - self.db.set_custom_bulk_multiple(book_ids, add=add, remove=remove, - num=self.col_id) + add = set([v.strip() for v in txt.split(',')]) + else: + add = set() + self.db.set_custom_bulk_multiple(book_ids, add=add, + remove=remove, num=self.col_id) else: val = self.gui_val 
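# Illustrative aside, not part of the patch: a minimal sketch of the separator
# convention the new 'is_names' flag selects in the widgets above. Names-style
# columns follow the authors convention (' & ' between entries), while ordinary
# tag-like multiples stay comma separated. The helper names are hypothetical.
def join_multiple(values, is_names=False):
    return (u' & ' if is_names else u', ').join(values)

def split_multiple(text, is_names=False):
    sep = u'&' if is_names else u','
    return [v.strip() for v in text.split(sep) if v.strip()]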
val = self.normalize_ui_val(val) @@ -905,10 +925,11 @@ class BulkText(BulkBase): def getter(self): if self.col_metadata['is_multiple']: - return self.removing_widget.checkbox.isChecked(), \ - unicode(self.adding_widget.text()), \ - unicode(self.removing_widget.tags_box.text()) - + if not self.col_metadata['display'].get('is_names', False): + return self.removing_widget.checkbox.isChecked(), \ + unicode(self.adding_widget.text()), \ + unicode(self.removing_widget.tags_box.text()) + return unicode(self.adding_widget.text()) val = unicode(self.main_widget.currentText()).strip() if not val: val = None diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 215e67c46f..ab2177cef1 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -64,7 +64,7 @@ class DeviceJob(BaseJob): # {{{ self.result = self.func(*self.args, **self.kwargs) if self._aborted: return - except (Exception, SystemExit), err: + except (Exception, SystemExit) as err: if self._aborted: return self.failed = True @@ -162,7 +162,7 @@ class DeviceManager(Thread): # {{{ dev.reset(detected_device=detected_device, report_progress=self.report_progress) dev.open(self.current_library_uuid) - except OpenFeedback, e: + except OpenFeedback as e: if dev not in self.ejected_devices: self.open_feedback_msg(dev.get_gui_name(), e.feedback_msg) self.ejected_devices.add(dev) diff --git a/src/calibre/gui2/device_drivers/configwidget.py b/src/calibre/gui2/device_drivers/configwidget.py index 97c492b550..fc7e16e639 100644 --- a/src/calibre/gui2/device_drivers/configwidget.py +++ b/src/calibre/gui2/device_drivers/configwidget.py @@ -133,7 +133,7 @@ class ConfigWidget(QWidget, Ui_ConfigWidget): try: validation_formatter.validate(tmpl) return True - except Exception, err: + except Exception as err: error_dialog(self, _('Invalid template'), '<p>'+_('The template %s is invalid:')%tmpl + \ '<br>'+unicode(err), show=True) diff --git a/src/calibre/gui2/dialogs/catalog.py b/src/calibre/gui2/dialogs/catalog.py index ebca7235eb..a8f7ed160f 100644 --- a/src/calibre/gui2/dialogs/catalog.py +++ b/src/calibre/gui2/dialogs/catalog.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import os, sys +import os, sys, importlib from calibre.customize.ui import config from calibre.gui2.dialogs.catalog_ui import Ui_Dialog @@ -43,8 +43,7 @@ class Catalog(ResizableDialog, Ui_Dialog): name = plugin.name.lower().replace(' ', '_') if type(plugin) in builtin_plugins: try: - catalog_widget = __import__('calibre.gui2.catalog.'+name, - fromlist=[1]) + catalog_widget = importlib.import_module('calibre.gui2.catalog.'+name) pw = catalog_widget.PluginWidget() pw.initialize(name, db) pw.ICON = I('forward.png') @@ -75,7 +74,7 @@ class Catalog(ResizableDialog, Ui_Dialog): # Import the dynamic PluginWidget() from .py file provided in plugin.zip try: sys.path.insert(0, plugin.resources_path) - catalog_widget = __import__(name, fromlist=[1]) + catalog_widget = importlib.import_module(name) pw = catalog_widget.PluginWidget() pw.initialize(name) pw.ICON = I('forward.png') diff --git a/src/calibre/gui2/dialogs/check_library.py b/src/calibre/gui2/dialogs/check_library.py index 560090d2b3..95f99d4034 100644 --- a/src/calibre/gui2/dialogs/check_library.py +++ b/src/calibre/gui2/dialogs/check_library.py @@ -68,7 +68,7 @@ class DBCheck(QDialog): # {{{ self.start_load() return QTimer.singleShot(0, self.do_one_dump) - except Exception, e: + except Exception as e: import 
traceback self.error = (as_unicode(e), traceback.format_exc()) self.reject() @@ -90,7 +90,7 @@ class DBCheck(QDialog): # {{{ self.conn.commit() QTimer.singleShot(0, self.do_one_load) - except Exception, e: + except Exception as e: import traceback self.error = (as_unicode(e), traceback.format_exc()) self.reject() @@ -111,7 +111,7 @@ class DBCheck(QDialog): # {{{ self.pb.setValue(self.pb.value() + 1) self.count -= 1 QTimer.singleShot(0, self.do_one_load) - except Exception, e: + except Exception as e: import traceback self.error = (as_unicode(e), traceback.format_exc()) self.reject() diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py index 9b25545252..0683f2cb91 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.py +++ b/src/calibre/gui2/dialogs/metadata_bulk.py @@ -120,7 +120,7 @@ class MyBlockingBusy(QDialog): # {{{ self.msg.setText(self.msg_text.format(self.phases[self.current_phase], percent)) self.do_one(id) - except Exception, err: + except Exception as err: import traceback try: err = unicode(err) @@ -653,7 +653,10 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog): if self.destination_field_fm['is_multiple']: if self.comma_separated.isChecked(): - if dest == 'authors': + if dest == 'authors' or \ + (self.destination_field_fm['is_custom'] and + self.destination_field_fm['datatype'] == 'text' and + self.destination_field_fm['display'].get('is_names', False)): splitter = ' & ' else: splitter = ',' diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 9efe7f7160..f6b7b94453 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -76,7 +76,7 @@ class CoverFetcher(Thread): # {{{ self.cover_data, self.errors = download_cover(mi, timeout=self.timeout) - except Exception, e: + except Exception as e: self.exception = e self.traceback = traceback.format_exc() print self.traceback @@ -183,7 +183,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): try: cf = open(_file, "rb") cover = cf.read() - except IOError, e: + except IOError as e: d = error_dialog(self, _('Error reading file'), _("<p>There was an error reading from file: <br /><b>") + _file + "</b></p><br />"+str(e)) d.exec_() diff --git a/src/calibre/gui2/dialogs/tag_editor.py b/src/calibre/gui2/dialogs/tag_editor.py index 6bd8eb7dbe..bf3bb9fd4e 100644 --- a/src/calibre/gui2/dialogs/tag_editor.py +++ b/src/calibre/gui2/dialogs/tag_editor.py @@ -122,6 +122,8 @@ class TagEditor(QDialog, Ui_TagEditor): tags = unicode(self.add_tag_input.text()).split(',') for tag in tags: tag = tag.strip() + if not tag: + continue for item in self.available_tags.findItems(tag, Qt.MatchFixedString): self.available_tags.takeItem(self.available_tags.row(item)) if tag not in self.tags: diff --git a/src/calibre/gui2/dialogs/user_profiles.py b/src/calibre/gui2/dialogs/user_profiles.py index 5453a90766..d66d02d211 100644 --- a/src/calibre/gui2/dialogs/user_profiles.py +++ b/src/calibre/gui2/dialogs/user_profiles.py @@ -237,7 +237,7 @@ class %(classname)s(%(base_class)s): try: compile_recipe(src) - except Exception, err: + except Exception as err: error_dialog(self, _('Invalid input'), _('<p>Could not create recipe. 
Error:<br>%s')%str(err)).exec_() return @@ -246,7 +246,7 @@ class %(classname)s(%(base_class)s): src = unicode(self.source_code.toPlainText()) try: title = compile_recipe(src).title - except Exception, err: + except Exception as err: error_dialog(self, _('Invalid input'), _('<p>Could not create recipe. Error:<br>%s')%str(err)).exec_() return @@ -333,7 +333,7 @@ class %(classname)s(%(base_class)s): try: profile = open(file, 'rb').read().decode('utf-8') title = compile_recipe(profile).title - except Exception, err: + except Exception as err: error_dialog(self, _('Invalid input'), _('<p>Could not create recipe. Error:<br>%s')%str(err)).exec_() return diff --git a/src/calibre/gui2/dnd.py b/src/calibre/gui2/dnd.py index 928de72578..1f9dbdfa34 100644 --- a/src/calibre/gui2/dnd.py +++ b/src/calibre/gui2/dnd.py @@ -35,7 +35,7 @@ class Worker(Thread): # {{{ try: br = browser() br.retrieve(self.url, self.fpath, self.callback) - except Exception, e: + except Exception as e: self.err = as_unicode(e) import traceback self.tb = traceback.format_exc() diff --git a/src/calibre/gui2/email.py b/src/calibre/gui2/email.py index c84b3180f7..81c1d9c255 100644 --- a/src/calibre/gui2/email.py +++ b/src/calibre/gui2/email.py @@ -116,7 +116,7 @@ class Emailer(Thread): # {{{ try: self.sendmail(job) break - except Exception, e: + except Exception as e: if not self._run: return import traceback diff --git a/src/calibre/gui2/library/delegates.py b/src/calibre/gui2/library/delegates.py index 3a090f8102..0f74500099 100644 --- a/src/calibre/gui2/library/delegates.py +++ b/src/calibre/gui2/library/delegates.py @@ -398,7 +398,7 @@ class CcTemplateDelegate(QStyledItemDelegate): # {{{ val = unicode(editor.textbox.toPlainText()) try: validation_formatter.validate(val) - except Exception, err: + except Exception as err: error_dialog(self.parent(), _('Invalid template'), '<p>'+_('The template %s is invalid:')%val + \ '<br>'+str(err), show=True) diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py index a200562ea9..c921ea125f 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -640,18 +640,18 @@ class BooksModel(QAbstractTableModel): # {{{ return self.bool_yes_icon return self.bool_blank_icon - def text_type(r, mult=False, idx=-1): + def text_type(r, mult=None, idx=-1): text = self.db.data[r][idx] - if text and mult: - return QVariant(', '.join(sorted(text.split('|'),key=sort_key))) + if text and mult is not None: + if mult: + return QVariant(u' & '.join(text.split('|'))) + return QVariant(u', '.join(sorted(text.split('|'),key=sort_key))) return QVariant(text) - def decorated_text_type(r, mult=False, idx=-1): + def decorated_text_type(r, idx=-1): text = self.db.data[r][idx] if force_to_bool(text) is not None: return None - if text and mult: - return QVariant(', '.join(sorted(text.split('|'),key=sort_key))) return QVariant(text) def number_type(r, idx=-1): @@ -659,7 +659,7 @@ class BooksModel(QAbstractTableModel): # {{{ self.dc = { 'title' : functools.partial(text_type, - idx=self.db.field_metadata['title']['rec_index'], mult=False), + idx=self.db.field_metadata['title']['rec_index'], mult=None), 'authors' : functools.partial(authors, idx=self.db.field_metadata['authors']['rec_index']), 'size' : functools.partial(size, @@ -671,14 +671,14 @@ class BooksModel(QAbstractTableModel): # {{{ 'rating' : functools.partial(rating_type, idx=self.db.field_metadata['rating']['rec_index']), 'publisher': functools.partial(text_type, - 
idx=self.db.field_metadata['publisher']['rec_index'], mult=False), + idx=self.db.field_metadata['publisher']['rec_index'], mult=None), 'tags' : functools.partial(tags, idx=self.db.field_metadata['tags']['rec_index']), 'series' : functools.partial(series_type, idx=self.db.field_metadata['series']['rec_index'], siix=self.db.field_metadata['series_index']['rec_index']), 'ondevice' : functools.partial(text_type, - idx=self.db.field_metadata['ondevice']['rec_index'], mult=False), + idx=self.db.field_metadata['ondevice']['rec_index'], mult=None), } self.dc_decorator = { @@ -692,11 +692,12 @@ class BooksModel(QAbstractTableModel): # {{{ datatype = self.custom_columns[col]['datatype'] if datatype in ('text', 'comments', 'composite', 'enumeration'): mult=self.custom_columns[col]['is_multiple'] + if mult is not None: + mult = self.custom_columns[col]['display'].get('is_names', False) self.dc[col] = functools.partial(text_type, idx=idx, mult=mult) if datatype in ['text', 'composite', 'enumeration'] and not mult: if self.custom_columns[col]['display'].get('use_decorations', False): - self.dc[col] = functools.partial(decorated_text_type, - idx=idx, mult=mult) + self.dc[col] = functools.partial(decorated_text_type, idx=idx) self.dc_decorator[col] = functools.partial( bool_type_decorator, idx=idx, bool_cols_are_tristate= diff --git a/src/calibre/gui2/library/views.py b/src/calibre/gui2/library/views.py index c62936a46f..0cce33da9e 100644 --- a/src/calibre/gui2/library/views.py +++ b/src/calibre/gui2/library/views.py @@ -78,6 +78,7 @@ class BooksView(QTableView): # {{{ self.pubdate_delegate = PubDateDelegate(self) self.tags_delegate = CompleteDelegate(self, ',', 'all_tags') self.authors_delegate = CompleteDelegate(self, '&', 'all_author_names', True) + self.cc_names_delegate = CompleteDelegate(self, '&', 'all_custom', True) self.series_delegate = TextDelegate(self) self.publisher_delegate = TextDelegate(self) self.text_delegate = TextDelegate(self) @@ -410,6 +411,7 @@ class BooksView(QTableView): # {{{ self.save_state() self._model.set_database(db) self.tags_delegate.set_database(db) + self.cc_names_delegate.set_database(db) self.authors_delegate.set_database(db) self.series_delegate.set_auto_complete_function(db.all_series) self.publisher_delegate.set_auto_complete_function(db.all_publishers) @@ -431,12 +433,17 @@ class BooksView(QTableView): # {{{ self.setItemDelegateForColumn(cm.index(colhead), delegate) elif cc['datatype'] == 'comments': self.setItemDelegateForColumn(cm.index(colhead), self.cc_comments_delegate) - elif cc['datatype'] in ('text', 'series'): + elif cc['datatype'] == 'text': if cc['is_multiple']: - self.setItemDelegateForColumn(cm.index(colhead), self.tags_delegate) + if cc['display'].get('is_names', False): + self.setItemDelegateForColumn(cm.index(colhead), + self.cc_names_delegate) + else: + self.setItemDelegateForColumn(cm.index(colhead), + self.tags_delegate) else: self.setItemDelegateForColumn(cm.index(colhead), self.cc_text_delegate) - elif cc['datatype'] in ('int', 'float'): + elif cc['datatype'] in ('series', 'int', 'float'): self.setItemDelegateForColumn(cm.index(colhead), self.cc_text_delegate) elif cc['datatype'] == 'bool': self.setItemDelegateForColumn(cm.index(colhead), self.cc_bool_delegate) diff --git a/src/calibre/gui2/lrf_renderer/main.py b/src/calibre/gui2/lrf_renderer/main.py index 2acfd3c9a7..e68e04adcf 100644 --- a/src/calibre/gui2/lrf_renderer/main.py +++ b/src/calibre/gui2/lrf_renderer/main.py @@ -35,7 +35,7 @@ class RenderWorker(QThread): self.stream = None if 
self.aborted: self.lrf = None - except Exception, err: + except Exception as err: self.lrf, self.stream = None, None self.exception = err self.formatted_traceback = traceback.format_exc() diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 976b679726..c67ec8c2b4 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -399,7 +399,7 @@ def main(args=sys.argv): if __name__ == '__main__': try: sys.exit(main()) - except Exception, err: + except Exception as err: if not iswindows: raise tb = traceback.format_exc() from PyQt4.QtGui import QErrorMessage diff --git a/src/calibre/gui2/metadata/basic_widgets.py b/src/calibre/gui2/metadata/basic_widgets.py index d5a8de7b67..635a037482 100644 --- a/src/calibre/gui2/metadata/basic_widgets.py +++ b/src/calibre/gui2/metadata/basic_widgets.py @@ -656,7 +656,7 @@ class Cover(ImageView): # {{{ try: cf = open(_file, "rb") cover = cf.read() - except IOError, e: + except IOError as e: d = error_dialog(self, _('Error reading file'), _("<p>There was an error reading from file: <br /><b>") + _file + "</b></p><br />"+str(e)) diff --git a/src/calibre/gui2/metadata/bulk_download.py b/src/calibre/gui2/metadata/bulk_download.py index 461f56b60c..7a7f49dabf 100644 --- a/src/calibre/gui2/metadata/bulk_download.py +++ b/src/calibre/gui2/metadata/bulk_download.py @@ -88,7 +88,7 @@ class DownloadMetadata(Thread): def run(self): try: self._run() - except Exception, e: + except Exception as e: self.exception = e self.tb = traceback.format_exc() diff --git a/src/calibre/gui2/metadata/single.py b/src/calibre/gui2/metadata/single.py index 3b6dd0e253..5b17b454e7 100644 --- a/src/calibre/gui2/metadata/single.py +++ b/src/calibre/gui2/metadata/single.py @@ -303,7 +303,7 @@ class MetadataSingleDialogBase(ResizableDialog): return False self.books_to_refresh |= getattr(widget, 'books_to_refresh', set([])) - except IOError, err: + except IOError as err: if err.errno == 13: # Permission denied import traceback fname = err.filename if err.filename else 'file' diff --git a/src/calibre/gui2/notify.py b/src/calibre/gui2/notify.py index 501f7007eb..947d98f1a4 100644 --- a/src/calibre/gui2/notify.py +++ b/src/calibre/gui2/notify.py @@ -34,7 +34,7 @@ class DBUSNotifier(Notifier): import dbus self.dbus = dbus self._notify = dbus.Interface(dbus.SessionBus().get_object(server, path), interface) - except Exception, err: + except Exception as err: self.ok = False self.err = str(err) diff --git a/src/calibre/gui2/preferences/conversion.py b/src/calibre/gui2/preferences/conversion.py index 8de9ee1661..b5240227d3 100644 --- a/src/calibre/gui2/preferences/conversion.py +++ b/src/calibre/gui2/preferences/conversion.py @@ -5,6 +5,8 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import importlib + from PyQt4.Qt import QIcon, Qt, QStringListModel, QVariant from calibre.gui2.preferences import ConfigWidgetBase, test_widget, AbortCommit @@ -104,8 +106,8 @@ class OutputOptions(Base): for plugin in output_format_plugins(): name = plugin.name.lower().replace(' ', '_') try: - output_widget = __import__('calibre.gui2.convert.'+name, - fromlist=[1]) + output_widget = importlib.import_module( + 'calibre.gui2.convert.'+name) pw = output_widget.PluginWidget self.conversion_widgets.append(pw) except ImportError: diff --git a/src/calibre/gui2/preferences/create_custom_column.py b/src/calibre/gui2/preferences/create_custom_column.py index cee34f150e..f476845f8b 100644 --- 
a/src/calibre/gui2/preferences/create_custom_column.py +++ b/src/calibre/gui2/preferences/create_custom_column.py @@ -63,7 +63,7 @@ class CreateCustomColumn(QDialog, Ui_QCreateCustomColumn): for col, name in [('isbn', _('ISBN')), ('formats', _('Formats')), ('last_modified', _('Modified Date')), ('yesno', _('Yes/No')), ('tags', _('Tags')), ('series', _('Series')), ('rating', - _('Rating'))]: + _('Rating')), ('people', _("People's names"))]: text += ' <a href="col:%s">%s</a>,'%(col, name) text = text[:-1] self.shortcuts.setText(text) @@ -125,6 +125,8 @@ class CreateCustomColumn(QDialog, Ui_QCreateCustomColumn): self.datatype_changed() if ct in ['text', 'composite', 'enumeration']: self.use_decorations.setChecked(c['display'].get('use_decorations', False)) + elif ct == '*text': + self.is_names.setChecked(c['display'].get('is_names', False)) self.exec_() def shortcut_activated(self, url): @@ -134,6 +136,7 @@ class CreateCustomColumn(QDialog, Ui_QCreateCustomColumn): 'tags' : 1, 'series': 3, 'rating': 8, + 'people': 1, }.get(which, 10)) self.column_name_box.setText(which) self.column_heading_box.setText({ @@ -143,7 +146,9 @@ class CreateCustomColumn(QDialog, Ui_QCreateCustomColumn): 'tags': _('My Tags'), 'series': _('My Series'), 'rating': _('My Rating'), - 'last_modified':_('Modified Date')}[which]) + 'last_modified':_('Modified Date'), + 'people': _('People')}[which]) + self.is_names.setChecked(which == 'people') if self.composite_box.isVisible(): self.composite_box.setText( { @@ -153,7 +158,6 @@ class CreateCustomColumn(QDialog, Ui_QCreateCustomColumn): }[which]) self.composite_sort_by.setCurrentIndex(2 if which == 'last_modified' else 0) - def datatype_changed(self, *args): try: col_type = self.column_types[self.column_type_box.currentIndex()]['datatype'] @@ -167,6 +171,7 @@ class CreateCustomColumn(QDialog, Ui_QCreateCustomColumn): for x in ('box', 'default_label', 'label'): getattr(self, 'enum_'+x).setVisible(col_type == 'enumeration') self.use_decorations.setVisible(col_type in ['text', 'composite', 'enumeration']) + self.is_names.setVisible(col_type == '*text') def accept(self): col = unicode(self.column_name_box.text()).strip() @@ -241,6 +246,8 @@ class CreateCustomColumn(QDialog, Ui_QCreateCustomColumn): return self.simple_error('', _('The value "{0}" is in the ' 'list more than once').format(l[i])) display_dict = {'enum_values': l} + elif col_type == 'text' and is_multiple: + display_dict = {'is_names': self.is_names.isChecked()} if col_type in ['text', 'composite', 'enumeration']: display_dict['use_decorations'] = self.use_decorations.checkState() diff --git a/src/calibre/gui2/preferences/create_custom_column.ui b/src/calibre/gui2/preferences/create_custom_column.ui index 3290d3c846..619b0c6212 100644 --- a/src/calibre/gui2/preferences/create_custom_column.ui +++ b/src/calibre/gui2/preferences/create_custom_column.ui @@ -9,7 +9,7 @@ <rect> <x>0</x> <y>0</y> - <width>603</width> + <width>831</width> <height>344</height> </rect> </property> @@ -110,27 +110,37 @@ </item> <item> <widget class="QCheckBox" name="use_decorations"> - <property name="text"> - <string>Show checkmarks</string> - </property> <property name="toolTip"> <string>Show check marks in the GUI. Values of 'yes', 'checked', and 'true' will show a green check. Values of 'no', 'unchecked', and 'false' will show a red X. 
Everything else will show nothing.</string> </property> + <property name="text"> + <string>Show checkmarks</string> + </property> + </widget> + </item> + <item> + <widget class="QCheckBox" name="is_names"> + <property name="toolTip"> + <string>Check this box if this column contains names, like the authors column.</string> + </property> + <property name="text"> + <string>Contains names</string> + </property> </widget> </item> <item> <spacer name="horizontalSpacer_27"> - <property name="orientation"> - <enum>Qt::Horizontal</enum> - </property> <property name="sizePolicy"> <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> <horstretch>10</horstretch> <verstretch>0</verstretch> </sizepolicy> </property> + <property name="orientation"> + <enum>Qt::Horizontal</enum> + </property> <property name="sizeHint" stdset="0"> <size> <width>20</width> @@ -241,25 +251,25 @@ Everything else will show nothing.</string> </item> <item> <widget class="QCheckBox" name="composite_make_category"> - <property name="text"> - <string>Show in tags browser</string> - </property> <property name="toolTip"> <string>If checked, this column will appear in the tags browser as a category</string> </property> + <property name="text"> + <string>Show in tags browser</string> + </property> </widget> </item> <item> <spacer name="horizontalSpacer_24"> - <property name="orientation"> - <enum>Qt::Horizontal</enum> - </property> <property name="sizePolicy"> <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> <horstretch>10</horstretch> <verstretch>0</verstretch> </sizepolicy> </property> + <property name="orientation"> + <enum>Qt::Horizontal</enum> + </property> <property name="sizeHint" stdset="0"> <size> <width>20</width> diff --git a/src/calibre/gui2/preferences/look_feel.py b/src/calibre/gui2/preferences/look_feel.py index 206f2b97fb..a2d2236039 100644 --- a/src/calibre/gui2/preferences/look_feel.py +++ b/src/calibre/gui2/preferences/look_feel.py @@ -64,8 +64,9 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): r('tags_browser_collapse_at', gprefs) choices = set([k for k in db.field_metadata.all_field_keys() - if db.field_metadata[k]['is_category'] and - db.field_metadata[k]['datatype'] in ['text', 'series', 'enumeration']]) + if db.field_metadata[k]['is_category'] and + (db.field_metadata[k]['datatype'] in ['text', 'series', 'enumeration']) and + not db.field_metadata[k]['display'].get('is_names', False)]) choices -= set(['authors', 'publisher', 'formats', 'news', 'identifiers']) choices |= set(['search']) self.opt_categories_using_hierarchy.update_items_cache(choices) diff --git a/src/calibre/gui2/preferences/plugboard.py b/src/calibre/gui2/preferences/plugboard.py index e1dc6b03bd..8f2b084d76 100644 --- a/src/calibre/gui2/preferences/plugboard.py +++ b/src/calibre/gui2/preferences/plugboard.py @@ -251,7 +251,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): if d != 0: try: validation_formatter.validate(s) - except Exception, err: + except Exception as err: error_dialog(self, _('Invalid template'), '<p>'+_('The template %s is invalid:')%s + \ '<br>'+str(err), show=True) diff --git a/src/calibre/gui2/preferences/save_template.py b/src/calibre/gui2/preferences/save_template.py index 4c00a14c0f..96ca8c8945 100644 --- a/src/calibre/gui2/preferences/save_template.py +++ b/src/calibre/gui2/preferences/save_template.py @@ -57,7 +57,7 @@ class SaveTemplate(QWidget, Ui_Form): return question_dialog(self, _('Constant template'), _('The template contains no {fields}, so all ' 'books will have the same name. 
Is this OK?')) - except Exception, err: + except Exception as err: error_dialog(self, _('Invalid template'), '<p>'+_('The template %s is invalid:')%tmpl + \ '<br>'+str(err), show=True) diff --git a/src/calibre/gui2/tag_view.py b/src/calibre/gui2/tag_view.py index 34fa3a8b10..6b1ce2f851 100644 --- a/src/calibre/gui2/tag_view.py +++ b/src/calibre/gui2/tag_view.py @@ -658,8 +658,7 @@ class TagTreeItem(object): # {{{ def tag_data(self, role): tag = self.tag - if tag.category == 'authors' and \ - tweaks['categories_use_field_for_author_name'] == 'author_sort': + if tag.use_sort_as_name: name = tag.sort tt_author = True else: @@ -1275,6 +1274,7 @@ class TagsModel(QAbstractItemModel): # {{{ if len(components) == 0 or '.'.join(components) != tag.original_name: components = [tag.original_name] if (not tag.is_hierarchical) and (in_uc or + (fm['is_custom'] and fm['display'].get('is_names', False)) or key in ['authors', 'publisher', 'news', 'formats', 'rating'] or key not in self.db.prefs.get('categories_using_hierarchy', []) or len(components) == 1): diff --git a/src/calibre/gui2/viewer/dictionary.py b/src/calibre/gui2/viewer/dictionary.py index dad8d1821c..d5dd4d0a86 100644 --- a/src/calibre/gui2/viewer/dictionary.py +++ b/src/calibre/gui2/viewer/dictionary.py @@ -36,7 +36,7 @@ class Lookup(QThread): def run(self): try: self.define() - except Exception, e: + except Exception as e: import traceback self.exception = e self.traceback = traceback.format_exc() diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index c570a6e159..ea0509b51a 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -97,7 +97,7 @@ class FilenamePattern(QWidget, Ui_Form): def do_test(self): try: pat = self.pattern() - except Exception, err: + except Exception as err: error_dialog(self, _('Invalid regular expression'), _('Invalid regular expression: %s')%err).exec_() return diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py index c629b10b5d..a32347dc72 100644 --- a/src/calibre/gui2/wizard/__init__.py +++ b/src/calibre/gui2/wizard/__init__.py @@ -565,7 +565,7 @@ def move_library(oldloc, newloc, parent, callback_on_complete): # Try to load existing library at new location try: LibraryDatabase2(newloc) - except Exception, err: + except Exception as err: det = traceback.format_exc() error_dialog(parent, _('Invalid database'), _('<p>An invalid library already exists at ' @@ -577,7 +577,7 @@ def move_library(oldloc, newloc, parent, callback_on_complete): else: callback(newloc) return - except Exception, err: + except Exception as err: det = traceback.format_exc() error_dialog(parent, _('Could not move library'), unicode(err), det, show=True) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index 19ef7e213c..e5864ceaaf 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -15,7 +15,7 @@ from calibre.utils.config import tweaks, prefs from calibre.utils.date import parse_date, now, UNDEFINED_DATE from calibre.utils.search_query_parser import SearchQueryParser from calibre.utils.pyparsing import ParseException -from calibre.ebooks.metadata import title_sort +from calibre.ebooks.metadata import title_sort, author_to_author_sort from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre import prints @@ -1023,7 +1023,11 @@ class SortKeyGenerator(object): if val: sep = fm['is_multiple'] if sep: - val = sep.join(sorted(val.split(sep), + if fm['display'].get('is_names', False): + val = sep.join( + 
[author_to_author_sort(v) for v in val.split(sep)]) + else: + val = sep.join(sorted(val.split(sep), key=self.string_sort_key)) val = self.string_sort_key(val) diff --git a/src/calibre/library/custom_columns.py b/src/calibre/library/custom_columns.py index dec55f2b02..48960ac871 100644 --- a/src/calibre/library/custom_columns.py +++ b/src/calibre/library/custom_columns.py @@ -117,7 +117,7 @@ class CustomColumns(object): if x is None: return [] if isinstance(x, (str, unicode, bytes)): - x = x.split(',') + x = x.split('&' if d['display'].get('is_names', False) else',') x = [y.strip() for y in x if y.strip()] x = [y.decode(preferred_encoding, 'replace') if not isinstance(y, unicode) else y for y in x] @@ -482,8 +482,11 @@ class CustomColumns(object): set_val = val if data['is_multiple'] else [val] existing = getter() if not existing: - existing = [] - for x in set(set_val) - set(existing): + existing = set([]) + else: + existing = set(existing) + # preserve the order in set_val + for x in [v for v in set_val if v not in existing]: # normalized types are text and ratings, so we can do this check # to see if we need to re-add the value if not x: diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index e751d4d522..b23c8ff4a4 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -48,7 +48,7 @@ class Tag(object): def __init__(self, name, id=None, count=0, state=0, avg=0, sort=None, tooltip=None, icon=None, category=None, id_set=None, - is_editable = True, is_searchable=True): + is_editable = True, is_searchable=True, use_sort_as_name=False): self.name = self.original_name = name self.id = id self.count = count @@ -59,6 +59,7 @@ class Tag(object): self.id_set = id_set if id_set is not None else set([]) self.avg_rating = avg/2.0 if avg is not None else 0 self.sort = sort + self.use_sort_as_name = use_sort_as_name if self.avg_rating > 0: if tooltip: tooltip = tooltip + ': ' @@ -1323,6 +1324,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): for l in list: (id, val) = (l[0], l[1]) tids[category][val] = (id, '{0:05.2f}'.format(val)) + elif cat['datatype'] == 'text' and cat['is_multiple'] and \ + cat['display'].get('is_names', False): + for l in list: + (id, val) = (l[0], l[1]) + tids[category][val] = (id, author_to_author_sort(val)) else: for l in list: (id, val) = (l[0], l[1]) @@ -1480,11 +1486,20 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): reverse=True items.sort(key=kf, reverse=reverse) + if tweaks['categories_use_field_for_author_name'] == 'author_sort' and\ + (category == 'authors' or + (cat['display'].get('is_names', False) and + cat['is_custom'] and cat['is_multiple'] and + cat['datatype'] == 'text')): + use_sort_as_name = True + else: + use_sort_as_name = False is_editable = category not in ['news', 'rating'] categories[category] = [tag_class(formatter(r.n), count=r.c, id=r.id, avg=avgr(r), sort=r.s, icon=icon, tooltip=tooltip, category=category, - id_set=r.id_set, is_editable=is_editable) + id_set=r.id_set, is_editable=is_editable, + use_sort_as_name=use_sort_as_name) for r in items] #print 'end phase "tags list":', time.clock() - last, 'seconds' diff --git a/src/calibre/library/server/base.py b/src/calibre/library/server/base.py index 83d395dec5..dba6abbfa5 100644 --- a/src/calibre/library/server/base.py +++ b/src/calibre/library/server/base.py @@ -222,7 +222,7 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache, # 
cherrypy.engine.signal_handler.subscribe() cherrypy.engine.block() - except Exception, e: + except Exception as e: self.exception = e finally: self.is_running = False diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index f1d9b9785c..895fbb06e9 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -15,7 +15,7 @@ from calibre import isbytestring, force_unicode, fit_image, \ prepare_string_for_xml from calibre.utils.ordered_dict import OrderedDict from calibre.utils.filenames import ascii_filename -from calibre.utils.config import prefs, tweaks +from calibre.utils.config import prefs from calibre.utils.icu import sort_key from calibre.utils.magick import Image from calibre.library.comments import comments_to_html @@ -155,8 +155,7 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{ '<div>{1}</div>' '<div>{2}</div></div>') rating, rstring = render_rating(i.avg_rating, prefix) - if i.category == 'authors' and \ - tweaks['categories_use_field_for_author_name'] == 'author_sort': + if i.use_sort_as_name: name = xml(i.sort) else: name = xml(i.name) @@ -696,7 +695,10 @@ class BrowseServer(object): xml(href, True), xml(val if len(dbtags) == 1 else tag.name), xml(key, True))) - join = ' & ' if key == 'authors' else ', ' + join = ' & ' if key == 'authors' or \ + (fm['is_custom'] and + fm['display'].get('is_names', False)) \ + else ', ' args[key] = join.join(vals) added_key = True if not added_key: diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index 11ea2b951e..919f5a7969 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -169,7 +169,7 @@ class ContentServer(object): return cover return save_cover_data_to(img, 'img.jpg', return_data=True, resize_to=(width, height)) - except Exception, err: + except Exception as err: import traceback cherrypy.log.error('Failed to generate cover:') cherrypy.log.error(traceback.print_exc()) diff --git a/src/calibre/library/server/main.py b/src/calibre/library/server/main.py index e4de710c6a..3a6f918022 100644 --- a/src/calibre/library/server/main.py +++ b/src/calibre/library/server/main.py @@ -69,7 +69,7 @@ def daemonize(stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): if pid > 0: # exit first parent sys.exit(0) - except OSError, e: + except OSError as e: print >>sys.stderr, "fork #1 failed: %d (%s)" % (e.errno, e.strerror) sys.exit(1) @@ -84,7 +84,7 @@ def daemonize(stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): if pid > 0: # exit from second parent sys.exit(0) - except OSError, e: + except OSError as e: print >>sys.stderr, "fork #2 failed: %d (%s)" % (e.errno, e.strerror) sys.exit(1) diff --git a/src/calibre/library/server/opds.py b/src/calibre/library/server/opds.py index e7fdffbbbb..bdd35c16f1 100644 --- a/src/calibre/library/server/opds.py +++ b/src/calibre/library/server/opds.py @@ -22,7 +22,6 @@ from calibre.library.server.utils import format_tag_string, Offsets from calibre import guess_type, prepare_string_for_xml as xml from calibre.utils.icu import sort_key from calibre.utils.ordered_dict import OrderedDict -from calibre.utils.config import tweaks BASE_HREFS = { 0 : '/stanza', @@ -126,8 +125,7 @@ def CATALOG_ENTRY(item, item_kind, base_href, version, updated, count = (_('%d books') if item.count > 1 else _('%d book'))%item.count if ignore_count: count = '' - if item.category == 'authors' and \ - 
tweaks['categories_use_field_for_author_name'] == 'author_sort': + if item.use_sort_as_name: name = item.sort else: name = item.name diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py index 2075ab5880..511106fe7b 100644 --- a/src/calibre/library/sqlite.py +++ b/src/calibre/library/sqlite.py @@ -193,7 +193,7 @@ def load_c_extensions(conn, debug=DEBUG): conn.load_extension(ext_path) conn.enable_load_extension(False) return True - except Exception, e: + except Exception as e: if debug: print 'Failed to load high performance sqlite C extension' print e @@ -247,14 +247,14 @@ class DBThread(Thread): if func == 'dump': try: ok, res = True, tuple(self.conn.iterdump()) - except Exception, err: + except Exception as err: ok, res = False, (err, traceback.format_exc()) elif func == 'create_dynamic_filter': try: f = DynamicFilter(args[0]) self.conn.create_function(args[0], 1, f) ok, res = True, f - except Exception, err: + except Exception as err: ok, res = False, (err, traceback.format_exc()) else: bfunc = getattr(self.conn, func) @@ -263,7 +263,7 @@ class DBThread(Thread): try: ok, res = True, bfunc(*args, **kwargs) break - except OperationalError, err: + except OperationalError as err: # Retry if unable to open db file e = str(err) if 'unable to open' not in e or i == 2: @@ -273,10 +273,10 @@ class DBThread(Thread): reprlib.repr(kwargs)) raise time.sleep(0.5) - except Exception, err: + except Exception as err: ok, res = False, (err, traceback.format_exc()) self.results.put((ok, res)) - except Exception, err: + except Exception as err: self.unhandled_error = (err, traceback.format_exc()) class DatabaseException(Exception): diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 22f8af56c2..dfab13e3b8 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -3,7 +3,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' ''' Post installation script for linux ''' -import sys, os, cPickle, textwrap, stat +import sys, os, cPickle, textwrap, stat, importlib from subprocess import check_call from calibre import __appname__, prints, guess_type @@ -59,7 +59,7 @@ for x in {manifest!r}: shutil.rmtree(x) else: os.unlink(x) - except Exception, e: + except Exception as e: print 'Failed to delete', x print '\t', e @@ -285,7 +285,7 @@ class PostInstall: complete -o nospace -C calibre-complete ebook-convert ''')) - except TypeError, err: + except TypeError as err: if 'resolve_entities' in str(err): print 'You need python-lxml >= 2.0.5 for calibre' sys.exit(1) @@ -309,7 +309,7 @@ class PostInstall: for src in entry_points['console_scripts']: prog, right = src.split('=') prog = prog.strip() - module = __import__(right.split(':')[0].strip(), fromlist=['a']) + module = importlib.import_module(right.split(':')[0].strip()) parser = getattr(module, 'option_parser', None) if parser is None: continue diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 948611f775..97ef32e9d4 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -493,7 +493,16 @@ Most purchased EPUB books have `DRM <http://wiki.mobileread.com/wiki/DRM>`_. Thi I am getting a "Permission Denied" error? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A permission denied error can occur because of many possible reasons, none of them having anything to do with |app|. You can get permission denied errors if you are using an SD card with write protect enabled. 
Or if you, or some program you used changed the file permissions of the files in question to read only. Or if there is a filesystem error on the device which caused your operating system to mount the filesystem in read only mode or mark a particular file as read only pending recovery. Or if the files have their owner set to a user other than you. Or if your file is open in another program. You will need to fix the underlying cause of the permissions error before resuming to use |app|. Read the error message carefully, see what file it points to and fix the permissions on that file. +A permission denied error can occur because of many possible reasons, none of them having anything to do with |app|. + + * You can get permission denied errors if you are using an SD card with write protect enabled. + * If you, or some program you used changed the file permissions of the files in question to read only. + * If there is a filesystem error on the device which caused your operating system to mount the filesystem in read only mode or mark a particular file as read only pending recovery. + * If the files have their owner set to a user other than you. + * If your file is open in another program. + * If the file resides on a device, you may have reached the limit of a maximum of 256 files in the root of the device. In this case you need to reformat the device/sd card referered to in the error message with a FAT32 filesystem, or delete some files from the SD card/device memory. + +You will need to fix the underlying cause of the permissions error before resuming to use |app|. Read the error message carefully, see what file it points to and fix the permissions on that file. Can I have the comment metadata show up on my reader? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -510,7 +519,7 @@ You have two choices: How is |app| licensed? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -|app| is licensed under the GNU General Public License v3 (an open source license). This means that you are free to redistribute |app| as long as you make the source code available. So if you want to put |app| on a CD with your product, you must also put the |app| source code on the CD. The source code is available for download `from googlecode <http://code.google.com/p/calibre-ebook/downloads/list>`_. You are free to use the results of conversions from |app| however you want. You cannot use code, libraries from |app| in your software without maing your software open source. For details, see `The GNU GPL v3 http://www.gnu.org/licenses/gpl.html`_. +|app| is licensed under the GNU General Public License v3 (an open source license). This means that you are free to redistribute |app| as long as you make the source code available. So if you want to put |app| on a CD with your product, you must also put the |app| source code on the CD. The source code is available for download `from googlecode <http://code.google.com/p/calibre-ebook/downloads/list>`_. You are free to use the results of conversions from |app| however you want. You cannot use code, libraries from |app| in your software without maing your software open source. For details, see `The GNU GPL v3 <http://www.gnu.org/licenses/gpl.html>`_. How do I run calibre from my USB stick? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/calibre/manual/news.rst b/src/calibre/manual/news.rst index d0838ccb0f..ed306a168e 100644 --- a/src/calibre/manual/news.rst +++ b/src/calibre/manual/news.rst @@ -137,7 +137,7 @@ to the recipe. 
Finally, lets replace some of the :term:`CSS` that we disabled ea With these additions, our recipe has become "production quality", indeed it is very close to the actual recipe used by |app| for the *BBC*, shown below: -.. literalinclude:: ../../../resources/recipes/bbc.recipe +.. literalinclude:: ../../../recipes/bbc.recipe This :term:`recipe` explores only the tip of the iceberg when it comes to the power of |app|. To explore more of the abilities of |app| we'll examine a more complex real life example in the next section. diff --git a/src/calibre/manual/sub_groups.rst b/src/calibre/manual/sub_groups.rst index c27b3581f8..e5a433dce9 100644 --- a/src/calibre/manual/sub_groups.rst +++ b/src/calibre/manual/sub_groups.rst @@ -105,8 +105,8 @@ After creating the saved search, you can use it as a restriction. .. image:: images/sg_restrict2.jpg :align: center - Useful Template Functions - ------------------------- +Useful Template Functions +------------------------- You might want to use the genre information in a template, such as with save to disk or send to device. The question might then be "How do I get the outermost genre name or names?" An |app| template function, subitems, is provided to make doing this easier. @@ -114,4 +114,4 @@ After creating the saved search, you can use it as a restriction. {#genre:subitems(0,1)||/}{title} - {authors} -See :ref:`The |app| template language <templatelangcalibre>` for more information templates and the subitem function. \ No newline at end of file +See :ref:`The |app| template language <templatelangcalibre>` for more information templates and the subitem function. diff --git a/src/calibre/utils/Zeroconf.py b/src/calibre/utils/Zeroconf.py index f4a7119d16..fbb9b4e71f 100755 --- a/src/calibre/utils/Zeroconf.py +++ b/src/calibre/utils/Zeroconf.py @@ -863,7 +863,7 @@ class Engine(threading.Thread): for socket in rr: try: self.readers[socket].handle_read() - except NonLocalNameException, err: + except NonLocalNameException as err: print err except UnicodeDecodeError: if DEBUG: diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py index 740e67bee8..2e40275beb 100644 --- a/src/calibre/utils/formatter.py +++ b/src/calibre/utils/formatter.py @@ -316,7 +316,7 @@ class TemplateFormatter(string.Formatter): self.locals = {} try: ans = self.vformat(fmt, [], kwargs).strip() - except Exception, e: + except Exception as e: if DEBUG: traceback.print_exc() ans = error_value + ' ' + e.message diff --git a/src/calibre/utils/ipc/worker.py b/src/calibre/utils/ipc/worker.py index e187235a9e..9594f64ae4 100644 --- a/src/calibre/utils/ipc/worker.py +++ b/src/calibre/utils/ipc/worker.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import os, cPickle, sys +import os, cPickle, sys, importlib from multiprocessing.connection import Client from threading import Thread from Queue import Queue @@ -75,7 +75,7 @@ class Progress(Thread): def get_func(name): module, func, notification = PARALLEL_FUNCS[name] - module = __import__(module, fromlist=[1]) + module = importlib.import_module(module) func = getattr(module, func) return func, notification diff --git a/src/calibre/utils/lock.py b/src/calibre/utils/lock.py index 5098c78f90..0b66be963b 100644 --- a/src/calibre/utils/lock.py +++ b/src/calibre/utils/lock.py @@ -32,7 +32,7 @@ class WindowsExclFile(object): None, #No template file ) break - except pywintypes.error, err: + except pywintypes.error as err: if getattr(err, 
'args', [-1])[0] in (0x20, 0x21): time.sleep(1) continue diff --git a/src/calibre/utils/pdftk.py b/src/calibre/utils/pdftk.py index 1263b60306..f4fcb8a2e3 100644 --- a/src/calibre/utils/pdftk.py +++ b/src/calibre/utils/pdftk.py @@ -56,7 +56,7 @@ def set_metadata(stream, mi): try: p.wait() break - except OSError, e: + except OSError as e: if e.errno == errno.EINTR: continue else: diff --git a/src/calibre/utils/smtp.py b/src/calibre/utils/smtp.py index 744021f911..81936a8f71 100644 --- a/src/calibre/utils/smtp.py +++ b/src/calibre/utils/smtp.py @@ -76,7 +76,7 @@ def sendmail_direct(from_, to, msg, timeout, localhost, verbose, s.connect(host, 25) s.sendmail(from_, [to], msg) return s.quit() - except Exception, e: + except Exception as e: last_error, last_traceback = e, traceback.format_exc() if last_error is not None: print last_traceback diff --git a/src/calibre/web/feeds/feedparser.py b/src/calibre/web/feeds/feedparser.py index ead9207b70..99c3e09666 100755 --- a/src/calibre/web/feeds/feedparser.py +++ b/src/calibre/web/feeds/feedparser.py @@ -6,12 +6,11 @@ Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds Visit http://feedparser.org/ for the latest version Visit http://feedparser.org/docs/ for the latest documentation -Required: Python 2.1 or later -Recommended: Python 2.3 or later +Required: Python 2.4 or later Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> """ -__version__ = "4.2-pre-" + "$Revision: 316 $"[11:14] + "-svn" +__version__ = "5.0.1" __license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -42,14 +41,14 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>", "Kevin Marks <http://epeus.blogspot.com/>", "Sam Ruby <http://intertwingly.net/>", "Ade Oshineye <http://blog.oshineye.com/>", - "Martin Pool <http://sourcefrog.net/>"] + "Martin Pool <http://sourcefrog.net/>", + "Kurt McKee <http://kurtmckee.org/>"] _debug = 0 # HTTP "User-Agent" header to send to servers when downloading feeds. # If you are embedding feedparser in a larger application, you should # change this to your application name and URL. -USER_AGENT = 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4' # Changed by Kovid - +USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11' # Changed by Kovid # HTTP "Accept" header to send to servers when downloading feeds. If you don't # want to send an Accept header, set this to None. ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1" @@ -76,12 +75,73 @@ RESOLVE_RELATIVE_URIS = 1 # HTML content, set this to 1. 
SANITIZE_HTML = 1 -# ---------- required modules (should come with any Python distribution) ---------- -import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 +# ---------- Python 3 modules (make it work if possible) ---------- try: - from cStringIO import StringIO as _StringIO + import rfc822 +except ImportError: + from email import _parseaddr as rfc822 + +try: + # Python 3.1 introduces bytes.maketrans and simultaneously + # deprecates string.maketrans; use bytes.maketrans if possible + _maketrans = bytes.maketrans +except (NameError, AttributeError): + import string + _maketrans = string.maketrans + +# base64 support for Atom feeds that contain embedded binary data +try: + import base64, binascii + # Python 3.1 deprecates decodestring in favor of decodebytes + _base64decode = getattr(base64, 'decodebytes', base64.decodestring) except: - from StringIO import StringIO as _StringIO + base64 = binascii = None + +def _s2bytes(s): + # Convert a UTF-8 str to bytes if the interpreter is Python 3 + try: + return bytes(s, 'utf8') + except (NameError, TypeError): + # In Python 2.5 and below, bytes doesn't exist (NameError) + # In Python 2.6 and above, bytes and str are the same (TypeError) + return s + +def _l2bytes(l): + # Convert a list of ints to bytes if the interpreter is Python 3 + try: + if bytes is not str: + # In Python 2.6 and above, this call won't raise an exception + # but it will return bytes([65]) as '[65]' instead of 'A' + return bytes(l) + raise NameError + except NameError: + return ''.join(map(chr, l)) + +# If you want feedparser to allow all URL schemes, set this to () +# List culled from Python's urlparse documentation at: +# http://docs.python.org/library/urlparse.html +# as well as from "URI scheme" at Wikipedia: +# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme +# Many more will likely need to be added! +ACCEPTABLE_URI_SCHEMES = ( + 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto', + 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp', + 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais', + # Additional common-but-unofficial schemes + 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs', + 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', +) +#ACCEPTABLE_URI_SCHEMES = () + +# ---------- required modules (should come with any Python distribution) ---------- +import sgmllib, re, sys, copy, urlparse, time, types, cgi, urllib, urllib2, datetime +try: + from io import BytesIO as _StringIO +except ImportError: + try: + from cStringIO import StringIO as _StringIO + except: + from StringIO import StringIO as _StringIO # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- @@ -114,12 +174,6 @@ except: data = data.replace(char, entity) return data -# base64 support for Atom feeds that contain embedded binary data -try: - import base64, binascii -except: - base64 = binascii = None - # cjkcodecs and iconv_codec provide support for more character encodings. 
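# Illustrative aside, not part of the patch: the _s2bytes()/_l2bytes() helpers
# introduced above are meant to return equivalent values whether bytes is an
# alias of str (Python 2) or a distinct type (Python 3). A hypothetical sanity
# check, assuming both helpers are in scope:
assert _l2bytes([65, 66, 67]) == _s2bytes('ABC')  # 'ABC' on Python 2, b'ABC' on Python 3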
# Both are available from http://cjkpython.i18n.org/ try: @@ -172,17 +226,27 @@ class UndeclaredNamespace(Exception): pass sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') sgmllib.special = re.compile('<!') -sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);') +sgmllib.charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);') if sgmllib.endbracket.search(' <').start(0): - class EndBracketMatch: - endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') + class EndBracketRegEx: + def __init__(self): + # Overriding the built-in sgmllib.endbracket regex allows the + # parser to find angle brackets embedded in element attributes. + self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') def search(self,string,index=0): - self.match = self.endbracket.match(string,index) - if self.match: return self - def start(self,n): + match = self.endbracket.match(string,index) + if match is not None: + # Returning a new object in the calling thread's context + # resolves a thread-safety. + return EndBracketMatch(match) + return None + class EndBracketMatch: + def __init__(self, match): + self.match = match + def start(self, n): return self.match.end(n) - sgmllib.endbracket = EndBracketMatch() + sgmllib.endbracket = EndBracketRegEx() SUPPORTED_VERSIONS = {'': 'unknown', 'rss090': 'RSS 0.90', @@ -220,7 +284,7 @@ class FeedParserDict(UserDict): 'guid': 'id', 'date': 'updated', 'date_parsed': 'updated_parsed', - 'description': ['subtitle', 'summary'], + 'description': ['summary', 'subtitle'], 'url': ['href'], 'modified': 'updated', 'modified_parsed': 'updated_parsed', @@ -245,9 +309,9 @@ class FeedParserDict(UserDict): realkey = self.keymap.get(key, key) if type(realkey) == types.ListType: for k in realkey: - if UserDict.has_key(self, k): + if UserDict.__contains__(self, k): return UserDict.__getitem__(self, k) - if UserDict.has_key(self, key): + if UserDict.__contains__(self, key): return UserDict.__getitem__(self, key) return UserDict.__getitem__(self, realkey) @@ -272,9 +336,12 @@ class FeedParserDict(UserDict): def has_key(self, key): try: - return hasattr(self, key) or UserDict.has_key(self, key) + return hasattr(self, key) or UserDict.__contains__(self, key) except AttributeError: return False + # This alias prevents the 2to3 tool from changing the semantics of the + # __contains__ function below and exhausting the maximum recursion depth + __has_key = has_key def __getattr__(self, key): try: @@ -294,7 +361,7 @@ class FeedParserDict(UserDict): return self.__setitem__(key, value) def __contains__(self, key): - return self.has_key(key) + return self.__has_key(key) def zopeCompatibilityHack(): global FeedParserDict @@ -327,9 +394,8 @@ def _ebcdic_to_ascii(s): 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 ) - import string - _ebcdic_to_ascii_map = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + _ebcdic_to_ascii_map = _maketrans( \ + _l2bytes(range(256)), _l2bytes(emap)) return s.translate(_ebcdic_to_ascii_map) _cp1252 = { @@ -483,6 +549,10 @@ class _FeedParserMixin: # normalize attrs attrs = [(k.lower(), v) for k, v in attrs] attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] + # the sgml parser doesn't handle entities in attributes, but + # strict xml parsers do -- account for this difference + if isinstance(self, _LooseFeedParser): + attrs = [(k, v.replace('&', '&')) for k, v in 
attrs] # track xml:base and xml:lang attrsD = dict(attrs) @@ -492,7 +562,12 @@ class _FeedParserMixin: baseuri = unicode(baseuri, self.encoding) except: baseuri = unicode(baseuri, 'iso-8859-1') - self.baseuri = _urljoin(self.baseuri, baseuri) + # ensure that self.baseuri is always an absolute URI that + # uses a whitelisted URI scheme (e.g. not `javscript:`) + if self.baseuri: + self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri + else: + self.baseuri = _urljoin(self.baseuri, baseuri) lang = attrsD.get('xml:lang', attrsD.get('lang')) if lang == '': # xml:lang could be explicitly set to '', we need to capture that @@ -671,7 +746,7 @@ class _FeedParserMixin: def mapContentType(self, contentType): contentType = contentType.lower() - if contentType == 'text': + if contentType == 'text' or contentType == 'plain': contentType = 'text/plain' elif contentType == 'html': contentType = 'text/html' @@ -735,6 +810,11 @@ class _FeedParserMixin: else: pieces = pieces[1:-1] + # Ensure each piece is a str for Python 3 + for (i, v) in enumerate(pieces): + if not isinstance(v, basestring): + pieces[i] = v.decode('utf-8') + output = ''.join(pieces) if stripWhitespace: output = output.strip() @@ -743,11 +823,15 @@ class _FeedParserMixin: # decode base64 content if base64 and self.contentparams.get('base64', 0): try: - output = base64.decodestring(output) + output = _base64decode(output) except binascii.Error: pass except binascii.Incomplete: pass + except TypeError: + # In Python 3, base64 takes and outputs bytes, not str + # This may not be the most correct way to accomplish this + output = _base64decode(output.encode('utf-8')).decode('utf-8') # resolve relative URIs if (element in self.can_be_relative_uri) and output: @@ -805,7 +889,7 @@ class _FeedParserMixin: # address common error where people take data that is already # utf-8, presume that it is iso-8859-1, and re-encode it. - if self.encoding=='utf-8' and type(output) == type(u''): + if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and type(output) == type(u''): try: output = unicode(output.encode('iso-8859-1'), 'utf-8') except: @@ -830,9 +914,14 @@ class _FeedParserMixin: contentparams['value'] = output self.entries[-1][element].append(contentparams) elif element == 'link': - self.entries[-1][element] = output - if output: - self.entries[-1]['links'][-1]['href'] = output + if not self.inimage: + # query variables in urls in link elements are improperly + # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're + # unhandled character references. fix this special case. + output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) + self.entries[-1][element] = output + if output: + self.entries[-1]['links'][-1]['href'] = output else: if element == 'description': element = 'summary' @@ -847,6 +936,9 @@ class _FeedParserMixin: element = 'subtitle' context[element] = output if element == 'link': + # fix query variables; see above for the explanation + output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) + context[element] = output context['links'][-1]['href'] = output elif self.incontent: contentparams = copy.deepcopy(self.contentparams) @@ -874,21 +966,21 @@ class _FeedParserMixin: # text, but this is routinely ignored. This is an attempt to detect # the most common cases. As false positives often result in silent # data loss, this function errs on the conservative side. 
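Before the hunk below renames the `str` parameter: the heuristic that the comment above describes can be sketched in isolation roughly as follows. This is an illustrative reconstruction rather than the patched method itself; `ACCEPTABLE_TAGS` stands in for the sanitizer's much longer `acceptable_elements` whitelist that the real code consults.

```python
import re
from htmlentitydefs import entitydefs  # Python 2 module name, as used by feedparser

ACCEPTABLE_TAGS = set(['p', 'a', 'em', 'strong', 'ul', 'li', 'br'])  # illustrative subset

def looks_like_html(s):
    # must contain a close tag or an entity reference to qualify at all
    if not (re.search(r'</(\w+)>', s) or re.search(r'&#?\w+;', s)):
        return False
    # every tag must be in the restricted whitelist
    if any(t.lower() not in ACCEPTABLE_TAGS for t in re.findall(r'</?(\w+)', s)):
        return False
    # every named entity must be a real HTML entity
    if any(e not in entitydefs for e in re.findall(r'&(\w+);', s)):
        return False
    return True

print(looks_like_html('5 &lt; 10 is <em>true</em>'))  # True
print(looks_like_html('x < y, honest'))               # False: no close tag or entity
```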
- def lookslikehtml(self, str): + def lookslikehtml(self, s): if self.version.startswith('atom'): return if self.contentparams.get('type','text/html') != 'text/plain': return # must have a close tag or a entity reference to qualify - if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return + if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)): return # all tags must be in a restricted subset of valid HTML tags if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, - re.findall(r'</?(\w+)',str)): return + re.findall(r'</?(\w+)',s)): return # all entities must have been defined as valid HTML entities from htmlentitydefs import entitydefs if filter(lambda e: e not in entitydefs.keys(), - re.findall(r'&(\w+);',str)): return + re.findall(r'&(\w+);',s)): return return 1 @@ -929,9 +1021,12 @@ class _FeedParserMixin: attrsD['href'] = href return attrsD - def _save(self, key, value): + def _save(self, key, value, overwrite=False): context = self._getContext() - context.setdefault(key, value) + if overwrite: + context[key] = value + else: + context.setdefault(key, value) def _start_rss(self, attrsD): versionmap = {'0.91': 'rss091u', @@ -988,7 +1083,8 @@ class _FeedParserMixin: def _start_image(self, attrsD): context = self._getContext() - context.setdefault('image', FeedParserDict()) + if not self.inentry: + context.setdefault('image', FeedParserDict()) self.inimage = 1 self.hasTitle = 0 self.push('image', 0) @@ -1013,6 +1109,10 @@ class _FeedParserMixin: def _start_author(self, attrsD): self.inauthor = 1 self.push('author', 1) + # Append a new FeedParserDict when expecting an author + context = self._getContext() + context.setdefault('authors', []) + context['authors'].append(FeedParserDict()) _start_managingeditor = _start_author _start_dc_author = _start_author _start_dc_creator = _start_author @@ -1147,6 +1247,8 @@ class _FeedParserMixin: context.setdefault(prefix + '_detail', FeedParserDict()) context[prefix + '_detail'][key] = value self._sync_author_detail() + context.setdefault('authors', [FeedParserDict()]) + context['authors'][-1][key] = value def _save_contributor(self, key, value): context = self._getContext() @@ -1252,7 +1354,7 @@ class _FeedParserMixin: def _end_published(self): value = self.pop('published') - self._save('published_parsed', _parse_date(value)) + self._save('published_parsed', _parse_date(value), overwrite=True) _end_dcterms_issued = _end_published _end_issued = _end_published @@ -1262,15 +1364,17 @@ class _FeedParserMixin: _start_dcterms_modified = _start_updated _start_pubdate = _start_updated _start_dc_date = _start_updated + _start_lastbuilddate = _start_updated def _end_updated(self): value = self.pop('updated') parsed_value = _parse_date(value) - self._save('updated_parsed', parsed_value) + self._save('updated_parsed', parsed_value, overwrite=True) _end_modified = _end_updated _end_dcterms_modified = _end_updated _end_pubdate = _end_updated _end_dc_date = _end_updated + _end_lastbuilddate = _end_updated def _start_created(self, attrsD): self.push('created', 1) @@ -1278,14 +1382,14 @@ class _FeedParserMixin: def _end_created(self): value = self.pop('created') - self._save('created_parsed', _parse_date(value)) + self._save('created_parsed', _parse_date(value), overwrite=True) _end_dcterms_created = _end_created def _start_expirationdate(self, attrsD): self.push('expired', 1) def _end_expirationdate(self): - self._save('expired_parsed', _parse_date(self.pop('expired'))) + self._save('expired_parsed', 
_parse_date(self.pop('expired')), overwrite=True) def _start_cc_license(self, attrsD): context = self._getContext() @@ -1334,6 +1438,10 @@ class _FeedParserMixin: _start_dc_subject = _start_category _start_keywords = _start_category + def _start_media_category(self, attrsD): + attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema') + self._start_category(attrsD) + def _end_itunes_keywords(self): for term in self.pop('itunes_keywords').split(): self._addTag(term, 'http://www.itunes.com/', None) @@ -1354,6 +1462,7 @@ class _FeedParserMixin: _end_dc_subject = _end_category _end_keywords = _end_category _end_itunes_category = _end_category + _end_media_category = _end_category def _start_cloud(self, attrsD): self._getContext()['cloud'] = FeedParserDict(attrsD) @@ -1368,11 +1477,10 @@ class _FeedParserMixin: attrsD = self._itsAnHrefDamnIt(attrsD) if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) - if attrsD.get('rel')=='enclosure' and not context.get('id'): - context['id'] = attrsD.get('href') expectingText = self.infeed or self.inentry or self.insource context.setdefault('links', []) - context['links'].append(FeedParserDict(attrsD)) + if not (self.inentry and self.inimage): + context['links'].append(FeedParserDict(attrsD)) if attrsD.has_key('href'): expectingText = 0 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): @@ -1498,9 +1606,6 @@ class _FeedParserMixin: context = self._getContext() attrsD['rel']='enclosure' context.setdefault('links', []).append(FeedParserDict(attrsD)) - href = attrsD.get('href') - if href and not context.get('id'): - context['id'] = href def _start_source(self, attrsD): if 'url' in attrsD: @@ -1537,10 +1642,10 @@ class _FeedParserMixin: _start_fullitem = _start_content_encoded def _end_content(self): - copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) + copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) value = self.popContent('content') - if copyToDescription: - self._save('description', value) + if copyToSummary: + self._save('summary', value) _end_body = _end_content _end_xhtml_body = _end_content @@ -1550,7 +1655,8 @@ class _FeedParserMixin: def _start_itunes_image(self, attrsD): self.push('itunes_image', 0) - self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) + if attrsD.get('href'): + self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) _start_itunes_link = _start_itunes_image def _end_itunes_block(self): @@ -1559,7 +1665,10 @@ class _FeedParserMixin: def _end_itunes_explicit(self): value = self.pop('itunes_explicit', 0) - self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 + # Convert 'yes' -> True, 'clean' to False, and any other value to None + # False and None both evaluate as False, so the difference can be ignored + # by applications that only need to know if the content is explicit. 
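The tuple-indexing expression added on the next line is terse, so here is a tiny standalone check of the mapping it implements. The helper name `itunes_explicit_value` is illustrative and not part of the patch.

```python
def itunes_explicit_value(value):
    # 'yes' yields index 2 -> True; 'clean' yields index 1 (the boolean True) -> False;
    # anything else falls through to index 0 -> None.
    return (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]

assert itunes_explicit_value('yes') is True
assert itunes_explicit_value('clean') is False
assert itunes_explicit_value('no') is None
assert itunes_explicit_value('') is None
```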
+ self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] def _start_media_content(self, attrsD): context = self._getContext() @@ -1588,6 +1697,17 @@ class _FeedParserMixin: context = self._getContext() context['media_player']['content'] = value + def _start_newlocation(self, attrsD): + self.push('newlocation', 1) + + def _end_newlocation(self): + url = self.pop('newlocation') + context = self._getContext() + # don't set newlocation if the context isn't right + if context is not self.feeddata: + return + context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip()) + if _XML_AVAILABLE: class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): @@ -1689,9 +1809,9 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): 'source', 'track', 'wbr' ] - def __init__(self, encoding, type): + def __init__(self, encoding, _type): self.encoding = encoding - self.type = type + self._type = _type if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) sgmllib.SGMLParser.__init__(self) @@ -1708,7 +1828,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): def parse_starttag(self,i): j=sgmllib.SGMLParser.parse_starttag(self, i) - if self.type == 'application/xhtml+xml': + if self._type == 'application/xhtml+xml': if j>2 and self.rawdata[j-2:j]=='/>': self.unknown_endtag(self.lasttag) return j @@ -1719,8 +1839,14 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace('&#39;', "'") data = data.replace('&#34;', '"') - if self.encoding and type(data) == type(u''): - data = data.encode(self.encoding) + try: + bytes + if bytes is str: + raise NameError + self.encoding = self.encoding + '_INVALID_PYTHON_3' + except NameError: + if self.encoding and type(data) == type(u''): + data = data.encode(self.encoding) sgmllib.SGMLParser.feed(self, data) sgmllib.SGMLParser.close(self) @@ -1749,7 +1875,11 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): value = unicode(value, self.encoding) except: value = unicode(value, 'iso-8859-1') - uattrs.append((unicode(key, self.encoding), value)) + try: + # Currently, in Python 3 the key is already a str, and cannot be decoded again + uattrs.append((unicode(key, self.encoding), value)) + except TypeError: + uattrs.append((key, value)) strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]) if self.encoding: try: @@ -1840,6 +1970,14 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): '''Return processed HTML as a single string''' return ''.join([str(p) for p in self.pieces]) + def parse_declaration(self, i): + try: + return sgmllib.SGMLParser.parse_declaration(self, i) + except sgmllib.SGMLParseError: + # escape the doctype declaration and continue parsing + self.handle_data('&lt;') + return i+1 + class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): def __init__(self, baseuri, baselang, encoding, entities): sgmllib.SGMLParser.__init__(self) @@ -2019,10 +2157,10 @@ class _MicroformatsParser: arLines = [] def processSingleString(sProperty): - sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1) + sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding) if sValue: arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) - return sValue or '' + return sValue or u'' def processSingleURI(sProperty): sValue = self.getPropertyValue(elmCard, sProperty, self.URI) @@ -2071,8 
+2209,8 @@ class _MicroformatsParser: sAgentValue = sAgentValue.replace(';', '\\;') if sAgentValue: arLines.append(self.vcardFold('AGENT:' + sAgentValue)) - elmAgent['class'] = '' - elmAgent.contents = [] + # Completely remove the agent element from the parse tree + elmAgent.extract() else: sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); if sAgentValue: @@ -2219,8 +2357,8 @@ class _MicroformatsParser: processSingleURI('key') if arLines: - arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard'] - sVCards += '\n'.join(arLines) + '\n' + arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard'] + sVCards += u'\n'.join(arLines) + u'\n' return sVCards.strip() @@ -2277,7 +2415,12 @@ class _MicroformatsParser: def _parseMicroformats(htmlSource, baseURI, encoding): if not BeautifulSoup: return if _debug: sys.stderr.write('entering _parseMicroformats\n') - p = _MicroformatsParser(htmlSource, baseURI, encoding) + try: + p = _MicroformatsParser(htmlSource, baseURI, encoding) + except UnicodeEncodeError: + # sgmllib throws this exception when performing lookups of tags + # with non-ASCII characters in them. + return p.vcard = p.findVCards(p.document) p.findTags() p.findEnclosures() @@ -2311,12 +2454,12 @@ class _RelativeURIResolver(_BaseHTMLProcessor): ('q', 'cite'), ('script', 'src')] - def __init__(self, baseuri, encoding, type): - _BaseHTMLProcessor.__init__(self, encoding, type) + def __init__(self, baseuri, encoding, _type): + _BaseHTMLProcessor.__init__(self, encoding, _type) self.baseuri = baseuri def resolveURI(self, uri): - return _urljoin(self.baseuri, uri.strip()) + return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip())) def unknown_starttag(self, tag, attrs): if _debug: @@ -2325,27 +2468,44 @@ class _RelativeURIResolver(_BaseHTMLProcessor): attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) -def _resolveRelativeURIs(htmlSource, baseURI, encoding, type): +def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type): if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') - p = _RelativeURIResolver(baseURI, encoding, type) + p = _RelativeURIResolver(baseURI, encoding, _type) p.feed(htmlSource) return p.output() +def _makeSafeAbsoluteURI(base, rel=None): + # bail if ACCEPTABLE_URI_SCHEMES is empty + if not ACCEPTABLE_URI_SCHEMES: + return _urljoin(base, rel or u'') + if not base: + return rel or u'' + if not rel: + scheme = urlparse.urlparse(base)[0] + if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: + return base + return u'' + uri = _urljoin(base, rel) + if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: + return u'' + return uri + class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', - 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', - 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', - 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', - 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', - 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', - 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', - 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', - 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', - 'small', 'sound', 'source', 'spacer', 'span', 'strike', 
'strong', 'sub', - 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', - 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] + acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', + 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', + 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', + 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', + 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', + 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', + 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', + 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', + 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', + 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', + 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', + 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis', @@ -2469,7 +2629,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): self.unacceptablestack += 1 # add implicit namespaces to html5 inline svg/mathml - if self.type.endswith('html'): + if self._type.endswith('html'): if not dict(attrs).get('xmlns'): if tag=='svg': attrs.append( ('xmlns','http://www.w3.org/2000/svg') ) @@ -2514,6 +2674,9 @@ class _HTMLSanitizer(_BaseHTMLProcessor): for key, value in self.normalize_attrs(attrs): if key in acceptable_attributes: key=keymap.get(key,key) + # make sure the uri uses an acceptable uri scheme + if key == u'href': + value = _makeSafeAbsoluteURI(value) clean_attrs.append((key,value)) elif key=='style': clean_value = self.sanitize_style(value) @@ -2569,9 +2732,22 @@ class _HTMLSanitizer(_BaseHTMLProcessor): return ' '.join(clean) + def parse_comment(self, i, report=1): + ret = _BaseHTMLProcessor.parse_comment(self, i, report) + if ret >= 0: + return ret + # if ret == -1, this may be a malicious attempt to circumvent + # sanitization, or a page-destroying unclosed comment + match = re.compile(r'--[^>]*>').search(self.rawdata, i+4) + if match: + return match.end() + # unclosed comment; deliberately fail to handle_data() + return len(self.rawdata) -def _sanitizeHTML(htmlSource, encoding, type): - p = _HTMLSanitizer(encoding, type) + +def _sanitizeHTML(htmlSource, encoding, _type): + p = _HTMLSanitizer(encoding, _type) + htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[') p.feed(htmlSource) data = p.output() if TIDY_MARKUP: @@ -2654,7 +2830,7 @@ class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler try: assert sys.version.split()[0] >= '2.3.3' assert base64 != None - user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') + user, passw = _base64decode(req.headers['Authorization'].split(' ')[1]).split(':') realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] self.add_password(realm, host, user, passw) retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) @@ -2663,7 +2839,7 @@ except: return self.http_error_default(req, fp, code, msg, headers) -def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, extra_headers): +def _open_resource(url_file_stream_or_string, etag, modified, agent, 
referrer, handlers, request_headers): """URL, filename, or string --> stream This function lets you define parsers that take any input source @@ -2691,7 +2867,7 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h If handlers is supplied, it is a list of handlers used to build a urllib2 opener. - if extra_headers is supplied it is a dictionary of HTTP request headers + if request_headers is supplied it is a dictionary of HTTP request headers that will override the values generated by FeedParser. """ @@ -2701,7 +2877,12 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h if url_file_stream_or_string == '-': return sys.stdin - if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): + if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): + # Deal with the feed URI scheme + if url_file_stream_or_string.startswith('feed:http'): + url_file_stream_or_string = url_file_stream_or_string[5:] + elif url_file_stream_or_string.startswith('feed:'): + url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:] if not agent: agent = USER_AGENT # test for inline user:password for basic auth @@ -2713,20 +2894,20 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h user_passwd, realhost = urllib.splituser(realhost) if user_passwd: url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) - auth = base64.encodestring(user_passwd).strip() + auth = base64.standard_b64encode(user_passwd).strip() # iri support try: if isinstance(url_file_stream_or_string,unicode): - url_file_stream_or_string = url_file_stream_or_string.encode('idna') + url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8') else: - url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna') + url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8') except: pass # try to open with urllib2 (to use optional headers) - request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, extra_headers) - opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) + request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers) + opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()])) opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent try: return opener.open(request) @@ -2735,20 +2916,22 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h # try to open with native open function (if url_file_stream_or_string is a filename) try: - return open(url_file_stream_or_string) + return open(url_file_stream_or_string, 'rb') except: pass # treat url_file_stream_or_string as string return _StringIO(str(url_file_stream_or_string)) -def _build_urllib2_request(url, agent, etag, modified, referrer, auth, extra_headers): +def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers): request = urllib2.Request(url) request.add_header('User-Agent', agent) if etag: request.add_header('If-None-Match', etag) if type(modified) == type(''): modified = _parse_date(modified) + elif isinstance(modified, datetime.datetime): + modified = modified.utctimetuple() if modified: # format into an RFC 1123-compliant timestamp. 
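The `datetime` branch added above converts a datetime value to a UTC 9-tuple so that this timestamp can be assembled by hand from hard-coded English names. A hedged sketch of that style of locale-independent RFC 1123 formatting follows; the helper name `if_modified_since` and the name lists are illustrative, not taken from the patch.

```python
import datetime

_days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

def if_modified_since(modified):
    # Accept either a 9-tuple (time.struct_time style) or a datetime,
    # mirroring the isinstance(modified, datetime.datetime) branch above.
    if isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    return '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
        _days[modified[6]], modified[2], _months[modified[1] - 1],
        modified[0], modified[3], modified[4], modified[5])

print(if_modified_since(datetime.datetime(2011, 4, 1, 12, 30, 0)))
# Fri, 01 Apr 2011 12:30:00 GMT
```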
We can't use # time.strftime() since the %a and %b directives can be affected @@ -2773,7 +2956,7 @@ def _build_urllib2_request(url, agent, etag, modified, referrer, auth, extra_hea request.add_header('Accept', ACCEPT_HEADER) # use this for whatever -- cookies, special headers, etc # [('Cookie','Something'),('x-special-header','Another Value')] - for header_name, header_value in extra_headers.items(): + for header_name, header_value in request_headers.items(): request.add_header(header_name, header_value) request.add_header('A-IM', 'feed') # RFC 3229 support return request @@ -2811,9 +2994,15 @@ _iso8601_re = [ + r'(\.(?P<fracsecond>\d+))?' + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?' for tmpl in _iso8601_tmpl] -del tmpl +try: + del tmpl +except NameError: + pass _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] -del regex +try: + del regex +except NameError: + pass def _parse_date_iso8601(dateString): '''Parse a variety of ISO-8601-compatible formats like 20040105''' m = None @@ -2887,7 +3076,7 @@ def _parse_date_iso8601(dateString): # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) # which is guaranteed to normalize d/m/y/h/m/s. # Many implementations have bugs, but we'll pretend they don't. - return time.localtime(time.mktime(tm)) + return time.localtime(time.mktime(tuple(tm))) registerDateHandler(_parse_date_iso8601) # 8-bit date handling routines written by ytrewq1. @@ -3128,12 +3317,12 @@ def _parse_date_w3dtf(dateString): __date_re = ('(?P<year>\d\d\d\d)' '(?:(?P<dsep>-|)' - '(?:(?P<julian>\d\d\d)' - '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?') + '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?' + '|(?P<julian>\d\d\d)))?') __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)' __tzd_rx = re.compile(__tzd_re) __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)' - '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?' + '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?' + __tzd_re) __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) __datetime_rx = re.compile(__datetime_re) @@ -3157,6 +3346,10 @@ def _parse_date_rfc822(dateString): else: data.append('') dateString = " ".join(data) + # Account for the Etc/GMT timezone by stripping 'Etc/' + elif len(data) == 5 and data[4].lower().startswith('etc/'): + data[4] = data[4][4:] + dateString = " ".join(data) if len(data) < 5: dateString += ' 00:00:00 GMT' tm = rfc822.parsedate_tz(dateString) @@ -3194,7 +3387,7 @@ def _parse_date(dateString): raise ValueError map(int, date9tuple) return date9tuple - except Exception, e: + except Exception as e: if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) pass return None @@ -3261,59 +3454,59 @@ def _getCharacterEncoding(http_headers, xml_data): sniffed_xml_encoding = '' xml_encoding = '' true_encoding = '' - http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) + http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type'))) # Must sniff for non-ASCII-compatible character encodings before # searching for XML declaration. 
This heuristic is defined in # section F of the XML specification: # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': + if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]): # EBCDIC xml_data = _ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': + elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]): # UTF-16BE sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): + elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': + elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]): # UTF-16LE sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): + elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': + elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]): # UTF-32BE sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': + elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]): # UTF-32LE sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': + elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]): # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\xff\xfe\x00\x00': + elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]): # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == '\xef\xbb\xbf': + elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]): # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') else: # ASCII-compatible pass - xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) + xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data) except: xml_encoding_match = None if xml_encoding_match: - xml_encoding = xml_encoding_match.groups()[0].lower() + xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower() if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): xml_encoding = sniffed_xml_encoding acceptable_content_type = 0 @@ -3329,7 +3522,7 @@ def _getCharacterEncoding(http_headers, xml_data): true_encoding = http_encoding or 'us-ascii' elif http_content_type.startswith('text/'): true_encoding = http_encoding or 'us-ascii' - elif http_headers and (not http_headers.has_key('content-type')): + elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))): true_encoding = xml_encoding or 'iso-8859-1' else: true_encoding = xml_encoding or 'utf-8' @@ -3347,35 +3540,35 @@ def _toUTF8(data, 
encoding): ''' if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): + if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-16be': sys.stderr.write('trying utf-16be instead\n') encoding = 'utf-16be' data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): + elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-16le': sys.stderr.write('trying utf-16le instead\n') encoding = 'utf-16le' data = data[2:] - elif data[:3] == '\xef\xbb\xbf': + elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-8': sys.stderr.write('trying utf-8 instead\n') encoding = 'utf-8' data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': + elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-32be': sys.stderr.write('trying utf-32be instead\n') encoding = 'utf-32be' data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': + elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]): if _debug: sys.stderr.write('stripping BOM\n') if encoding != 'utf-32le': @@ -3398,36 +3591,36 @@ def _stripDoctype(data): rss_version may be 'rss091n' or None stripped_data is the same XML document, minus the DOCTYPE ''' - start = re.search('<\w',data) + start = re.search(_s2bytes('<\w'), data) start = start and start.start() or -1 head,data = data[:start+1], data[start+1:] - entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE) + entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE) entity_results=entity_pattern.findall(head) - head = entity_pattern.sub('', head) - doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE) + head = entity_pattern.sub(_s2bytes(''), head) + doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE) doctype_results = doctype_pattern.findall(head) - doctype = doctype_results and doctype_results[0] or '' - if doctype.lower().count('netscape'): + doctype = doctype_results and doctype_results[0] or _s2bytes('') + if doctype.lower().count(_s2bytes('netscape')): version = 'rss091n' else: version = None # only allow in 'safe' inline entity definitions - replacement='' + replacement=_s2bytes('') if len(doctype_results)==1 and entity_results: - safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"') + safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"')) safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) if safe_entities: - replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities) + replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>') data = doctype_pattern.sub(replacement, head) + data - return version, data, dict(replacement and safe_pattern.findall(replacement)) + return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)]) -def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], extra_headers={}): +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, 
referrer=None, handlers=[], request_headers={}, response_headers={}): '''Parse a feed from a URL, file, stream, or string. - extra_headers, if given, is a dict from http header name to value to add + request_headers, if given, is a dict from http header name to value to add to the request; this overrides internally generated values. ''' result = FeedParserDict() @@ -3435,23 +3628,31 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['entries'] = [] if _XML_AVAILABLE: result['bozo'] = 0 - if type(handlers) == types.InstanceType: + if not isinstance(handlers, list): handlers = [handlers] try: - f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, extra_headers) + f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers) data = f.read() - except Exception, e: + except Exception as e: result['bozo'] = 1 result['bozo_exception'] = e data = None f = None + if hasattr(f, 'headers'): + result['headers'] = dict(f.headers) + # overwrite existing headers using response_headers + if 'headers' in result: + result['headers'].update(response_headers) + elif response_headers: + result['headers'] = copy.deepcopy(response_headers) + # if feed is gzip-compressed, decompress it - if f and data and hasattr(f, 'headers'): - if gzip and f.headers.get('content-encoding', '') == 'gzip': + if f and data and 'headers' in result: + if gzip and result['headers'].get('content-encoding') == 'gzip': try: data = gzip.GzipFile(fileobj=_StringIO(data)).read() - except Exception, e: + except Exception as e: # Some feeds claim to be gzipped but they're not, so # we get garbage. Ideally, we should re-request the # feed without the 'Accept-encoding: gzip' header, @@ -3459,30 +3660,29 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['bozo'] = 1 result['bozo_exception'] = e data = '' - elif zlib and f.headers.get('content-encoding', '') == 'deflate': + elif zlib and result['headers'].get('content-encoding') == 'deflate': try: data = zlib.decompress(data, -zlib.MAX_WBITS) - except Exception, e: + except Exception as e: result['bozo'] = 1 result['bozo_exception'] = e data = '' # save HTTP headers - if hasattr(f, 'info'): - info = f.info() - etag = info.getheader('ETag') - if etag: - result['etag'] = etag - last_modified = info.getheader('Last-Modified') - if last_modified: - result['modified'] = _parse_date(last_modified) + if 'headers' in result: + if 'etag' in result['headers'] or 'ETag' in result['headers']: + etag = result['headers'].get('etag', result['headers'].get('ETag')) + if etag: + result['etag'] = etag + if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']: + modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified')) + if modified: + result['modified'] = _parse_date(modified) if hasattr(f, 'url'): result['href'] = f.url result['status'] = 200 if hasattr(f, 'status'): result['status'] = f.status - if hasattr(f, 'headers'): - result['headers'] = f.headers.dict if hasattr(f, 'close'): f.close() @@ -3495,8 +3695,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \ _getCharacterEncoding(http_headers, data) if http_headers and (not acceptable_content_type): - if http_headers.has_key('content-type'): - bozo_message = '%s is not an XML media type' % http_headers['content-type'] 
+ if http_headers.has_key('content-type') or http_headers.has_key('Content-type'): + bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type')) else: bozo_message = 'no Content-type specified' result['bozo'] = 1 @@ -3505,8 +3705,12 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer if data is not None: result['version'], data, entities = _stripDoctype(data) - baseuri = http_headers.get('content-location', result.get('href')) - baselang = http_headers.get('content-language', None) + # ensure that baseuri is an absolute uri using an acceptable URI scheme + contentloc = http_headers.get('content-location', http_headers.get('Content-Location', '')) + href = result.get('href', '') + baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href + + baselang = http_headers.get('content-language', http_headers.get('Content-Language', None)) # if server sent 304, we're done if result.get('status', 0) == 304: @@ -3582,7 +3786,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer elif proposed_encoding != result['encoding']: result['bozo'] = 1 result['bozo_exception'] = CharacterEncodingOverride( \ - 'documented declared as %s, but parsed as %s' % \ + 'document declared as %s, but parsed as %s' % \ (result['encoding'], proposed_encoding)) result['encoding'] = proposed_encoding @@ -3603,7 +3807,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'}) try: saxparser.parse(source) - except Exception, e: + except Exception as e: if _debug: import traceback traceback.print_stack() @@ -3613,8 +3817,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['bozo_exception'] = feedparser.exc or e use_strict_parser = 0 if not use_strict_parser: - feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '', entities) - feedparser.feed(data) + feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities) + feedparser.feed(data.decode('utf-8', 'replace')) result['feed'] = feedparser.feeddata result['entries'] = feedparser.entries result['version'] = result['version'] or feedparser.version diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 6215132e4b..f2aeb4e4bd 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -14,7 +14,7 @@ from contextlib import nested, closing from calibre import browser, __appname__, iswindows, \ - strftime, preferred_encoding + strftime, preferred_encoding, as_unicode from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag from calibre.ebooks.metadata.opf2 import OPFCreator from calibre import entity_to_unicode @@ -986,8 +986,8 @@ class BasicNewsRecipe(Recipe): self.cover_path = None try: cu = self.get_cover_url() - except Exception, err: - self.log.error(_('Could not download cover: %s')%str(err)) + except Exception as err: + self.log.error(_('Could not download cover: %s')%as_unicode(err)) self.log.debug(traceback.format_exc()) else: if not cu: @@ -1318,11 +1318,11 @@ class BasicNewsRecipe(Recipe): oldest_article=self.oldest_article, max_articles_per_feed=self.max_articles_per_feed, get_article_url=self.get_article_url)) - except Exception, err: + except Exception as err: feed = Feed() msg = 'Failed feed: %s'%(title if title else url) feed.populate_from_preparsed_feed(msg, []) - 
feed.description = repr(err) + feed.description = as_unicode(err) parsed_feeds.append(feed) self.log.exception(msg) @@ -1468,7 +1468,7 @@ class CalibrePeriodical(BasicNewsRecipe): 'http://news.calibre-ebook.com/subscribed_files/%s/0/temp.downloaded_recipe' % self.calibre_periodicals_slug ).read() - except Exception, e: + except Exception as e: if hasattr(e, 'getcode') and e.getcode() == 403: raise DownloadDenied( _('You do not have permission to download this issue.' diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index f2e22c8f5e..64a2c32fb3 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -210,7 +210,7 @@ class RecursiveFetcher(object): with closing(open_func(url, timeout=self.timeout)) as f: data = response(f.read()+f.read()) data.newurl = f.geturl() - except urllib2.URLError, err: + except urllib2.URLError as err: if hasattr(err, 'code') and responses.has_key(err.code): raise FetchError, responses[err.code] if getattr(err, 'reason', [0])[0] == 104 or \