From 5eb92ac4ee0a4507a48ddcd2b910761706f08290 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 14 Sep 2008 15:51:03 -0700 Subject: [PATCH] Add a browse by tags mode --- src/calibre/__init__.py | 2 +- src/calibre/ebooks/epub/__init__.py | 2 - src/calibre/ebooks/epub/from_html.py | 45 +- src/calibre/ebooks/html.py | 154 +- src/calibre/ebooks/metadata/toc.py | 9 + src/calibre/gui2/__init__.py | 5 +- src/calibre/gui2/dialogs/metadata_single.py | 4 +- src/calibre/gui2/images/minus.svg | 325 ++++ src/calibre/gui2/images/plus.svg | 930 ++++------ src/calibre/gui2/images/publisher.png | Bin 0 -> 17173 bytes src/calibre/gui2/images/series.svg | 1096 +++++++++++ src/calibre/gui2/images/tags.svg | 503 +++++ src/calibre/gui2/library.py | 15 + src/calibre/gui2/main.py | 20 +- src/calibre/gui2/main.ui | 159 +- src/calibre/gui2/status.py | 15 + src/calibre/gui2/tags.py | 143 ++ src/calibre/library/database2.py | 89 +- src/calibre/trac/plugins/download.py | 2 +- src/calibre/utils/mechanize/__init__.py | 125 -- src/calibre/utils/mechanize/_auth.py | 500 ----- src/calibre/utils/mechanize/_beautifulsoup.py | 1080 ----------- src/calibre/utils/mechanize/_clientcookie.py | 1651 ----------------- src/calibre/utils/mechanize/_debug.py | 28 - src/calibre/utils/mechanize/_gzip.py | 103 - src/calibre/utils/mechanize/_headersutil.py | 226 --- src/calibre/utils/mechanize/_html.py | 607 ------ src/calibre/utils/mechanize/_http.py | 729 -------- src/calibre/utils/mechanize/_lwpcookiejar.py | 185 -- src/calibre/utils/mechanize/_mechanize.py | 656 ------- .../utils/mechanize/_mozillacookiejar.py | 159 -- src/calibre/utils/mechanize/_msiecookiejar.py | 387 ---- src/calibre/utils/mechanize/_opener.py | 421 ----- src/calibre/utils/mechanize/_pullparser.py | 334 ---- src/calibre/utils/mechanize/_request.py | 86 - src/calibre/utils/mechanize/_response.py | 515 ----- src/calibre/utils/mechanize/_rfc3986.py | 240 --- src/calibre/utils/mechanize/_seek.py | 16 - src/calibre/utils/mechanize/_upgrade.py | 40 - src/calibre/utils/mechanize/_urllib2.py | 62 - src/calibre/utils/mechanize/_useragent.py | 348 ---- src/calibre/utils/mechanize/_util.py | 279 --- src/calibre/web/feeds/news.py | 14 +- 43 files changed, 2748 insertions(+), 9561 deletions(-) create mode 100644 src/calibre/gui2/images/minus.svg create mode 100644 src/calibre/gui2/images/publisher.png create mode 100644 src/calibre/gui2/images/series.svg create mode 100644 src/calibre/gui2/images/tags.svg create mode 100644 src/calibre/gui2/tags.py delete mode 100644 src/calibre/utils/mechanize/__init__.py delete mode 100644 src/calibre/utils/mechanize/_auth.py delete mode 100644 src/calibre/utils/mechanize/_beautifulsoup.py delete mode 100644 src/calibre/utils/mechanize/_clientcookie.py delete mode 100644 src/calibre/utils/mechanize/_debug.py delete mode 100644 src/calibre/utils/mechanize/_gzip.py delete mode 100644 src/calibre/utils/mechanize/_headersutil.py delete mode 100644 src/calibre/utils/mechanize/_html.py delete mode 100644 src/calibre/utils/mechanize/_http.py delete mode 100644 src/calibre/utils/mechanize/_lwpcookiejar.py delete mode 100644 src/calibre/utils/mechanize/_mechanize.py delete mode 100644 src/calibre/utils/mechanize/_mozillacookiejar.py delete mode 100644 src/calibre/utils/mechanize/_msiecookiejar.py delete mode 100644 src/calibre/utils/mechanize/_opener.py delete mode 100644 src/calibre/utils/mechanize/_pullparser.py delete mode 100644 src/calibre/utils/mechanize/_request.py delete mode 100644 src/calibre/utils/mechanize/_response.py delete mode 
100644 src/calibre/utils/mechanize/_rfc3986.py delete mode 100644 src/calibre/utils/mechanize/_seek.py delete mode 100644 src/calibre/utils/mechanize/_upgrade.py delete mode 100644 src/calibre/utils/mechanize/_urllib2.py delete mode 100644 src/calibre/utils/mechanize/_useragent.py delete mode 100644 src/calibre/utils/mechanize/_util.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index c69518c0c5..5e7a2df8eb 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -14,7 +14,7 @@ from calibre.constants import iswindows, isosx, islinux, isfrozen, \ terminal_controller, preferred_encoding, \ __appname__, __version__, __author__, \ win32event, win32api, winerror, fcntl -from calibre.utils import mechanize +import mechanize def unicode_path(path, abs=False): if not isinstance(path, unicode): diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 0d5ed517f3..0585385143 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -66,8 +66,6 @@ to auto-generate a Table of Contents. help=_('Maximum number of links from each HTML file to insert into the TOC. Set to 0 to disable. Default is: %default.')) toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, help=_("Don't add auto-detected chapters to the Table of Contents.")) - toc('add_files_to_toc', ['--add-files-to-toc'], default=False, - help=_('If more than one HTML file is found, create a TOC entry for each file.')) return c \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 59cd871dc7..6abb45e858 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -5,19 +5,19 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' import os, sys, re, shutil, cStringIO from lxml.etree import XPath -from lxml import etree -from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist,\ +from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\ opf_traverse, create_metadata, rebase_toc from calibre.ebooks.epub import config as common_config from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata.toc import TOC -class HTMLProcessor(Parser): +class HTMLProcessor(Processor): - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, toc=None): - Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, + def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles): + Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='html2epub') if opts.verbose > 2: self.debug_tree('parsed') @@ -27,36 +27,11 @@ class HTMLProcessor(Parser): if opts.verbose > 2: self.debug_tree('nocss') - if toc is not None: - self.populate_toc(toc) - self.collect_font_statistics() self.split() - def detect_chapters(self): - self.detected_chapters = self.opts.chapter(self.root) - for elem in self.detected_chapters: - style = elem.get('style', '') - style += ';page-break-before: always' - elem.set(style, style) - - def save(self): - head = self.root.xpath('//head') - if head: - head = head[0] - else: - head = self.root.xpath('//body') - head = head[0] if head else self.root - style = etree.SubElement(head, 'style', attrib={'type':'text/css'}) - style.text='\n'+self.css - style.tail = '\n\n' - Parser.save(self) - - def populate_toc(self, toc): - if self.level >= 
self.opts.max_toc_recursion:
-            return
-
+
     def collect_font_statistics(self):
         '''
@@ -93,11 +68,13 @@ the <spine> element of the OPF file.
 def parse_content(filelist, opts, tdir):
     os.makedirs(os.path.join(tdir, 'content', 'resources'))
     resource_map = {}
+    toc = TOC(base_path=tdir)
     for htmlfile in filelist:
         hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'),
                            resource_map, filelist)
+        hp.populate_toc(toc)
         hp.save()
-    return resource_map, hp.htmlfile_map
+    return resource_map, hp.htmlfile_map, toc
 
 def convert(htmlfile, opts, notification=None):
     htmlfile = os.path.abspath(htmlfile)
@@ -115,7 +92,7 @@ def convert(htmlfile, opts, notification=None):
                       namespaces={'re':'http://exslt.org/regular-expressions'})
 
     with TemporaryDirectory('_html2epub') as tdir:
-        resource_map, htmlfile_map = parse_content(filelist, opts, tdir)
+        resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir)
         resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
 
         if opf.cover and os.access(opf.cover, os.R_OK):
@@ -130,6 +107,8 @@ def convert(htmlfile, opts, notification=None):
         buf = cStringIO.StringIO()
         if mi.toc:
             rebase_toc(mi.toc, htmlfile_map, opts.output)
+        if mi.toc is None or len(mi.toc) < 2:
+            mi.toc = generated_toc
         with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
             mi.render(f, buf)
         toc = buf.getvalue()
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index be3f7201e3..f96cde8623 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -12,7 +12,7 @@ import sys, re, os, shutil, logging, tempfile, cStringIO
 from urlparse import urlparse
 from urllib import unquote
 
-from lxml import html
+from lxml import html, etree
 from lxml.etree import XPath
 
 get_text = XPath("//text()")
@@ -83,20 +83,24 @@ class HTMLFile(object):
 
     The encoding of the file is available as :member:`encoding`.
     '''
-    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
-    LINK_PAT  = re.compile(
+    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
+    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
+    LINK_PAT  = re.compile(
     r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
     re.DOTALL|re.IGNORECASE)
 
-    def __init__(self, path_to_html_file, level, encoding, verbose):
+    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
         '''
         :param level: The level of this file. Should be 0 for the root file.
         :param encoding: Use `encoding` to decode HTML.
+        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
-        self.path = unicode_path(path_to_html_file, abs=True)
-        self.base = os.path.dirname(self.path)
-        self.level = level
-        self.links = []
+        self.path = unicode_path(path_to_html_file, abs=True)
+        self.title = os.path.splitext(os.path.basename(self.path))[0]
+        self.base = os.path.dirname(self.path)
+        self.level = level
+        self.referrer = referrer
+        self.links = []
 
         try:
             with open(self.path, 'rb') as f:
@@ -115,6 +119,9 @@ class HTMLFile(object):
                 self.encoding = encoding
 
         src = src.decode(encoding, 'replace')
+        match = self.TITLE_PAT.search(src)
+        if match is not None:
+            self.title = match.group(1)
         self.find_links(src)
 
 
@@ -187,7 +194,7 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
             if link.path is None or link.path in flat:
                 continue
             try:
-                nf = HTMLFile(link.path, level, encoding, verbose)
+                nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
                 nl.append(nf)
                 flat.append(nf)
             except IgnoreFile, err:
@@ -383,12 +390,110 @@ class Parser(PreProcessor, LoggingInterface):
             name = 'resources/' + name
         self.resource_map[link.path] = name
         return name
+
+class Processor(Parser):
+    '''
+    This class builds on :class:`Parser` to provide additional methods
+    to perform various processing/modification tasks on HTML files.
+    '''
+    LINKS_PATH = XPath('//a[@href]')
+
+    def detect_chapters(self):
+        self.detected_chapters = self.opts.chapter(self.root)
+        for elem in self.detected_chapters:
+            style = elem.get('style', '').strip()
+            if style and not style.endswith(';'):
+                style += '; '
+            style += 'page-break-before: always'
+            elem.set('style', style)
+
+    def save(self):
+        head = self.root.xpath('//head')
+        if head:
+            head = head[0]
+        else:
+            head = self.root.xpath('//body')
+            head = head[0] if head else self.root
+        style = etree.SubElement(head, 'style', attrib={'type':'text/css'})
+        style.text = '\n'+self.css
+        style.tail = '\n\n'
+        Parser.save(self)
+
+    def populate_toc(self, toc):
+        if self.level >= self.opts.max_toc_recursion:
+            return
+
+        referrer = toc
+        if self.htmlfile.referrer is not None:
+            name = self.htmlfile_map[self.htmlfile.referrer]
+            href = 'content/'+name
+            for i in toc.flat():
+                if href == i.href and i.fragment is None:
+                    referrer = i
+                    break
+
+        def add_item(href, fragment, text, target):
+            for entry in toc.flat():
+                if entry.href == href and entry.fragment == fragment:
+                    return entry
+            if len(text) > 50:
+                text = text[:50] + u'\u2026'
+            return target.add_item(href, fragment, text)
+
+        name = self.htmlfile_map[self.htmlfile]
+        href = 'content/'+name
+
+        if referrer.href != href: # Happens for root file
+            target = add_item(href, None, self.htmlfile.title, referrer)
+
+        # Add links to TOC
+        if self.opts.max_toc_links > 0:
+            for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]:
+                text = (u''.join(link.xpath('string()'))).strip()
+                if text:
+                    href = link.get('href', '')
+                    if href:
+                        href = 'content/'+href
+                        parts = href.split('#')
+                        href, fragment = parts[0], None
+                        if len(parts) > 1:
+                            fragment = parts[1]
+                        if self.htmlfile.referrer is not None:
+                            name = self.htmlfile_map[self.htmlfile.referrer.path]
+                        add_item(href, fragment, text, target)
+
+        # Add chapters to TOC
+        if not self.opts.no_chapters_in_toc:
+            for elem in getattr(self, 'detected_chapters', []):
+                text = (u''.join(elem.xpath('string()'))).strip()
+                if text:
+                    name = self.htmlfile_map[self.path]
+                    href = 'content/'+name
+                    add_item(href, None, text, target)
+
+
     def extract_css(self):
         '''
         Remove all CSS information from the document and store in
         self.raw_css. This includes <style> tags.
        '''
+        counter = [0]
+        def get_id(chapter, prefix='calibre_css_'):
+            new_id = '%s_%d'%(prefix, counter[0])
+            counter[0] += 1
+            if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
+                chapter.attrib['id'] = id = chapter.get('name')
+                if not id:
+                    chapter.attrib['id'] = chapter.attrib['name'] = new_id
+                    return new_id
+            if 'id' in chapter.keys():
+                id = chapter.get('id')
+            else:
+                id = new_id
+                chapter.set('id', id)
+            return id
+
         css = []
         for link in self.root.xpath('//link'):
             if 'css' in link.get('type', 'text/css').lower():
@@ -402,7 +507,6 @@ class Parser(PreProcessor, LoggingInterface):
                 css.append('\n'.join(get_text(style)))
             style.getparent().remove(style)
 
-        font_id = 1
         for font in self.root.xpath('//font'):
             try:
                 size = int(font.attrib.pop('size', '3'))
@@ -415,37 +519,15 @@ class Parser(PreProcessor, LoggingInterface):
             color = font.attrib.pop('color', None)
             if color is not None:
                 setting += 'color:%s'%color
-            id = 'calibre_font_id_%d'%font_id
-            font.set('id', 'calibre_font_id_%d'%font_id)
-            font_id += 1
+            id = get_id(font)
             css.append('#%s { %s }'%(id, setting))
-
-        css_counter = 1
         for elem in self.root.xpath('//*[@style]'):
-            if 'id' not in elem.keys():
-                elem.set('id', 'calibre_css_id_%d'%css_counter)
-                css_counter += 1
-            css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
+            id = get_id(elem)
+            css.append('#%s {%s}'%(id, elem.get('style')))
             elem.attrib.pop('style')
 
-        chapter_counter = 1
-        for chapter in self.detected_chapters:
-            if chapter.tag.lower() == 'a':
-                if 'name' in chapter.keys():
-                    chapter.attrib['id'] = id = chapter.get('name')
-                elif 'id' in chapter.keys():
-                    id = chapter.get('id')
-                else:
-                    id = 'calibre_detected_chapter_%d'%chapter_counter
-                    chapter_counter += 1
-                    chapter.set('id', id)
-            else:
-                if 'id' not in chapter.keys():
-                    id = 'calibre_detected_chapter_%d'%chapter_counter
-                    chapter_counter += 1
-                    chapter.set('id', id)
-            css.append('#%s {%s}'%(id, 'page-break-before:always'))
-
+
         self.raw_css = '\n\n'.join(css)
         self.css = unicode(self.raw_css)
         # TODO: Figure out what to do about CSS imports from linked stylesheets
diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py
index cd28b9799e..2eb6402a41 100644
--- a/src/calibre/ebooks/metadata/toc.py
+++ b/src/calibre/ebooks/metadata/toc.py
@@ -48,10 +48,19 @@ class TOC(list):
             depth = c + 1
         return depth
 
+    def flat(self):
+        'Depth-first iteration over the tree rooted at self'
+        yield self
+        for obj in self:
+            for i in obj.flat():
+                yield i
+
     @apply
     def abspath():
         doc='Return the file this toc entry points to as an absolute path to a file on the system.'
         def fget(self):
+            if self.href is None:
+                return None
             path = self.href.replace('/', os.sep)
             if not os.path.isabs(path):
                 path = os.path.join(self.base_path, path)
diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py
index 2727d7b7cb..1681eb6ff4 100644
--- a/src/calibre/gui2/__init__.py
+++ b/src/calibre/gui2/__init__.py
@@ -3,7 +3,8 @@ __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 """ The GUI """
 import sys, os, re, StringIO, traceback
 from PyQt4.QtCore import QVariant, QFileInfo, QObject, SIGNAL, QBuffer, Qt, QSize, \
-                         QByteArray, QLocale, QUrl, QTranslator, QCoreApplication
+                         QByteArray, QLocale, QUrl, QTranslator, QCoreApplication, \
+                         QModelIndex
 from PyQt4.QtGui import QFileDialog, QMessageBox, QPixmap, QFileIconProvider, \
                         QIcon, QTableView, QDialogButtonBox, QApplication
 
@@ -159,7 +160,7 @@ class TableView(QTableView):
         else:
             cols = dynamic[key]
             if not cols:
-                cols = [True for i in range(self.model().columnCount(self))]
+                cols = [True for i in range(self.model().columnCount(QModelIndex()))]
         for i in range(len(cols)):
             hidden = self.isColumnHidden(i)
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index 9e17b83b4f..342bf25cd3 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -304,8 +304,8 @@ class MetadataSingleDialog(QDialog, Ui_MetadataSingleDialog):
         self.title.setText(book.title)
         self.authors.setText(', '.join(book.authors))
         if book.author_sort: self.author_sort.setText(book.author_sort)
-        self.publisher.setText(book.publisher)
-        self.isbn.setText(book.isbn)
+        if book.publisher: self.publisher.setText(book.publisher)
+        if book.isbn: self.isbn.setText(book.isbn)
         summ = book.comments
         if summ:
             prefix = qstring_to_unicode(self.comments.toPlainText())
diff --git a/src/calibre/gui2/images/minus.svg b/src/calibre/gui2/images/minus.svg
new file mode 100644
index 0000000000..8d483a3c2d
--- /dev/null
+++ b/src/calibre/gui2/images/minus.svg
@@ -0,0 +1,325 @@
[SVG image markup omitted]
diff --git a/src/calibre/gui2/images/plus.svg b/src/calibre/gui2/images/plus.svg
index af011703ff..50a22437a7 100644
--- a/src/calibre/gui2/images/plus.svg
+++ b/src/calibre/gui2/images/plus.svg
@@ -7,239 +7,48 @@
[SVG image markup omitted]
diff --git a/src/calibre/gui2/images/publisher.png b/src/calibre/gui2/images/publisher.png
new file mode 100644
index 0000000000000000000000000000000000000000..d3ff9aaf029f25608f918b6058b58bd0fb4e8a3f
GIT binary patch
literal 17173
[binary image data omitted]

literal 0
HcmV?d00001
diff --git a/src/calibre/gui2/images/series.svg b/src/calibre/gui2/images/series.svg
new file mode 100644
index 0000000000..c26d1ef7a2
--- /dev/null
+++ b/src/calibre/gui2/images/series.svg
@@ -0,0 +1,1096 @@
[SVG image markup omitted]
\ No newline at end of file
diff --git a/src/calibre/gui2/images/tags.svg b/src/calibre/gui2/images/tags.svg
new file mode 100644
index 0000000000..608f637c82
--- /dev/null
+++ b/src/calibre/gui2/images/tags.svg
@@ -0,0 +1,503 @@
[SVG image markup omitted (Oxygen team icon)]
diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py
index 03fb4c59b6..5309455b99 100644
--- a/src/calibre/gui2/library.py
+++ b/src/calibre/gui2/library.py
@@ -201,9 +201,13 @@ class BooksModel(QAbstractTableModel):
                LibraryDatabase.sizeof_old_database(path) > 0
 
     def columnCount(self, parent):
+        if parent and parent.isValid():
+            return 0
         return len(self.cols)
 
     def rowCount(self, parent):
+        if parent and parent.isValid():
+            return 0
         return self.db.rows() if self.db else 0
 
     def count(self):
@@ -676,9 +680,13 @@ class DeviceBooksModel(BooksModel):
         self.reset()
 
     def columnCount(self, parent):
+        if parent and parent.isValid():
+            return 0
         return 5
 
     def rowCount(self, parent):
+        if parent and parent.isValid():
+            return 0
         return len(self.map)
 
     def set_database(self, db):
@@ -855,6 +863,13 @@ class SearchBox(QLineEdit):
             self.prev_search = text
             self.emit(SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), text, refinement)
 
+    def search_from_tokens(self, tokens, all):
+        ans = u' '.join([u'%s:%s'%x for x in tokens])
+        if not all:
+            ans = '[' + ans + ']'
+        self.set_search_string(ans)
+
+
     def set_search_string(self, txt):
         self.normalize_state()
         self.setText(txt)
diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py
index 652ffcae09..6204bc40ee 100644
--- a/src/calibre/gui2/main.py
+++ b/src/calibre/gui2/main.py
@@ -245,6 +245,13 @@ in which you want to store your books files. Any existing books will be automati
             self.cover_cache = CoverCache(self.library_path)
             self.cover_cache.start()
             self.library_view.model().cover_cache = self.cover_cache
+            self.tags_view.setVisible(False)
+            self.match_all.setVisible(False)
+            self.match_any.setVisible(False)
+            self.tags_view.set_database(db, self.match_all)
+            self.connect(self.tags_view, SIGNAL('tags_marked(PyQt_PyObject, PyQt_PyObject)'),
+                         self.search.search_from_tokens)
+            self.connect(self.status_bar.tag_view_button, SIGNAL('toggled(bool)'), self.toggle_tags_view)
         ########################### Cover Flow ################################
         self.cover_flow = None
         if CoverFlow is not None:
@@ -284,6 +291,16 @@ in which you want to store your books files.
Any existing books will be automati self.status_bar.book_info.book_data.setMaximumHeight(1000) self.setMaximumHeight(available_height()) + def toggle_tags_view(self, show): + if show: + self.tags_view.setVisible(True) + self.match_all.setVisible(True) + self.match_any.setVisible(True) + self.tags_view.setFocus(Qt.OtherFocusReason) + else: + self.tags_view.setVisible(False) + self.match_all.setVisible(False) + self.match_any.setVisible(False) def sync_cf_to_listview(self, index, *args): if not hasattr(index, 'row') and self.library_view.currentIndex().row() != index: @@ -787,7 +804,8 @@ in which you want to store your books files. Any existing books will be automati self.status_bar.showMessage(_('News fetched. Uploading to device.'), 2000) self.persistent_files.append(pt) try: - os.remove(pt.name) + if not to_device: + os.remove(pt.name) except: pass diff --git a/src/calibre/gui2/main.ui b/src/calibre/gui2/main.ui index a5f7c2ddcc..59ce80884a 100644 --- a/src/calibre/gui2/main.ui +++ b/src/calibre/gui2/main.ui @@ -24,14 +24,6 @@ :/library:/library - - - 0 - 79 - 865 - 716 - - @@ -242,60 +234,88 @@ - 2 + 0 - - - 0 - 0 - 100 - 30 - - - + - - - - 100 - 10 - - - - true - - - true - - - false - - - QAbstractItemView::DragDrop - - - true - - - QAbstractItemView::SelectRows - - - false - - + + + + + + + Match any + + + false + + + + + + + Match all + + + true + + + + + + + true + + + true + + + true + + + true + + + + + + + + + + 100 + 10 + + + + true + + + true + + + false + + + QAbstractItemView::DragDrop + + + true + + + QAbstractItemView::SelectRows + + + false + + + + - - - 0 - 0 - 100 - 30 - - @@ -331,14 +351,6 @@ - - - 0 - 0 - 857 - 552 - - @@ -378,14 +390,6 @@ - - - 0 - 0 - 865 - 79 - - 0 @@ -425,14 +429,6 @@ - - - 0 - 795 - 865 - 27 - - true @@ -564,6 +560,11 @@ QTableView
</extends>
   <header>library.h</header>
  </customwidget>
+ <customwidget>
+  <class>TagsView</class>
+  <extends>QTreeView</extends>
+  <header>tags.h</header>
+ </customwidget>
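
Reviewer note: the tag browser UI above feeds the existing search box. A minimal
standalone sketch of the token-to-query step, with hypothetical sample tokens;
the joining and bracketing mirror TagsModel.tokens() and
SearchBox.search_from_tokens() in the library.py hunk above (tokens() prefixes
'!' to a tag toggled to the excluded state, and search_from_tokens() wraps the
expression in [] when 'Match all' is not checked):

    # Python 2 sketch, not part of the patch
    tokens = [('tag', 'fiction'), ('author', '!King')]  # hypothetical sample

    def search_from_tokens(tokens, match_all):
        # Each token is a (category, name) pair; join as category:name terms.
        ans = u' '.join(u'%s:%s' % t for t in tokens)
        if not match_all:
            # Brackets switch the query from match-all to match-any.
            ans = '[' + ans + ']'
        return ans

    print search_from_tokens(tokens, True)   # tag:fiction author:!King
    print search_from_tokens(tokens, False)  # [tag:fiction author:!King]
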
diff --git a/src/calibre/gui2/status.py b/src/calibre/gui2/status.py index ce259acd9f..d627330f1f 100644 --- a/src/calibre/gui2/status.py +++ b/src/calibre/gui2/status.py @@ -140,13 +140,28 @@ class CoverFlowButton(QToolButton): def disable(self, reason): self.setDisabled(True) self.setToolTip(_('
<p>Browsing books by their covers is disabled.<br>Import of pictureflow module failed:<br>
')+reason) + +class TagViewButton(QToolButton): + + def __init__(self, parent=None): + QToolButton.__init__(self, parent) + self.setIconSize(QSize(80, 80)) + self.setIcon(QIcon(':/images/tags.svg')) + self.setToolTip(_('Click to browse books by tags')) + self.setSizePolicy(QSizePolicy(QSizePolicy.Preferred, QSizePolicy.Expanding)) + self.setCheckable(True) + self.setChecked(False) + self.setAutoRaise(True) + class StatusBar(QStatusBar): def __init__(self, jobs_dialog): QStatusBar.__init__(self) self.movie_button = MovieButton(QMovie(':/images/jobs-animated.mng'), jobs_dialog) self.cover_flow_button = CoverFlowButton() + self.tag_view_button = TagViewButton() self.addPermanentWidget(self.cover_flow_button) + self.addPermanentWidget(self.tag_view_button) self.addPermanentWidget(self.movie_button) self.book_info = BookInfoDisplay(self.clearMessage) self.connect(self.book_info, SIGNAL('show_book_info()'), self.show_book_info) diff --git a/src/calibre/gui2/tags.py b/src/calibre/gui2/tags.py new file mode 100644 index 0000000000..190dfe7567 --- /dev/null +++ b/src/calibre/gui2/tags.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Browsing book collection by tags. +''' + +from PyQt4.Qt import QAbstractItemModel, Qt, QVariant, QTreeView, QModelIndex, \ + QFont, SIGNAL, QSize, QColor, QIcon + +NONE = QVariant() + +class TagsView(QTreeView): + + def __init__(self, *args): + QTreeView.__init__(self, *args) + self.setUniformRowHeights(True) + self.setCursor(Qt.PointingHandCursor) + self.setIconSize(QSize(30, 30)) + + def set_database(self, db, match_all): + self._model = TagsModel(db) + self.match_all = match_all + self.setModel(self._model) + self.connect(self, SIGNAL('clicked(QModelIndex)'), self.toggle) + + def toggle(self, index): + if self._model.toggle(index): + self.emit(SIGNAL('tags_marked(PyQt_PyObject, PyQt_PyObject)'), + self._model.tokens(), self.match_all.isChecked()) + +class Tag(unicode): + + def __init__(self, name): + unicode.__init__(self, name) + self.state = 0 + +class TagsModel(QAbstractItemModel): + + categories = [_('Authors'), _('Series'), _('Formats'), _('Publishers'), _('Tags')] + row_map = {0: 'author', 1:'series', 2:'format', 3:'publisher', 4:'tag'} + + def __init__(self, db): + QAbstractItemModel.__init__(self) + self.db = db + self.refresh() + self.bold_font = QFont() + self.bold_font.setBold(True) + self.bold_font = QVariant(self.bold_font) + self.status_map = [QColor(200,200,200, 0), QIcon(':/images/plus.svg'), QIcon(':/images/minus.svg')] + self.status_map = list(map(QVariant, self.status_map)) + self.cmap = [QIcon(':/images/user_profile.svg'), QIcon(':/images/series.svg'), QIcon(':/images/book.svg'), QIcon(':/images/publisher.png'), QIcon(':/images/tags.svg')] + self.cmap = list(map(QVariant, self.cmap)) + + def refresh(self): + self._data = self.db.get_categories() + for key in self._data: + self._data[key] = list(map(Tag, self._data[key])) + self.reset() + + def toggle(self, index): + if index.parent().isValid(): + category = self.row_map[index.parent().row()] + tag = self._data[category][index.row()] + tag.state = (tag.state + 1)%3 + self.emit(SIGNAL('dataChanged(QModelIndex,QModelIndex)'), index, index) + return True + return False + + def tokens(self): + ans = [] + for key in self.row_map.values(): + for tag in self._data[key]: + if tag.state > 0: + if tag.state == 2: + tag = '!'+tag + ans.append((key, tag)) + return ans + + def index(self, 
row, col, parent=QModelIndex()): + if parent.isValid(): + if parent.parent().isValid(): # parent is a tag + return QModelIndex() + try: + category = self.row_map[parent.row()] + except KeyError: + return QModelIndex() + if col == 0 and row < len(self._data[category]): + return self.createIndex(row, col, parent.row()) + return QModelIndex() + if col == 0 and row < len(self.categories): + return self.createIndex(row, col, -1) + return QModelIndex() + + def parent(self, index): + if not index.isValid() or index.internalId() < 0: + return QModelIndex() + return self.createIndex(index.internalId(), 0, -1) + + def rowCount(self, parent): + if not parent or not parent.isValid(): + return len(self.categories) + if not parent.parent().isValid(): + return len(self._data[self.row_map[parent.row()]]) + return 0 + + def columnCount(self, parent): + return 1 + + def flags(self, index): + if not index.isValid(): + return Qt.NoItemFlags + return Qt.ItemIsEnabled + + def category_data(self, index, role): + if role == Qt.DisplayRole: + row = index.row() + return QVariant(self.categories[row]) + if role == Qt.FontRole: + return self.bold_font + if role == Qt.SizeHintRole: + return QVariant(QSize(100, 40)) + if role == Qt.DecorationRole: + return self.cmap[index.row()] + return NONE + + def tag_data(self, index, role): + category = self.row_map[index.parent().row()] + if role == Qt.DisplayRole: + return QVariant(self._data[category][index.row()]) + if role == Qt.DecorationRole: + return self.status_map[self._data[category][index.row()].state] + return NONE + + + def data(self, index, role): + if not index.parent().isValid(): + return self.category_data(index, role) + if not index.parent().parent().isValid(): + return self.tag_data(index, role) + return NONE diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 63bd0d10d0..d3a231a87c 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -214,7 +214,7 @@ class ResultCache(object): for id in ids: self._data[id] = conn.execute('SELECT * from meta WHERE id=?', (id,)).fetchone() return map(self.row, ids) - + def refresh(self, db, field, ascending): field = field.lower() method = getattr(self, 'sort_on_' + self.METHOD_MAP[field]) @@ -396,6 +396,25 @@ class LibraryDatabase2(LibraryDatabase): CREATE INDEX series_idx ON series (name COLLATE NOCASE); CREATE INDEX series_sort_idx ON books (series_index, id); ''')) + + def upgrade_version_2(self): + ''' Fix Foreign key constraints for deleting from link tables. ''' + script = textwrap.dedent('''\ + DROP TRIGGER fkc_delete_books_%(ltable)s_link; + CREATE TRIGGER fkc_delete_on_%(table)s + BEFORE DELETE ON %(table)s + BEGIN + SELECT CASE + WHEN (SELECT COUNT(id) FROM books_%(ltable)s_link WHERE %(ltable_col)s=OLD.id) > 0 + THEN RAISE(ABORT, 'Foreign key violation: %(table)s is still referenced') + END; + END; + DELETE FROM %(table)s WHERE (SELECT COUNT(id) FROM books_%(ltable)s_link WHERE %(ltable_col)s=%(table)s.id) < 1; + ''') + self.conn.executescript(script%dict(ltable='authors', table='authors', ltable_col='author')) + self.conn.executescript(script%dict(ltable='publishers', table='publishers', ltable_col='publisher')) + self.conn.executescript(script%dict(ltable='tags', table='tags', ltable_col='tag')) + self.conn.executescript(script%dict(ltable='series', table='series', ltable_col='series')) def path(self, index, index_is_id=False): 'Return the relative path to the directory containing this books files as a unicode string.' 
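
Reviewer note: the trigger template in upgrade_version_2() above is easier to
check expanded. Substituting ltable='tags', table='tags', ltable_col='tag'
yields the SQL below; the authors, publishers and series substitutions are
analogous:

    DROP TRIGGER fkc_delete_books_tags_link;
    CREATE TRIGGER fkc_delete_on_tags
    BEFORE DELETE ON tags
    BEGIN
        SELECT CASE
            WHEN (SELECT COUNT(id) FROM books_tags_link WHERE tag=OLD.id) > 0
            THEN RAISE(ABORT, 'Foreign key violation: tags is still referenced')
        END;
    END;
    DELETE FROM tags WHERE (SELECT COUNT(id) FROM books_tags_link WHERE tag=tags.id) < 1;
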
@@ -596,6 +615,33 @@ class LibraryDatabase2(LibraryDatabase): self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, format.upper())) self.conn.commit() + def clean(self): + ''' + Remove orphaned entries. + ''' + st = 'DELETE FROM %(table)s WHERE (SELECT COUNT(id) FROM books_%(ltable)s_link WHERE %(ltable_col)s=%(table)s.id) < 1;' + self.conn.execute(st%dict(ltable='authors', table='authors', ltable_col='author')) + self.conn.execute(st%dict(ltable='publishers', table='publishers', ltable_col='publisher')) + self.conn.execute(st%dict(ltable='tags', table='tags', ltable_col='tag')) + self.conn.execute(st%dict(ltable='series', table='series', ltable_col='series')) + self.conn.commit() + + def get_categories(self): + categories = {} + def get(name, category, field='name'): + ans = self.conn.execute('SELECT DISTINCT %s FROM %s'%(field, name)).fetchall() + ans = [x[0].strip() for x in ans] + try: + ans.remove('') + except ValueError: pass + ans.sort() + categories[category] = ans + for x in (('authors', 'author'), ('tags', 'tag'), ('publishers', 'publisher'), ('series', 'series')): + get(*x) + get('data', 'format', 'format') + return categories + + def set(self, row, column, val): ''' Convenience method for setting the title, authors, publisher or rating @@ -650,6 +696,7 @@ class LibraryDatabase2(LibraryDatabase): `authors`: A list of authors. ''' self.conn.execute('DELETE FROM books_authors_link WHERE book=?',(id,)) + self.conn.execute('DELETE FROM authors WHERE (SELECT COUNT(id) FROM books_authors_link WHERE author=authors.id) < 1') for a in authors: if not a: continue @@ -672,9 +719,47 @@ class LibraryDatabase2(LibraryDatabase): return self.conn.execute('UPDATE books SET title=? WHERE id=?', (title, id)) self.set_path(id, True) - + + def set_publisher(self, id, publisher): + self.conn.execute('DELETE FROM books_publishers_link WHERE book=?',(id,)) + self.conn.execute('DELETE FROM publishers WHERE (SELECT COUNT(id) FROM books_publishers_link WHERE publisher=publishers.id) < 1') + if publisher: + pub = self.conn.execute('SELECT id from publishers WHERE name=?', (publisher,)).fetchone() + if pub: + aid = pub[0] + else: + aid = self.conn.execute('INSERT INTO publishers(name) VALUES (?)', (publisher,)).lastrowid + self.conn.execute('INSERT INTO books_publishers_link(book, publisher) VALUES (?,?)', (id, aid)) + self.conn.commit() + + def set_tags(self, id, tags, append=False): + ''' + @param tags: list of strings + @param append: If True existing tags are not removed + ''' + if not append: + self.conn.execute('DELETE FROM books_tags_link WHERE book=?', (id,)) + self.conn.execute('DELETE FROM tags WHERE (SELECT COUNT(id) FROM books_tags_link WHERE tag=tags.id) < 1') + for tag in set(tags): + tag = tag.lower().strip() + if not tag: + continue + t = self.conn.execute('SELECT id FROM tags WHERE name=?', (tag,)).fetchone() + if t: + tid = t[0] + else: + tid = self.conn.execute('INSERT INTO tags(name) VALUES(?)', (tag,)).lastrowid + + if not self.conn.execute('SELECT book FROM books_tags_link WHERE book=? 
AND tag=?', + (id, tid)).fetchone(): + self.conn.execute('INSERT INTO books_tags_link(book, tag) VALUES (?,?)', + (id, tid)) + self.conn.commit() + + def set_series(self, id, series): self.conn.execute('DELETE FROM books_series_link WHERE book=?',(id,)) + self.conn.execute('DELETE FROM series WHERE (SELECT COUNT(id) FROM books_series_link WHERE series=series.id) < 1') if series: s = self.conn.execute('SELECT id from series WHERE name=?', (series,)).fetchone() if s: diff --git a/src/calibre/trac/plugins/download.py b/src/calibre/trac/plugins/download.py index 90a53efa4a..ebcc1447d8 100644 --- a/src/calibre/trac/plugins/download.py +++ b/src/calibre/trac/plugins/download.py @@ -31,7 +31,7 @@ class Distribution(object): ('libusb', '0.1.12', None, None, None), ('Qt', '4.4.0', 'qt', 'libqt4-core libqt4-gui', 'qt4'), ('PyQt', '4.4.2', 'PyQt4', 'python-qt4', 'PyQt4'), - ('mechanize for python', '0.1.7b', 'dev-python/mechanize', 'python-mechanize', 'python-mechanize'), + ('mechanize for python', '0.1.8', 'dev-python/mechanize', 'python-mechanize', 'python-mechanize'), ('ImageMagick', '6.3.5', 'imagemagick', 'imagemagick', 'ImageMagick'), ('xdg-utils', '1.0.2', 'xdg-utils', 'xdg-utils', 'xdg-utils'), ('dbus-python', '0.82.2', 'dbus-python', 'python-dbus', 'dbus-python'), diff --git a/src/calibre/utils/mechanize/__init__.py b/src/calibre/utils/mechanize/__init__.py deleted file mode 100644 index 8bea889f30..0000000000 --- a/src/calibre/utils/mechanize/__init__.py +++ /dev/null @@ -1,125 +0,0 @@ -__all__ = [ - 'AbstractBasicAuthHandler', - 'AbstractDigestAuthHandler', - 'BaseHandler', - 'Browser', - 'BrowserStateError', - 'CacheFTPHandler', - 'ContentTooShortError', - 'Cookie', - 'CookieJar', - 'CookiePolicy', - 'DefaultCookiePolicy', - 'DefaultFactory', - 'FTPHandler', - 'Factory', - 'FileCookieJar', - 'FileHandler', - 'FormNotFoundError', - 'FormsFactory', - 'GopherError', - 'GopherHandler', - 'HTTPBasicAuthHandler', - 'HTTPCookieProcessor', - 'HTTPDefaultErrorHandler', - 'HTTPDigestAuthHandler', - 'HTTPEquivProcessor', - 'HTTPError', - 'HTTPErrorProcessor', - 'HTTPHandler', - 'HTTPPasswordMgr', - 'HTTPPasswordMgrWithDefaultRealm', - 'HTTPProxyPasswordMgr', - 'HTTPRedirectDebugProcessor', - 'HTTPRedirectHandler', - 'HTTPRefererProcessor', - 'HTTPRefreshProcessor', - 'HTTPRequestUpgradeProcessor', - 'HTTPResponseDebugProcessor', - 'HTTPRobotRulesProcessor', - 'HTTPSClientCertMgr', - 'HTTPSHandler', - 'HeadParser', - 'History', - 'LWPCookieJar', - 'Link', - 'LinkNotFoundError', - 'LinksFactory', - 'LoadError', - 'MSIECookieJar', - 'MozillaCookieJar', - 'OpenerDirector', - 'OpenerFactory', - 'ParseError', - 'ProxyBasicAuthHandler', - 'ProxyDigestAuthHandler', - 'ProxyHandler', - 'Request', - 'ResponseUpgradeProcessor', - 'RobotExclusionError', - 'RobustFactory', - 'RobustFormsFactory', - 'RobustLinksFactory', - 'RobustTitleFactory', - 'SeekableProcessor', - 'SeekableResponseOpener', - 'TitleFactory', - 'URLError', - 'USE_BARE_EXCEPT', - 'UnknownHandler', - 'UserAgent', - 'UserAgentBase', - 'XHTMLCompatibleHeadParser', - '__version__', - 'build_opener', - 'install_opener', - 'lwp_cookie_str', - 'make_response', - 'request_host', - 'response_seek_wrapper', # XXX deprecate in public interface? 
- 'seek_wrapped_response' # XXX should probably use this internally in place of response_seek_wrapper() - 'str2time', - 'urlopen', - 'urlretrieve'] - -from _mechanize import __version__ - -# high-level stateful browser-style interface -from _mechanize import \ - Browser, History, \ - BrowserStateError, LinkNotFoundError, FormNotFoundError - -# configurable URL-opener interface -from _useragent import UserAgentBase, UserAgent -from _html import \ - ParseError, \ - Link, \ - Factory, DefaultFactory, RobustFactory, \ - FormsFactory, LinksFactory, TitleFactory, \ - RobustFormsFactory, RobustLinksFactory, RobustTitleFactory - -# urllib2 work-alike interface (part from mechanize, part from urllib2) -# This is a superset of the urllib2 interface. -from _urllib2 import * - -# misc -from _opener import ContentTooShortError, OpenerFactory, urlretrieve -from _util import http2time as str2time -from _response import \ - response_seek_wrapper, seek_wrapped_response, make_response -from _http import HeadParser -try: - from _http import XHTMLCompatibleHeadParser -except ImportError: - pass - -# cookies -from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \ - CookieJar, FileCookieJar, LoadError, request_host -from _lwpcookiejar import LWPCookieJar, lwp_cookie_str -from _mozillacookiejar import MozillaCookieJar -from _msiecookiejar import MSIECookieJar - -# If you hate the idea of turning bugs into warnings, do: -# import mechanize; mechanize.USE_BARE_EXCEPT = False -USE_BARE_EXCEPT = True diff --git a/src/calibre/utils/mechanize/_auth.py b/src/calibre/utils/mechanize/_auth.py deleted file mode 100644 index 9bb5873019..0000000000 --- a/src/calibre/utils/mechanize/_auth.py +++ /dev/null @@ -1,500 +0,0 @@ -"""HTTP Authentication and Proxy support. - -All but HTTPProxyPasswordMgr come from Python 2.5. - - -Copyright 2006 John J. Lee - -This code is free software; you can redistribute it and/or modify it under -the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt -included with the distribution). - -""" - -import re, base64, urlparse, posixpath, md5, sha, sys, copy - -from urllib2 import BaseHandler -from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \ - splitport - - -def _parse_proxy(proxy): - """Return (scheme, user, password, host/port) given a URL or an authority. - - If a URL is supplied, it must have an authority (host:port) component. - According to RFC 3986, having an authority component means the URL must - have two slashes after the scheme: - - >>> _parse_proxy('file:/ftp.example.com/') - Traceback (most recent call last): - ValueError: proxy URL with no authority: 'file:/ftp.example.com/' - - The first three items of the returned tuple may be None. 
- - Examples of authority parsing: - - >>> _parse_proxy('proxy.example.com') - (None, None, None, 'proxy.example.com') - >>> _parse_proxy('proxy.example.com:3128') - (None, None, None, 'proxy.example.com:3128') - - The authority component may optionally include userinfo (assumed to be - username:password): - - >>> _parse_proxy('joe:password@proxy.example.com') - (None, 'joe', 'password', 'proxy.example.com') - >>> _parse_proxy('joe:password@proxy.example.com:3128') - (None, 'joe', 'password', 'proxy.example.com:3128') - - Same examples, but with URLs instead: - - >>> _parse_proxy('http://proxy.example.com/') - ('http', None, None, 'proxy.example.com') - >>> _parse_proxy('http://proxy.example.com:3128/') - ('http', None, None, 'proxy.example.com:3128') - >>> _parse_proxy('http://joe:password@proxy.example.com/') - ('http', 'joe', 'password', 'proxy.example.com') - >>> _parse_proxy('http://joe:password@proxy.example.com:3128') - ('http', 'joe', 'password', 'proxy.example.com:3128') - - Everything after the authority is ignored: - - >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') - ('ftp', 'joe', 'password', 'proxy.example.com') - - Test for no trailing '/' case: - - >>> _parse_proxy('http://joe:password@proxy.example.com') - ('http', 'joe', 'password', 'proxy.example.com') - - """ - scheme, r_scheme = splittype(proxy) - if not r_scheme.startswith("/"): - # authority - scheme = None - authority = proxy - else: - # URL - if not r_scheme.startswith("//"): - raise ValueError("proxy URL with no authority: %r" % proxy) - # We have an authority, so for RFC 3986-compliant URLs (by ss 3. - # and 3.3.), path is empty or starts with '/' - end = r_scheme.find("/", 2) - if end == -1: - end = None - authority = r_scheme[2:end] - userinfo, hostport = splituser(authority) - if userinfo is not None: - user, password = splitpasswd(userinfo) - else: - user = password = None - return scheme, user, password, hostport - -class ProxyHandler(BaseHandler): - # Proxies must be in front - handler_order = 100 - - def __init__(self, proxies=None): - if proxies is None: - proxies = getproxies() - assert hasattr(proxies, 'has_key'), "proxies must be a mapping" - self.proxies = proxies - for type, url in proxies.items(): - setattr(self, '%s_open' % type, - lambda r, proxy=url, type=type, meth=self.proxy_open: \ - meth(r, proxy, type)) - - def proxy_open(self, req, proxy, type): - orig_type = req.get_type() - proxy_type, user, password, hostport = _parse_proxy(proxy) - if proxy_type is None: - proxy_type = orig_type - if user and password: - user_pass = '%s:%s' % (unquote(user), unquote(password)) - creds = base64.encodestring(user_pass).strip() - req.add_header('Proxy-authorization', 'Basic ' + creds) - hostport = unquote(hostport) - req.set_proxy(hostport, proxy_type) - if orig_type == proxy_type: - # let other handlers take care of it - return None - else: - # need to start over, because the other handlers don't - # grok the proxy's URL type - # e.g. 
if we have a constructor arg proxies like so: - # {'http': 'ftp://proxy.example.com'}, we may end up turning - # a request for http://acme.example.com/a into one for - # ftp://proxy.example.com/a - return self.parent.open(req) - -class HTTPPasswordMgr: - - def __init__(self): - self.passwd = {} - - def add_password(self, realm, uri, user, passwd): - # uri could be a single URI or a sequence - if isinstance(uri, basestring): - uri = [uri] - if not realm in self.passwd: - self.passwd[realm] = {} - for default_port in True, False: - reduced_uri = tuple( - [self.reduce_uri(u, default_port) for u in uri]) - self.passwd[realm][reduced_uri] = (user, passwd) - - def find_user_password(self, realm, authuri): - domains = self.passwd.get(realm, {}) - for default_port in True, False: - reduced_authuri = self.reduce_uri(authuri, default_port) - for uris, authinfo in domains.iteritems(): - for uri in uris: - if self.is_suburi(uri, reduced_authuri): - return authinfo - return None, None - - def reduce_uri(self, uri, default_port=True): - """Accept authority or URI and extract only the authority and path.""" - # note HTTP URLs do not have a userinfo component - parts = urlparse.urlsplit(uri) - if parts[1]: - # URI - scheme = parts[0] - authority = parts[1] - path = parts[2] or '/' - else: - # host or host:port - scheme = None - authority = uri - path = '/' - host, port = splitport(authority) - if default_port and port is None and scheme is not None: - dport = {"http": 80, - "https": 443, - }.get(scheme) - if dport is not None: - authority = "%s:%d" % (host, dport) - return authority, path - - def is_suburi(self, base, test): - """Check if test is below base in a URI tree - - Both args must be URIs in reduced form. - """ - if base == test: - return True - if base[0] != test[0]: - return False - common = posixpath.commonprefix((base[1], test[1])) - if len(common) == len(base[1]): - return True - return False - - -class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): - - def find_user_password(self, realm, authuri): - user, password = HTTPPasswordMgr.find_user_password(self, realm, - authuri) - if user is not None: - return user, password - return HTTPPasswordMgr.find_user_password(self, None, authuri) - - -class AbstractBasicAuthHandler: - - rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I) - - # XXX there can actually be multiple auth-schemes in a - # www-authenticate header. 
should probably be a lot more careful - # in parsing them to extract multiple alternatives - - def __init__(self, password_mgr=None): - if password_mgr is None: - password_mgr = HTTPPasswordMgr() - self.passwd = password_mgr - self.add_password = self.passwd.add_password - - def http_error_auth_reqed(self, authreq, host, req, headers): - # host may be an authority (without userinfo) or a URL with an - # authority - # XXX could be multiple headers - authreq = headers.get(authreq, None) - if authreq: - mo = AbstractBasicAuthHandler.rx.search(authreq) - if mo: - scheme, realm = mo.groups() - if scheme.lower() == 'basic': - return self.retry_http_basic_auth(host, req, realm) - - def retry_http_basic_auth(self, host, req, realm): - user, pw = self.passwd.find_user_password(realm, host) - if pw is not None: - raw = "%s:%s" % (user, pw) - auth = 'Basic %s' % base64.encodestring(raw).strip() - if req.headers.get(self.auth_header, None) == auth: - return None - newreq = copy.copy(req) - newreq.add_header(self.auth_header, auth) - newreq.visit = False - return self.parent.open(newreq) - else: - return None - - -class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): - - auth_header = 'Authorization' - - def http_error_401(self, req, fp, code, msg, headers): - url = req.get_full_url() - return self.http_error_auth_reqed('www-authenticate', - url, req, headers) - - -class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): - - auth_header = 'Proxy-authorization' - - def http_error_407(self, req, fp, code, msg, headers): - # http_error_auth_reqed requires that there is no userinfo component in - # authority. Assume there isn't one, since urllib2 does not (and - # should not, RFC 3986 s. 3.2.1) support requests for URLs containing - # userinfo. - authority = req.get_host() - return self.http_error_auth_reqed('proxy-authenticate', - authority, req, headers) - - -def randombytes(n): - """Return n random bytes.""" - # Use /dev/urandom if it is available. Fall back to random module - # if not. It might be worthwhile to extend this function to use - # other platform-specific mechanisms for getting random bytes. - if os.path.exists("/dev/urandom"): - f = open("/dev/urandom") - s = f.read(n) - f.close() - return s - else: - L = [chr(random.randrange(0, 256)) for i in range(n)] - return "".join(L) - -class AbstractDigestAuthHandler: - # Digest authentication is specified in RFC 2617. - - # XXX The client does not inspect the Authentication-Info header - # in a successful response. - - # XXX It should be possible to test this implementation against - # a mock server that just generates a static set of challenges. - - # XXX qop="auth-int" supports is shaky - - def __init__(self, passwd=None): - if passwd is None: - passwd = HTTPPasswordMgr() - self.passwd = passwd - self.add_password = self.passwd.add_password - self.retried = 0 - self.nonce_count = 0 - - def reset_retry_count(self): - self.retried = 0 - - def http_error_auth_reqed(self, auth_header, host, req, headers): - authreq = headers.get(auth_header, None) - if self.retried > 5: - # Don't fail endlessly - if we failed once, we'll probably - # fail a second time. Hm. Unless the Password Manager is - # prompting for the information. Crap. 
This isn't great - # but it's better than the current 'repeat until recursion - # depth exceeded' approach - raise HTTPError(req.get_full_url(), 401, "digest auth failed", - headers, None) - else: - self.retried += 1 - if authreq: - scheme = authreq.split()[0] - if scheme.lower() == 'digest': - return self.retry_http_digest_auth(req, authreq) - - def retry_http_digest_auth(self, req, auth): - token, challenge = auth.split(' ', 1) - chal = parse_keqv_list(parse_http_list(challenge)) - auth = self.get_authorization(req, chal) - if auth: - auth_val = 'Digest %s' % auth - if req.headers.get(self.auth_header, None) == auth_val: - return None - newreq = copy.copy(req) - newreq.add_unredirected_header(self.auth_header, auth_val) - newreq.visit = False - return self.parent.open(newreq) - - def get_cnonce(self, nonce): - # The cnonce-value is an opaque - # quoted string value provided by the client and used by both client - # and server to avoid chosen plaintext attacks, to provide mutual - # authentication, and to provide some message integrity protection. - # This isn't a fabulous effort, but it's probably Good Enough. - dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(), - randombytes(8))).hexdigest() - return dig[:16] - - def get_authorization(self, req, chal): - try: - realm = chal['realm'] - nonce = chal['nonce'] - qop = chal.get('qop') - algorithm = chal.get('algorithm', 'MD5') - # mod_digest doesn't send an opaque, even though it isn't - # supposed to be optional - opaque = chal.get('opaque', None) - except KeyError: - return None - - H, KD = self.get_algorithm_impls(algorithm) - if H is None: - return None - - user, pw = self.passwd.find_user_password(realm, req.get_full_url()) - if user is None: - return None - - # XXX not implemented yet - if req.has_data(): - entdig = self.get_entity_digest(req.get_data(), chal) - else: - entdig = None - - A1 = "%s:%s:%s" % (user, realm, pw) - A2 = "%s:%s" % (req.get_method(), - # XXX selector: what about proxies and full urls - req.get_selector()) - if qop == 'auth': - self.nonce_count += 1 - ncvalue = '%08x' % self.nonce_count - cnonce = self.get_cnonce(nonce) - noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) - respdig = KD(H(A1), noncebit) - elif qop is None: - respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) - else: - # XXX handle auth-int. - pass - - # XXX should the partial digests be encoded too? - - base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ - 'response="%s"' % (user, realm, nonce, req.get_selector(), - respdig) - if opaque: - base += ', opaque="%s"' % opaque - if entdig: - base += ', digest="%s"' % entdig - base += ', algorithm="%s"' % algorithm - if qop: - base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) - return base - - def get_algorithm_impls(self, algorithm): - # lambdas assume digest modules are imported at the top level - if algorithm == 'MD5': - H = lambda x: md5.new(x).hexdigest() - elif algorithm == 'SHA': - H = lambda x: sha.new(x).hexdigest() - # XXX MD5-sess - KD = lambda s, d: H("%s:%s" % (s, d)) - return H, KD - - def get_entity_digest(self, data, chal): - # XXX not implemented yet - return None - - -class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): - """An authentication protocol defined by RFC 2069 - - Digest authentication improves on basic authentication because it - does not transmit passwords in the clear. 
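(For reference, the response value assembled by get_authorization() above is the stock RFC 2617 digest calculation; the same arithmetic as a self-contained sketch, substituting hashlib for the old md5/sha modules the deleted code imports:)

    import hashlib

    def digest_response(user, realm, password, method, uri, nonce, nc, cnonce, qop='auth'):
        # H and KD as in the handler above: H(x) = MD5 hex digest, KD(s, d) = H(s + ':' + d)
        H = lambda x: hashlib.md5(x.encode('utf-8')).hexdigest()
        KD = lambda s, d: H('%s:%s' % (s, d))
        A1 = '%s:%s:%s' % (user, realm, password)
        A2 = '%s:%s' % (method, uri)
        # with qop='auth' the nonce count and client nonce are mixed in
        return KD(H(A1), '%s:%s:%s:%s:%s' % (nonce, nc, cnonce, qop, H(A2)))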
- """ - - auth_header = 'Authorization' - handler_order = 490 - - def http_error_401(self, req, fp, code, msg, headers): - host = urlparse.urlparse(req.get_full_url())[1] - retry = self.http_error_auth_reqed('www-authenticate', - host, req, headers) - self.reset_retry_count() - return retry - - -class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): - - auth_header = 'Proxy-Authorization' - handler_order = 490 - - def http_error_407(self, req, fp, code, msg, headers): - host = req.get_host() - retry = self.http_error_auth_reqed('proxy-authenticate', - host, req, headers) - self.reset_retry_count() - return retry - - -# XXX ugly implementation, should probably not bother deriving -class HTTPProxyPasswordMgr(HTTPPasswordMgr): - # has default realm and host/port - def add_password(self, realm, uri, user, passwd): - # uri could be a single URI or a sequence - if uri is None or isinstance(uri, basestring): - uris = [uri] - else: - uris = uri - passwd_by_domain = self.passwd.setdefault(realm, {}) - for uri in uris: - for default_port in True, False: - reduced_uri = self.reduce_uri(uri, default_port) - passwd_by_domain[reduced_uri] = (user, passwd) - - def find_user_password(self, realm, authuri): - attempts = [(realm, authuri), (None, authuri)] - # bleh, want default realm to take precedence over default - # URI/authority, hence this outer loop - for default_uri in False, True: - for realm, authuri in attempts: - authinfo_by_domain = self.passwd.get(realm, {}) - for default_port in True, False: - reduced_authuri = self.reduce_uri(authuri, default_port) - for uri, authinfo in authinfo_by_domain.iteritems(): - if uri is None and not default_uri: - continue - if self.is_suburi(uri, reduced_authuri): - return authinfo - user, password = None, None - - if user is not None: - break - return user, password - - def reduce_uri(self, uri, default_port=True): - if uri is None: - return None - return HTTPPasswordMgr.reduce_uri(self, uri, default_port) - - def is_suburi(self, base, test): - if base is None: - # default to the proxy's host/port - hostport, path = test - base = (hostport, "/") - return HTTPPasswordMgr.is_suburi(self, base, test) - - -class HTTPSClientCertMgr(HTTPPasswordMgr): - # implementation inheritance: this is not a proper subclass - def add_key_cert(self, uri, key_file, cert_file): - self.add_password(None, uri, key_file, cert_file) - def find_key_cert(self, authuri): - return HTTPPasswordMgr.find_user_password(self, None, authuri) diff --git a/src/calibre/utils/mechanize/_beautifulsoup.py b/src/calibre/utils/mechanize/_beautifulsoup.py deleted file mode 100644 index 2541dcc63a..0000000000 --- a/src/calibre/utils/mechanize/_beautifulsoup.py +++ /dev/null @@ -1,1080 +0,0 @@ -"""Beautiful Soup -Elixir and Tonic -"The Screen-Scraper's Friend" -v2.1.1 -http://www.crummy.com/software/BeautifulSoup/ - -Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance -into a tree representation. It provides methods and Pythonic idioms -that make it easy to search and modify the tree. - -A well-formed XML/HTML document will yield a well-formed data -structure. An ill-formed XML/HTML document will yield a -correspondingly ill-formed data structure. If your document is only -locally well-formed, you can use this library to find and process the -well-formed part of it. The BeautifulSoup class has heuristics for -obtaining a sensible parse tree in the face of common HTML errors. - -Beautiful Soup has no external dependencies. It works with Python 2.2 -and up. 
- -Beautiful Soup defines classes for four different parsing strategies: - - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific - language that kind of looks like XML. - - * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid - or invalid. - - * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML - that trips up BeautifulSoup. - - * BeautifulSOAP, for making it easier to parse XML documents that use - lots of subelements containing a single string, where you'd prefer - they put that string into an attribute (such as SOAP messages). - -You can subclass BeautifulStoneSoup or BeautifulSoup to create a -parsing strategy specific to an XML schema or a particular bizarre -HTML document. Typically your subclass would just override -SELF_CLOSING_TAGS and/or NESTABLE_TAGS. -""" -from __future__ import generators - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "2.1.1" -__date__ = "$Date: 2004/10/18 00:14:20 $" -__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson" -__license__ = "PSF" - -from sgmllib import SGMLParser, SGMLParseError -import types -import re -import sgmllib - -#This code makes Beautiful Soup able to parse XML with namespaces -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') - -class NullType(object): - - """Similar to NoneType with a corresponding singleton instance - 'Null' that, unlike None, accepts any message and returns itself. - - Examples: - >>> Null("send", "a", "message")("and one more", - ... "and what you get still") is Null - True - """ - - def __new__(cls): return Null - def __call__(self, *args, **kwargs): return Null -## def __getstate__(self, *args): return Null - def __getattr__(self, attr): return Null - def __getitem__(self, item): return Null - def __setattr__(self, attr, value): pass - def __setitem__(self, item, value): pass - def __len__(self): return 0 - # FIXME: is this a python bug? otherwise ``for x in Null: pass`` - # never terminates... 
- def __iter__(self): return iter([]) - def __contains__(self, item): return False - def __repr__(self): return "Null" -Null = object.__new__(NullType) - -class PageElement: - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" - - def setup(self, parent=Null, previous=Null): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - self.previous = previous - self.next = Null - self.previousSibling = Null - self.nextSibling = Null - if self.parent and self.parent.contents: - self.previousSibling = self.parent.contents[-1] - self.previousSibling.nextSibling = self - - def findNext(self, name=None, attrs={}, text=None): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" - return self._first(self.fetchNext, name, attrs, text) - firstNext = findNext - - def fetchNext(self, name=None, attrs={}, text=None, limit=None): - """Returns all items that match the given criteria and appear - before after Tag in the document.""" - return self._fetch(name, attrs, text, limit, self.nextGenerator) - - def findNextSibling(self, name=None, attrs={}, text=None): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" - return self._first(self.fetchNextSiblings, name, attrs, text) - firstNextSibling = findNextSibling - - def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" - return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator) - - def findPrevious(self, name=None, attrs={}, text=None): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" - return self._first(self.fetchPrevious, name, attrs, text) - - def fetchPrevious(self, name=None, attrs={}, text=None, limit=None): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" - return self._fetch(name, attrs, text, limit, self.previousGenerator) - firstPrevious = findPrevious - - def findPreviousSibling(self, name=None, attrs={}, text=None): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" - return self._first(self.fetchPreviousSiblings, name, attrs, text) - firstPreviousSibling = findPreviousSibling - - def fetchPreviousSiblings(self, name=None, attrs={}, text=None, - limit=None): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" - return self._fetch(name, attrs, text, limit, - self.previousSiblingGenerator) - - def findParent(self, name=None, attrs={}): - """Returns the closest parent of this Tag that matches the given - criteria.""" - r = Null - l = self.fetchParents(name, attrs, 1) - if l: - r = l[0] - return r - firstParent = findParent - - def fetchParents(self, name=None, attrs={}, limit=None): - """Returns the parents of this Tag that match the given - criteria.""" - return self._fetch(name, attrs, None, limit, self.parentGenerator) - - #These methods do the real heavy lifting. - - def _first(self, method, name, attrs, text): - r = Null - l = method(name, attrs, text, 1) - if l: - r = l[0] - return r - - def _fetch(self, name, attrs, text, limit, generator): - "Iterates over a generator looking for things that match." 
- if not hasattr(attrs, 'items'): - attrs = {'class' : attrs} - - results = [] - g = generator() - while True: - try: - i = g.next() - except StopIteration: - break - found = None - if isinstance(i, Tag): - if not text: - if not name or self._matches(i, name): - match = True - for attr, matchAgainst in attrs.items(): - check = i.get(attr) - if not self._matches(check, matchAgainst): - match = False - break - if match: - found = i - elif text: - if self._matches(i, text): - found = i - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - #Generators that can be used to navigate starting from both - #NavigableTexts and Tags. - def nextGenerator(self): - i = self - while i: - i = i.next - yield i - - def nextSiblingGenerator(self): - i = self - while i: - i = i.nextSibling - yield i - - def previousGenerator(self): - i = self - while i: - i = i.previous - yield i - - def previousSiblingGenerator(self): - i = self - while i: - i = i.previousSibling - yield i - - def parentGenerator(self): - i = self - while i: - i = i.parent - yield i - - def _matches(self, chunk, howToMatch): - #print 'looking for %s in %s' % (howToMatch, chunk) - # - # If given a list of items, return true if the list contains a - # text element that matches. - if isList(chunk) and not isinstance(chunk, Tag): - for tag in chunk: - if isinstance(tag, NavigableText) and self._matches(tag, howToMatch): - return True - return False - if callable(howToMatch): - return howToMatch(chunk) - if isinstance(chunk, Tag): - #Custom match methods take the tag as an argument, but all other - #ways of matching match the tag name as a string - chunk = chunk.name - #Now we know that chunk is a string - if not isinstance(chunk, basestring): - chunk = str(chunk) - if hasattr(howToMatch, 'match'): - # It's a regexp object. - return howToMatch.search(chunk) - if isList(howToMatch): - return chunk in howToMatch - if hasattr(howToMatch, 'items'): - return howToMatch.has_key(chunk) - #It's just a string - return str(howToMatch) == chunk - -class NavigableText(PageElement): - - def __getattr__(self, attr): - "For backwards compatibility, text.string gives you text" - if attr == 'string': - return self - else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - -class NavigableString(str, NavigableText): - pass - -class NavigableUnicodeString(unicode, NavigableText): - pass - -class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" - - def __init__(self, name, attrs=None, parent=Null, previous=Null): - "Basic constructor." - self.name = name - if attrs == None: - attrs = [] - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - - def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self._getAttrMap().get(key, default) - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" - return self._getAttrMap()[key] - - def __iter__(self): - "Iterating over a tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __nonzero__(self): - "A tag is non-None even if it has no contents." 
- return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self._getAttrMap() - self.attrMap[key] = value - found = False - for i in range(0, len(self.attrs)): - if self.attrs[i][0] == key: - self.attrs[i] = (key, value) - found = True - if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - for item in self.attrs: - if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] - - def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - fetch() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return apply(self.fetch, args, kwargs) - - def __getattr__(self, tag): - if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: - return self.first(tag[:-3]) - elif tag.find('__') != 0: - return self.first(tag) - - def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - NOTE: right now this will return false if two tags have the - same attributes in a different order. Should this be fixed?""" - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): - return False - for i in range(0, len(self.contents)): - if self.contents[i] != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" - return not self == other - - def __repr__(self): - """Renders this tag as a string.""" - return str(self) - - def __unicode__(self): - return self.__str__(1) - - def __str__(self, needUnicode=None, showStructureIndent=None): - """Returns a string or Unicode representation of this tag and - its contents. 
- - NOTE: since Python's HTML parser consumes whitespace, this - method is not certain to reproduce the whitespace present in - the original string.""" - - attrs = [] - if self.attrs: - for key, val in self.attrs: - attrs.append('%s="%s"' % (key, val)) - close = '' - closeTag = '' - if self.isSelfClosing(): - close = ' /' - else: - closeTag = '' % self.name - indentIncrement = None - if showStructureIndent != None: - indentIncrement = showStructureIndent - if not self.hidden: - indentIncrement += 1 - contents = self.renderContents(indentIncrement, needUnicode=needUnicode) - if showStructureIndent: - space = '\n%s' % (' ' * showStructureIndent) - if self.hidden: - s = contents - else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if showStructureIndent: - s.append(space) - s.append('<%s%s%s>' % (self.name, attributeString, close)) - s.append(contents) - if closeTag and showStructureIndent != None: - s.append(space) - s.append(closeTag) - s = ''.join(s) - isUnicode = type(s) == types.UnicodeType - if needUnicode and not isUnicode: - s = unicode(s) - elif isUnicode and needUnicode==False: - s = str(s) - return s - - def prettify(self, needUnicode=None): - return self.__str__(needUnicode, showStructureIndent=True) - - def renderContents(self, showStructureIndent=None, needUnicode=None): - """Renders the contents of this tag as a (possibly Unicode) - string.""" - s=[] - for c in self: - text = None - if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType: - text = unicode(c) - elif isinstance(c, Tag): - s.append(c.__str__(needUnicode, showStructureIndent)) - elif needUnicode: - text = unicode(c) - else: - text = str(c) - if text: - if showStructureIndent != None: - if text[-1] == '\n': - text = text[:-1] - s.append(text) - return ''.join(s) - - #Soup methods - - def firstText(self, text, recursive=True): - """Convenience method to retrieve the first piece of text matching the - given criteria. 'text' can be a string, a regular expression object, - a callable that takes a string and returns whether or not the - string 'matches', etc.""" - return self.first(recursive=recursive, text=text) - - def fetchText(self, text, recursive=True, limit=None): - """Convenience method to retrieve all pieces of text matching the - given criteria. 'text' can be a string, a regular expression object, - a callable that takes a string and returns whether or not the - string 'matches', etc.""" - return self.fetch(recursive=recursive, text=text, limit=limit) - - def first(self, name=None, attrs={}, recursive=True, text=None): - """Return only the first child of this - Tag matching the given criteria.""" - r = Null - l = self.fetch(name, attrs, recursive, text, 1) - if l: - r = l[0] - return r - findChild = first - - def fetch(self, name=None, attrs={}, recursive=True, text=None, - limit=None): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. 
The - same is true of the tag name.""" - generator = self.recursiveChildGenerator - if not recursive: - generator = self.childGenerator - return self._fetch(name, attrs, text, limit, generator) - fetchChildren = fetch - - #Utility methods - - def isSelfClosing(self): - """Returns true iff this is a self-closing tag as defined in the HTML - standard. - - TODO: This is specific to BeautifulSoup and its subclasses, but it's - used by __str__""" - return self.name in BeautifulSoup.SELF_CLOSING_TAGS - - def append(self, tag): - """Appends the given tag to the contents of this tag.""" - self.contents.append(tag) - - #Private methods - - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap - - #Generator methods - def childGenerator(self): - for i in range(0, len(self.contents)): - yield self.contents[i] - raise StopIteration - - def recursiveChildGenerator(self): - stack = [(self, 0)] - while stack: - tag, start = stack.pop() - if isinstance(tag, Tag): - for i in range(start, len(tag.contents)): - a = tag.contents[i] - yield a - if isinstance(a, Tag) and tag.contents: - if i < len(tag.contents) - 1: - stack.append((tag, i+1)) - stack.append((a, 0)) - break - raise StopIteration - - -def isList(l): - """Convenience method that works with all 2.x versions of Python - to determine whether or not something is listlike.""" - return hasattr(l, '__iter__') \ - or (type(l) in (types.ListType, types.TupleType)) - -def buildTagMap(default, *args): - """Turns a list of maps, lists, or scalars into a single map. - Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out - of lists and partial maps.""" - built = {} - for portion in args: - if hasattr(portion, 'items'): - #It's a map. Merge it. - for k,v in portion.items(): - built[k] = v - elif isList(portion): - #It's a list. Map each item to the default. - for k in portion: - built[k] = default - else: - #It's a scalar. Map it to the default. - built[portion] = default - return built - -class BeautifulStoneSoup(Tag, SGMLParser): - - """This class contains the basic parser and fetch code. It defines - a parser that knows nothing about tag behavior except for the - following: - - You can't close a tag without closing all the tags it encloses. - That is, "" actually means - "". - - [Another possible explanation is "", but since - this class defines no SELF_CLOSING_TAGS, it will never use that - explanation.] - - This class is useful for parsing XML or made-up markup languages, - or when BeautifulSoup makes an assumption counter to what you were - expecting.""" - - SELF_CLOSING_TAGS = {} - NESTABLE_TAGS = {} - RESET_NESTING_TAGS = {} - QUOTE_TAGS = {} - - #As a public service we will by default silently replace MS smart quotes - #and similar characters with their HTML or ASCII equivalents. 
- MS_CHARS = { '\x80' : '€', - '\x81' : ' ', - '\x82' : '‚', - '\x83' : 'ƒ', - '\x84' : '„', - '\x85' : '…', - '\x86' : '†', - '\x87' : '‡', - '\x88' : '⁁', - '\x89' : '%', - '\x8A' : 'Š', - '\x8B' : '<', - '\x8C' : 'Œ', - '\x8D' : '?', - '\x8E' : 'Z', - '\x8F' : '?', - '\x90' : '?', - '\x91' : '‘', - '\x92' : '’', - '\x93' : '“', - '\x94' : '”', - '\x95' : '•', - '\x96' : '–', - '\x97' : '—', - '\x98' : '˜', - '\x99' : '™', - '\x9a' : 'š', - '\x9b' : '>', - '\x9c' : 'œ', - '\x9d' : '?', - '\x9e' : 'z', - '\x9f' : 'Ÿ',} - - PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'), - lambda(x):x.group(1) + ' />'), - (re.compile(']*)>'), - lambda(x):''), - (re.compile("([\x80-\x9f])"), - lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1))) - ] - - ROOT_TAG_NAME = '[document]' - - def __init__(self, text=None, avoidParserProblems=True, - initialTextIsEverything=True): - """Initialize this as the 'root tag' and feed in any text to - the parser. - - NOTE about avoidParserProblems: sgmllib will process most bad - HTML, and BeautifulSoup has tricks for dealing with some HTML - that kills sgmllib, but Beautiful Soup can nonetheless choke - or lose data if your data uses self-closing tags or - declarations incorrectly. By default, Beautiful Soup sanitizes - its input to avoid the vast majority of these problems. The - problems are relatively rare, even in bad HTML, so feel free - to pass in False to avoidParserProblems if they don't apply to - you, and you'll get better performance. The only reason I have - this turned on by default is so I don't get so many tech - support questions. - - The two most common instances of invalid HTML that will choke - sgmllib are fixed by the default parser massage techniques: - -
(No space between name of closing tag and tag close) - (Extraneous whitespace in declaration) - - You can pass in a custom list of (RE object, replace method) - tuples to get Beautiful Soup to scrub your input the way you - want.""" - Tag.__init__(self, self.ROOT_TAG_NAME) - if avoidParserProblems \ - and not isList(avoidParserProblems): - avoidParserProblems = self.PARSER_MASSAGE - self.avoidParserProblems = avoidParserProblems - SGMLParser.__init__(self) - self.quoteStack = [] - self.hidden = 1 - self.reset() - if hasattr(text, 'read'): - #It's a file-type object. - text = text.read() - if text: - self.feed(text) - if initialTextIsEverything: - self.done() - - def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser - superclass or the Tag superclass, depending on the method name.""" - if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ - or methodName.find('do_') == 0: - return SGMLParser.__getattr__(self, methodName) - elif methodName.find('__') != 0: - return Tag.__getattr__(self, methodName) - else: - raise AttributeError - - def feed(self, text): - if self.avoidParserProblems: - for fix, m in self.avoidParserProblems: - text = fix.sub(m, text) - SGMLParser.feed(self, text) - - def done(self): - """Called when you're done parsing, so that the unclosed tags can be - correctly processed.""" - self.endData() #NEW - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def reset(self): - SGMLParser.reset(self) - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.pushTag(self) - - def popTag(self): - tag = self.tagStack.pop() - # Tags with just one string-owning child get the child as a - # 'string' property, so that soup.tag.string is shorthand for - # soup.tag.contents[0] - if len(self.currentTag.contents) == 1 and \ - isinstance(self.currentTag.contents[0], NavigableText): - self.currentTag.string = self.currentTag.contents[0] - - #print "Pop", tag.name - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - #print "Push", tag.name - if self.currentTag: - self.currentTag.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - - def endData(self): - currentData = ''.join(self.currentData) - if currentData: - if not currentData.strip(): - if '\n' in currentData: - currentData = '\n' - else: - currentData = ' ' - c = NavigableString - if type(currentData) == types.UnicodeType: - c = NavigableUnicodeString - o = c(currentData) - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) - self.currentData = [] - - def _popToTag(self, name, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. 
If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - if name == self.ROOT_TAG_NAME: - return - - numPops = 0 - mostRecentTag = None - for i in range(len(self.tagStack)-1, 0, -1): - if name == self.tagStack[i].name: - numPops = len(self.tagStack)-i - break - if not inclusivePop: - numPops = numPops - 1 - - for i in range(0, numPops): - mostRecentTag = self.popTag() - return mostRecentTag - - def _smartPop(self, name): - - """We need to pop up to the previous tag of this type, unless - one of this tag's nesting reset triggers comes between this - tag and the previous tag of this type, OR unless this tag is a - generic nesting trigger and another generic nesting trigger - comes between this tag and the previous tag of this type. - - Examples: -

<p>Foo<b>Bar<p> should pop to 'p', not 'b'.
<p>Foo<table>Bar<p> should pop to 'table', not 'p'.
<p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
<p>Foo<b>Bar<p> should pop to 'p', not 'b'.

<li><ul><li> *<li> * should pop to 'ul', not the first 'li'.
<tr><table><tr> *<tr> * should pop to 'table', not the first 'tr'
<td><tr><td> *<td>
    ** should pop to 'tr', not the first 'td' - """ - - nestingResetTriggers = self.NESTABLE_TAGS.get(name) - isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) - popTo = None - inclusive = True - for i in range(len(self.tagStack)-1, 0, -1): - p = self.tagStack[i] - if (not p or p.name == name) and not isNestable: - #Non-nestable tags get popped to the top or to their - #last occurance. - popTo = name - break - if (nestingResetTriggers != None - and p.name in nestingResetTriggers) \ - or (nestingResetTriggers == None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): - - #If we encounter one of the nesting reset triggers - #peculiar to this tag, or we encounter another tag - #that causes nesting to reset, pop up to but not - #including that tag. - - popTo = p.name - inclusive = False - break - p = p.parent - if popTo: - self._popToTag(popTo, inclusive) - - def unknown_starttag(self, name, attrs, selfClosing=0): - #print "Start tag %s" % name - if self.quoteStack: - #This is not a real tag. - #print "<%s> is not real!" % name - attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) - self.handle_data('<%s%s>' % (name, attrs)) - return - self.endData() - if not name in self.SELF_CLOSING_TAGS and not selfClosing: - self._smartPop(name) - tag = Tag(name, attrs, self.currentTag, self.previous) - if self.previous: - self.previous.next = tag - self.previous = tag - self.pushTag(tag) - if selfClosing or name in self.SELF_CLOSING_TAGS: - self.popTag() - if name in self.QUOTE_TAGS: - #print "Beginning quote (%s)" % name - self.quoteStack.append(name) - self.literal = 1 - - def unknown_endtag(self, name): - if self.quoteStack and self.quoteStack[-1] != name: - #This is not a real end tag. - #print " is not real!" % name - self.handle_data('' % name) - return - self.endData() - self._popToTag(name) - if self.quoteStack and self.quoteStack[-1] == name: - self.quoteStack.pop() - self.literal = (len(self.quoteStack) > 0) - - def handle_data(self, data): - self.currentData.append(data) - - def handle_pi(self, text): - "Propagate processing instructions right through." - self.handle_data("" % text) - - def handle_comment(self, text): - "Propagate comments right through." - self.handle_data("" % text) - - def handle_charref(self, ref): - "Propagate char refs right through." - self.handle_data('&#%s;' % ref) - - def handle_entityref(self, ref): - "Propagate entity refs right through." - self.handle_data('&%s;' % ref) - - def handle_decl(self, data): - "Propagate DOCTYPEs and the like right through." - self.handle_data('' % data) - - def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as regular data.""" - j = None - if self.rawdata[i:i+9] == '', i) - if k == -1: - k = len(self.rawdata) - self.handle_data(self.rawdata[i+9:k]) - j = k+3 - else: - try: - j = SGMLParser.parse_declaration(self, i) - except SGMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) - return j - -class BeautifulSoup(BeautifulStoneSoup): - - """This parser knows the following facts about HTML: - - * Some tags have no closing tag and should be interpreted as being - closed as soon as they are encountered. - - * The text inside some tags (ie. 'script') may contain tags which - are not really part of the document and which should be parsed - as text, not tags. If you want to parse the text as tags, you can - always fetch it and parse it explicitly. 
- - * Tag nesting rules:

  Most tags can't be nested at all. For instance, the occurance of
  a <p> tag should implicitly close the previous <p> tag.

   <p>Para1<p>Para2
  should be transformed into:
   <p>Para1</p><p>Para2

  Some tags can be nested arbitrarily. For instance, the occurance
  of a <blockquote> tag should _not_ implicitly close the previous
  <blockquote> tag.

   Alice said: <blockquote>Bob said: <blockquote>Blah
  should NOT be transformed into:
   Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

  Some tags can be nested, but the nesting is reset by the
  interposition of other tags. For instance, a <tr> tag should
  implicitly close the previous <tr> tag within the same <table>,
  but not close a <tr> tag in another table.

   <table><tr>Blah<tr>Blah
  should be transformed into:
   <table><tr>Blah</tr><tr>Blah
  but,
   <tr>Blah<table><tr>Blah
  should NOT be transformed into
   <tr>Blah<table></tr><tr>
    Blah - - Differing assumptions about tag nesting rules are a major source - of problems with the BeautifulSoup class. If BeautifulSoup is not - treating as nestable a tag your page author treats as nestable, - try ICantBelieveItsBeautifulSoup before writing your own - subclass.""" - - SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) - - QUOTE_TAGS = {'script': None} - - #According to the HTML standard, each of these inline tags can - #contain another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', - 'center'] - - #According to the HTML standard, these block tags can contain - #another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] - - #Lists can contain other lists, but there are restrictions. - NESTABLE_LIST_TAGS = { 'ol' : [], - 'ul' : [], - 'li' : ['ul', 'ol'], - 'dl' : [], - 'dd' : ['dl'], - 'dt' : ['dl'] } - - #Tables can contain other tables, but there are restrictions. - NESTABLE_TABLE_TAGS = {'table' : [], - 'tr' : ['table', 'tbody', 'tfoot', 'thead'], - 'td' : ['tr'], - 'th' : ['tr'], - } - - NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] - - #If one of these tags is encountered, all tags up to the next tag of - #this type are popped. - RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', - NON_NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, - NESTABLE_TABLE_TAGS) - - NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) - -class ICantBelieveItsBeautifulSoup(BeautifulSoup): - - """The BeautifulSoup class is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: - - FooBar - - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "FooBar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close (eg.) a 'b' - tag than to actually use nested 'b' tags, and the BeautifulSoup - class handles the common case. This class handles the - not-co-common case: where you can't believe someone wrote what - they did, but it's valid HTML and BeautifulSoup screwed up by - assuming it wouldn't be. 
- - If this doesn't do what you need, try subclassing this class or - BeautifulSoup, and providing your own list of NESTABLE_TAGS.""" - - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ - ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big'] - - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] - - NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) - -class BeautifulSOAP(BeautifulStoneSoup): - """This class will push a tag with only a single string child into - the tag's parent as an attribute. The attribute's name is the tag - name, and the value is the string child. An example should give - the flavor of the change: - - baz - => - baz - - You can then access fooTag['bar'] instead of fooTag.barTag.string. - - This is, of course, useful for scraping structures that tend to - use subelements instead of attributes, such as SOAP messages. Note - that it modifies its input, so don't print the modified version - out. - - I'm not sure how many people really want to use this class; let me - know if you do. Mainly I like the name.""" - - def popTag(self): - if len(self.tagStack) > 1: - tag = self.tagStack[-1] - parent = self.tagStack[-2] - parent._getAttrMap() - if (isinstance(tag, Tag) and len(tag.contents) == 1 and - isinstance(tag.contents[0], NavigableText) and - not parent.attrMap.has_key(tag.name)): - parent[tag.name] = tag.contents[0] - BeautifulStoneSoup.popTag(self) - -#Enterprise class names! It has come to our attention that some people -#think the names of the Beautiful Soup parser classes are too silly -#and "unprofessional" for use in enterprise screen-scraping. We feel -#your pain! For such-minded folk, the Beautiful Soup Consortium And -#All-Night Kosher Bakery recommends renaming this file to -#"RobustParser.py" (or, in cases of extreme enterprisitude, -#"RobustParserBeanInterface.class") and using the following -#enterprise-friendly class aliases: -class RobustXMLParser(BeautifulStoneSoup): - pass -class RobustHTMLParser(BeautifulSoup): - pass -class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): - pass -class SimplifyingSOAPParser(BeautifulSOAP): - pass - -### - - -#By default, act as an HTML pretty-printer. -if __name__ == '__main__': - import sys - soup = BeautifulStoneSoup(sys.stdin.read()) - print soup.prettify() diff --git a/src/calibre/utils/mechanize/_clientcookie.py b/src/calibre/utils/mechanize/_clientcookie.py deleted file mode 100644 index e8f0f67d4a..0000000000 --- a/src/calibre/utils/mechanize/_clientcookie.py +++ /dev/null @@ -1,1651 +0,0 @@ -"""HTTP cookie handling for web clients. - -This module originally developed from my port of Gisle Aas' Perl module -HTTP::Cookies, from the libwww-perl library. - -Docstrings, comments and debug strings in this code refer to the -attributes of the HTTP cookie system as cookie-attributes, to distinguish -them clearly from Python attributes. - - CookieJar____ - / \ \ - FileCookieJar \ \ - / | \ \ \ - MozillaCookieJar | LWPCookieJar \ \ - | | \ - | ---MSIEBase | \ - | / | | \ - | / MSIEDBCookieJar BSDDBCookieJar - |/ - MSIECookieJar - -Comments to John J Lee . 
- - -Copyright 2002-2006 John J Lee -Copyright 1997-1999 Gisle Aas (original libwww-perl code) -Copyright 2002-2003 Johnny Lee (original MSIE Perl code) - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD or ZPL 2.1 licenses (see the file -COPYING.txt included with the distribution). - -""" - -import sys, re, copy, time, struct, urllib, types, logging -try: - import threading - _threading = threading; del threading -except ImportError: - import dummy_threading - _threading = dummy_threading; del dummy_threading -import httplib # only for the default HTTP port - -MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " - "instance initialised with one)") -DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT) - -from _headersutil import split_header_words, parse_ns_headers -from _util import isstringlike -import _rfc3986 - -debug = logging.getLogger("mechanize.cookies").debug - - -def reraise_unmasked_exceptions(unmasked=()): - # There are a few catch-all except: statements in this module, for - # catching input that's bad in unexpected ways. - # This function re-raises some exceptions we don't want to trap. - import mechanize, warnings - if not mechanize.USE_BARE_EXCEPT: - raise - unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) - etype = sys.exc_info()[0] - if issubclass(etype, unmasked): - raise - # swallowed an exception - import traceback, StringIO - f = StringIO.StringIO() - traceback.print_exc(None, f) - msg = f.getvalue() - warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2) - - -IPV4_RE = re.compile(r"\.\d+$") -def is_HDN(text): - """Return True if text is a host domain name.""" - # XXX - # This may well be wrong. Which RFC is HDN defined in, if any (for - # the purposes of RFC 2965)? - # For the current implementation, what about IPv6? Remember to look - # at other uses of IPV4_RE also, if change this. - return not (IPV4_RE.search(text) or - text == "" or - text[0] == "." or text[-1] == ".") - -def domain_match(A, B): - """Return True if domain A domain-matches domain B, according to RFC 2965. - - A and B may be host domain names or IP addresses. - - RFC 2965, section 1: - - Host names can be specified either as an IP address or a HDN string. - Sometimes we compare one host name with another. (Such comparisons SHALL - be case-insensitive.) Host A's name domain-matches host B's if - - * their host name strings string-compare equal; or - - * A is a HDN string and has the form NB, where N is a non-empty - name string, B has the form .B', and B' is a HDN string. (So, - x.y.com domain-matches .Y.com but not Y.com.) - - Note that domain-match is not a commutative operation: a.b.c.com - domain-matches .c.com, but not the reverse. - - """ - # Note that, if A or B are IP addresses, the only relevant part of the - # definition of the domain-match algorithm is the direct string-compare. - A = A.lower() - B = B.lower() - if A == B: - return True - if not is_HDN(A): - return False - i = A.rfind(B) - has_form_nb = not (i == -1 or i == 0) - return ( - has_form_nb and - B.startswith(".") and - is_HDN(B[1:]) - ) - -def liberal_is_HDN(text): - """Return True if text is a sort-of-like a host domain name. - - For accepting/blocking domains. - - """ - return not IPV4_RE.search(text) - -def user_domain_match(A, B): - """For blocking/accepting domains. - - A and B may be host domain names or IP addresses. 
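(Spot checks for domain_match() as defined above; note the relation is deliberately not commutative:)

    assert domain_match('x.y.com', '.y.com')      # N + .B' form: matches
    assert not domain_match('x.y.com', 'y.com')   # no leading dot on B: no match
    assert not domain_match('.y.com', 'x.y.com')  # not a commutative operation
    assert domain_match('y.com', 'Y.com')         # comparison is case-insensitive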
- - """ - A = A.lower() - B = B.lower() - if not (liberal_is_HDN(A) and liberal_is_HDN(B)): - if A == B: - # equal IP addresses - return True - return False - initial_dot = B.startswith(".") - if initial_dot and A.endswith(B): - return True - if not initial_dot and A == B: - return True - return False - -cut_port_re = re.compile(r":\d+$") -def request_host(request): - """Return request-host, as defined by RFC 2965. - - Variation from RFC: returned value is lowercased, for convenient - comparison. - - """ - url = request.get_full_url() - host = _rfc3986.urlsplit(url)[1] - if host is None: - host = request.get_header("Host", "") - - # remove port, if present - host = cut_port_re.sub("", host, 1) - return host.lower() - -def eff_request_host(request): - """Return a tuple (request-host, effective request-host name). - - As defined by RFC 2965, except both are lowercased. - - """ - erhn = req_host = request_host(request) - if req_host.find(".") == -1 and not IPV4_RE.search(req_host): - erhn = req_host + ".local" - return req_host, erhn - -def request_path(request): - """request-URI, as defined by RFC 2965.""" - url = request.get_full_url() - path, query, frag = _rfc3986.urlsplit(url)[2:] - path = escape_path(path) - req_path = _rfc3986.urlunsplit((None, None, path, query, frag)) - if not req_path.startswith("/"): - req_path = "/"+req_path - return req_path - -def request_port(request): - host = request.get_host() - i = host.find(':') - if i >= 0: - port = host[i+1:] - try: - int(port) - except ValueError: - debug("nonnumeric port: '%s'", port) - return None - else: - port = DEFAULT_HTTP_PORT - return port - -# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't -# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). -HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" -ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") -def uppercase_escaped_char(match): - return "%%%s" % match.group(1).upper() -def escape_path(path): - """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" - # There's no knowing what character encoding was used to create URLs - # containing %-escapes, but since we have to pick one to escape invalid - # path characters, we pick UTF-8, as recommended in the HTML 4.0 - # specification: - # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 - # And here, kind of: draft-fielding-uri-rfc2396bis-03 - # (And in draft IRI specification: draft-duerst-iri-05) - # (And here, for new URI schemes: RFC 2718) - if isinstance(path, types.UnicodeType): - path = path.encode("utf-8") - path = urllib.quote(path, HTTP_PATH_SAFE) - path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) - return path - -def reach(h): - """Return reach of host h, as defined by RFC 2965, section 1. - - The reach R of a host name H is defined as follows: - - * If - - - H is the host domain name of a host; and, - - - H has the form A.B; and - - - A has no embedded (that is, interior) dots; and - - - B has at least one embedded dot, or B is the string "local". - then the reach of H is .B. - - * Otherwise, the reach of H is H. 
- - >>> reach("www.acme.com") - '.acme.com' - >>> reach("acme.com") - 'acme.com' - >>> reach("acme.local") - '.local' - - """ - i = h.find(".") - if i >= 0: - #a = h[:i] # this line is only here to show what a is - b = h[i+1:] - i = b.find(".") - if is_HDN(h) and (i >= 0 or b == "local"): - return "."+b - return h - -def is_third_party(request): - """ - - RFC 2965, section 3.3.6: - - An unverifiable transaction is to a third-party host if its request- - host U does not domain-match the reach R of the request-host O in the - origin transaction. - - """ - req_host = request_host(request) - # the origin request's request-host was stuffed into request by - # _urllib2_support.AbstractHTTPHandler - return not domain_match(req_host, reach(request.origin_req_host)) - - -class Cookie: - """HTTP Cookie. - - This class represents both Netscape and RFC 2965 cookies. - - This is deliberately a very simple class. It just holds attributes. It's - possible to construct Cookie instances that don't comply with the cookie - standards. CookieJar.make_cookies is the factory function for Cookie - objects -- it deals with cookie parsing, supplying defaults, and - normalising to the representation used in this class. CookiePolicy is - responsible for checking them to see whether they should be accepted from - and returned to the server. - - version: integer; - name: string; - value: string (may be None); - port: string; None indicates no attribute was supplied (eg. "Port", rather - than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list - string (eg. "80,8080") - port_specified: boolean; true if a value was supplied with the Port - cookie-attribute - domain: string; - domain_specified: boolean; true if Domain was explicitly set - domain_initial_dot: boolean; true if Domain as set in HTTP header by server - started with a dot (yes, this really is necessary!) - path: string; - path_specified: boolean; true if Path was explicitly set - secure: boolean; true if should only be returned over secure connection - expires: integer; seconds since epoch (RFC 2965 cookies should calculate - this value from the Max-Age attribute) - discard: boolean, true if this is a session cookie; (if no expires value, - this should be true) - comment: string; - comment_url: string; - rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not - Set-Cookie2:) header, but had a version cookie-attribute of 1 - rest: mapping of other cookie-attributes - - Note that the port may be present in the headers, but unspecified ("Port" - rather than"Port=80", for example); if this is the case, port is None. - - """ - - def __init__(self, version, name, value, - port, port_specified, - domain, domain_specified, domain_initial_dot, - path, path_specified, - secure, - expires, - discard, - comment, - comment_url, - rest, - rfc2109=False, - ): - - if version is not None: version = int(version) - if expires is not None: expires = int(expires) - if port is None and port_specified is True: - raise ValueError("if port is None, port_specified must be false") - - self.version = version - self.name = name - self.value = value - self.port = port - self.port_specified = port_specified - # normalise case, as per RFC 2965 section 3.3.3 - self.domain = domain.lower() - self.domain_specified = domain_specified - # Sigh. We need to know whether the domain given in the - # cookie-attribute had an initial dot, in order to follow RFC 2965 - # (as clarified in draft errata). Needed for the returned $Domain - # value. 
-
-
-class Cookie:
-    """HTTP Cookie.
-
-    This class represents both Netscape and RFC 2965 cookies.
-
-    This is deliberately a very simple class.  It just holds attributes.  It's
-    possible to construct Cookie instances that don't comply with the cookie
-    standards.  CookieJar.make_cookies is the factory function for Cookie
-    objects -- it deals with cookie parsing, supplying defaults, and
-    normalising to the representation used in this class.  CookiePolicy is
-    responsible for checking them to see whether they should be accepted from
-    and returned to the server.
-
-    version: integer;
-    name: string;
-    value: string (may be None);
-    port: string; None indicates no attribute was supplied (eg. "Port", rather
-     than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list
-     string (eg. "80,8080")
-    port_specified: boolean; true if a value was supplied with the Port
-     cookie-attribute
-    domain: string;
-    domain_specified: boolean; true if Domain was explicitly set
-    domain_initial_dot: boolean; true if Domain as set in HTTP header by server
-     started with a dot (yes, this really is necessary!)
-    path: string;
-    path_specified: boolean; true if Path was explicitly set
-    secure: boolean; true if should only be returned over secure connection
-    expires: integer; seconds since epoch (RFC 2965 cookies should calculate
-     this value from the Max-Age attribute)
-    discard: boolean, true if this is a session cookie; (if no expires value,
-     this should be true)
-    comment: string;
-    comment_url: string;
-    rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not
-     Set-Cookie2:) header, but had a version cookie-attribute of 1
-    rest: mapping of other cookie-attributes
-
-    Note that the port may be present in the headers, but unspecified ("Port"
-    rather than "Port=80", for example); if this is the case, port is None.
-
-    """
-
-    def __init__(self, version, name, value,
-                 port, port_specified,
-                 domain, domain_specified, domain_initial_dot,
-                 path, path_specified,
-                 secure,
-                 expires,
-                 discard,
-                 comment,
-                 comment_url,
-                 rest,
-                 rfc2109=False,
-                 ):
-
-        if version is not None: version = int(version)
-        if expires is not None: expires = int(expires)
-        if port is None and port_specified is True:
-            raise ValueError("if port is None, port_specified must be false")
-
-        self.version = version
-        self.name = name
-        self.value = value
-        self.port = port
-        self.port_specified = port_specified
-        # normalise case, as per RFC 2965 section 3.3.3
-        self.domain = domain.lower()
-        self.domain_specified = domain_specified
-        # Sigh.  We need to know whether the domain given in the
-        # cookie-attribute had an initial dot, in order to follow RFC 2965
-        # (as clarified in draft errata).  Needed for the returned $Domain
-        # value.
-        self.domain_initial_dot = domain_initial_dot
-        self.path = path
-        self.path_specified = path_specified
-        self.secure = secure
-        self.expires = expires
-        self.discard = discard
-        self.comment = comment
-        self.comment_url = comment_url
-        self.rfc2109 = rfc2109
-
-        self._rest = copy.copy(rest)
-
-    def has_nonstandard_attr(self, name):
-        return self._rest.has_key(name)
-    def get_nonstandard_attr(self, name, default=None):
-        return self._rest.get(name, default)
-    def set_nonstandard_attr(self, name, value):
-        self._rest[name] = value
-    def nonstandard_attr_keys(self):
-        return self._rest.keys()
-
-    def is_expired(self, now=None):
-        if now is None: now = time.time()
-        return (self.expires is not None) and (self.expires <= now)
-
-    def __str__(self):
-        if self.port is None: p = ""
-        else: p = ":"+self.port
-        limit = self.domain + p + self.path
-        if self.value is not None:
-            namevalue = "%s=%s" % (self.name, self.value)
-        else:
-            namevalue = self.name
-        return "<Cookie %s for %s>" % (namevalue, limit)
-
-    def __repr__(self):
-        args = []
-        for name in ["version", "name", "value",
-                     "port", "port_specified",
-                     "domain", "domain_specified", "domain_initial_dot",
-                     "path", "path_specified",
-                     "secure", "expires", "discard", "comment", "comment_url",
-                     ]:
-            attr = getattr(self, name)
-            args.append("%s=%s" % (name, repr(attr)))
-        args.append("rest=%s" % repr(self._rest))
-        args.append("rfc2109=%s" % repr(self.rfc2109))
-        return "Cookie(%s)" % ", ".join(args)
-
-
-class CookiePolicy:
-    """Defines which cookies get accepted from and returned to server.
-
-    May also modify cookies.
-
-    The subclass DefaultCookiePolicy defines the standard rules for Netscape
-    and RFC 2965 cookies -- override that if you want a customised policy.
-
-    As well as implementing set_ok and return_ok, implementations of this
-    interface must also supply the following attributes, indicating which
-    protocols should be used, and how.  These can be read and set at any time,
-    though whether that makes complete sense from the protocol point of view
-    is doubtful.
-
-    Public attributes:
-
-    netscape: implement netscape protocol
-    rfc2965: implement RFC 2965 protocol
-    rfc2109_as_netscape:
-       WARNING: This argument will change or go away if it is not accepted
-       into the Python standard library in this form!
-       If true, treat RFC 2109 cookies as though they were Netscape cookies.
-       The default is for this attribute to be None, which means treat 2109
-       cookies as RFC 2965 cookies unless RFC 2965 handling is switched off
-       (which it is, by default), and as Netscape cookies otherwise.
-    hide_cookie2: don't add Cookie2 header to requests (the presence of
-     this header indicates to the server that we understand RFC 2965
-     cookies)
-
-    """
-    def set_ok(self, cookie, request):
-        """Return true if (and only if) cookie should be accepted from server.
-
-        Currently, pre-expired cookies never get this far -- the CookieJar
-        class deletes such cookies itself.
-
-        cookie: mechanize.Cookie object
-        request: object implementing the interface defined by
-         CookieJar.extract_cookies.__doc__
-
-        """
-        raise NotImplementedError()
-
-    def return_ok(self, cookie, request):
-        """Return true if (and only if) cookie should be returned to server.
-
-        cookie: mechanize.Cookie object
-        request: object implementing the interface defined by
-         CookieJar.add_cookie_header.__doc__
-
-        """
-        raise NotImplementedError()
-
-    def domain_return_ok(self, domain, request):
-        """Return false if cookies should not be returned, given cookie domain.
-
-        This is here as an optimization, to remove the need for checking every
-        cookie with a particular domain (which may involve reading many files).
-        The default implementations of domain_return_ok and path_return_ok
-        (return True) leave all the work to return_ok.
-
-        If domain_return_ok returns true for the cookie domain, path_return_ok
-        is called for the cookie path.  Otherwise, path_return_ok and return_ok
-        are never called for that cookie domain.  If path_return_ok returns
-        true, return_ok is called with the Cookie object itself for a full
-        check.  Otherwise, return_ok is never called for that cookie path.
-
-        Note that domain_return_ok is called for every *cookie* domain, not
-        just for the *request* domain.  For example, the function might be
-        called with both ".acme.com" and "www.acme.com" if the request domain
-        is "www.acme.com".  The same goes for path_return_ok.
-
-        For argument documentation, see the docstring for return_ok.
-
-        """
-        return True
-
-    def path_return_ok(self, path, request):
-        """Return false if cookies should not be returned, given cookie path.
-
-        See the docstring for domain_return_ok.
-
-        """
-        return True
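The set_ok/return_ok pair above is the entire customisation surface of the policy interface. A hedged sketch of a custom policy written against the standalone mechanize package that this commit switches calibre to (the cookie-name rule is invented purely for illustration):

    import mechanize

    class PickyPolicy(mechanize.DefaultCookiePolicy):
        def set_ok(self, cookie, request):
            # keep all the standard checks first, as the docs above advise
            if not mechanize.DefaultCookiePolicy.set_ok(self, cookie, request):
                return False
            # then apply our own (illustrative) rule
            return not cookie.name.startswith("__")

    jar = mechanize.CookieJar(policy=PickyPolicy())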
-
-
-class DefaultCookiePolicy(CookiePolicy):
-    """Implements the standard rules for accepting and returning cookies.
-
-    Both RFC 2965 and Netscape cookies are covered.  RFC 2965 handling is
-    switched off by default.
-
-    The easiest way to provide your own policy is to override this class and
-    call its methods in your overridden implementations before adding your own
-    additional checks.
-
-    import mechanize
-    class MyCookiePolicy(mechanize.DefaultCookiePolicy):
-        def set_ok(self, cookie, request):
-            if not mechanize.DefaultCookiePolicy.set_ok(
-                self, cookie, request):
-                return False
-            if i_dont_want_to_store_this_cookie():
-                return False
-            return True
-
-    In addition to the features required to implement the CookiePolicy
-    interface, this class allows you to block and allow domains from setting
-    and receiving cookies.  There are also some strictness switches that allow
-    you to tighten up the rather loose Netscape protocol rules a little bit
-    (at the cost of blocking some benign cookies).
-
-    A domain blacklist and whitelist are provided (both off by default).  Only
-    domains not in the blacklist and present in the whitelist (if the whitelist
-    is active) participate in cookie setting and returning.  Use the
-    blocked_domains constructor argument, and blocked_domains and
-    set_blocked_domains methods (and the corresponding argument and methods for
-    allowed_domains).  If you set a whitelist, you can turn it off again by
-    setting it to None.
-
-    Domains in block or allow lists that do not start with a dot must
-    string-compare equal.  For example, "acme.com" matches a blacklist entry of
-    "acme.com", but "www.acme.com" does not.  Domains that do start with a dot
-    are matched by more specific domains too.  For example, both "www.acme.com"
-    and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does
-    not).  IP addresses are an exception, and must match exactly.  For example,
-    if blocked_domains contains "192.168.1.2" and ".168.1.2", 192.168.1.2 is
-    blocked, but 193.168.1.2 is not.
-
-    Additional Public Attributes:
-
-    General strictness switches
-
-    strict_domain: don't allow sites to set two-component domains with
-     country-code top-level domains like .co.uk, .gov.uk, .co.nz, etc.
-     This is far from perfect and isn't guaranteed to work!
-
-    RFC 2965 protocol strictness switches
-
-    strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable
-     transactions (usually, an unverifiable transaction is one resulting from
-     a redirect or an image hosted on another site); if this is false, cookies
-     are NEVER blocked on the basis of verifiability
-
-    Netscape protocol strictness switches
-
-    strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions
-     even to Netscape cookies
-    strict_ns_domain: flags indicating how strict to be with domain-matching
-     rules for Netscape cookies:
-     DomainStrictNoDots: when setting cookies, host prefix must not contain a
-      dot (eg. www.foo.bar.com can't set a cookie for .bar.com, because
-      www.foo contains a dot)
-     DomainStrictNonDomain: cookies that did not explicitly specify a Domain
-      cookie-attribute can only be returned to a domain that string-compares
-      equal to the domain that set the cookie (eg. rockets.acme.com won't
-      be returned cookies from acme.com that had no Domain cookie-attribute)
-     DomainRFC2965Match: when setting cookies, require a full RFC 2965
-      domain-match
-     DomainLiberal and DomainStrict are the most useful combinations of the
-      above flags, for convenience
-    strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that
-     have names starting with '$'
-    strict_ns_set_path: don't allow setting cookies whose path doesn't
-     path-match request URI
-
-    """
-
-    DomainStrictNoDots = 1
-    DomainStrictNonDomain = 2
-    DomainRFC2965Match = 4
-
-    DomainLiberal = 0
-    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
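One way the strictness switches and block-list described above combine in practice, sketched against the standalone mechanize package (the blocked domain is a placeholder, not from this patch):

    import mechanize

    policy = mechanize.DefaultCookiePolicy(
        rfc2965=False,                             # RFC 2965 stays off, as above
        blocked_domains=[".example-ads.invalid"],  # placeholder block-list entry
        strict_ns_domain=mechanize.DefaultCookiePolicy.DomainStrict,
    )
    jar = mechanize.CookieJar(policy=policy)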
-
-    def __init__(self,
-                 blocked_domains=None, allowed_domains=None,
-                 netscape=True, rfc2965=False,
-                 # WARNING: this argument will change or go away if it is not
-                 # accepted into the Python standard library in this form!
-                 # default, ie. treat 2109 as netscape iff not rfc2965
-                 rfc2109_as_netscape=None,
-                 hide_cookie2=False,
-                 strict_domain=False,
-                 strict_rfc2965_unverifiable=True,
-                 strict_ns_unverifiable=False,
-                 strict_ns_domain=DomainLiberal,
-                 strict_ns_set_initial_dollar=False,
-                 strict_ns_set_path=False,
-                 ):
-        """
-        Constructor arguments should be used as keyword arguments only.
-
-        blocked_domains: sequence of domain names that we never accept cookies
-         from, nor return cookies to
-        allowed_domains: if not None, this is a sequence of the only domains
-         for which we accept and return cookies
-
-        For other arguments, see CookiePolicy.__doc__ and
-        DefaultCookiePolicy.__doc__.
- - """ - self.netscape = netscape - self.rfc2965 = rfc2965 - self.rfc2109_as_netscape = rfc2109_as_netscape - self.hide_cookie2 = hide_cookie2 - self.strict_domain = strict_domain - self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable - self.strict_ns_unverifiable = strict_ns_unverifiable - self.strict_ns_domain = strict_ns_domain - self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar - self.strict_ns_set_path = strict_ns_set_path - - if blocked_domains is not None: - self._blocked_domains = tuple(blocked_domains) - else: - self._blocked_domains = () - - if allowed_domains is not None: - allowed_domains = tuple(allowed_domains) - self._allowed_domains = allowed_domains - - def blocked_domains(self): - """Return the sequence of blocked domains (as a tuple).""" - return self._blocked_domains - def set_blocked_domains(self, blocked_domains): - """Set the sequence of blocked domains.""" - self._blocked_domains = tuple(blocked_domains) - - def is_blocked(self, domain): - for blocked_domain in self._blocked_domains: - if user_domain_match(domain, blocked_domain): - return True - return False - - def allowed_domains(self): - """Return None, or the sequence of allowed domains (as a tuple).""" - return self._allowed_domains - def set_allowed_domains(self, allowed_domains): - """Set the sequence of allowed domains, or None.""" - if allowed_domains is not None: - allowed_domains = tuple(allowed_domains) - self._allowed_domains = allowed_domains - - def is_not_allowed(self, domain): - if self._allowed_domains is None: - return False - for allowed_domain in self._allowed_domains: - if user_domain_match(domain, allowed_domain): - return False - return True - - def set_ok(self, cookie, request): - """ - If you override set_ok, be sure to call this method. If it returns - false, so should your subclass (assuming your subclass wants to be more - strict about which cookies to accept). - - """ - debug(" - checking cookie %s", cookie) - - assert cookie.name is not None - - for n in "version", "verifiability", "name", "path", "domain", "port": - fn_name = "set_ok_"+n - fn = getattr(self, fn_name) - if not fn(cookie, request): - return False - - return True - - def set_ok_version(self, cookie, request): - if cookie.version is None: - # Version is always set to 0 by parse_ns_headers if it's a Netscape - # cookie, so this must be an invalid RFC 2965 cookie. - debug(" Set-Cookie2 without version attribute (%s)", cookie) - return False - if cookie.version > 0 and not self.rfc2965: - debug(" RFC 2965 cookies are switched off") - return False - elif cookie.version == 0 and not self.netscape: - debug(" Netscape cookies are switched off") - return False - return True - - def set_ok_verifiability(self, cookie, request): - if request.unverifiable and is_third_party(request): - if cookie.version > 0 and self.strict_rfc2965_unverifiable: - debug(" third-party RFC 2965 cookie during " - "unverifiable transaction") - return False - elif cookie.version == 0 and self.strict_ns_unverifiable: - debug(" third-party Netscape cookie during " - "unverifiable transaction") - return False - return True - - def set_ok_name(self, cookie, request): - # Try and stop servers setting V0 cookies designed to hack other - # servers that know both V0 and V1 protocols. 
-        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
-            cookie.name.startswith("$")):
-            debug("   illegal name (starts with '$'): '%s'", cookie.name)
-            return False
-        return True
-
-    def set_ok_path(self, cookie, request):
-        if cookie.path_specified:
-            req_path = request_path(request)
-            if ((cookie.version > 0 or
-                 (cookie.version == 0 and self.strict_ns_set_path)) and
-                not req_path.startswith(cookie.path)):
-                debug("   path attribute %s is not a prefix of request "
-                      "path %s", cookie.path, req_path)
-                return False
-        return True
-
-    def set_ok_countrycode_domain(self, cookie, request):
-        """Return False if explicit cookie domain is not acceptable.
-
-        Called by set_ok_domain, for convenience of overriding by
-        subclasses.
-
-        """
-        if cookie.domain_specified and self.strict_domain:
-            domain = cookie.domain
-            # since domain was specified, we know that:
-            assert domain.startswith(".")
-            if domain.count(".") == 2:
-                # domain like .foo.bar
-                i = domain.rfind(".")
-                tld = domain[i+1:]
-                sld = domain[1:i]
-                if (sld.lower() in [
-                    "co", "ac",
-                    "com", "edu", "org", "net", "gov", "mil", "int",
-                    "aero", "biz", "cat", "coop", "info", "jobs", "mobi",
-                    "museum", "name", "pro", "travel",
-                    ] and
-                    len(tld) == 2):
-                    # domain like .co.uk
-                    return False
-        return True
-
-    def set_ok_domain(self, cookie, request):
-        if self.is_blocked(cookie.domain):
-            debug("   domain %s is in user block-list", cookie.domain)
-            return False
-        if self.is_not_allowed(cookie.domain):
-            debug("   domain %s is not in user allow-list", cookie.domain)
-            return False
-        if not self.set_ok_countrycode_domain(cookie, request):
-            debug("   country-code second level domain %s", cookie.domain)
-            return False
-        if cookie.domain_specified:
-            req_host, erhn = eff_request_host(request)
-            domain = cookie.domain
-            if domain.startswith("."):
-                undotted_domain = domain[1:]
-            else:
-                undotted_domain = domain
-            embedded_dots = (undotted_domain.find(".") >= 0)
-            if not embedded_dots and domain != ".local":
-                debug("   non-local domain %s contains no embedded dot",
-                      domain)
-                return False
-            if cookie.version == 0:
-                if (not erhn.endswith(domain) and
-                    (not erhn.startswith(".") and
-                     not ("."+erhn).endswith(domain))):
-                    debug("   effective request-host %s (even with added "
-                          "initial dot) does not end with %s",
-                          erhn, domain)
-                    return False
-            if (cookie.version > 0 or
-                (self.strict_ns_domain & self.DomainRFC2965Match)):
-                if not domain_match(erhn, domain):
-                    debug("   effective request-host %s does not domain-match "
-                          "%s", erhn, domain)
-                    return False
-            if (cookie.version > 0 or
-                (self.strict_ns_domain & self.DomainStrictNoDots)):
-                host_prefix = req_host[:-len(domain)]
-                if (host_prefix.find(".") >= 0 and
-                    not IPV4_RE.search(req_host)):
-                    debug("   host prefix %s for domain %s contains a dot",
-                          host_prefix, domain)
-                    return False
-        return True
-
-    def set_ok_port(self, cookie, request):
-        if cookie.port_specified:
-            req_port = request_port(request)
-            if req_port is None:
-                req_port = "80"
-            else:
-                req_port = str(req_port)
-            for p in cookie.port.split(","):
-                try:
-                    int(p)
-                except ValueError:
-                    debug("   bad port %s (not numeric)", p)
-                    return False
-                if p == req_port:
-                    break
-            else:
-                debug("   request port (%s) not found in %s",
-                      req_port, cookie.port)
-                return False
-        return True
-
-    def return_ok(self, cookie, request):
-        """
-        If you override return_ok, be sure to call this method.  If it returns
-        false, so should your subclass (assuming your subclass wants to be more
-        strict about which cookies to return).
- - """ - # Path has already been checked by path_return_ok, and domain blocking - # done by domain_return_ok. - debug(" - checking cookie %s", cookie) - - for n in "version", "verifiability", "secure", "expires", "port", "domain": - fn_name = "return_ok_"+n - fn = getattr(self, fn_name) - if not fn(cookie, request): - return False - return True - - def return_ok_version(self, cookie, request): - if cookie.version > 0 and not self.rfc2965: - debug(" RFC 2965 cookies are switched off") - return False - elif cookie.version == 0 and not self.netscape: - debug(" Netscape cookies are switched off") - return False - return True - - def return_ok_verifiability(self, cookie, request): - if request.unverifiable and is_third_party(request): - if cookie.version > 0 and self.strict_rfc2965_unverifiable: - debug(" third-party RFC 2965 cookie during unverifiable " - "transaction") - return False - elif cookie.version == 0 and self.strict_ns_unverifiable: - debug(" third-party Netscape cookie during unverifiable " - "transaction") - return False - return True - - def return_ok_secure(self, cookie, request): - if cookie.secure and request.get_type() != "https": - debug(" secure cookie with non-secure request") - return False - return True - - def return_ok_expires(self, cookie, request): - if cookie.is_expired(self._now): - debug(" cookie expired") - return False - return True - - def return_ok_port(self, cookie, request): - if cookie.port: - req_port = request_port(request) - if req_port is None: - req_port = "80" - for p in cookie.port.split(","): - if p == req_port: - break - else: - debug(" request port %s does not match cookie port %s", - req_port, cookie.port) - return False - return True - - def return_ok_domain(self, cookie, request): - req_host, erhn = eff_request_host(request) - domain = cookie.domain - - # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't - if (cookie.version == 0 and - (self.strict_ns_domain & self.DomainStrictNonDomain) and - not cookie.domain_specified and domain != erhn): - debug(" cookie with unspecified domain does not string-compare " - "equal to request domain") - return False - - if cookie.version > 0 and not domain_match(erhn, domain): - debug(" effective request-host name %s does not domain-match " - "RFC 2965 cookie domain %s", erhn, domain) - return False - if cookie.version == 0 and not ("."+erhn).endswith(domain): - debug(" request-host %s does not match Netscape cookie domain " - "%s", req_host, domain) - return False - return True - - def domain_return_ok(self, domain, request): - # Liberal check of domain. This is here as an optimization to avoid - # having to load lots of MSIE cookie files unless necessary. - - # Munge req_host and erhn to always start with a dot, so as to err on - # the side of letting cookies through. 
- dotted_req_host, dotted_erhn = eff_request_host(request) - if not dotted_req_host.startswith("."): - dotted_req_host = "."+dotted_req_host - if not dotted_erhn.startswith("."): - dotted_erhn = "."+dotted_erhn - if not (dotted_req_host.endswith(domain) or - dotted_erhn.endswith(domain)): - #debug(" request domain %s does not match cookie domain %s", - # req_host, domain) - return False - - if self.is_blocked(domain): - debug(" domain %s is in user block-list", domain) - return False - if self.is_not_allowed(domain): - debug(" domain %s is not in user allow-list", domain) - return False - - return True - - def path_return_ok(self, path, request): - debug("- checking cookie path=%s", path) - req_path = request_path(request) - if not req_path.startswith(path): - debug(" %s does not path-match %s", req_path, path) - return False - return True - - -def vals_sorted_by_key(adict): - keys = adict.keys() - keys.sort() - return map(adict.get, keys) - -class MappingIterator: - """Iterates over nested mapping, depth-first, in sorted order by key.""" - def __init__(self, mapping): - self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack - - def __iter__(self): return self - - def next(self): - # this is hairy because of lack of generators - while 1: - try: - vals, i, prev_item = self._s.pop() - except IndexError: - raise StopIteration() - if i < len(vals): - item = vals[i] - i = i + 1 - self._s.append((vals, i, prev_item)) - try: - item.items - except AttributeError: - # non-mapping - break - else: - # mapping - self._s.append((vals_sorted_by_key(item), 0, item)) - continue - return item - - -# Used as second parameter to dict.get method, to distinguish absent -# dict key from one with a None value. -class Absent: pass - -class CookieJar: - """Collection of HTTP cookies. - - You may not need to know about this class: try mechanize.urlopen(). - - The major methods are extract_cookies and add_cookie_header; these are all - you are likely to need. - - CookieJar supports the iterator protocol: - - for cookie in cookiejar: - # do something with cookie - - Methods: - - add_cookie_header(request) - extract_cookies(response, request) - make_cookies(response, request) - set_cookie_if_ok(cookie, request) - set_cookie(cookie) - clear_session_cookies() - clear_expired_cookies() - clear(domain=None, path=None, name=None) - - Public attributes - - policy: CookiePolicy object - - """ - - non_word_re = re.compile(r"\W") - quote_re = re.compile(r"([\"\\])") - strict_domain_re = re.compile(r"\.?[^.]*") - domain_re = re.compile(r"[^.]*") - dots_re = re.compile(r"^\.+") - - def __init__(self, policy=None): - """ - See CookieJar.__doc__ for argument documentation. 
- - """ - if policy is None: - policy = DefaultCookiePolicy() - self._policy = policy - - self._cookies = {} - - # for __getitem__ iteration in pre-2.2 Pythons - self._prev_getitem_index = 0 - - def set_policy(self, policy): - self._policy = policy - - def _cookies_for_domain(self, domain, request): - cookies = [] - if not self._policy.domain_return_ok(domain, request): - return [] - debug("Checking %s for cookies to return", domain) - cookies_by_path = self._cookies[domain] - for path in cookies_by_path.keys(): - if not self._policy.path_return_ok(path, request): - continue - cookies_by_name = cookies_by_path[path] - for cookie in cookies_by_name.values(): - if not self._policy.return_ok(cookie, request): - debug(" not returning cookie") - continue - debug(" it's a match") - cookies.append(cookie) - return cookies - - def _cookies_for_request(self, request): - """Return a list of cookies to be returned to server.""" - cookies = [] - for domain in self._cookies.keys(): - cookies.extend(self._cookies_for_domain(domain, request)) - return cookies - - def _cookie_attrs(self, cookies): - """Return a list of cookie-attributes to be returned to server. - - like ['foo="bar"; $Path="/"', ...] - - The $Version attribute is also added when appropriate (currently only - once per request). - - """ - # add cookies in order of most specific (ie. longest) path first - def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) - cookies.sort(decreasing_size) - - version_set = False - - attrs = [] - for cookie in cookies: - # set version of Cookie header - # XXX - # What should it be if multiple matching Set-Cookie headers have - # different versions themselves? - # Answer: there is no answer; was supposed to be settled by - # RFC 2965 errata, but that may never appear... - version = cookie.version - if not version_set: - version_set = True - if version > 0: - attrs.append("$Version=%s" % version) - - # quote cookie value if necessary - # (not for Netscape protocol, which already has any quotes - # intact, due to the poorly-specified Netscape Cookie: syntax) - if ((cookie.value is not None) and - self.non_word_re.search(cookie.value) and version > 0): - value = self.quote_re.sub(r"\\\1", cookie.value) - else: - value = cookie.value - - # add cookie-attributes to be returned in Cookie header - if cookie.value is None: - attrs.append(cookie.name) - else: - attrs.append("%s=%s" % (cookie.name, value)) - if version > 0: - if cookie.path_specified: - attrs.append('$Path="%s"' % cookie.path) - if cookie.domain.startswith("."): - domain = cookie.domain - if (not cookie.domain_initial_dot and - domain.startswith(".")): - domain = domain[1:] - attrs.append('$Domain="%s"' % domain) - if cookie.port is not None: - p = "$Port" - if cookie.port_specified: - p = p + ('="%s"' % cookie.port) - attrs.append(p) - - return attrs - - def add_cookie_header(self, request): - """Add correct Cookie: header to request (urllib2.Request object). - - The Cookie2 header is also added unless policy.hide_cookie2 is true. - - The request object (usually a urllib2.Request instance) must support - the methods get_full_url, get_host, get_type, has_header, get_header, - header_items and add_unredirected_header, as documented by urllib2, and - the port attribute (the port number). Actually, - RequestUpgradeProcessor will automatically upgrade your Request object - to one with has_header, get_header, header_items and - add_unredirected_header, if it lacks those methods, for compatibility - with pre-2.4 versions of urllib2. 
- - """ - debug("add_cookie_header") - self._policy._now = self._now = int(time.time()) - - req_host, erhn = eff_request_host(request) - strict_non_domain = ( - self._policy.strict_ns_domain & self._policy.DomainStrictNonDomain) - - cookies = self._cookies_for_request(request) - - attrs = self._cookie_attrs(cookies) - if attrs: - if not request.has_header("Cookie"): - request.add_unredirected_header("Cookie", "; ".join(attrs)) - - # if necessary, advertise that we know RFC 2965 - if self._policy.rfc2965 and not self._policy.hide_cookie2: - for cookie in cookies: - if cookie.version != 1 and not request.has_header("Cookie2"): - request.add_unredirected_header("Cookie2", '$Version="1"') - break - - self.clear_expired_cookies() - - def _normalized_cookie_tuples(self, attrs_set): - """Return list of tuples containing normalised cookie information. - - attrs_set is the list of lists of key,value pairs extracted from - the Set-Cookie or Set-Cookie2 headers. - - Tuples are name, value, standard, rest, where name and value are the - cookie name and value, standard is a dictionary containing the standard - cookie-attributes (discard, secure, version, expires or max-age, - domain, path and port) and rest is a dictionary containing the rest of - the cookie-attributes. - - """ - cookie_tuples = [] - - boolean_attrs = "discard", "secure" - value_attrs = ("version", - "expires", "max-age", - "domain", "path", "port", - "comment", "commenturl") - - for cookie_attrs in attrs_set: - name, value = cookie_attrs[0] - - # Build dictionary of standard cookie-attributes (standard) and - # dictionary of other cookie-attributes (rest). - - # Note: expiry time is normalised to seconds since epoch. V0 - # cookies should have the Expires cookie-attribute, and V1 cookies - # should have Max-Age, but since V1 includes RFC 2109 cookies (and - # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we - # accept either (but prefer Max-Age). - max_age_set = False - - bad_cookie = False - - standard = {} - rest = {} - for k, v in cookie_attrs[1:]: - lc = k.lower() - # don't lose case distinction for unknown fields - if lc in value_attrs or lc in boolean_attrs: - k = lc - if k in boolean_attrs and v is None: - # boolean cookie-attribute is present, but has no value - # (like "discard", rather than "port=80") - v = True - if standard.has_key(k): - # only first value is significant - continue - if k == "domain": - if v is None: - debug(" missing value for domain attribute") - bad_cookie = True - break - # RFC 2965 section 3.3.3 - v = v.lower() - if k == "expires": - if max_age_set: - # Prefer max-age to expires (like Mozilla) - continue - if v is None: - debug(" missing or invalid value for expires " - "attribute: treating as session cookie") - continue - if k == "max-age": - max_age_set = True - try: - v = int(v) - except ValueError: - debug(" missing or invalid (non-numeric) value for " - "max-age attribute") - bad_cookie = True - break - # convert RFC 2965 Max-Age to seconds since epoch - # XXX Strictly you're supposed to follow RFC 2616 - # age-calculation rules. Remember that zero Max-Age is a - # is a request to discard (old and new) cookie, though. 
- k = "expires" - v = self._now + v - if (k in value_attrs) or (k in boolean_attrs): - if (v is None and - k not in ["port", "comment", "commenturl"]): - debug(" missing value for %s attribute" % k) - bad_cookie = True - break - standard[k] = v - else: - rest[k] = v - - if bad_cookie: - continue - - cookie_tuples.append((name, value, standard, rest)) - - return cookie_tuples - - def _cookie_from_cookie_tuple(self, tup, request): - # standard is dict of standard cookie-attributes, rest is dict of the - # rest of them - name, value, standard, rest = tup - - domain = standard.get("domain", Absent) - path = standard.get("path", Absent) - port = standard.get("port", Absent) - expires = standard.get("expires", Absent) - - # set the easy defaults - version = standard.get("version", None) - if version is not None: version = int(version) - secure = standard.get("secure", False) - # (discard is also set if expires is Absent) - discard = standard.get("discard", False) - comment = standard.get("comment", None) - comment_url = standard.get("commenturl", None) - - # set default path - if path is not Absent and path != "": - path_specified = True - path = escape_path(path) - else: - path_specified = False - path = request_path(request) - i = path.rfind("/") - if i != -1: - if version == 0: - # Netscape spec parts company from reality here - path = path[:i] - else: - path = path[:i+1] - if len(path) == 0: path = "/" - - # set default domain - domain_specified = domain is not Absent - # but first we have to remember whether it starts with a dot - domain_initial_dot = False - if domain_specified: - domain_initial_dot = bool(domain.startswith(".")) - if domain is Absent: - req_host, erhn = eff_request_host(request) - domain = erhn - elif not domain.startswith("."): - domain = "."+domain - - # set default port - port_specified = False - if port is not Absent: - if port is None: - # Port attr present, but has no value: default to request port. - # Cookie should then only be sent back on that port. - port = request_port(request) - else: - port_specified = True - port = re.sub(r"\s+", "", port) - else: - # No port attr present. Cookie can be sent back on any port. - port = None - - # set default expires and discard - if expires is Absent: - expires = None - discard = True - elif expires <= self._now: - # Expiry date in past is request to delete cookie. This can't be - # in DefaultCookiePolicy, because can't delete cookies there. 
- try: - self.clear(domain, path, name) - except KeyError: - pass - debug("Expiring cookie, domain='%s', path='%s', name='%s'", - domain, path, name) - return None - - return Cookie(version, - name, value, - port, port_specified, - domain, domain_specified, domain_initial_dot, - path, path_specified, - secure, - expires, - discard, - comment, - comment_url, - rest) - - def _cookies_from_attrs_set(self, attrs_set, request): - cookie_tuples = self._normalized_cookie_tuples(attrs_set) - - cookies = [] - for tup in cookie_tuples: - cookie = self._cookie_from_cookie_tuple(tup, request) - if cookie: cookies.append(cookie) - return cookies - - def _process_rfc2109_cookies(self, cookies): - if self._policy.rfc2109_as_netscape is None: - rfc2109_as_netscape = not self._policy.rfc2965 - else: - rfc2109_as_netscape = self._policy.rfc2109_as_netscape - for cookie in cookies: - if cookie.version == 1: - cookie.rfc2109 = True - if rfc2109_as_netscape: - # treat 2109 cookies as Netscape cookies rather than - # as RFC2965 cookies - cookie.version = 0 - - def make_cookies(self, response, request): - """Return sequence of Cookie objects extracted from response object. - - See extract_cookies.__doc__ for the interfaces required of the - response and request arguments. - - """ - # get cookie-attributes for RFC 2965 and Netscape protocols - headers = response.info() - rfc2965_hdrs = headers.getheaders("Set-Cookie2") - ns_hdrs = headers.getheaders("Set-Cookie") - - rfc2965 = self._policy.rfc2965 - netscape = self._policy.netscape - - if ((not rfc2965_hdrs and not ns_hdrs) or - (not ns_hdrs and not rfc2965) or - (not rfc2965_hdrs and not netscape) or - (not netscape and not rfc2965)): - return [] # no relevant cookie headers: quick exit - - try: - cookies = self._cookies_from_attrs_set( - split_header_words(rfc2965_hdrs), request) - except: - reraise_unmasked_exceptions() - cookies = [] - - if ns_hdrs and netscape: - try: - # RFC 2109 and Netscape cookies - ns_cookies = self._cookies_from_attrs_set( - parse_ns_headers(ns_hdrs), request) - except: - reraise_unmasked_exceptions() - ns_cookies = [] - self._process_rfc2109_cookies(ns_cookies) - - # Look for Netscape cookies (from Set-Cookie headers) that match - # corresponding RFC 2965 cookies (from Set-Cookie2 headers). - # For each match, keep the RFC 2965 cookie and ignore the Netscape - # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are - # bundled in with the Netscape cookies for this purpose, which is - # reasonable behaviour. - if rfc2965: - lookup = {} - for cookie in cookies: - lookup[(cookie.domain, cookie.path, cookie.name)] = None - - def no_matching_rfc2965(ns_cookie, lookup=lookup): - key = ns_cookie.domain, ns_cookie.path, ns_cookie.name - return not lookup.has_key(key) - ns_cookies = filter(no_matching_rfc2965, ns_cookies) - - if ns_cookies: - cookies.extend(ns_cookies) - - return cookies - - def set_cookie_if_ok(self, cookie, request): - """Set a cookie if policy says it's OK to do so. - - cookie: mechanize.Cookie instance - request: see extract_cookies.__doc__ for the required interface - - """ - self._policy._now = self._now = int(time.time()) - - if self._policy.set_ok(cookie, request): - self.set_cookie(cookie) - - def set_cookie(self, cookie): - """Set a cookie, without checking whether or not it should be set. 
-
-        cookie: mechanize.Cookie instance
-        """
-        c = self._cookies
-        if not c.has_key(cookie.domain): c[cookie.domain] = {}
-        c2 = c[cookie.domain]
-        if not c2.has_key(cookie.path): c2[cookie.path] = {}
-        c3 = c2[cookie.path]
-        c3[cookie.name] = cookie
-
-    def extract_cookies(self, response, request):
-        """Extract cookies from response, where allowable given the request.
-
-        Look for allowable Set-Cookie: and Set-Cookie2: headers in the response
-        object passed as argument.  Any of these headers that are found are
-        used to update the state of the object (subject to the policy.set_ok
-        method's approval).
-
-        The response object (usually the result of a call to
-        mechanize.urlopen, or similar) should support an info method, which
-        returns a mimetools.Message object (in fact, the 'mimetools.Message
-        object' may be any object that provides a getallmatchingheaders
-        method).
-
-        The request object (usually a urllib2.Request instance) must support
-        the methods get_full_url and get_host, as documented by urllib2, and
-        the port attribute (the port number).  The request is used to set
-        default values for cookie-attributes as well as for checking that the
-        cookie is OK to be set.
-
-        """
-        debug("extract_cookies: %s", response.info())
-        self._policy._now = self._now = int(time.time())
-
-        for cookie in self.make_cookies(response, request):
-            if self._policy.set_ok(cookie, request):
-                debug(" setting cookie: %s", cookie)
-                self.set_cookie(cookie)
-
-    def clear(self, domain=None, path=None, name=None):
-        """Clear some cookies.
-
-        Invoking this method without arguments will clear all cookies.  If
-        given a single argument, only cookies belonging to that domain will be
-        removed.  If given two arguments, cookies belonging to the specified
-        path within that domain are removed.  If given three arguments, then
-        the cookie with the specified name, path and domain is removed.
-
-        Raises KeyError if no matching cookie exists.
-
-        """
-        if name is not None:
-            if (domain is None) or (path is None):
-                raise ValueError(
-                    "domain and path must be given to remove a cookie by name")
-            del self._cookies[domain][path][name]
-        elif path is not None:
-            if domain is None:
-                raise ValueError(
-                    "domain must be given to remove cookies by path")
-            del self._cookies[domain][path]
-        elif domain is not None:
-            del self._cookies[domain]
-        else:
-            self._cookies = {}
-
-    def clear_session_cookies(self):
-        """Discard all session cookies.
-
-        Discards all cookies held by object which had either no Max-Age or
-        Expires cookie-attribute or an explicit Discard cookie-attribute, or
-        which otherwise have ended up with a true discard attribute.  For
-        interactive browsers, the end of a session usually corresponds to
-        closing the browser window.
-
-        Note that the save method won't save session cookies anyway, unless you
-        ask otherwise by passing a true ignore_discard argument.
-
-        """
-        for cookie in self:
-            if cookie.discard:
-                self.clear(cookie.domain, cookie.path, cookie.name)
-
-    def clear_expired_cookies(self):
-        """Discard all expired cookies.
-
-        You probably don't need to call this method: expired cookies are never
-        sent back to the server (provided you're using DefaultCookiePolicy),
-        this method is called by CookieJar itself every so often, and the save
-        method won't save expired cookies anyway (unless you ask otherwise by
-        passing a true ignore_expires argument).
- - """ - now = time.time() - for cookie in self: - if cookie.is_expired(now): - self.clear(cookie.domain, cookie.path, cookie.name) - - def __getitem__(self, i): - if i == 0: - self._getitem_iterator = self.__iter__() - elif self._prev_getitem_index != i-1: raise IndexError( - "CookieJar.__getitem__ only supports sequential iteration") - self._prev_getitem_index = i - try: - return self._getitem_iterator.next() - except StopIteration: - raise IndexError() - - def __iter__(self): - return MappingIterator(self._cookies) - - def __len__(self): - """Return number of contained cookies.""" - i = 0 - for cookie in self: i = i + 1 - return i - - def __repr__(self): - r = [] - for cookie in self: r.append(repr(cookie)) - return "<%s[%s]>" % (self.__class__, ", ".join(r)) - - def __str__(self): - r = [] - for cookie in self: r.append(str(cookie)) - return "<%s[%s]>" % (self.__class__, ", ".join(r)) - - -class LoadError(Exception): pass - -class FileCookieJar(CookieJar): - """CookieJar that can be loaded from and saved to a file. - - Additional methods - - save(filename=None, ignore_discard=False, ignore_expires=False) - load(filename=None, ignore_discard=False, ignore_expires=False) - revert(filename=None, ignore_discard=False, ignore_expires=False) - - Additional public attributes - - filename: filename for loading and saving cookies - - Additional public readable attributes - - delayload: request that cookies are lazily loaded from disk; this is only - a hint since this only affects performance, not behaviour (unless the - cookies on disk are changing); a CookieJar object may ignore it (in fact, - only MSIECookieJar lazily loads cookies at the moment) - - """ - - def __init__(self, filename=None, delayload=False, policy=None): - """ - See FileCookieJar.__doc__ for argument documentation. - - Cookies are NOT loaded from the named file until either the load or - revert method is called. - - """ - CookieJar.__init__(self, policy) - if filename is not None and not isstringlike(filename): - raise ValueError("filename must be string-like") - self.filename = filename - self.delayload = bool(delayload) - - def save(self, filename=None, ignore_discard=False, ignore_expires=False): - """Save cookies to a file. - - filename: name of file in which to save cookies - ignore_discard: save even cookies set to be discarded - ignore_expires: save even cookies that have expired - - The file is overwritten if it already exists, thus wiping all its - cookies. Saved cookies can be restored later using the load or revert - methods. If filename is not specified, self.filename is used; if - self.filename is None, ValueError is raised. - - """ - raise NotImplementedError() - - def load(self, filename=None, ignore_discard=False, ignore_expires=False): - """Load cookies from a file. - - Old cookies are kept unless overwritten by newly loaded ones. - - Arguments are as for .save(). - - If filename is not specified, self.filename is used; if self.filename - is None, ValueError is raised. The named file must be in the format - understood by the class, or LoadError will be raised. This format will - be identical to that written by the save method, unless the load format - is not sufficiently well understood (as is the case for MSIECookieJar). 
- - """ - if filename is None: - if self.filename is not None: filename = self.filename - else: raise ValueError(MISSING_FILENAME_TEXT) - - f = open(filename) - try: - self._really_load(f, filename, ignore_discard, ignore_expires) - finally: - f.close() - - def revert(self, filename=None, - ignore_discard=False, ignore_expires=False): - """Clear all cookies and reload cookies from a saved file. - - Raises LoadError (or IOError) if reversion is not successful; the - object's state will not be altered if this happens. - - """ - if filename is None: - if self.filename is not None: filename = self.filename - else: raise ValueError(MISSING_FILENAME_TEXT) - - old_state = copy.deepcopy(self._cookies) - self._cookies = {} - try: - self.load(filename, ignore_discard, ignore_expires) - except (LoadError, IOError): - self._cookies = old_state - raise diff --git a/src/calibre/utils/mechanize/_debug.py b/src/calibre/utils/mechanize/_debug.py deleted file mode 100644 index 596b11477e..0000000000 --- a/src/calibre/utils/mechanize/_debug.py +++ /dev/null @@ -1,28 +0,0 @@ -import logging - -from urllib2 import BaseHandler -from _response import response_seek_wrapper - - -class HTTPResponseDebugProcessor(BaseHandler): - handler_order = 900 # before redirections, after everything else - - def http_response(self, request, response): - if not hasattr(response, "seek"): - response = response_seek_wrapper(response) - info = logging.getLogger("mechanize.http_responses").info - try: - info(response.read()) - finally: - response.seek(0) - info("*****************************************************") - return response - - https_response = http_response - -class HTTPRedirectDebugProcessor(BaseHandler): - def http_request(self, request): - if hasattr(request, "redirect_dict"): - info = logging.getLogger("mechanize.http_redirects").info - info("redirecting to %s", request.get_full_url()) - return request diff --git a/src/calibre/utils/mechanize/_gzip.py b/src/calibre/utils/mechanize/_gzip.py deleted file mode 100644 index 46a98a3858..0000000000 --- a/src/calibre/utils/mechanize/_gzip.py +++ /dev/null @@ -1,103 +0,0 @@ -import urllib2 -from cStringIO import StringIO -import _response - -# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library -class GzipConsumer: - - def __init__(self, consumer): - self.__consumer = consumer - self.__decoder = None - self.__data = "" - - def __getattr__(self, key): - return getattr(self.__consumer, key) - - def feed(self, data): - if self.__decoder is None: - # check if we have a full gzip header - data = self.__data + data - try: - i = 10 - flag = ord(data[3]) - if flag & 4: # extra - x = ord(data[i]) + 256*ord(data[i+1]) - i = i + 2 + x - if flag & 8: # filename - while ord(data[i]): - i = i + 1 - i = i + 1 - if flag & 16: # comment - while ord(data[i]): - i = i + 1 - i = i + 1 - if flag & 2: # crc - i = i + 2 - if len(data) < i: - raise IndexError("not enough data") - if data[:3] != "\x1f\x8b\x08": - raise IOError("invalid gzip data") - data = data[i:] - except IndexError: - self.__data = data - return # need more data - import zlib - self.__data = "" - self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS) - data = self.__decoder.decompress(data) - if data: - self.__consumer.feed(data) - - def close(self): - if self.__decoder: - data = self.__decoder.flush() - if data: - self.__consumer.feed(data) - self.__consumer.close() - - -# -------------------------------------------------------------------- - -# the rest of this module is John Lee's stupid code, not -# 
# Fredrik's nice code :-)
-
-class stupid_gzip_consumer:
-    def __init__(self): self.data = []
-    def feed(self, data): self.data.append(data)
-
-class stupid_gzip_wrapper(_response.closeable_response):
-    def __init__(self, response):
-        self._response = response
-
-        c = stupid_gzip_consumer()
-        gzc = GzipConsumer(c)
-        gzc.feed(response.read())
-        self.__data = StringIO("".join(c.data))
-
-    def read(self, size=-1):
-        return self.__data.read(size)
-    def readline(self, size=-1):
-        return self.__data.readline(size)
-    def readlines(self, sizehint=-1):
-        return self.__data.readlines(sizehint)
-
-    def __getattr__(self, name):
-        # delegate unknown methods/attributes
-        return getattr(self._response, name)
-
-class HTTPGzipProcessor(urllib2.BaseHandler):
-    handler_order = 200  # response processing before HTTPEquivProcessor
-
-    def http_request(self, request):
-        request.add_header("Accept-Encoding", "gzip")
-        return request
-
-    def http_response(self, request, response):
-        # post-process response
-        enc_hdrs = response.info().getheaders("Content-encoding")
-        for enc_hdr in enc_hdrs:
-            if ("gzip" in enc_hdr) or ("compress" in enc_hdr):
-                return stupid_gzip_wrapper(response)
-        return response
-
-    https_response = http_response
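GzipConsumer's hand-rolled header walk above predates a zlib shortcut: passing wbits = 16 + MAX_WBITS makes zlib parse and strip the gzip wrapper itself. A small self-contained sketch of that equivalent (illustrative only, not the deleted code path):

    import zlib

    def gunzip_body(data):
        # 16 + MAX_WBITS tells zlib to expect (and strip) a gzip header,
        # replacing the manual flag/offset arithmetic in GzipConsumer.feed
        return zlib.decompress(data, 16 + zlib.MAX_WBITS)

    co = zlib.compressobj(6, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    blob = co.compress(b"hello world") + co.flush()
    assert gunzip_body(blob) == b"hello world"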
diff --git a/src/calibre/utils/mechanize/_headersutil.py b/src/calibre/utils/mechanize/_headersutil.py deleted file mode 100644 index d8fe47a0e7..0000000000 --- a/src/calibre/utils/mechanize/_headersutil.py +++ /dev/null @@ -1,226 +0,0 @@
-"""Utility functions for HTTP header value parsing and construction.
-
-Copyright 1997-1998, Gisle Aas
-Copyright 2002-2006, John J. Lee
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD or ZPL 2.1 licenses (see the file
-COPYING.txt included with the distribution).
-
-"""
-
-import os, re
-from types import StringType
-from types import UnicodeType
-STRING_TYPES = StringType, UnicodeType
-
-from _util import http2time
-import _rfc3986
-
-def is_html(ct_headers, url, allow_xhtml=False):
-    """
-    ct_headers: Sequence of Content-Type headers
-    url: Response URL
-
-    """
-    if not ct_headers:
-        # guess
-        ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
-        html_exts = [".htm", ".html"]
-        if allow_xhtml:
-            html_exts += [".xhtml"]
-        return ext in html_exts
-    # use first header
-    ct = split_header_words(ct_headers)[0][0][0]
-    html_types = ["text/html"]
-    if allow_xhtml:
-        html_types += [
-            "text/xhtml", "text/xml",
-            "application/xml", "application/xhtml+xml",
-            ]
-    return ct in html_types
-
-def unmatched(match):
-    """Return unmatched part of re.Match object."""
-    start, end = match.span(0)
-    return match.string[:start]+match.string[end:]
-
-token_re = re.compile(r"^\s*([^=\s;,]+)")
-quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
-value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
-escape_re = re.compile(r"\\(.)")
-def split_header_words(header_values):
-    r"""Parse header values into a list of lists containing key,value pairs.
-
-    The function knows how to deal with ",", ";" and "=" as well as quoted
-    values after "=".  A list of space separated tokens is parsed as if they
-    were separated by ";".
-
-    If the header_values passed as argument contains multiple values, then they
-    are treated as if they were a single value separated by comma ",".
-
-    This means that this function is useful for parsing header fields that
-    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
-    the requirement for tokens).
-
-      headers       = #header
-      header        = (token | parameter) *( [";"] (token | parameter))
-
-      token         = 1*<any CHAR except CTLs or separators>
-      separators    = "(" | ")" | "<" | ">" | "@"
-                    | "," | ";" | ":" | "\" | <">
-                    | "/" | "[" | "]" | "?" | "="
-                    | "{" | "}" | SP | HT
-
-      quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
-      qdtext        = <any TEXT except <">>
-      quoted-pair   = "\" CHAR
-
-      parameter     = attribute "=" value
-      attribute     = token
-      value         = token | quoted-string
-
-    Each header is represented by a list of key/value pairs.  The value for a
-    simple token (not part of a parameter) is None.  Syntactically incorrect
-    headers will not necessarily be parsed as you would want.
-
-    This is easier to describe with some examples:
-
-    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
-    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
-    >>> split_header_words(['text/html; charset="iso-8859-1"'])
-    [[('text/html', None), ('charset', 'iso-8859-1')]]
-    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
-    [[('Basic', None), ('realm', '"foobar"')]]
-
-    """
-    assert type(header_values) not in STRING_TYPES
-    result = []
-    for text in header_values:
-        orig_text = text
-        pairs = []
-        while text:
-            m = token_re.search(text)
-            if m:
-                text = unmatched(m)
-                name = m.group(1)
-                m = quoted_value_re.search(text)
-                if m:  # quoted value
-                    text = unmatched(m)
-                    value = m.group(1)
-                    value = escape_re.sub(r"\1", value)
-                else:
-                    m = value_re.search(text)
-                    if m:  # unquoted value
-                        text = unmatched(m)
-                        value = m.group(1)
-                        value = value.rstrip()
-                    else:
-                        # no value, a lone token
-                        value = None
-                pairs.append((name, value))
-            elif text.lstrip().startswith(","):
-                # concatenated headers, as per RFC 2616 section 4.2
-                text = text.lstrip()[1:]
-                if pairs: result.append(pairs)
-                pairs = []
-            else:
-                # skip junk
-                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
-                assert nr_junk_chars > 0, (
-                    "split_header_words bug: '%s', '%s', %s" %
-                    (orig_text, text, pairs))
-                text = non_junk
-        if pairs: result.append(pairs)
-    return result
-
-join_escape_re = re.compile(r"([\"\\])")
-def join_header_words(lists):
-    """Do the inverse of the conversion done by split_header_words.
-
-    Takes a list of lists of (key, value) pairs and produces a single header
-    value.  Attribute values are quoted if needed.
-
-    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
-    'text/plain; charset="iso-8859/1"'
-    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
-    'text/plain, charset="iso-8859/1"'
-
-    """
-    headers = []
-    for pairs in lists:
-        attr = []
-        for k, v in pairs:
-            if v is not None:
-                if not re.search(r"^\w+$", v):
-                    v = join_escape_re.sub(r"\\\1", v)  # escape " and \
-                    v = '"%s"' % v
-                if k is None:  # Netscape cookies may have no name
-                    k = v
-                else:
-                    k = "%s=%s" % (k, v)
-            attr.append(k)
-        if attr: headers.append("; ".join(attr))
-    return ", ".join(headers)
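The two functions above are near-inverses; a quick round trip, reusing values from the doctests above (quoting is re-applied whenever a value is not a plain word token):

    pairs = split_header_words(['text/html; charset="iso-8859-1"'])
    # [[('text/html', None), ('charset', 'iso-8859-1')]]
    joined = join_header_words(pairs)
    assert joined == 'text/html; charset="iso-8859-1"'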
- - """ - known_attrs = ("expires", "domain", "path", "secure", - # RFC 2109 attrs (may turn up in Netscape cookies, too) - "port", "max-age") - - result = [] - for ns_header in ns_headers: - pairs = [] - version_set = False - params = re.split(r";\s*", ns_header) - for ii in range(len(params)): - param = params[ii] - param = param.rstrip() - if param == "": continue - if "=" not in param: - k, v = param, None - else: - k, v = re.split(r"\s*=\s*", param, 1) - k = k.lstrip() - if ii != 0: - lc = k.lower() - if lc in known_attrs: - k = lc - if k == "version": - # This is an RFC 2109 cookie. - version_set = True - if k == "expires": - # convert expires date to seconds since epoch - if v.startswith('"'): v = v[1:] - if v.endswith('"'): v = v[:-1] - v = http2time(v) # None if invalid - pairs.append((k, v)) - - if pairs: - if not version_set: - pairs.append(("version", "0")) - result.append(pairs) - - return result - - -def _test(): - import doctest, _headersutil - return doctest.testmod(_headersutil) - -if __name__ == "__main__": - _test() diff --git a/src/calibre/utils/mechanize/_html.py b/src/calibre/utils/mechanize/_html.py deleted file mode 100644 index 2d562c98bf..0000000000 --- a/src/calibre/utils/mechanize/_html.py +++ /dev/null @@ -1,607 +0,0 @@ -"""HTML handling. - -Copyright 2003-2006 John J. Lee - -This code is free software; you can redistribute it and/or modify it under -the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt -included with the distribution). - -""" - -import re, copy, htmlentitydefs -import sgmllib, HTMLParser, ClientForm - -import _request -from _headersutil import split_header_words, is_html as _is_html -import _rfc3986 - -DEFAULT_ENCODING = "latin-1" - - -# the base classe is purely for backwards compatibility -class ParseError(ClientForm.ParseError): pass - - -class CachingGeneratorFunction(object): - """Caching wrapper around a no-arguments iterable.""" - - def __init__(self, iterable): - self._cache = [] - # wrap iterable to make it non-restartable (otherwise, repeated - # __call__ would give incorrect results) - self._iterator = iter(iterable) - - def __call__(self): - cache = self._cache - for item in cache: - yield item - for item in self._iterator: - cache.append(item) - yield item - - -class EncodingFinder: - def __init__(self, default_encoding): - self._default_encoding = default_encoding - def encoding(self, response): - # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV - # headers may be in the response. HTTP-EQUIV headers come last, - # so try in order from first to last. 
- for ct in response.info().getheaders("content-type"): - for k, v in split_header_words([ct])[0]: - if k == "charset": - return v - return self._default_encoding - -class ResponseTypeFinder: - def __init__(self, allow_xhtml): - self._allow_xhtml = allow_xhtml - def is_html(self, response, encoding): - ct_hdrs = response.info().getheaders("content-type") - url = response.geturl() - # XXX encoding - return _is_html(ct_hdrs, url, self._allow_xhtml) - - -# idea for this argument-processing trick is from Peter Otten -class Args: - def __init__(self, args_map): - self.dictionary = dict(args_map) - def __getattr__(self, key): - try: - return self.dictionary[key] - except KeyError: - return getattr(self.__class__, key) - -def form_parser_args( - select_default=False, - form_parser_class=None, - request_class=None, - backwards_compat=False, - ): - return Args(locals()) - - -class Link: - def __init__(self, base_url, url, text, tag, attrs): - assert None not in [url, tag, attrs] - self.base_url = base_url - self.absolute_url = _rfc3986.urljoin(base_url, url) - self.url, self.text, self.tag, self.attrs = url, text, tag, attrs - def __cmp__(self, other): - try: - for name in "url", "text", "tag", "attrs": - if getattr(self, name) != getattr(other, name): - return -1 - except AttributeError: - return -1 - return 0 - def __repr__(self): - return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % ( - self.base_url, self.url, self.text, self.tag, self.attrs) - - -class LinksFactory: - - def __init__(self, - link_parser_class=None, - link_class=Link, - urltags=None, - ): - import _pullparser - if link_parser_class is None: - link_parser_class = _pullparser.TolerantPullParser - self.link_parser_class = link_parser_class - self.link_class = link_class - if urltags is None: - urltags = { - "a": "href", - "area": "href", - "frame": "src", - "iframe": "src", - } - self.urltags = urltags - self._response = None - self._encoding = None - - def set_response(self, response, base_url, encoding): - self._response = response - self._encoding = encoding - self._base_url = base_url - - def links(self): - """Return an iterator that provides links of the document.""" - response = self._response - encoding = self._encoding - base_url = self._base_url - p = self.link_parser_class(response, encoding=encoding) - - try: - for token in p.tags(*(self.urltags.keys()+["base"])): - if token.type == "endtag": - continue - if token.data == "base": - base_href = dict(token.attrs).get("href") - if base_href is not None: - base_url = base_href - continue - attrs = dict(token.attrs) - tag = token.data - name = attrs.get("name") - text = None - # XXX use attr_encoding for ref'd doc if that doc does not - # provide one by other means - #attr_encoding = attrs.get("charset") - url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL? - if not url: - # Probably an link or . - # For our purposes a link is something with a URL, so - # ignore this. - continue - - url = _rfc3986.clean_url(url, encoding) - if tag == "a": - if token.type != "startendtag": - # hmm, this'd break if end tag is missing - text = p.get_compressed_text(("endtag", tag)) - # but this doesn't work for eg. - # Andy - #text = p.get_compressed_text() - - yield Link(base_url, url, text, tag, token.attrs) - except sgmllib.SGMLParseError, exc: - raise ParseError(exc) - -class FormsFactory: - - """Makes a sequence of objects satisfying ClientForm.HTMLForm interface. 
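The Link objects that LinksFactory yields are what surfaces through the public Browser API. A minimal sketch (Python 2; the URL is a placeholder):

    import mechanize

    br = mechanize.Browser()
    br.open('http://example.com/')  # placeholder URL
    for link in br.links():
        # absolute_url is base_url joined with the raw href/src value
        print link.tag, link.absolute_url, repr(link.text)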
- - After calling .forms(), the .global_form attribute is a form object - containing all controls not a descendant of any FORM element. - - For constructor argument docs, see ClientForm.ParseResponse - argument docs. - - """ - - def __init__(self, - select_default=False, - form_parser_class=None, - request_class=None, - backwards_compat=False, - ): - import ClientForm - self.select_default = select_default - if form_parser_class is None: - form_parser_class = ClientForm.FormParser - self.form_parser_class = form_parser_class - if request_class is None: - request_class = _request.Request - self.request_class = request_class - self.backwards_compat = backwards_compat - self._response = None - self.encoding = None - self.global_form = None - - def set_response(self, response, encoding): - self._response = response - self.encoding = encoding - self.global_form = None - - def forms(self): - import ClientForm - encoding = self.encoding - try: - forms = ClientForm.ParseResponseEx( - self._response, - select_default=self.select_default, - form_parser_class=self.form_parser_class, - request_class=self.request_class, - encoding=encoding, - _urljoin=_rfc3986.urljoin, - _urlparse=_rfc3986.urlsplit, - _urlunparse=_rfc3986.urlunsplit, - ) - except ClientForm.ParseError, exc: - raise ParseError(exc) - self.global_form = forms[0] - return forms[1:] - -class TitleFactory: - def __init__(self): - self._response = self._encoding = None - - def set_response(self, response, encoding): - self._response = response - self._encoding = encoding - - def title(self): - import _pullparser - p = _pullparser.TolerantPullParser( - self._response, encoding=self._encoding) - try: - try: - p.get_tag("title") - except _pullparser.NoMoreTokensError: - return None - else: - return p.get_text() - except sgmllib.SGMLParseError, exc: - raise ParseError(exc) - - -def unescape(data, entities, encoding): - if data is None or "&" not in data: - return data - - def replace_entities(match): - ent = match.group() - if ent[1] == "#": - return unescape_charref(ent[2:-1], encoding) - - repl = entities.get(ent[1:-1]) - if repl is not None: - repl = unichr(repl) - if type(repl) != type(""): - try: - repl = repl.encode(encoding) - except UnicodeError: - repl = ent - else: - repl = ent - return repl - - return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data) - -def unescape_charref(data, encoding): - name, base = data, 10 - if name.startswith("x"): - name, base= name[1:], 16 - uc = unichr(int(name, base)) - if encoding is None: - return uc - else: - try: - repl = uc.encode(encoding) - except UnicodeError: - repl = "&#%s;" % data - return repl - - -# bizarre import gymnastics for bundled BeautifulSoup -import _beautifulsoup -import ClientForm -RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes( - _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup - ) -# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-( -import sgmllib -sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]") - -class MechanizeBs(_beautifulsoup.BeautifulSoup): - _entitydefs = htmlentitydefs.name2codepoint - # don't want the magic Microsoft-char workaround - PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'), - lambda(x):x.group(1) + ' />'), - (re.compile(']*)>'), - lambda(x):'') - ] - - def __init__(self, encoding, text=None, avoidParserProblems=True, - initialTextIsEverything=True): - self._encoding = encoding - _beautifulsoup.BeautifulSoup.__init__( - self, text, avoidParserProblems, initialTextIsEverything) - - 
def handle_charref(self, ref): - t = unescape("&#%s;"%ref, self._entitydefs, self._encoding) - self.handle_data(t) - def handle_entityref(self, ref): - t = unescape("&%s;"%ref, self._entitydefs, self._encoding) - self.handle_data(t) - def unescape_attrs(self, attrs): - escaped_attrs = [] - for key, val in attrs: - val = unescape(val, self._entitydefs, self._encoding) - escaped_attrs.append((key, val)) - return escaped_attrs - -class RobustLinksFactory: - - compress_re = re.compile(r"\s+") - - def __init__(self, - link_parser_class=None, - link_class=Link, - urltags=None, - ): - import _beautifulsoup - if link_parser_class is None: - link_parser_class = MechanizeBs - self.link_parser_class = link_parser_class - self.link_class = link_class - if urltags is None: - urltags = { - "a": "href", - "area": "href", - "frame": "src", - "iframe": "src", - } - self.urltags = urltags - self._bs = None - self._encoding = None - self._base_url = None - - def set_soup(self, soup, base_url, encoding): - self._bs = soup - self._base_url = base_url - self._encoding = encoding - - def links(self): - import _beautifulsoup - bs = self._bs - base_url = self._base_url - encoding = self._encoding - gen = bs.recursiveChildGenerator() - for ch in bs.recursiveChildGenerator(): - if (isinstance(ch, _beautifulsoup.Tag) and - ch.name in self.urltags.keys()+["base"]): - link = ch - attrs = bs.unescape_attrs(link.attrs) - attrs_dict = dict(attrs) - if link.name == "base": - base_href = attrs_dict.get("href") - if base_href is not None: - base_url = base_href - continue - url_attr = self.urltags[link.name] - url = attrs_dict.get(url_attr) - if not url: - continue - url = _rfc3986.clean_url(url, encoding) - text = link.firstText(lambda t: True) - if text is _beautifulsoup.Null: - # follow _pullparser's weird behaviour rigidly - if link.name == "a": - text = "" - else: - text = None - else: - text = self.compress_re.sub(" ", text.strip()) - yield Link(base_url, url, text, link.name, attrs) - - -class RobustFormsFactory(FormsFactory): - def __init__(self, *args, **kwds): - import ClientForm - args = form_parser_args(*args, **kwds) - if args.form_parser_class is None: - args.form_parser_class = RobustFormParser - FormsFactory.__init__(self, **args.dictionary) - - def set_response(self, response, encoding): - self._response = response - self.encoding = encoding - - -class RobustTitleFactory: - def __init__(self): - self._bs = self._encoding = None - - def set_soup(self, soup, encoding): - self._bs = soup - self._encoding = encoding - - def title(self): - import _beautifulsoup - title = self._bs.first("title") - if title == _beautifulsoup.Null: - return None - else: - return title.firstText(lambda t: True) - - -class Factory: - """Factory for forms, links, etc. - - This interface may expand in future. - - Public methods: - - set_request_class(request_class) - set_response(response) - forms() - links() - - Public attributes: - - Note that accessing these attributes may raise ParseError. - - encoding: string specifying the encoding of response if it contains a text - document (this value is left unspecified for documents that do not have - an encoding, e.g. 
an image file) - is_html: true if response contains an HTML document (XHTML may be - regarded as HTML too) - title: page title, or None if no title or not HTML - global_form: form object containing all controls that are not descendants - of any FORM element, or None if the forms_factory does not support - supplying a global form - - """ - - LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"] - - def __init__(self, forms_factory, links_factory, title_factory, - encoding_finder=EncodingFinder(DEFAULT_ENCODING), - response_type_finder=ResponseTypeFinder(allow_xhtml=False), - ): - """ - - Pass keyword arguments only. - - default_encoding: character encoding to use if encoding cannot be - determined (or guessed) from the response. You should turn on - HTTP-EQUIV handling if you want the best chance of getting this right - without resorting to this default. The default value of this - parameter (currently latin-1) may change in future. - - """ - self._forms_factory = forms_factory - self._links_factory = links_factory - self._title_factory = title_factory - self._encoding_finder = encoding_finder - self._response_type_finder = response_type_finder - - self.set_response(None) - - def set_request_class(self, request_class): - """Set urllib2.Request class. - - ClientForm.HTMLForm instances returned by .forms() will return - instances of this class when .click()ed. - - """ - self._forms_factory.request_class = request_class - - def set_response(self, response): - """Set response. - - The response must either be None or implement the same interface as - objects returned by urllib2.urlopen(). - - """ - self._response = response - self._forms_genf = self._links_genf = None - self._get_title = None - for name in self.LAZY_ATTRS: - try: - delattr(self, name) - except AttributeError: - pass - - def __getattr__(self, name): - if name not in self.LAZY_ATTRS: - return getattr(self.__class__, name) - - if name == "encoding": - self.encoding = self._encoding_finder.encoding( - copy.copy(self._response)) - return self.encoding - elif name == "is_html": - self.is_html = self._response_type_finder.is_html( - copy.copy(self._response), self.encoding) - return self.is_html - elif name == "title": - if self.is_html: - self.title = self._title_factory.title() - else: - self.title = None - return self.title - elif name == "global_form": - self.forms() - return self.global_form - - def forms(self): - """Return iterable over ClientForm.HTMLForm-like objects. - - Raises mechanize.ParseError on failure. - """ - # this implementation sets .global_form as a side-effect, for benefit - # of __getattr__ impl - if self._forms_genf is None: - try: - self._forms_genf = CachingGeneratorFunction( - self._forms_factory.forms()) - except: # XXXX define exception! - self.set_response(self._response) - raise - self.global_form = getattr( - self._forms_factory, "global_form", None) - return self._forms_genf() - - def links(self): - """Return iterable over mechanize.Link-like objects. - - Raises mechanize.ParseError on failure. - """ - if self._links_genf is None: - try: - self._links_genf = CachingGeneratorFunction( - self._links_factory.links()) - except: # XXXX define exception! 
- self.set_response(self._response) - raise - return self._links_genf() - -class DefaultFactory(Factory): - """Based on sgmllib.""" - def __init__(self, i_want_broken_xhtml_support=False): - Factory.__init__( - self, - forms_factory=FormsFactory(), - links_factory=LinksFactory(), - title_factory=TitleFactory(), - response_type_finder=ResponseTypeFinder( - allow_xhtml=i_want_broken_xhtml_support), - ) - - def set_response(self, response): - Factory.set_response(self, response) - if response is not None: - self._forms_factory.set_response( - copy.copy(response), self.encoding) - self._links_factory.set_response( - copy.copy(response), response.geturl(), self.encoding) - self._title_factory.set_response( - copy.copy(response), self.encoding) - -class RobustFactory(Factory): - """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is - DefaultFactory. - - """ - def __init__(self, i_want_broken_xhtml_support=False, - soup_class=None): - Factory.__init__( - self, - forms_factory=RobustFormsFactory(), - links_factory=RobustLinksFactory(), - title_factory=RobustTitleFactory(), - response_type_finder=ResponseTypeFinder( - allow_xhtml=i_want_broken_xhtml_support), - ) - if soup_class is None: - soup_class = MechanizeBs - self._soup_class = soup_class - - def set_response(self, response): - import _beautifulsoup - Factory.set_response(self, response) - if response is not None: - data = response.read() - soup = self._soup_class(self.encoding, data) - self._forms_factory.set_response( - copy.copy(response), self.encoding) - self._links_factory.set_soup( - soup, response.geturl(), self.encoding) - self._title_factory.set_soup(soup, self.encoding) diff --git a/src/calibre/utils/mechanize/_http.py b/src/calibre/utils/mechanize/_http.py deleted file mode 100644 index d73f3f44e5..0000000000 --- a/src/calibre/utils/mechanize/_http.py +++ /dev/null @@ -1,729 +0,0 @@ -"""HTTP related handlers. - -Note that some other HTTP handlers live in more specific modules: _auth.py, -_gzip.py, etc. - - -Copyright 2002-2006 John J Lee - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD or ZPL 2.1 licenses (see the file -COPYING.txt included with the distribution). - -""" - -import copy, time, tempfile, htmlentitydefs, re, logging, socket, \ - urllib2, urllib, httplib, sgmllib -from urllib2 import URLError, HTTPError, BaseHandler -from cStringIO import StringIO - -from _request import Request -from _util import isstringlike -from _response import closeable_response, response_seek_wrapper -from _html import unescape, unescape_charref -from _headersutil import is_html -from _clientcookie import CookieJar, request_host -import _rfc3986 - -debug = logging.getLogger("mechanize").debug - -# monkeypatch urllib2.HTTPError to show URL -## def urllib2_str(self): -## return 'HTTP Error %s: %s (%s)' % ( -## self.code, self.msg, self.geturl()) -## urllib2.HTTPError.__str__ = urllib2_str - - -CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes -DEFAULT_ENCODING = 'latin-1' - - -# This adds "refresh" to the list of redirectables and provides a redirection -# algorithm that doesn't go into a loop in the presence of cookies -# (Python 2.4 has this new algorithm, 2.3 doesn't). 
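DefaultFactory and RobustFactory remain available from the external mechanize package, so swapping in the BeautifulSoup-backed parser for tag-soup pages stays a one-liner. A minimal sketch (Python 2; placeholder URL):

    import mechanize

    # RobustFactory trades speed for tolerance of malformed HTML
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.open('http://example.com/tag-soup.html')  # placeholder URL
    print br.title()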
-class HTTPRedirectHandler(BaseHandler): - # maximum number of redirections to any single URL - # this is needed because of the state that cookies introduce - max_repeats = 4 - # maximum total number of redirections (regardless of URL) before - # assuming we're in a loop - max_redirections = 10 - - # Implementation notes: - - # To avoid the server sending us into an infinite loop, the request - # object needs to track what URLs we have already seen. Do this by - # adding a handler-specific attribute to the Request object. The value - # of the dict is used to count the number of times the same URL has - # been visited. This is needed because visiting the same URL twice - # does not necessarily imply a loop, thanks to state introduced by - # cookies. - - # Always unhandled redirection codes: - # 300 Multiple Choices: should not handle this here. - # 304 Not Modified: no need to handle here: only of interest to caches - # that do conditional GETs - # 305 Use Proxy: probably not worth dealing with here - # 306 Unused: what was this for in the previous versions of protocol?? - - def redirect_request(self, newurl, req, fp, code, msg, headers): - """Return a Request or None in response to a redirect. - - This is called by the http_error_30x methods when a redirection - response is received. If a redirection should take place, return a - new Request to allow http_error_30x to perform the redirect; - otherwise, return None to indicate that an HTTPError should be - raised. - - """ - if code in (301, 302, 303, "refresh") or \ - (code == 307 and not req.has_data()): - # Strictly (according to RFC 2616), 301 or 302 in response to - # a POST MUST NOT cause a redirection without confirmation - # from the user (of urllib2, in this case). In practice, - # essentially all clients do redirect in this case, so we do - # the same. - # XXX really refresh redirections should be visiting; tricky to - # fix, so this will wait until post-stable release - new = Request(newurl, - headers=req.headers, - origin_req_host=req.get_origin_req_host(), - unverifiable=True, - visit=False, - ) - new._origin_req = getattr(req, "_origin_req", req) - return new - else: - raise HTTPError(req.get_full_url(), code, msg, headers, fp) - - def http_error_302(self, req, fp, code, msg, headers): - # Some servers (incorrectly) return multiple Location headers - # (so probably same goes for URI). Use first header. - if headers.has_key('location'): - newurl = headers.getheaders('location')[0] - elif headers.has_key('uri'): - newurl = headers.getheaders('uri')[0] - else: - return - newurl = _rfc3986.clean_url(newurl, "latin-1") - newurl = _rfc3986.urljoin(req.get_full_url(), newurl) - - # XXX Probably want to forget about the state of the current - # request, although that might interact poorly with other - # handlers that also use handler-specific request attributes - new = self.redirect_request(newurl, req, fp, code, msg, headers) - if new is None: - return - - # loop detection - # .redirect_dict has a key url if url was previously visited. - if hasattr(req, 'redirect_dict'): - visited = new.redirect_dict = req.redirect_dict - if (visited.get(newurl, 0) >= self.max_repeats or - len(visited) >= self.max_redirections): - raise HTTPError(req.get_full_url(), code, - self.inf_msg + msg, headers, fp) - else: - visited = new.redirect_dict = req.redirect_dict = {} - visited[newurl] = visited.get(newurl, 0) + 1 - - # Don't close the fp until we are sure that we won't use it - # with HTTPError. 
- fp.read() - fp.close() - - return self.parent.open(new) - - http_error_301 = http_error_303 = http_error_307 = http_error_302 - http_error_refresh = http_error_302 - - inf_msg = "The HTTP server returned a redirect error that would " \ - "lead to an infinite loop.\n" \ - "The last 30x error message was:\n" - - -# XXX would self.reset() work, instead of raising this exception? -class EndOfHeadError(Exception): pass -class AbstractHeadParser: - # only these elements are allowed in or before HEAD of document - head_elems = ("html", "head", - "title", "base", - "script", "style", "meta", "link", "object") - _entitydefs = htmlentitydefs.name2codepoint - _encoding = DEFAULT_ENCODING - - def __init__(self): - self.http_equiv = [] - - def start_meta(self, attrs): - http_equiv = content = None - for key, value in attrs: - if key == "http-equiv": - http_equiv = self.unescape_attr_if_required(value) - elif key == "content": - content = self.unescape_attr_if_required(value) - if http_equiv is not None and content is not None: - self.http_equiv.append((http_equiv, content)) - - def end_head(self): - raise EndOfHeadError() - - def handle_entityref(self, name): - #debug("%s", name) - self.handle_data(unescape( - '&%s;' % name, self._entitydefs, self._encoding)) - - def handle_charref(self, name): - #debug("%s", name) - self.handle_data(unescape_charref(name, self._encoding)) - - def unescape_attr(self, name): - #debug("%s", name) - return unescape(name, self._entitydefs, self._encoding) - - def unescape_attrs(self, attrs): - #debug("%s", attrs) - escaped_attrs = {} - for key, val in attrs.items(): - escaped_attrs[key] = self.unescape_attr(val) - return escaped_attrs - - def unknown_entityref(self, ref): - self.handle_data("&%s;" % ref) - - def unknown_charref(self, ref): - self.handle_data("&#%s;" % ref) - - -try: - import HTMLParser -except ImportError: - pass -else: - class XHTMLCompatibleHeadParser(AbstractHeadParser, - HTMLParser.HTMLParser): - def __init__(self): - HTMLParser.HTMLParser.__init__(self) - AbstractHeadParser.__init__(self) - - def handle_starttag(self, tag, attrs): - if tag not in self.head_elems: - raise EndOfHeadError() - try: - method = getattr(self, 'start_' + tag) - except AttributeError: - try: - method = getattr(self, 'do_' + tag) - except AttributeError: - pass # unknown tag - else: - method(attrs) - else: - method(attrs) - - def handle_endtag(self, tag): - if tag not in self.head_elems: - raise EndOfHeadError() - try: - method = getattr(self, 'end_' + tag) - except AttributeError: - pass # unknown tag - else: - method() - - def unescape(self, name): - # Use the entitydefs passed into constructor, not - # HTMLParser.HTMLParser's entitydefs. 
- return self.unescape_attr(name) - - def unescape_attr_if_required(self, name): - return name # HTMLParser.HTMLParser already did it - -class HeadParser(AbstractHeadParser, sgmllib.SGMLParser): - - def _not_called(self): - assert False - - def __init__(self): - sgmllib.SGMLParser.__init__(self) - AbstractHeadParser.__init__(self) - - def handle_starttag(self, tag, method, attrs): - if tag not in self.head_elems: - raise EndOfHeadError() - if tag == "meta": - method(attrs) - - def unknown_starttag(self, tag, attrs): - self.handle_starttag(tag, self._not_called, attrs) - - def handle_endtag(self, tag, method): - if tag in self.head_elems: - method() - else: - raise EndOfHeadError() - - def unescape_attr_if_required(self, name): - return self.unescape_attr(name) - -def parse_head(fileobj, parser): - """Return a list of key, value pairs.""" - while 1: - data = fileobj.read(CHUNK) - try: - parser.feed(data) - except EndOfHeadError: - break - if len(data) != CHUNK: - # this should only happen if there is no HTML body, or if - # CHUNK is big - break - return parser.http_equiv - -class HTTPEquivProcessor(BaseHandler): - """Append META HTTP-EQUIV headers to regular HTTP headers.""" - - handler_order = 300 # before handlers that look at HTTP headers - - def __init__(self, head_parser_class=HeadParser, - i_want_broken_xhtml_support=False, - ): - self.head_parser_class = head_parser_class - self._allow_xhtml = i_want_broken_xhtml_support - - def http_response(self, request, response): - if not hasattr(response, "seek"): - response = response_seek_wrapper(response) - http_message = response.info() - url = response.geturl() - ct_hdrs = http_message.getheaders("content-type") - if is_html(ct_hdrs, url, self._allow_xhtml): - try: - try: - html_headers = parse_head(response, self.head_parser_class()) - finally: - response.seek(0) - except (HTMLParser.HTMLParseError, - sgmllib.SGMLParseError): - pass - else: - for hdr, val in html_headers: - # add a header - http_message.dict[hdr.lower()] = val - text = hdr + ": " + val - for line in text.split("\n"): - http_message.headers.append(line + "\n") - return response - - https_response = http_response - -class HTTPCookieProcessor(BaseHandler): - """Handle HTTP cookies. 
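HTTPEquivProcessor is toggled per-browser through the mechanize API; with it enabled, META HTTP-EQUIV pseudo-headers show up in response.info(). A minimal sketch (Python 2; placeholder URL):

    import mechanize

    br = mechanize.Browser()
    br.set_handle_equiv(True)  # on by default in mechanize
    response = br.open('http://example.com/')  # placeholder URL
    # any <meta http-equiv="Content-Type" ...> is appended here
    print response.info().getheaders('content-type')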
- - Public attributes: - - cookiejar: CookieJar instance - - """ - def __init__(self, cookiejar=None): - if cookiejar is None: - cookiejar = CookieJar() - self.cookiejar = cookiejar - - def http_request(self, request): - self.cookiejar.add_cookie_header(request) - return request - - def http_response(self, request, response): - self.cookiejar.extract_cookies(response, request) - return response - - https_request = http_request - https_response = http_response - -try: - import robotparser -except ImportError: - pass -else: - class MechanizeRobotFileParser(robotparser.RobotFileParser): - - def __init__(self, url='', opener=None): - import _opener - robotparser.RobotFileParser.__init__(self, url) - self._opener = opener - - def set_opener(self, opener=None): - if opener is None: - opener = _opener.OpenerDirector() - self._opener = opener - - def read(self): - """Reads the robots.txt URL and feeds it to the parser.""" - if self._opener is None: - self.set_opener() - req = Request(self.url, unverifiable=True, visit=False) - try: - f = self._opener.open(req) - except HTTPError, f: - pass - except (IOError, socket.error, OSError), exc: - robotparser._debug("ignoring error opening %r: %s" % - (self.url, exc)) - return - lines = [] - line = f.readline() - while line: - lines.append(line.strip()) - line = f.readline() - status = f.code - if status == 401 or status == 403: - self.disallow_all = True - robotparser._debug("disallow all") - elif status >= 400: - self.allow_all = True - robotparser._debug("allow all") - elif status == 200 and lines: - robotparser._debug("parse lines") - self.parse(lines) - - class RobotExclusionError(urllib2.HTTPError): - def __init__(self, request, *args): - apply(urllib2.HTTPError.__init__, (self,)+args) - self.request = request - - class HTTPRobotRulesProcessor(BaseHandler): - # before redirections, after everything else - handler_order = 800 - - try: - from httplib import HTTPMessage - except: - from mimetools import Message - http_response_class = Message - else: - http_response_class = HTTPMessage - - def __init__(self, rfp_class=MechanizeRobotFileParser): - self.rfp_class = rfp_class - self.rfp = None - self._host = None - - def http_request(self, request): - scheme = request.get_type() - if scheme not in ["http", "https"]: - # robots exclusion only applies to HTTP - return request - - if request.get_selector() == "/robots.txt": - # /robots.txt is always OK to fetch - return request - - host = request.get_host() - - # robots.txt requests don't need to be allowed by robots.txt :-) - origin_req = getattr(request, "_origin_req", None) - if (origin_req is not None and - origin_req.get_selector() == "/robots.txt" and - origin_req.get_host() == host - ): - return request - - if host != self._host: - self.rfp = self.rfp_class() - try: - self.rfp.set_opener(self.parent) - except AttributeError: - debug("%r instance does not support set_opener" % - self.rfp.__class__) - self.rfp.set_url(scheme+"://"+host+"/robots.txt") - self.rfp.read() - self._host = host - - ua = request.get_header("User-agent", "") - if self.rfp.can_fetch(ua, request.get_full_url()): - return request - else: - # XXX This should really have raised URLError. Too late now... - msg = "request disallowed by robots.txt" - raise RobotExclusionError( - request, - request.get_full_url(), - 403, msg, - self.http_response_class(StringIO()), StringIO(msg)) - - https_request = http_request - -class HTTPRefererProcessor(BaseHandler): - """Add Referer header to requests. 
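The robots.txt machinery above is likewise a per-browser switch; a fetcher that must ignore robots.txt (at its own risk) simply turns the handler off. A minimal sketch (Python 2; placeholder URL):

    import mechanize

    br = mechanize.Browser()
    br.set_handle_robots(False)  # skip the /robots.txt fetch and check
    br.open('http://example.com/page')  # placeholder URL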
- - This only makes sense if you use each RefererProcessor for a single - chain of requests only (so, for example, if you use a single - HTTPRefererProcessor to fetch a series of URLs extracted from a single - page, this will break). - - There's a proper implementation of this in mechanize.Browser. - - """ - def __init__(self): - self.referer = None - - def http_request(self, request): - if ((self.referer is not None) and - not request.has_header("Referer")): - request.add_unredirected_header("Referer", self.referer) - return request - - def http_response(self, request, response): - self.referer = response.geturl() - return response - - https_request = http_request - https_response = http_response - - -def clean_refresh_url(url): - # e.g. Firefox 1.5 does (something like) this - if ((url.startswith('"') and url.endswith('"')) or - (url.startswith("'") and url.endswith("'"))): - url = url[1:-1] - return _rfc3986.clean_url(url, "latin-1") # XXX encoding - -def parse_refresh_header(refresh): - """ - >>> parse_refresh_header("1; url=http://example.com/") - (1.0, 'http://example.com/') - >>> parse_refresh_header("1; url='http://example.com/'") - (1.0, 'http://example.com/') - >>> parse_refresh_header("1") - (1.0, None) - >>> parse_refresh_header("blah") - Traceback (most recent call last): - ValueError: invalid literal for float(): blah - - """ - - ii = refresh.find(";") - if ii != -1: - pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:] - jj = newurl_spec.find("=") - key = None - if jj != -1: - key, newurl = newurl_spec[:jj], newurl_spec[jj+1:] - newurl = clean_refresh_url(newurl) - if key is None or key.strip().lower() != "url": - raise ValueError() - else: - pause, newurl = float(refresh), None - return pause, newurl - -class HTTPRefreshProcessor(BaseHandler): - """Perform HTTP Refresh redirections. - - Note that if a non-200 HTTP code has occurred (for example, a 30x - redirect), this processor will do nothing. - - By default, only zero-time Refresh headers are redirected. Use the - max_time attribute / constructor argument to allow Refresh with longer - pauses. Use the honor_time attribute / constructor argument to control - whether the requested pause is honoured (with a time.sleep()) or - skipped in favour of immediate redirection. - - Public attributes: - - max_time: see above - honor_time: see above - - """ - handler_order = 1000 - - def __init__(self, max_time=0, honor_time=True): - self.max_time = max_time - self.honor_time = honor_time - - def http_response(self, request, response): - code, msg, hdrs = response.code, response.msg, response.info() - - if code == 200 and hdrs.has_key("refresh"): - refresh = hdrs.getheaders("refresh")[0] - try: - pause, newurl = parse_refresh_header(refresh) - except ValueError: - debug("bad Refresh header: %r" % refresh) - return response - if newurl is None: - newurl = response.geturl() - if (self.max_time is None) or (pause <= self.max_time): - if pause > 1E-3 and self.honor_time: - time.sleep(pause) - hdrs["location"] = newurl - # hardcoded http is NOT a bug - response = self.parent.error( - "http", request, response, - "refresh", msg, hdrs) - - return response - - https_response = http_response - -class HTTPErrorProcessor(BaseHandler): - """Process HTTP error responses. - - The purpose of this handler is to to allow other response processors a - look-in by removing the call to parent.error() from - AbstractHTTPHandler. 
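HTTPRefreshProcessor's max_time/honor_time knobs are exposed on the browser. A minimal sketch of following slow Refresh headers without actually sleeping (Python 2; placeholder URL):

    import mechanize

    br = mechanize.Browser()
    # follow Refresh pauses of up to 5 seconds, but skip the time.sleep()
    br.set_handle_refresh(True, max_time=5, honor_time=False)
    br.open('http://example.com/refreshing-page')  # placeholder URL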
- - For non-200 error codes, this just passes the job on to the - Handler._error_ methods, via the OpenerDirector.error - method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an - HTTPError if no other handler handles the error. - - """ - handler_order = 1000 # after all other processors - - def http_response(self, request, response): - code, msg, hdrs = response.code, response.msg, response.info() - - if code != 200: - # hardcoded http is NOT a bug - response = self.parent.error( - "http", request, response, code, msg, hdrs) - - return response - - https_response = http_response - - -class HTTPDefaultErrorHandler(BaseHandler): - def http_error_default(self, req, fp, code, msg, hdrs): - # why these error methods took the code, msg, headers args in the first - # place rather than a response object, I don't know, but to avoid - # multiple wrapping, we're discarding them - - if isinstance(fp, urllib2.HTTPError): - response = fp - else: - response = urllib2.HTTPError( - req.get_full_url(), code, msg, hdrs, fp) - assert code == response.code - assert msg == response.msg - assert hdrs == response.hdrs - raise response - - -class AbstractHTTPHandler(BaseHandler): - - def __init__(self, debuglevel=0): - self._debuglevel = debuglevel - - def set_http_debuglevel(self, level): - self._debuglevel = level - - def do_request_(self, request): - host = request.get_host() - if not host: - raise URLError('no host given') - - if request.has_data(): # POST - data = request.get_data() - if not request.has_header('Content-type'): - request.add_unredirected_header( - 'Content-type', - 'application/x-www-form-urlencoded') - - scheme, sel = urllib.splittype(request.get_selector()) - sel_host, sel_path = urllib.splithost(sel) - if not request.has_header('Host'): - request.add_unredirected_header('Host', sel_host or host) - for name, value in self.parent.addheaders: - name = name.capitalize() - if not request.has_header(name): - request.add_unredirected_header(name, value) - - return request - - def do_open(self, http_class, req): - """Return an addinfourl object for the request, using http_class. - - http_class must implement the HTTPConnection API from httplib. - The addinfourl return value is a file-like object. It also - has methods and attributes including: - - info(): return a mimetools.Message object for the headers - - geturl(): return the original request URL - - code: HTTP status code - """ - host = req.get_host() - if not host: - raise URLError('no host given') - - h = http_class(host) # will parse host:port - h.set_debuglevel(self._debuglevel) - - headers = dict(req.headers) - headers.update(req.unredirected_hdrs) - # We want to make an HTTP/1.1 request, but the addinfourl - # class isn't prepared to deal with a persistent connection. - # It will try to read all remaining data from the socket, - # which will block while the server waits for the next request. - # So make sure the connection gets closed after the (only) - # request. - headers["Connection"] = "close" - headers = dict( - [(name.title(), val) for name, val in headers.items()]) - try: - h.request(req.get_method(), req.get_selector(), req.data, headers) - r = h.getresponse() - except socket.error, err: # XXX what error? - raise URLError(err) - - # Pick apart the HTTPResponse object to get the addinfourl - # object initialized properly. - - # Wrap the HTTPResponse object in socket's file object adapter - # for Windows. That adapter calls recv(), so delegate recv() - # to read(). 
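Because HTTPErrorProcessor routes every non-200 response through the error chain, Browser.open() raises on HTTP errors, and the exception is itself a response object. A minimal sketch (Python 2; placeholder URL):

    import urllib2
    import mechanize

    br = mechanize.Browser()
    try:
        br.open('http://example.com/missing')  # placeholder URL
    except urllib2.HTTPError, e:
        print e.code, e.geturl()
        body = e.read()  # the error page body is still readable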
This weird wrapping allows the returned object to - # have readline() and readlines() methods. - - # XXX It might be better to extract the read buffering code - # out of socket._fileobject() and into a base class. - - r.recv = r.read - fp = socket._fileobject(r) - - resp = closeable_response(fp, r.msg, req.get_full_url(), - r.status, r.reason) - return resp - - -class HTTPHandler(AbstractHTTPHandler): - def http_open(self, req): - return self.do_open(httplib.HTTPConnection, req) - - http_request = AbstractHTTPHandler.do_request_ - -if hasattr(httplib, 'HTTPS'): - - class HTTPSConnectionFactory: - def __init__(self, key_file, cert_file): - self._key_file = key_file - self._cert_file = cert_file - def __call__(self, hostport): - return httplib.HTTPSConnection( - hostport, - key_file=self._key_file, cert_file=self._cert_file) - - class HTTPSHandler(AbstractHTTPHandler): - def __init__(self, client_cert_manager=None): - AbstractHTTPHandler.__init__(self) - self.client_cert_manager = client_cert_manager - - def https_open(self, req): - if self.client_cert_manager is not None: - key_file, cert_file = self.client_cert_manager.find_key_cert( - req.get_full_url()) - conn_factory = HTTPSConnectionFactory(key_file, cert_file) - else: - conn_factory = httplib.HTTPSConnection - return self.do_open(conn_factory, req) - - https_request = AbstractHTTPHandler.do_request_ diff --git a/src/calibre/utils/mechanize/_lwpcookiejar.py b/src/calibre/utils/mechanize/_lwpcookiejar.py deleted file mode 100644 index f8d49cf2d4..0000000000 --- a/src/calibre/utils/mechanize/_lwpcookiejar.py +++ /dev/null @@ -1,185 +0,0 @@ -"""Load / save to libwww-perl (LWP) format files. - -Actually, the format is slightly extended from that used by LWP's -(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information -not recorded by LWP. - -It uses the version string "2.0", though really there isn't an LWP Cookies -2.0 format. This indicates that there is extra information in here -(domain_dot and port_spec) while still being compatible with libwww-perl, -I hope. - -Copyright 2002-2006 John J Lee -Copyright 1997-1999 Gisle Aas (original libwww-perl code) - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD or ZPL 2.1 licenses (see the file -COPYING.txt included with the distribution). - -""" - -import time, re, logging - -from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ - MISSING_FILENAME_TEXT, LoadError -from _headersutil import join_header_words, split_header_words -from _util import iso2time, time2isoz - -debug = logging.getLogger("mechanize").debug - - -def lwp_cookie_str(cookie): - """Return string representation of Cookie in an the LWP cookie file format. - - Actually, the format is extended a bit -- see module docstring. 
- - """ - h = [(cookie.name, cookie.value), - ("path", cookie.path), - ("domain", cookie.domain)] - if cookie.port is not None: h.append(("port", cookie.port)) - if cookie.path_specified: h.append(("path_spec", None)) - if cookie.port_specified: h.append(("port_spec", None)) - if cookie.domain_initial_dot: h.append(("domain_dot", None)) - if cookie.secure: h.append(("secure", None)) - if cookie.expires: h.append(("expires", - time2isoz(float(cookie.expires)))) - if cookie.discard: h.append(("discard", None)) - if cookie.comment: h.append(("comment", cookie.comment)) - if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) - if cookie.rfc2109: h.append(("rfc2109", None)) - - keys = cookie.nonstandard_attr_keys() - keys.sort() - for k in keys: - h.append((k, str(cookie.get_nonstandard_attr(k)))) - - h.append(("version", str(cookie.version))) - - return join_header_words([h]) - -class LWPCookieJar(FileCookieJar): - """ - The LWPCookieJar saves a sequence of"Set-Cookie3" lines. - "Set-Cookie3" is the format used by the libwww-perl libary, not known - to be compatible with any browser, but which is easy to read and - doesn't lose information about RFC 2965 cookies. - - Additional methods - - as_lwp_str(ignore_discard=True, ignore_expired=True) - - """ - - magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" - - def as_lwp_str(self, ignore_discard=True, ignore_expires=True): - """Return cookies as a string of "\n"-separated "Set-Cookie3" headers. - - ignore_discard and ignore_expires: see docstring for FileCookieJar.save - - """ - now = time.time() - r = [] - for cookie in self: - if not ignore_discard and cookie.discard: - debug(" Not saving %s: marked for discard", cookie.name) - continue - if not ignore_expires and cookie.is_expired(now): - debug(" Not saving %s: expired", cookie.name) - continue - r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) - return "\n".join(r+[""]) - - def save(self, filename=None, ignore_discard=False, ignore_expires=False): - if filename is None: - if self.filename is not None: filename = self.filename - else: raise ValueError(MISSING_FILENAME_TEXT) - - f = open(filename, "w") - try: - debug("Saving LWP cookies file") - # There really isn't an LWP Cookies 2.0 format, but this indicates - # that there is extra information in here (domain_dot and - # port_spec) while still being compatible with libwww-perl, I hope. 
- f.write("#LWP-Cookies-2.0\n") - f.write(self.as_lwp_str(ignore_discard, ignore_expires)) - finally: - f.close() - - def _really_load(self, f, filename, ignore_discard, ignore_expires): - magic = f.readline() - if not re.search(self.magic_re, magic): - msg = "%s does not seem to contain cookies" % filename - raise LoadError(msg) - - now = time.time() - - header = "Set-Cookie3:" - boolean_attrs = ("port_spec", "path_spec", "domain_dot", - "secure", "discard", "rfc2109") - value_attrs = ("version", - "port", "path", "domain", - "expires", - "comment", "commenturl") - - try: - while 1: - line = f.readline() - if line == "": break - if not line.startswith(header): - continue - line = line[len(header):].strip() - - for data in split_header_words([line]): - name, value = data[0] - standard = {} - rest = {} - for k in boolean_attrs: - standard[k] = False - for k, v in data[1:]: - if k is not None: - lc = k.lower() - else: - lc = None - # don't lose case distinction for unknown fields - if (lc in value_attrs) or (lc in boolean_attrs): - k = lc - if k in boolean_attrs: - if v is None: v = True - standard[k] = v - elif k in value_attrs: - standard[k] = v - else: - rest[k] = v - - h = standard.get - expires = h("expires") - discard = h("discard") - if expires is not None: - expires = iso2time(expires) - if expires is None: - discard = True - domain = h("domain") - domain_specified = domain.startswith(".") - c = Cookie(h("version"), name, value, - h("port"), h("port_spec"), - domain, domain_specified, h("domain_dot"), - h("path"), h("path_spec"), - h("secure"), - expires, - discard, - h("comment"), - h("commenturl"), - rest, - h("rfc2109"), - ) - if not ignore_discard and c.discard: - continue - if not ignore_expires and c.is_expired(now): - continue - self.set_cookie(c) - except: - reraise_unmasked_exceptions((IOError,)) - raise LoadError("invalid Set-Cookie3 format file %s" % filename) - diff --git a/src/calibre/utils/mechanize/_mechanize.py b/src/calibre/utils/mechanize/_mechanize.py deleted file mode 100644 index a9b8d9e0b5..0000000000 --- a/src/calibre/utils/mechanize/_mechanize.py +++ /dev/null @@ -1,656 +0,0 @@ -"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize. - -Copyright 2003-2006 John J. Lee -Copyright 2003 Andy Lester (original Perl code) - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt -included with the distribution). - -""" - -import urllib2, sys, copy, re - -from _useragent import UserAgentBase -from _html import DefaultFactory -import _response -import _request -import _rfc3986 - -__version__ = (0, 1, 7, "b", None) # 0.1.7b - -class BrowserStateError(Exception): pass -class LinkNotFoundError(Exception): pass -class FormNotFoundError(Exception): pass - - -class History: - """ - - Though this will become public, the implied interface is not yet stable. - - """ - def __init__(self): - self._history = [] # LIFO - def add(self, request, response): - self._history.append((request, response)) - def back(self, n, _response): - response = _response # XXX move Browser._response into this class? 
- while n > 0 or response is None: - try: - request, response = self._history.pop() - except IndexError: - raise BrowserStateError("already at start of history") - n -= 1 - return request, response - def clear(self): - del self._history[:] - def close(self): - for request, response in self._history: - if response is not None: - response.close() - del self._history[:] - - -class HTTPRefererProcessor(urllib2.BaseHandler): - def http_request(self, request): - # See RFC 2616 14.36. The only times we know the source of the - # request URI has a URI associated with it are redirect, and - # Browser.click() / Browser.submit() / Browser.follow_link(). - # Otherwise, it's the user's job to add any Referer header before - # .open()ing. - if hasattr(request, "redirect_dict"): - request = self.parent._add_referer_header( - request, origin_request=False) - return request - - https_request = http_request - - -class Browser(UserAgentBase): - """Browser-like class with support for history, forms and links. - - BrowserStateError is raised whenever the browser is in the wrong state to - complete the requested operation - eg., when .back() is called when the - browser history is empty, or when .follow_link() is called when the current - response does not contain HTML data. - - Public attributes: - - request: current request (mechanize.Request or urllib2.Request) - form: currently selected form (see .select_form()) - - """ - - handler_classes = copy.copy(UserAgentBase.handler_classes) - handler_classes["_referer"] = HTTPRefererProcessor - default_features = copy.copy(UserAgentBase.default_features) - default_features.append("_referer") - - def __init__(self, - factory=None, - history=None, - request_class=None, - ): - """ - - Only named arguments should be passed to this constructor. - - factory: object implementing the mechanize.Factory interface. - history: object implementing the mechanize.History interface. Note - this interface is still experimental and may change in future. - request_class: Request class to use. Defaults to mechanize.Request - by default for Pythons older than 2.4, urllib2.Request otherwise. - - The Factory and History objects passed in are 'owned' by the Browser, - so they should not be shared across Browsers. In particular, - factory.set_response() should not be called except by the owning - Browser itself. - - Note that the supplied factory's request_class is overridden by this - constructor, to ensure only one Request class is used. 
- - """ - self._handle_referer = True - - if history is None: - history = History() - self._history = history - - if request_class is None: - if not hasattr(urllib2.Request, "add_unredirected_header"): - request_class = _request.Request - else: - request_class = urllib2.Request # Python >= 2.4 - - if factory is None: - factory = DefaultFactory() - factory.set_request_class(request_class) - self._factory = factory - self.request_class = request_class - - self.request = None - self._set_response(None, False) - - # do this last to avoid __getattr__ problems - UserAgentBase.__init__(self) - - def close(self): - UserAgentBase.close(self) - if self._response is not None: - self._response.close() - if self._history is not None: - self._history.close() - self._history = None - - # make use after .close easy to spot - self.form = None - self.request = self._response = None - self.request = self.response = self.set_response = None - self.geturl = self.reload = self.back = None - self.clear_history = self.set_cookie = self.links = self.forms = None - self.viewing_html = self.encoding = self.title = None - self.select_form = self.click = self.submit = self.click_link = None - self.follow_link = self.find_link = None - - def set_handle_referer(self, handle): - """Set whether to add Referer header to each request. - - This base class does not implement this feature (so don't turn this on - if you're using this base class directly), but the subclass - mechanize.Browser does. - - """ - self._set_handler("_referer", handle) - self._handle_referer = bool(handle) - - def _add_referer_header(self, request, origin_request=True): - if self.request is None: - return request - scheme = request.get_type() - original_scheme = self.request.get_type() - if scheme not in ["http", "https"]: - return request - if not origin_request and not self.request.has_header("Referer"): - return request - - if (self._handle_referer and - original_scheme in ["http", "https"] and - not (original_scheme == "https" and scheme != "https")): - # strip URL fragment (RFC 2616 14.36) - parts = _rfc3986.urlsplit(self.request.get_full_url()) - parts = parts[:-1]+(None,) - referer = _rfc3986.urlunsplit(parts) - request.add_unredirected_header("Referer", referer) - return request - - def open_novisit(self, url, data=None): - """Open a URL without visiting it. - - The browser state (including .request, .response(), history, forms and - links) are all left unchanged by calling this function. - - The interface is the same as for .open(). - - This is useful for things like fetching images. - - See also .retrieve(). 
- - """ - return self._mech_open(url, data, visit=False) - - def open(self, url, data=None): - return self._mech_open(url, data) - - def _mech_open(self, url, data=None, update_history=True, visit=None): - try: - url.get_full_url - except AttributeError: - # string URL -- convert to absolute URL if required - scheme, authority = _rfc3986.urlsplit(url)[:2] - if scheme is None: - # relative URL - if self._response is None: - raise BrowserStateError( - "can't fetch relative reference: " - "not viewing any document") - url = _rfc3986.urljoin(self._response.geturl(), url) - - request = self._request(url, data, visit) - visit = request.visit - if visit is None: - visit = True - - if visit: - self._visit_request(request, update_history) - - success = True - try: - response = UserAgentBase.open(self, request, data) - except urllib2.HTTPError, error: - success = False - if error.fp is None: # not a response - raise - response = error -## except (IOError, socket.error, OSError), error: -## # Yes, urllib2 really does raise all these :-(( -## # See test_urllib2.py for examples of socket.gaierror and OSError, -## # plus note that FTPHandler raises IOError. -## # XXX I don't seem to have an example of exactly socket.error being -## # raised, only socket.gaierror... -## # I don't want to start fixing these here, though, since this is a -## # subclass of OpenerDirector, and it would break old code. Even in -## # Python core, a fix would need some backwards-compat. hack to be -## # acceptable. -## raise - - if visit: - self._set_response(response, False) - response = copy.copy(self._response) - elif response is not None: - response = _response.upgrade_response(response) - - if not success: - raise response - return response - - def __str__(self): - text = [] - text.append("<%s " % self.__class__.__name__) - if self._response: - text.append("visiting %s" % self._response.geturl()) - else: - text.append("(not visiting a URL)") - if self.form: - text.append("\n selected form:\n %s\n" % str(self.form)) - text.append(">") - return "".join(text) - - def response(self): - """Return a copy of the current response. - - The returned object has the same interface as the object returned by - .open() (or urllib2.urlopen()). - - """ - return copy.copy(self._response) - - def set_response(self, response): - """Replace current response with (a copy of) response. - - response may be None. - - This is intended mostly for HTML-preprocessing. - """ - self._set_response(response, True) - - def _set_response(self, response, close_current): - # sanity check, necessary but far from sufficient - if not (response is None or - (hasattr(response, "info") and hasattr(response, "geturl") and - hasattr(response, "read") - ) - ): - raise ValueError("not a response object") - - self.form = None - if response is not None: - response = _response.upgrade_response(response) - if close_current and self._response is not None: - self._response.close() - self._response = response - self._factory.set_response(response) - - def visit_response(self, response, request=None): - """Visit the response, as if it had been .open()ed. - - Unlike .set_response(), this updates history rather than replacing the - current response. 
- """ - if request is None: - request = _request.Request(response.geturl()) - self._visit_request(request, True) - self._set_response(response, False) - - def _visit_request(self, request, update_history): - if self._response is not None: - self._response.close() - if self.request is not None and update_history: - self._history.add(self.request, self._response) - self._response = None - # we want self.request to be assigned even if UserAgentBase.open - # fails - self.request = request - - def geturl(self): - """Get URL of current document.""" - if self._response is None: - raise BrowserStateError("not viewing any document") - return self._response.geturl() - - def reload(self): - """Reload current document, and return response object.""" - if self.request is None: - raise BrowserStateError("no URL has yet been .open()ed") - if self._response is not None: - self._response.close() - return self._mech_open(self.request, update_history=False) - - def back(self, n=1): - """Go back n steps in history, and return response object. - - n: go back this number of steps (default 1 step) - - """ - if self._response is not None: - self._response.close() - self.request, response = self._history.back(n, self._response) - self.set_response(response) - if not response.read_complete: - return self.reload() - return copy.copy(response) - - def clear_history(self): - self._history.clear() - - def set_cookie(self, cookie_string): - """Request to set a cookie. - - Note that it is NOT necessary to call this method under ordinary - circumstances: cookie handling is normally entirely automatic. The - intended use case is rather to simulate the setting of a cookie by - client script in a web page (e.g. JavaScript). In that case, use of - this method is necessary because mechanize currently does not support - JavaScript, VBScript, etc. - - The cookie is added in the same way as if it had arrived with the - current response, as a result of the current request. This means that, - for example, it is not appropriate to set the cookie based on the - current request, no cookie will be set. - - The cookie will be returned automatically with subsequent responses - made by the Browser instance whenever that's appropriate. - - cookie_string should be a valid value of the Set-Cookie header. - - For example: - - browser.set_cookie( - "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT") - - Currently, this method does not allow for adding RFC 2986 cookies. - This limitation will be lifted if anybody requests it. - - """ - if self._response is None: - raise BrowserStateError("not viewing any document") - if self.request.get_type() not in ["http", "https"]: - raise BrowserStateError("can't set cookie for non-HTTP/HTTPS " - "transactions") - cookiejar = self._ua_handlers["_cookies"].cookiejar - response = self.response() # copy - headers = response.info() - headers["Set-cookie"] = cookie_string - cookiejar.extract_cookies(response, self.request) - - def links(self, **kwds): - """Return iterable over links (mechanize.Link objects).""" - if not self.viewing_html(): - raise BrowserStateError("not viewing HTML") - links = self._factory.links() - if kwds: - return self._filter_links(links, **kwds) - else: - return links - - def forms(self): - """Return iterable over forms. - - The returned form objects implement the ClientForm.HTMLForm interface. 
- - """ - if not self.viewing_html(): - raise BrowserStateError("not viewing HTML") - return self._factory.forms() - - def global_form(self): - """Return the global form object, or None if the factory implementation - did not supply one. - - The "global" form object contains all controls that are not descendants of - any FORM element. - - The returned form object implements the ClientForm.HTMLForm interface. - - This is a separate method since the global form is not regarded as part - of the sequence of forms in the document -- mostly for - backwards-compatibility. - - """ - if not self.viewing_html(): - raise BrowserStateError("not viewing HTML") - return self._factory.global_form - - def viewing_html(self): - """Return whether the current response contains HTML data.""" - if self._response is None: - raise BrowserStateError("not viewing any document") - return self._factory.is_html - - def encoding(self): - """""" - if self._response is None: - raise BrowserStateError("not viewing any document") - return self._factory.encoding - - def title(self): - """Return title, or None if there is no title element in the document. - - Tags are stripped or textified as described in docs for - PullParser.get_text() method of pullparser module. - - """ - if not self.viewing_html(): - raise BrowserStateError("not viewing HTML") - return self._factory.title - - def select_form(self, name=None, predicate=None, nr=None): - """Select an HTML form for input. - - This is a bit like giving a form the "input focus" in a browser. - - If a form is selected, the Browser object supports the HTMLForm - interface, so you can call methods like .set_value(), .set(), and - .click(). - - Another way to select a form is to assign to the .form attribute. The - form assigned should be one of the objects returned by the .forms() - method. - - At least one of the name, predicate and nr arguments must be supplied. - If no matching form is found, mechanize.FormNotFoundError is raised. - - If name is specified, then the form must have the indicated name. - - If predicate is specified, then the form must match that function. The - predicate function is passed the HTMLForm as its single argument, and - should return a boolean value indicating whether the form matched. - - nr, if supplied, is the sequence number of the form (where 0 is the - first). Note that control 0 is the first form matching all the other - arguments (if supplied); it is not necessarily the first control in the - form. 
- - """ - if not self.viewing_html(): - raise BrowserStateError("not viewing HTML") - if (name is None) and (predicate is None) and (nr is None): - raise ValueError( - "at least one argument must be supplied to specify form") - - orig_nr = nr - for form in self.forms(): - if name is not None and name != form.name: - continue - if predicate is not None and not predicate(form): - continue - if nr: - nr -= 1 - continue - self.form = form - break # success - else: - # failure - description = [] - if name is not None: description.append("name '%s'" % name) - if predicate is not None: - description.append("predicate %s" % predicate) - if orig_nr is not None: description.append("nr %d" % orig_nr) - description = ", ".join(description) - raise FormNotFoundError("no form matching "+description) - - def click(self, *args, **kwds): - """See ClientForm.HTMLForm.click for documentation.""" - if not self.viewing_html(): - raise BrowserStateError("not viewing HTML") - request = self.form.click(*args, **kwds) - return self._add_referer_header(request) - - def submit(self, *args, **kwds): - """Submit current form. - - Arguments are as for ClientForm.HTMLForm.click(). - - Return value is same as for Browser.open(). - - """ - return self.open(self.click(*args, **kwds)) - - def click_link(self, link=None, **kwds): - """Find a link and return a Request object for it. - - Arguments are as for .find_link(), except that a link may be supplied - as the first argument. - - """ - if not self.viewing_html(): - raise BrowserStateError("not viewing HTML") - if not link: - link = self.find_link(**kwds) - else: - if kwds: - raise ValueError( - "either pass a Link, or keyword arguments, not both") - request = self.request_class(link.absolute_url) - return self._add_referer_header(request) - - def follow_link(self, link=None, **kwds): - """Find a link and .open() it. - - Arguments are as for .click_link(). - - Return value is same as for Browser.open(). - - """ - return self.open(self.click_link(link, **kwds)) - - def find_link(self, **kwds): - """Find a link in current page. - - Links are returned as mechanize.Link objects. - - # Return third link that .search()-matches the regexp "python" - # (by ".search()-matches", I mean that the regular expression method - # .search() is used, rather than .match()). - find_link(text_regex=re.compile("python"), nr=2) - - # Return first http link in the current page that points to somewhere - # on python.org whose link text (after tags have been removed) is - # exactly "monty python". - find_link(text="monty python", - url_regex=re.compile("http.*python.org")) - - # Return first link with exactly three HTML attributes. - find_link(predicate=lambda link: len(link.attrs) == 3) - - Links include anchors (), image maps (), and frames (, -