diff --git a/recipes/espn.recipe b/recipes/espn.recipe index 34c772f767..03c95d0001 100644 --- a/recipes/espn.recipe +++ b/recipes/espn.recipe @@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe): use_embedded_content = False remove_javascript = True - needs_subscription = True + needs_subscription = 'optional' encoding= 'ISO-8859-1' remove_tags_before = dict(name='font', attrs={'class':'date'}) @@ -75,32 +75,30 @@ class ESPN(BasicNewsRecipe): return soup - - def get_browser(self): br = BasicNewsRecipe.get_browser() - br.set_handle_refresh(False) - url = ('https://r.espn.go.com/members/v3_1/login') - raw = br.open(url).read() - raw = re.sub(r'(?s)
.*?id="regsigninbtn".*?
', '', raw) - with TemporaryFile(suffix='.htm') as fname: - with open(fname, 'wb') as f: - f.write(raw) - br.open_local_file(fname) + if self.username and self.password: + br.set_handle_refresh(False) + url = ('https://r.espn.go.com/members/v3_1/login') + raw = br.open(url).read() + raw = re.sub(r'(?s)
.*?id="regsigninbtn".*?
', '', raw) + with TemporaryFile(suffix='.htm') as fname: + with open(fname, 'wb') as f: + f.write(raw) + br.open_local_file(fname) - br.form = br.forms().next() - br.form.find_control(name='username', type='text').value = self.username - br.form['password'] = self.password - br.submit().read() - br.open('http://espn.go.com').read() - br.set_handle_refresh(True) + br.form = br.forms().next() + br.form.find_control(name='username', type='text').value = self.username + br.form['password'] = self.password + br.submit().read() + br.open('http://espn.go.com').read() + br.set_handle_refresh(True) return br def get_article_url(self, article): return article.get('guid', None) def print_version(self, url): - if 'eticket' in url: return url.partition('&')[0].replace('story?', 'print?') match = re.search(r'story\?(id=\d+)', url) diff --git a/recipes/tweakers_net.recipe b/recipes/tweakers_net.recipe new file mode 100644 index 0000000000..f9bbe27ec9 --- /dev/null +++ b/recipes/tweakers_net.recipe @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__docformat__ = 'restructuredtext en' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class Tweakers(BasicNewsRecipe): + title = u'Tweakers.net - with Reactions' + __author__ = 'Roedi06' + language = 'nl' + oldest_article = 7 + max_articles_per_feed = 100 + cover_url = 'http://img51.imageshack.us/img51/7470/tweakersnetebook.gif' + + keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'}), + {'id':'reacties'}, + ] + + remove_tags = [dict(name='div', attrs={'id' : ['utracker']}), + {'id' : ['channelNav']}, + {'id' : ['contentArea']}, + {'class' : ['breadCrumb']}, + {'class' : ['nextPrevious ellipsis']}, + {'class' : ['advertorial']}, + {'class' : ['sidebar']}, + {'class' : ['filterBox']}, + {'id' : ['toggleButtonTxt']}, + {'id' : ['socialButtons']}, + {'class' : ['button']}, + {'class' : ['textadTop']}, + {'class' : ['commentLink']}, + {'title' : ['Reageer op deze reactie']}, + {'class' : ['pageIndex']}, + {'class' : ['reactieHeader collapsed']}, + ] + no_stylesheets=True + + preprocess_regexps = [ + (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''), + (re.compile(r'

', re.IGNORECASE | re.DOTALL), lambda match : ''), + (re.compile(r'

', re.IGNORECASE | re.DOTALL), lambda match : ''), + (re.compile(r''), lambda h1: ''), + (re.compile(r''), lambda h2: ''), + (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''), + (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''), + (re.compile(r'
.*?
'), lambda h1: ''), + ] + + extra_css = '.reactieHeader { color: #333333; font-size: 6px; border-bottom:solid 2px #333333; border-top:solid 1px #333333; } \ + .reactieContent { font-family:"Times New Roman",Georgia,Serif; color: #000000; font-size: 8px; } \ + .quote { font-family:"Times New Roman",Georgia,Serif; padding-left:2px; border-left:solid 3px #666666; color: #666666; }' + + + feeds = [(u'Tweakers.net', u'http://feeds.feedburner.com/tweakers/nieuws')] + + def print_version(self, url): + return url + '?max=200' + diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py index e79d496cd9..10fe0bb014 100644 --- a/src/calibre/db/cache.py +++ b/src/calibre/db/cache.py @@ -14,6 +14,7 @@ from functools import wraps, partial from calibre.db.locking import create_locks, RecordLock from calibre.db.fields import create_field from calibre.db.tables import VirtualTable +from calibre.db.lazy import FormatMetadata, FormatsList from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import now @@ -127,14 +128,8 @@ class Cache(object): if not formats: good_formats = None else: - good_formats = [] - for f in formats: - try: - mi.format_metadata[f] = self._format_metadata(book_id, f) - except: - pass - else: - good_formats.append(f) + mi.format_metadata = FormatMetadata(self, id, formats) + good_formats = FormatsList(formats, mi.format_metadata) mi.formats = good_formats mi.has_cover = _('Yes') if self._field_for('cover', book_id, default_value=False) else '' diff --git a/src/calibre/db/lazy.py b/src/calibre/db/lazy.py new file mode 100644 index 0000000000..be9334c056 --- /dev/null +++ b/src/calibre/db/lazy.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import weakref +from functools import wraps +from collections import MutableMapping, MutableSequence + +''' +Avoid doing stats on all files in a book when getting metadata for that book. +Speeds up calibre startup with large libraries/libraries on a network share, +with a composite custom column. +''' + +# Lazy format metadata retrieval {{{ +def resolved(f): + @wraps(f) + def wrapper(self, *args, **kwargs): + if getattr(self, '_must_resolve', True): + self._resolve() + self._must_resolve = False + return f(self, *args, **kwargs) + return wrapper + +class MutableBase(object): + + @resolved + def __str__(self): + return str(self._values) + + @resolved + def __repr__(self): + return repr(self._values) + + @resolved + def __unicode__(self): + return unicode(self._values) + + @resolved + def __len__(self): + return len(self._values) + + @resolved + def __iter__(self): + return iter(self._values) + + @resolved + def __contains__(self, key): + return key in self._values + + @resolved + def __getitem__(self, fmt): + return self._values[fmt] + + @resolved + def __setitem__(self, key, val): + self._values[key] = val + + @resolved + def __delitem__(self, key): + del self._values[key] + + +class FormatMetadata(MutableBase, MutableMapping): + + def __init__(self, db, id_, formats): + self._dbwref = weakref.ref(db) + self._id = id_ + self._formats = formats + + def _resolve(self): + db = self._dbwref() + self._values = {} + for f in self._formats: + try: + self._values[f] = db.format_metadata(self._id, f) + except: + pass + +class FormatsList(MutableBase, MutableSequence): + + def __init__(self, formats, format_metadata): + self._formats = formats + self._format_metadata = format_metadata + + def _resolve(self): + self._values = [f for f in self._formats if f in self._format_metadata] + + @resolved + def insert(self, idx, val): + self._values.insert(idx, val) + +# }}} + diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index 9aa8272ee9..f36685bd91 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -11,6 +11,7 @@ from calibre.customize.conversion import InputFormatPlugin from calibre.ptempfile import TemporaryDirectory from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename +from calibre.constants import filesystem_encoding class CHMInput(InputFormatPlugin): @@ -36,6 +37,8 @@ class CHMInput(InputFormatPlugin): log.debug('Processing CHM...') with TemporaryDirectory('_chm2oeb') as tdir: + if not isinstance(tdir, unicode): + tdir = tdir.decode(filesystem_encoding) html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(options, opt.option.name, opt.recommended_value) diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 26b09c7676..ea67947231 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -6,13 +6,14 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import re +import re, codecs from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import string_to_authors, MetaInformation from calibre.utils.logging import default_log from calibre.ptempfile import TemporaryFile +from calibre import force_unicode def _clean(s): return s.replace(u'\u00a0', u' ') @@ -138,6 +139,13 @@ def get_metadata_from_reader(rdr): resolve_entities=True)[0]) title = rdr.title + try: + x = rdr.GetEncoding() + codecs.lookup(x) + enc = x + except: + enc = 'cp1252' + title = force_unicode(title, enc) authors = _get_authors(home) mi = MetaInformation(title, authors) publisher = _get_publisher(home) diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py index 05ec388a9b..fc7d865265 100644 --- a/src/calibre/ebooks/chm/reader.py +++ b/src/calibre/ebooks/chm/reader.py @@ -4,7 +4,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ,' \ ' and Alex Bramley .' -import os, re +import os, re, codecs from calibre import guess_type as guess_mimetype from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString @@ -99,8 +99,17 @@ class CHMReader(CHMFile): def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False): html_files = set([]) + try: + x = self.GetEncoding() + codecs.lookup(x) + enc = x + except: + enc = 'cp1252' for path in self.Contents(): - lpath = os.path.join(output_dir, path) + fpath = path + if not isinstance(path, unicode): + fpath = path.decode(enc) + lpath = os.path.join(output_dir, fpath) self._ensure_dir(lpath) try: data = self.GetFile(path) @@ -123,6 +132,7 @@ class CHMReader(CHMFile): self.log.warn('%r filename too long, skipping'%path) continue raise + if debug_dump: import shutil shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump')) diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index 712427d457..f0b48afb39 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -103,7 +103,7 @@ def html5_parse(data, max_nesting_depth=100): xmlns_declaration = '{%s}'%XMLNS_NS non_html5_namespaces = {} seen_namespaces = set() - for elem in tuple(data.iter()): + for elem in tuple(data.iter(tag=etree.Element)): elem.attrib.pop('xmlns', None) namespaces = {} for x in tuple(elem.attrib): diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 5fe010f829..00ca0e39a2 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -7,8 +7,8 @@ __docformat__ = 'restructuredtext en' The database used to store ebook metadata ''' import os, sys, shutil, cStringIO, glob, time, functools, traceback, re, \ - json, uuid, hashlib, copy, weakref -from collections import defaultdict, MutableMapping, MutableSequence + json, uuid, hashlib, copy +from collections import defaultdict import threading, random from itertools import repeat from math import ceil @@ -40,6 +40,7 @@ from calibre.utils.magick.draw import save_cover_data_to from calibre.utils.recycle_bin import delete_file, delete_tree from calibre.utils.formatter_functions import load_user_template_functions from calibre.db.errors import NoSuchFormat +from calibre.db.lazy import FormatMetadata, FormatsList from calibre.utils.localization import (canonicalize_lang, calibre_langcode_to_name) @@ -81,90 +82,6 @@ class Tag(object): def __repr__(self): return str(self) -class FormatMetadata(MutableMapping): # {{{ - - def __init__(self, db, id_, formats): - self.dbwref = weakref.ref(db) - self.id_ = id_ - self.formats = formats - self._must_do = True - self.values = {} - - def _resolve(self): - if self._must_do: - for f in self.formats: - try: - self.values[f] = self.dbwref().format_metadata(self.id_, f) - except: - pass - self._must_do = False - - def __getitem__(self, fmt): - self._resolve() - return self.values[fmt] - - def __setitem__(self, key, val): - self._resolve() - self.values[key] = val - - def __delitem__(self, key): - self._resolve() - self.values.__delitem__(key) - - def __len__(self): - self._resolve() - return len(self.values) - - def __iter__(self): - self._resolve() - return self.values.__iter__() - -class FormatsList(MutableSequence): - - def __init__(self, formats, format_metadata): - self.formats = formats - self.format_metadata = format_metadata - self._must_do = True - self.values = [] - - def _resolve(self): - if self._must_do: - for f in self.formats: - try: - if f in self.format_metadata: - self.values.append(f) - except: - pass - self._must_do = False - - def __getitem__(self, dex): - self._resolve() - return self.values[dex] - - def __setitem__(self, key, dex): - self._resolve() - self.values[key] = dex - - def __delitem__(self, dex): - self._resolve() - self.values.__delitem__(dex) - - def __len__(self): - self._resolve() - return len(self.values) - - def __iter__(self): - self._resolve() - return self.values.__iter__() - - def insert(self, idx, val): - self._resolve() - self.values.insert(idx, val) - -# }}} - - - class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): ''' An ebook metadata database that stores references to ebook files on disk. @@ -253,6 +170,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): except: traceback.print_exc() self.field_metadata = FieldMetadata() + self.format_filename_cache = defaultdict(dict) self._library_id_ = None # Create the lock to be used to guard access to the metadata writer # queues. This must be an RLock, not a Lock @@ -393,6 +311,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): if not self.is_second_db: load_user_template_functions(self.prefs.get('user_template_functions', [])) + # Load the format filename cache + self.format_filename_cache = defaultdict(dict) + for book_id, fmt, name in self.conn.get( + 'SELECT book,format,name FROM data'): + self.format_filename_cache[book_id][fmt.upper() if fmt else ''] = name + self.conn.executescript(''' DROP TRIGGER IF EXISTS author_insert_trg; CREATE TEMP TRIGGER author_insert_trg @@ -682,7 +606,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): fname = self.construct_file_name(id) changed = False for format in formats: - name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False) + name = self.format_filename_cache[id].get(format.upper(), None) if name and name != fname: changed = True break @@ -1222,12 +1146,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): def format_files(self, index, index_is_id=False): id = index if index_is_id else self.id(index) - try: - formats = self.conn.get('SELECT name,format FROM data WHERE book=?', (id,)) - formats = map(lambda x:(x[0], x[1]), formats) - return formats - except: - return [] + return [(v, k) for k, v in self.format_filename_cache[id].iteritems()] def formats(self, index, index_is_id=False, verify_formats=True): ''' Return available formats as a comma separated list or None if there are no available formats ''' @@ -1313,7 +1232,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): ''' id = index if index_is_id else self.id(index) try: - name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False) + name = self.format_filename_cache[id][format.upper()] except: return None if name: @@ -1410,11 +1329,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): def add_format(self, index, format, stream, index_is_id=False, path=None, notify=True, replace=True): id = index if index_is_id else self.id(index) - if format: - self.format_metadata_cache[id].pop(format.upper(), None) + if not format: format = '' + self.format_metadata_cache[id].pop(format.upper(), None) + name = self.format_filename_cache[id].get(format.upper(), None) if path is None: path = os.path.join(self.library_path, self.path(id, index_is_id=True)) - name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False) if name and not replace: return False name = self.construct_file_name(id) @@ -1432,6 +1351,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): self.conn.execute('INSERT OR REPLACE INTO data (book,format,uncompressed_size,name) VALUES (?,?,?,?)', (id, format.upper(), size, name)) self.conn.commit() + self.format_filename_cache[id][format.upper()] = name self.refresh_ids([id]) if notify: self.notify('metadata', [id]) @@ -1479,9 +1399,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): def remove_format(self, index, format, index_is_id=False, notify=True, commit=True, db_only=False): id = index if index_is_id else self.id(index) - if format: - self.format_metadata_cache[id].pop(format.upper(), None) - name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False) + if not format: format = '' + self.format_metadata_cache[id].pop(format.upper(), None) + name = self.format_filename_cache[id].pop(format.upper(), None) if name: if not db_only: try: diff --git a/src/calibre/manual/images/bookmark.png b/src/calibre/manual/images/bookmark.png index c6671a2541..4ba83fb49c 100644 Binary files a/src/calibre/manual/images/bookmark.png and b/src/calibre/manual/images/bookmark.png differ diff --git a/src/calibre/manual/images/pref_button.png b/src/calibre/manual/images/pref_button.png index f43f2d7627..52d9bae6e0 100644 Binary files a/src/calibre/manual/images/pref_button.png and b/src/calibre/manual/images/pref_button.png differ diff --git a/src/calibre/manual/images/ref_mode_button.png b/src/calibre/manual/images/ref_mode_button.png index efed1af26b..3ec70e91ab 100644 Binary files a/src/calibre/manual/images/ref_mode_button.png and b/src/calibre/manual/images/ref_mode_button.png differ