Merge from trunk

This commit is contained in:
Charles Haley 2012-01-18 11:23:48 +01:00
commit 5162eb7c6d
12 changed files with 229 additions and 130 deletions

View File

@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
remove_javascript = True remove_javascript = True
needs_subscription = True needs_subscription = 'optional'
encoding= 'ISO-8859-1' encoding= 'ISO-8859-1'
remove_tags_before = dict(name='font', attrs={'class':'date'}) remove_tags_before = dict(name='font', attrs={'class':'date'})
@ -75,32 +75,30 @@ class ESPN(BasicNewsRecipe):
return soup return soup
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
br.set_handle_refresh(False) if self.username and self.password:
url = ('https://r.espn.go.com/members/v3_1/login') br.set_handle_refresh(False)
raw = br.open(url).read() url = ('https://r.espn.go.com/members/v3_1/login')
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw) raw = br.open(url).read()
with TemporaryFile(suffix='.htm') as fname: raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
with open(fname, 'wb') as f: with TemporaryFile(suffix='.htm') as fname:
f.write(raw) with open(fname, 'wb') as f:
br.open_local_file(fname) f.write(raw)
br.open_local_file(fname)
br.form = br.forms().next() br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password br.form['password'] = self.password
br.submit().read() br.submit().read()
br.open('http://espn.go.com').read() br.open('http://espn.go.com').read()
br.set_handle_refresh(True) br.set_handle_refresh(True)
return br return br
def get_article_url(self, article): def get_article_url(self, article):
return article.get('guid', None) return article.get('guid', None)
def print_version(self, url): def print_version(self, url):
if 'eticket' in url: if 'eticket' in url:
return url.partition('&')[0].replace('story?', 'print?') return url.partition('&')[0].replace('story?', 'print?')
match = re.search(r'story\?(id=\d+)', url) match = re.search(r'story\?(id=\d+)', url)

View File

@ -0,0 +1,66 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__docformat__ = 'restructuredtext en'
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Tweakers(BasicNewsRecipe):
    # Recipe for the Tweakers.net news feed that keeps the reader reactions
    # (the element with id "reacties") alongside each article.

    title = u'Tweakers.net - with Reactions'
    __author__ = 'Roedi06'
    language = 'nl'
    oldest_article = 7
    max_articles_per_feed = 100
    cover_url = 'http://img51.imageshack.us/img51/7470/tweakersnetebook.gif'

    # Only the article column and the reactions block are kept.
    keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'}),
                      {'id':'reacties'},
                      ]

    # Navigation, advertising and interactive widgets that are stripped from
    # whatever keep_only_tags retained.
    remove_tags = [dict(name='div', attrs={'id' : ['utracker']}),
                   {'id' : ['channelNav']},
                   {'id' : ['contentArea']},
                   {'class' : ['breadCrumb']},
                   {'class' : ['nextPrevious ellipsis']},
                   {'class' : ['advertorial']},
                   {'class' : ['sidebar']},
                   {'class' : ['filterBox']},
                   {'id' : ['toggleButtonTxt']},
                   {'id' : ['socialButtons']},
                   {'class' : ['button']},
                   {'class' : ['textadTop']},
                   {'class' : ['commentLink']},
                   {'title' : ['Reageer op deze reactie']},
                   {'class' : ['pageIndex']},
                   {'class' : ['reactieHeader collapsed']},
                   ]

    no_stylesheets=True

    preprocess_regexps = [
        # Drop horizontal rules. The previous pattern '<hr*?>' quantified the
        # letter "r" and therefore missed '<hr />' and '<hr class="...">'.
        (re.compile(r'<hr\b[^>]*>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        # Paragraph tags are removed, their content is kept.
        (re.compile(r'<p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'</p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        # Links are rendered as bold+underlined text. '\b' restricts the match
        # to real <a> tags; the previous '<a.*?>' also rewrote tags such as
        # <abbr> or <address>, whose closing tags are never converted.
        (re.compile(r'<a\b[^>]*>'), lambda h1: '<b><u>'),
        (re.compile(r'</a>'), lambda h2: '</u></b>'),
        (re.compile(r'<span class="new">', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'</span>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        # Put the moderation score (0, +1, +2 or +3) as text in front of the
        # moderation block; the score is read from the image file name.
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_([0-3])'),
         lambda match : ' - moderated %s%s' % (
             '0' if match.group(1) == '0' else '+' + match.group(1),
             match.group(0))),
        # The moderation block itself is then removed.
        (re.compile(r'<div class="moderation">.*?</div>'), lambda h1: ''),
        ]

    extra_css = (
        '.reactieHeader { color: #333333; font-size: 6px; border-bottom:solid 2px #333333; border-top:solid 1px #333333; } '
        '.reactieContent { font-family:"Times New Roman",Georgia,Serif; color: #000000; font-size: 8px; } '
        '.quote { font-family:"Times New Roman",Georgia,Serif; padding-left:2px; border-left:solid 3px #666666; color: #666666; }'
    )

    feeds = [(u'Tweakers.net', u'http://feeds.feedburner.com/tweakers/nieuws')]

    def print_version(self, url):
        '''
        Return the article URL with the ``max=200`` query parameter added.

        The parameter is appended with ``&`` when ``url`` already carries a
        query string, so the result stays a well-formed URL.
        '''
        # NOTE(review): max=200 presumably raises the number of reactions
        # shown on one page - confirm against the site.
        separator = '&' if '?' in url else '?'
        return url + separator + 'max=200'

View File

@ -14,6 +14,7 @@ from functools import wraps, partial
from calibre.db.locking import create_locks, RecordLock from calibre.db.locking import create_locks, RecordLock
from calibre.db.fields import create_field from calibre.db.fields import create_field
from calibre.db.tables import VirtualTable from calibre.db.tables import VirtualTable
from calibre.db.lazy import FormatMetadata, FormatsList
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import now from calibre.utils.date import now
@ -127,14 +128,8 @@ class Cache(object):
if not formats: if not formats:
good_formats = None good_formats = None
else: else:
good_formats = [] mi.format_metadata = FormatMetadata(self, id, formats)
for f in formats: good_formats = FormatsList(formats, mi.format_metadata)
try:
mi.format_metadata[f] = self._format_metadata(book_id, f)
except:
pass
else:
good_formats.append(f)
mi.formats = good_formats mi.formats = good_formats
mi.has_cover = _('Yes') if self._field_for('cover', book_id, mi.has_cover = _('Yes') if self._field_for('cover', book_id,
default_value=False) else '' default_value=False) else ''

99
src/calibre/db/lazy.py Normal file
View File

@ -0,0 +1,99 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import weakref
from functools import wraps
from collections import MutableMapping, MutableSequence
'''
Avoid doing stats on all files in a book when getting metadata for that book.
Speeds up calibre startup with large libraries/libraries on a network share,
with a composite custom column.
'''
# Lazy format metadata retrieval {{{
def resolved(f):
    '''
    Decorate a method so that the instance is lazily resolved before it runs.

    The first time any decorated method is called on an instance,
    ``self._resolve()`` is invoked and ``self._must_resolve`` is set to False;
    later calls go straight to the wrapped method. An instance that has no
    ``_must_resolve`` attribute is treated as still needing resolution.
    '''
    @wraps(f)
    def resolve_then_call(self, *args, **kwargs):
        needs_resolving = getattr(self, '_must_resolve', True)
        if needs_resolving:
            self._resolve()
            # Only reached when _resolve() returned normally, so a failed
            # resolution is attempted again on the next access.
            self._must_resolve = False
        return f(self, *args, **kwargs)
    return resolve_then_call
class MutableBase(object):
    '''
    Container plumbing shared by the lazy classes in this module.

    Every method operates on ``self._values`` and is wrapped with
    ``resolved``, so the subclass's ``_resolve()`` runs before the first
    access touches ``self._values``.
    '''

    # Conversions to text

    @resolved
    def __str__(self):
        backing = self._values
        return str(backing)

    @resolved
    def __repr__(self):
        backing = self._values
        return repr(backing)

    @resolved
    def __unicode__(self):
        backing = self._values
        return unicode(backing)

    # Read access

    @resolved
    def __len__(self):
        backing = self._values
        return len(backing)

    @resolved
    def __iter__(self):
        backing = self._values
        return iter(backing)

    @resolved
    def __contains__(self, key):
        backing = self._values
        return key in backing

    @resolved
    def __getitem__(self, fmt):
        backing = self._values
        return backing[fmt]

    # Mutation

    @resolved
    def __setitem__(self, key, val):
        backing = self._values
        backing[key] = val

    @resolved
    def __delitem__(self, key):
        backing = self._values
        del backing[key]
class FormatMetadata(MutableBase, MutableMapping):
    '''
    Mapping from format to that format's metadata for one book, filled in
    lazily: nothing is looked up until the mapping is first accessed.

    :param db: Object providing ``format_metadata(id_, fmt)``. Only a weak
               reference to it is stored.
    :param id_: Identifier passed through to ``db.format_metadata``.
    :param formats: Iterable of formats to look up when resolving.
    '''

    def __init__(self, db, id_, formats):
        self._dbwref = weakref.ref(db)
        self._id = id_
        self._formats = formats

    def _resolve(self):
        '''
        Populate ``self._values`` with the metadata of every format that can
        be read. Formats whose lookup fails are left out.
        '''
        self._values = {}
        db = self._dbwref()
        if db is None:
            # The database object no longer exists, so nothing can be looked
            # up; the mapping stays empty.
            return
        for f in self._formats:
            try:
                self._values[f] = db.format_metadata(self._id, f)
            except Exception:
                # Best effort: an unreadable format is simply omitted.
                # KeyboardInterrupt and SystemExit are deliberately not
                # caught here so they still propagate.
                pass
class FormatsList(MutableBase, MutableSequence):
    '''
    Lazily computed list of the entries of ``formats`` that are present in
    ``format_metadata``. The filtering happens on first access.
    '''

    def __init__(self, formats, format_metadata):
        self._formats, self._format_metadata = formats, format_metadata

    def _resolve(self):
        '''
        Build ``self._values`` from the formats found in the format metadata,
        keeping the order of ``self._formats``.
        '''
        kept = []
        for fmt in self._formats:
            if fmt in self._format_metadata:
                kept.append(fmt)
        self._values = kept

    @resolved
    def insert(self, idx, val):
        '''Insert ``val`` before position ``idx`` of the resolved list.'''
        self._values.insert(idx, val)
# }}}

View File

@ -11,6 +11,7 @@ from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.utils.localization import get_lang from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.constants import filesystem_encoding
class CHMInput(InputFormatPlugin): class CHMInput(InputFormatPlugin):
@ -36,6 +37,8 @@ class CHMInput(InputFormatPlugin):
log.debug('Processing CHM...') log.debug('Processing CHM...')
with TemporaryDirectory('_chm2oeb') as tdir: with TemporaryDirectory('_chm2oeb') as tdir:
if not isinstance(tdir, unicode):
tdir = tdir.decode(filesystem_encoding)
html_input = plugin_for_input_format('html') html_input = plugin_for_input_format('html')
for opt in html_input.options: for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value) setattr(options, opt.option.name, opt.recommended_value)

View File

@ -6,13 +6,14 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re import re, codecs
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import string_to_authors, MetaInformation from calibre.ebooks.metadata import string_to_authors, MetaInformation
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.ptempfile import TemporaryFile from calibre.ptempfile import TemporaryFile
from calibre import force_unicode
def _clean(s): def _clean(s):
return s.replace(u'\u00a0', u' ') return s.replace(u'\u00a0', u' ')
@ -138,6 +139,13 @@ def get_metadata_from_reader(rdr):
resolve_entities=True)[0]) resolve_entities=True)[0])
title = rdr.title title = rdr.title
try:
x = rdr.GetEncoding()
codecs.lookup(x)
enc = x
except:
enc = 'cp1252'
title = force_unicode(title, enc)
authors = _get_authors(home) authors = _get_authors(home)
mi = MetaInformation(title, authors) mi = MetaInformation(title, authors)
publisher = _get_publisher(home) publisher = _get_publisher(home)

View File

@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.' ' and Alex Bramley <a.bramley at gmail.com>.'
import os, re import os, re, codecs
from calibre import guess_type as guess_mimetype from calibre import guess_type as guess_mimetype
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
@ -99,8 +99,17 @@ class CHMReader(CHMFile):
def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False): def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
html_files = set([]) html_files = set([])
try:
x = self.GetEncoding()
codecs.lookup(x)
enc = x
except:
enc = 'cp1252'
for path in self.Contents(): for path in self.Contents():
lpath = os.path.join(output_dir, path) fpath = path
if not isinstance(path, unicode):
fpath = path.decode(enc)
lpath = os.path.join(output_dir, fpath)
self._ensure_dir(lpath) self._ensure_dir(lpath)
try: try:
data = self.GetFile(path) data = self.GetFile(path)
@ -123,6 +132,7 @@ class CHMReader(CHMFile):
self.log.warn('%r filename too long, skipping'%path) self.log.warn('%r filename too long, skipping'%path)
continue continue
raise raise
if debug_dump: if debug_dump:
import shutil import shutil
shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump')) shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))

View File

@ -103,7 +103,7 @@ def html5_parse(data, max_nesting_depth=100):
xmlns_declaration = '{%s}'%XMLNS_NS xmlns_declaration = '{%s}'%XMLNS_NS
non_html5_namespaces = {} non_html5_namespaces = {}
seen_namespaces = set() seen_namespaces = set()
for elem in tuple(data.iter()): for elem in tuple(data.iter(tag=etree.Element)):
elem.attrib.pop('xmlns', None) elem.attrib.pop('xmlns', None)
namespaces = {} namespaces = {}
for x in tuple(elem.attrib): for x in tuple(elem.attrib):

View File

@ -7,8 +7,8 @@ __docformat__ = 'restructuredtext en'
The database used to store ebook metadata The database used to store ebook metadata
''' '''
import os, sys, shutil, cStringIO, glob, time, functools, traceback, re, \ import os, sys, shutil, cStringIO, glob, time, functools, traceback, re, \
json, uuid, hashlib, copy, weakref json, uuid, hashlib, copy
from collections import defaultdict, MutableMapping, MutableSequence from collections import defaultdict
import threading, random import threading, random
from itertools import repeat from itertools import repeat
from math import ceil from math import ceil
@ -40,6 +40,7 @@ from calibre.utils.magick.draw import save_cover_data_to
from calibre.utils.recycle_bin import delete_file, delete_tree from calibre.utils.recycle_bin import delete_file, delete_tree
from calibre.utils.formatter_functions import load_user_template_functions from calibre.utils.formatter_functions import load_user_template_functions
from calibre.db.errors import NoSuchFormat from calibre.db.errors import NoSuchFormat
from calibre.db.lazy import FormatMetadata, FormatsList
from calibre.utils.localization import (canonicalize_lang, from calibre.utils.localization import (canonicalize_lang,
calibre_langcode_to_name) calibre_langcode_to_name)
@ -81,90 +82,6 @@ class Tag(object):
def __repr__(self): def __repr__(self):
return str(self) return str(self)
class FormatMetadata(MutableMapping): # {{{
def __init__(self, db, id_, formats):
self.dbwref = weakref.ref(db)
self.id_ = id_
self.formats = formats
self._must_do = True
self.values = {}
def _resolve(self):
if self._must_do:
for f in self.formats:
try:
self.values[f] = self.dbwref().format_metadata(self.id_, f)
except:
pass
self._must_do = False
def __getitem__(self, fmt):
self._resolve()
return self.values[fmt]
def __setitem__(self, key, val):
self._resolve()
self.values[key] = val
def __delitem__(self, key):
self._resolve()
self.values.__delitem__(key)
def __len__(self):
self._resolve()
return len(self.values)
def __iter__(self):
self._resolve()
return self.values.__iter__()
class FormatsList(MutableSequence):
def __init__(self, formats, format_metadata):
self.formats = formats
self.format_metadata = format_metadata
self._must_do = True
self.values = []
def _resolve(self):
if self._must_do:
for f in self.formats:
try:
if f in self.format_metadata:
self.values.append(f)
except:
pass
self._must_do = False
def __getitem__(self, dex):
self._resolve()
return self.values[dex]
def __setitem__(self, key, dex):
self._resolve()
self.values[key] = dex
def __delitem__(self, dex):
self._resolve()
self.values.__delitem__(dex)
def __len__(self):
self._resolve()
return len(self.values)
def __iter__(self):
self._resolve()
return self.values.__iter__()
def insert(self, idx, val):
self._resolve()
self.values.insert(idx, val)
# }}}
class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
''' '''
An ebook metadata database that stores references to ebook files on disk. An ebook metadata database that stores references to ebook files on disk.
@ -253,6 +170,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
except: except:
traceback.print_exc() traceback.print_exc()
self.field_metadata = FieldMetadata() self.field_metadata = FieldMetadata()
self.format_filename_cache = defaultdict(dict)
self._library_id_ = None self._library_id_ = None
# Create the lock to be used to guard access to the metadata writer # Create the lock to be used to guard access to the metadata writer
# queues. This must be an RLock, not a Lock # queues. This must be an RLock, not a Lock
@ -393,6 +311,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
if not self.is_second_db: if not self.is_second_db:
load_user_template_functions(self.prefs.get('user_template_functions', [])) load_user_template_functions(self.prefs.get('user_template_functions', []))
# Load the format filename cache
self.format_filename_cache = defaultdict(dict)
for book_id, fmt, name in self.conn.get(
'SELECT book,format,name FROM data'):
self.format_filename_cache[book_id][fmt.upper() if fmt else ''] = name
self.conn.executescript(''' self.conn.executescript('''
DROP TRIGGER IF EXISTS author_insert_trg; DROP TRIGGER IF EXISTS author_insert_trg;
CREATE TEMP TRIGGER author_insert_trg CREATE TEMP TRIGGER author_insert_trg
@ -682,7 +606,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
fname = self.construct_file_name(id) fname = self.construct_file_name(id)
changed = False changed = False
for format in formats: for format in formats:
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False) name = self.format_filename_cache[id].get(format.upper(), None)
if name and name != fname: if name and name != fname:
changed = True changed = True
break break
@ -1222,12 +1146,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def format_files(self, index, index_is_id=False): def format_files(self, index, index_is_id=False):
id = index if index_is_id else self.id(index) id = index if index_is_id else self.id(index)
try: return [(v, k) for k, v in self.format_filename_cache[id].iteritems()]
formats = self.conn.get('SELECT name,format FROM data WHERE book=?', (id,))
formats = map(lambda x:(x[0], x[1]), formats)
return formats
except:
return []
def formats(self, index, index_is_id=False, verify_formats=True): def formats(self, index, index_is_id=False, verify_formats=True):
''' Return available formats as a comma separated list or None if there are no available formats ''' ''' Return available formats as a comma separated list or None if there are no available formats '''
@ -1313,7 +1232,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
''' '''
id = index if index_is_id else self.id(index) id = index if index_is_id else self.id(index)
try: try:
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False) name = self.format_filename_cache[id][format.upper()]
except: except:
return None return None
if name: if name:
@ -1410,11 +1329,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def add_format(self, index, format, stream, index_is_id=False, path=None, def add_format(self, index, format, stream, index_is_id=False, path=None,
notify=True, replace=True): notify=True, replace=True):
id = index if index_is_id else self.id(index) id = index if index_is_id else self.id(index)
if format: if not format: format = ''
self.format_metadata_cache[id].pop(format.upper(), None) self.format_metadata_cache[id].pop(format.upper(), None)
name = self.format_filename_cache[id].get(format.upper(), None)
if path is None: if path is None:
path = os.path.join(self.library_path, self.path(id, index_is_id=True)) path = os.path.join(self.library_path, self.path(id, index_is_id=True))
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False)
if name and not replace: if name and not replace:
return False return False
name = self.construct_file_name(id) name = self.construct_file_name(id)
@ -1432,6 +1351,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.conn.execute('INSERT OR REPLACE INTO data (book,format,uncompressed_size,name) VALUES (?,?,?,?)', self.conn.execute('INSERT OR REPLACE INTO data (book,format,uncompressed_size,name) VALUES (?,?,?,?)',
(id, format.upper(), size, name)) (id, format.upper(), size, name))
self.conn.commit() self.conn.commit()
self.format_filename_cache[id][format.upper()] = name
self.refresh_ids([id]) self.refresh_ids([id])
if notify: if notify:
self.notify('metadata', [id]) self.notify('metadata', [id])
@ -1479,9 +1399,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def remove_format(self, index, format, index_is_id=False, notify=True, def remove_format(self, index, format, index_is_id=False, notify=True,
commit=True, db_only=False): commit=True, db_only=False):
id = index if index_is_id else self.id(index) id = index if index_is_id else self.id(index)
if format: if not format: format = ''
self.format_metadata_cache[id].pop(format.upper(), None) self.format_metadata_cache[id].pop(format.upper(), None)
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False) name = self.format_filename_cache[id].pop(format.upper(), None)
if name: if name:
if not db_only: if not db_only:
try: try:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.6 KiB

After

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 733 B

After

Width:  |  Height:  |  Size: 2.3 KiB