Merge from trunk

This commit is contained in:
Charles Haley 2012-01-18 11:23:48 +01:00
commit 5162eb7c6d
12 changed files with 229 additions and 130 deletions

View File

@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe):
use_embedded_content = False
remove_javascript = True
needs_subscription = True
needs_subscription = 'optional'
encoding= 'ISO-8859-1'
remove_tags_before = dict(name='font', attrs={'class':'date'})
@ -75,32 +75,30 @@ class ESPN(BasicNewsRecipe):
return soup
def get_browser(self):
br = BasicNewsRecipe.get_browser()
br.set_handle_refresh(False)
url = ('https://r.espn.go.com/members/v3_1/login')
raw = br.open(url).read()
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
with TemporaryFile(suffix='.htm') as fname:
with open(fname, 'wb') as f:
f.write(raw)
br.open_local_file(fname)
if self.username and self.password:
br.set_handle_refresh(False)
url = ('https://r.espn.go.com/members/v3_1/login')
raw = br.open(url).read()
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
with TemporaryFile(suffix='.htm') as fname:
with open(fname, 'wb') as f:
f.write(raw)
br.open_local_file(fname)
br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit().read()
br.open('http://espn.go.com').read()
br.set_handle_refresh(True)
br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit().read()
br.open('http://espn.go.com').read()
br.set_handle_refresh(True)
return br
def get_article_url(self, article):
return article.get('guid', None)
def print_version(self, url):
if 'eticket' in url:
return url.partition('&')[0].replace('story?', 'print?')
match = re.search(r'story\?(id=\d+)', url)

View File

@ -0,0 +1,66 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__docformat__ = 'restructuredtext en'
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Tweakers(BasicNewsRecipe):
    # Recipe for the Tweakers.net news feed. Besides the article itself, the
    # element with id 'reacties' is kept (see keep_only_tags below).
    title = u'Tweakers.net - with Reactions'
    __author__ = 'Roedi06'
    language = 'nl'
    oldest_article = 7
    max_articles_per_feed = 100
    cover_url = 'http://img51.imageshack.us/img51/7470/tweakersnetebook.gif'

    # Only these parts of the downloaded page are kept.
    keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'}),
                      {'id':'reacties'},
                      ]

    # Elements dropped from the kept parts.
    remove_tags = [dict(name='div', attrs={'id' : ['utracker']}),
                   {'id' : ['channelNav']},
                   {'id' : ['contentArea']},
                   {'class' : ['breadCrumb']},
                   {'class' : ['nextPrevious ellipsis']},
                   {'class' : ['advertorial']},
                   {'class' : ['sidebar']},
                   {'class' : ['filterBox']},
                   {'id' : ['toggleButtonTxt']},
                   {'id' : ['socialButtons']},
                   {'class' : ['button']},
                   {'class' : ['textadTop']},
                   {'class' : ['commentLink']},
                   {'title' : ['Reageer op deze reactie']},
                   {'class' : ['pageIndex']},
                   {'class' : ['reactieHeader collapsed']},
                   ]

    no_stylesheets=True

    # (pattern, replacement) pairs. The order is significant: the score_N
    # rules put a textual score in front of the moderation <div>, and only
    # the final rule removes that <div>.
    preprocess_regexps = [
        # Strip horizontal rules in any spelling: <hr>, <hr/>, <hr class=...>.
        # The previous pattern '<hr*?>' quantified the letter 'r' instead of
        # "any attributes", so only a bare <hr> was ever removed.
        (re.compile(r'<hr\b[^>]*>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'<p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'</p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        # Render links as bold + underlined text. The word boundary keeps
        # other tags that merely start with 'a' (<abbr>, <address>, ...) from
        # being rewritten without a matching closing tag.
        (re.compile(r'<a\b.*?>'), lambda h1: '<b><u>'),
        (re.compile(r'</a>'), lambda h2: '</u></b>'),
        (re.compile(r'<span class="new">', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'</span>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'), lambda match : ' - moderated 0<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'),
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'), lambda match : ' - moderated +1<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'),
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'), lambda match : ' - moderated +2<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'),
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'), lambda match : ' - moderated +3<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'),
        (re.compile(r'<div class="moderation">.*?</div>'), lambda h1: ''),
        ]

    extra_css = (
        '.reactieHeader { color: #333333; font-size: 6px; border-bottom:solid 2px #333333; border-top:solid 1px #333333; } '
        '.reactieContent { font-family:"Times New Roman",Georgia,Serif; color: #000000; font-size: 8px; } '
        '.quote { font-family:"Times New Roman",Georgia,Serif; padding-left:2px; border-left:solid 3px #666666; color: #666666; }'
    )

    feeds = [(u'Tweakers.net', u'http://feeds.feedburner.com/tweakers/nieuws')]

    def print_version(self, url):
        '''
        Return the URL to download for the article at `url`: the same URL
        with the query string ``?max=200`` appended.
        '''
        # NOTE(review): presumably max=200 raises the number of reactions
        # shown on one page -- confirm against the site.
        return url + '?max=200'

View File

@ -14,6 +14,7 @@ from functools import wraps, partial
from calibre.db.locking import create_locks, RecordLock
from calibre.db.fields import create_field
from calibre.db.tables import VirtualTable
from calibre.db.lazy import FormatMetadata, FormatsList
from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import now
@ -127,14 +128,8 @@ class Cache(object):
if not formats:
good_formats = None
else:
good_formats = []
for f in formats:
try:
mi.format_metadata[f] = self._format_metadata(book_id, f)
except:
pass
else:
good_formats.append(f)
mi.format_metadata = FormatMetadata(self, id, formats)
good_formats = FormatsList(formats, mi.format_metadata)
mi.formats = good_formats
mi.has_cover = _('Yes') if self._field_for('cover', book_id,
default_value=False) else ''

99
src/calibre/db/lazy.py Normal file
View File

@ -0,0 +1,99 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import weakref
from functools import wraps
from collections import MutableMapping, MutableSequence
'''
Avoid doing stats on all files in a book when getting metadata for that book.
Speeds up calibre startup with large libraries/libraries on a network share,
with a composite custom column.
'''
# Lazy format metadata retrieval {{{
def resolved(f):
    '''
    Decorator for methods of lazily populated objects.

    The first time any decorated method is called on an instance,
    ``self._resolve()`` is run and ``self._must_resolve`` is set to False so
    that later calls skip it. An instance without a ``_must_resolve``
    attribute is treated as not yet resolved.
    '''
    def lazy_call(self, *args, **kwargs):
        needs_resolving = getattr(self, '_must_resolve', True)
        if needs_resolving:
            self._resolve()
            # Only reached when _resolve() did not raise, so a failed
            # resolve is retried on the next call.
            self._must_resolve = False
        return f(self, *args, **kwargs)
    return wraps(f)(lazy_call)
class MutableBase(object):
    '''
    Container protocol methods that delegate to ``self._values``.

    Every method is wrapped with :func:`resolved`, so ``self._resolve()`` is
    run before the first access. Subclasses are expected to supply a
    ``_resolve()`` that assigns ``self._values``.
    '''

    # Item access

    @resolved
    def __getitem__(self, fmt):
        return self._values[fmt]

    @resolved
    def __setitem__(self, key, val):
        self._values[key] = val

    @resolved
    def __delitem__(self, key):
        del self._values[key]

    # Size, iteration and membership

    @resolved
    def __len__(self):
        return len(self._values)

    @resolved
    def __iter__(self):
        return iter(self._values)

    @resolved
    def __contains__(self, key):
        return key in self._values

    # Textual representations of the underlying container

    @resolved
    def __repr__(self):
        return repr(self._values)

    @resolved
    def __str__(self):
        return str(self._values)

    @resolved
    def __unicode__(self):
        return unicode(self._values)
class FormatMetadata(MutableBase, MutableMapping):
    '''
    Mapping of format name to the result of ``db.format_metadata(id_, fmt)``,
    filled in on first access rather than at construction time.

    :param db: Object providing ``format_metadata(book_id, fmt)``. Only a
               weak reference to it is kept.
    :param id_: The book id passed to ``db.format_metadata``.
    :param formats: Iterable of format names to look up.
    '''

    def __init__(self, db, id_, formats):
        self._dbwref = weakref.ref(db)
        self._id = id_
        self._formats = formats

    def _resolve(self):
        '''
        Build ``self._values``. Formats whose lookup fails are left out.
        '''
        self._values = {}
        db = self._dbwref()
        if db is None:
            # The database has been garbage collected, nothing can be
            # looked up any more; expose an empty mapping.
            return
        for f in self._formats:
            try:
                self._values[f] = db.format_metadata(self._id, f)
            except Exception:
                # Best effort: a format that cannot be read is simply
                # absent. KeyboardInterrupt/SystemExit still propagate.
                pass
class FormatsList(MutableBase, MutableSequence):
    '''
    Sequence of those entries of `formats` that are present in
    `format_metadata`, computed on first access.
    '''

    def __init__(self, formats, format_metadata):
        self._formats = formats
        self._format_metadata = format_metadata

    def _resolve(self):
        # Build the list locally and publish it in one assignment.
        known = self._format_metadata
        present = []
        for fmt in self._formats:
            if fmt in known:
                present.append(fmt)
        self._values = present

    @resolved
    def insert(self, idx, val):
        self._values.insert(idx, val)
# }}}

View File

@ -11,6 +11,7 @@ from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.constants import filesystem_encoding
class CHMInput(InputFormatPlugin):
@ -36,6 +37,8 @@ class CHMInput(InputFormatPlugin):
log.debug('Processing CHM...')
with TemporaryDirectory('_chm2oeb') as tdir:
if not isinstance(tdir, unicode):
tdir = tdir.decode(filesystem_encoding)
html_input = plugin_for_input_format('html')
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)

View File

@ -6,13 +6,14 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
import re, codecs
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import string_to_authors, MetaInformation
from calibre.utils.logging import default_log
from calibre.ptempfile import TemporaryFile
from calibre import force_unicode
def _clean(s):
return s.replace(u'\u00a0', u' ')
@ -138,6 +139,13 @@ def get_metadata_from_reader(rdr):
resolve_entities=True)[0])
title = rdr.title
try:
x = rdr.GetEncoding()
codecs.lookup(x)
enc = x
except:
enc = 'cp1252'
title = force_unicode(title, enc)
authors = _get_authors(home)
mi = MetaInformation(title, authors)
publisher = _get_publisher(home)

View File

@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.'
import os, re
import os, re, codecs
from calibre import guess_type as guess_mimetype
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
@ -99,8 +99,17 @@ class CHMReader(CHMFile):
def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
html_files = set([])
try:
x = self.GetEncoding()
codecs.lookup(x)
enc = x
except:
enc = 'cp1252'
for path in self.Contents():
lpath = os.path.join(output_dir, path)
fpath = path
if not isinstance(path, unicode):
fpath = path.decode(enc)
lpath = os.path.join(output_dir, fpath)
self._ensure_dir(lpath)
try:
data = self.GetFile(path)
@ -123,6 +132,7 @@ class CHMReader(CHMFile):
self.log.warn('%r filename too long, skipping'%path)
continue
raise
if debug_dump:
import shutil
shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))

View File

@ -103,7 +103,7 @@ def html5_parse(data, max_nesting_depth=100):
xmlns_declaration = '{%s}'%XMLNS_NS
non_html5_namespaces = {}
seen_namespaces = set()
for elem in tuple(data.iter()):
for elem in tuple(data.iter(tag=etree.Element)):
elem.attrib.pop('xmlns', None)
namespaces = {}
for x in tuple(elem.attrib):

View File

@ -7,8 +7,8 @@ __docformat__ = 'restructuredtext en'
The database used to store ebook metadata
'''
import os, sys, shutil, cStringIO, glob, time, functools, traceback, re, \
json, uuid, hashlib, copy, weakref
from collections import defaultdict, MutableMapping, MutableSequence
json, uuid, hashlib, copy
from collections import defaultdict
import threading, random
from itertools import repeat
from math import ceil
@ -40,6 +40,7 @@ from calibre.utils.magick.draw import save_cover_data_to
from calibre.utils.recycle_bin import delete_file, delete_tree
from calibre.utils.formatter_functions import load_user_template_functions
from calibre.db.errors import NoSuchFormat
from calibre.db.lazy import FormatMetadata, FormatsList
from calibre.utils.localization import (canonicalize_lang,
calibre_langcode_to_name)
@ -81,90 +82,6 @@ class Tag(object):
def __repr__(self):
return str(self)
class FormatMetadata(MutableMapping): # {{{
    '''
    Mapping of format name to the result of ``db.format_metadata(id_, fmt)``,
    filled in on first access rather than at construction time.

    :param db: Object providing ``format_metadata(book_id, fmt)``. Only a
               weak reference to it is kept.
    :param id_: The book id passed to ``db.format_metadata``.
    :param formats: Iterable of format names to look up.
    '''

    def __init__(self, db, id_, formats):
        self.dbwref = weakref.ref(db)
        self.id_ = id_
        self.formats = formats
        self._must_do = True
        # Kept under a private name: an instance attribute called `values`
        # would shadow the values() method inherited from the mapping base
        # class and make obj.values() raise TypeError.
        self._values = {}

    def _resolve(self):
        '''
        Populate the mapping once. Formats whose lookup fails are left out.
        '''
        if not self._must_do:
            return
        db = self.dbwref()
        if db is not None:
            for f in self.formats:
                try:
                    self._values[f] = db.format_metadata(self.id_, f)
                except Exception:
                    # Best effort: a format that cannot be read is simply
                    # absent. KeyboardInterrupt/SystemExit still propagate.
                    pass
        # A dead weak reference leaves the mapping empty, as before.
        self._must_do = False

    def __getitem__(self, fmt):
        self._resolve()
        return self._values[fmt]

    def __setitem__(self, key, val):
        self._resolve()
        self._values[key] = val

    def __delitem__(self, key):
        self._resolve()
        del self._values[key]

    def __len__(self):
        self._resolve()
        return len(self._values)

    def __iter__(self):
        self._resolve()
        return iter(self._values)
class FormatsList(MutableSequence):
    '''
    Sequence of those entries of `formats` that are present in
    `format_metadata`, computed on first access.

    :param formats: Iterable of candidate format names.
    :param format_metadata: Container tested with ``in`` for each format.
    '''

    def __init__(self, formats, format_metadata):
        self.formats = formats
        self.format_metadata = format_metadata
        self._must_do = True
        self.values = []

    def _resolve(self):
        '''
        Populate the list once. A format whose membership test raises is
        skipped.
        '''
        if not self._must_do:
            return
        present = []
        for f in self.formats:
            try:
                if f in self.format_metadata:
                    present.append(f)
            except Exception:
                # Best effort: skip this format only.
                # KeyboardInterrupt/SystemExit still propagate.
                pass
        # Published in one step so that an interrupted resolve cannot leave
        # a partial list behind that a retry would then duplicate.
        self.values[:] = present
        self._must_do = False

    def __getitem__(self, dex):
        self._resolve()
        return self.values[dex]

    def __setitem__(self, key, dex):
        self._resolve()
        self.values[key] = dex

    def __delitem__(self, dex):
        self._resolve()
        del self.values[dex]

    def __len__(self):
        self._resolve()
        return len(self.values)

    def __iter__(self):
        self._resolve()
        return iter(self.values)

    def insert(self, idx, val):
        self._resolve()
        self.values.insert(idx, val)
# }}}
class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
'''
An ebook metadata database that stores references to ebook files on disk.
@ -253,6 +170,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
except:
traceback.print_exc()
self.field_metadata = FieldMetadata()
self.format_filename_cache = defaultdict(dict)
self._library_id_ = None
# Create the lock to be used to guard access to the metadata writer
# queues. This must be an RLock, not a Lock
@ -393,6 +311,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
if not self.is_second_db:
load_user_template_functions(self.prefs.get('user_template_functions', []))
# Load the format filename cache
self.format_filename_cache = defaultdict(dict)
for book_id, fmt, name in self.conn.get(
'SELECT book,format,name FROM data'):
self.format_filename_cache[book_id][fmt.upper() if fmt else ''] = name
self.conn.executescript('''
DROP TRIGGER IF EXISTS author_insert_trg;
CREATE TEMP TRIGGER author_insert_trg
@ -682,7 +606,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
fname = self.construct_file_name(id)
changed = False
for format in formats:
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False)
name = self.format_filename_cache[id].get(format.upper(), None)
if name and name != fname:
changed = True
break
@ -1222,12 +1146,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def format_files(self, index, index_is_id=False):
id = index if index_is_id else self.id(index)
try:
formats = self.conn.get('SELECT name,format FROM data WHERE book=?', (id,))
formats = map(lambda x:(x[0], x[1]), formats)
return formats
except:
return []
return [(v, k) for k, v in self.format_filename_cache[id].iteritems()]
def formats(self, index, index_is_id=False, verify_formats=True):
''' Return available formats as a comma separated list or None if there are no available formats '''
@ -1313,7 +1232,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
'''
id = index if index_is_id else self.id(index)
try:
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False)
name = self.format_filename_cache[id][format.upper()]
except:
return None
if name:
@ -1410,11 +1329,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def add_format(self, index, format, stream, index_is_id=False, path=None,
notify=True, replace=True):
id = index if index_is_id else self.id(index)
if format:
self.format_metadata_cache[id].pop(format.upper(), None)
if not format: format = ''
self.format_metadata_cache[id].pop(format.upper(), None)
name = self.format_filename_cache[id].get(format.upper(), None)
if path is None:
path = os.path.join(self.library_path, self.path(id, index_is_id=True))
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False)
if name and not replace:
return False
name = self.construct_file_name(id)
@ -1432,6 +1351,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.conn.execute('INSERT OR REPLACE INTO data (book,format,uncompressed_size,name) VALUES (?,?,?,?)',
(id, format.upper(), size, name))
self.conn.commit()
self.format_filename_cache[id][format.upper()] = name
self.refresh_ids([id])
if notify:
self.notify('metadata', [id])
@ -1479,9 +1399,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def remove_format(self, index, format, index_is_id=False, notify=True,
commit=True, db_only=False):
id = index if index_is_id else self.id(index)
if format:
self.format_metadata_cache[id].pop(format.upper(), None)
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False)
if not format: format = ''
self.format_metadata_cache[id].pop(format.upper(), None)
name = self.format_filename_cache[id].pop(format.upper(), None)
if name:
if not db_only:
try:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.6 KiB

After

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 733 B

After

Width:  |  Height:  |  Size: 2.3 KiB