Speed up reading the db

The speedup comes principally from storing dates as UTC instead of local time.
Also includes some micro-optimizations in the code paths that build the maps.
This commit is contained in:
Kovid Goyal 2013-07-23 08:16:55 +05:30
parent 627375e091
commit a1581e1433
12 changed files with 205 additions and 114 deletions

View File

@ -54,7 +54,7 @@ def _get_series_values(val):
pass
return (val, None)
def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None):
def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None, convert_to_local_tz=True):
'''
Return all metadata stored in the database as a dict. Includes paths to
the cover and each format.
@ -66,6 +66,7 @@ def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None):
'''
import os
from calibre.ebooks.metadata import authors_to_string
from calibre.utils.date import as_local_time
backend = getattr(self, 'backend', self) # Works with both old and legacy interfaces
if prefix is None:
prefix = backend.library_path
@ -88,6 +89,10 @@ def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None):
x = {}
for field in FIELDS:
x[field] = record[self.FIELD_MAP[field]]
if convert_to_local_tz and hasattr(self, 'new_api'):
for tf in ('timestamp', 'pubdate', 'last_modified'):
x[tf] = as_local_time(x[tf])
data.append(x)
x['id'] = db_id
x['formats'] = []

View File

@ -13,7 +13,7 @@ from datetime import timedelta
from calibre.constants import preferred_encoding
from calibre.utils.config_base import prefs
from calibre.utils.date import parse_date, UNDEFINED_DATE, now
from calibre.utils.date import parse_date, UNDEFINED_DATE, now, dt_as_local
from calibre.utils.icu import primary_find, sort_key
from calibre.utils.localization import lang_map, canonicalize_lang
from calibre.utils.search_query_parser import SearchQueryParser, ParseException
@ -211,7 +211,7 @@ class DateSearch(object): # {{{
for v, book_ids in field_iter():
if isinstance(v, (str, unicode)):
v = parse_date(v)
if v is not None and relop(v, qd, field_count):
if v is not None and relop(dt_as_local(v), qd, field_count):
matches |= book_ids
return matches

View File

@ -7,16 +7,37 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from datetime import datetime
from datetime import datetime, timedelta
from collections import defaultdict
from dateutil.tz import tzoffset
from calibre.constants import plugins
from calibre.utils.date import parse_date, local_tz, UNDEFINED_DATE
from calibre.utils.date import parse_date, UNDEFINED_DATE, utc_tz
from calibre.ebooks.metadata import author_to_author_sort
_c_speedup = plugins['speedup'][0]
_c_speedup = plugins['speedup'][0].parse_date
def c_parse(val):
    '''
    Convert a raw value read from a datetime column into a timezone aware
    datetime in UTC.

    The fast C parser is tried first. A bare number such as 2001 is treated
    as a year. Anything else is handed to parse_date().

    :param val: The value as returned by the database.
    :return: A datetime with ``tzinfo=utc_tz`` built from the parsed fields,
        the result of parse_date(), or UNDEFINED_DATE when the value cannot
        be represented as a datetime or parse_date() raises ValueError.
    '''
    try:
        year, month, day, hour, minutes, seconds, tzsecs = _c_speedup(val)
    except (AttributeError, TypeError):
        # If a value like 2001 is stored in the column, apsw will return it as
        # an int
        if isinstance(val, (int, float)):
            try:
                return datetime(int(val), 1, 3, tzinfo=utc_tz)
            except (ValueError, OverflowError):
                # Year outside the range datetime supports, or nan/inf
                return UNDEFINED_DATE
    except Exception:
        # Any other failure of the fast parser: fall back to parse_date()
        # below, but let KeyboardInterrupt/SystemExit propagate.
        pass
    else:
        try:
            ans = datetime(year, month, day, hour, minutes, seconds, tzinfo=utc_tz)
            # Compare by value, identity comparison with an int literal only
            # works by accident of the interpreter's small int cache.
            if tzsecs != 0:
                # The parsed fields are at the stored offset, shift them to UTC
                ans -= timedelta(seconds=tzsecs)
        except (OverflowError, ValueError):
            # Out of range fields (e.g. month 13) or a shift past the
            # representable range
            ans = UNDEFINED_DATE
        return ans
    try:
        return parse_date(val, as_utc=True, assume_utc=True)
    except ValueError:
        return UNDEFINED_DATE
ONE_ONE, MANY_ONE, MANY_MANY = xrange(3)
@ -24,28 +45,6 @@ class Null:
pass
null = Null()
def _c_convert_timestamp(val):
if not val:
return None
try:
ret = _c_speedup.parse_date(val.strip())
except AttributeError:
# If a value like 2001 is stored in the column, apsw will return it as
# an int
if isinstance(val, (int, float)):
return datetime(int(val), 1, 1, tzinfo=tzoffset(None, 0)).astimezone(local_tz)
ret = None
except:
ret = None
if ret is None:
return parse_date(val, as_utc=False)
year, month, day, hour, minutes, seconds, tzsecs = ret
try:
return datetime(year, month, day, hour, minutes, seconds,
tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
except OverflowError:
return UNDEFINED_DATE.astimezone(local_tz)
class Table(object):
def __init__(self, name, metadata, link_table=None):
@ -54,7 +53,7 @@ class Table(object):
# self.unserialize() maps values from the db to python objects
self.unserialize = {
'datetime': _c_convert_timestamp,
'datetime': c_parse,
'bool': bool
}.get(metadata['datatype'], None)
if name == 'authors':
@ -89,7 +88,6 @@ class OneToOneTable(Table):
table_type = ONE_ONE
def read(self, db):
self.book_col_map = {}
idcol = 'id' if self.metadata['table'] == 'books' else 'book'
query = db.conn.execute('SELECT {0}, {1} FROM {2}'.format(idcol,
self.metadata['column'], self.metadata['table']))
@ -175,7 +173,7 @@ class ManyToOneTable(Table):
def read(self, db):
self.id_map = {}
self.col_book_map = {}
self.col_book_map = defaultdict(set)
self.book_col_map = {}
self.read_id_maps(db)
self.read_maps(db)
@ -190,13 +188,13 @@ class ManyToOneTable(Table):
self.id_map = {book_id:us(val) for book_id, val in query}
def read_maps(self, db):
for row in db.conn.execute(
cbm = self.col_book_map
bcm = self.book_col_map
for book, item_id in db.conn.execute(
'SELECT book, {0} FROM {1}'.format(
self.metadata['link_column'], self.link_table)):
if row[1] not in self.col_book_map:
self.col_book_map[row[1]] = set()
self.col_book_map[row[1]].add(row[0])
self.book_col_map[row[0]] = row[1]
cbm[item_id].add(book)
bcm[book] = item_id
def remove_books(self, book_ids, db):
clean = set()
@ -272,17 +270,14 @@ class ManyToManyTable(ManyToOneTable):
do_clean_on_remove = True
def read_maps(self, db):
for row in db.conn.execute(
bcm = defaultdict(list)
cbm = self.col_book_map
for book, item_id in db.conn.execute(
self.selectq.format(self.metadata['link_column'], self.link_table)):
if row[1] not in self.col_book_map:
self.col_book_map[row[1]] = set()
self.col_book_map[row[1]].add(row[0])
if row[0] not in self.book_col_map:
self.book_col_map[row[0]] = []
self.book_col_map[row[0]].append(row[1])
cbm[item_id].add(book)
bcm[book].append(item_id)
for key in tuple(self.book_col_map.iterkeys()):
self.book_col_map[key] = tuple(self.book_col_map[key])
self.book_col_map = {k:tuple(v) for k, v in bcm.iteritems()}
def remove_books(self, book_ids, db):
clean = set()
@ -351,17 +346,16 @@ class ManyToManyTable(ManyToOneTable):
class AuthorsTable(ManyToManyTable):
def read_id_maps(self, db):
self.alink_map = {}
self.asort_map = {}
self.id_map = {}
self.alink_map = lm = {}
self.asort_map = sm = {}
self.id_map = im = {}
us = self.unserialize
for row in db.conn.execute(
for aid, name, sort, link in db.conn.execute(
'SELECT id, name, sort, link FROM authors'):
val = us(row[1])
self.id_map[row[0]] = self.unserialize(val)
self.asort_map[row[0]] = (row[2] if row[2] else
author_to_author_sort(val))
self.alink_map[row[0]] = row[3]
name = us(name)
im[aid] = name
sm[aid] = (sort or author_to_author_sort(name))
lm[aid] = link
def set_sort_names(self, aus_map, db):
aus_map = {aid:(a or '').strip() for aid, a in aus_map.iteritems()}
@ -404,22 +398,20 @@ class FormatsTable(ManyToManyTable):
pass
def read_maps(self, db):
self.fname_map = defaultdict(dict)
self.size_map = defaultdict(dict)
for row in db.conn.execute('SELECT book, format, name, uncompressed_size FROM data'):
if row[1] is not None:
fmt = row[1].upper()
if fmt not in self.col_book_map:
self.col_book_map[fmt] = set()
self.col_book_map[fmt].add(row[0])
if row[0] not in self.book_col_map:
self.book_col_map[row[0]] = []
self.book_col_map[row[0]].append(fmt)
self.fname_map[row[0]][fmt] = row[2]
self.size_map[row[0]][fmt] = row[3]
self.fname_map = fnm = defaultdict(dict)
self.size_map = sm = defaultdict(dict)
self.col_book_map = cbm = defaultdict(set)
bcm = defaultdict(list)
for key in tuple(self.book_col_map.iterkeys()):
self.book_col_map[key] = tuple(sorted(self.book_col_map[key]))
for book, fmt, name, sz in db.conn.execute('SELECT book, format, name, uncompressed_size FROM data'):
if fmt is not None:
fmt = fmt.upper()
cbm[fmt].add(book)
bcm[book].append(fmt)
fnm[book][fmt] = name
sm[book][fmt] = sz
self.book_col_map = {k:tuple(sorted(v)) for k, v in bcm.iteritems()}
def remove_books(self, book_ids, db):
clean = ManyToManyTable.remove_books(self, book_ids, db)
@ -485,14 +477,12 @@ class IdentifiersTable(ManyToManyTable):
pass
def read_maps(self, db):
for row in db.conn.execute('SELECT book, type, val FROM identifiers'):
if row[1] is not None and row[2] is not None:
if row[1] not in self.col_book_map:
self.col_book_map[row[1]] = set()
self.col_book_map[row[1]].add(row[0])
if row[0] not in self.book_col_map:
self.book_col_map[row[0]] = {}
self.book_col_map[row[0]][row[1]] = row[2]
self.book_col_map = defaultdict(dict)
self.col_book_map = defaultdict(set)
for book, typ, val in db.conn.execute('SELECT book, type, val FROM identifiers'):
if typ is not None and val is not None:
self.col_book_map[typ].add(book)
self.book_col_map[book][typ] = val
def remove_books(self, book_ids, db):
clean = set()

View File

@ -0,0 +1,36 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, cProfile
from tempfile import gettempdir
from calibre.db.legacy import LibraryDatabase
db = None
def initdb(path):
    ''' Open the library at path (``~`` is expanded) and keep it in the
    module level ``db`` variable. '''
    global db
    library_dir = os.path.expanduser(path)
    db = LibraryDatabase(library_dir)
def show_stats(path):
    ''' Load the profiler dump at path and print the first 30 entries,
    ordered by cumulative time. '''
    import pstats
    report = pstats.Stats(path)
    # sort_stats() returns the Stats object itself, so the calls can be chained
    report.sort_stats('cumulative').print_stats(30)
def main(path='~/documents/largelib'):
    '''
    Profile the opening of a library, save the profiler dump in the temp
    directory and print a summary of it.

    :param path: The library to open, passed on to initdb(). Defaults to the
        location that was previously hard-coded here.
    '''
    stats = os.path.join(gettempdir(), 'read_db.stats')
    pr = cProfile.Profile()
    pr.enable()
    try:
        initdb(path)
    finally:
        # Never leave the profiler running if opening the library fails
        pr.disable()
    pr.dump_stats(stats)
    show_stats(stats)
    print('Stats saved to', stats)
if __name__ == '__main__':
main()

View File

@ -385,3 +385,21 @@ class ReadingTest(BaseTest):
self.assertFalse(x.has_book(Metadata(title[:1])))
db.close()
# }}}
def test_datetime(self):
    ' Check the conversion of datetime values stored in the db '
    from calibre.db.tables import c_parse, UNDEFINED_DATE, _c_speedup
    from calibre.utils.date import parse_date

    # Strings the C parser must accept; the result has to agree with the
    # pure python parser when values without an offset are assumed to be UTC
    samples = (
        '2013-07-22 15:18:29+05:30',
        ' 2013-07-22 15:18:29+00:00',
        '2013-07-22 15:18:29',
        '2003-09-21 23:30:00-06:00',
    )
    for raw in samples:
        self.assertTrue(_c_speedup(raw))
        self.assertEqual(c_parse(raw), parse_date(raw, assume_utc=True))

    # A bare number is interpreted as a year
    self.assertEqual(c_parse(2003).year, 2003)

    # Values that cannot be parsed give the undefined date
    for bad in (None, '', 'abc'):
        self.assertEqual(UNDEFINED_DATE, c_parse(bad))

View File

@ -18,7 +18,7 @@ from calibre.ebooks.metadata import author_to_author_sort
from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException, \
InvalidGenresSourceFieldException
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.date import format_date, is_date_undefined, now as nowf
from calibre.utils.date import format_date, is_date_undefined, now as nowf, as_local_time
from calibre.utils.filenames import ascii_text, shorten_components_to
from calibre.utils.icu import capitalize, collation_order, sort_key
from calibre.utils.magick.draw import thumbnail
@ -940,7 +940,7 @@ class CatalogBuilder(object):
if is_date_undefined(record['pubdate']):
this_title['date'] = None
else:
this_title['date'] = strftime(u'%B %Y', record['pubdate'].timetuple())
this_title['date'] = strftime(u'%B %Y', as_local_time(record['pubdate']).timetuple())
this_title['timestamp'] = record['timestamp']

View File

@ -74,7 +74,7 @@ def do_list(db, fields, afields, sort_by, ascending, search_text, line_width, se
db.sort(sort_by, ascending)
if search_text:
db.search(search_text)
data = db.get_data_as_dict(prefix, authors_as_string=True)
data = db.get_data_as_dict(prefix, authors_as_string=True, convert_to_local_tz=False)
if limit > -1:
data = data[:limit]
fields = ['id'] + fields

View File

@ -10,7 +10,7 @@ import re, os, posixpath
import cherrypy
from calibre import fit_image, guess_type
from calibre.utils.date import fromtimestamp
from calibre.utils.date import fromtimestamp, as_utc
from calibre.library.caches import SortKeyGenerator
from calibre.library.save_to_disk import find_plugboard
from calibre.ebooks.metadata import authors_to_string
@ -54,6 +54,7 @@ class ContentServer(object):
Generates a locale independent, english timestamp from a datetime
object
'''
updated = as_utc(updated)
lm = updated.strftime('day, %d month %Y %H:%M:%S GMT')
day ={0:'Sun', 1:'Mon', 2:'Tue', 3:'Wed', 4:'Thu', 5:'Fri', 6:'Sat'}
lm = lm.replace('day', day[int(updated.strftime('%w'))])

View File

@ -19,7 +19,7 @@ from calibre.library.server.utils import strftime, format_tag_string
from calibre.ebooks.metadata import fmt_sidx
from calibre.constants import __appname__
from calibre import human_readable, isbytestring
from calibre.utils.date import utcfromtimestamp
from calibre.utils.date import utcfromtimestamp, as_local_time
from calibre.utils.filenames import ascii_filename
from calibre.utils.icu import sort_key
@ -254,7 +254,7 @@ class MobileServer(object):
no_tag_count=True)
book['title'] = record[FM['title']]
for x in ('timestamp', 'pubdate'):
book[x] = strftime('%d %b, %Y', record[FM[x]])
book[x] = strftime('%d %b, %Y', as_local_time(record[FM[x]]))
book['id'] = record[FM['id']]
books.append(book)
for key in CKEYS:

View File

@ -22,6 +22,7 @@ from calibre.library.server import custom_fields_to_display
from calibre.library.server.utils import format_tag_string, Offsets
from calibre import guess_type, prepare_string_for_xml as xml
from calibre.utils.icu import sort_key
from calibre.utils.date import as_utc
BASE_HREFS = {
0 : '/stanza',
@ -58,7 +59,7 @@ ID = E.id
ICON = E.icon
def UPDATED(dt, *args, **kwargs):
return E.updated(dt.strftime('%Y-%m-%dT%H:%M:%S+00:00'), *args, **kwargs)
return E.updated(as_utc(dt).strftime('%Y-%m-%dT%H:%M:%S+00:00'), *args, **kwargs)
LINK = partial(E.link, type='application/atom+xml')
NAVLINK = partial(E.link,

View File

@ -6,23 +6,47 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from datetime import datetime, time, timedelta
import re, time
from datetime import datetime, time as dtime, timedelta
from functools import partial
from dateutil.tz import tzlocal, tzutc
from dateutil.tz import tzlocal, tzutc, EPOCHORDINAL
from calibre import strftime
class SafeLocalTimeZone(tzlocal):
'''
Assume DST was not in effect for historical dates, if DST
data for the local timezone is not present in the operating system.
'''
def _isdst(self, dt):
# We can't use mktime here. It is unstable when deciding if
# the hour near to a change is DST or not.
#
# timestamp = time.mktime((dt.year, dt.month, dt.day, dt.hour,
# dt.minute, dt.second, dt.weekday(), 0, -1))
# return time.localtime(timestamp).tm_isdst
#
# The code above yields the following result:
#
#>>> import tz, datetime
#>>> t = tz.tzlocal()
#>>> datetime.datetime(2003,2,15,23,tzinfo=t).tzname()
#'BRDT'
#>>> datetime.datetime(2003,2,16,0,tzinfo=t).tzname()
#'BRST'
#>>> datetime.datetime(2003,2,15,23,tzinfo=t).tzname()
#'BRST'
#>>> datetime.datetime(2003,2,15,22,tzinfo=t).tzname()
#'BRDT'
#>>> datetime.datetime(2003,2,15,23,tzinfo=t).tzname()
#'BRDT'
#
# Here is a more stable implementation:
#
try:
return tzlocal._isdst(self, dt)
timestamp = ((dt.toordinal() - EPOCHORDINAL) * 86400
+ dt.hour * 3600
+ dt.minute * 60
+ dt.second)
return time.localtime(timestamp+time.timezone).tm_isdst
except ValueError:
pass
return False
@ -150,6 +174,11 @@ def as_local_time(date_time, assume_utc=True):
_local_tz)
return date_time.astimezone(_local_tz)
def dt_as_local(dt):
    ''' Return dt expressed in the local timezone. A datetime whose tzinfo
    already is ``local_tz`` is returned unchanged, without a conversion. '''
    return dt if dt.tzinfo is local_tz else dt.astimezone(local_tz)
def as_utc(date_time, assume_utc=True):
if not hasattr(date_time, 'tzinfo'):
return date_time
@ -174,24 +203,27 @@ def utcfromtimestamp(stamp):
traceback.print_exc()
return utcnow()
#### Format date functions
# Format date functions
def fd_format_hour(dt, strf, ampm, hr):
l = len(hr)
h = dt.hour
if ampm:
h = h%12
if l == 1: return '%d'%h
if l == 1:
return '%d'%h
return '%02d'%h
def fd_format_minute(dt, strf, ampm, min):
l = len(min)
if l == 1: return '%d'%dt.minute
if l == 1:
return '%d'%dt.minute
return '%02d'%dt.minute
def fd_format_second(dt, strf, ampm, sec):
l = len(sec)
if l == 1: return '%d'%dt.second
if l == 1:
return '%d'%dt.second
return '%02d'%dt.second
def fd_format_ampm(dt, strf, ampm, ap):
@ -202,20 +234,27 @@ def fd_format_ampm(dt, strf, ampm, ap):
def fd_format_day(dt, strf, ampm, dy):
l = len(dy)
if l == 1: return '%d'%dt.day
if l == 2: return '%02d'%dt.day
if l == 3: return strf('%a')
if l == 1:
return '%d'%dt.day
if l == 2:
return '%02d'%dt.day
if l == 3:
return strf('%a')
return strf('%A')
def fd_format_month(dt, strf, ampm, mo):
l = len(mo)
if l == 1: return '%d'%dt.month
if l == 2: return '%02d'%dt.month
if l == 3: return strf('%b')
if l == 1:
return '%d'%dt.month
if l == 2:
return '%02d'%dt.month
if l == 3:
return strf('%b')
return strf('%B')
def fd_format_year(dt, strf, ampm, yr):
if len(yr) == 2: return '%02d'%(dt.year % 100)
if len(yr) == 2:
return '%02d'%(dt.year % 100)
return '%04d'%dt.year
fd_function_index = {
@ -240,7 +279,7 @@ def format_date(dt, format, assume_utc=False, as_utc=False):
format = 'dd MMM yyyy'
if not isinstance(dt, datetime):
dt = datetime.combine(dt, time())
dt = datetime.combine(dt, dtime())
if hasattr(dt, 'tzinfo'):
if dt.tzinfo is None:
@ -260,7 +299,7 @@ def format_date(dt, format, assume_utc=False, as_utc=False):
'(s{1,2})|(m{1,2})|(h{1,2})|(ap)|(AP)|(d{1,4}|M{1,4}|(?:yyyy|yy))',
repl_func, format)
#### Clean date functions
# Clean date functions
def cd_has_hour(tt, dt):
tt['hour'] = dt.hour
@ -307,7 +346,7 @@ def clean_date_for_sort(dt, format):
format = 'yyMd'
if not isinstance(dt, datetime):
dt = datetime.combine(dt, time())
dt = datetime.combine(dt, dtime())
if hasattr(dt, 'tzinfo'):
if dt.tzinfo is None:
@ -364,6 +403,8 @@ def replace_months(datestr, clang):
for k in dictoen.iterkeys():
tmp = re.sub(k, dictoen[k], datestr)
if tmp != datestr: break
if tmp != datestr:
break
return tmp

View File

@ -13,12 +13,12 @@ speedup_parse_date(PyObject *self, PyObject *args) {
long year, month, day, hour, minute, second, tzh = 0, tzm = 0, sign = 0;
size_t len;
if(!PyArg_ParseTuple(args, "s", &raw)) return NULL;
while ((*raw == ' ' || *raw == '\t' || *raw == '\n' || *raw == '\r' || *raw == '\f' || *raw == '\v') && *raw != 0) raw++;
len = strlen(raw);
if (len < 19) Py_RETURN_NONE;
orig = raw;
year = strtol(raw, &end, 10);
if ((end - raw) != 4) Py_RETURN_NONE;
raw += 5;
@ -28,7 +28,6 @@ speedup_parse_date(PyObject *self, PyObject *args) {
if ((end - raw) != 2) Py_RETURN_NONE;
raw += 3;
day = strtol(raw, &end, 10);
if ((end - raw) != 2) Py_RETURN_NONE;
raw += 3;