Faster ISO 8601 parser

This commit is contained in:
Kovid Goyal 2016-06-04 11:00:46 +05:30
parent a96d7ac336
commit ae632d23cc
2 changed files with 201 additions and 90 deletions

View File

@ -10,23 +10,46 @@ import re
from dateutil.tz import tzlocal, tzutc, tzoffset
from calibre.constants import plugins
speedup, err = plugins['speedup']
if not speedup:
raise RuntimeError(err)
class SafeLocalTimeZone(tzlocal):
    """Local timezone whose DST lookup never raises.

    Behaves like the ``tzlocal`` base class, except that any failure while
    determining whether a datetime falls in DST is treated as "not DST".
    """

    def _isdst(self, dt):
        """Return whether ``dt`` is in DST, or False if that cannot be determined.

        :param dt: the datetime to test
        :return: the base class answer, or ``False`` when the base class raises
        """
        # The base-class method raises ValueError if dt is out of range.
        # In such cases, just assume that dt is not DST.
        # Go through super() rather than naming the base class directly: the
        # base class name is deleted from the module namespace after the
        # module-level instances are created, so a direct reference would fail
        # at call time and be silently swallowed by the handler below.
        try:
            return super(SafeLocalTimeZone, self)._isdst(dt)
        except Exception:
            # Deliberately broad: this is a best-effort lookup.
            return False
# Module-level timezone instances shared by the parsing helpers in this file.
utc_tz = tzutc()
local_tz = SafeLocalTimeZone()
# Remove the imported dateutil classes from the module namespace; after this
# point only the two instances above are reachable by name.
del tzutc, tzlocal
_iso_pat = None
if hasattr(speedup, 'parse_iso8601'): # For people running from source without updated binaries
def parse_iso8601(date_string, assume_utc=True, as_utc=True):
    """Parse an ISO 8601 date string into a timezone-aware datetime.

    The string itself is parsed by the compiled ``speedup.parse_iso8601``
    helper, which yields the datetime, a flag saying whether a timezone was
    present, and the timezone offset in seconds.

    :param date_string: the text to parse
    :param assume_utc: when the string carries no timezone, interpret it as
        UTC if true, otherwise as local time
    :param as_utc: return the result in UTC if true, otherwise in local time
    :return: a datetime with tzinfo set
    """
    parsed, has_tz, offset = speedup.parse_iso8601(date_string)
    if not has_tz:
        # No timezone in the string: fall back to the caller's assumption.
        tz = utc_tz if assume_utc else local_tz
    elif offset == 0:
        tz = utc_tz
    else:
        hours, remainder = divmod(abs(offset), 3600)
        label = "%s%02d:%02d" % ('-' if offset < 0 else '+', hours, remainder // 60)
        tz = tzoffset(label, offset)
    parsed = parsed.replace(tzinfo=tz)
    if as_utc:
        # Already expressed in UTC: no conversion needed.
        return parsed if tz is utc_tz else parsed.astimezone(utc_tz)
    return parsed.astimezone(local_tz)
else:
# Pure python implementation
_iso_pat = None
def iso_pat():
def iso_pat():
global _iso_pat
if _iso_pat is None:
_iso_pat = re.compile(
@ -73,7 +96,7 @@ def iso_pat():
""", re.VERBOSE)
return _iso_pat
def to_int(d, key, default_to_zero=False, default=None, required=True):
def to_int(d, key, default_to_zero=False, default=None, required=True):
"""Pull a value from the dict and convert to int
:param default_to_zero: If the value is None or empty, treat it as zero
@ -89,7 +112,7 @@ def to_int(d, key, default_to_zero=False, default=None, required=True):
else:
return int(value)
def parse_timezone(matches, default_timezone=utc_tz):
def parse_timezone(matches, default_timezone=utc_tz):
"""Parses ISO 8601 time zone specs into tzinfo offsets
"""
@ -110,7 +133,7 @@ def parse_timezone(matches, default_timezone=utc_tz):
minutes = -minutes
return tzoffset(description, 3600*hours + 60*minutes)
def parse_iso8601(date_string, assume_utc=True, as_utc=True):
def parse_iso8601(date_string, assume_utc=True, as_utc=True):
if isinstance(date_string, bytes):
date_string = date_string.decode('ascii')
m = iso_pat().match(date_string)

View File

@ -1,5 +1,6 @@
#define UNICODE
#include <Python.h>
#include <datetime.h>
#include <stdlib.h>
#include <fcntl.h>
@ -353,9 +354,95 @@ clean_xml_chars(PyObject *self, PyObject *text) {
return (PyObject*)ans;
}
/*
 * Parse an ISO 8601 style date string (the single "s" argument).
 *
 * Returns a 3-tuple: (datetime object, True/False for whether a timezone
 * designator was present, timezone offset in seconds). The offset is 0 when
 * no designator was present or when the designator was 'Z'.
 *
 * Returns NULL with ValueError set when the year is not four digits, the
 * month is greater than 12, or a time separator is not followed by a
 * two-digit hour. Characters after the parsed portion are not examined.
 * NOTE(review): other range checks (day, hour, ...) appear to be left to
 * PyDateTime_FromDateAndTime -- confirm.
 */
static PyObject *
speedup_iso_8601(PyObject *self, PyObject *args) {
char *str = NULL, *c = NULL;
// tzhour starts at the sentinel 1000, meaning "no timezone seen so far".
// i is the digit counter used by READ_DECIMAL_NUMBER and read again afterwards
// when expanding the fractional seconds.
int year = 0, month = 0, day = 0, hour = 0, minute = 0, second = 0, usecond = 0, i = 0, tzhour = 1000, tzminute = 0, tzsign = 0;
if (!PyArg_ParseTuple(args, "s", &str)) return NULL;
c = str;
// RAISE: set a ValueError naming the full input string and return from this function
#define RAISE(msg) return PyErr_Format(PyExc_ValueError, "%s is not a valid ISO 8601 datestring: %s", str, msg);
// CHAR_IS_DIGIT: true when the character pointed to is an ASCII digit
#define CHAR_IS_DIGIT(c) (*c >= '0' && *c <= '9')
// READ_DECIMAL_NUMBER: consume up to max_digits digits at c, accumulating them
// into x. On the first non-digit the abort statement runs; when abort is
// `break` the loop ends early and i holds the number of digits consumed.
#define READ_DECIMAL_NUMBER(max_digits, x, abort) \
for (i = 0; i < max_digits; i++) { \
if (CHAR_IS_DIGIT(c)) x = 10 * x + *c++ - '0'; \
else { abort; } \
}
// OPTIONAL_SEPARATOR: skip the character x if it is next in the input
#define OPTIONAL_SEPARATOR(x) if(*c == x) c++;
// Ignore leading whitespace
while(*c == ' ' || *c == '\n' || *c == '\r' || *c == '\t' || *c == '\v' || *c == '\f') c++;
// Year
// (all four digits are required; anything shorter raises)
READ_DECIMAL_NUMBER(4, year, RAISE("No year specified"));
OPTIONAL_SEPARATOR('-');
// Month (optional)
READ_DECIMAL_NUMBER(2, month, break);
if (month == 0) month = 1; // YYYY format
else {
OPTIONAL_SEPARATOR('-');
// Day (optional)
READ_DECIMAL_NUMBER(2, day, break);
}
if (day == 0) day = 1; // YYYY-MM format
if (month > 12) RAISE("month greater than 12");
if (*c == 'T' || *c == ' ') // Time separator
{
c++;
// Hour
// (both digits are required once a time separator has been seen)
READ_DECIMAL_NUMBER(2, hour, RAISE("No hour specified"));
OPTIONAL_SEPARATOR(':');
// Minute (optional)
READ_DECIMAL_NUMBER(2, minute, break);
OPTIONAL_SEPARATOR(':');
// Second (optional)
READ_DECIMAL_NUMBER(2, second, break);
if (*c == '.' || *c == ',') // separator for microseconds
{
c++;
// Parse fraction of second up to 6 places
READ_DECIMAL_NUMBER(6, usecond, break);
// Omit excessive digits
while (CHAR_IS_DIGIT(c)) c++;
// If we break early, fully expand the usecond
// (i is the number of fraction digits read, so this pads to microseconds)
while (i++ < 6) usecond *= 10;
}
}
// Timezone designator: 'Z' clears the tzhour sentinel with a zero offset,
// '+' / '-' record the sign of a numeric offset parsed below.
switch(*c) {
case 'Z':
tzhour = 0; c++; break;
case '+':
tzsign = 1; c++; break;
case '-':
tzsign = -1; c++; break;
default:
break;
}
if (tzsign != 0) {
// Numeric offset: hours, optional ':', then minutes (each up to two digits)
tzhour = 0;
READ_DECIMAL_NUMBER(2, tzhour, break);
OPTIONAL_SEPARATOR(':');
READ_DECIMAL_NUMBER(2, tzminute, break);
}
// Build (datetime, aware flag, offset in seconds); tzhour still holding the
// sentinel 1000 means no timezone designator was found.
return Py_BuildValue("NOi", PyDateTime_FromDateAndTime(year, month, day, hour, minute, second, usecond), (tzhour == 1000) ? Py_False : Py_True, tzsign*60*(tzhour*60 + tzminute));
}
static PyMethodDef speedup_methods[] = {
{"parse_date", speedup_parse_date, METH_VARARGS,
"parse_date()\n\nParse ISO dates faster."
"parse_date()\n\nParse ISO dates faster (specialized for dates stored in the calibre db)."
},
{"parse_iso8601", speedup_iso_8601, METH_VARARGS,
"parse_iso8601(datestring)\n\nParse ISO 8601 dates faster. More spec compliant than parse_date()"
},
{"pdf_float", speedup_pdf_float, METH_VARARGS,
@ -403,6 +490,7 @@ initspeedup(void) {
"Implementation of methods in C for speed."
);
if (m == NULL) return;
PyDateTime_IMPORT;
#ifdef O_CLOEXEC
PyModule_AddIntConstant(m, "O_CLOEXEC", O_CLOEXEC);
#endif