mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Faster ISO 8601 parser
This commit is contained in:
parent
a96d7ac336
commit
ae632d23cc
@ -10,125 +10,148 @@ import re
|
||||
|
||||
from dateutil.tz import tzlocal, tzutc, tzoffset
|
||||
|
||||
from calibre.constants import plugins
|
||||
# Look up the compiled 'speedup' plugin; the entry unpacks to the module and an error value.
speedup, err = plugins['speedup']
|
||||
# Refuse to import this module when the plugin is unavailable, surfacing the recorded error.
if not speedup:
|
||||
raise RuntimeError(err)
|
||||
|
||||
class SafeLocalTimeZone(tzlocal):
    """Local timezone whose DST check never raises."""

    def _isdst(self, dt):
        """Return the parent class's DST verdict for ``dt``, or False if it fails.

        The tzlocal implementation raises ValueError if dt is out of range.
        In such cases, just assume that dt is not DST.
        """
        try:
            # Go through super() rather than the module-level name ``tzlocal``:
            # that name is deleted once the timezone singletons are created, so
            # referring to it here would raise NameError at call time and the
            # handler below would silently turn every answer into False.
            return super(SafeLocalTimeZone, self)._isdst(dt)
        except Exception:
            # Deliberately broad: this is a best-effort fallback.
            return False
|
||||
|
||||
# Shared timezone instances used by the parsers in this module.
utc_tz = tzutc()
|
||||
local_tz = SafeLocalTimeZone()
|
||||
# Unbind the dateutil class names at module level; only the instances above stay reachable by name.
del tzutc, tzlocal
|
||||
|
||||
_iso_pat = None
|
||||
if hasattr(speedup, 'parse_iso8601'): # For people running from source without updated binaries
|
||||
def parse_iso8601(date_string, assume_utc=True, as_utc=True):
    """Parse an ISO 8601 string using the compiled ``speedup`` parser.

    :param date_string: The text to parse
    :param assume_utc: When the text names no timezone, interpret it as UTC
        if true, otherwise as local time
    :param as_utc: Return the result converted to UTC if true, otherwise
        converted to local time
    :return: A timezone-aware datetime
    """
    # The C parser hands back the datetime, whether a timezone was present in
    # the text, and the signed UTC offset in seconds.
    parsed, has_tz, offset = speedup.parse_iso8601(date_string)

    if not has_tz:
        zone = utc_tz if assume_utc else local_tz
    elif offset == 0:
        zone = utc_tz
    else:
        magnitude = abs(offset)
        label = "%s%02d:%02d" % (
            '-' if offset < 0 else '+',
            magnitude // 3600,
            (magnitude % 3600) // 60,
        )
        zone = tzoffset(label, offset)

    parsed = parsed.replace(tzinfo=zone)
    if as_utc and zone is utc_tz:
        # Already expressed in the requested zone, no conversion needed
        return parsed
    target = utc_tz if as_utc else local_tz
    return parsed.astimezone(target)
|
||||
else:
|
||||
# Pure python implementation
|
||||
# Lazily compiled pattern, filled in by iso_pat() on first use
_iso_pat = None


def iso_pat():
    """Return the compiled ISO 8601 regular expression, compiling it on first call.

    The pattern exposes these named groups: year, month / monthdash,
    day / daydash, separator, hour, minute, second, second_fraction,
    timezone, tz_sign, tz_hour and tz_minute.
    """
    global _iso_pat
    if _iso_pat is None:
        _iso_pat = re.compile(
            # Adapted from http://delete.me.uk/2005/03/iso8601.html
            r"""
            (?P<year>[0-9]{4})
            (
                (
                    (-(?P<monthdash>[0-9]{1,2}))
                    |
                    (?P<month>[0-9]{2})
                    (?!$) # Don't allow YYYYMM
                )
                (
                    (
                        (-(?P<daydash>[0-9]{1,2}))
                        |
                        (?P<day>[0-9]{2})
                    )
                    (
                        (
                            (?P<separator>[ T])
                            (?P<hour>[0-9]{2})
                            (:{0,1}(?P<minute>[0-9]{2})){0,1}
                            (
                                :{0,1}(?P<second>[0-9]{1,2})
                                ([.,](?P<second_fraction>[0-9]+)){0,1}
                            ){0,1}
                            (?P<timezone>
                                Z
                                |
                                (
                                    (?P<tz_sign>[-+])
                                    (?P<tz_hour>[0-9]{2})
                                    :{0,1}
                                    (?P<tz_minute>[0-9]{2}){0,1}
                                )
                            ){0,1}
                        ){0,1}
                    )
                ){0,1} # YYYY-MM
            ){0,1} # YYYY only
            $
            """, re.VERBOSE)
    return _iso_pat
|
||||
|
||||
def to_int(d, key, default_to_zero=False, default=None, required=True):
    """Pull a value from the dict and convert to int

    :param d: Mapping to read from (anything with a ``get`` method)
    :param key: Key to look up in ``d``
    :param default_to_zero: If the value is None or empty, treat it as zero
    :param default: Used when ``d`` holds no truthy value for ``key``
        (a missing key, None or an empty string all fall back to it)
    :param required: If true, raise ValueError when no value is available;
        otherwise return None in that case
    :return: The value converted with ``int()``, or 0 / None as described above
    :raises ValueError: If a required value is absent, or ``int()`` rejects it
    """
    value = d.get(key) or default
    if default_to_zero and (value is None or value == ''):
        return 0
    if value is not None:
        return int(value)
    if required:
        raise ValueError("Unable to read %s from %s" % (key, d))
    return None
|
||||
|
||||
def parse_timezone(matches, default_timezone=utc_tz):
    """Parses ISO 8601 time zone specs into tzinfo offsets

    :param matches: Dict of regex groups; the "timezone", "tz_sign",
        "tz_hour" and "tz_minute" entries are read
    :param default_timezone: Returned when no timezone was matched
    :return: ``utc_tz`` for "Z", ``default_timezone`` when the timezone group
        is None, otherwise a ``tzoffset`` for the signed hours/minutes
    """
    if matches["timezone"] == "Z":
        return utc_tz
    # This isn't strictly correct, but it's common to encounter dates without
    # timezones so I'll assume the default (which defaults to UTC).
    # Addresses issue 4.
    if matches["timezone"] is None:
        return default_timezone
    sign = matches["tz_sign"]
    hours = to_int(matches, "tz_hour")
    minutes = to_int(matches, "tz_minute", default_to_zero=True)
    # The label keeps the textual sign; the numeric offset is negated below
    description = "%s%02d:%02d" % (sign, hours, minutes)
    if sign == "-":
        hours = -hours
        minutes = -minutes
    return tzoffset(description, 3600*hours + 60*minutes)
|
||||
|
||||
def parse_iso8601(date_string, assume_utc=True, as_utc=True):
    """Parse an ISO 8601 string with the regular expression from ``iso_pat()``.

    :param date_string: Text to parse; bytes are decoded as ASCII first
    :param assume_utc: When the text names no timezone, interpret it as UTC
        if true, otherwise as local time
    :param as_utc: Return the result converted to UTC if true, otherwise
        converted to local time
    :return: A timezone-aware datetime
    :raises ValueError: If the text does not match the ISO 8601 pattern
    """
    if isinstance(date_string, bytes):
        date_string = date_string.decode('ascii')
    m = iso_pat().match(date_string)
    if m is None:
        raise ValueError('%r is not a valid ISO8601 date' % date_string)
    groups = m.groupdict()
    tz = parse_timezone(groups, default_timezone=utc_tz if assume_utc else local_tz)
    # Scale the fractional digits to microseconds; int() truncates anything
    # finer than one microsecond.
    microsecond = int(Decimal("0.%s" % (groups["second_fraction"] or 0)) * Decimal("1000000.0"))
    return datetime(
        year=to_int(groups, "year"),
        # Month and day each come from either the compact or the dashed group,
        # falling back to 1 when the text stops before them.
        month=to_int(groups, "month", default=to_int(groups, "monthdash", required=False, default=1)),
        day=to_int(groups, "day", default=to_int(groups, "daydash", required=False, default=1)),
        hour=to_int(groups, "hour", default_to_zero=True),
        minute=to_int(groups, "minute", default_to_zero=True),
        second=to_int(groups, "second", default_to_zero=True),
        microsecond=microsecond,
        tzinfo=tz,
    ).astimezone(utc_tz if as_utc else local_tz)
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
@ -1,5 +1,6 @@
|
||||
#define UNICODE
|
||||
#include <Python.h>
|
||||
#include <datetime.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <fcntl.h>
|
||||
@ -353,9 +354,95 @@ clean_xml_chars(PyObject *self, PyObject *text) {
|
||||
return (PyObject*)ans;
|
||||
}
|
||||
|
||||
// Parse an ISO 8601 date string given as the single string argument.
//
// Returns a 3-tuple (datetime, aware, tzseconds):
//   datetime  - built from the parsed fields; no tzinfo is attached here
//   aware     - True if the string carried a 'Z' or a +/- offset, else False
//   tzseconds - signed UTC offset in seconds (0 for 'Z' or when no offset was given)
//
// Leading whitespace is skipped. A missing month or day defaults to 1 and a
// missing time defaults to midnight. Characters left over after the parsed
// portion are not examined. Raises ValueError when the four digit year or the
// two digit hour after a time separator is incomplete, or when the month is
// greater than 12; the remaining fields are not range checked in this function.
static PyObject *
speedup_iso_8601(PyObject *self, PyObject *args) {
    char *str = NULL, *c = NULL;
    // tzhour == 1000 is a sentinel meaning "no timezone seen in the input"
    int year = 0, month = 0, day = 0, hour = 0, minute = 0, second = 0, usecond = 0, i = 0, tzhour = 1000, tzminute = 0, tzsign = 0;

    if (!PyArg_ParseTuple(args, "s", &str)) return NULL;
    c = str;

    // Helper macros local to this function; they are #undef'd after the function body.
#define RAISE(msg) return PyErr_Format(PyExc_ValueError, "%s is not a valid ISO 8601 datestring: %s", str, msg);
#define CHAR_IS_DIGIT(c) (*(c) >= '0' && *(c) <= '9')
    // Accumulate up to max_digits decimal digits into x, advancing c. On the
    // first non-digit run `abort` (either `break` out of the loop or RAISE).
    // Afterwards i holds the number of digits consumed.
#define READ_DECIMAL_NUMBER(max_digits, x, abort) \
    for (i = 0; i < max_digits; i++) { \
        if (CHAR_IS_DIGIT(c)) x = 10 * x + *c++ - '0'; \
        else { abort; } \
    }
#define OPTIONAL_SEPARATOR(x) if(*c == (x)) c++;

    // Ignore leading whitespace
    while(*c == ' ' || *c == '\n' || *c == '\r' || *c == '\t' || *c == '\v' || *c == '\f') c++;

    // Year
    READ_DECIMAL_NUMBER(4, year, RAISE("No year specified"));
    OPTIONAL_SEPARATOR('-');
    // Month (optional)
    READ_DECIMAL_NUMBER(2, month, break);
    if (month == 0) month = 1; // YYYY format
    else {
        OPTIONAL_SEPARATOR('-');

        // Day (optional)
        READ_DECIMAL_NUMBER(2, day, break);
    }
    if (day == 0) day = 1; // YYYY-MM format
    if (month > 12) RAISE("month greater than 12");

    if (*c == 'T' || *c == ' ') // Time separator
    {
        c++;

        // Hour
        READ_DECIMAL_NUMBER(2, hour, RAISE("No hour specified"));
        OPTIONAL_SEPARATOR(':');
        // Minute (optional)
        READ_DECIMAL_NUMBER(2, minute, break);
        OPTIONAL_SEPARATOR(':');
        // Second (optional)
        READ_DECIMAL_NUMBER(2, second, break);

        if (*c == '.' || *c == ',') // separator for microseconds
        {
            c++;
            // Parse fraction of second up to 6 places
            READ_DECIMAL_NUMBER(6, usecond, break);
            // Omit excessive digits
            while (CHAR_IS_DIGIT(c)) c++;
            // If we break early, fully expand the usecond
            // (i is the count of fraction digits actually read above)
            while (i++ < 6) usecond *= 10;
        }
    }

    // Timezone designator: 'Z' for UTC, or a sign introducing an hh[:mm] offset
    switch(*c) {
        case 'Z':
            tzhour = 0; c++; break;
        case '+':
            tzsign = 1; c++; break;
        case '-':
            tzsign = -1; c++; break;
        default:
            break;
    }

    if (tzsign != 0) {
        tzhour = 0;
        READ_DECIMAL_NUMBER(2, tzhour, break);
        OPTIONAL_SEPARATOR(':');
        READ_DECIMAL_NUMBER(2, tzminute, break);
    }

    // "N" hands the new datetime reference over to the tuple; "O" takes its own
    // reference to the shared True/False object.
    return Py_BuildValue("NOi", PyDateTime_FromDateAndTime(year, month, day, hour, minute, second, usecond), (tzhour == 1000) ? Py_False : Py_True, tzsign*60*(tzhour*60 + tzminute));
}
// Keep the helper macros from leaking into the rest of the translation unit
#undef RAISE
#undef CHAR_IS_DIGIT
#undef READ_DECIMAL_NUMBER
#undef OPTIONAL_SEPARATOR
|
||||
|
||||
|
||||
static PyMethodDef speedup_methods[] = {
|
||||
{"parse_date", speedup_parse_date, METH_VARARGS,
|
||||
"parse_date()\n\nParse ISO dates faster."
|
||||
"parse_date()\n\nParse ISO dates faster (specialized for dates stored in the calibre db)."
|
||||
},
|
||||
|
||||
{"parse_iso8601", speedup_iso_8601, METH_VARARGS,
|
||||
"parse_iso8601(datestring)\n\nParse ISO 8601 dates faster. More spec compliant than parse_date()"
|
||||
},
|
||||
|
||||
{"pdf_float", speedup_pdf_float, METH_VARARGS,
|
||||
@ -403,6 +490,7 @@ initspeedup(void) {
|
||||
"Implementation of methods in C for speed."
|
||||
);
|
||||
if (m == NULL) return;
|
||||
PyDateTime_IMPORT;
|
||||
#ifdef O_CLOEXEC
|
||||
PyModule_AddIntConstant(m, "O_CLOEXEC", O_CLOEXEC);
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user