Add a C implementation of ISO 8601 parsing (speedup.parse_iso8601) and use it
from calibre.utils.iso8601.

iso8601.py: require the speedup plugin at import time; make
SafeLocalTimeZone._isdst return the parent class's result instead of
discarding it; define parse_iso8601 on top of speedup.parse_iso8601 when the
loaded binary provides it, and otherwise keep the regex based pure-Python
parser (re-indented under the else branch, logic unchanged).

speedup.c: add speedup_iso_8601, which returns a tuple of (datetime without
tzinfo, flag telling whether a timezone was given, UTC offset in seconds),
register it as parse_iso8601, and call PyDateTime_IMPORT during module
initialisation so the datetime C API is usable.

diff --git a/src/calibre/utils/iso8601.py b/src/calibre/utils/iso8601.py
index 1b36919dcf..87fba239ea 100644
--- a/src/calibre/utils/iso8601.py
+++ b/src/calibre/utils/iso8601.py
@@ -10,125 +10,148 @@ import re
 
 from dateutil.tz import tzlocal, tzutc, tzoffset
 
+from calibre.constants import plugins
+speedup, err = plugins['speedup']
+if not speedup:
+    raise RuntimeError(err)
+
 class SafeLocalTimeZone(tzlocal):
 
     def _isdst(self, dt):
         # This method in tzlocal raises ValueError if dt is out of range.
         # In such cases, just assume that dt is not DST.
         try:
-            tzlocal._isdst(self, dt)
+            return super(SafeLocalTimeZone, self)._isdst(dt)
         except Exception:
             pass
         return False
 
 utc_tz = tzutc()
 local_tz = SafeLocalTimeZone()
+del tzutc, tzlocal
 
-_iso_pat = None
+if hasattr(speedup, 'parse_iso8601'):  # For people running from source without updated binaries
+    def parse_iso8601(date_string, assume_utc=True, as_utc=True):
+        dt, aware, tzseconds = speedup.parse_iso8601(date_string)
+        tz = utc_tz if assume_utc else local_tz
+        if aware:  # timezone was specified
+            if tzseconds == 0:
+                tz = utc_tz
+            else:
+                sign = '-' if tzseconds < 0 else '+'
+                description = "%s%02d:%02d" % (sign, abs(tzseconds) // 3600, (abs(tzseconds) % 3600) // 60)
+                tz = tzoffset(description, tzseconds)
+        dt = dt.replace(tzinfo=tz)
+        if as_utc and tz is utc_tz:
+            return dt
+        return dt.astimezone(utc_tz if as_utc else local_tz)
+else:
+    # Pure python implementation
+    _iso_pat = None
 
-def iso_pat():
-    global _iso_pat
-    if _iso_pat is None:
-        _iso_pat = re.compile(
-            # Adapted from http://delete.me.uk/2005/03/iso8601.html
-            r"""
-            (?P<year>[0-9]{4})
-            (
-                (
-                    (-(?P<monthdash>[0-9]{1,2}))
-                    |
-                    (?P<month>[0-9]{2})
-                    (?!$)  # Don't allow YYYYMM
-                )
+    def iso_pat():
+        global _iso_pat
+        if _iso_pat is None:
+            _iso_pat = re.compile(
+                # Adapted from http://delete.me.uk/2005/03/iso8601.html
+                r"""
+                (?P<year>[0-9]{4})
                 (
                     (
-                        (-(?P<daydash>[0-9]{1,2}))
+                        (-(?P<monthdash>[0-9]{1,2}))
                         |
-                        (?P<day>[0-9]{2})
+                        (?P<month>[0-9]{2})
+                        (?!$)  # Don't allow YYYYMM
                     )
                     (
                         (
-                            (?P<separator>[ T])
-                            (?P<hour>[0-9]{2})
-                            (:{0,1}(?P<minute>[0-9]{2})){0,1}
+                            (-(?P<daydash>[0-9]{1,2}))
+                            |
+                            (?P<day>[0-9]{2})
+                        )
+                        (
                             (
-                                :{0,1}(?P<second>[0-9]{1,2})
-                                ([.,](?P<second_fraction>[0-9]+)){0,1}
-                            ){0,1}
-                            (?P<timezone>
-                                Z
-                                |
+                                (?P<separator>[ T])
+                                (?P<hour>[0-9]{2})
+                                (:{0,1}(?P<minute>[0-9]{2})){0,1}
                                 (
-                                    (?P<tz_sign>[-+])
-                                    (?P<tz_hour>[0-9]{2})
-                                    :{0,1}
-                                    (?P<tz_minute>[0-9]{2}){0,1}
-                                )
+                                    :{0,1}(?P<second>[0-9]{1,2})
+                                    ([.,](?P<second_fraction>[0-9]+)){0,1}
+                                ){0,1}
+                                (?P<timezone>
+                                    Z
+                                    |
+                                    (
+                                        (?P<tz_sign>[-+])
+                                        (?P<tz_hour>[0-9]{2})
+                                        :{0,1}
+                                        (?P<tz_minute>[0-9]{2}){0,1}
+                                    )
+                                ){0,1}
                             ){0,1}
-                        ){0,1}
-                    )
-                ){0,1}  # YYYY-MM
-            ){0,1}  # YYYY only
-            $
-            """, re.VERBOSE)
-    return _iso_pat
+                        )
+                    ){0,1}  # YYYY-MM
+                ){0,1}  # YYYY only
+                $
+                """, re.VERBOSE)
+        return _iso_pat
 
-def to_int(d, key, default_to_zero=False, default=None, required=True):
-    """Pull a value from the dict and convert to int
+    def to_int(d, key, default_to_zero=False, default=None, required=True):
+        """Pull a value from the dict and convert to int
 
-    :param default_to_zero: If the value is None or empty, treat it as zero
-    :param default: If the value is missing in the dict use this default
+        :param default_to_zero: If the value is None or empty, treat it as zero
+        :param default: If the value is missing in the dict use this default
 
-    """
-    value = d.get(key) or default
-    if (value is None or value == '') and default_to_zero:
-        return 0
-    if value is None:
-        if required:
-            raise ValueError("Unable to read %s from %s" % (key, d))
-    else:
-        return int(value)
+        """
+        value = d.get(key) or default
+        if (value is None or value == '') and default_to_zero:
+            return 0
+        if value is None:
+            if required:
+                raise ValueError("Unable to read %s from %s" % (key, d))
+        else:
+            return int(value)
 
-def parse_timezone(matches, default_timezone=utc_tz):
-    """Parses ISO 8601 time zone specs into tzinfo offsets
+    def parse_timezone(matches, default_timezone=utc_tz):
+        """Parses ISO 8601 time zone specs into tzinfo offsets
 
-    """
+        """
 
-    if matches["timezone"] == "Z":
-        return utc_tz
-    # This isn't strictly correct, but it's common to encounter dates without
-    # timezones so I'll assume the default (which defaults to UTC).
-    # Addresses issue 4.
-    if matches["timezone"] is None:
-        return default_timezone
-    sign = matches["tz_sign"]
-    hours = to_int(matches, "tz_hour")
-    minutes = to_int(matches, "tz_minute", default_to_zero=True)
-    description = "%s%02d:%02d" % (sign, hours, minutes)
-    if sign == "-":
-        hours = -hours
-        minutes = -minutes
-    return tzoffset(description, 3600*hours + 60*minutes)
+        if matches["timezone"] == "Z":
+            return utc_tz
+        # This isn't strictly correct, but it's common to encounter dates without
+        # timezones so I'll assume the default (which defaults to UTC).
+        # Addresses issue 4.
+        if matches["timezone"] is None:
+            return default_timezone
+        sign = matches["tz_sign"]
+        hours = to_int(matches, "tz_hour")
+        minutes = to_int(matches, "tz_minute", default_to_zero=True)
+        description = "%s%02d:%02d" % (sign, hours, minutes)
+        if sign == "-":
+            hours = -hours
+            minutes = -minutes
+        return tzoffset(description, 3600*hours + 60*minutes)
 
-def parse_iso8601(date_string, assume_utc=True, as_utc=True):
-    if isinstance(date_string, bytes):
-        date_string = date_string.decode('ascii')
-    m = iso_pat().match(date_string)
-    if m is None:
-        raise ValueError('%r is not a valid ISO8601 date' % date_string)
-    groups = m.groupdict()
-    tz = parse_timezone(groups, default_timezone=utc_tz if assume_utc else local_tz)
-    groups["second_fraction"] = int(Decimal("0.%s" % (groups["second_fraction"] or 0)) * Decimal("1000000.0"))
-    return datetime(
-        year=to_int(groups, "year"),
-        month=to_int(groups, "month", default=to_int(groups, "monthdash", required=False, default=1)),
-        day=to_int(groups, "day", default=to_int(groups, "daydash", required=False, default=1)),
-        hour=to_int(groups, "hour", default_to_zero=True),
-        minute=to_int(groups, "minute", default_to_zero=True),
-        second=to_int(groups, "second", default_to_zero=True),
-        microsecond=groups["second_fraction"],
-        tzinfo=tz,
-    ).astimezone(utc_tz if as_utc else local_tz)
+    def parse_iso8601(date_string, assume_utc=True, as_utc=True):
+        if isinstance(date_string, bytes):
+            date_string = date_string.decode('ascii')
+        m = iso_pat().match(date_string)
+        if m is None:
+            raise ValueError('%r is not a valid ISO8601 date' % date_string)
+        groups = m.groupdict()
+        tz = parse_timezone(groups, default_timezone=utc_tz if assume_utc else local_tz)
+        groups["second_fraction"] = int(Decimal("0.%s" % (groups["second_fraction"] or 0)) * Decimal("1000000.0"))
+        return datetime(
+            year=to_int(groups, "year"),
+            month=to_int(groups, "month", default=to_int(groups, "monthdash", required=False, default=1)),
+            day=to_int(groups, "day", default=to_int(groups, "daydash", required=False, default=1)),
+            hour=to_int(groups, "hour", default_to_zero=True),
+            minute=to_int(groups, "minute", default_to_zero=True),
+            second=to_int(groups, "second", default_to_zero=True),
+            microsecond=groups["second_fraction"],
+            tzinfo=tz,
+        ).astimezone(utc_tz if as_utc else local_tz)
 
 if __name__ == '__main__':
     import sys
diff --git a/src/calibre/utils/speedup.c b/src/calibre/utils/speedup.c
index d989cb9f0d..4e80141c6e 100644
--- a/src/calibre/utils/speedup.c
+++ b/src/calibre/utils/speedup.c
@@ -1,5 +1,6 @@
 #define UNICODE
 #include <Python.h>
+#include <datetime.h>
 
 #include <stdlib.h>
 #include <fcntl.h>
@@ -353,9 +354,95 @@ clean_xml_chars(PyObject *self, PyObject *text) {
     return (PyObject*)ans;
 }
 
+static PyObject *
+speedup_iso_8601(PyObject *self, PyObject *args) {
+    char *str = NULL, *c = NULL;
+    int year = 0, month = 0, day = 0, hour = 0, minute = 0, second = 0, usecond = 0, i = 0, tzhour = 1000, tzminute = 0, tzsign = 0;
+
+    if (!PyArg_ParseTuple(args, "s", &str)) return NULL;
+    c = str;
+
+#define RAISE(msg) return PyErr_Format(PyExc_ValueError, "%s is not a valid ISO 8601 datestring: %s", str, msg);
+#define CHAR_IS_DIGIT(c) (*c >= '0' && *c <= '9')
+#define READ_DECIMAL_NUMBER(max_digits, x, abort) \
+    for (i = 0; i < max_digits; i++) { \
+        if (CHAR_IS_DIGIT(c)) x = 10 * x + *c++ - '0'; \
+        else { abort; } \
+    }
+#define OPTIONAL_SEPARATOR(x) if(*c == x) c++;
+
+    // Ignore leading whitespace
+    while(*c == ' ' || *c == '\n' || *c == '\r' || *c == '\t' || *c == '\v' || *c == '\f') c++;
+
+    // Year
+    READ_DECIMAL_NUMBER(4, year, RAISE("No year specified"));
+    OPTIONAL_SEPARATOR('-');
+    // Month (optional)
+    READ_DECIMAL_NUMBER(2, month, break);
+    if (month == 0) month = 1; // YYYY format
+    else {
+        OPTIONAL_SEPARATOR('-');
+
+        // Day (optional)
+        READ_DECIMAL_NUMBER(2, day, break);
+    }
+    if (day == 0) day = 1; // YYYY-MM format
+    if (month > 12) RAISE("month greater than 12");
+
+    if (*c == 'T' || *c == ' ') // Time separator
+    {
+        c++;
+
+        // Hour
+        READ_DECIMAL_NUMBER(2, hour, RAISE("No hour specified"));
+        OPTIONAL_SEPARATOR(':');
+        // Minute (optional)
+        READ_DECIMAL_NUMBER(2, minute, break);
+        OPTIONAL_SEPARATOR(':');
+        // Second (optional)
+        READ_DECIMAL_NUMBER(2, second, break);
+
+        if (*c == '.' || *c == ',') // separator for microseconds
+        {
+            c++;
+            // Parse fraction of second up to 6 places
+            READ_DECIMAL_NUMBER(6, usecond, break);
+            // Omit excessive digits
+            while (CHAR_IS_DIGIT(c)) c++;
+            // If we break early, fully expand the usecond
+            while (i++ < 6) usecond *= 10;
+        }
+    }
+
+    switch(*c) {
+        case 'Z':
+            tzhour = 0; c++; break;
+        case '+':
+            tzsign = 1; c++; break;
+        case '-':
+            tzsign = -1; c++; break;
+        default:
+            break;
+    }
+
+    if (tzsign != 0) {
+        tzhour = 0;
+        READ_DECIMAL_NUMBER(2, tzhour, break);
+        OPTIONAL_SEPARATOR(':');
+        READ_DECIMAL_NUMBER(2, tzminute, break);
+    }
+
+    return Py_BuildValue("NOi", PyDateTime_FromDateAndTime(year, month, day, hour, minute, second, usecond), (tzhour == 1000) ? Py_False : Py_True, tzsign*60*(tzhour*60 + tzminute));
+}
+
+
 static PyMethodDef speedup_methods[] = {
     {"parse_date", speedup_parse_date, METH_VARARGS,
-        "parse_date()\n\nParse ISO dates faster."
+        "parse_date()\n\nParse ISO dates faster (specialized for dates stored in the calibre db)."
+    },
+
+    {"parse_iso8601", speedup_iso_8601, METH_VARARGS,
+        "parse_iso8601(datestring)\n\nParse ISO 8601 dates faster. More spec compliant than parse_date()"
     },
 
     {"pdf_float", speedup_pdf_float, METH_VARARGS,
@@ -403,6 +490,7 @@ initspeedup(void) {
             "Implementation of methods in C for speed."
     );
     if (m == NULL) return;
+    PyDateTime_IMPORT;
 #ifdef O_CLOEXEC
     PyModule_AddIntConstant(m, "O_CLOEXEC", O_CLOEXEC);
 #endif