# coding: utf8
"""
    tinycss.decoding
    ----------------

    Decoding stylesheets from bytes to Unicode.
    http://www.w3.org/TR/CSS21/syndata.html#charset

    :copyright: (c) 2012 by Simon Sapin.
    :license: BSD, see LICENSE for more details.
"""


import operator
import re

from polyglot.binary import from_hex_bytes


__all__ = ['decode']  # Everything else is implementation detail


def decode(css_bytes, protocol_encoding=None,
           linking_encoding=None, document_encoding=None):
    """
    Determine the character encoding from the passed metadata and the
    ``@charset`` rule in the stylesheet (if any), and decode accordingly.
    If no encoding information is available or decoding fails,
    decoding defaults to UTF-8 and then falls back on ISO-8859-1.

    :param css_bytes:
        a CSS stylesheet as a byte string
    :param protocol_encoding:
        The "charset" parameter of a "Content-Type" HTTP header (if any),
        or similar metadata for other protocols.
    :param linking_encoding:
        ``<link charset="">`` or other metadata from the linking mechanism
        (if any)
    :param document_encoding:
        Encoding of the referring stylesheet or document (if any)
    :return:
        A tuple of a Unicode string, with any BOM removed, and the
        encoding that was used.

    """
    if protocol_encoding:
        css_unicode = try_encoding(css_bytes, protocol_encoding)
        if css_unicode is not None:
            return css_unicode, protocol_encoding
    for encoding, pattern in ENCODING_MAGIC_NUMBERS:
        match = pattern(css_bytes)
        if match:
            has_at_charset = isinstance(encoding, tuple)
            if has_at_charset:
                extract, endianness = encoding
                encoding = extract(match.group(1))
                # Get an ASCII-only unicode value.
                # This is the only thing that works on both Python 2 and 3
                # for bytes.decode()
                # Non-ASCII encoding names are invalid anyway,
                # but make sure they stay invalid.
                encoding = encoding.decode('ascii', 'replace')
                encoding = encoding.replace('\ufffd', '?')
                if encoding.replace('-', '').replace('_', '').lower() in [
                        'utf16', 'utf32']:
                    encoding += endianness
                encoding = encoding.encode('ascii', 'replace').decode('ascii')
            css_unicode = try_encoding(css_bytes, encoding)
            if css_unicode and not (has_at_charset and not
                                    css_unicode.startswith('@charset "')):
                return css_unicode, encoding
            break
    for encoding in [linking_encoding, document_encoding]:
        if encoding:
            css_unicode = try_encoding(css_bytes, encoding)
            if css_unicode is not None:
                return css_unicode, encoding
    css_unicode = try_encoding(css_bytes, 'UTF-8')
    if css_unicode is not None:
        return css_unicode, 'UTF-8'
    return try_encoding(css_bytes, 'ISO-8859-1', fallback=False), 'ISO-8859-1'
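

# Illustrative use of decode() (a sketch, not executed at import time; the
# byte strings below are made-up examples rather than tinycss fixtures):
#
#   css, enc = decode(b'@charset "ISO-8859-1"; a{color:r\xe9d}')
#   # enc == 'ISO-8859-1': the @charset rule in the bytes is honoured.
#
#   css, enc = decode(b'a{}', protocol_encoding='UTF-8')
#   # enc == 'UTF-8': protocol metadata wins when it decodes successfully.
#
#   css, enc = decode(b'a{content:"\xe9"}')
#   # enc == 'ISO-8859-1': invalid as UTF-8, so the Latin-1 fallback is used.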


def try_encoding(css_bytes, encoding, fallback=True):
    if fallback:
        try:
            css_unicode = css_bytes.decode(encoding)
        # LookupError means unknown encoding
        except (UnicodeDecodeError, LookupError):
            return None
    else:
        css_unicode = css_bytes.decode(encoding)
    if css_unicode and css_unicode[0] == '\ufeff':
        # Remove any Byte Order Mark
        css_unicode = css_unicode[1:]
    return css_unicode
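
# For instance (illustrative values only; 'no-such-codec' is a deliberately
# invalid name):
#   try_encoding(b'\xff\xfea\x00', 'utf-16-le') == 'a'    # BOM stripped
#   try_encoding(b'\xc3', 'utf-8') is None                # undecodable bytes
#   try_encoding(b'a{}', 'no-such-codec') is None         # unknown encoding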


def hex2re(hex_data):
    return re.escape(from_hex_bytes(hex_data.replace(' ', '').encode('ascii')))


class Slicer:
    """Slice[start:stop:step] == operator.itemgetter(slice(start, stop, step))"""
    def __getitem__(self, slice_):
        return operator.itemgetter(slice_)


Slice = Slicer()
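
# For instance (illustrative values only):
#   hex2re('40 63 68 61 72 73 65 74')     # == re.escape(b'@charset')
#   Slice[1::2](b'\x00u\x00t\x00f')       # == b'utf' (drop UTF-16-BE zero bytes)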


# List of (encoding, pattern) pairs; pattern is a compiled regex's .match.
#   encoding is an encoding name, or (extract, endianness) for "as specified"
#   by an @charset rule; extract is an operator.itemgetter built from a slice,
#   used to pull the specified encoding name out of the matched bytes.

ENCODING_MAGIC_NUMBERS = [
    ((Slice[:], ''), re.compile(
        hex2re('EF BB BF 40 63 68 61 72 73 65 74 20 22')
        + b'([^\x22]*?)'
        + hex2re('22 3B')).match),

    ('UTF-8', re.compile(
        hex2re('EF BB BF')).match),

    ((Slice[:], ''), re.compile(
        hex2re('40 63 68 61 72 73 65 74 20 22')
        + b'([^\x22]*?)'
        + hex2re('22 3B')).match),

    ((Slice[1::2], '-BE'), re.compile(
        hex2re('FE FF 00 40 00 63 00 68 00 61 00 72 00 73 00 65 00'
               '74 00 20 00 22')
        + b'((\x00[^\x22])*?)'
        + hex2re('00 22 00 3B')).match),

    ((Slice[1::2], '-BE'), re.compile(
        hex2re('00 40 00 63 00 68 00 61 00 72 00 73 00 65 00 74 00'
               '20 00 22')
        + b'((\x00[^\x22])*?)'
        + hex2re('00 22 00 3B')).match),

    ((Slice[::2], '-LE'), re.compile(
        hex2re('FF FE 40 00 63 00 68 00 61 00 72 00 73 00 65 00 74'
               '00 20 00 22 00')
        + b'(([^\x22]\x00)*?)'
        + hex2re('22 00 3B 00')).match),

    ((Slice[::2], '-LE'), re.compile(
        hex2re('40 00 63 00 68 00 61 00 72 00 73 00 65 00 74 00 20'
               '00 22 00')
        + b'(([^\x22]\x00)*?)'
        + hex2re('22 00 3B 00')).match),

    ((Slice[3::4], '-BE'), re.compile(
        hex2re('00 00 FE FF 00 00 00 40 00 00 00 63 00 00 00 68 00'
               '00 00 61 00 00 00 72 00 00 00 73 00 00 00 65 00 00'
               '00 74 00 00 00 20 00 00 00 22')
        + b'((\x00\x00\x00[^\x22])*?)'
        + hex2re('00 00 00 22 00 00 00 3B')).match),

    ((Slice[3::4], '-BE'), re.compile(
        hex2re('00 00 00 40 00 00 00 63 00 00 00 68 00 00 00 61 00'
               '00 00 72 00 00 00 73 00 00 00 65 00 00 00 74 00 00'
               '00 20 00 00 00 22')
        + b'((\x00\x00\x00[^\x22])*?)'
        + hex2re('00 00 00 22 00 00 00 3B')).match),

    # Python does not support 2143 or 3412 endianness, AFAIK.
    # I guess we could fix it up ourselves but meh. Patches welcome.

    # ((Slice[2::4], '-2143'), re.compile(
    #     hex2re('00 00 FF FE 00 00 40 00 00 00 63 00 00 00 68 00 00'
    #            '00 61 00 00 00 72 00 00 00 73 00 00 00 65 00 00 00'
    #            '74 00 00 00 20 00 00 00 22 00')
    #     + b'((\x00\x00[^\x22]\x00)*?)'
    #     + hex2re('00 00 22 00 00 00 3B 00')).match),

    # ((Slice[2::4], '-2143'), re.compile(
    #     hex2re('00 00 40 00 00 00 63 00 00 00 68 00 00 00 61 00 00'
    #            '00 72 00 00 00 73 00 00 00 65 00 00 00 74 00 00 00'
    #            '20 00 00 00 22 00')
    #     + b'((\x00\x00[^\x22]\x00)*?)'
    #     + hex2re('00 00 22 00 00 00 3B 00')).match),

    # ((Slice[1::4], '-3412'), re.compile(
    #     hex2re('FE FF 00 00 00 40 00 00 00 63 00 00 00 68 00 00 00'
    #            '61 00 00 00 72 00 00 00 73 00 00 00 65 00 00 00 74'
    #            '00 00 00 20 00 00 00 22 00 00')
    #     + b'((\x00[^\x22]\x00\x00)*?)'
    #     + hex2re('00 22 00 00 00 3B 00 00')).match),

    # ((Slice[1::4], '-3412'), re.compile(
    #     hex2re('00 40 00 00 00 63 00 00 00 68 00 00 00 61 00 00 00'
    #            '72 00 00 00 73 00 00 00 65 00 00 00 74 00 00 00 20'
    #            '00 00 00 22 00 00')
    #     + b'((\x00[^\x22]\x00\x00)*?)'
    #     + hex2re('00 22 00 00 00 3B 00 00')).match),

    ((Slice[::4], '-LE'), re.compile(
        hex2re('FF FE 00 00 40 00 00 00 63 00 00 00 68 00 00 00 61'
               '00 00 00 72 00 00 00 73 00 00 00 65 00 00 00 74 00'
               '00 00 20 00 00 00 22 00 00 00')
        + b'(([^\x22]\x00\x00\x00)*?)'
        + hex2re('22 00 00 00 3B 00 00 00')).match),

    ((Slice[::4], '-LE'), re.compile(
        hex2re('40 00 00 00 63 00 00 00 68 00 00 00 61 00 00 00 72'
               '00 00 00 73 00 00 00 65 00 00 00 74 00 00 00 20 00'
               '00 00 22 00 00 00')
        + b'(([^\x22]\x00\x00\x00)*?)'
        + hex2re('22 00 00 00 3B 00 00 00')).match),

    ('UTF-32-BE', re.compile(
        hex2re('00 00 FE FF')).match),

    ('UTF-32-LE', re.compile(
        hex2re('FF FE 00 00')).match),

    # ('UTF-32-2143', re.compile(
    #     hex2re('00 00 FF FE')).match),

    # ('UTF-32-3412', re.compile(
    #     hex2re('FE FF 00 00')).match),

    ('UTF-16-BE', re.compile(
        hex2re('FE FF')).match),

    ('UTF-16-LE', re.compile(
        hex2re('FF FE')).match),


    # Some of these are supported by Python, but I didn't bother.
    # You know the story with patches ...

    # # as specified, transcoded from EBCDIC to ASCII
    # ('as_specified-EBCDIC', re.compile(
    #     hex2re('7C 83 88 81 99 A2 85 A3 40 7F')
    #     + b'([^\x7F]*?)'
    #     + hex2re('7F 5E')).match),

    # # as specified, transcoded from IBM1026 to ASCII
    # ('as_specified-IBM1026', re.compile(
    #     hex2re('AE 83 88 81 99 A2 85 A3 40 FC')
    #     + b'([^\xFC]*?)'
    #     + hex2re('FC 5E')).match),

    # # as specified, transcoded from GSM 03.38 to ASCII
    # ('as_specified-GSM_03.38', re.compile(
    #     hex2re('00 63 68 61 72 73 65 74 20 22')
    #     + b'([^\x22]*?)'
    #     + hex2re('22 3B')).match),
]
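

# A minimal, illustrative self-check (not part of the tinycss API): it
# exercises the BOM-prefixed UTF-16-LE ``@charset`` magic number above.
# It only runs when the module is executed directly, and it assumes
# ``polyglot`` is importable, i.e. that it is run from the calibre tree.
if __name__ == '__main__':
    # '@charset "utf-16"' encoded as UTF-16-LE, preceded by a BOM.
    source = b'\xff\xfe' + '@charset "utf-16"; a{color:red}'.encode('utf-16-le')
    css, used_encoding = decode(source)
    # Expected output: utf-16-LE '@charset "utf-16"; a{color:red}'
    print(used_encoding, repr(css))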