# coding: utf8
"""
    tinycss.tokenizer
    -----------------

    Tokenizer for the CSS core syntax:
    http://www.w3.org/TR/CSS21/syndata.html#tokenization

    This is the pure-python implementation. See also speedups.pyx

    :copyright: (c) 2012 by Simon Sapin.
    :license: BSD, see LICENSE for more details.
"""

from __future__ import unicode_literals

from tinycss import token_data


def tokenize_flat(css_source, ignore_comments=True,
                  # Make these local variables to avoid global lookups in the loop
                  tokens_dispatch=token_data.TOKEN_DISPATCH,
                  unicode_unescape=token_data.UNICODE_UNESCAPE,
                  newline_unescape=token_data.NEWLINE_UNESCAPE,
                  simple_unescape=token_data.SIMPLE_UNESCAPE,
                  find_newlines=token_data.FIND_NEWLINES,
                  Token=token_data.Token,
                  len=len,
                  int=int,
                  float=float,
                  list=list,
                  _None=None,
                  ):
"""
|
||
:param css_source:
|
||
CSS as an unicode string
|
||
:param ignore_comments:
|
||
if true (the default) comments will not be included in the
|
||
return value
|
||
:return:
|
||
An iterator of :class:`Token`
|
||
|
||
"""
|
||
|
||
    pos = 0
    line = 1
    column = 1
    source_len = len(css_source)
    tokens = []
    while pos < source_len:
        char = css_source[pos]
        if char in ':;{}()[]':
            type_ = char
            css_value = char
        else:
            codepoint = min(ord(char), 160)
            for _index, type_, regexp in tokens_dispatch[codepoint]:
                match = regexp(css_source, pos)
                if match is not None:
                    # First match is the longest. See comments on TOKENS in token_data.
                    css_value = match.group()
                    break
            else:
                # No match.
                # "Any other character not matched by the above rules,
                #  and neither a single nor a double quote."
                # ... but quotes at the start of a token are always matched
                # by STRING or BAD_STRING. So DELIM is any single character.
                type_ = 'DELIM'
                css_value = char
        length = len(css_value)
        next_pos = pos + length

        # A BAD_COMMENT is a comment at EOF. Ignore it too.
        if not (ignore_comments and type_ in ('COMMENT', 'BAD_COMMENT')):
            # Parse numbers, extract strings and URIs, unescape
            unit = _None
            if type_ == 'DIMENSION':
                value = match.group(1)
                value = float(value) if '.' in value else int(value)
                unit = match.group(2)
                unit = simple_unescape(unit)
                unit = unicode_unescape(unit)
                unit = unit.lower()  # normalize
            elif type_ == 'PERCENTAGE':
                value = css_value[:-1]
                value = float(value) if '.' in value else int(value)
                unit = '%'
            elif type_ == 'NUMBER':
                value = css_value
                if '.' in value:
                    value = float(value)
                else:
                    value = int(value)
                    type_ = 'INTEGER'
            elif type_ in ('IDENT', 'ATKEYWORD', 'HASH', 'FUNCTION'):
                value = simple_unescape(css_value)
                value = unicode_unescape(value)
            elif type_ == 'URI':
                value = match.group(1)
                if value and value[0] in '"\'':
                    value = value[1:-1]  # Remove quotes
                    value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            elif type_ == 'STRING':
                value = css_value[1:-1]  # Remove quotes
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            # BAD_STRING can only be one of:
            # * Unclosed string at the end of the stylesheet:
            #   Close the string, but this is not an error.
            #   Make it a "good" STRING token.
            # * Unclosed string at the (unescaped) end of the line:
            #   Close the string, but this is an error.
            #   Leave it as a BAD_STRING, don’t bother parsing it.
            # See http://www.w3.org/TR/CSS21/syndata.html#parsing-errors
            elif type_ == 'BAD_STRING' and next_pos == source_len:
                type_ = 'STRING'
                value = css_value[1:]  # Remove quote
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            else:
                value = css_value
            tokens.append(Token(type_, css_value, value, unit, line, column))

        pos = next_pos
        newlines = list(find_newlines(css_value))
        if newlines:
            line += len(newlines)
            # Add 1 to have lines start at column 1, not 0
            column = length - newlines[-1].end() + 1
        else:
            column += length
    return tokens
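
# Usage sketch for tokenize_flat (illustrative; token types follow the CSS 2.1
# core grammar, and whitespace tokens have type 'S'):
#
#     >>> tokens = tokenize_flat('p { margin: 2em }')
#     >>> [(t.type, t.value, t.unit) for t in tokens if t.type != 'S']
#     [('IDENT', 'p', None), ('{', '{', None), ('IDENT', 'margin', None),
#      (':', ':', None), ('DIMENSION', 2, 'em'), ('}', '}', None)]
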

def regroup(tokens):
    """
    Match pairs of tokens: () [] {} function()
    (Strings in "" or '' are taken care of by the tokenizer.)

    Opening tokens are replaced by a :class:`ContainerToken`.
    Closing tokens are removed. Unmatched closing tokens are invalid
    but left as-is. All nested structures that are still open at
    the end of the stylesheet are implicitly closed.

    :param tokens:
        a *flat* iterable of tokens, as returned by :func:`tokenize_flat`.
    :return:
        A tree of tokens.

    """
# "global" objects for the inner recursion
|
||
pairs = {'FUNCTION': ')', '(': ')', '[': ']', '{': '}'}
|
||
tokens = iter(tokens)
|
||
eof = [False]
|
||
|
||
    def _regroup_inner(stop_at=None,
                       tokens=tokens, pairs=pairs, eof=eof,
                       ContainerToken=token_data.ContainerToken,
                       FunctionToken=token_data.FunctionToken):
        for token in tokens:
            type_ = token.type
            if type_ == stop_at:
                return

            end = pairs.get(type_)
            if end is None:
                yield token  # Not a grouping token
            else:
                assert not isinstance(token, ContainerToken), (
                    'Token looks already grouped: {0}'.format(token))
                content = list(_regroup_inner(end))
                if eof[0]:
                    end = ''  # Implicit end of structure at EOF.
                if type_ == 'FUNCTION':
                    yield FunctionToken(token.type, token.as_css(), end,
                                        token.value, content,
                                        token.line, token.column)
                else:
                    yield ContainerToken(token.type, token.as_css(), end,
                                         content,
                                         token.line, token.column)
        else:
            eof[0] = True  # end of file/stylesheet
    return _regroup_inner()
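
# Usage sketch for regroup (illustrative; assumes ContainerToken exposes a
# .content list, per token_data):
#
#     >>> tree = list(regroup(tokenize_flat('a[href] { color: red }')))
#     >>> [t.type for t in tree if t.type != 'S']
#     ['IDENT', '[', '{']
#     >>> # The '[' and '{' entries are ContainerTokens; the tokens between
#     >>> # each matching pair live in their .content attribute.
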

def tokenize_grouped(css_source, ignore_comments=True):
    """
    :param css_source:
        CSS as a unicode string
    :param ignore_comments:
        if true (the default), comments will not be included in the
        return value
    :return:
        An iterator of :class:`Token`

    """
    return regroup(tokenize_flat(css_source, ignore_comments))


# Optional Cython version of tokenize_flat
# Make both versions available with explicit names for tests.
python_tokenize_flat = tokenize_flat

try:
    tok = token_data.load_c_tokenizer()
except (ImportError, RuntimeError):
    c_tokenize_flat = None
else:
    # Use the C tokenizer by default
    c_tokenize_flat = tokenize_flat = lambda s, ignore_comments=True: tok.tokenize_flat(s, ignore_comments)
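
# A small smoke test (a sketch; assumes the C tokenizer's tokens expose the
# same .type/.value/.unit attributes as token_data.Token): run this module
# directly to check that the optional C tokenizer, when it loaded, agrees
# with the pure-python implementation on a small, arbitrary stylesheet.
if __name__ == '__main__':
    sample = 'a:hover { color: #f00; width: 50% }'
    expected = [(t.type, t.value, t.unit) for t in python_tokenize_flat(sample)]
    if c_tokenize_flat is not None:
        got = [(t.type, t.value, t.unit)
               for t in c_tokenize_flat(sample, ignore_comments=True)]
        assert got == expected, 'C and python tokenizers disagree'
    for type_, value, unit in expected:
        print('%s %r %s' % (type_, value, unit))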