ebook-converter/ebook_converter/css_selectors/parser.py

#!/usr/bin/env python2
# vim:fileencoding=utf-8
"""
    Tokenizer, parser and parsed objects for CSS selectors.

    :copyright: (c) 2007-2012 Ian Bicking and contributors.
                See AUTHORS for more details.
    :license: BSD, see LICENSE for more details.

"""

import sys
import re
import operator
import string

from ebook_converter.css_selectors.errors import SelectorSyntaxError, ExpressionError
from ebook_converter.polyglot.builtins import unicode_type, codepoint_to_chr, range


utab = {c:c+32 for c in range(ord(u'A'), ord(u'Z')+1)}

if sys.version_info.major < 3:
    tab = string.maketrans(string.ascii_uppercase, string.ascii_lowercase)

    def ascii_lower(string):
        """Lower-case, but only in the ASCII range."""
        return string.translate(utab if isinstance(string, unicode_type) else tab)

    def urepr(x):
        if isinstance(x, list):
            return '[%s]' % ', '.join((map(urepr, x)))
        ans = repr(x)
        if ans.startswith("u'") or ans.startswith('u"'):
            ans = ans[1:]
        return ans


else:

    def ascii_lower(x):
        return x.translate(utab)

    urepr = repr


# Parsed objects

class Selector(object):

    """
    Represents a parsed selector.
    """

    def __init__(self, tree, pseudo_element=None):
        self.parsed_tree = tree
        if pseudo_element is not None and not isinstance(
                pseudo_element, FunctionalPseudoElement):
            pseudo_element = ascii_lower(pseudo_element)
        #: A :class:`FunctionalPseudoElement`,
        #: or the identifier for the pseudo-element as a string,
        #  or ``None``.
        #:
        #: +-------------------------+----------------+--------------------------------+
        #: |                         | Selector       | Pseudo-element                 |
        #: +=========================+================+================================+
        #: | CSS3 syntax             | ``a::before``  | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | Older syntax            | ``a:before``   | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'``                   |
        #: | not in Selectors3       |                |                                |
        #: +-------------------------+----------------+--------------------------------+
        #: | Invalid pseudo-class    | ``li:marker``  | ``None``                       |
        #: +-------------------------+----------------+--------------------------------+
        #: | Functinal               | ``a::foo(2)``  | ``FunctionalPseudoElement(…)`` |
        #: +-------------------------+----------------+--------------------------------+
        #:
        # : .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
        self.pseudo_element = pseudo_element

    def __repr__(self):
        if isinstance(self.pseudo_element, FunctionalPseudoElement):
            pseudo_element = repr(self.pseudo_element)
        if self.pseudo_element:
            pseudo_element = '::%s' % self.pseudo_element
        else:
            pseudo_element = ''
        return '%s[%r%s]' % (
            self.__class__.__name__, self.parsed_tree, pseudo_element)

    def specificity(self):
        """Return the specificity_ of this selector as a tuple of 3 integers.

        .. _specificity: http://www.w3.org/TR/selectors/#specificity

        """
        a, b, c = self.parsed_tree.specificity()
        if self.pseudo_element:
            c += 1
        return a, b, c


class Class(object):

    """
    Represents selector.class_name
    """
    def __init__(self, selector, class_name):
        self.selector = selector
        self.class_name = class_name

    def __repr__(self):
        return '%s[%r.%s]' % (
            self.__class__.__name__, self.selector, self.class_name)

    def specificity(self):
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class FunctionalPseudoElement(object):

    """
    Represents selector::name(arguments)

    .. attribute:: name

        The name (identifier) of the pseudo-element, as a string.

    .. attribute:: arguments

        The arguments of the pseudo-element, as a list of tokens.

        **Note:** tokens are not part of the public API,
        and may change between versions.
        Use at your own risks.

    """
    def __init__(self, name, arguments):
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self):
        return '%s[::%s(%s)]' % (
            self.__class__.__name__, self.name,
            urepr([token.value for token in self.arguments]))

    def argument_types(self):
        return [token.type for token in self.arguments]

    def specificity(self):
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Function(object):

    """
    Represents selector:name(expr)
    """
    def __init__(self, selector, name, arguments):
        self.selector = selector
        self.name = ascii_lower(name)
        self.arguments = arguments
        self._parsed_arguments = None

    def __repr__(self):
        return '%s[%r:%s(%s)]' % (
            self.__class__.__name__, self.selector, self.name,
            urepr([token.value for token in self.arguments]))

    def argument_types(self):
        return [token.type for token in self.arguments]

    @property
    def parsed_arguments(self):
        if self._parsed_arguments is None:
            try:
                self._parsed_arguments = parse_series(self.arguments)
            except ValueError:
                raise ExpressionError("Invalid series: '%r'" % self.arguments)
        return self._parsed_arguments

    def parse_arguments(self):
        if not self.arguments_parsed:
            self.arguments_parsed = True

    def specificity(self):
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Pseudo(object):

    """
    Represents selector:ident
    """
    def __init__(self, selector, ident):
        self.selector = selector
        self.ident = ascii_lower(ident)

    def __repr__(self):
        return '%s[%r:%s]' % (
            self.__class__.__name__, self.selector, self.ident)

    def specificity(self):
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Negation(object):

    """
    Represents selector:not(subselector)
    """
    def __init__(self, selector, subselector):
        self.selector = selector
        self.subselector = subselector

    def __repr__(self):
        return '%s[%r:not(%r)]' % (
            self.__class__.__name__, self.selector, self.subselector)

    def specificity(self):
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = self.subselector.specificity()
        return a1 + a2, b1 + b2, c1 + c2


class Attrib(object):

    """
    Represents selector[namespace|attrib operator value]
    """
    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        if self.namespace:
            attrib = '%s|%s' % (self.namespace, self.attrib)
        else:
            attrib = self.attrib
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                self.__class__.__name__, self.selector, attrib)
        else:
            return '%s[%r[%s %s %s]]' % (
                self.__class__.__name__, self.selector, attrib,
                self.operator, urepr(self.value))

    def specificity(self):
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Element(object):

    """
    Represents namespace|element

    `None` is for the universal selector '*'

    """
    def __init__(self, namespace=None, element=None):
        self.namespace = namespace
        self.element = element

    def __repr__(self):
        element = self.element or '*'
        if self.namespace:
            element = '%s|%s' % (self.namespace, element)
        return '%s[%s]' % (self.__class__.__name__, element)

    def specificity(self):
        if self.element:
            return 0, 0, 1
        else:
            return 0, 0, 0


class Hash(object):

    """
    Represents selector#id
    """
    def __init__(self, selector, id):
        self.selector = selector
        self.id = id

    def __repr__(self):
        return '%s[%r#%s]' % (
            self.__class__.__name__, self.selector, self.id)

    def specificity(self):
        a, b, c = self.selector.specificity()
        a += 1
        return a, b, c


class CombinedSelector(object):

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        if self.combinator == ' ':
            comb = '<followed>'
        else:
            comb = self.combinator
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, comb, self.subselector)

    def specificity(self):
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = self.subselector.specificity()
        return a1 + a2, b1 + b2, c1 + c2


# Parser

# foo
_el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$')

# foo#bar or #bar
_id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$')

# foo.bar or .bar
_class_re = re.compile(
    r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$')


def parse(css):
    """Parse a CSS *group of selectors*.

    :param css:
        A *group of selectors* as an Unicode string.
    :raises:
        :class:`SelectorSyntaxError` on invalid selectors.
    :returns:
        A list of parsed :class:`Selector` objects, one for each
        selector in the comma-separated group.

    """
    # Fast path for simple cases
    match = _el_re.match(css)
    if match:
        return [Selector(Element(element=match.group(1)))]
    match = _id_re.match(css)
    if match is not None:
        return [Selector(Hash(Element(element=match.group(1) or None),
                              match.group(2)))]
    match = _class_re.match(css)
    if match is not None:
        return [Selector(Class(Element(element=match.group(1) or None),
                               match.group(2)))]

    stream = TokenStream(tokenize(css))
    stream.source = css
    return list(parse_selector_group(stream))
#    except SelectorSyntaxError:
#        e = sys.exc_info()[1]
#        message = "%s at %s -> %r" % (
#            e, stream.used, stream.peek())
#        e.msg = message
#        e.args = tuple([message])
#        raise


def parse_selector_group(stream):
    stream.skip_whitespace()
    while 1:
        yield Selector(*parse_selector(stream))
        if stream.peek() == ('DELIM', ','):
            stream.next()
            stream.skip_whitespace()
        else:
            break


def parse_selector(stream):
    result, pseudo_element = parse_simple_selector(stream)
    while 1:
        stream.skip_whitespace()
        peek = stream.peek()
        if peek in (('EOF', None), ('DELIM', ',')):
            break
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        if peek.is_delim('+', '>', '~'):
            # A combinator
            combinator = stream.next().value
            stream.skip_whitespace()
        else:
            # By exclusion, the last parse_simple_selector() ended
            # at peek == ' '
            combinator = ' '
        next_selector, pseudo_element = parse_simple_selector(stream)
        result = CombinedSelector(result, combinator, next_selector)
    return result, pseudo_element


special_pseudo_elements = (
    'first-line', 'first-letter', 'before', 'after')


def parse_simple_selector(stream, inside_negation=False):
    stream.skip_whitespace()
    selector_start = len(stream.used)
    peek = stream.peek()
    if peek.type == 'IDENT' or peek == ('DELIM', '*'):
        if peek.type == 'IDENT':
            namespace = stream.next().value
        else:
            stream.next()
            namespace = None
        if stream.peek() == ('DELIM', '|'):
            stream.next()
            element = stream.next_ident_or_star()
        else:
            element = namespace
            namespace = None
    else:
        element = namespace = None
    result = Element(namespace, element)
    pseudo_element = None
    while 1:
        peek = stream.peek()
        if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or (
                inside_negation and peek == ('DELIM', ')')):
            break
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        if peek.type == 'HASH':
            result = Hash(result, stream.next().value)
        elif peek == ('DELIM', '.'):
            stream.next()
            result = Class(result, stream.next_ident())
        elif peek == ('DELIM', '['):
            stream.next()
            result = parse_attrib(result, stream)
        elif peek == ('DELIM', ':'):
            stream.next()
            if stream.peek() == ('DELIM', ':'):
                stream.next()
                pseudo_element = stream.next_ident()
                if stream.peek() == ('DELIM', '('):
                    stream.next()
                    pseudo_element = FunctionalPseudoElement(
                        pseudo_element, parse_arguments(stream))
                continue
            ident = stream.next_ident()
            if ident.lower() in special_pseudo_elements:
                # Special case: CSS 2.1 pseudo-elements can have a single ':'
                # Any new pseudo-element must have two.
                pseudo_element = unicode_type(ident)
                continue
            if stream.peek() != ('DELIM', '('):
                result = Pseudo(result, ident)
                continue
            stream.next()
            stream.skip_whitespace()
            if ident.lower() == 'not':
                if inside_negation:
                    raise SelectorSyntaxError('Got nested :not()')
                argument, argument_pseudo_element = parse_simple_selector(
                    stream, inside_negation=True)
                next = stream.next()
                if argument_pseudo_element:
                    raise SelectorSyntaxError(
                        'Got pseudo-element ::%s inside :not() at %s'
                        % (argument_pseudo_element, next.pos))
                if next != ('DELIM', ')'):
                    raise SelectorSyntaxError("Expected ')', got %s" % (next,))
                result = Negation(result, argument)
            else:
                result = Function(result, ident, parse_arguments(stream))
        else:
            raise SelectorSyntaxError(
                "Expected selector, got %s" % (peek,))
    if len(stream.used) == selector_start:
        raise SelectorSyntaxError(
            "Expected selector, got %s" % (stream.peek(),))
    return result, pseudo_element


def parse_arguments(stream):
    arguments = []
    while 1:
        stream.skip_whitespace()
        next = stream.next()
        if next.type in ('IDENT', 'STRING', 'NUMBER') or next in [
                ('DELIM', '+'), ('DELIM', '-')]:
            arguments.append(next)
        elif next == ('DELIM', ')'):
            return arguments
        else:
            raise SelectorSyntaxError(
                "Expected an argument, got %s" % (next,))


def parse_attrib(selector, stream):
    stream.skip_whitespace()
    attrib = stream.next_ident_or_star()
    if attrib is None and stream.peek() != ('DELIM', '|'):
        raise SelectorSyntaxError(
            "Expected '|', got %s" % (stream.peek(),))
    if stream.peek() == ('DELIM', '|'):
        stream.next()
        if stream.peek() == ('DELIM', '='):
            namespace = None
            stream.next()
            op = '|='
        else:
            namespace = attrib
            attrib = stream.next_ident()
            op = None
    else:
        namespace = op = None
    if op is None:
        stream.skip_whitespace()
        next = stream.next()
        if next == ('DELIM', ']'):
            return Attrib(selector, namespace, attrib, 'exists', None)
        elif next == ('DELIM', '='):
            op = '='
        elif next.is_delim('^', '$', '*', '~', '|', '!') and (
                stream.peek() == ('DELIM', '=')):
            op = next.value + '='
            stream.next()
        else:
            raise SelectorSyntaxError(
                "Operator expected, got %s" % (next,))
    stream.skip_whitespace()
    value = stream.next()
    if value.type not in ('IDENT', 'STRING'):
        raise SelectorSyntaxError(
            "Expected string or ident, got %s" % (value,))
    stream.skip_whitespace()
    next = stream.next()
    if next != ('DELIM', ']'):
        raise SelectorSyntaxError(
            "Expected ']', got %s" % (next,))
    return Attrib(selector, namespace, attrib, op, value.value)


def parse_series(tokens):
    """
    Parses the arguments for :nth-child() and friends.

    :raises: A list of tokens
    :returns: :``(a, b)``

    """
    for token in tokens:
        if token.type == 'STRING':
            raise ValueError('String tokens not allowed in series.')
    s = ''.join(token.value for token in tokens).strip()
    if s == 'odd':
        return (2, 1)
    elif s == 'even':
        return (2, 0)
    elif s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Just b
        return (0, int(s))
    a, b = s.split('n', 1)
    if not a:
        a = 1
    elif a == '-' or a == '+':
        a = int(a+'1')
    else:
        a = int(a)
    if not b:
        b = 0
    else:
        b = int(b)
    return (a, b)


# Token objects

class Token(tuple):

    def __new__(cls, type_, value, pos):
        obj = tuple.__new__(cls, (type_, value))
        obj.pos = pos
        return obj

    def __repr__(self):
        return "<%s '%s' at %i>" % (self.type, self.value, self.pos)

    def is_delim(self, *values):
        return self.type == 'DELIM' and self.value in values

    type = property(operator.itemgetter(0))
    value = property(operator.itemgetter(1))


class EOFToken(Token):

    def __new__(cls, pos):
        return Token.__new__(cls, 'EOF', None, pos)

    def __repr__(self):
        return '<%s at %i>' % (self.type, self.pos)


# Tokenizer


class TokenMacros:
    unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?'
    escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]'
    string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape
    nonascii = r'[^\0-\177]'
    nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii)
    nmstart = '[_a-z]|%s|%s' % (escape, nonascii)


def _compile(pattern):
    return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match


_match_whitespace = _compile(r'[ \t\r\n\f]+')
_match_number = _compile(r'[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)')
_match_hash = _compile('#(?:%(nmchar)s)+')
_match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*')
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}

_sub_simple_escape = re.compile(r'\\(.)').sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub
_sub_newline_escape =re.compile(r'\\(?:\n|\r\n|\r|\f)').sub

# Same as r'\1', but faster on CPython
if hasattr(operator, 'methodcaller'):
    # Python 2.6+
    _replace_simple = operator.methodcaller('group', 1)
else:
    def _replace_simple(match):
        return match.group(1)


def _replace_unicode(match):
    codepoint = int(match.group(1), 16)
    if codepoint > sys.maxunicode:
        codepoint = 0xFFFD
    return codepoint_to_chr(codepoint)


def unescape_ident(value):
    value = _sub_unicode_escape(_replace_unicode, value)
    value = _sub_simple_escape(_replace_simple, value)
    return value


def tokenize(s):
    pos = 0
    len_s = len(s)
    while pos < len_s:
        match = _match_whitespace(s, pos=pos)
        if match:
            yield Token('S', ' ', pos)
            pos = match.end()
            continue

        match = _match_ident(s, pos=pos)
        if match:
            value = _sub_simple_escape(_replace_simple,
                    _sub_unicode_escape(_replace_unicode, match.group()))
            yield Token('IDENT', value, pos)
            pos = match.end()
            continue

        match = _match_hash(s, pos=pos)
        if match:
            value = _sub_simple_escape(_replace_simple,
                    _sub_unicode_escape(_replace_unicode, match.group()[1:]))
            yield Token('HASH', value, pos)
            pos = match.end()
            continue

        quote = s[pos]
        if quote in _match_string_by_quote:
            match = _match_string_by_quote[quote](s, pos=pos + 1)
            assert match, 'Should have found at least an empty match'
            end_pos = match.end()
            if end_pos == len_s:
                raise SelectorSyntaxError('Unclosed string at %s' % pos)
            if s[end_pos] != quote:
                raise SelectorSyntaxError('Invalid string at %s' % pos)
            value = _sub_simple_escape(_replace_simple,
                    _sub_unicode_escape(_replace_unicode,
                    _sub_newline_escape('', match.group())))
            yield Token('STRING', value, pos)
            pos = end_pos + 1
            continue

        match = _match_number(s, pos=pos)
        if match:
            value = match.group()
            yield Token('NUMBER', value, pos)
            pos = match.end()
            continue

        pos2 = pos + 2
        if s[pos:pos2] == '/*':
            pos = s.find('*/', pos2)
            if pos == -1:
                pos = len_s
            else:
                pos += 2
            continue

        yield Token('DELIM', s[pos], pos)
        pos += 1

    assert pos == len_s
    yield EOFToken(pos)


class TokenStream(object):

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        try:
            self.next_token = self.tokens.next
        except AttributeError:
            # Python 3
            self.next_token = self.tokens.__next__

    def next(self):
        if self._peeking:
            self._peeking = False
            self.used.append(self.peeked)
            return self.peeked
        else:
            next = self.next_token()
            self.used.append(next)
            return next

    def peek(self):
        if not self._peeking:
            self.peeked = self.next_token()
            self._peeking = True
        return self.peeked

    def next_ident(self):
        next = self.next()
        if next.type != 'IDENT':
            raise SelectorSyntaxError('Expected ident, got %s' % (next,))
        return next.value

    def next_ident_or_star(self):
        next = self.next()
        if next.type == 'IDENT':
            return next.value
        elif next == ('DELIM', '*'):
            return None
        else:
            raise SelectorSyntaxError(
                "Expected ident or '*', got %s" % (next,))

    def skip_whitespace(self):
        peek = self.peek()
        if peek.type == 'S':
            self.next()