mirror of
https://github.com/gryf/ebook-converter.git
synced 2025-12-28 12:12:26 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
792 lines
23 KiB
Python
792 lines
23 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
"""
|
|
Tokenizer, parser and parsed objects for CSS selectors.
|
|
|
|
:copyright: (c) 2007-2012 Ian Bicking and contributors.
|
|
See AUTHORS for more details.
|
|
:license: BSD, see LICENSE for more details.
|
|
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
import operator
|
|
import string
|
|
|
|
from ebook_converter.css_selectors.errors import SelectorSyntaxError, ExpressionError
|
|
from ebook_converter.polyglot.builtins import unicode_type, codepoint_to_chr, range
|
|
|
|
|
|
# Translation table for ``translate()`` on text strings: maps the code point
# of every ASCII upper-case letter to the matching lower-case one.
utab = {
    ord(upper): ord(lower)
    for upper, lower in zip(string.ascii_uppercase, string.ascii_lowercase)
}

if sys.version_info.major >= 3:

    def ascii_lower(x):
        """Lower-case ``x``, but only in the ASCII range."""
        return x.translate(utab)

    # Python 3 reprs carry no ``u`` prefix, so plain repr() is enough.
    urepr = repr

else:
    # Byte-string counterpart of ``utab`` (Python 2 only).
    tab = string.maketrans(string.ascii_uppercase, string.ascii_lowercase)

    def ascii_lower(string):
        """Lower-case, but only in the ASCII range."""
        table = tab
        if isinstance(string, unicode_type):
            table = utab
        return string.translate(table)

    def urepr(x):
        """repr() without the ``u`` prefix, applied recursively to lists."""
        if isinstance(x, list):
            return '[%s]' % ', '.join(urepr(item) for item in x)
        text = repr(x)
        if text[:2] in ("u'", 'u"'):
            text = text[1:]
        return text
|
|
|
|
|
|
# Parsed objects
|
|
|
|
class Selector(object):

    """
    Represents a parsed selector.

    :param tree:
        The parsed selector tree; it must provide ``specificity()``.
    :param pseudo_element:
        ``None``, the pseudo-element identifier as a string (stored
        ASCII-lower-cased), or a :class:`FunctionalPseudoElement`.
    """

    def __init__(self, tree, pseudo_element=None):
        self.parsed_tree = tree
        if pseudo_element is not None and not isinstance(
                pseudo_element, FunctionalPseudoElement):
            pseudo_element = ascii_lower(pseudo_element)
        #: A :class:`FunctionalPseudoElement`,
        #: or the identifier for the pseudo-element as a string,
        #: or ``None``.
        #:
        #: +-------------------------+----------------+--------------------------------+
        #: |                         | Selector       | Pseudo-element                 |
        #: +=========================+================+================================+
        #: | CSS3 syntax             | ``a::before``  | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | Older syntax            | ``a:before``   | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'``                   |
        #: | not in Selectors3       |                |                                |
        #: +-------------------------+----------------+--------------------------------+
        #: | Invalid pseudo-class    | ``li:marker``  | ``None``                       |
        #: +-------------------------+----------------+--------------------------------+
        #: | Functional              | ``a::foo(2)``  | ``FunctionalPseudoElement(…)`` |
        #: +-------------------------+----------------+--------------------------------+
        #:
        #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
        self.pseudo_element = pseudo_element

    def __repr__(self):
        # The three cases are mutually exclusive: a functional pseudo-element
        # already renders its own ``::name(args)`` form, so it must not fall
        # through to the plain-string branch and get a second ``::`` prefix.
        if isinstance(self.pseudo_element, FunctionalPseudoElement):
            pseudo_element = repr(self.pseudo_element)
        elif self.pseudo_element:
            pseudo_element = '::%s' % self.pseudo_element
        else:
            pseudo_element = ''
        return '%s[%r%s]' % (
            self.__class__.__name__, self.parsed_tree, pseudo_element)

    def specificity(self):
        """Return the specificity_ of this selector as a tuple of 3 integers.

        A pseudo-element, when present, adds one to the last component.

        .. _specificity: http://www.w3.org/TR/selectors/#specificity

        """
        a, b, c = self.parsed_tree.specificity()
        if self.pseudo_element:
            c += 1
        return a, b, c
|
|
|
|
|
|
class Class(object):

    """
    Represents selector.class_name

    ``selector`` is the node the class test is attached to and
    ``class_name`` the class identifier, stored as given.
    """

    def __init__(self, selector, class_name):
        self.selector, self.class_name = selector, class_name

    def __repr__(self):
        parts = (type(self).__name__, self.selector, self.class_name)
        return '%s[%r.%s]' % parts

    def specificity(self):
        """Specificity of the wrapped selector, plus one in the middle slot."""
        ids, attributes, elements = self.selector.specificity()
        return ids, attributes + 1, elements
|
|
|
|
|
|
class FunctionalPseudoElement(object):

    """
    Represents selector::name(arguments)

    .. attribute:: name

        The name (identifier) of the pseudo-element, as a string.

    .. attribute:: arguments

        The arguments of the pseudo-element, as a list of tokens.

        **Note:** tokens are not part of the public API,
        and may change between versions.
        Use at your own risks.

    """

    def __init__(self, name, arguments):
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self):
        return '%s[::%s(%s)]' % (
            self.__class__.__name__, self.name,
            urepr([token.value for token in self.arguments]))

    def argument_types(self):
        """Return the ``type`` of every argument token, in order."""
        return [token.type for token in self.arguments]

    def specificity(self):
        """Return the specificity contribution as a tuple of 3 integers.

        Instances are created without a ``selector`` attribute, so reading
        ``self.selector`` unconditionally always raised AttributeError.
        A pseudo-element counts like an element in the last component
        (the same weight ``Selector.specificity`` adds for one), hence
        ``(0, 0, 1)`` when no selector is attached.
        """
        selector = getattr(self, 'selector', None)
        if selector is None:
            return 0, 0, 1
        # Legacy path, kept for callers that attach ``selector`` themselves.
        a, b, c = selector.specificity()
        b += 1
        return a, b, c
|
|
|
|
|
|
class Function(object):

    """
    Represents selector:name(expr)

    ``name`` is stored ASCII-lower-cased; ``arguments`` is the list of
    argument tokens as collected by the parser.
    """

    # Class-level default so that parse_arguments() can read the flag on any
    # instance; it was previously never initialised, which made
    # parse_arguments() raise AttributeError.
    arguments_parsed = False

    def __init__(self, selector, name, arguments):
        self.selector = selector
        self.name = ascii_lower(name)
        self.arguments = arguments
        # Cache for the ``parsed_arguments`` property.
        self._parsed_arguments = None

    def __repr__(self):
        return '%s[%r:%s(%s)]' % (
            self.__class__.__name__, self.selector, self.name,
            urepr([token.value for token in self.arguments]))

    def argument_types(self):
        """Return the ``type`` of every argument token, in order."""
        return [token.type for token in self.arguments]

    @property
    def parsed_arguments(self):
        """The arguments parsed as a series by ``parse_series()``, cached.

        :raises: ExpressionError when ``parse_series()`` raises ValueError.
        """
        if self._parsed_arguments is None:
            try:
                self._parsed_arguments = parse_series(self.arguments)
            except ValueError:
                # Wrap in a 1-tuple so a tuple of tokens is formatted as a
                # single value instead of being spread over the format.
                raise ExpressionError(
                    "Invalid series: '%r'" % (self.arguments,))
        return self._parsed_arguments

    def parse_arguments(self):
        """Mark the arguments as parsed; only sets ``arguments_parsed``."""
        if not self.arguments_parsed:
            self.arguments_parsed = True

    def specificity(self):
        """Specificity of the wrapped selector, plus one in the middle slot."""
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c
|
|
|
|
|
|
class Pseudo(object):

    """
    Represents selector:ident

    The identifier is stored ASCII-lower-cased.
    """

    def __init__(self, selector, ident):
        self.selector = selector
        self.ident = ascii_lower(ident)

    def __repr__(self):
        parts = (type(self).__name__, self.selector, self.ident)
        return '%s[%r:%s]' % parts

    def specificity(self):
        """Specificity of the wrapped selector, plus one in the middle slot."""
        ids, attributes, elements = self.selector.specificity()
        return ids, attributes + 1, elements
|
|
|
|
|
|
class Negation(object):

    """
    Represents selector:not(subselector)
    """

    def __init__(self, selector, subselector):
        self.selector, self.subselector = selector, subselector

    def __repr__(self):
        parts = (type(self).__name__, self.selector, self.subselector)
        return '%s[%r:not(%r)]' % parts

    def specificity(self):
        """Component-wise sum of both operands' specificities."""
        outer_a, outer_b, outer_c = self.selector.specificity()
        inner_a, inner_b, inner_c = self.subselector.specificity()
        return outer_a + inner_a, outer_b + inner_b, outer_c + inner_c
|
|
|
|
|
|
class Attrib(object):

    """
    Represents selector[namespace|attrib operator value]

    ``operator`` is the string ``'exists'`` for a bare ``[attrib]`` test,
    in which case ``value`` is not shown by ``repr()``.
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector, self.namespace = selector, namespace
        self.attrib, self.operator, self.value = attrib, operator, value

    def __repr__(self):
        qualified = self.attrib
        if self.namespace:
            qualified = '%s|%s' % (self.namespace, self.attrib)
        class_name = self.__class__.__name__
        if self.operator != 'exists':
            return '%s[%r[%s %s %s]]' % (
                class_name, self.selector, qualified,
                self.operator, urepr(self.value))
        return '%s[%r[%s]]' % (class_name, self.selector, qualified)

    def specificity(self):
        """Specificity of the wrapped selector, plus one in the middle slot."""
        ids, attributes, elements = self.selector.specificity()
        return ids, attributes + 1, elements
|
|
|
|
|
|
class Element(object):

    """
    Represents namespace|element

    `None` is for the universal selector '*'

    """

    def __init__(self, namespace=None, element=None):
        self.namespace, self.element = namespace, element

    def __repr__(self):
        label = self.element or '*'
        if self.namespace:
            label = '%s|%s' % (self.namespace, label)
        return '%s[%s]' % (type(self).__name__, label)

    def specificity(self):
        """``(0, 0, 1)`` for a named element, ``(0, 0, 0)`` otherwise."""
        return (0, 0, 1) if self.element else (0, 0, 0)
|
|
|
|
|
|
class Hash(object):

    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.selector, self.id = selector, id

    def __repr__(self):
        parts = (type(self).__name__, self.selector, self.id)
        return '%s[%r#%s]' % parts

    def specificity(self):
        """Specificity of the wrapped selector, plus one in the first slot."""
        ids, attributes, elements = self.selector.specificity()
        return ids + 1, attributes, elements
|
|
|
|
|
|
class CombinedSelector(object):

    """
    Represents two selectors joined by a combinator.

    ``combinator`` is stored as given; a single space is rendered as
    ``<followed>`` by ``repr()``.
    """

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector, self.combinator = selector, combinator
        self.subselector = subselector

    def __repr__(self):
        shown = '<followed>' if self.combinator == ' ' else self.combinator
        parts = (type(self).__name__, self.selector, shown, self.subselector)
        return '%s[%r %s %r]' % parts

    def specificity(self):
        """Component-wise sum of both operands' specificities."""
        left_a, left_b, left_c = self.selector.specificity()
        right_a, right_b, right_c = self.subselector.specificity()
        return left_a + right_a, left_b + right_b, left_c + right_c
|
|
|
|
|
|
# Parser
|
|
|
|
# Fast-path patterns: each one matches a whole selector string, optionally
# surrounded by CSS whitespace (space, tab, CR, LF, form feed).

# foo
# Group 1 is the element name (ASCII letters only).
_el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$')

# foo#bar or #bar
# Group 1 is the (possibly empty) element name, group 2 the id.
_id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$')

# foo.bar or .bar
# Group 1 is the (possibly empty) element name, group 2 the class name,
# which must start with an ASCII letter.
_class_re = re.compile(
    r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$')
|
|
|
|
|
|
def parse(css):
    """Parse a CSS *group of selectors*.

    Bare ``foo``, ``foo#bar``/``#bar`` and ``foo.bar``/``.bar`` selectors
    are recognised with precompiled regular expressions; everything else
    goes through the tokenizer and the recursive parser.

    :param css:
        A *group of selectors* as an Unicode string.
    :raises:
        :class:`SelectorSyntaxError` on invalid selectors.
    :returns:
        A list of parsed :class:`Selector` objects, one for each
        selector in the comma-separated group.

    """
    # Fast path: a lone element name.
    element_match = _el_re.match(css)
    if element_match:
        return [Selector(Element(element=element_match.group(1)))]

    # Fast paths: an optional element name followed by one #id or one .class.
    for pattern, node_class in ((_id_re, Hash), (_class_re, Class)):
        found = pattern.match(css)
        if found is not None:
            element = Element(element=found.group(1) or None)
            return [Selector(node_class(element, found.group(2)))]

    # General case. NOTE: syntax errors propagate unchanged; they are not
    # decorated with the stream position here.
    stream = TokenStream(tokenize(css))
    stream.source = css
    return list(parse_selector_group(stream))
|
|
|
|
|
|
def parse_selector_group(stream):
    """Yield one :class:`Selector` per comma-separated selector in stream.

    Stops at the first token following a selector that is not a comma.
    """
    stream.skip_whitespace()
    while True:
        tree, pseudo_element = parse_selector(stream)
        yield Selector(tree, pseudo_element)
        if stream.peek() != ('DELIM', ','):
            return
        # Consume the comma and any whitespace before the next selector.
        stream.next()
        stream.skip_whitespace()
|
|
|
|
|
|
def parse_selector(stream):
    """Parse one selector: simple selectors joined by combinators.

    :returns: A ``(tree, pseudo_element)`` tuple.
    :raises: :class:`SelectorSyntaxError` when anything follows a
        pseudo-element.
    """
    tree, pseudo_element = parse_simple_selector(stream)
    while True:
        stream.skip_whitespace()
        upcoming = stream.peek()
        if upcoming == ('EOF', None) or upcoming == ('DELIM', ','):
            return tree, pseudo_element
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        # By exclusion, without an explicit combinator the previous
        # parse_simple_selector() ended at whitespace: descendant combinator.
        combinator = ' '
        if upcoming.is_delim('+', '>', '~'):
            combinator = stream.next().value
            stream.skip_whitespace()
        right_hand, pseudo_element = parse_simple_selector(stream)
        tree = CombinedSelector(tree, combinator, right_hand)
|
|
|
|
|
|
# Pseudo-elements that are also accepted with a single leading ':'.
special_pseudo_elements = (
    'first-line',
    'first-letter',
    'before',
    'after',
)
|
|
|
|
|
|
def parse_simple_selector(stream, inside_negation=False):
    """Parse one compound selector from ``stream``.

    Reads an optional ``namespace|element`` (or ``*``) prefix followed by
    any number of ``#id``, ``.class``, ``[attrib]``, ``:pseudo``,
    ``:function(...)``, ``:not(...)`` and ``::pseudo-element`` parts.

    :param stream: The token stream to consume from.
    :param inside_negation: True while parsing the argument of ``:not()``;
        a closing ``)`` then ends the selector and nested ``:not()`` is
        rejected.
    :returns: A ``(tree, pseudo_element)`` tuple; ``pseudo_element`` is
        ``None``, a string or a :class:`FunctionalPseudoElement`.
    :raises: :class:`SelectorSyntaxError` on malformed input.
    """
    stream.skip_whitespace()
    # Remember how many tokens were consumed so far, to detect below
    # whether this call consumed anything at all.
    selector_start = len(stream.used)
    peek = stream.peek()
    if peek.type == 'IDENT' or peek == ('DELIM', '*'):
        if peek.type == 'IDENT':
            namespace = stream.next().value
        else:
            stream.next()
            namespace = None
        if stream.peek() == ('DELIM', '|'):
            stream.next()
            element = stream.next_ident_or_star()
        else:
            # No '|' follows: what was read is the element, not a namespace.
            element = namespace
            namespace = None
    else:
        element = namespace = None
    result = Element(namespace, element)
    pseudo_element = None
    while 1:
        peek = stream.peek()
        # Whitespace, end of input, a comma or a combinator end the
        # compound selector; so does ')' inside :not().
        if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or (
                inside_negation and peek == ('DELIM', ')')):
            break
        # Nothing may follow a pseudo-element within the same selector.
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        if peek.type == 'HASH':
            result = Hash(result, stream.next().value)
        elif peek == ('DELIM', '.'):
            stream.next()
            result = Class(result, stream.next_ident())
        elif peek == ('DELIM', '['):
            stream.next()
            result = parse_attrib(result, stream)
        elif peek == ('DELIM', ':'):
            stream.next()
            if stream.peek() == ('DELIM', ':'):
                # '::name' or '::name(arguments)'
                stream.next()
                pseudo_element = stream.next_ident()
                if stream.peek() == ('DELIM', '('):
                    stream.next()
                    pseudo_element = FunctionalPseudoElement(
                        pseudo_element, parse_arguments(stream))
                continue
            ident = stream.next_ident()
            if ident.lower() in special_pseudo_elements:
                # Special case: CSS 2.1 pseudo-elements can have a single ':'
                # Any new pseudo-element must have two.
                pseudo_element = unicode_type(ident)
                continue
            if stream.peek() != ('DELIM', '('):
                # Plain pseudo-class such as ':hover'.
                result = Pseudo(result, ident)
                continue
            # Functional pseudo-class: consume '(' and leading whitespace.
            stream.next()
            stream.skip_whitespace()
            if ident.lower() == 'not':
                if inside_negation:
                    raise SelectorSyntaxError('Got nested :not()')
                argument, argument_pseudo_element = parse_simple_selector(
                    stream, inside_negation=True)
                # The token after the argument; checked to be ')' below.
                next = stream.next()
                if argument_pseudo_element:
                    raise SelectorSyntaxError(
                        'Got pseudo-element ::%s inside :not() at %s'
                        % (argument_pseudo_element, next.pos))
                if next != ('DELIM', ')'):
                    raise SelectorSyntaxError("Expected ')', got %s" % (next,))
                result = Negation(result, argument)
            else:
                result = Function(result, ident, parse_arguments(stream))
        else:
            raise SelectorSyntaxError(
                "Expected selector, got %s" % (peek,))
    # No token was consumed at all: there was no selector to parse.
    if len(stream.used) == selector_start:
        raise SelectorSyntaxError(
            "Expected selector, got %s" % (stream.peek(),))
    return result, pseudo_element
|
|
|
|
|
|
def parse_arguments(stream):
    """Collect the argument tokens of a functional pseudo up to ``)``.

    Whitespace between arguments is skipped. IDENT, STRING and NUMBER
    tokens and the ``+`` / ``-`` delimiters are accepted.

    :returns: The list of argument tokens, without the closing ``)``.
    :raises: :class:`SelectorSyntaxError` on any other token.
    """
    signs = [('DELIM', '+'), ('DELIM', '-')]
    collected = []
    while True:
        stream.skip_whitespace()
        token = stream.next()
        if token.type in ('IDENT', 'STRING', 'NUMBER') or token in signs:
            collected.append(token)
            continue
        if token == ('DELIM', ')'):
            return collected
        raise SelectorSyntaxError(
            "Expected an argument, got %s" % (token,))
|
|
|
|
|
|
def parse_attrib(selector, stream):
    """Parse an attribute selector; the opening ``[`` is already consumed.

    :param selector: The node the attribute test applies to.
    :param stream: The token stream, positioned just after ``[``.
    :returns: An :class:`Attrib` wrapping ``selector``.
    :raises: :class:`SelectorSyntaxError` on malformed input.
    """
    pipe = ('DELIM', '|')
    equals = ('DELIM', '=')
    closing = ('DELIM', ']')

    stream.skip_whitespace()
    attrib = stream.next_ident_or_star()
    # A '*' is only acceptable as a namespace, i.e. when followed by '|'.
    if attrib is None and stream.peek() != pipe:
        raise SelectorSyntaxError(
            "Expected '|', got %s" % (stream.peek(),))

    namespace = op = None
    if stream.peek() == pipe:
        stream.next()
        if stream.peek() == equals:
            # 'attrib|=' is the '|=' operator, not a namespace separator.
            stream.next()
            op = '|='
        else:
            namespace = attrib
            attrib = stream.next_ident()

    if op is None:
        stream.skip_whitespace()
        token = stream.next()
        if token == closing:
            return Attrib(selector, namespace, attrib, 'exists', None)
        if token == equals:
            op = '='
        elif token.is_delim('^', '$', '*', '~', '|', '!') and (
                stream.peek() == equals):
            op = token.value + '='
            stream.next()
        else:
            raise SelectorSyntaxError(
                "Operator expected, got %s" % (token,))

    stream.skip_whitespace()
    value_token = stream.next()
    if value_token.type not in ('IDENT', 'STRING'):
        raise SelectorSyntaxError(
            "Expected string or ident, got %s" % (value_token,))

    stream.skip_whitespace()
    token = stream.next()
    if token != closing:
        raise SelectorSyntaxError(
            "Expected ']', got %s" % (token,))
    return Attrib(selector, namespace, attrib, op, value_token.value)
|
|
|
|
|
|
def parse_series(tokens):
    """
    Parses the arguments for :nth-child() and friends.

    The token values are concatenated and read as ``an+b``, ``odd``,
    ``even`` or a plain integer. Matching is case-insensitive, so ``ODD``
    and ``2N+1`` are accepted.

    :param tokens: An iterable of tokens (objects with ``type`` and
        ``value``); it is consumed only once.
    :raises: ValueError if a STRING token is present or the text is not
        a valid series.
    :returns: ``(a, b)``

    """
    # Materialise first: the tokens are scanned twice below, which would
    # silently yield nothing the second time for a one-shot iterator.
    tokens = list(tokens)
    if any(token.type == 'STRING' for token in tokens):
        raise ValueError('String tokens not allowed in series.')
    s = ''.join(token.value for token in tokens).strip().lower()
    keywords = {'odd': (2, 1), 'even': (2, 0), 'n': (1, 0)}
    if s in keywords:
        return keywords[s]
    if 'n' not in s:
        # Just b
        return (0, int(s))
    a, b = s.split('n', 1)
    if not a:
        a = 1
    elif a in ('-', '+'):
        # A bare sign stands for -1 / +1.
        a = int(a + '1')
    else:
        a = int(a)
    b = int(b) if b else 0
    return (a, b)
|
|
|
|
|
|
# Token objects
|
|
|
|
class Token(tuple):

    """
    A ``(type, value)`` tuple that also records its source position.

    Tokens compare equal to plain ``(type, value)`` tuples; ``pos`` is
    not part of the comparison.
    """

    def __new__(cls, type_, value, pos):
        self = super(Token, cls).__new__(cls, (type_, value))
        self.pos = pos
        return self

    @property
    def type(self):
        return self[0]

    @property
    def value(self):
        return self[1]

    def __repr__(self):
        return "<%s '%s' at %i>" % (self.type, self.value, self.pos)

    def is_delim(self, *values):
        """True if this is a DELIM token whose value is one of ``values``."""
        if self.type != 'DELIM':
            return False
        return self.value in values
|
|
|
|
|
|
class EOFToken(Token):

    """End-of-input marker: a token of type ``'EOF'`` whose value is None."""

    def __new__(cls, pos):
        return super(EOFToken, cls).__new__(cls, 'EOF', None, pos)

    def __repr__(self):
        return '<%s at %i>' % (self.type, self.pos)
|
|
|
|
|
|
# Tokenizer
|
|
|
|
|
|
class TokenMacros:
    # Regular-expression fragments, used as a namespace only: _compile()
    # interpolates them into patterns by name via ``vars(TokenMacros)``.

    # A backslash, 1-6 hex digits (captured), then one optional whitespace.
    unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?'
    # A unicode escape, or a backslash followed by any character that is
    # neither a newline nor a lower-case hex digit.
    escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]'
    # Inside strings a backslash followed by a newline is also allowed.
    string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape
    # Any character outside the range \0-\177 (octal).
    nonascii = r'[^\0-\177]'
    # Characters allowed inside a name ...
    nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii)
    # ... and at the start of one (no digits, no hyphen).
    nmstart = '[_a-z]|%s|%s' % (escape, nonascii)
|
|
|
|
|
|
def _compile(pattern):
    """Return the ``match`` method of ``pattern`` compiled ignoring case.

    ``%(name)s`` placeholders are filled from the TokenMacros attributes.
    """
    expanded = pattern % vars(TokenMacros)
    regex = re.compile(expanded, re.IGNORECASE)
    return regex.match
|
|
|
|
|
|
# Token matchers: each is the bound ``match`` method of a case-insensitive
# compiled pattern, called by the tokenizer as ``matcher(s, pos=pos)``.
_match_whitespace = _compile(r'[ \t\r\n\f]+')
_match_number = _compile(r'[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)')
_match_hash = _compile('#(?:%(nmchar)s)+')
_match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*')
# String bodies, keyed by the opening quote character; the patterns can
# match the empty string and do not include the quotes themselves.
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}

# Substitution helpers (bound ``sub`` methods) used to undo escapes:
# backslash + any character, backslash + hex digits, backslash + newline.
_sub_simple_escape = re.compile(r'\\(.)').sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub
_sub_newline_escape = re.compile(r'\\(?:\n|\r\n|\r|\f)').sub
|
|
|
|
# Replacement callable for re.sub(): returns group 1 of the match.
# Same as r'\1', but faster on CPython
try:
    # Python 2.6+
    _replace_simple = operator.methodcaller('group', 1)
except AttributeError:
    def _replace_simple(match):
        return match.group(1)
|
|
|
|
|
|
def _replace_unicode(match):
    """Replacement callable for re.sub(): decode a hex escape.

    Group 1 of ``match`` holds the hex digits; code points above
    ``sys.maxunicode`` are replaced with U+FFFD.
    """
    value = int(match.group(1), 16)
    return codepoint_to_chr(value if value <= sys.maxunicode else 0xFFFD)
|
|
|
|
|
|
def unescape_ident(value):
    """Return ``value`` with hex escapes, then simple escapes, resolved."""
    without_hex = _sub_unicode_escape(_replace_unicode, value)
    return _sub_simple_escape(_replace_simple, without_hex)
|
|
|
|
|
|
def tokenize(s):
    """Yield the tokens of the selector string ``s``, ending with an EOF.

    Token types produced are S (a run of whitespace, with value ``' '``),
    IDENT, HASH (value without the ``#``), STRING (value without quotes),
    NUMBER and DELIM (any other single character). ``/* ... */`` comments
    are skipped; an unterminated comment runs to the end of the input.

    :raises: :class:`SelectorSyntaxError` on an unclosed or invalid string.
    """
    end = len(s)
    pos = 0
    while pos < end:
        found = _match_whitespace(s, pos=pos)
        if found:
            yield Token('S', ' ', pos)
            pos = found.end()
            continue

        found = _match_ident(s, pos=pos)
        if found:
            yield Token('IDENT', unescape_ident(found.group()), pos)
            pos = found.end()
            continue

        found = _match_hash(s, pos=pos)
        if found:
            # Drop the leading '#' before unescaping.
            yield Token('HASH', unescape_ident(found.group()[1:]), pos)
            pos = found.end()
            continue

        quote = s[pos]
        if quote in _match_string_by_quote:
            found = _match_string_by_quote[quote](s, pos=pos + 1)
            assert found, 'Should have found at least an empty match'
            closing = found.end()
            if closing == end:
                raise SelectorSyntaxError('Unclosed string at %s' % pos)
            if s[closing] != quote:
                raise SelectorSyntaxError('Invalid string at %s' % pos)
            # Escaped newlines are removed before the other escapes.
            text = unescape_ident(_sub_newline_escape('', found.group()))
            yield Token('STRING', text, pos)
            pos = closing + 1
            continue

        found = _match_number(s, pos=pos)
        if found:
            yield Token('NUMBER', found.group(), pos)
            pos = found.end()
            continue

        if s[pos:pos + 2] == '/*':
            comment_end = s.find('*/', pos + 2)
            pos = end if comment_end == -1 else comment_end + 2
            continue

        yield Token('DELIM', s[pos], pos)
        pos += 1

    assert pos == end
    yield EOFToken(pos)
|
|
|
|
|
|
class TokenStream(object):

    """
    Token iterator wrapper with one token of look-ahead.

    ``used`` lists every token handed out by ``next()``, in order.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        # Python 2 iterators expose next(), Python 3 ones __next__().
        if hasattr(self.tokens, 'next'):
            self.next_token = self.tokens.next
        else:
            self.next_token = self.tokens.__next__

    def next(self):
        """Consume and return the next token, recording it in ``used``."""
        if not self._peeking:
            token = self.next_token()
        else:
            self._peeking = False
            token = self.peeked
        self.used.append(token)
        return token

    def peek(self):
        """Return the next token without consuming it."""
        if self._peeking:
            return self.peeked
        self.peeked = self.next_token()
        self._peeking = True
        return self.peeked

    def next_ident(self):
        """Consume a token and return its value; it must be an IDENT."""
        token = self.next()
        if token.type == 'IDENT':
            return token.value
        raise SelectorSyntaxError('Expected ident, got %s' % (token,))

    def next_ident_or_star(self):
        """Consume an IDENT (returning its value) or ``*`` (returning None)."""
        token = self.next()
        if token.type == 'IDENT':
            return token.value
        if token == ('DELIM', '*'):
            return None
        raise SelectorSyntaxError(
            "Expected ident or '*', got %s" % (token,))

    def skip_whitespace(self):
        """Consume the next token if it is whitespace."""
        if self.peek().type == 'S':
            self.next()
|