mirror of https://github.com/gryf/ebook-converter.git
Initial import
ebook_converter/tinycss/tokenizer.py (new file, 216 lines)
@@ -0,0 +1,216 @@
# coding: utf8
"""
tinycss.tokenizer
-----------------

Tokenizer for the CSS core syntax:
http://www.w3.org/TR/CSS21/syndata.html#tokenization

This is the pure-python implementation. See also speedups.pyx

:copyright: (c) 2012 by Simon Sapin.
:license: BSD, see LICENSE for more details.
"""

from __future__ import unicode_literals

from ebook_converter.tinycss import token_data


def tokenize_flat(css_source, ignore_comments=True,
                  # Make these local variables to avoid global lookups
                  # in the loop
                  tokens_dispatch=token_data.TOKEN_DISPATCH,
                  unicode_unescape=token_data.UNICODE_UNESCAPE,
                  newline_unescape=token_data.NEWLINE_UNESCAPE,
                  simple_unescape=token_data.SIMPLE_UNESCAPE,
                  find_newlines=token_data.FIND_NEWLINES,
                  Token=token_data.Token,
                  len=len,
                  int=int,
                  float=float,
                  list=list,
                  _None=None,
                  ):
    """
    :param css_source:
        CSS as a Unicode string
    :param ignore_comments:
        if true (the default), comments will not be included in the
        return value
    :return:
        A list of :class:`Token`

    """
    pos = 0
    line = 1
    column = 1
    source_len = len(css_source)
    tokens = []
    while pos < source_len:
        char = css_source[pos]
        if char in ':;{}()[]':
            type_ = char
            css_value = char
        else:
            codepoint = min(ord(char), 160)
            for _index, type_, regexp in tokens_dispatch[codepoint]:
                match = regexp(css_source, pos)
                if match is not None:
                    # First match is the longest. See comments on TOKENS
                    # in token_data.
                    css_value = match.group()
                    break
            else:
                # No match.
                # "Any other character not matched by the above rules,
                # and neither a single nor a double quote."
                # ... but quotes at the start of a token are always matched
                # by STRING or BAD_STRING. So DELIM is any single character.
                type_ = 'DELIM'
                css_value = char
        length = len(css_value)
        next_pos = pos + length

        # A BAD_COMMENT is a comment at EOF. Ignore it too.
        if not (ignore_comments and type_ in ('COMMENT', 'BAD_COMMENT')):
            # Parse numbers, extract strings and URIs, unescape
            unit = _None
            if type_ == 'DIMENSION':
                value = match.group(1)
                value = float(value) if '.' in value else int(value)
                unit = match.group(2)
                unit = simple_unescape(unit)
                unit = unicode_unescape(unit)
                unit = unit.lower()  # normalize
            elif type_ == 'PERCENTAGE':
                value = css_value[:-1]
                value = float(value) if '.' in value else int(value)
                unit = '%'
            elif type_ == 'NUMBER':
                value = css_value
                if '.' in value:
                    value = float(value)
                else:
                    value = int(value)
                    type_ = 'INTEGER'
            elif type_ in ('IDENT', 'ATKEYWORD', 'HASH', 'FUNCTION'):
                value = simple_unescape(css_value)
                value = unicode_unescape(value)
            elif type_ == 'URI':
                value = match.group(1)
                if value and value[0] in '"\'':
                    value = value[1:-1]  # Remove quotes
                    value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            elif type_ == 'STRING':
                value = css_value[1:-1]  # Remove quotes
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            # BAD_STRING can only be one of:
            # * Unclosed string at the end of the stylesheet:
            #   Close the string, but this is not an error.
            #   Make it a "good" STRING token.
            # * Unclosed string at the (unescaped) end of the line:
            #   Close the string, but this is an error.
            #   Leave it as a BAD_STRING, don't bother parsing it.
            # See http://www.w3.org/TR/CSS21/syndata.html#parsing-errors
            elif type_ == 'BAD_STRING' and next_pos == source_len:
                type_ = 'STRING'
                value = css_value[1:]  # Remove quote
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            else:
                value = css_value
            tokens.append(Token(type_, css_value, value, unit, line, column))

        pos = next_pos
        newlines = find_newlines(css_value)
        if newlines:
            line += len(newlines)
            # The new column is the offset from the end of the last newline
            # in this token. Add 1 so columns start at 1, not 0.
            column = length - newlines[-1].end() + 1
        else:
            column += length
    return tokens
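
# Illustrative usage sketch, not part of the original module. The exact
# token type names ('IDENT', 'S', 'DIMENSION', ...) are defined in
# token_data and assumed here; the fields follow the
# Token(type, css_value, value, unit, line, column) construction above.
#
#     for token in tokenize_flat('p { margin: 2em }'):
#         print(token.type, repr(token.value), token.unit)
#
# The '2em' token comes out as DIMENSION with value 2 (an int, since the
# literal contains no '.') and unit 'em'; ':', '{' and '}' each use their
# own character as the token type.
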

def regroup(tokens):
    """
    Match pairs of tokens: () [] {} function()
    (Strings in "" or '' are taken care of by the tokenizer.)

    Opening tokens are replaced by a :class:`ContainerToken`.
    Closing tokens are removed. Unmatched closing tokens are invalid
    but left as-is. All nested structures that are still open at
    the end of the stylesheet are implicitly closed.

    :param tokens:
        a *flat* iterable of tokens, as returned by :func:`tokenize_flat`.
    :return:
        A tree of tokens.

    """
    # "global" objects for the inner recursion
    pairs = {'FUNCTION': ')', '(': ')', '[': ']', '{': '}'}
    tokens = iter(tokens)
    eof = [False]

    def _regroup_inner(stop_at=None,
                       tokens=tokens, pairs=pairs, eof=eof,
                       ContainerToken=token_data.ContainerToken,
                       FunctionToken=token_data.FunctionToken):
        for token in tokens:
            type_ = token.type
            if type_ == stop_at:
                return

            end = pairs.get(type_)
            if end is None:
                yield token  # Not a grouping token
            else:
                assert not isinstance(token, ContainerToken), (
                    'Token looks already grouped: {0}'.format(token))
                content = list(_regroup_inner(end))
                if eof[0]:
                    end = ''  # Implicit end of structure at EOF.
                if type_ == 'FUNCTION':
                    yield FunctionToken(token.type, token.as_css(), end,
                                        token.value, content,
                                        token.line, token.column)
                else:
                    yield ContainerToken(token.type, token.as_css(), end,
                                         content,
                                         token.line, token.column)
        else:
            # The iterator was exhausted without hitting stop_at:
            # end of file/stylesheet.
            eof[0] = True
    return _regroup_inner()
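
# Illustrative sketch, not part of the original module: regrouping replaces
# each '(' '[' '{' or FUNCTION token with a single container token holding
# everything up to the matching closer, recursively.
#
#     flat = tokenize_flat('a[href] { color: rgb(0, 0, 0) }')
#     for token in regroup(flat):
#         print(token.type)
#
# This yields IDENT for 'a', one '[' container for '[href]', whitespace,
# then one '{' container whose contents in turn hold a FUNCTION token for
# rgb(...). Closing ']' ')' '}' tokens are consumed, not yielded.
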

def tokenize_grouped(css_source, ignore_comments=True):
    """
    :param css_source:
        CSS as a Unicode string
    :param ignore_comments:
        if true (the default), comments will not be included in the
        return value
    :return:
        An iterator of :class:`Token`

    """
    return regroup(tokenize_flat(css_source, ignore_comments))
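
# Illustrative sketch, not part of the original module: the grouped stream
# can be walked recursively. That container tokens expose their children as
# a .content attribute is an assumption about token_data.ContainerToken.
#
#     def walk(tokens, depth=0):
#         for token in tokens:
#             print('  ' * depth + token.type)
#             if hasattr(token, 'content'):
#                 walk(token.content, depth + 1)
#
#     walk(tokenize_grouped('a { color: rgb(0, 0, 0) }'))
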

# Optional Cython version of tokenize_flat
# Make both versions available with explicit names for tests.
python_tokenize_flat = tokenize_flat

try:
    tok = token_data.load_c_tokenizer()
except (ImportError, RuntimeError):
    c_tokenize_flat = None
else:
    # Use the C tokenizer by default. Keep the documented default of
    # ignore_comments=True, matching the pure-python implementation.
    def c_tokenize_flat(css_source, ignore_comments=True):
        return tok.tokenize_flat(css_source, ignore_comments)

    tokenize_flat = c_tokenize_flat
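
# Testing sketch, not part of the original module: with both names exported,
# a test can check that the two implementations agree. Token.as_css() is the
# serialisation already used by regroup() above; the pytest usage is an
# assumption.
#
#     import pytest
#
#     @pytest.mark.parametrize('tokenize', [
#         t for t in (python_tokenize_flat, c_tokenize_flat)
#         if t is not None])
#     def test_tokenize_roundtrip(tokenize):
#         source = 'a { color: #fff } /* note */'
#         css = ''.join(t.as_css() for t in tokenize(source, False))
#         assert css == source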