HEX

File: //usr/local/aegis/PythonLoader/third_party/apacheconfig/lexer.py
#
# This file is part of apacheconfig software.
#
# Copyright (c) 2018-2019, Ilya Etingof <etingof@gmail.com>
# License: https://github.com/etingof/apacheconfig/LICENSE.rst
#
from __future__ import unicode_literals

import logging
import re

import ply.lex as lex
import six

from apacheconfig.error import ApacheConfigError

log = logging.getLogger(__name__)


class SingleQuotedString(six.text_type):
    is_single_quoted = True


class DoubleQuotedString(six.text_type):
    is_double_quoted = True


class HashCommentsLexer(object):
    tokens = (
        'HASHCOMMENT',
    )

    states = ()

    def t_HASHCOMMENT(self, t):
        r'(?<!\\)\#(?:(?:\\\n)|[^\n\r])*'
        # Matches unescaped pound-sign, then escaped newlines or characters
        if not self.options.get('multilinehashcomments'):
            # If multiline hash-comments aren't allowed, ignore escaped
            # newlines
            if '\n' in t.value:
                first, second = t.value.split('\n', 1)
                t.lexer.lexpos = t.lexer.lexpos - len(second) - 1
                t.value = first
        return t


class CStyleCommentsLexer(object):
    tokens = (
        'CCOMMENT',
    )

    states = (
        ('ccomment', 'exclusive'),
    )

    def t_CCOMMENT(self, t):
        r'\/\*'
        t.lexer.code_start = t.lexer.lexpos
        t.lexer.ccomment_level = 1  # Initial comment level
        t.lexer.begin('ccomment')

    def t_ccomment_open(self, t):
        r'\/\*'
        t.lexer.ccomment_level += 1

    def t_ccomment_close(self, t):
        r'\*\/'
        t.lexer.ccomment_level -= 1

        if t.lexer.ccomment_level == 0:
            t.value = t.lexer.lexdata[t.lexer.code_start:
                                      t.lexer.lexpos + 1 - 3]
            t.type = "CCOMMENT"
            t.lexer.lineno += t.value.count('\n')
            t.lexer.begin('INITIAL')
            return t

    def t_ccomment_body(self, t):
        r'.+?'

    def t_ccomment_error(self, t):
        raise ApacheConfigError("Illegal character '%s' in C-style comment"
                                % t.value[0])


class ApacheIncludesLexer(object):
    tokens = (
        'APACHEINCLUDE',
        'APACHEINCLUDEOPTIONAL'
    )

    states = ()

    def t_APACHEINCLUDE(self, t):
        r'include[\t ]+[^\n\r]+'
        include, whitespace, value = re.split(r'([ \t]+)', t.value, maxsplit=1)
        t.value = include, whitespace, value
        return t

    def t_APACHEINCLUDEOPTIONAL(self, t):
        r'includeoptional[\t ]+[^\n\r]+'
        include, whitespace, value = re.split(r'([ \t]+)', t.value, maxsplit=1)
        t.value = include, whitespace, value
        return t


class BaseApacheConfigLexer(object):

    tokens = (
        'INCLUDE',
        'OPEN_TAG',
        'CLOSE_TAG',
        'OPEN_CLOSE_TAG',
        'OPTION_AND_VALUE',
        'OPTION_AND_VALUE_NOSTRIP',
        'WHITESPACE',
        'NEWLINE',
    )

    states = (
        ('multiline', 'exclusive'),
        ('heredoc', 'exclusive'),
    )

    def __init__(self, tempdir=None, debug=False):
        self._tempdir = tempdir
        self._debug = debug
        self.engine = None
        self.reset()

    def reset(self):
        self.engine = lex.lex(
            module=self,
            reflags=re.DOTALL | re.IGNORECASE,
            outputdir=self._tempdir,
            debuglog=log if self._debug else None,
            errorlog=log if self._debug else None
        )

    def tokenize(self, text):
        self.engine.input(text)

        tokens = []

        while True:
            token = self.engine.token()
            if not token:
                break
            tokens.append(token.value)

        return tokens

    # Tokenizer rules

    def t_INCLUDE(self, t):
        r'<<include[\t ]+[^\n\r\t]+>>'
        include, whitespace, value = re.split(r'([ \t]+)',
                                              t.value[2:-2], maxsplit=1)
        t.value = '<<', include, whitespace, value, '>>'
        return t

    def t_CLOSE_TAG(self, t):
        r'</[^\n\r]+>'
        t.value = t.value[2:-1]
        return t

    def t_OPEN_CLOSE_TAG(self, t):
        r'<[^\n\r/]*?[^\n\r/ ]/>'
        if self.options.get('disableemptyelementtags', False):
            t.type = 'OPEN_TAG'
            return self.t_OPEN_TAG(t)
        t.value = t.value[1:-2]
        return self._lex_option(t)

    def t_OPEN_TAG(self, t):
        r'<[^\n\r]+>|<[^\n\r]+\\\n'
        t.value = t.value[1:-1]
        return self._lex_option(t)

    @staticmethod
    def _parse_option_value(token, lineno):
        # Grabs the first token before the first non-quoted whitespace.
        match = re.search(r'[^=\s"\']+|"([^"]*)"|\'([^\']*)\'', token)
        if not match:
            raise ApacheConfigError(
                'Syntax error in option-value pair %s on line '
                '%d' % (token, lineno))
        option = match.group(0)
        if len(token.strip()) == len(option):
            return token, None, None
        # If there's more, split it out into whitespace and value.
        _, middle, value = re.split(r'((?:\s|=|\\\s)+)',
                                    token[len(option):], maxsplit=1)
        if not option:
            raise ApacheConfigError(
                'Syntax error in option-value pair %s on line '
                '%d' % (token, lineno))
        if value:
            stripped = value.strip()
            if stripped[0] == '"' and stripped[-1] == '"':
                value = DoubleQuotedString(stripped[1:-1])
            if stripped[0] == "'" and stripped[-1] == "'":
                value = SingleQuotedString(stripped[1:-1])
        return option, middle, value

    def _pre_parse_value(self, option, value):
        try:
            pre_parse_value = self.options['plug']['pre_parse_value']
            return pre_parse_value(option, value)

        except KeyError:
            return True, option, value

    def _lex_option(self, t):
        if t.value.endswith('\\'):
            t.lexer.multiline_newline_seen = False
            t.lexer.code_start = t.lexer.lexpos - len(t.value)
            if "TAG" in t.type:
                t.lexer.code_start -= 1
            t.lexer.begin('multiline')
            self._current_type = t.type
            return

        lineno = len(re.findall(r'\r\n|\n|\r', t.value))

        option, whitespace, value = self._parse_option_value(t.value, t.lineno)
        if not value:
            t.value = (option,)
            return t

        process, option, value = self._pre_parse_value(option, value)
        if not process:
            return

        if value.startswith('<<'):
            t.lexer.heredoc_anchor = value[2:].strip()
            t.lexer.heredoc_option = option
            t.lexer.heredoc_whitespace = whitespace
            t.lexer.code_start = t.lexer.lexpos + 1
            t.lexer.begin('heredoc')
            return

        t.value = option, whitespace, value

        t.lexer.lineno += lineno

        return t

    def t_multiline_OPTION_AND_VALUE(self, t):
        r'[^\r\n]+'
        t.lexer.multiline_newline_seen = False

        if t.value.endswith('\\'):
            return

        t.type = self._current_type
        t.lexer.begin('INITIAL')

        value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos + 1]
        value = self._remove_trailing_whitespace(value)
        t.lexer.lexpos = t.lexer.code_start + len(value)
        t.lexer.lineno += len(re.findall(r'\r\n|\n|\r', value))

        option, whitespace, value = self._parse_option_value(value, t.lineno)

        process, option, value = self._pre_parse_value(option, value)
        if not process:
            return

        if t.type == "OPEN_TAG":
            if value.endswith("/>"):
                t.type = "OPEN_CLOSE_TAG"
                value = value[:-1]
            value = value[:-1]

        # To match perl parser behavior, whitespace between text is normalized
        # when a value or block name is on multiple lines.
        if ("\\\n" in value and not
           self.options.get('preservewhitespace', False)):
            value = " ".join(re.split(r'(?:\s|\\\s)+', value))
        t.value = option, whitespace, value
        return t

    def t_multiline_NEWLINE(self, t):
        r'\r\n|\n|\r'
        if t.lexer.multiline_newline_seen:
            return self.t_multiline_OPTION_AND_VALUE(t)
        t.lexer.multiline_newline_seen = True

    def t_multiline_error(self, t):
        raise ApacheConfigError(
            "Illegal character '%s' in multi-line text on line "
            "%d" % (t.value[0], t.lineno))

    def _remove_trailing_whitespace(self, value):
        # if stripped_value ends with an odd number of backslashes, the first
        # trailing whitespace character was escaped, should be in `value`
        def trailing_escape(s):
            return (len(s) - len(s.rstrip('\\'))) % 2 == 1
        value = value.rstrip()
        while trailing_escape(value):
            value = value[:-1].rstrip()
        return value

    def t_heredoc_OPTION_AND_VALUE(self, t):
        r'[^\r\n]+'
        if t.value.lstrip() != t.lexer.heredoc_anchor:
            return

        t.type = "OPTION_AND_VALUE"
        t.lexer.begin('INITIAL')

        value = t.lexer.lexdata[t.lexer.code_start:
                                t.lexer.lexpos - len(t.lexer.heredoc_anchor)]
        value = self._remove_trailing_whitespace(value)

        t.lexer.lineno += len(re.findall(r'\r\n|\n|\r', t.value))

        t.value = t.lexer.heredoc_option, t.lexer.heredoc_whitespace, value

        return t

    def t_heredoc_NEWLINE(self, t):
        r'\r\n|\n|\r'
        t.lexer.lineno += 1

    def t_heredoc_error(self, t):
        raise ApacheConfigError(
            "Illegal character '%s' in here-document text on line "
            "%d" % (t.value[0], t.lineno))

    def t_NEWLINE(self, t):
        r'[ \t]*((\r\n|\n|\r|\\)[\t ]*)+'
        if t.value != '\\':
            t.lexer.lineno += 1
        return t

    def t_WHITESPACE(self, t):
        r'[ \t]+'
        return t

    def t_error(self, t):
        raise ApacheConfigError(
            "Illegal character '%s' on line %d" % (t.value[0], t.lineno))


class OptionLexer(BaseApacheConfigLexer):
    def t_OPTION_AND_VALUE(self, t):
        r'[^ \n\r\t=\#]+([ \t=]+(?:\\\#|[^ \t\r\n\#])+)*'
        # Regex above matches (text, (spaces, text)*) where text
        # can include escaped hashes but not regular ones.
        return self._lex_option(t)


class NoStripLexer(BaseApacheConfigLexer):
    def t_OPTION_AND_VALUE_NOSTRIP(self, t):
        r'[^ \n\r\t=\#]+[ \t=]+(?:\\\#|[^\r\n\#])+'
        return self._lex_option(t)


def make_lexer(**options):
    lexer_class = OptionLexer
    if options.get('nostripvalues'):
        lexer_class = NoStripLexer

    lexer_class = type(str('ApacheConfigLexer'),
                       (lexer_class, HashCommentsLexer),
                       {'tokens': lexer_class.tokens +
                        HashCommentsLexer.tokens,
                        'states': lexer_class.states +
                        HashCommentsLexer.states,
                        'options': options})

    if options.get('ccomments', True):
        lexer_class = type(str('ApacheConfigLexer'),
                           (lexer_class, CStyleCommentsLexer),
                           {'tokens': lexer_class.tokens +
                            CStyleCommentsLexer.tokens,
                            'states': lexer_class.states +
                            CStyleCommentsLexer.states,
                            'options': options})

    if options.get('useapacheinclude', True):
        lexer_class = type(str('ApacheConfigLexer'),
                           (lexer_class, ApacheIncludesLexer),
                           {'tokens': lexer_class.tokens +
                            ApacheIncludesLexer.tokens,
                            'states': lexer_class.states +
                            ApacheIncludesLexer.states,
                            'options': options})

    return lexer_class