HEX
Server: nginx/1.22.0
System: Linux iZuf6jdxbygmf6cco977lcZ 5.10.84-10.4.al8.x86_64 #1 SMP Tue Apr 12 12:31:07 CST 2022 x86_64
User: root (0)
PHP: 7.4.29
Disabled: passthru,exec,system,chroot,chgrp,chown,shell_exec,proc_open,proc_get_status,ini_alter,ini_restore,dl,readlink,symlink,popepassthru,stream_socket_server,fsocket,popen
Upload Files
File: //usr/local/aegis/PythonLoaderTemp/third_party/aegis_checker/offline/check_high_cpu.py
# -*- coding: utf-8 -*-

import sys
import re

import logging
import re
from aegis_checker.common.print_log import *
from aegis_checker.common.aegis_client_log_parser import LogObserver, LOG_INFO, LOG_WARN
from aegis_checker.info.check_result import *


def _get_high_cpu_thread_name(high_cpu_line_num, log_file_path):
    """
    Find the name of the thread that is reported as the high cpu thread for one "OnCpuMax" event.

    Starting at the event line, the log is scanned for the first line like
        2020-05-06 16:59:52 [Info] cpu high thread id:21741
    and then for the first thread table row whose second column equals that thread id, like
        NetLinkProc         21741     10          279         629         10
    The first column of that row is returned. Only the event line and the 50 lines after it
    are examined.

    :param high_cpu_line_num: log "2020-05-06 16:59:52 [Info] OnCpuMax : 29 , HitCount : 42" line number, start with 0
    :param log_file_path: path of the log file to scan
    :return: string, if fail (thread id or matching table row not found), return None
    """
    # how many lines after the event line are still searched
    search_window = 50

    high_cpu_tid_reg = re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \[Info\] cpu high thread id:(\d+)")
    # "(?:\s+|$)" instead of a mandatory trailing "\s+": the row must also match when it is the
    # last line of a file that does not end with a newline
    high_cpu_thread_name_reg = re.compile(r"^(\w+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+(?:\s+|$)")

    last_line_num = high_cpu_line_num + search_window
    tid = None
    with open(log_file_path) as f:
        # enumerate() counts from 0, the same base as high_cpu_line_num, so the scan starts
        # exactly on the event line and never on the line before it (which could carry a
        # stale "cpu high thread id" from an earlier event)
        for line_num, line in enumerate(f):
            if line_num < high_cpu_line_num:
                continue

            if line_num > last_line_num:
                break

            if tid is None:
                match_obj = high_cpu_tid_reg.match(line)
                if match_obj:
                    tid = match_obj.group(1)
            else:
                match_obj = high_cpu_thread_name_reg.match(line)
                if match_obj and tid == match_obj.group(2):
                    return match_obj.group(1)

    # reached both when the search window is exhausted and when the file ends first
    logging.warning("can not find high cpu thread for line %d in %s", high_cpu_line_num, log_file_path)
    return None


class HighCpuLogObserver(LogObserver):
    """
    Log observer that records every "OnCpuMax" info log and, when the log ends,
    reports the most frequently seen high cpu thread through set_root_cause.
    """

    def __init__(self):
        # one dict per event with keys "date", "time", "content", "thread_name"
        self.__high_cpu_events = []

    def on_end(self, success):
        """
        Log a warning for every recorded high cpu event and set the root cause
        according to the thread that appears in the most events.

        :param success: not read by this observer
        :return: None; nothing is logged or set when no event was recorded
        """
        events = self.__high_cpu_events
        if not events:
            return

        occurrences = {}
        for event in events:
            name = event["thread_name"]
            occurrences[name] = occurrences.get(name, 0) + 1

            if name == "NetLinkProc":
                message = "offline issue root cause is there is more than 200 process start in 1 second, yundun is high cpu and kill self when %s %s, log is %s" % (
                    event["date"], event["time"], event["content"])
            else:
                message = "offline issue may be caused by yundun is high cpu and kill self when %s %s, high cpu thread is %s, log is %s" % (
                    event["date"], event["time"], name, event["content"])
            logging.warning(message)

        # thread with the highest event count; on a tie the first one in dict order wins
        top_thread, _ = max(occurrences.items(), key=lambda item: item[1])

        if top_thread != "NetLinkProc":
            set_root_cause(ROOT_CAUSE_HIGH_CPU, "yundun is high cpu and kill self, high cpu thread is %s" % top_thread)
            return

        set_root_cause(ROOT_CAUSE_HIGH_CPU_BY_FREQUENT_NEW_PROCESS, "there is more than 200 process start in 1 second, yundun is high cpu and kill self")

    def on_log(self, log_date, log_time, log_type, content, line, line_num, log_file_path):
        """
        Record an event when an info log looks like the second line below:
        2020-03-25 06:08:22 [Warn] GetMaxCpu : 23
        2020-03-25 06:08:22 [Info] OnCpuMax : 139 , HitCount : 47

        :param log_date: date part of the log line, stored with the event
        :param log_time: time part of the log line, stored with the event
        :param log_type: log level, only LOG_INFO is considered
        :param content: log text after the level tag, matched against the OnCpuMax pattern
        :param line: the raw log line, not read here
        :param line_num: line number of the log, passed to _get_high_cpu_thread_name
        :param log_file_path: path of the log file, passed to _get_high_cpu_thread_name
        :return: None
        """
        if not (log_type == LOG_INFO and re.match(r"OnCpuMax : \d+ , HitCount : \d+", content)):
            return

        culprit = _get_high_cpu_thread_name(line_num, log_file_path)
        event = {
            "date": log_date,
            "time": log_time,
            "content": content,
            "thread_name": culprit,
        }
        self.__high_cpu_events.append(event)


def test():
    """
    Manual driver: feed the log file given as the first command line argument
    to a HighCpuLogObserver, line by line, then signal the end of the log.
    """
    logging.basicConfig(format='%(asctime)s [%(filename)s][%(levelname)s] %(message)s', level=logging.DEBUG)
    observer = HighCpuLogObserver()
    path = sys.argv[1]
    # "<date> <time> [<level>] <content>"
    log_line_reg = re.compile(r"^(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)", re.I)
    with open(path) as log_file:
        # line numbers passed to the observer start with 0
        for index, raw_line in enumerate(log_file):
            parsed = log_line_reg.match(raw_line)
            if not parsed:
                continue
            date_part, time_part, level, text = parsed.groups()
            observer.on_log(date_part, time_part, level, text, raw_line, index, path)

        observer.on_end(True)


# Script entry point: run the manual test driver when this file is executed directly.
if __name__ == '__main__':
    test()