# File: //usr/local/aegis/PythonLoaderTemp/third_party/aegis_checker/offline/check_high_cpu.py
# -*- coding: utf-8 -*-
import sys
import re
import logging
import re
from aegis_checker.common.print_log import *
from aegis_checker.common.aegis_client_log_parser import LogObserver, LOG_INFO, LOG_WARN
from aegis_checker.info.check_result import *
def _get_high_cpu_thread_name(high_cpu_line_num, log_file_path):
    """
    Find the name of the thread reported as the high cpu thread for one
    "OnCpuMax" event.

    Starting at the event line, the log is scanned for a
    "cpu high thread id:<tid>" line; after that, the first thread-table line
    whose second column equals that tid gives the thread name. Only the event
    line and the 50 lines following it are examined.

    :param high_cpu_line_num: log "2020-05-06 16:59:52 [Info] OnCpuMax : 29 , HitCount : 42" line number, start with 0
    :param log_file_path: path of the log file to scan
    :return: string, if fail, return None
    """
    # 2020-05-06 16:59:52 [Info] cpu high thread id:21741
    # NetLinkProc 21741 10 279 629 10
    high_cpu_tid_reg = re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \[Info\] cpu high thread id:(\d+)")
    high_cpu_thread_name_reg = re.compile(r"^(\w+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+")

    last_line_num = high_cpu_line_num + 50
    tid = None
    with open(log_file_path) as f:
        # enumerate() is 0-based, matching the documented numbering of
        # high_cpu_line_num, so the scan starts on the event line itself and
        # never on the line before it (which could carry a stale thread id).
        for line_num, line in enumerate(f):
            if line_num < high_cpu_line_num:
                continue
            if line_num > last_line_num:
                break
            if tid is None:
                # First phase: look for the "cpu high thread id" line.
                match_obj = high_cpu_tid_reg.match(line)
                if match_obj:
                    tid = match_obj.group(1)
                continue
            # Second phase: look for the thread-table row with that tid.
            match_obj = high_cpu_thread_name_reg.match(line)
            if match_obj and match_obj.group(2) == tid:
                return match_obj.group(1)

    # Reached both when the window is exhausted and when the file ends early,
    # so every "not found" outcome is reported.
    logging.warning("can not find high cpu thread for line %d in %s", high_cpu_line_num, log_file_path)
    return None
class HighCpuLogObserver(LogObserver):
    """
    Log observer that records every "OnCpuMax" high cpu event seen in the
    log and, when the log ends, reports the events and sets the root cause
    from the thread that was most often the high cpu thread.
    """

    # Content of a high cpu event line, e.g. "OnCpuMax : 139 , HitCount : 47".
    # Compiled once instead of handing the pattern string to re.match() for
    # every log line.
    _HIGH_CPU_REG = re.compile(r"OnCpuMax : \d+ , HitCount : \d+")

    def __init__(self):
        # One dict per event with the keys "date", "time", "content" and
        # "thread_name" (thread_name is None when it could not be resolved).
        self.__high_cpu_events = []

    def on_end(self, success):
        """
        Log a warning for each recorded high cpu event and set the root
        cause according to the most frequent high cpu thread. Does nothing
        when no event was recorded.

        :param success: not used by this observer
        :return:
        """
        if not self.__high_cpu_events:
            return

        # Function-scope import keeps this block self-contained.
        from collections import Counter

        thread_counts = Counter()
        for high_cpu_event in self.__high_cpu_events:
            thread = high_cpu_event["thread_name"]
            thread_counts[thread] += 1

            # Arguments are passed to logging lazily; the rendered text is
            # the same as with eager "%" formatting.
            if thread == "NetLinkProc":
                logging.warning(
                    "offline issue root cause is there is more than 200 process start in 1 second, yundun is high cpu and kill self when %s %s, log is %s",
                    high_cpu_event["date"], high_cpu_event["time"], high_cpu_event["content"])
            else:
                logging.warning(
                    "offline issue may be caused by yundun is high cpu and kill self when %s %s, high cpu thread is %s, log is %s",
                    high_cpu_event["date"], high_cpu_event["time"], thread, high_cpu_event["content"])

        # get the top high cpu thread (on a tie, the one seen first wins)
        high_cpu_thread_name = thread_counts.most_common(1)[0][0]

        if high_cpu_thread_name == "NetLinkProc":
            set_root_cause(ROOT_CAUSE_HIGH_CPU_BY_FREQUENT_NEW_PROCESS, "there is more than 200 process start in 1 second, yundun is high cpu and kill self")
        else:
            set_root_cause(ROOT_CAUSE_HIGH_CPU, "yundun is high cpu and kill self, high cpu thread is %s" % high_cpu_thread_name)

    def on_log(self, log_date, log_time, log_type, content, line, line_num, log_file_path):
        """
        Record a high cpu event when the log line is an [Info] line whose
        content starts with "OnCpuMax : <n> , HitCount : <n>".

        2020-03-25 06:08:22 [Warn] GetMaxCpu : 23
        2020-03-25 06:08:22 [Info] OnCpuMax : 139 , HitCount : 47

        :param log_date: date part of the log line
        :param log_time: time part of the log line
        :param log_type: log level, compared against LOG_INFO
        :param content: message part of the log line
        :param line: the whole log line (not used here)
        :param line_num: line number of the log line, passed to _get_high_cpu_thread_name
        :param log_file_path: path of the log file, passed to _get_high_cpu_thread_name
        :return:
        """
        if log_type != LOG_INFO or not self._HIGH_CPU_REG.match(content):
            return

        thread_name = _get_high_cpu_thread_name(line_num, log_file_path)
        self.__high_cpu_events.append({
            "date": log_date,
            "time": log_time,
            "content": content,
            "thread_name": thread_name
        })
def test():
    """
    Manual driver: feed the log file named by the first command line
    argument through a HighCpuLogObserver, line by line, then finish it.
    """
    logging.basicConfig(format='%(asctime)s [%(filename)s][%(levelname)s] %(message)s', level=logging.DEBUG)
    observer = HighCpuLogObserver()
    path = sys.argv[1]

    # "<date> <time> [<type>] <content>"
    log_line_reg = re.compile(r"^(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)", re.I)

    with open(path) as log_file:
        # index is the 0-based position of the line in the file
        for index, raw_line in enumerate(log_file):
            parsed = log_line_reg.match(raw_line)
            if parsed is None:
                continue
            date_part, time_part, type_part, message = parsed.groups()
            observer.on_log(date_part, time_part, type_part, message, raw_line, index, path)

    observer.on_end(True)
# Run the manual driver only when this file is executed as a script.
if __name__ == '__main__':
    test()