Your IP : 18.218.1.38
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import sys
import time

import psutil
# Set default values for thresholds and monitoring time
# Default for the -w option: process count at which the status becomes WARNING.
WARNNUM = 10
# Default for the -c option: process count at which the status becomes CRITICAL.
CRITNUM = 20
# Default for the -t option: monitoring time, expressed in minutes.
MONTIME = 15
# State file: write_log() appends one JSON object per line, and
# read_last_log() reads the most recent line back on the next run.
LOG_FILE = '/var/log/check_stalled_procs.json'
def parse_arguments():
    """
    Build the command-line parser and parse the arguments of this process.

    Three integer options are registered, each falling back to a module-level
    default when omitted:

    * ``-w`` -- warning threshold for the process count (default ``WARNNUM``)
    * ``-c`` -- critical threshold for the process count (default ``CRITNUM``)
    * ``-t`` -- monitoring time in minutes (default ``MONTIME``)

    Returns:
        The ``argparse.Namespace`` with attributes ``w``, ``c`` and ``t``.
    """
    # (flag, default value, help text) for every supported option.
    option_specs = (
        ('-w', WARNNUM, 'Warning threshold for process count'),
        ('-c', CRITNUM, 'Critical threshold for process count'),
        ('-t', MONTIME, 'Monitoring time in minutes'),
    )
    cli = argparse.ArgumentParser()
    for flag, fallback, description in option_specs:
        cli.add_argument(flag, type=int, default=fallback, help=description)
    return cli.parse_args()
def get_process_info(state):
    """
    Retrieve information about processes in a given state.

    Args:
        state: A psutil status value (for example ``psutil.STATUS_ZOMBIE``)
            that a process' ``status`` field must equal to be included.

    Returns:
        A ``(count, descriptions)`` tuple. ``descriptions`` holds one dict per
        matching process with the keys ``pid``, ``user`` and ``cmd``; ``cmd``
        is the first three command-line elements (a list) or, when the command
        line is empty or unavailable, the process name (a string).
        On any error a message is printed and ``(0, [])`` is returned.
    """
    try:
        descriptions = []
        for proc in psutil.process_iter(['pid', 'status', 'cmdline', 'username', 'name']):
            info = proc.info
            if info['status'] != state:
                continue
            cmdline = info['cmdline']
            descriptions.append({
                'pid': info['pid'],
                # psutil stores None (not a missing key) for fields it could
                # not read, so a dict.get() default would never be used;
                # `or` covers both the missing and the None case.
                'user': info.get('username') or 'unknown',
                'cmd': cmdline[:3] if cmdline else (info.get('name') or 'unknown'),
            })
        return len(descriptions), descriptions
    except Exception as e:
        # Deliberate best-effort: a failure to enumerate processes is reported
        # as "no processes found" rather than aborting the check.
        print(f"Error retrieving process information: {e}")
        return 0, []
def read_last_log():
    """
    Read the most recent entry from the log file.

    The file is expected to hold one JSON object per line. Lines are examined
    from the end of the file backwards; blank lines, lines that are not valid
    JSON (for example a truncated final line) and JSON values that are not
    objects are skipped so that a damaged tail does not crash the check.

    Returns:
        The decoded dict of the newest usable entry, or ``None`` if the file
        does not exist, is empty, or contains no usable entry.
    """
    try:
        with open(LOG_FILE, 'r') as log:
            lines = log.readlines()
    except FileNotFoundError:
        # Opening directly (instead of checking os.path.exists first) avoids
        # a race with the file being removed between the check and the open.
        return None

    for raw_line in reversed(lines):
        text = raw_line.strip()
        if not text:
            continue
        try:
            entry = json.loads(text)
        except json.JSONDecodeError:
            continue
        if isinstance(entry, dict):
            return entry
    return None
def write_log(current_time, d_count, z_count, status, d_desc, z_desc):
    """
    Append a single JSON-encoded record, followed by a newline, to the log file.

    The record has the keys ``time``, ``d_count``, ``z_count``, ``status``,
    ``d_desc`` and ``z_desc``, in that order, taken from the arguments of the
    same position.
    """
    field_names = ('time', 'd_count', 'z_count', 'status', 'd_desc', 'z_desc')
    field_values = (current_time, d_count, z_count, status, d_desc, z_desc)
    record = dict(zip(field_names, field_values))
    # Append mode creates the file on first use and keeps earlier records.
    with open(LOG_FILE, 'a') as handle:
        handle.write(f"{json.dumps(record)}\n")
def main():
    """
    Main function to monitor stalled processes and report their status.

    Steps:
      1. Count processes in disk-sleep (D) and zombie (Z) state.
      2. Load the previous log entry; if there is none, record the current
         counts with status "OK" as the baseline.
      3. If at least ``-t`` minutes have passed since that entry, derive a new
         status from the larger of the current and previous counts and log it.
         Otherwise, if the previous status was not "OK" and both current
         counts are below the warning threshold, reset the status to "OK" and
         log it. In every other case the previous status is kept.
      4. Print one status line with perfdata and exit with 0 (OK),
         1 (WARNING), 2 (CRITICAL) or 3 (any other status value).

    The process always terminates via ``sys.exit``; the function never returns.
    """
    args = parse_arguments()
    current_time = int(time.time())

    # Get the count and description of processes in disk sleep (D) state and zombie (Z) state
    d_count, d_desc = get_process_info(psutil.STATUS_DISK_SLEEP)
    z_count, z_desc = get_process_info(psutil.STATUS_ZOMBIE)

    # Read the last log entry
    last_log = read_last_log()
    if last_log:
        last_time = int(last_log['time'])
        last_d_count = int(last_log['d_count'])
        last_z_count = int(last_log['z_count'])
        last_status = last_log['status']
    else:
        # No previous entry: use the current measurement as the baseline and
        # persist it so the next run has something to compare against.
        last_time, last_d_count, last_z_count, last_status = current_time, d_count, z_count, "OK"
        write_log(last_time, last_d_count, last_z_count, last_status, d_desc, z_desc)

    time_diff = current_time - last_time
    status = last_status

    # Check if the monitoring time has elapsed
    if time_diff >= args.t * 60:
        # Use the maximum count between current and last counts to determine status
        td_count = max(d_count, last_d_count)
        tz_count = max(z_count, last_z_count)
        if td_count >= args.c or tz_count >= args.c:
            status = "CRITICAL"
        elif td_count >= args.w or tz_count >= args.w:
            status = "WARNING"
        else:
            status = "OK"
        write_log(current_time, d_count, z_count, status, d_desc, z_desc)
    elif last_status != "OK" and d_count < args.w and z_count < args.w:
        # Reset status to OK if counts drop below warning thresholds and previous status was not OK
        status = "OK"
        write_log(current_time, d_count, z_count, status, d_desc, z_desc)

    # Output the status, counts and perfdata
    output = (
        f"{status} - Processes in D state: {d_count}, Z state: {z_count} | "
        f"D={d_count};{args.w};{args.c}; Z={z_count};{args.w};{args.c};"
    )
    print(output)

    # Map the status to an exit code; anything unrecognised (for example a
    # status string read back from a hand-edited log) maps to 3.
    if status == "OK":
        exit_code = 0
    elif status == "WARNING":
        exit_code = 1
    elif status == "CRITICAL":
        exit_code = 2
    else:
        exit_code = 3
    # sys.exit instead of the exit() builtin: the latter is injected by the
    # `site` module for interactive use and is not guaranteed to exist.
    sys.exit(exit_code)


if __name__ == "__main__":
    main()