app-MAIL-temp/monitoring.py


import configparser
import os
import subprocess
from time import sleep
from typing import List, Dict
import newrelic.agent
from app.db import Session
from app.log import LOG
from monitor.metric_exporter import MetricExporter

# the number of consecutive failures
# if there are more than _max_nb_fails, alert
# reset whenever the system comes back to normal
# a system is considered failing if incoming_queue + active_queue > _max_incoming
_nb_failed = 0
_max_nb_fails = 10

# the maximum number of emails allowed in the incoming & active queues
_max_incoming = 50
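

# A minimal sketch (illustrative only, not wired into the main loop below) of how the
# counters above could drive the alerting described in the comment: count consecutive
# checks where incoming + active exceeds _max_incoming, reset on recovery, and flag
# once the streak passes _max_nb_fails. The helper name is hypothetical.
def _check_queue_backlog(incoming_queue: int, active_queue: int):
    global _nb_failed

    if incoming_queue + active_queue > _max_incoming:
        _nb_failed += 1
        if _nb_failed > _max_nb_fails:
            # this is where a real alert would be sent
            LOG.d(
                "ALERT: postfix queue too big for too long: incoming=%s active=%s",
                incoming_queue,
                active_queue,
            )
    else:
        # system back to normal: reset the failure streak
        _nb_failed = 0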


_NR_CONFIG_FILE_LOCATION_VAR = "NEW_RELIC_CONFIG_FILE"


def get_newrelic_license() -> str:
    nr_file = os.environ.get(_NR_CONFIG_FILE_LOCATION_VAR, None)
    if nr_file is None:
        raise Exception(f"{_NR_CONFIG_FILE_LOCATION_VAR} not defined")

    config = configparser.ConfigParser()
    config.read(nr_file)
    return config["newrelic"]["license_key"]
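

# The file referenced by NEW_RELIC_CONFIG_FILE is a plain INI file read with
# configparser; the lookup above only needs a [newrelic] section that contains
# license_key, e.g. (illustrative values):
#
#   [newrelic]
#   license_key = <your-license-key>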


@newrelic.agent.background_task()
def log_postfix_metrics():
    """Look at different metrics and alert appropriately"""
    incoming_queue = nb_files("/var/spool/postfix/incoming")
    active_queue = nb_files("/var/spool/postfix/active")
    deferred_queue = nb_files("/var/spool/postfix/deferred")
    LOG.d("postfix queue sizes %s %s %s", incoming_queue, active_queue, deferred_queue)

    newrelic.agent.record_custom_metric("Custom/postfix_incoming_queue", incoming_queue)
    newrelic.agent.record_custom_metric("Custom/postfix_active_queue", active_queue)
    newrelic.agent.record_custom_metric("Custom/postfix_deferred_queue", deferred_queue)

    proc_counts = get_num_procs(["smtp", "smtpd", "bounce", "cleanup"])
    for proc_name in proc_counts:
        # log and record the count of each process, not the whole dict on every iteration
        LOG.d("process %s count %s", proc_name, proc_counts[proc_name])
        newrelic.agent.record_custom_metric(
            f"Custom/process_{proc_name}_count", proc_counts[proc_name]
        )


def nb_files(directory) -> int:
    """return the number of files in directory and its subdirectories"""
    return sum(len(files) for _, _, files in os.walk(directory))


def get_num_procs(proc_names: List[str]) -> Dict[str, int]:
    # run "ps ax" and count, per requested name, how many processes are running
    data = (
        subprocess.Popen(["ps", "ax"], stdout=subprocess.PIPE)
        .communicate()[0]
        .decode("utf-8")
    )

    return _process_ps_output(proc_names, data)


def _process_ps_output(proc_names: List[str], data: str) -> Dict[str, int]:
    proc_counts = {proc_name: 0 for proc_name in proc_names}
    lines = data.split("\n")
    for line in lines:
        entry = [field for field in line.strip().split() if field.strip()]
        if len(entry) < 5:
            continue
        # the 5th column of "ps ax" output is the command; skip kernel threads ("[...]")
        if entry[4][0] == "[":
            continue
        for proc_name in proc_names:
            if entry[4] == proc_name:
                proc_counts[proc_name] += 1
    return proc_counts
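

# Illustrative check (the "ps ax" sample below is made up) showing how
# _process_ps_output counts commands by their 5th column: the header line does not
# match any requested name and the bracketed kernel thread is skipped.
def _example_process_ps_output():
    sample = (
        "  PID TTY      STAT   TIME COMMAND\n"
        " 1234 ?        S      0:00 smtpd\n"
        " 1235 ?        S      0:00 smtpd\n"
        " 1236 ?        S      0:00 [kworker/0:1]\n"
    )
    assert _process_ps_output(["smtpd", "bounce"], sample) == {"smtpd": 2, "bounce": 0}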


@newrelic.agent.background_task()
def log_nb_db_connection():
    # get the number of connections to the DB
    r = Session.execute("select count(*) from pg_stat_activity;")
    nb_connection = list(r)[0][0]

    LOG.d("number of db connections %s", nb_connection)
    newrelic.agent.record_custom_metric("Custom/nb_db_connections", nb_connection)


if __name__ == "__main__":
    exporter = MetricExporter(get_newrelic_license())
    while True:
        log_postfix_metrics()
        log_nb_db_connection()
        # release the DB connection back to the pool between iterations
        Session.close()

        exporter.run()

        # 1 min
        sleep(60)