2023-08-04 12:19:00 +02:00
|
|
|
import configparser
|
2020-08-15 19:55:56 +02:00
|
|
|
import os
|
2022-05-11 15:30:09 +02:00
|
|
|
import subprocess
|
2020-08-15 19:55:56 +02:00
|
|
|
from time import sleep
|
2022-05-11 15:30:09 +02:00
|
|
|
from typing import List, Dict
|
2020-08-15 19:55:56 +02:00
|
|
|
|
2024-05-24 11:09:10 +02:00
|
|
|
import arrow
|
2022-01-13 10:39:22 +01:00
|
|
|
import newrelic.agent
|
|
|
|
|
2021-10-12 14:36:47 +02:00
|
|
|
from app.db import Session
|
2020-08-15 19:55:56 +02:00
|
|
|
from app.log import LOG
|
2023-08-04 12:19:00 +02:00
|
|
|
from monitor.metric_exporter import MetricExporter
|
2020-08-15 19:55:56 +02:00
|
|
|
|
2020-08-16 10:27:35 +02:00
|
|
|
# the number of consecutive fails
# if more than _max_nb_fails, alert
# reset whenever the system comes back to normal
# a system is considered fail if incoming_queue + active_queue > 50
# NOTE(review): _nb_failed, _max_nb_fails and _max_incoming are not referenced
# anywhere in this file — they look like leftovers of an older alerting
# scheme; confirm no other module imports them before removing.
_nb_failed = 0

_max_nb_fails = 10

# the maximum number of emails in incoming & active queue
_max_incoming = 50
|
2020-10-11 18:11:49 +02:00
|
|
|
|
2023-08-04 12:19:00 +02:00
|
|
|
# environment variable holding the path to the New Relic agent INI file
_NR_CONFIG_FILE_LOCATION_VAR = "NEW_RELIC_CONFIG_FILE"


def get_newrelic_license() -> str:
    """Read the New Relic license key from the agent config file.

    The config file location is taken from the NEW_RELIC_CONFIG_FILE
    environment variable.

    Returns:
        the value of the [newrelic] license_key entry.

    Raises:
        Exception: if the environment variable is not set, the file cannot
            be read, or the license_key entry is missing.
    """
    nr_file = os.environ.get(_NR_CONFIG_FILE_LOCATION_VAR, None)
    if nr_file is None:
        raise Exception(f"{_NR_CONFIG_FILE_LOCATION_VAR} not defined")

    config = configparser.ConfigParser()
    # ConfigParser.read silently skips files it cannot open; check the list of
    # successfully-read files so a missing file raises a clear error instead of
    # an obscure KeyError on the lookup below
    read_files = config.read(nr_file)
    if not read_files:
        raise Exception(f"cannot read config file {nr_file}")
    return config["newrelic"]["license_key"]
|
|
|
|
|
2020-08-15 19:55:56 +02:00
|
|
|
|
2022-01-13 10:39:22 +01:00
|
|
|
@newrelic.agent.background_task()
def log_postfix_metrics():
    """Look at different metrics and alert appropriately"""
    # queue sizes = number of files in the corresponding postfix spool dirs
    incoming_queue = nb_files("/var/spool/postfix/incoming")
    active_queue = nb_files("/var/spool/postfix/active")
    deferred_queue = nb_files("/var/spool/postfix/deferred")
    LOG.d("postfix queue sizes %s %s %s", incoming_queue, active_queue, deferred_queue)

    newrelic.agent.record_custom_metric("Custom/postfix_incoming_queue", incoming_queue)
    newrelic.agent.record_custom_metric("Custom/postfix_active_queue", active_queue)
    newrelic.agent.record_custom_metric("Custom/postfix_deferred_queue", deferred_queue)

    proc_counts = get_num_procs(["smtp", "smtpd", "bounce", "cleanup"])
    # fix: the original logged the whole dict once per process name inside the
    # loop; log it a single time and iterate the (name, count) pairs directly
    LOG.d(f"Process count {proc_counts}")
    for proc_name, proc_count in proc_counts.items():
        newrelic.agent.record_custom_metric(
            f"Custom/process_{proc_name}_count", proc_count
        )
|
|
|
|
|
2020-08-15 19:55:56 +02:00
|
|
|
|
|
|
|
def nb_files(directory) -> int:
    """Count the files under *directory*, recursing into all subdirectories."""
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(directory):
        total += len(filenames)
    return total
|
|
|
|
|
|
|
|
|
2022-05-11 15:30:09 +02:00
|
|
|
def get_num_procs(proc_names: List[str]) -> Dict[str, int]:
    """Return, for each name in *proc_names*, how many `ps ax` entries match it."""
    ps_proc = subprocess.Popen(["ps", "ax"], stdout=subprocess.PIPE)
    raw_output, _ = ps_proc.communicate()
    return _process_ps_output(proc_names, raw_output.decode("utf-8"))
|
|
|
|
|
|
|
|
|
|
|
|
def _process_ps_output(proc_names: List[str], data: str) -> Dict[str, int]:
|
2022-05-11 15:30:09 +02:00
|
|
|
proc_counts = {proc_name: 0 for proc_name in proc_names}
|
|
|
|
lines = data.split("\n")
|
|
|
|
for line in lines:
|
2022-05-12 12:37:19 +02:00
|
|
|
entry = [field for field in line.strip().split() if field.strip()]
|
2022-05-11 15:30:09 +02:00
|
|
|
if len(entry) < 5:
|
|
|
|
continue
|
|
|
|
if entry[4][0] == "[":
|
|
|
|
continue
|
|
|
|
for proc_name in proc_names:
|
2022-05-12 12:37:19 +02:00
|
|
|
if entry[4] == proc_name:
|
2022-05-11 15:30:09 +02:00
|
|
|
proc_counts[proc_name] += 1
|
|
|
|
return proc_counts
|
|
|
|
|
|
|
|
|
2022-01-13 10:39:22 +01:00
|
|
|
@newrelic.agent.background_task()
def log_nb_db_connection():
    """Record the number of rows in pg_stat_activity as a custom metric."""
    query_result = Session.execute("select count(*) from pg_stat_activity;")
    # single-row, single-column result: grab the count
    nb_connection = next(iter(query_result))[0]

    LOG.d("number of db connections %s", nb_connection)
    newrelic.agent.record_custom_metric("Custom/nb_db_connections", nb_connection)
|
|
|
|
|
|
|
|
|
2024-05-24 10:21:19 +02:00
|
|
|
@newrelic.agent.background_task()
def log_pending_to_process_events():
    """Record how many sync_event rows have not been taken yet (taken_time IS NULL)."""
    query_result = Session.execute(
        "select count(*) from sync_event WHERE taken_time IS NULL;"
    )
    # single-row, single-column result: grab the count
    events_pending = next(iter(query_result))[0]

    LOG.d("number of events pending to process %s", events_pending)
    newrelic.agent.record_custom_metric(
        "Custom/sync_events_pending_to_process", events_pending
    )
|
|
|
|
|
|
|
|
|
2024-05-24 11:09:10 +02:00
|
|
|
@newrelic.agent.background_task()
def log_events_pending_dead_letter():
    """Record the number of sync_event rows that look stuck.

    A row is counted when, relative to 10 minutes ago, it was either taken
    (taken_time set before the cutoff) or never taken but created before the
    cutoff — i.e. it has been sitting for at least 10 minutes either way.
    """
    # cutoff: 10 minutes in the past
    since = arrow.now().shift(minutes=-10).datetime
    r = Session.execute(
        """
        SELECT COUNT(*)
        FROM sync_event
        WHERE (taken_time IS NOT NULL AND taken_time < :since)
           OR (taken_time IS NULL AND created_at < :since)
        """,
        {"since": since},
    )
    # single-row, single-column result: the count
    events_pending = list(r)[0][0]

    LOG.d("number of events pending dead letter %s", events_pending)
    newrelic.agent.record_custom_metric(
        "Custom/sync_events_pending_dead_letter", events_pending
    )
|
|
|
|
|
|
|
|
|
2024-09-19 16:20:56 +02:00
|
|
|
@newrelic.agent.background_task()
def log_failed_events(min_retries: int = 10):
    """Record the number of sync_event rows considered permanently failed.

    Args:
        min_retries: rows with at least this many retries are counted as
            failed. Defaults to 10, the previously hard-coded threshold,
            so existing callers keep the same behavior.
    """
    # threshold is bound as a query parameter instead of an SQL literal
    r = Session.execute(
        """
        SELECT COUNT(*)
        FROM sync_event
        WHERE retries >= :min_retries;
        """,
        {"min_retries": min_retries},
    )
    # single-row, single-column result: the count
    failed_events = list(r)[0][0]

    LOG.d("number of failed events %s", failed_events)
    newrelic.agent.record_custom_metric("Custom/sync_events_failed", failed_events)
|
|
|
|
|
|
|
|
|
2020-08-15 19:55:56 +02:00
|
|
|
if __name__ == "__main__":
    exporter = MetricExporter(get_newrelic_license())
    # gather and push all metrics forever, once per minute
    while True:
        log_postfix_metrics()
        log_nb_db_connection()
        log_pending_to_process_events()
        log_events_pending_dead_letter()
        log_failed_events()
        # close the DB session before sleeping so it is not held idle
        Session.close()

        # push the collected metrics via the exporter
        exporter.run()

        # 1 min
        sleep(60)
|